/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Operations"

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

#include "OperationResolver.h"
#include "OperationsUtils.h"
#include "Tracing.h"

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#include "CpuOperationUtils.h"
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {
namespace bbox_ops {

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

struct BoxEncodingCorner {
    float x1, y1, x2, y2;
};
struct BoxEncodingCenter {
    float w, h, x, y;
};
BoxEncodingCorner toBoxEncodingCorner(const BoxEncodingCenter& ctr) {
    return {.x1 = ctr.x - ctr.w / 2,
            .y1 = ctr.y - ctr.h / 2,
            .x2 = ctr.x + ctr.w / 2,
            .y2 = ctr.y + ctr.h / 2};
}
BoxEncodingCenter toBoxEncodingCenter(const BoxEncodingCorner& cnr) {
    return {.w = cnr.x2 - cnr.x1,
            .h = cnr.y2 - cnr.y1,
            .x = (cnr.x1 + cnr.x2) / 2,
            .y = (cnr.y1 + cnr.y2) / 2};
}

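// Applies the per-class box deltas to each ROI. Each ROI is converted from the
// corner encoding (x1, y1, x2, y2) to the center encoding (w, h, x, y); the
// deltas (dx, dy, dw, dh) then shift the center by (dx * w, dy * h) and scale
// the size by (exp(dw), exp(dh)). The result is converted back to corners and
// clipped to the image bounds given by imageInfo (height, width) of the ROI's batch.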
inline bool bboxTransformFloat32(const float* roiData, const Shape& roiShape,
                                 const float* bboxDeltasData, const Shape& bboxDeltasShape,
                                 const int32_t* batchesData, const Shape& batchesShape,
                                 const float* imageInfoData, const Shape& imageInfoDataShape,
                                 float* outputData, const Shape& outputShape) {
    const uint32_t roiLength = 4;
    const uint32_t imageLength = 2;

    uint32_t numClasses = getSizeOfDimension(bboxDeltasShape, 1) / roiLength;
    uint32_t numBatches = getSizeOfDimension(imageInfoDataShape, 0);

    const float* roiDataEnd = roiData + getNumberOfElements(roiShape);
    const float* deltas = bboxDeltasData;
    float* outPtr = outputData;
    uint32_t roiIndex = 0;
    for (const float* roiBase = roiData; roiBase < roiDataEnd; roiBase += roiLength, roiIndex++) {
        uint32_t batchIndex = batchesData[roiIndex];
        // Check for malformed data
        // 1. Invalid batch id
        // 2. Invalid region: x2 < x1 || y2 < y1
        NN_RET_CHECK_GE(batchIndex, 0);
        NN_RET_CHECK_LT(batchIndex, numBatches);
        NN_RET_CHECK_LE(roiBase[0], roiBase[2]);
        NN_RET_CHECK_LE(roiBase[1], roiBase[3]);

        const float* imageInfoBase = imageInfoData + batchIndex * imageLength;
        float imageHeight = imageInfoBase[0];
        float imageWidth = imageInfoBase[1];
        auto roiBefore = toBoxEncodingCenter(
                {.x1 = roiBase[0], .y1 = roiBase[1], .x2 = roiBase[2], .y2 = roiBase[3]});
        for (uint32_t i = 0; i < numClasses; i++) {
            auto roiAfter = toBoxEncodingCorner({.w = std::exp(deltas[2]) * roiBefore.w,
                                                 .h = std::exp(deltas[3]) * roiBefore.h,
                                                 .x = roiBefore.x + deltas[0] * roiBefore.w,
                                                 .y = roiBefore.y + deltas[1] * roiBefore.h});
            BoxEncodingCorner clipped = {.x1 = std::min(std::max(roiAfter.x1, 0.0f), imageWidth),
                                         .y1 = std::min(std::max(roiAfter.y1, 0.0f), imageHeight),
                                         .x2 = std::min(std::max(roiAfter.x2, 0.0f), imageWidth),
                                         .y2 = std::min(std::max(roiAfter.y2, 0.0f), imageHeight)};
            outPtr[0] = clipped.x1;
            outPtr[1] = clipped.y1;
            outPtr[2] = clipped.x2;
            outPtr[3] = clipped.y2;
            deltas += roiLength;
            outPtr += roiLength;
        }
    }
    return true;
}

inline bool bboxTransformFloat16(const _Float16* roiData, const Shape& roiShape,
                                 const _Float16* bboxDeltasData, const Shape& bboxDeltasShape,
                                 const int32_t* batchesData, const Shape& batchesShape,
                                 const _Float16* imageInfoData, const Shape& imageInfoDataShape,
                                 _Float16* outputData, const Shape& outputShape) {
    std::vector<float> roi_float32(getNumberOfElements(roiShape));
    convertFloat16ToFloat32(roiData, &roi_float32);
    std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
    convertFloat16ToFloat32(bboxDeltasData, &delta_float32);
    std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
    convertFloat16ToFloat32(imageInfoData, &imageInfo_float32);
    std::vector<float> output_float32(getNumberOfElements(outputShape));
    NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
                                      bboxDeltasShape, batchesData, batchesShape,
                                      imageInfo_float32.data(), imageInfoDataShape,
                                      output_float32.data(), outputShape));
    convertFloat32ToFloat16(output_float32, outputData);
    return true;
}

inline bool bboxTransformQuant(const uint16_t* roiData, const Shape& roiShape,
                               const uint8_t* bboxDeltasData, const Shape& bboxDeltasShape,
                               const int32_t* batchesData, const Shape& batchesShape,
                               const uint16_t* imageInfoData, const Shape& imageInfoDataShape,
                               uint16_t* outputData, const Shape& outputShape) {
    std::vector<float> roi_float32(getNumberOfElements(roiShape));
    convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
    std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
    convertQuantToFloat32(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
                          &delta_float32);
    std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
    convertQuantToFloat32(imageInfoData, imageInfoDataShape.scale, imageInfoDataShape.offset,
                          &imageInfo_float32);
    std::vector<float> output_float32(getNumberOfElements(outputShape));
    NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
                                      bboxDeltasShape, batchesData, batchesShape,
                                      imageInfo_float32.data(), imageInfoDataShape,
                                      output_float32.data(), outputShape));
    convertFloat32ToQuant(output_float32, outputShape.scale, outputShape.offset, outputData);
    return true;
}

inline bool bboxTransformQuant(const uint16_t* roiData, const Shape& roiShape,
                               const int8_t* bboxDeltasData, const Shape& bboxDeltasShape,
                               const int32_t* batchesData, const Shape& batchesShape,
                               const uint16_t* imageInfoData, const Shape& imageInfoDataShape,
                               uint16_t* outputData, const Shape& outputShape) {
    std::vector<float> roi_float32(getNumberOfElements(roiShape));
    convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
    std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
    convertQuantToFloat32<int8_t>(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
                                  &delta_float32);
    std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
    convertQuantToFloat32(imageInfoData, imageInfoDataShape.scale, imageInfoDataShape.offset,
                          &imageInfo_float32);
    std::vector<float> output_float32(getNumberOfElements(outputShape));
    NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
                                      bboxDeltasShape, batchesData, batchesShape,
                                      imageInfo_float32.data(), imageInfoDataShape,
                                      output_float32.data(), outputShape));
    convertFloat32ToQuant(output_float32, outputShape.scale, outputShape.offset, outputData);
    return true;
}

// Given pointers to two axis-aligned bounding boxes (x1, y1, x2, y2), returns
// their intersection-over-union (IoU).
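// IoU = area(intersection) / (area(roi1) + area(roi2) - area(intersection));
// boxes that do not overlap contribute an intersection area of zero.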
float getIoUAxisAligned(const float* roi1, const float* roi2) {
    const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
    const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
    const float x1 = std::max(roi1[0], roi2[0]);
    const float x2 = std::min(roi1[2], roi2[2]);
    const float y1 = std::max(roi1[1], roi2[1]);
    const float y2 = std::min(roi1[3], roi2[3]);
    const float w = std::max(x2 - x1, 0.0f);
    const float h = std::max(y2 - y1, 0.0f);
    const float areaIntersect = w * h;
    const float areaUnion = area1 + area2 - areaIntersect;
    return areaIntersect / areaUnion;
}

}  // namespace
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace axis_aligned_bbox_transform {

constexpr char kOperationName[] = "AXIS_ALIGNED_BBOX_TRANSFORM";

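// Operand layout (see prepare() below): the ROI tensor is [numRois, 4] in
// (x1, y1, x2, y2) order, the delta tensor is [numRois, numClasses * 4], the
// batches tensor holds [numRois] batch indices, and the image info tensor is
// [numBatches, 2] with (height, width) per batch. The output is
// [numRois, numClasses * 4] containing the transformed, clipped boxes.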
constexpr uint32_t kNumInputs = 4;
constexpr uint32_t kRoiTensor = 0;
constexpr uint32_t kDeltaTensor = 1;
constexpr uint32_t kBatchesTensor = 2;
constexpr uint32_t kImageInfoTensor = 3;

constexpr uint32_t kNumOutputs = 1;
constexpr uint32_t kOutputTensor = 0;

Result<Version> validate(const IOperationValidationContext* context) {
    NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
    std::vector<OperandType> inExpectedTypes;
    auto inputType = context->getInputType(kRoiTensor);
    auto deltaInputType = context->getInputType(kDeltaTensor);
    if (inputType == OperandType::TENSOR_FLOAT32 || inputType == OperandType::TENSOR_FLOAT16) {
        inExpectedTypes = {inputType, inputType, OperandType::TENSOR_INT32, inputType};
    } else if (inputType == OperandType::TENSOR_QUANT16_ASYMM) {
        if (deltaInputType == OperandType::TENSOR_QUANT8_ASYMM ||
            deltaInputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
            inExpectedTypes = {OperandType::TENSOR_QUANT16_ASYMM, deltaInputType,
                               OperandType::TENSOR_INT32, OperandType::TENSOR_QUANT16_ASYMM};
        } else {
            return NN_ERROR() << "Unsupported input tensor type for operation " << kOperationName;
        }
    } else {
        return NN_ERROR() << "Unsupported input tensor type for operation " << kOperationName;
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(context, {inputType}));
    return Version::ANDROID_Q;
}

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
bool prepare(IOperationExecutionContext* context) {
    Shape roiShape = context->getInputShape(kRoiTensor);
    Shape bboxDeltasShape = context->getInputShape(kDeltaTensor);
    Shape batchesShape = context->getInputShape(kBatchesTensor);
    Shape imageInfoShape = context->getInputShape(kImageInfoTensor);
    Shape outputShape = context->getOutputShape(kOutputTensor);

    NN_RET_CHECK_EQ(getNumberOfDimensions(roiShape), 2);
    NN_RET_CHECK_EQ(getNumberOfDimensions(bboxDeltasShape), 2);
    NN_RET_CHECK_EQ(getNumberOfDimensions(batchesShape), 1);
    NN_RET_CHECK_EQ(getNumberOfDimensions(imageInfoShape), 2);

    // Only numRois can be zero.
    const uint32_t kRoiDim = 4;
    uint32_t numRois = getSizeOfDimension(roiShape, 0);
    uint32_t numClasses = getSizeOfDimension(bboxDeltasShape, 1) / kRoiDim;
    uint32_t numBatches = getSizeOfDimension(imageInfoShape, 0);
    NN_RET_CHECK_GT(numClasses, 0);
    NN_RET_CHECK_GT(numBatches, 0);
    NN_RET_CHECK_EQ(getSizeOfDimension(roiShape, 1), kRoiDim);
    NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 0), numRois);
    NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 1), kRoiDim * numClasses);
    NN_RET_CHECK_EQ(getSizeOfDimension(batchesShape, 0), numRois);
    NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoShape, 1), 2);

    if (roiShape.type == OperandType::TENSOR_QUANT16_ASYMM) {
        NN_RET_CHECK_EQ(roiShape.scale, 0.125f);
        NN_RET_CHECK_EQ(roiShape.offset, 0);
        NN_RET_CHECK_EQ(imageInfoShape.scale, 0.125f);
        NN_RET_CHECK_EQ(imageInfoShape.offset, 0);
    }

    outputShape.type = roiShape.type;
    outputShape.dimensions = {numRois, numClasses * kRoiDim};
    outputShape.scale = 0.f;
    outputShape.offset = 0;
    if (roiShape.type == OperandType::TENSOR_QUANT16_ASYMM) {
        outputShape.scale = 0.125f;
    }
    NN_RET_CHECK(context->setOutputShape(kOutputTensor, outputShape));
    return true;
}

bool execute(IOperationExecutionContext* context) {
    NNTRACE_TRANS("axisAlignedBBoxTransform");
    // Bypass execution in the case of zero-sized input.
    if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
    switch (context->getInputType(kRoiTensor)) {
        case OperandType::TENSOR_FLOAT16: {
            return bboxTransformFloat16(context->getInputBuffer<_Float16>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<_Float16>(kDeltaTensor),
                                        context->getInputShape(kDeltaTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputBuffer<_Float16>(kImageInfoTensor),
                                        context->getInputShape(kImageInfoTensor),
                                        context->getOutputBuffer<_Float16>(kOutputTensor),
                                        context->getOutputShape(kOutputTensor));
        }
        case OperandType::TENSOR_FLOAT32: {
            return bboxTransformFloat32(context->getInputBuffer<float>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<float>(kDeltaTensor),
                                        context->getInputShape(kDeltaTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputBuffer<float>(kImageInfoTensor),
                                        context->getInputShape(kImageInfoTensor),
                                        context->getOutputBuffer<float>(kOutputTensor),
                                        context->getOutputShape(kOutputTensor));
        }
        case OperandType::TENSOR_QUANT16_ASYMM: {
            if (context->getInputType(kDeltaTensor) == OperandType::TENSOR_QUANT8_ASYMM) {
                return bboxTransformQuant(context->getInputBuffer<uint16_t>(kRoiTensor),
                                          context->getInputShape(kRoiTensor),
                                          context->getInputBuffer<uint8_t>(kDeltaTensor),
                                          context->getInputShape(kDeltaTensor),
                                          context->getInputBuffer<int32_t>(kBatchesTensor),
                                          context->getInputShape(kBatchesTensor),
                                          context->getInputBuffer<uint16_t>(kImageInfoTensor),
                                          context->getInputShape(kImageInfoTensor),
                                          context->getOutputBuffer<uint16_t>(kOutputTensor),
                                          context->getOutputShape(kOutputTensor));
            } else {
                return bboxTransformQuant(context->getInputBuffer<uint16_t>(kRoiTensor),
                                          context->getInputShape(kRoiTensor),
                                          context->getInputBuffer<int8_t>(kDeltaTensor),
                                          context->getInputShape(kDeltaTensor),
                                          context->getInputBuffer<int32_t>(kBatchesTensor),
                                          context->getInputShape(kBatchesTensor),
                                          context->getInputBuffer<uint16_t>(kImageInfoTensor),
                                          context->getInputShape(kImageInfoTensor),
                                          context->getOutputBuffer<uint16_t>(kOutputTensor),
                                          context->getOutputShape(kOutputTensor));
            }
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

}  // namespace axis_aligned_bbox_transform

namespace box_with_nms_limit {

constexpr char kOperationName[] = "BOX_WITH_NMS_LIMIT";

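// Operand layout (see prepare() below): the score tensor is
// [numRois, numClasses], the ROI tensor is [numRois, numClasses * 4], and the
// batches tensor holds [numRois] batch indices. The operation suppresses
// overlapping boxes per class within each batch, then emits the surviving
// scores, boxes, class indices, and batch indices as the four output tensors.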
constexpr uint32_t kNumInputs = 9;
constexpr uint32_t kScoreTensor = 0;
constexpr uint32_t kRoiTensor = 1;
constexpr uint32_t kBatchesTensor = 2;
constexpr uint32_t kScoreThresholdScalar = 3;
constexpr uint32_t kMaxNumDetectionScalar = 4;
constexpr uint32_t kNmsKernelScalar = 5;
constexpr uint32_t kIoUThresholdScalar = 6;
constexpr uint32_t kSigmaScalar = 7;
constexpr uint32_t kNmsScoreThresholdScalar = 8;

constexpr uint32_t kNumOutputs = 4;
constexpr uint32_t kOutputScoreTensor = 0;
constexpr uint32_t kOutputRoiTensor = 1;
constexpr uint32_t kOutputClassTensor = 2;
constexpr uint32_t kOutputBatchesTensor = 3;

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

// TODO(xusongw): Reduce code duplication with hard/soft nms path.

// Inplace hard NMS within range [select, select + selectLength).
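// Greedy selection: repeatedly swap the highest-scoring remaining index to the
// front, then swap any index whose box overlaps it with IoU >= iouThreshold to
// the discarded tail. Returns a pointer one past the last kept index.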
uint32_t* hardNmsSingleClass(const float* scoresData, float iouThreshold, int32_t maxNumDetections,
                             std::function<const float*(uint32_t)> getRoiBase, uint32_t* select,
                             uint32_t selectLength) {
    uint32_t *selectStart = select, *selectEnd = select + selectLength, numDetections = 0;
    if (maxNumDetections < 0) {
        maxNumDetections = selectLength;
    }
    while (selectStart < selectEnd && numDetections < maxNumDetections) {
        // find max score and swap to the front
        auto& maxScore = *std::max_element(selectStart, selectEnd,
                                           [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
                                               return scoresData[lhs] < scoresData[rhs];
                                           });
        std::swap(maxScore, *selectStart);

        // Calculate IoU of the rest, swap to the end (discard) if needed.
        for (uint32_t* i = selectStart + 1; i < selectEnd; i++) {
            float iou = getIoUAxisAligned(getRoiBase(*i), getRoiBase(*selectStart));
            if (iou >= iouThreshold) {
                std::swap(*i--, *(--selectEnd));
            }
        }
        selectStart++;
        numDetections++;
    }
    return selectStart;
}

void hardNmsMultiClass(const float* scoresData, uint32_t numClasses, uint32_t numRois,
                       float scoreThreshold, float iouThreshold, int32_t maxNumDetections,
                       int32_t maxNumDetectionsPerClass,
                       std::function<const float*(uint32_t)> getRoiBase,
                       std::vector<uint32_t>* select) {
    // Exclude class 0 (background)
    for (uint32_t c = 1; c < numClasses; c++) {
        uint32_t size = select->size();
        for (uint32_t b = 0; b < numRois; b++) {
            const uint32_t index = b * numClasses + c;
            const float score = scoresData[index];
            if (score > scoreThreshold) {
                select->push_back(index);
            }
        }
        uint32_t* selectStart = select->data() + size;
        uint32_t selectLength = select->size() - size;
        uint32_t* selectEnd = hardNmsSingleClass(scoresData, iouThreshold, maxNumDetectionsPerClass,
                                                 getRoiBase, selectStart, selectLength);
        select->resize(selectEnd - select->data());
    }

    // Take top maxNumDetections.
    std::sort(select->begin(), select->end(),
              [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
                  return scoresData[lhs] > scoresData[rhs];
              });
    if (maxNumDetections < 0 || select->size() <= maxNumDetections) {
        return;
    }
    select->resize(maxNumDetections);
}

// Inplace soft NMS within range [select, select + selectLength).
using SoftNmsKernel = std::function<float(float)>;
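// Unlike hard NMS, soft NMS does not drop overlapping boxes outright: each
// remaining score is multiplied by kernel(iou) after every selection, and an
// index is discarded only once its decayed score falls below scoreThreshold.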
uint32_t* softNmsSingleClass(float* scoresData, float scoreThreshold, int32_t maxNumDetections,
                             std::function<const float*(uint32_t)> getRoiBase, SoftNmsKernel kernel,
                             uint32_t* select, uint32_t selectLength) {
    uint32_t *selectStart = select, *selectEnd = select + selectLength, numDetections = 0;
    if (maxNumDetections < 0) {
        maxNumDetections = selectLength;
    }
    while (selectStart < selectEnd && numDetections < maxNumDetections) {
        // find max score and swap to the front
        auto& maxScore = *std::max_element(selectStart, selectEnd,
                                           [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
                                               return scoresData[lhs] < scoresData[rhs];
                                           });
        std::swap(maxScore, *selectStart);

        // Calculate IoU of the rest, swap to the end (discard) if needed.
        for (uint32_t* i = selectStart + 1; i < selectEnd; i++) {
            float iou = getIoUAxisAligned(getRoiBase(*i), getRoiBase(*selectStart));
            scoresData[*i] *= kernel(iou);
            if (scoresData[*i] < scoreThreshold) {
                std::swap(*i--, *(--selectEnd));
            }
        }
        selectStart++;
        numDetections++;
    }
    return selectStart;
}

void softNmsMultiClass(float* scoresData, uint32_t numClasses, uint32_t numRois,
                       float scoreThreshold, float nmsScoreThreshold, int32_t maxNumDetections,
                       int32_t maxNumDetectionsPerClass,
                       std::function<const float*(uint32_t)> getRoiBase, SoftNmsKernel kernel,
                       std::vector<uint32_t>* select) {
    // Exclude class 0 (background)
    for (uint32_t c = 1; c < numClasses; c++) {
        uint32_t size = select->size();
        for (uint32_t b = 0; b < numRois; b++) {
            const uint32_t index = b * numClasses + c;
            const float score = scoresData[index];
            if (score > scoreThreshold) {
                select->push_back(index);
            }
        }
        uint32_t* selectStart = select->data() + size;
        uint32_t selectLength = select->size() - size;
        uint32_t* selectEnd =
                softNmsSingleClass(scoresData, nmsScoreThreshold, maxNumDetectionsPerClass,
                                   getRoiBase, kernel, selectStart, selectLength);
        select->resize(selectEnd - select->data());
    }

    // Take top maxNumDetections.
    std::sort(select->begin(), select->end(),
              [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
                  return scoresData[lhs] > scoresData[rhs];
              });
    if (maxNumDetections < 0 || select->size() <= maxNumDetections) {
        return;
    }
    select->resize(maxNumDetections);
}

bool boxWithNmsLimitFloat32Compute(float* scoresData, const Shape& scoresShape,
                                   const float* roiData, const Shape& roiShape,
                                   const int32_t* batchesData, const Shape& batchesShape,
                                   float scoreThreshold, int32_t maxNumDetections,
                                   int32_t softNmsKernel, float iouThreshold, float sigma,
                                   float nmsScoreThreshold, std::vector<uint32_t>* batchSplitIn,
                                   std::vector<uint32_t>* batchSplitOut,
                                   std::vector<uint32_t>* selected) {
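    // Map the kernel selector to a score-decay function: 0 reproduces hard NMS
    // (scores of boxes with IoU >= iouThreshold are zeroed), 1 applies a linear
    // decay of 1 - IoU above the threshold, and 2 applies a Gaussian decay
    // exp(-IoU^2 / sigma) regardless of the threshold.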
    SoftNmsKernel kernel = nullptr;
    if (softNmsKernel == 0) {
        kernel = [&iouThreshold](float iou) { return iou < iouThreshold ? 1.0f : 0.0f; };
    } else if (softNmsKernel == 1) {
        kernel = [&iouThreshold](float iou) { return iou < iouThreshold ? 1.0f : 1.0f - iou; };
    } else if (softNmsKernel == 2) {
        kernel = [&sigma](float iou) { return std::exp(-1.0f * iou * iou / sigma); };
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported soft NMS kernel " << softNmsKernel;
    }

    const uint32_t kRoiDim = 4;
    uint32_t numRois = getSizeOfDimension(scoresShape, 0);
    uint32_t numClasses = getSizeOfDimension(scoresShape, 1);

    // We assume boxes of the same batch are grouped together.
    std::vector<uint32_t> batch;
    for (uint32_t i = 0, ind = -1; i < numRois; i++) {
        if (batchesData[i] == ind) {
            (batchSplitIn->back())++;
        } else {
            ind = batchesData[i];
            batchSplitIn->push_back(1);
        }
    }

    float* scoresBase = scoresData;
    const float* roiBase = roiData;
    selected->clear();
    for (uint32_t b = 0; b < batchSplitIn->size(); b++) {
        for (uint32_t i = 0; i < batchSplitIn->at(b); i++) {
            const float* roi = roiBase + i * kRoiDim;
            // Check for malformed data: invalid region: x2 < x1 || y2 < y1
            NN_RET_CHECK_LE(roi[0], roi[2]);
            NN_RET_CHECK_LE(roi[1], roi[3]);
        }
        std::vector<uint32_t> result;
        softNmsMultiClass(
                scoresBase, numClasses, batchSplitIn->at(b), scoreThreshold, nmsScoreThreshold,
                maxNumDetections, maxNumDetections,
                [&roiBase](uint32_t ind) { return roiBase + ind * kRoiDim; }, kernel, &result);
        // Sort again by class.
        std::sort(result.begin(), result.end(),
                  [&scoresBase, numClasses](const uint32_t& lhs, const uint32_t& rhs) {
                      uint32_t lhsClass = lhs % numClasses, rhsClass = rhs % numClasses;
                      return lhsClass == rhsClass ? scoresBase[lhs] > scoresBase[rhs]
                                                  : lhsClass < rhsClass;
                  });
        selected->insert(selected->end(), result.begin(), result.end());
        batchSplitOut->push_back(result.size());
        scoresBase += batchSplitIn->at(b) * numClasses;
        roiBase += batchSplitIn->at(b) * numClasses * kRoiDim;
    }
    return true;
}

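// castTo converts a float score to the output operand's representation. For
// the quantized specializations below, the value is requantized as
// round(value / scale + offset) and saturated to the target integer range.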
template <typename T>
T castTo(float val, const Shape&) {
    return val;
}
template <>
uint8_t castTo(float val, const Shape& shape) {
    return saturateCast<uint8_t>(std::round(val / shape.scale + shape.offset));
}

template <>
int8_t castTo(float val, const Shape& shape) {
    return saturateCast<int8_t>(std::round(val / shape.scale + shape.offset));
}

template <typename T_Score, typename T_Roi>
bool boxWithNmsLimitWriteOutput(const std::vector<uint32_t>& selected,
                                const std::vector<uint32_t>& batchSplitIn,
                                const std::vector<uint32_t>& batchSplitOut,
                                const std::vector<float>& scores,
                                IOperationExecutionContext* context) {
    const uint32_t kRoiDim = 4;
    Shape scoresShape = context->getInputShape(kScoreTensor);
    uint32_t numClasses = getSizeOfDimension(scoresShape, 1);

    // Set output dimensions.
    uint32_t numOutRois = selected.size();
    if (numOutRois == 0) return true;
    Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
    scoresOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));

    Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
    roiOutShape.dimensions = {numOutRois, 4};
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));

    Shape classesOutShape = context->getOutputShape(kOutputClassTensor);
    classesOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, classesOutShape));

    Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
    batchesOutShape.dimensions = {numOutRois};
    NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));

    // Write outputs.
    const float* scoresBase = scores.data();
    const T_Roi* roiBase = context->getInputBuffer<T_Roi>(kRoiTensor);
    const int32_t* batchesInPtr = context->getInputBuffer<int32_t>(kBatchesTensor);
    T_Score* scoresOutPtr = context->getOutputBuffer<T_Score>(kOutputScoreTensor);
    T_Roi* roiOutPtr = context->getOutputBuffer<T_Roi>(kOutputRoiTensor);
    int32_t* classesOutPtr = context->getOutputBuffer<int32_t>(kOutputClassTensor);
    int32_t* batchesOutPtr = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
    uint32_t i = 0;
    for (uint32_t b = 0; b < batchSplitOut.size(); b++) {
        for (uint32_t j = 0; j < batchSplitOut[b]; j++) {
            uint32_t index = selected[i++];
            *scoresOutPtr++ = castTo<T_Score>(scoresBase[index], scoresOutShape);
            memcpy(roiOutPtr, roiBase + index * kRoiDim, kRoiDim * sizeof(T_Roi));
            roiOutPtr += kRoiDim;
            *classesOutPtr++ = index % numClasses;
            *batchesOutPtr++ = *batchesInPtr;
        }
        scoresBase += batchSplitIn[b] * numClasses;
        roiBase += batchSplitIn[b] * numClasses * kRoiDim;
        batchesInPtr += batchSplitIn[b];
    }
    return true;
}

bool boxWithNmsLimitFloat32(const float* scoresData, const Shape& scoresShape, const float* roiData,
                            const Shape& roiShape, const int32_t* batchesData,
                            const Shape& batchesShape, float scoreThreshold,
                            int32_t maxNumDetections, int32_t softNmsKernel, float iouThreshold,
                            float sigma, float nmsScoreThreshold, float* scoresOutData,
                            Shape scoresOutShape, float* roiOutData, Shape roiOutShape,
                            int32_t* classesOutData, Shape classesOutShape, int32_t* batchesOutData,
                            const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
    NNTRACE_TRANS("boxWithNmsLimit");
    std::vector<float> scores_float32(getNumberOfElements(scoresShape));
    for (uint32_t i = 0; i < scores_float32.size(); i++) {
        scores_float32[i] = scoresData[i];
    }
    std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
    NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
            scores_float32.data(), scoresShape, roiData, roiShape, batchesData, batchesShape,
            scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma, nmsScoreThreshold,
            &batchSplitIn, &batchSplitOut, &selected));
    return boxWithNmsLimitWriteOutput<float, float>(selected, batchSplitIn, batchSplitOut,
                                                    scores_float32, context);
}

bool boxWithNmsLimitFloat16(const _Float16* scoresData, const Shape& scoresShape,
                            const _Float16* roiData, const Shape& roiShape,
                            const int32_t* batchesData, const Shape& batchesShape,
                            _Float16 scoreThreshold, int32_t maxNumDetections,
                            int32_t softNmsKernel, _Float16 iouThreshold, _Float16 sigma,
                            _Float16 nmsScoreThreshold, _Float16* scoresOutData,
                            const Shape& scoresOutShape, _Float16* roiOutData,
                            const Shape& roiOutShape, int32_t* classesOutData,
                            const Shape& classesOutShape, int32_t* batchesOutData,
                            const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
    std::vector<float> scores_float32(getNumberOfElements(scoresShape));
    convertFloat16ToFloat32(scoresData, &scores_float32);
    std::vector<float> roi_float32(getNumberOfElements(roiShape));
    convertFloat16ToFloat32(roiData, &roi_float32);
    std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
    NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
            scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
            batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
            nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
    return boxWithNmsLimitWriteOutput<_Float16, _Float16>(selected, batchSplitIn, batchSplitOut,
                                                          scores_float32, context);
}

bool boxWithNmsLimitQuant(const uint8_t* scoresData, const Shape& scoresShape,
                          const uint16_t* roiData, const Shape& roiShape,
                          const int32_t* batchesData, const Shape& batchesShape,
                          float scoreThreshold, int32_t maxNumDetections, int32_t softNmsKernel,
                          float iouThreshold, float sigma, float nmsScoreThreshold,
                          uint8_t* scoresOutData, const Shape& scoresOutShape, uint16_t* roiOutData,
                          const Shape& roiOutShape, int32_t* classesOutData,
                          const Shape& classesOutShape, int32_t* batchesOutData,
                          const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
    std::vector<float> scores_float32(getNumberOfElements(scoresShape));
    convertQuantToFloat32(scoresData, scoresShape.scale, scoresShape.offset, &scores_float32);
    std::vector<float> roi_float32(getNumberOfElements(roiShape));
    convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
    std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
    NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
            scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
            batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
            nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
    return boxWithNmsLimitWriteOutput<uint8_t, uint16_t>(selected, batchSplitIn, batchSplitOut,
                                                         scores_float32, context);
}

bool boxWithNmsLimitQuant(const int8_t* scoresData, const Shape& scoresShape,
                          const uint16_t* roiData, const Shape& roiShape,
                          const int32_t* batchesData, const Shape& batchesShape,
                          float scoreThreshold, int32_t maxNumDetections, int32_t softNmsKernel,
                          float iouThreshold, float sigma, float nmsScoreThreshold,
                          int8_t* scoresOutData, const Shape& scoresOutShape, uint16_t* roiOutData,
                          const Shape& roiOutShape, int32_t* classesOutData,
                          const Shape& classesOutShape, int32_t* batchesOutData,
                          const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
    std::vector<float> scores_float32(getNumberOfElements(scoresShape));
    convertQuantToFloat32<int8_t>(scoresData, scoresShape.scale, scoresShape.offset,
                                  &scores_float32);
    std::vector<float> roi_float32(getNumberOfElements(roiShape));
    convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
    std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
    NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
            scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
            batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
            nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
    return boxWithNmsLimitWriteOutput<int8_t, uint16_t>(selected, batchSplitIn, batchSplitOut,
                                                        scores_float32, context);
}

}  // namespace
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

Result<Version> validate(const IOperationValidationContext* context) {
    NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
    std::vector<OperandType> inExpectedTypes;
    std::vector<OperandType> outExpectedTypes;
    auto inputType = context->getInputType(kScoreTensor);
    if (inputType == OperandType::TENSOR_FLOAT16) {
        inExpectedTypes = {
                OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16, OperandType::TENSOR_INT32,
                OperandType::FLOAT16,        OperandType::INT32,          OperandType::INT32,
                OperandType::FLOAT16,        OperandType::FLOAT16,        OperandType::FLOAT16};
        outExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
                            OperandType::TENSOR_INT32, OperandType::TENSOR_INT32};
    } else if (inputType == OperandType::TENSOR_FLOAT32) {
        inExpectedTypes = {
                OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32, OperandType::TENSOR_INT32,
                OperandType::FLOAT32,        OperandType::INT32,          OperandType::INT32,
                OperandType::FLOAT32,        OperandType::FLOAT32,        OperandType::FLOAT32};
        outExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
                            OperandType::TENSOR_INT32, OperandType::TENSOR_INT32};
    } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM ||
               inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        inExpectedTypes = {inputType,
                           OperandType::TENSOR_QUANT16_ASYMM,
                           OperandType::TENSOR_INT32,
                           OperandType::FLOAT32,
                           OperandType::INT32,
                           OperandType::INT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32};
        outExpectedTypes = {inputType, OperandType::TENSOR_QUANT16_ASYMM, OperandType::TENSOR_INT32,
                            OperandType::TENSOR_INT32};
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(context, outExpectedTypes));
    if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        return Version::ANDROID_R;
    } else {
        return Version::ANDROID_Q;
    }
}

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
bool prepare(IOperationExecutionContext* context) {
    Shape scoreShape = context->getInputShape(kScoreTensor);
    Shape roiShape = context->getInputShape(kRoiTensor);
    Shape batchesShape = context->getInputShape(kBatchesTensor);
    Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
    Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
    Shape outputClassShape = context->getOutputShape(kOutputClassTensor);
    Shape outputBatchSplitShape = context->getOutputShape(kOutputBatchesTensor);

    NN_RET_CHECK(getNumberOfDimensions(scoreShape) == 2);
    NN_RET_CHECK(getNumberOfDimensions(roiShape) == 2);
    NN_RET_CHECK(getNumberOfDimensions(batchesShape) == 1);

    // Only numRois can be zero.
    const uint32_t kRoiDim = 4;
    uint32_t numRois = getSizeOfDimension(scoreShape, 0);
    uint32_t numClasses = getSizeOfDimension(scoreShape, 1);
    NN_RET_CHECK(getSizeOfDimension(roiShape, 0) == numRois);
    NN_RET_CHECK(getSizeOfDimension(roiShape, 1) == kRoiDim * numClasses);
    NN_RET_CHECK(getSizeOfDimension(batchesShape, 0) == numRois);
    NN_RET_CHECK_GT(numClasses, 1);

    if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM ||
        scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        NN_RET_CHECK_EQ(roiShape.scale, 0.125f);
        NN_RET_CHECK_EQ(roiShape.offset, 0);
    }

    outputScoreShape.type = scoreShape.type;
    outputScoreShape.dimensions = {0};
    outputScoreShape.scale = scoreShape.scale;
    outputScoreShape.offset = scoreShape.offset;
    NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));

    outputRoiShape.type = roiShape.type;
    outputRoiShape.dimensions = {0, 4};
    outputRoiShape.scale = 0.f;
    outputRoiShape.offset = 0;
    if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM ||
        scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        outputRoiShape.scale = 0.125f;
    }
    NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));

    outputClassShape.type = OperandType::TENSOR_INT32;
    outputClassShape.dimensions = {0};
    NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, outputClassShape));

    outputBatchSplitShape.type = batchesShape.type;
    outputBatchSplitShape.dimensions = {0};
    NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, outputBatchSplitShape));
    return true;
}

bool execute(IOperationExecutionContext* context) {
    NNTRACE_TRANS("boxWithNMSLimit");
    // Bypass execution in the case of zero numRois.
    if (getSizeOfDimension(context->getInputShape(kScoreTensor), 0) == 0) return true;
    switch (context->getInputType(kScoreTensor)) {
        case OperandType::TENSOR_FLOAT16: {
            return boxWithNmsLimitFloat16(
                    context->getInputBuffer<_Float16>(kScoreTensor),
                    context->getInputShape(kScoreTensor),
                    context->getInputBuffer<_Float16>(kRoiTensor),
                    context->getInputShape(kRoiTensor),
                    context->getInputBuffer<int32_t>(kBatchesTensor),
                    context->getInputShape(kBatchesTensor),
                    context->getInputValue<_Float16>(kScoreThresholdScalar),
                    context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                    context->getInputValue<int32_t>(kNmsKernelScalar),
                    context->getInputValue<_Float16>(kIoUThresholdScalar),
                    context->getInputValue<_Float16>(kSigmaScalar),
                    context->getInputValue<_Float16>(kNmsScoreThresholdScalar),
                    context->getOutputBuffer<_Float16>(kOutputScoreTensor),
                    context->getOutputShape(kOutputScoreTensor),
                    context->getOutputBuffer<_Float16>(kOutputRoiTensor),
                    context->getOutputShape(kOutputRoiTensor),
                    context->getOutputBuffer<int32_t>(kOutputClassTensor),
                    context->getOutputShape(kOutputClassTensor),
                    context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                    context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_FLOAT32: {
            return boxWithNmsLimitFloat32(context->getInputBuffer<float>(kScoreTensor),
                                          context->getInputShape(kScoreTensor),
                                          context->getInputBuffer<float>(kRoiTensor),
                                          context->getInputShape(kRoiTensor),
                                          context->getInputBuffer<int32_t>(kBatchesTensor),
                                          context->getInputShape(kBatchesTensor),
                                          context->getInputValue<float>(kScoreThresholdScalar),
                                          context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                          context->getInputValue<int32_t>(kNmsKernelScalar),
                                          context->getInputValue<float>(kIoUThresholdScalar),
                                          context->getInputValue<float>(kSigmaScalar),
                                          context->getInputValue<float>(kNmsScoreThresholdScalar),
                                          context->getOutputBuffer<float>(kOutputScoreTensor),
                                          context->getOutputShape(kOutputScoreTensor),
                                          context->getOutputBuffer<float>(kOutputRoiTensor),
                                          context->getOutputShape(kOutputRoiTensor),
                                          context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                          context->getOutputShape(kOutputClassTensor),
                                          context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                          context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM: {
            return boxWithNmsLimitQuant(context->getInputBuffer<uint8_t>(kScoreTensor),
                                        context->getInputShape(kScoreTensor),
                                        context->getInputBuffer<uint16_t>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputValue<float>(kScoreThresholdScalar),
                                        context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                        context->getInputValue<int32_t>(kNmsKernelScalar),
                                        context->getInputValue<float>(kIoUThresholdScalar),
                                        context->getInputValue<float>(kSigmaScalar),
                                        context->getInputValue<float>(kNmsScoreThresholdScalar),
                                        context->getOutputBuffer<uint8_t>(kOutputScoreTensor),
                                        context->getOutputShape(kOutputScoreTensor),
                                        context->getOutputBuffer<uint16_t>(kOutputRoiTensor),
                                        context->getOutputShape(kOutputRoiTensor),
                                        context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                        context->getOutputShape(kOutputClassTensor),
                                        context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                        context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
            return boxWithNmsLimitQuant(context->getInputBuffer<int8_t>(kScoreTensor),
                                        context->getInputShape(kScoreTensor),
                                        context->getInputBuffer<uint16_t>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputValue<float>(kScoreThresholdScalar),
                                        context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                        context->getInputValue<int32_t>(kNmsKernelScalar),
                                        context->getInputValue<float>(kIoUThresholdScalar),
                                        context->getInputValue<float>(kSigmaScalar),
                                        context->getInputValue<float>(kNmsScoreThresholdScalar),
                                        context->getOutputBuffer<int8_t>(kOutputScoreTensor),
                                        context->getOutputShape(kOutputScoreTensor),
                                        context->getOutputBuffer<uint16_t>(kOutputRoiTensor),
                                        context->getOutputShape(kOutputRoiTensor),
                                        context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                        context->getOutputShape(kOutputClassTensor),
                                        context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                        context->getOutputShape(kOutputBatchesTensor), context);
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

}  // namespace box_with_nms_limit

namespace generate_proposals {

constexpr char kOperationName[] = "GENERATE_PROPOSALS";

919 constexpr uint32_t kNumInputs = 11;
920 constexpr uint32_t kScoreTensor = 0;
921 constexpr uint32_t kDeltaTensor = 1;
922 constexpr uint32_t kAnchorTensor = 2;
923 constexpr uint32_t kImageInfoTensor = 3;
924 constexpr uint32_t kHeightStrideSalar = 4;
925 constexpr uint32_t kWidthStrideScalar = 5;
926 constexpr uint32_t kPreNmsMaxScalar = 6;
927 constexpr uint32_t kPostNmsMaxScalar = 7;
928 constexpr uint32_t kIoUThresholdScalar = 8;
929 constexpr uint32_t kMinSizeScalar = 9;
930 constexpr uint32_t kLayoutScalar = 10;
931 
932 constexpr uint32_t kNumOutputs = 3;
933 constexpr uint32_t kOutputScoreTensor = 0;
934 constexpr uint32_t kOutputRoiTensor = 1;
935 constexpr uint32_t kOutputBatchesTensor = 2;
936 
937 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
938 namespace {
939 
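// Compacts |select| in place, keeping only the proposals whose width and
// height are strictly greater than minSize and whose center lies inside the
// image (imageInfoBase is read here as [image height, image width]).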
940 void filterBoxes(const float* roiBase, const float* imageInfoBase, float minSize,
941                  std::vector<uint32_t>* select) {
942     const uint32_t kRoiDim = 4;
943     uint32_t i = 0;
944     for (uint32_t j = 0; j < select->size(); j++) {
945         const float* roiInfo = roiBase + (*select)[j] * kRoiDim;
946         float roiWidth, roiHeight, xRoiCenter, yRoiCenter;
947         roiWidth = roiInfo[2] - roiInfo[0];
948         roiHeight = roiInfo[3] - roiInfo[1];
949         xRoiCenter = roiInfo[0] + roiWidth / 2.0f;
950         yRoiCenter = roiInfo[1] + roiHeight / 2.0f;
951         if (roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] &&
952             yRoiCenter < imageInfoBase[0]) {
953             (*select)[i++] = (*select)[j];
954         }
955     }
956     select->resize(i);
957 }
958 
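// Reference implementation of GENERATE_PROPOSALS for NHWC float32 data. Per
// batch it (1) tiles the anchors across every (h, w) cell of the score map,
// shifting them by h * heightStride and w * widthStride, (2) applies the
// predicted box deltas via bboxTransformFloat32, (3) keeps the preNmsTopN
// highest-scoring candidates, (4) drops boxes that are too small or centered
// outside the image, (5) runs class-agnostic hard NMS with iouThreshold,
// keeping at most postNmsTopN boxes, and (6) appends the surviving scores,
// ROIs, and batch indices to the output vectors.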
959 bool generateProposalsNhwcFloat32Compute(const float* scoresData, const Shape& scoresShape,
960                                          const float* bboxDeltasData, const Shape& bboxDeltasShape,
961                                          const float* anchorsData, const Shape& anchorsShape,
962                                          const float* imageInfoData, const Shape& imageInfoShape,
963                                          float heightStride, float widthStride, int32_t preNmsTopN,
964                                          int32_t postNmsTopN, float iouThreshold, float minSize,
965                                          std::vector<float>* scoresOutData,
966                                          std::vector<float>* roiOutData,
967                                          std::vector<int32_t>* batchesOutData) {
968     const uint32_t kRoiDim = 4;
969     uint32_t numBatches = getSizeOfDimension(scoresShape, 0);
970     uint32_t height = getSizeOfDimension(scoresShape, 1);
971     uint32_t width = getSizeOfDimension(scoresShape, 2);
972     uint32_t numAnchors = getSizeOfDimension(scoresShape, 3);
973     uint32_t imageInfoLength = getSizeOfDimension(imageInfoShape, 1);
974 
975     uint32_t batchSize = height * width * numAnchors;
976     uint32_t roiBufferSize = batchSize * kRoiDim;
977     std::vector<float> roiBuffer(roiBufferSize);
978     std::vector<float> roiTransformedBuffer(roiBufferSize);
979     scoresOutData->clear();
980     roiOutData->clear();
981     batchesOutData->clear();
982 
983     // Compute the roi region for each anchor.
984     float* roiBase = roiBuffer.data();
985     for (uint32_t h = 0; h < height; h++) {
986         float hShift = h * heightStride;
987         for (uint32_t w = 0; w < width; w++) {
988             const float* anchorsBase = anchorsData;
989             float wShift = w * widthStride;
990             for (uint32_t a = 0; a < numAnchors; a++, roiBase += kRoiDim, anchorsBase += kRoiDim) {
991                 roiBase[0] = anchorsBase[0] + wShift;
992                 roiBase[1] = anchorsBase[1] + hShift;
993                 roiBase[2] = anchorsBase[2] + wShift;
994                 roiBase[3] = anchorsBase[3] + hShift;
995             }
996         }
997     }
998 
999     const float* scoresBase = scoresData;
1000     const float* bboxDeltasBase = bboxDeltasData;
1001     const float* imageInfoBase = imageInfoData;
1002     // Fake single-batch shapes so bboxTransformFloat32 can be reused one batch at a time.
1003     Shape tempRoiShape = anchorsShape;
1004     tempRoiShape.dimensions = {batchSize, kRoiDim};
1005     Shape tempBBoxDeltasShape = bboxDeltasShape;
1006     tempBBoxDeltasShape.dimensions = {batchSize, kRoiDim};
1007     std::vector<int32_t> tempBatchSplitData(batchSize, 0);
1008     Shape tempbatchSplitShape = {.dimensions = {batchSize}};
1009     Shape tempImageInfoShape = imageInfoShape;
1010     tempImageInfoShape.dimensions = {1, imageInfoLength};
1011 
1012     for (uint32_t b = 0; b < numBatches; b++) {
1013         // Apply bboxDeltas to anchor locations.
1014         float tempImageInfo[] = {imageInfoBase[0], imageInfoBase[1]};
1015         if (!bboxTransformFloat32(roiBuffer.data(), tempRoiShape, bboxDeltasBase,
1016                                   tempBBoxDeltasShape, tempBatchSplitData.data(),
1017                                   tempbatchSplitShape, tempImageInfo, tempImageInfoShape,
1018                                   roiTransformedBuffer.data(), tempRoiShape)) {
1019             LOG(ERROR) << "BBoxTransform step failed in GENERATE_PROPOSALS op.";
1020             return false;
1021         }
1022 
1023         // Find the top preNmsTopN scores.
1024         std::vector<uint32_t> select(batchSize);
1025         std::iota(select.begin(), select.end(), 0);
1026         if (preNmsTopN > 0 && preNmsTopN < select.size()) {
1027             std::sort(select.begin(), select.end(),
1028                       [&scoresBase](const uint32_t lhs, const uint32_t rhs) {
1029                           return scoresBase[lhs] > scoresBase[rhs];
1030                       });
1031             select.resize(preNmsTopN);
1032         }
1033 
1034         // Filter boxes: discard regions with width or height <= minSize, or with center outside the image.
1035         filterBoxes(roiTransformedBuffer.data(), imageInfoBase, minSize, &select);
1036 
1037         // Apply hard NMS.
1038         uint32_t* selectEnd = box_with_nms_limit::hardNmsSingleClass(
1039                 scoresBase, iouThreshold, postNmsTopN,
1040                 [&roiTransformedBuffer](uint32_t ind) {
1041                     return roiTransformedBuffer.data() + ind * kRoiDim;
1042                 },
1043                 select.data(), select.size());
1044         uint32_t selectSize = selectEnd - select.data();
1045         select.resize(selectSize);
1046 
1047         // Write output.
1048         for (auto i : select) {
1049             roiOutData->insert(roiOutData->end(), roiTransformedBuffer.begin() + i * kRoiDim,
1050                                roiTransformedBuffer.begin() + (i + 1) * kRoiDim);
1051             scoresOutData->push_back(scoresBase[i]);
1052             batchesOutData->push_back(b);
1053         }
1054         scoresBase += batchSize;
1055         bboxDeltasBase += roiBufferSize;
1056         imageInfoBase += imageInfoLength;
1057     }
1058     return true;
1059 }
1060 
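// Layout-aware wrapper: converts NCHW scores and deltas to NHWC when useNchw
// is set (via InputWithLayout) and forwards to the NHWC kernel above.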
1061 bool generateProposalsFloat32Compute(const float* scoresData, const Shape& scoresShape,
1062                                      const float* bboxDeltasData, const Shape& bboxDeltasShape,
1063                                      const float* anchorsData, const Shape& anchorsShape,
1064                                      const float* imageInfoData, const Shape& imageInfoShape,
1065                                      float heightStride, float widthStride, int32_t preNmsTopN,
1066                                      int32_t postNmsTopN, float iouThreshold, float minSize,
1067                                      bool useNchw, std::vector<float>* scoresOutData,
1068                                      std::vector<float>* roiOutData,
1069                                      std::vector<int32_t>* batchesOutData) {
1070     InputWithLayout<float> score_nhwc(useNchw), delta_nhwc(useNchw);
1071     NN_RET_CHECK(score_nhwc.initialize(scoresData, scoresShape));
1072     NN_RET_CHECK(delta_nhwc.initialize(bboxDeltasData, bboxDeltasShape));
1073     return generateProposalsNhwcFloat32Compute(
1074             score_nhwc.getNhwcBuffer(), score_nhwc.getNhwcShape(), delta_nhwc.getNhwcBuffer(),
1075             delta_nhwc.getNhwcShape(), anchorsData, anchorsShape, imageInfoData, imageInfoShape,
1076             heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize,
1077             scoresOutData, roiOutData, batchesOutData);
1078 }
1079 
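// Float32 entry point: computes proposals into temporary vectors, resizes the
// three outputs to the number of surviving ROIs, then copies the scores,
// ROIs, and batch indices into the output buffers.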
1080 bool generateProposalsFloat32(const float* scoresData, const Shape& scoresShape,
1081                               const float* bboxDeltasData, const Shape& bboxDeltasShape,
1082                               const float* anchorsData, const Shape& anchorsShape,
1083                               const float* imageInfoData, const Shape& imageInfoShape,
1084                               float heightStride, float widthStride, int32_t preNmsTopN,
1085                               int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
1086                               IOperationExecutionContext* context) {
1087     std::vector<float> scoresOut_float32, roiOut_float32;
1088     std::vector<int32_t> batchesOut;
1089     NN_RET_CHECK(generateProposalsFloat32Compute(
1090             scoresData, scoresShape, bboxDeltasData, bboxDeltasShape, anchorsData, anchorsShape,
1091             imageInfoData, imageInfoShape, heightStride, widthStride, preNmsTopN, postNmsTopN,
1092             iouThreshold, minSize, useNchw, &scoresOut_float32, &roiOut_float32, &batchesOut));
1093 
1094     // Set output dimensions.
1095     uint32_t numOutRois = scoresOut_float32.size();
1096     if (numOutRois == 0) return true;
1097     Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
1098     scoresOutShape.dimensions = {numOutRois};
1099     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
1100     Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
1101     roiOutShape.dimensions = {numOutRois, 4};
1102     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
1103     Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
1104     batchesOutShape.dimensions = {numOutRois};
1105     NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
1106 
1107     // Write outputs.
1108     float* scoresOutData = context->getOutputBuffer<float>(kOutputScoreTensor);
1109     for (uint32_t i = 0; i < scoresOut_float32.size(); i++) {
1110         scoresOutData[i] = scoresOut_float32[i];
1111     }
1112     float* roiOutData = context->getOutputBuffer<float>(kOutputRoiTensor);
1113     for (uint32_t i = 0; i < roiOut_float32.size(); i++) {
1114         roiOutData[i] = roiOut_float32[i];
1115     }
1116     int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
1117     for (uint32_t i = 0; i < batchesOut.size(); i++) {
1118         batchesOutData[i] = batchesOut[i];
1119     }
1120     return true;
1121 }
1122 
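// Float16 entry point: converts every input to float32, reuses the float32
// computation, and converts the score and ROI results back to float16.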
1123 bool generateProposalsFloat16(const _Float16* scoresData, const Shape& scoresShape,
1124                               const _Float16* bboxDeltasData, const Shape& bboxDeltasShape,
1125                               const _Float16* anchorsData, const Shape& anchorsShape,
1126                               const _Float16* imageInfoData, const Shape& imageInfoShape,
1127                               float heightStride, float widthStride, int32_t preNmsTopN,
1128                               int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
1129                               IOperationExecutionContext* context) {
1130     std::vector<float> score_float32(getNumberOfElements(scoresShape));
1131     convertFloat16ToFloat32(scoresData, &score_float32);
1132     std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
1133     convertFloat16ToFloat32(bboxDeltasData, &delta_float32);
1134     std::vector<float> anchors_float32(getNumberOfElements(anchorsShape));
1135     convertFloat16ToFloat32(anchorsData, &anchors_float32);
1136     std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoShape));
1137     convertFloat16ToFloat32(imageInfoData, &imageInfo_float32);
1138     std::vector<float> scoresOut_float32, roiOut_float32;
1139     std::vector<int32_t> batchesOut;
1140     NN_RET_CHECK(generateProposalsFloat32Compute(
1141             score_float32.data(), scoresShape, delta_float32.data(), bboxDeltasShape,
1142             anchors_float32.data(), anchorsShape, imageInfo_float32.data(), imageInfoShape,
1143             heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize, useNchw,
1144             &scoresOut_float32, &roiOut_float32, &batchesOut));
1145 
1146     // Set output dimensions.
1147     uint32_t numOutRois = scoresOut_float32.size();
1148     if (numOutRois == 0) return true;
1149     Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
1150     scoresOutShape.dimensions = {numOutRois};
1151     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
1152     Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
1153     roiOutShape.dimensions = {numOutRois, 4};
1154     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
1155     Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
1156     batchesOutShape.dimensions = {numOutRois};
1157     NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
1158 
1159     // Write outputs.
1160     _Float16* scoresOutData = context->getOutputBuffer<_Float16>(kOutputScoreTensor);
1161     convertFloat32ToFloat16(scoresOut_float32, scoresOutData);
1162     _Float16* roiOutData = context->getOutputBuffer<_Float16>(kOutputRoiTensor);
1163     convertFloat32ToFloat16(roiOut_float32, roiOutData);
1164     int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
1165     for (uint32_t i = 0; i < batchesOut.size(); i++) {
1166         batchesOutData[i] = batchesOut[i];
1167     }
1168     return true;
1169 }
1170 
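// Quantized entry point (asymmetric uint8/int8 scores and deltas): dequantizes
// all inputs to float32, reuses the float32 computation, then requantizes the
// score and ROI outputs; batch indices are written as plain int32.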
1171 template <typename T_8QInput>
1172 bool generateProposalsQuant(const T_8QInput* scoresData, const Shape& scoresShape,
1173                             const T_8QInput* bboxDeltasData, const Shape& bboxDeltasShape,
1174                             const int16_t* anchorsData, const Shape& anchorsShape,
1175                             const uint16_t* imageInfoData, const Shape& imageInfoShape,
1176                             float heightStride, float widthStride, int32_t preNmsTopN,
1177                             int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
1178                             IOperationExecutionContext* context) {
1179     std::vector<float> score_float32(getNumberOfElements(scoresShape));
1180     convertQuantToFloat32<T_8QInput>(scoresData, scoresShape.scale, scoresShape.offset,
1181                                      &score_float32);
1182     std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
1183     convertQuantToFloat32<T_8QInput>(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
1184                                      &delta_float32);
1185     std::vector<float> anchors_float32(getNumberOfElements(anchorsShape));
1186     convertQuantToFloat32(anchorsData, anchorsShape.scale, anchorsShape.offset, &anchors_float32);
1187     std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoShape));
1188     convertQuantToFloat32(imageInfoData, imageInfoShape.scale, imageInfoShape.offset,
1189                           &imageInfo_float32);
1190     std::vector<float> scoresOut_float32, roiOut_float32;
1191     std::vector<int32_t> batchesOut;
1192     NN_RET_CHECK(generateProposalsFloat32Compute(
1193             score_float32.data(), scoresShape, delta_float32.data(), bboxDeltasShape,
1194             anchors_float32.data(), anchorsShape, imageInfo_float32.data(), imageInfoShape,
1195             heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize, useNchw,
1196             &scoresOut_float32, &roiOut_float32, &batchesOut));
1197 
1198     // Set output dimensions.
1199     uint32_t numOutRois = scoresOut_float32.size();
1200     if (numOutRois == 0) return true;
1201     Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
1202     scoresOutShape.dimensions = {numOutRois};
1203     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
1204     Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
1205     roiOutShape.dimensions = {numOutRois, 4};
1206     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
1207     Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
1208     batchesOutShape.dimensions = {numOutRois};
1209     NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
1210 
1211     // Write outputs.
1212     T_8QInput* scoresOutData = context->getOutputBuffer<T_8QInput>(kOutputScoreTensor);
1213     convertFloat32ToQuant<T_8QInput>(scoresOut_float32, scoresOutShape.scale, scoresOutShape.offset,
1214                                      scoresOutData);
1215     uint16_t* roiOutData = context->getOutputBuffer<uint16_t>(kOutputRoiTensor);
1216     convertFloat32ToQuant(roiOut_float32, roiOutShape.scale, roiOutShape.offset, roiOutData);
1217     int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
1218     for (uint32_t i = 0; i < batchesOut.size(); i++) {
1219         batchesOutData[i] = batchesOut[i];
1220     }
1221     return true;
1222 }
1223 
1224 }  // namespace
1225 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
1226 
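// Validates operand counts and the per-type input/output signatures, and
// reports the minimum NNAPI version required for the given input type.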
1227 Result<Version> validate(const IOperationValidationContext* context) {
1228     NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
1229     NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
1230     std::vector<OperandType> inExpectedTypes;
1231     std::vector<OperandType> outExpectedTypes;
1232     auto inputType = context->getInputType(kScoreTensor);
1233     if (inputType == OperandType::TENSOR_FLOAT16) {
1234         inExpectedTypes = {OperandType::TENSOR_FLOAT16,
1235                            OperandType::TENSOR_FLOAT16,
1236                            OperandType::TENSOR_FLOAT16,
1237                            OperandType::TENSOR_FLOAT16,
1238                            OperandType::FLOAT16,
1239                            OperandType::FLOAT16,
1240                            OperandType::INT32,
1241                            OperandType::INT32,
1242                            OperandType::FLOAT16,
1243                            OperandType::FLOAT16,
1244                            OperandType::BOOL};
1245         outExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
1246                             OperandType::TENSOR_INT32};
1247     } else if (inputType == OperandType::TENSOR_FLOAT32) {
1248         inExpectedTypes = {OperandType::TENSOR_FLOAT32,
1249                            OperandType::TENSOR_FLOAT32,
1250                            OperandType::TENSOR_FLOAT32,
1251                            OperandType::TENSOR_FLOAT32,
1252                            OperandType::FLOAT32,
1253                            OperandType::FLOAT32,
1254                            OperandType::INT32,
1255                            OperandType::INT32,
1256                            OperandType::FLOAT32,
1257                            OperandType::FLOAT32,
1258                            OperandType::BOOL};
1259         outExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
1260                             OperandType::TENSOR_INT32};
1261     } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM ||
1262                inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1263         inExpectedTypes = {inputType,
1264                            inputType,
1265                            OperandType::TENSOR_QUANT16_SYMM,
1266                            OperandType::TENSOR_QUANT16_ASYMM,
1267                            OperandType::FLOAT32,
1268                            OperandType::FLOAT32,
1269                            OperandType::INT32,
1270                            OperandType::INT32,
1271                            OperandType::FLOAT32,
1272                            OperandType::FLOAT32,
1273                            OperandType::BOOL};
1274         outExpectedTypes = {inputType, OperandType::TENSOR_QUANT16_ASYMM,
1275                             OperandType::TENSOR_INT32};
1276     } else {
1277         NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1278     }
1279     NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
1280     NN_RET_CHECK(validateOutputTypes(context, outExpectedTypes));
1281     if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1282         return Version::ANDROID_R;
1283     } else {
1284         return Version::ANDROID_Q;
1285     }
1286 }
1287 
1288 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
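// Checks input ranks and dimension consistency. The output shapes are set to
// zero ROIs here because the number of surviving proposals is only known at
// execution time; the execute path resizes them once results are available.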
1289 bool prepare(IOperationExecutionContext* context) {
1290     bool useNchw = context->getInputValue<bool>(kLayoutScalar);
1291     Shape scoreShape = context->getInputShape(kScoreTensor);
1292     Shape bboxDeltasShape = context->getInputShape(kDeltaTensor);
1293     Shape anchorsShape = context->getInputShape(kAnchorTensor);
1294     Shape imageInfoDataShape = context->getInputShape(kImageInfoTensor);
1295     Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
1296     Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
1297     Shape outputBatchSplitShape = context->getOutputShape(kOutputBatchesTensor);
1298 
1299     NN_RET_CHECK_EQ(getNumberOfDimensions(scoreShape), 4);
1300     NN_RET_CHECK_EQ(getNumberOfDimensions(bboxDeltasShape), 4);
1301     NN_RET_CHECK_EQ(getNumberOfDimensions(anchorsShape), 2);
1302     NN_RET_CHECK_EQ(getNumberOfDimensions(imageInfoDataShape), 2);
1303 
1304     const uint32_t kRoiDim = 4;
1305     uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
1306     uint32_t height = getSizeOfDimension(scoreShape, useNchw ? 2 : 1);
1307     uint32_t width = getSizeOfDimension(scoreShape, useNchw ? 3 : 2);
1308     uint32_t numAnchors = getSizeOfDimension(scoreShape, useNchw ? 1 : 3);
1309 
1310     NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 0), numBatches);
1311     NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 2 : 1), height);
1312     NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 3 : 2), width);
1313     NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 1 : 3), numAnchors * kRoiDim);
1314     NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoDataShape, 0), numBatches);
1315     NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoDataShape, 1), 2);
1316     NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 0), numAnchors);
1317     NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 1), kRoiDim);
1318 
1319     if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
1320         NN_RET_CHECK_EQ(anchorsShape.scale, 0.125f);
1321         NN_RET_CHECK_EQ(imageInfoDataShape.scale, 0.125f);
1322         NN_RET_CHECK_EQ(imageInfoDataShape.offset, 0);
1323     }
1324 
1325     outputScoreShape.type = scoreShape.type;
1326     outputScoreShape.dimensions = {0};
1327     outputScoreShape.scale = scoreShape.scale;
1328     outputScoreShape.offset = scoreShape.offset;
1329     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));
1330 
1331     outputRoiShape.dimensions = {0, 4};
1332     if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
1333         outputRoiShape.scale = 0.125f;
1334         outputRoiShape.offset = 0;
1335     }
1336     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));
1337 
1338     outputBatchSplitShape.dimensions = {0};
1339     NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, outputBatchSplitShape));
1340     return true;
1341 }
1342 
1343 bool execute(IOperationExecutionContext* context) {
1344     NNTRACE_TRANS("generateProposals");
1345     switch (context->getInputType(kScoreTensor)) {
1346         case OperandType::TENSOR_FLOAT16: {
1347             return generateProposalsFloat16(context->getInputBuffer<_Float16>(kScoreTensor),
1348                                             context->getInputShape(kScoreTensor),
1349                                             context->getInputBuffer<_Float16>(kDeltaTensor),
1350                                             context->getInputShape(kDeltaTensor),
1351                                             context->getInputBuffer<_Float16>(kAnchorTensor),
1352                                             context->getInputShape(kAnchorTensor),
1353                                             context->getInputBuffer<_Float16>(kImageInfoTensor),
1354                                             context->getInputShape(kImageInfoTensor),
1355                                             context->getInputValue<_Float16>(kHeightStrideSalar),
1356                                             context->getInputValue<_Float16>(kWidthStrideScalar),
1357                                             context->getInputValue<int32_t>(kPreNmsMaxScalar),
1358                                             context->getInputValue<int32_t>(kPostNmsMaxScalar),
1359                                             context->getInputValue<_Float16>(kIoUThresholdScalar),
1360                                             context->getInputValue<_Float16>(kMinSizeScalar),
1361                                             context->getInputValue<bool>(kLayoutScalar), context);
1362         }
1363         case OperandType::TENSOR_FLOAT32: {
1364             return generateProposalsFloat32(context->getInputBuffer<float>(kScoreTensor),
1365                                             context->getInputShape(kScoreTensor),
1366                                             context->getInputBuffer<float>(kDeltaTensor),
1367                                             context->getInputShape(kDeltaTensor),
1368                                             context->getInputBuffer<float>(kAnchorTensor),
1369                                             context->getInputShape(kAnchorTensor),
1370                                             context->getInputBuffer<float>(kImageInfoTensor),
1371                                             context->getInputShape(kImageInfoTensor),
1372                                             context->getInputValue<float>(kHeightStrideSalar),
1373                                             context->getInputValue<float>(kWidthStrideScalar),
1374                                             context->getInputValue<int32_t>(kPreNmsMaxScalar),
1375                                             context->getInputValue<int32_t>(kPostNmsMaxScalar),
1376                                             context->getInputValue<float>(kIoUThresholdScalar),
1377                                             context->getInputValue<float>(kMinSizeScalar),
1378                                             context->getInputValue<bool>(kLayoutScalar), context);
1379         }
1380         case OperandType::TENSOR_QUANT8_ASYMM: {
1381             return generateProposalsQuant(context->getInputBuffer<uint8_t>(kScoreTensor),
1382                                           context->getInputShape(kScoreTensor),
1383                                           context->getInputBuffer<uint8_t>(kDeltaTensor),
1384                                           context->getInputShape(kDeltaTensor),
1385                                           context->getInputBuffer<int16_t>(kAnchorTensor),
1386                                           context->getInputShape(kAnchorTensor),
1387                                           context->getInputBuffer<uint16_t>(kImageInfoTensor),
1388                                           context->getInputShape(kImageInfoTensor),
1389                                           context->getInputValue<float>(kHeightStrideSalar),
1390                                           context->getInputValue<float>(kWidthStrideScalar),
1391                                           context->getInputValue<int32_t>(kPreNmsMaxScalar),
1392                                           context->getInputValue<int32_t>(kPostNmsMaxScalar),
1393                                           context->getInputValue<float>(kIoUThresholdScalar),
1394                                           context->getInputValue<float>(kMinSizeScalar),
1395                                           context->getInputValue<bool>(kLayoutScalar), context);
1396         }
1397         case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1398             return generateProposalsQuant(context->getInputBuffer<int8_t>(kScoreTensor),
1399                                           context->getInputShape(kScoreTensor),
1400                                           context->getInputBuffer<int8_t>(kDeltaTensor),
1401                                           context->getInputShape(kDeltaTensor),
1402                                           context->getInputBuffer<int16_t>(kAnchorTensor),
1403                                           context->getInputShape(kAnchorTensor),
1404                                           context->getInputBuffer<uint16_t>(kImageInfoTensor),
1405                                           context->getInputShape(kImageInfoTensor),
1406                                           context->getInputValue<float>(kHeightStrideSalar),
1407                                           context->getInputValue<float>(kWidthStrideScalar),
1408                                           context->getInputValue<int32_t>(kPreNmsMaxScalar),
1409                                           context->getInputValue<int32_t>(kPostNmsMaxScalar),
1410                                           context->getInputValue<float>(kIoUThresholdScalar),
1411                                           context->getInputValue<float>(kMinSizeScalar),
1412                                           context->getInputValue<bool>(kLayoutScalar), context);
1413         }
1414         default:
1415             NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1416     }
1417 }
1418 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
1419 
1420 }  // namespace generate_proposals
1421 
1422 namespace detection_postprocess {
1423 
1424 constexpr char kOperationName[] = "DETECTION_POSTPROCESS";
1425 
1426 constexpr uint32_t kNumInputs = 14;
1427 constexpr uint32_t kScoreTensor = 0;
1428 constexpr uint32_t kDeltaTensor = 1;
1429 constexpr uint32_t kAnchorTensor = 2;
1430 constexpr uint32_t kScaleYScalar = 3;
1431 constexpr uint32_t kScaleXScalar = 4;
1432 constexpr uint32_t kScaleHScalar = 5;
1433 constexpr uint32_t kScaleWScalar = 6;
1434 constexpr uint32_t kUseRegularNmsScalar = 7;
1435 constexpr uint32_t kMaxNumDetectionScalar = 8;
1436 constexpr uint32_t kMaxClassesPerDetectionScalar = 9;
1437 constexpr uint32_t kMaxNumDetectionPerClassScalar = 10;
1438 constexpr uint32_t kScoreThresholdScalar = 11;
1439 constexpr uint32_t kIoUThresholdScalar = 12;
1440 constexpr uint32_t kIsBGInLabelScalar = 13;
1441 
1442 constexpr uint32_t kNumOutputs = 4;
1443 constexpr uint32_t kOutputScoreTensor = 0;
1444 constexpr uint32_t kOutputRoiTensor = 1;
1445 constexpr uint32_t kOutputClassTensor = 2;
1446 constexpr uint32_t kOutputDetectionTensor = 3;
1447 
1448 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
1449 namespace {
1450 
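// CPU reference for DETECTION_POSTPROCESSING. Decodes each box encoding
// [dy, dx, dh, dw] against its anchor (read here as [ctr_y, ctr_x, h, w])
// into corner form [y1, x1, y2, x2], then applies either regular multi-class
// NMS (hardNmsMultiClass) or the fast path: class-agnostic NMS on each
// anchor's max score over classes 1..numClasses-1, followed by emitting the
// top-scoring classes for every kept box.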
1451 bool detectionPostprocessFloat32(
1452         const float* scoreData, const Shape& scoreShape, const float* deltaData,
1453         const Shape& deltaShape, const float* anchorData, const Shape& anchorShape, float scaleY,
1454         float scaleX, float scaleH, float scaleW, bool useRegularNms, int32_t maxNumDetections,
1455         int32_t maxClassesPerDetection, int32_t maxNumDetectionsPerClass, float iouThreshold,
1456         float scoreThreshold, bool isBGInLabel, float* scoreOutData, const Shape& scoreOutShape,
1457         float* roiOutData, const Shape& roiOutShape, int32_t* classOutData,
1458         const Shape& classOutShape, int32_t* detectionOutData, const Shape& detectionOutShape) {
1459     const uint32_t kRoiDim = 4;
1460     uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
1461     uint32_t numAnchors = getSizeOfDimension(scoreShape, 1);
1462     uint32_t numClasses = getSizeOfDimension(scoreShape, 2);
1463     uint32_t lengthBoxEncoding = getSizeOfDimension(deltaShape, 2);
1464     uint32_t numOutDetection = getSizeOfDimension(scoreOutShape, 1);
1465 
1466     memset(scoreOutData, 0, getNumberOfElements(scoreOutShape) * sizeof(float));
1467     memset(roiOutData, 0, getNumberOfElements(roiOutShape) * sizeof(float));
1468     memset(classOutData, 0, getNumberOfElements(classOutShape) * sizeof(int32_t));
1469     memset(detectionOutData, 0, getNumberOfElements(detectionOutShape) * sizeof(int32_t));
1470 
1471     const float* scoreBase = scoreData;
1472     const float* deltaBase = deltaData;
1473     float* scoreOutBase = scoreOutData;
1474     float* roiOutBase = roiOutData;
1475     int32_t* classOutBase = classOutData;
1476     std::vector<float> roiBuffer(numAnchors * kRoiDim);
1477     std::vector<float> scoreBuffer(numAnchors);
1478     for (uint32_t b = 0; b < numBatches; b++) {
1479         const float* anchorBase = anchorData;
1480         for (uint32_t a = 0; a < numAnchors; a++) {
1481             float yCtr = anchorBase[0] + anchorBase[2] * deltaBase[0] / scaleY;
1482             float xCtr = anchorBase[1] + anchorBase[3] * deltaBase[1] / scaleX;
1483             float hHalf = anchorBase[2] * std::exp(deltaBase[2] / scaleH) * 0.5f;
1484             float wHalf = anchorBase[3] * std::exp(deltaBase[3] / scaleW) * 0.5f;
1485             roiBuffer[a * kRoiDim] = yCtr - hHalf;
1486             roiBuffer[a * kRoiDim + 1] = xCtr - wHalf;
1487             roiBuffer[a * kRoiDim + 2] = yCtr + hHalf;
1488             roiBuffer[a * kRoiDim + 3] = xCtr + wHalf;
1489             anchorBase += kRoiDim;
1490             deltaBase += lengthBoxEncoding;
1491         }
1492 
1493         if (useRegularNms) {
1494             std::vector<uint32_t> select;
1495             box_with_nms_limit::hardNmsMultiClass(
1496                     scoreBase, numClasses, numAnchors, scoreThreshold, iouThreshold,
1497                     maxNumDetections, maxNumDetectionsPerClass,
1498                     [&roiBuffer, numClasses](uint32_t ind) {
1499                         return roiBuffer.data() + (ind / numClasses) * kRoiDim;
1500                     },
1501                     &select);
1502             for (uint32_t i = 0; i < select.size(); i++) {
1503                 uint32_t ind = select[i];
1504                 scoreOutBase[i] = scoreBase[ind];
1505                 memcpy(roiOutBase + i * kRoiDim, &roiBuffer[(ind / numClasses) * kRoiDim],
1506                        kRoiDim * sizeof(float));
1507                 classOutBase[i] = (ind % numClasses) - (isBGInLabel ? 0 : 1);
1508             }
1509             *detectionOutData++ = select.size();
1510         } else {
1511             uint32_t numOutClasses = std::min<uint32_t>(numClasses - 1, maxClassesPerDetection);
1512             std::vector<float> maxScores(numAnchors);
1513             for (uint32_t a = 0; a < numAnchors; a++) {
1514                 maxScores[a] = *std::max_element(scoreBase + a * numClasses + 1,
1515                                                  scoreBase + (a + 1) * numClasses);
1516             }
1517             std::vector<uint32_t> select;
1518             for (uint32_t a = 0; a < numAnchors; a++) {
1519                 if (maxScores[a] > scoreThreshold) {
1520                     select.push_back(a);
1521                 }
1522             }
1523             uint32_t* selectEnd = box_with_nms_limit::hardNmsSingleClass(
1524                     maxScores.data(), iouThreshold, maxNumDetections,
1525                     [&roiBuffer](uint32_t ind) { return roiBuffer.data() + ind * kRoiDim; },
1526                     select.data(), select.size());
1527             select.resize(selectEnd - select.data());
1528             float* scoreOutPtr = scoreOutBase;
1529             float* roiOutPtr = roiOutBase;
1530             int32_t* classOutPtr = classOutBase;
1531             for (auto i : select) {
1532                 const float* score = scoreBase + i * numClasses;
1533                 std::vector<uint32_t> scoreInds(numClasses - 1);
1534                 std::iota(scoreInds.begin(), scoreInds.end(), 1);
1535                 std::sort(scoreInds.begin(), scoreInds.end(),
1536                           [&score](const uint32_t lhs, const uint32_t rhs) {
1537                               return score[lhs] > score[rhs];
1538                           });
1539                 for (uint32_t c = 0; c < numOutClasses; c++) {
1540                     *scoreOutPtr++ = score[scoreInds[c]];
1541                     memcpy(roiOutPtr, &roiBuffer[i * kRoiDim], kRoiDim * sizeof(float));
1542                     roiOutPtr += kRoiDim;
1543                     *classOutPtr++ = scoreInds[c] - (isBGInLabel ? 0 : 1);
1544                 }
1545             }
1546             *detectionOutData++ = select.size() * numOutClasses;
1547         }
1548         scoreBase += numAnchors * numClasses;
1549         scoreOutBase += numOutDetection;
1550         roiOutBase += numOutDetection * kRoiDim;
1551         classOutBase += numOutDetection;
1552     }
1553     return true;
1554 }
1555 
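// Float16 wrapper: converts inputs to float32, reuses the float32 kernel, and
// converts the score and ROI outputs back to float16.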
1556 bool detectionPostprocessFloat16(
1557         const _Float16* scoreData, const Shape& scoreShape, const _Float16* deltaData,
1558         const Shape& deltaShape, const _Float16* anchorData, const Shape& anchorShape, float scaleY,
1559         float scaleX, float scaleH, float scaleW, bool useRegularNms, int32_t maxNumDetections,
1560         int32_t maxClassesPerDetection, int32_t maxNumDetectionsPerClass, float iouThreshold,
1561         float scoreThreshold, bool isBGInLabel, _Float16* scoreOutData, const Shape& scoreOutShape,
1562         _Float16* roiOutData, const Shape& roiOutShape, int32_t* classOutData,
1563         const Shape& classOutShape, int32_t* detectionOutData, const Shape& detectionOutShape) {
1564     std::vector<float> scores_float32(getNumberOfElements(scoreShape));
1565     convertFloat16ToFloat32(scoreData, &scores_float32);
1566     std::vector<float> delta_float32(getNumberOfElements(deltaShape));
1567     convertFloat16ToFloat32(deltaData, &delta_float32);
1568     std::vector<float> anchor_float32(getNumberOfElements(anchorShape));
1569     convertFloat16ToFloat32(anchorData, &anchor_float32);
1570     std::vector<float> outputScore_float32(getNumberOfElements(scoreOutShape));
1571     std::vector<float> outputRoi_float32(getNumberOfElements(roiOutShape));
1572     NN_RET_CHECK(detectionPostprocessFloat32(
1573             scores_float32.data(), scoreShape, delta_float32.data(), deltaShape,
1574             anchor_float32.data(), anchorShape, scaleY, scaleX, scaleH, scaleW, useRegularNms,
1575             maxNumDetections, maxClassesPerDetection, maxNumDetectionsPerClass, iouThreshold,
1576             scoreThreshold, isBGInLabel, outputScore_float32.data(), scoreOutShape,
1577             outputRoi_float32.data(), roiOutShape, classOutData, classOutShape, detectionOutData,
1578             detectionOutShape));
1579     convertFloat32ToFloat16(outputScore_float32, scoreOutData);
1580     convertFloat32ToFloat16(outputRoi_float32, roiOutData);
1581     return true;
1582 }
1583 
1584 }  // namespace
1585 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
1586 
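// Validates operand counts and types; only float16 and float32 signatures are
// accepted, and the operation is reported as available from Android Q.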
1587 Result<Version> validate(const IOperationValidationContext* context) {
1588     NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
1589     NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
1590     std::vector<OperandType> inExpectedTypes;
1591     std::vector<OperandType> outExpectedTypes;
1592     auto inputType = context->getInputType(kScoreTensor);
1593     if (inputType == OperandType::TENSOR_FLOAT16) {
1594         inExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
1595                            OperandType::TENSOR_FLOAT16, OperandType::FLOAT16,
1596                            OperandType::FLOAT16,        OperandType::FLOAT16,
1597                            OperandType::FLOAT16,        OperandType::BOOL,
1598                            OperandType::INT32,          OperandType::INT32,
1599                            OperandType::INT32,          OperandType::FLOAT16,
1600                            OperandType::FLOAT16,        OperandType::BOOL};
1601     } else if (inputType == OperandType::TENSOR_FLOAT32) {
1602         inExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
1603                            OperandType::TENSOR_FLOAT32, OperandType::FLOAT32,
1604                            OperandType::FLOAT32,        OperandType::FLOAT32,
1605                            OperandType::FLOAT32,        OperandType::BOOL,
1606                            OperandType::INT32,          OperandType::INT32,
1607                            OperandType::INT32,          OperandType::FLOAT32,
1608                            OperandType::FLOAT32,        OperandType::BOOL};
1609     } else {
1610         NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1611     }
1612     NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
1613     NN_RET_CHECK(validateOutputTypes(
1614             context, {inputType, inputType, OperandType::TENSOR_INT32, OperandType::TENSOR_INT32}));
1615     return Version::ANDROID_Q;
1616 }
1617 
1618 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
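// Checks ranks, dimension consistency, and scalar ranges, then sizes the
// outputs up front: each batch always gets maxNumDetections slots (times
// maxClassesPerDetection on the fast path), zero-padded at execution time.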
1619 bool prepare(IOperationExecutionContext* context) {
1620     Shape scoreShape = context->getInputShape(kScoreTensor);
1621     Shape deltasShape = context->getInputShape(kDeltaTensor);
1622     Shape anchorsShape = context->getInputShape(kAnchorTensor);
1623     Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
1624     Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
1625     Shape outputClassShape = context->getOutputShape(kOutputClassTensor);
1626     Shape outputDetectionShape = context->getOutputShape(kOutputDetectionTensor);
1627 
1628     NN_RET_CHECK_EQ(getNumberOfDimensions(scoreShape), 3);
1629     NN_RET_CHECK_EQ(getNumberOfDimensions(deltasShape), 3);
1630     NN_RET_CHECK_EQ(getNumberOfDimensions(anchorsShape), 2);
1631 
1632     const uint32_t kRoiDim = 4;
1633     uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
1634     uint32_t numAnchors = getSizeOfDimension(scoreShape, 1);
1635     uint32_t numClasses = getSizeOfDimension(scoreShape, 2);
1636     uint32_t lengthBoxEncoding = getSizeOfDimension(deltasShape, 2);
1637     uint32_t maxNumDetections = context->getInputValue<int32_t>(kMaxNumDetectionScalar);
1638     uint32_t maxClassesPerDetection =
1639             context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar);
1640     uint32_t numOutDetections = maxNumDetections;
1641 
1642     NN_RET_CHECK_EQ(getSizeOfDimension(deltasShape, 0), numBatches);
1643     NN_RET_CHECK_EQ(getSizeOfDimension(deltasShape, 1), numAnchors);
1644     NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 0), numAnchors);
1645     NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 1), kRoiDim);
1646 
1647     if (scoreShape.type == OperandType::TENSOR_FLOAT32) {
1648         NN_RET_CHECK_GT(context->getInputValue<float>(kScaleYScalar), 0);
1649         NN_RET_CHECK_GT(context->getInputValue<float>(kScaleXScalar), 0);
1650         NN_RET_CHECK_GT(context->getInputValue<float>(kScaleHScalar), 0);
1651         NN_RET_CHECK_GT(context->getInputValue<float>(kScaleWScalar), 0);
1652         NN_RET_CHECK_GE(context->getInputValue<float>(kScoreThresholdScalar), 0);
1653         NN_RET_CHECK_GE(context->getInputValue<float>(kIoUThresholdScalar), 0);
1654     } else if (scoreShape.type == OperandType::TENSOR_FLOAT16) {
1655         NN_RET_CHECK(context->getInputValue<_Float16>(kScaleYScalar) > 0);
1656         NN_RET_CHECK(context->getInputValue<_Float16>(kScaleXScalar) > 0);
1657         NN_RET_CHECK(context->getInputValue<_Float16>(kScaleHScalar) > 0);
1658         NN_RET_CHECK(context->getInputValue<_Float16>(kScaleWScalar) > 0);
1659         NN_RET_CHECK(context->getInputValue<_Float16>(kScoreThresholdScalar) >= 0);
1660         NN_RET_CHECK(context->getInputValue<_Float16>(kIoUThresholdScalar) >= 0);
1661     } else {
1662         NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1663     }
1664     NN_RET_CHECK_GT(numClasses, 1);
1665     NN_RET_CHECK_GE(lengthBoxEncoding, 4);
1666     NN_RET_CHECK_GT(maxNumDetections, 0);
1667     if (context->getInputValue<bool>(kUseRegularNmsScalar)) {
1668         NN_RET_CHECK_GT(context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar), 0);
1669     } else {
1670         NN_RET_CHECK_GT(maxClassesPerDetection, 0);
1671         numOutDetections *= maxClassesPerDetection;
1672     }
1673 
1674     outputScoreShape.type = scoreShape.type;
1675     outputScoreShape.dimensions = {numBatches, numOutDetections};
1676     NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));
1677 
1678     outputRoiShape.type = anchorsShape.type;
1679     outputRoiShape.dimensions = {numBatches, numOutDetections, 4};
1680     NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));
1681 
1682     outputClassShape.type = OperandType::TENSOR_INT32;
1683     outputClassShape.dimensions = {numBatches, numOutDetections};
1684     NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, outputClassShape));
1685 
1686     outputDetectionShape.type = OperandType::TENSOR_INT32;
1687     outputDetectionShape.dimensions = {numBatches};
1688     NN_RET_CHECK(context->setOutputShape(kOutputDetectionTensor, outputDetectionShape));
1689     return true;
1690 }
1691 
1692 bool execute(IOperationExecutionContext* context) {
1693     NNTRACE_TRANS("detectionPostProcess");
1694     switch (context->getInputType(kScoreTensor)) {
1695         case OperandType::TENSOR_FLOAT16: {
1696             return detectionPostprocessFloat16(
1697                     context->getInputBuffer<_Float16>(kScoreTensor),
1698                     context->getInputShape(kScoreTensor),
1699                     context->getInputBuffer<_Float16>(kDeltaTensor),
1700                     context->getInputShape(kDeltaTensor),
1701                     context->getInputBuffer<_Float16>(kAnchorTensor),
1702                     context->getInputShape(kAnchorTensor),
1703                     context->getInputValue<_Float16>(kScaleYScalar),
1704                     context->getInputValue<_Float16>(kScaleXScalar),
1705                     context->getInputValue<_Float16>(kScaleHScalar),
1706                     context->getInputValue<_Float16>(kScaleWScalar),
1707                     context->getInputValue<bool>(kUseRegularNmsScalar),
1708                     context->getInputValue<int32_t>(kMaxNumDetectionScalar),
1709                     context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar),
1710                     context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar),
1711                     context->getInputValue<_Float16>(kIoUThresholdScalar),
1712                     context->getInputValue<_Float16>(kScoreThresholdScalar),
1713                     context->getInputValue<bool>(kIsBGInLabelScalar),
1714                     context->getOutputBuffer<_Float16>(kOutputScoreTensor),
1715                     context->getOutputShape(kOutputScoreTensor),
1716                     context->getOutputBuffer<_Float16>(kOutputRoiTensor),
1717                     context->getOutputShape(kOutputRoiTensor),
1718                     context->getOutputBuffer<int32_t>(kOutputClassTensor),
1719                     context->getOutputShape(kOutputClassTensor),
1720                     context->getOutputBuffer<int32_t>(kOutputDetectionTensor),
1721                     context->getOutputShape(kOutputDetectionTensor));
1722         }
1723         case OperandType::TENSOR_FLOAT32: {
1724             return detectionPostprocessFloat32(
1725                     context->getInputBuffer<float>(kScoreTensor),
1726                     context->getInputShape(kScoreTensor),
1727                     context->getInputBuffer<float>(kDeltaTensor),
1728                     context->getInputShape(kDeltaTensor),
1729                     context->getInputBuffer<float>(kAnchorTensor),
1730                     context->getInputShape(kAnchorTensor),
1731                     context->getInputValue<float>(kScaleYScalar),
1732                     context->getInputValue<float>(kScaleXScalar),
1733                     context->getInputValue<float>(kScaleHScalar),
1734                     context->getInputValue<float>(kScaleWScalar),
1735                     context->getInputValue<bool>(kUseRegularNmsScalar),
1736                     context->getInputValue<int32_t>(kMaxNumDetectionScalar),
1737                     context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar),
1738                     context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar),
1739                     context->getInputValue<float>(kIoUThresholdScalar),
1740                     context->getInputValue<float>(kScoreThresholdScalar),
1741                     context->getInputValue<bool>(kIsBGInLabelScalar),
1742                     context->getOutputBuffer<float>(kOutputScoreTensor),
1743                     context->getOutputShape(kOutputScoreTensor),
1744                     context->getOutputBuffer<float>(kOutputRoiTensor),
1745                     context->getOutputShape(kOutputRoiTensor),
1746                     context->getOutputBuffer<int32_t>(kOutputClassTensor),
1747                     context->getOutputShape(kOutputClassTensor),
1748                     context->getOutputBuffer<int32_t>(kOutputDetectionTensor),
1749                     context->getOutputShape(kOutputDetectionTensor));
1750         }
1751         default:
1752             NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1753     }
1754 }
1755 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
1756 
1757 }  // namespace detection_postprocess
1758 
1759 }  // namespace bbox_ops
1760 
1761 NN_REGISTER_OPERATION(AXIS_ALIGNED_BBOX_TRANSFORM,
1762                       bbox_ops::axis_aligned_bbox_transform::kOperationName,
1763                       bbox_ops::axis_aligned_bbox_transform::validate,
1764                       bbox_ops::axis_aligned_bbox_transform::prepare,
1765                       bbox_ops::axis_aligned_bbox_transform::execute, .allowZeroSizedInput = true);
1766 
1767 NN_REGISTER_OPERATION(BOX_WITH_NMS_LIMIT, bbox_ops::box_with_nms_limit::kOperationName,
1768                       bbox_ops::box_with_nms_limit::validate, bbox_ops::box_with_nms_limit::prepare,
1769                       bbox_ops::box_with_nms_limit::execute, .allowZeroSizedInput = true);
1770 
1771 NN_REGISTER_OPERATION(GENERATE_PROPOSALS, bbox_ops::generate_proposals::kOperationName,
1772                       bbox_ops::generate_proposals::validate, bbox_ops::generate_proposals::prepare,
1773                       bbox_ops::generate_proposals::execute);
1774 
1775 NN_REGISTER_OPERATION(DETECTION_POSTPROCESSING, bbox_ops::detection_postprocess::kOperationName,
1776                       bbox_ops::detection_postprocess::validate,
1777                       bbox_ops::detection_postprocess::prepare,
1778                       bbox_ops::detection_postprocess::execute);
1779 }  // namespace nn
1780 }  // namespace android
1781