1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "Operations"
18
19 #include <algorithm>
20 #include <cfloat>
21 #include <cmath>
22 #include <numeric>
23 #include <utility>
24 #include <vector>
25
26 #include "OperationResolver.h"
27 #include "OperationsUtils.h"
28 #include "Tracing.h"
29
30 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
31 #include "CpuOperationUtils.h"
32 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
33
34 namespace android {
35 namespace nn {
36 namespace bbox_ops {
37
38 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
39 namespace {
40
// Axis-aligned box encoded by its two corners: (x1, y1) is one corner and
// (x2, y2) the opposite one; valid boxes satisfy x1 <= x2 and y1 <= y2.
struct BoxEncodingCorner {
    float x1, y1, x2, y2;
};
// Axis-aligned box encoded by its center (x, y) plus its width w and height h.
struct BoxEncodingCenter {
    float w, h, x, y;
};
toBoxEncodingCorner(const BoxEncodingCenter & ctr)47 BoxEncodingCorner toBoxEncodingCorner(const BoxEncodingCenter& ctr) {
48 return {.x1 = ctr.x - ctr.w / 2,
49 .y1 = ctr.y - ctr.h / 2,
50 .x2 = ctr.x + ctr.w / 2,
51 .y2 = ctr.y + ctr.h / 2};
52 }
toBoxEncodingCenter(const BoxEncodingCorner & cnr)53 BoxEncodingCenter toBoxEncodingCenter(const BoxEncodingCorner& cnr) {
54 return {.w = cnr.x2 - cnr.x1,
55 .h = cnr.y2 - cnr.y1,
56 .x = (cnr.x1 + cnr.x2) / 2,
57 .y = (cnr.y1 + cnr.y2) / 2};
58 }
59
bboxTransformFloat32(const float * roiData,const Shape & roiShape,const float * bboxDeltasData,const Shape & bboxDeltasShape,const int32_t * batchesData,const Shape & batchesShape,const float * imageInfoData,const Shape & imageInfoDataShape,float * outputData,const Shape & outputShape)60 inline bool bboxTransformFloat32(const float* roiData, const Shape& roiShape,
61 const float* bboxDeltasData, const Shape& bboxDeltasShape,
62 const int32_t* batchesData, const Shape& batchesShape,
63 const float* imageInfoData, const Shape& imageInfoDataShape,
64 float* outputData, const Shape& outputShape) {
65 const uint32_t roiLength = 4;
66 const uint32_t imageLength = 2;
67
68 uint32_t numClasses = getSizeOfDimension(bboxDeltasShape, 1) / roiLength;
69 uint32_t numBatches = getSizeOfDimension(imageInfoDataShape, 0);
70
71 const float* roiDataEnd = roiData + getNumberOfElements(roiShape);
72 const float* deltas = bboxDeltasData;
73 float* outPtr = outputData;
74 uint32_t roiIndex = 0;
75 for (const float* roiBase = roiData; roiBase < roiDataEnd; roiBase += roiLength, roiIndex++) {
76 uint32_t batchIndex = batchesData[roiIndex];
77 // Check for malformed data
78 // 1. Invalid batch id
79 // 2. Invalid region: x2 < x1 || y2 < y1
80 NN_RET_CHECK_GE(batchIndex, 0);
81 NN_RET_CHECK_LT(batchIndex, numBatches);
82 NN_RET_CHECK_LE(roiBase[0], roiBase[2]);
83 NN_RET_CHECK_LE(roiBase[1], roiBase[3]);
84
85 const float* imageInfoBase = imageInfoData + batchIndex * imageLength;
86 float imageHeight = imageInfoBase[0];
87 float imageWidth = imageInfoBase[1];
88 auto roiBefore = toBoxEncodingCenter(
89 {.x1 = roiBase[0], .y1 = roiBase[1], .x2 = roiBase[2], .y2 = roiBase[3]});
90 for (uint32_t i = 0; i < numClasses; i++) {
91 auto roiAfter = toBoxEncodingCorner({.w = std::exp(deltas[2]) * roiBefore.w,
92 .h = std::exp(deltas[3]) * roiBefore.h,
93 .x = roiBefore.x + deltas[0] * roiBefore.w,
94 .y = roiBefore.y + deltas[1] * roiBefore.h});
95 BoxEncodingCorner cliped = {.x1 = std::min(std::max(roiAfter.x1, 0.0f), imageWidth),
96 .y1 = std::min(std::max(roiAfter.y1, 0.0f), imageHeight),
97 .x2 = std::min(std::max(roiAfter.x2, 0.0f), imageWidth),
98 .y2 = std::min(std::max(roiAfter.y2, 0.0f), imageHeight)};
99 outPtr[0] = cliped.x1;
100 outPtr[1] = cliped.y1;
101 outPtr[2] = cliped.x2;
102 outPtr[3] = cliped.y2;
103 deltas += roiLength;
104 outPtr += roiLength;
105 }
106 }
107 return true;
108 }
109
bboxTransformFloat16(const _Float16 * roiData,const Shape & roiShape,const _Float16 * bboxDeltasData,const Shape & bboxDeltasShape,const int32_t * batchesData,const Shape & batchesShape,const _Float16 * imageInfoData,const Shape & imageInfoDataShape,_Float16 * outputData,const Shape & outputShape)110 inline bool bboxTransformFloat16(const _Float16* roiData, const Shape& roiShape,
111 const _Float16* bboxDeltasData, const Shape& bboxDeltasShape,
112 const int32_t* batchesData, const Shape& batchesShape,
113 const _Float16* imageInfoData, const Shape& imageInfoDataShape,
114 _Float16* outputData, const Shape& outputShape) {
115 std::vector<float> roi_float32(getNumberOfElements(roiShape));
116 convertFloat16ToFloat32(roiData, &roi_float32);
117 std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
118 convertFloat16ToFloat32(bboxDeltasData, &delta_float32);
119 std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
120 convertFloat16ToFloat32(imageInfoData, &imageInfo_float32);
121 std::vector<float> output_float32(getNumberOfElements(outputShape));
122 NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
123 bboxDeltasShape, batchesData, batchesShape,
124 imageInfo_float32.data(), imageInfoDataShape,
125 output_float32.data(), outputShape));
126 convertFloat32ToFloat16(output_float32, outputData);
127 return true;
128 }
129
bboxTransformQuant(const uint16_t * roiData,const Shape & roiShape,const uint8_t * bboxDeltasData,const Shape & bboxDeltasShape,const int32_t * batchesData,const Shape & batchesShape,const uint16_t * imageInfoData,const Shape & imageInfoDataShape,uint16_t * outputData,const Shape & outputShape)130 inline bool bboxTransformQuant(const uint16_t* roiData, const Shape& roiShape,
131 const uint8_t* bboxDeltasData, const Shape& bboxDeltasShape,
132 const int32_t* batchesData, const Shape& batchesShape,
133 const uint16_t* imageInfoData, const Shape& imageInfoDataShape,
134 uint16_t* outputData, const Shape& outputShape) {
135 std::vector<float> roi_float32(getNumberOfElements(roiShape));
136 convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
137 std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
138 convertQuantToFloat32(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
139 &delta_float32);
140 std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
141 convertQuantToFloat32(imageInfoData, imageInfoDataShape.scale, imageInfoDataShape.offset,
142 &imageInfo_float32);
143 std::vector<float> output_float32(getNumberOfElements(outputShape));
144 NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
145 bboxDeltasShape, batchesData, batchesShape,
146 imageInfo_float32.data(), imageInfoDataShape,
147 output_float32.data(), outputShape));
148 convertFloat32ToQuant(output_float32, outputShape.scale, outputShape.offset, outputData);
149 return true;
150 }
151
bboxTransformQuant(const uint16_t * roiData,const Shape & roiShape,const int8_t * bboxDeltasData,const Shape & bboxDeltasShape,const int32_t * batchesData,const Shape & batchesShape,const uint16_t * imageInfoData,const Shape & imageInfoDataShape,uint16_t * outputData,const Shape & outputShape)152 inline bool bboxTransformQuant(const uint16_t* roiData, const Shape& roiShape,
153 const int8_t* bboxDeltasData, const Shape& bboxDeltasShape,
154 const int32_t* batchesData, const Shape& batchesShape,
155 const uint16_t* imageInfoData, const Shape& imageInfoDataShape,
156 uint16_t* outputData, const Shape& outputShape) {
157 std::vector<float> roi_float32(getNumberOfElements(roiShape));
158 convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
159 std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
160 convertQuantToFloat32<int8_t>(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
161 &delta_float32);
162 std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoDataShape));
163 convertQuantToFloat32(imageInfoData, imageInfoDataShape.scale, imageInfoDataShape.offset,
164 &imageInfo_float32);
165 std::vector<float> output_float32(getNumberOfElements(outputShape));
166 NN_RET_CHECK(bboxTransformFloat32(roi_float32.data(), roiShape, delta_float32.data(),
167 bboxDeltasShape, batchesData, batchesShape,
168 imageInfo_float32.data(), imageInfoDataShape,
169 output_float32.data(), outputShape));
170 convertFloat32ToQuant(output_float32, outputShape.scale, outputShape.offset, outputData);
171 return true;
172 }
173
// Taking two bounding boxes as (x1, y1, x2, y2), returns their
// intersection-over-union in [0, 1]. Returns 0 when the union has zero area
// (both boxes degenerate), avoiding a 0/0 NaN that would otherwise propagate
// into the NMS threshold comparisons.
float getIoUAxisAligned(const float* roi1, const float* roi2) {
    const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
    const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
    const float x1 = std::max(roi1[0], roi2[0]);
    const float x2 = std::min(roi1[2], roi2[2]);
    const float y1 = std::max(roi1[1], roi2[1]);
    const float y2 = std::min(roi1[3], roi2[3]);
    const float w = std::max(x2 - x1, 0.0f);
    const float h = std::max(y2 - y1, 0.0f);
    const float areaIntersect = w * h;
    const float areaUnion = area1 + area2 - areaIntersect;
    // Guard against division by zero for two zero-area boxes.
    if (areaUnion <= 0.0f) return 0.0f;
    return areaIntersect / areaUnion;
}
188
189 } // namespace
190 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
191
192 namespace axis_aligned_bbox_transform {
193
constexpr char kOperationName[] = "AXIS_ALIGNED_BBOX_TRANSFORM";

// Input operand indices.
constexpr uint32_t kNumInputs = 4;
constexpr uint32_t kRoiTensor = 0;        // [numRois, 4] boxes (x1, y1, x2, y2)
constexpr uint32_t kDeltaTensor = 1;      // [numRois, numClasses * 4] box deltas
constexpr uint32_t kBatchesTensor = 2;    // [numRois] batch index of each roi
constexpr uint32_t kImageInfoTensor = 3;  // [numBatches, 2] image (height, width)

// Output operand indices.
constexpr uint32_t kNumOutputs = 1;
constexpr uint32_t kOutputTensor = 0;  // [numRois, numClasses * 4] transformed boxes
204
validate(const IOperationValidationContext * context)205 Result<Version> validate(const IOperationValidationContext* context) {
206 NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
207 NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
208 std::vector<OperandType> inExpectedTypes;
209 auto inputType = context->getInputType(kRoiTensor);
210 auto deltaInputType = context->getInputType(kDeltaTensor);
211 if (inputType == OperandType::TENSOR_FLOAT32 || inputType == OperandType::TENSOR_FLOAT16) {
212 inExpectedTypes = {inputType, inputType, OperandType::TENSOR_INT32, inputType};
213 } else if (inputType == OperandType::TENSOR_QUANT16_ASYMM) {
214 if (deltaInputType == OperandType::TENSOR_QUANT8_ASYMM ||
215 deltaInputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
216 inExpectedTypes = {OperandType::TENSOR_QUANT16_ASYMM, deltaInputType,
217 OperandType::TENSOR_INT32, OperandType::TENSOR_QUANT16_ASYMM};
218 } else {
219 return NN_ERROR() << "Unsupported input tensor type for operation " << kOperationName;
220 }
221 } else {
222 return NN_ERROR() << "Unsupported input tensor type for operation " << kOperationName;
223 }
224 NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
225 NN_RET_CHECK(validateOutputTypes(context, {inputType}));
226 return Version::ANDROID_Q;
227 }
228
229 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
prepare(IOperationExecutionContext * context)230 bool prepare(IOperationExecutionContext* context) {
231 Shape roiShape = context->getInputShape(kRoiTensor);
232 Shape bboxDeltasShape = context->getInputShape(kDeltaTensor);
233 Shape batchesShape = context->getInputShape(kBatchesTensor);
234 Shape imageInfoShape = context->getInputShape(kImageInfoTensor);
235 Shape outputShape = context->getOutputShape(kOutputTensor);
236
237 NN_RET_CHECK_EQ(getNumberOfDimensions(roiShape), 2);
238 NN_RET_CHECK_EQ(getNumberOfDimensions(bboxDeltasShape), 2);
239 NN_RET_CHECK_EQ(getNumberOfDimensions(batchesShape), 1);
240 NN_RET_CHECK_EQ(getNumberOfDimensions(imageInfoShape), 2);
241
242 // Only numRois can be zero.
243 const uint32_t kRoiDim = 4;
244 uint32_t numRois = getSizeOfDimension(roiShape, 0);
245 uint32_t numClasses = getSizeOfDimension(bboxDeltasShape, 1) / kRoiDim;
246 uint32_t numBatches = getSizeOfDimension(imageInfoShape, 0);
247 NN_RET_CHECK_GT(numClasses, 0);
248 NN_RET_CHECK_GT(numBatches, 0);
249 NN_RET_CHECK_EQ(getSizeOfDimension(roiShape, 1), kRoiDim);
250 NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 0), numRois);
251 NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 1), kRoiDim * numClasses);
252 NN_RET_CHECK_EQ(getSizeOfDimension(batchesShape, 0), numRois);
253 NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoShape, 1), 2);
254
255 if (roiShape.type == OperandType::TENSOR_QUANT16_ASYMM) {
256 NN_RET_CHECK_EQ(roiShape.scale, 0.125f);
257 NN_RET_CHECK_EQ(roiShape.offset, 0);
258 NN_RET_CHECK_EQ(imageInfoShape.scale, 0.125f);
259 NN_RET_CHECK_EQ(imageInfoShape.offset, 0);
260 }
261
262 outputShape.type = roiShape.type;
263 outputShape.dimensions = {numRois, numClasses * kRoiDim};
264 outputShape.scale = 0.f;
265 outputShape.offset = 0;
266 if (roiShape.type == OperandType::TENSOR_QUANT16_ASYMM) {
267 outputShape.scale = 0.125f;
268 }
269 NN_RET_CHECK(context->setOutputShape(kOutputTensor, outputShape));
270 return true;
271 }
272
// Dispatches AXIS_ALIGNED_BBOX_TRANSFORM to the implementation matching the
// roi tensor type; the quant16 path further selects on the delta tensor type
// (unsigned vs signed quant8, both accepted by validate()).
bool execute(IOperationExecutionContext* context) {
    NNTRACE_TRANS("axisAlignedBBoxTransform");
    // Bypass execution in the case of zero-sized input.
    if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
    switch (context->getInputType(kRoiTensor)) {
        case OperandType::TENSOR_FLOAT16: {
            return bboxTransformFloat16(context->getInputBuffer<_Float16>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<_Float16>(kDeltaTensor),
                                        context->getInputShape(kDeltaTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputBuffer<_Float16>(kImageInfoTensor),
                                        context->getInputShape(kImageInfoTensor),
                                        context->getOutputBuffer<_Float16>(kOutputTensor),
                                        context->getOutputShape(kOutputTensor));
        }
        case OperandType::TENSOR_FLOAT32: {
            return bboxTransformFloat32(context->getInputBuffer<float>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<float>(kDeltaTensor),
                                        context->getInputShape(kDeltaTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputBuffer<float>(kImageInfoTensor),
                                        context->getInputShape(kImageInfoTensor),
                                        context->getOutputBuffer<float>(kOutputTensor),
                                        context->getOutputShape(kOutputTensor));
        }
        case OperandType::TENSOR_QUANT16_ASYMM: {
            // Delta type selects between the uint8 and int8 quant overloads.
            if (context->getInputType(kDeltaTensor) == OperandType::TENSOR_QUANT8_ASYMM) {
                return bboxTransformQuant(context->getInputBuffer<uint16_t>(kRoiTensor),
                                          context->getInputShape(kRoiTensor),
                                          context->getInputBuffer<uint8_t>(kDeltaTensor),
                                          context->getInputShape(kDeltaTensor),
                                          context->getInputBuffer<int32_t>(kBatchesTensor),
                                          context->getInputShape(kBatchesTensor),
                                          context->getInputBuffer<uint16_t>(kImageInfoTensor),
                                          context->getInputShape(kImageInfoTensor),
                                          context->getOutputBuffer<uint16_t>(kOutputTensor),
                                          context->getOutputShape(kOutputTensor));
            } else {
                return bboxTransformQuant(context->getInputBuffer<uint16_t>(kRoiTensor),
                                          context->getInputShape(kRoiTensor),
                                          context->getInputBuffer<int8_t>(kDeltaTensor),
                                          context->getInputShape(kDeltaTensor),
                                          context->getInputBuffer<int32_t>(kBatchesTensor),
                                          context->getInputShape(kBatchesTensor),
                                          context->getInputBuffer<uint16_t>(kImageInfoTensor),
                                          context->getInputShape(kImageInfoTensor),
                                          context->getOutputBuffer<uint16_t>(kOutputTensor),
                                          context->getOutputShape(kOutputTensor));
            }
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
331 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
332
333 } // namespace axis_aligned_bbox_transform
334
335 namespace box_with_nms_limit {
336
constexpr char kOperationName[] = "BOX_WITH_NMS_LIMIT";

// Input operand indices.
constexpr uint32_t kNumInputs = 9;
constexpr uint32_t kScoreTensor = 0;             // [numRois, numClasses] scores
constexpr uint32_t kRoiTensor = 1;               // [numRois, numClasses * 4] boxes
constexpr uint32_t kBatchesTensor = 2;           // [numRois] batch index of each roi
constexpr uint32_t kScoreThresholdScalar = 3;    // minimum score for a candidate box
constexpr uint32_t kMaxNumDetectionScalar = 4;   // max detections kept (negative: unlimited)
constexpr uint32_t kNmsKernelScalar = 5;         // 0 = hard, 1 = linear, 2 = gaussian
constexpr uint32_t kIoUThresholdScalar = 6;      // IoU threshold used by the NMS kernel
constexpr uint32_t kSigmaScalar = 7;             // sigma of the gaussian kernel
constexpr uint32_t kNmsScoreThresholdScalar = 8; // score cutoff applied during soft NMS

// Output operand indices.
constexpr uint32_t kNumOutputs = 4;
constexpr uint32_t kOutputScoreTensor = 0;
constexpr uint32_t kOutputRoiTensor = 1;
constexpr uint32_t kOutputClassTensor = 2;
constexpr uint32_t kOutputBatchesTensor = 3;
355
356 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
357 namespace {
358
359 // TODO(xusongw): Reduce code duplication with hard/soft nms path.
360
361 // Inplace hard NMS within range [select, select + selectLength).
hardNmsSingleClass(const float * scoresData,float iouThreshold,int32_t maxNumDetections,std::function<const float * (uint32_t)> getRoiBase,uint32_t * select,uint32_t selectLength)362 uint32_t* hardNmsSingleClass(const float* scoresData, float iouThreshold, int32_t maxNumDetections,
363 std::function<const float*(uint32_t)> getRoiBase, uint32_t* select,
364 uint32_t selectLength) {
365 uint32_t *selectStart = select, *selectEnd = select + selectLength, numDetections = 0;
366 if (maxNumDetections < 0) {
367 maxNumDetections = selectLength;
368 }
369 while (selectStart < selectEnd && numDetections < maxNumDetections) {
370 // find max score and swap to the front
371 auto& maxScore = *std::max_element(selectStart, selectEnd,
372 [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
373 return scoresData[lhs] < scoresData[rhs];
374 });
375 std::swap(maxScore, *selectStart);
376
377 // Calculate IoU of the rest, swap to the end (disgard) if needed.
378 for (uint32_t* i = selectStart + 1; i < selectEnd; i++) {
379 float iou = getIoUAxisAligned(getRoiBase(*i), getRoiBase(*selectStart));
380 if (iou >= iouThreshold) {
381 std::swap(*i--, *(--selectEnd));
382 }
383 }
384 selectStart++;
385 numDetections++;
386 }
387 return selectStart;
388 }
389
hardNmsMultiClass(const float * scoresData,uint32_t numClasses,uint32_t numRois,float scoreThreshold,float iouThreshold,int32_t maxNumDetections,int32_t maxNumDetectionsPerClass,std::function<const float * (uint32_t)> getRoiBase,std::vector<uint32_t> * select)390 void hardNmsMultiClass(const float* scoresData, uint32_t numClasses, uint32_t numRois,
391 float scoreThreshold, float iouThreshold, int32_t maxNumDetections,
392 int32_t maxNumDetectionsPerClass,
393 std::function<const float*(uint32_t)> getRoiBase,
394 std::vector<uint32_t>* select) {
395 // Exclude class 0 (background)
396 for (uint32_t c = 1; c < numClasses; c++) {
397 uint32_t size = select->size();
398 for (uint32_t b = 0; b < numRois; b++) {
399 const uint32_t index = b * numClasses + c;
400 const float score = scoresData[index];
401 if (score > scoreThreshold) {
402 select->push_back(index);
403 }
404 }
405 uint32_t* selectStart = select->data() + size;
406 uint32_t selectLength = select->size() - size;
407 uint32_t* selectEnd = hardNmsSingleClass(scoresData, iouThreshold, maxNumDetectionsPerClass,
408 getRoiBase, selectStart, selectLength);
409 select->resize(selectEnd - select->data());
410 }
411
412 // Take top maxNumDetections.
413 std::sort(select->begin(), select->end(),
414 [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
415 return scoresData[lhs] > scoresData[rhs];
416 });
417 if (maxNumDetections < 0 || select->size() <= maxNumDetections) {
418 return;
419 }
420 select->resize(maxNumDetections);
421 }
422
423 // Inplace soft NMS within range [select, select + selectLength).
424 using SoftNmsKernel = std::function<float(float)>;
softNmsSingleClass(float * scoresData,float scoreThreshold,int32_t maxNumDetections,std::function<const float * (uint32_t)> getRoiBase,SoftNmsKernel kernel,uint32_t * select,uint32_t selectLength)425 uint32_t* softNmsSingleClass(float* scoresData, float scoreThreshold, int32_t maxNumDetections,
426 std::function<const float*(uint32_t)> getRoiBase, SoftNmsKernel kernel,
427 uint32_t* select, uint32_t selectLength) {
428 uint32_t *selectStart = select, *selectEnd = select + selectLength, numDetections = 0;
429 if (maxNumDetections < 0) {
430 maxNumDetections = selectLength;
431 }
432 while (selectStart < selectEnd && numDetections < maxNumDetections) {
433 // find max score and swap to the front
434 auto& maxScore = *std::max_element(selectStart, selectEnd,
435 [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
436 return scoresData[lhs] < scoresData[rhs];
437 });
438 std::swap(maxScore, *selectStart);
439
440 // Calculate IoU of the rest, swap to the end (disgard) if needed.
441 for (uint32_t* i = selectStart + 1; i < selectEnd; i++) {
442 float iou = getIoUAxisAligned(getRoiBase(*i), getRoiBase(*selectStart));
443 scoresData[*i] *= kernel(iou);
444 if (scoresData[*i] < scoreThreshold) {
445 std::swap(*i--, *(--selectEnd));
446 }
447 }
448 selectStart++;
449 numDetections++;
450 }
451 return selectStart;
452 }
453
softNmsMultiClass(float * scoresData,uint32_t numClasses,uint32_t numRois,float scoreThreshold,float nmsScoreThreshold,int32_t maxNumDetections,int32_t maxNumDetectionsPerClass,std::function<const float * (uint32_t)> getRoiBase,SoftNmsKernel kernel,std::vector<uint32_t> * select)454 void softNmsMultiClass(float* scoresData, uint32_t numClasses, uint32_t numRois,
455 float scoreThreshold, float nmsScoreThreshold, int32_t maxNumDetections,
456 int32_t maxNumDetectionsPerClass,
457 std::function<const float*(uint32_t)> getRoiBase, SoftNmsKernel kernel,
458 std::vector<uint32_t>* select) {
459 // Exclude class 0 (background)
460 for (uint32_t c = 1; c < numClasses; c++) {
461 uint32_t size = select->size();
462 for (uint32_t b = 0; b < numRois; b++) {
463 const uint32_t index = b * numClasses + c;
464 const float score = scoresData[index];
465 if (score > scoreThreshold) {
466 select->push_back(index);
467 }
468 }
469 uint32_t* selectStart = select->data() + size;
470 uint32_t selectLength = select->size() - size;
471 uint32_t* selectEnd =
472 softNmsSingleClass(scoresData, nmsScoreThreshold, maxNumDetectionsPerClass,
473 getRoiBase, kernel, selectStart, selectLength);
474 select->resize(selectEnd - select->data());
475 }
476
477 // Take top maxNumDetections.
478 std::sort(select->begin(), select->end(),
479 [&scoresData](const uint32_t& lhs, const uint32_t& rhs) {
480 return scoresData[lhs] > scoresData[rhs];
481 });
482 if (maxNumDetections < 0 || select->size() <= maxNumDetections) {
483 return;
484 }
485 select->resize(maxNumDetections);
486 }
487
boxWithNmsLimitFloat32Compute(float * scoresData,const Shape & scoresShape,const float * roiData,const Shape & roiShape,const int32_t * batchesData,const Shape & batchesShape,float scoreThreshold,int32_t maxNumDetections,int32_t softNmsKernel,float iouThreshold,float sigma,float nmsScoreThreshold,std::vector<uint32_t> * batchSplitIn,std::vector<uint32_t> * batchSplitOut,std::vector<uint32_t> * selected)488 bool boxWithNmsLimitFloat32Compute(float* scoresData, const Shape& scoresShape,
489 const float* roiData, const Shape& roiShape,
490 const int32_t* batchesData, const Shape& batchesShape,
491 float scoreThreshold, int32_t maxNumDetections,
492 int32_t softNmsKernel, float iouThreshold, float sigma,
493 float nmsScoreThreshold, std::vector<uint32_t>* batchSplitIn,
494 std::vector<uint32_t>* batchSplitOut,
495 std::vector<uint32_t>* selected) {
496 SoftNmsKernel kernel = nullptr;
497 if (softNmsKernel == 0) {
498 kernel = [&iouThreshold](float iou) { return iou < iouThreshold ? 1.0f : 0.0f; };
499 } else if (softNmsKernel == 1) {
500 kernel = [&iouThreshold](float iou) { return iou < iouThreshold ? 1.0f : 1.0f - iou; };
501 } else if (softNmsKernel == 2) {
502 kernel = [&sigma](float iou) { return std::exp(-1.0f * iou * iou / sigma); };
503 } else {
504 NN_RET_CHECK_FAIL() << "Unsupported soft NMS kernel " << softNmsKernel;
505 }
506
507 const uint32_t kRoiDim = 4;
508 uint32_t numRois = getSizeOfDimension(scoresShape, 0);
509 uint32_t numClasses = getSizeOfDimension(scoresShape, 1);
510
511 // We assume boxes of the same batch are grouped together.
512 std::vector<uint32_t> batch;
513 for (uint32_t i = 0, ind = -1; i < numRois; i++) {
514 if (batchesData[i] == ind) {
515 (batchSplitIn->back())++;
516 } else {
517 ind = batchesData[i];
518 batchSplitIn->push_back(1);
519 }
520 }
521
522 float* scoresBase = scoresData;
523 const float* roiBase = roiData;
524 selected->clear();
525 for (uint32_t b = 0; b < batchSplitIn->size(); b++) {
526 for (uint32_t i = 0; i < batchSplitIn->at(b); i++) {
527 const float* roi = roiBase + i * kRoiDim;
528 // Check for malformed data: invalid region: x2 < x1 || y2 < y1
529 NN_RET_CHECK_LE(roi[0], roi[2]);
530 NN_RET_CHECK_LE(roi[1], roi[3]);
531 }
532 std::vector<uint32_t> result;
533 softNmsMultiClass(
534 scoresBase, numClasses, batchSplitIn->at(b), scoreThreshold, nmsScoreThreshold,
535 maxNumDetections, maxNumDetections,
536 [&roiBase](uint32_t ind) { return roiBase + ind * kRoiDim; }, kernel, &result);
537 // Sort again by class.
538 std::sort(result.begin(), result.end(),
539 [&scoresBase, numClasses](const uint32_t& lhs, const uint32_t& rhs) {
540 uint32_t lhsClass = lhs % numClasses, rhsClass = rhs % numClasses;
541 return lhsClass == rhsClass ? scoresBase[lhs] > scoresBase[rhs]
542 : lhsClass < rhsClass;
543 });
544 selected->insert(selected->end(), result.begin(), result.end());
545 batchSplitOut->push_back(result.size());
546 scoresBase += batchSplitIn->at(b) * numClasses;
547 roiBase += batchSplitIn->at(b) * numClasses * kRoiDim;
548 }
549 return true;
550 }
551
552 template <typename T>
castTo(float val,const Shape &)553 T castTo(float val, const Shape&) {
554 return val;
555 }
556 template <>
castTo(float val,const Shape & shape)557 uint8_t castTo(float val, const Shape& shape) {
558 return saturateCast<uint8_t>(std::round(val / shape.scale + shape.offset));
559 }
560
561 template <>
castTo(float val,const Shape & shape)562 int8_t castTo(float val, const Shape& shape) {
563 return saturateCast<int8_t>(std::round(val / shape.scale + shape.offset));
564 }
565
566 template <typename T_Score, typename T_Roi>
boxWithNmsLimitWriteOutput(const std::vector<uint32_t> & selected,const std::vector<uint32_t> & batchSplitIn,const std::vector<uint32_t> & batchSplitOut,const std::vector<float> & scores,IOperationExecutionContext * context)567 bool boxWithNmsLimitWriteOutput(const std::vector<uint32_t>& selected,
568 const std::vector<uint32_t>& batchSplitIn,
569 const std::vector<uint32_t>& batchSplitOut,
570 const std::vector<float>& scores,
571 IOperationExecutionContext* context) {
572 const uint32_t kRoiDim = 4;
573 Shape scoresShape = context->getInputShape(kScoreTensor);
574 uint32_t numClasses = getSizeOfDimension(scoresShape, 1);
575
576 // Set output dimensions.
577 uint32_t numOutRois = selected.size();
578 if (numOutRois == 0) return true;
579 Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
580 scoresOutShape.dimensions = {numOutRois};
581 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
582
583 Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
584 roiOutShape.dimensions = {numOutRois, 4};
585 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
586
587 Shape classesOutShape = context->getOutputShape(kOutputClassTensor);
588 classesOutShape.dimensions = {numOutRois};
589 NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, classesOutShape));
590
591 Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
592 batchesOutShape.dimensions = {numOutRois};
593 NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
594
595 // Write outputs.
596 const float* scoresBase = scores.data();
597 const T_Roi* roiBase = context->getInputBuffer<T_Roi>(kRoiTensor);
598 const int32_t* batchesInPtr = context->getInputBuffer<int32_t>(kBatchesTensor);
599 T_Score* scoresOutPtr = context->getOutputBuffer<T_Score>(kOutputScoreTensor);
600 T_Roi* roiOutPtr = context->getOutputBuffer<T_Roi>(kOutputRoiTensor);
601 int32_t* classesOutPtr = context->getOutputBuffer<int32_t>(kOutputClassTensor);
602 int32_t* batchesOutPtr = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
603 uint32_t i = 0;
604 for (uint32_t b = 0; b < batchSplitOut.size(); b++) {
605 for (uint32_t j = 0; j < batchSplitOut[b]; j++) {
606 uint32_t index = selected[i++];
607 *scoresOutPtr++ = castTo<T_Score>(scoresBase[index], scoresOutShape);
608 memcpy(roiOutPtr, roiBase + index * kRoiDim, kRoiDim * sizeof(T_Roi));
609 roiOutPtr += kRoiDim;
610 *classesOutPtr++ = index % numClasses;
611 *batchesOutPtr++ = *batchesInPtr;
612 }
613 scoresBase += batchSplitIn[b] * numClasses;
614 roiBase += batchSplitIn[b] * numClasses * kRoiDim;
615 batchesInPtr += batchSplitIn[b];
616 }
617 return true;
618 }
619
boxWithNmsLimitFloat32(const float * scoresData,const Shape & scoresShape,const float * roiData,const Shape & roiShape,const int32_t * batchesData,const Shape & batchesShape,float scoreThreshold,int32_t maxNumDetections,int32_t softNmsKernel,float iouThreshold,float sigma,float nmsScoreThreshold,float * scoresOutData,Shape scoresOutShape,float * roiOutData,Shape roiOutShape,int32_t * classesOutData,Shape classesOutShape,int32_t * batchesOutData,const Shape & batchSplitOutShape,IOperationExecutionContext * context)620 bool boxWithNmsLimitFloat32(const float* scoresData, const Shape& scoresShape, const float* roiData,
621 const Shape& roiShape, const int32_t* batchesData,
622 const Shape& batchesShape, float scoreThreshold,
623 int32_t maxNumDetections, int32_t softNmsKernel, float iouThreshold,
624 float sigma, float nmsScoreThreshold, float* scoresOutData,
625 Shape scoresOutShape, float* roiOutData, Shape roiOutShape,
626 int32_t* classesOutData, Shape classesOutShape, int32_t* batchesOutData,
627 const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
628 NNTRACE_TRANS("boxWithNmsLimit");
629 std::vector<float> scores_float32(getNumberOfElements(scoresShape));
630 for (uint32_t i = 0; i < scores_float32.size(); i++) {
631 scores_float32[i] = scoresData[i];
632 }
633 std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
634 NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
635 scores_float32.data(), scoresShape, roiData, roiShape, batchesData, batchesShape,
636 scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma, nmsScoreThreshold,
637 &batchSplitIn, &batchSplitOut, &selected));
638 return boxWithNmsLimitWriteOutput<float, float>(selected, batchSplitIn, batchSplitOut,
639 scores_float32, context);
640 }
641
boxWithNmsLimitFloat16(const _Float16 * scoresData,const Shape & scoresShape,const _Float16 * roiData,const Shape & roiShape,const int32_t * batchesData,const Shape & batchesShape,_Float16 scoreThreshold,int32_t maxNumDetections,int32_t softNmsKernel,_Float16 iouThreshold,_Float16 sigma,_Float16 nmsScoreThreshold,_Float16 * scoresOutData,const Shape & scoresOutShape,_Float16 * roiOutData,const Shape & roiOutShape,int32_t * classesOutData,const Shape & classesOutShape,int32_t * batchesOutData,const Shape & batchSplitOutShape,IOperationExecutionContext * context)642 bool boxWithNmsLimitFloat16(const _Float16* scoresData, const Shape& scoresShape,
643 const _Float16* roiData, const Shape& roiShape,
644 const int32_t* batchesData, const Shape& batchesShape,
645 _Float16 scoreThreshold, int32_t maxNumDetections,
646 int32_t softNmsKernel, _Float16 iouThreshold, _Float16 sigma,
647 _Float16 nmsScoreThreshold, _Float16* scoresOutData,
648 const Shape& scoresOutShape, _Float16* roiOutData,
649 const Shape& roiOutShape, int32_t* classesOutData,
650 const Shape& classesOutShape, int32_t* batchesOutData,
651 const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
652 std::vector<float> scores_float32(getNumberOfElements(scoresShape));
653 convertFloat16ToFloat32(scoresData, &scores_float32);
654 std::vector<float> roi_float32(getNumberOfElements(roiShape));
655 convertFloat16ToFloat32(roiData, &roi_float32);
656 std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
657 NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
658 scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
659 batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
660 nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
661 return boxWithNmsLimitWriteOutput<_Float16, _Float16>(selected, batchSplitIn, batchSplitOut,
662 scores_float32, context);
663 }
664
boxWithNmsLimitQuant(const uint8_t * scoresData,const Shape & scoresShape,const uint16_t * roiData,const Shape & roiShape,const int32_t * batchesData,const Shape & batchesShape,float scoreThreshold,int32_t maxNumDetections,int32_t softNmsKernel,float iouThreshold,float sigma,float nmsScoreThreshold,uint8_t * scoresOutData,const Shape & scoresOutShape,uint16_t * roiOutData,const Shape & roiOutShape,int32_t * classesOutData,const Shape & classesOutShape,int32_t * batchesOutData,const Shape & batchSplitOutShape,IOperationExecutionContext * context)665 bool boxWithNmsLimitQuant(const uint8_t* scoresData, const Shape& scoresShape,
666 const uint16_t* roiData, const Shape& roiShape,
667 const int32_t* batchesData, const Shape& batchesShape,
668 float scoreThreshold, int32_t maxNumDetections, int32_t softNmsKernel,
669 float iouThreshold, float sigma, float nmsScoreThreshold,
670 uint8_t* scoresOutData, const Shape& scoresOutShape, uint16_t* roiOutData,
671 const Shape& roiOutShape, int32_t* classesOutData,
672 const Shape& classesOutShape, int32_t* batchesOutData,
673 const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
674 std::vector<float> scores_float32(getNumberOfElements(scoresShape));
675 convertQuantToFloat32(scoresData, scoresShape.scale, scoresShape.offset, &scores_float32);
676 std::vector<float> roi_float32(getNumberOfElements(roiShape));
677 convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
678 std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
679 NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
680 scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
681 batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
682 nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
683 return boxWithNmsLimitWriteOutput<uint8_t, uint16_t>(selected, batchSplitIn, batchSplitOut,
684 scores_float32, context);
685 }
686
boxWithNmsLimitQuant(const int8_t * scoresData,const Shape & scoresShape,const uint16_t * roiData,const Shape & roiShape,const int32_t * batchesData,const Shape & batchesShape,float scoreThreshold,int32_t maxNumDetections,int32_t softNmsKernel,float iouThreshold,float sigma,float nmsScoreThreshold,int8_t * scoresOutData,const Shape & scoresOutShape,uint16_t * roiOutData,const Shape & roiOutShape,int32_t * classesOutData,const Shape & classesOutShape,int32_t * batchesOutData,const Shape & batchSplitOutShape,IOperationExecutionContext * context)687 bool boxWithNmsLimitQuant(const int8_t* scoresData, const Shape& scoresShape,
688 const uint16_t* roiData, const Shape& roiShape,
689 const int32_t* batchesData, const Shape& batchesShape,
690 float scoreThreshold, int32_t maxNumDetections, int32_t softNmsKernel,
691 float iouThreshold, float sigma, float nmsScoreThreshold,
692 int8_t* scoresOutData, const Shape& scoresOutShape, uint16_t* roiOutData,
693 const Shape& roiOutShape, int32_t* classesOutData,
694 const Shape& classesOutShape, int32_t* batchesOutData,
695 const Shape& batchSplitOutShape, IOperationExecutionContext* context) {
696 std::vector<float> scores_float32(getNumberOfElements(scoresShape));
697 convertQuantToFloat32<int8_t>(scoresData, scoresShape.scale, scoresShape.offset,
698 &scores_float32);
699 std::vector<float> roi_float32(getNumberOfElements(roiShape));
700 convertQuantToFloat32(roiData, roiShape.scale, roiShape.offset, &roi_float32);
701 std::vector<uint32_t> selected, batchSplitIn, batchSplitOut;
702 NN_RET_CHECK(boxWithNmsLimitFloat32Compute(
703 scores_float32.data(), scoresShape, roi_float32.data(), roiShape, batchesData,
704 batchesShape, scoreThreshold, maxNumDetections, softNmsKernel, iouThreshold, sigma,
705 nmsScoreThreshold, &batchSplitIn, &batchSplitOut, &selected));
706 return boxWithNmsLimitWriteOutput<int8_t, uint16_t>(selected, batchSplitIn, batchSplitOut,
707 scores_float32, context);
708 }
709
710 } // namespace
711 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
712
// Validates operand counts and types for BOX_WITH_NMS_LIMIT and returns the
// minimum NNAPI version required by the given operand signature.
Result<Version> validate(const IOperationValidationContext* context) {
    NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
    std::vector<OperandType> inExpectedTypes;
    std::vector<OperandType> outExpectedTypes;
    // The score tensor's type determines the expected types of every other
    // operand.
    auto inputType = context->getInputType(kScoreTensor);
    if (inputType == OperandType::TENSOR_FLOAT16) {
        // Float16 variant: the scalar thresholds are FLOAT16 as well.
        inExpectedTypes = {
                OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16, OperandType::TENSOR_INT32,
                OperandType::FLOAT16, OperandType::INT32, OperandType::INT32,
                OperandType::FLOAT16, OperandType::FLOAT16, OperandType::FLOAT16};
        outExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
                            OperandType::TENSOR_INT32, OperandType::TENSOR_INT32};
    } else if (inputType == OperandType::TENSOR_FLOAT32) {
        inExpectedTypes = {
                OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32, OperandType::TENSOR_INT32,
                OperandType::FLOAT32, OperandType::INT32, OperandType::INT32,
                OperandType::FLOAT32, OperandType::FLOAT32, OperandType::FLOAT32};
        outExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
                            OperandType::TENSOR_INT32, OperandType::TENSOR_INT32};
    } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM ||
               inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        // Quantized variants: rois are TENSOR_QUANT16_ASYMM and the scalar
        // thresholds stay FLOAT32.
        inExpectedTypes = {inputType,
                           OperandType::TENSOR_QUANT16_ASYMM,
                           OperandType::TENSOR_INT32,
                           OperandType::FLOAT32,
                           OperandType::INT32,
                           OperandType::INT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32,
                           OperandType::FLOAT32};
        outExpectedTypes = {inputType, OperandType::TENSOR_QUANT16_ASYMM, OperandType::TENSOR_INT32,
                            OperandType::TENSOR_INT32};
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(context, outExpectedTypes));
    // Signed quantized tensors require Android R; all other supported type
    // combinations have been available since Android Q.
    if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        return Version::ANDROID_R;
    } else {
        return Version::ANDROID_Q;
    }
}
757
758 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
prepare(IOperationExecutionContext * context)759 bool prepare(IOperationExecutionContext* context) {
760 Shape scoreShape = context->getInputShape(kScoreTensor);
761 Shape roiShape = context->getInputShape(kRoiTensor);
762 Shape batchesShape = context->getInputShape(kBatchesTensor);
763 Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
764 Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
765 Shape outputClassShape = context->getOutputShape(kOutputClassTensor);
766 Shape outputBatchSplitShape = context->getOutputShape(kOutputBatchesTensor);
767
768 NN_RET_CHECK(getNumberOfDimensions(scoreShape) == 2);
769 NN_RET_CHECK(getNumberOfDimensions(roiShape) == 2);
770 NN_RET_CHECK(getNumberOfDimensions(batchesShape) == 1);
771
772 // Only numRois can be zero.
773 const uint32_t kRoiDim = 4;
774 uint32_t numRois = getSizeOfDimension(scoreShape, 0);
775 uint32_t numClasses = getSizeOfDimension(scoreShape, 1);
776 NN_RET_CHECK(getSizeOfDimension(roiShape, 0) == numRois);
777 NN_RET_CHECK(getSizeOfDimension(roiShape, 1) == kRoiDim * numClasses);
778 NN_RET_CHECK(getSizeOfDimension(batchesShape, 0) == numRois);
779 NN_RET_CHECK_GT(numClasses, 1);
780
781 if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM ||
782 scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
783 NN_RET_CHECK_EQ(roiShape.scale, 0.125f);
784 NN_RET_CHECK_EQ(roiShape.offset, 0);
785 }
786
787 outputScoreShape.type = scoreShape.type;
788 outputScoreShape.dimensions = {0};
789 outputScoreShape.scale = scoreShape.scale;
790 outputScoreShape.offset = scoreShape.offset;
791 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));
792
793 outputRoiShape.type = roiShape.type;
794 outputRoiShape.dimensions = {0, 4};
795 outputRoiShape.scale = 0.f;
796 outputRoiShape.offset = 0;
797 if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM ||
798 scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
799 outputRoiShape.scale = 0.125f;
800 }
801 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));
802
803 outputClassShape.type = OperandType::TENSOR_INT32;
804 outputClassShape.dimensions = {0};
805 NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, outputClassShape));
806
807 outputBatchSplitShape.type = batchesShape.type;
808 outputBatchSplitShape.dimensions = {0};
809 NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, outputBatchSplitShape));
810 return true;
811 }
812
// Dispatches BOX_WITH_NMS_LIMIT execution to the implementation matching the
// score tensor's operand type, forwarding all input/output buffers, shapes
// and scalar parameters from the execution context.
bool execute(IOperationExecutionContext* context) {
    NNTRACE_TRANS("boxWithNMSLimit");
    // Bypass execution in the case of zero numRois.
    if (getSizeOfDimension(context->getInputShape(kScoreTensor), 0) == 0) return true;
    switch (context->getInputType(kScoreTensor)) {
        case OperandType::TENSOR_FLOAT16: {
            return boxWithNmsLimitFloat16(
                    context->getInputBuffer<_Float16>(kScoreTensor),
                    context->getInputShape(kScoreTensor),
                    context->getInputBuffer<_Float16>(kRoiTensor),
                    context->getInputShape(kRoiTensor),
                    context->getInputBuffer<int32_t>(kBatchesTensor),
                    context->getInputShape(kBatchesTensor),
                    context->getInputValue<_Float16>(kScoreThresholdScalar),
                    context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                    context->getInputValue<int32_t>(kNmsKernelScalar),
                    context->getInputValue<_Float16>(kIoUThresholdScalar),
                    context->getInputValue<_Float16>(kSigmaScalar),
                    context->getInputValue<_Float16>(kNmsScoreThresholdScalar),
                    context->getOutputBuffer<_Float16>(kOutputScoreTensor),
                    context->getOutputShape(kOutputScoreTensor),
                    context->getOutputBuffer<_Float16>(kOutputRoiTensor),
                    context->getOutputShape(kOutputRoiTensor),
                    context->getOutputBuffer<int32_t>(kOutputClassTensor),
                    context->getOutputShape(kOutputClassTensor),
                    context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                    context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_FLOAT32: {
            return boxWithNmsLimitFloat32(context->getInputBuffer<float>(kScoreTensor),
                                          context->getInputShape(kScoreTensor),
                                          context->getInputBuffer<float>(kRoiTensor),
                                          context->getInputShape(kRoiTensor),
                                          context->getInputBuffer<int32_t>(kBatchesTensor),
                                          context->getInputShape(kBatchesTensor),
                                          context->getInputValue<float>(kScoreThresholdScalar),
                                          context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                          context->getInputValue<int32_t>(kNmsKernelScalar),
                                          context->getInputValue<float>(kIoUThresholdScalar),
                                          context->getInputValue<float>(kSigmaScalar),
                                          context->getInputValue<float>(kNmsScoreThresholdScalar),
                                          context->getOutputBuffer<float>(kOutputScoreTensor),
                                          context->getOutputShape(kOutputScoreTensor),
                                          context->getOutputBuffer<float>(kOutputRoiTensor),
                                          context->getOutputShape(kOutputRoiTensor),
                                          context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                          context->getOutputShape(kOutputClassTensor),
                                          context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                          context->getOutputShape(kOutputBatchesTensor), context);
        }
        // Both quantized variants resolve to the boxWithNmsLimitQuant overload
        // matching the 8-bit score buffer type; rois are uint16_t in either
        // case.
        case OperandType::TENSOR_QUANT8_ASYMM: {
            return boxWithNmsLimitQuant(context->getInputBuffer<uint8_t>(kScoreTensor),
                                        context->getInputShape(kScoreTensor),
                                        context->getInputBuffer<uint16_t>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputValue<float>(kScoreThresholdScalar),
                                        context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                        context->getInputValue<int32_t>(kNmsKernelScalar),
                                        context->getInputValue<float>(kIoUThresholdScalar),
                                        context->getInputValue<float>(kSigmaScalar),
                                        context->getInputValue<float>(kNmsScoreThresholdScalar),
                                        context->getOutputBuffer<uint8_t>(kOutputScoreTensor),
                                        context->getOutputShape(kOutputScoreTensor),
                                        context->getOutputBuffer<uint16_t>(kOutputRoiTensor),
                                        context->getOutputShape(kOutputRoiTensor),
                                        context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                        context->getOutputShape(kOutputClassTensor),
                                        context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                        context->getOutputShape(kOutputBatchesTensor), context);
        }
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
            return boxWithNmsLimitQuant(context->getInputBuffer<int8_t>(kScoreTensor),
                                        context->getInputShape(kScoreTensor),
                                        context->getInputBuffer<uint16_t>(kRoiTensor),
                                        context->getInputShape(kRoiTensor),
                                        context->getInputBuffer<int32_t>(kBatchesTensor),
                                        context->getInputShape(kBatchesTensor),
                                        context->getInputValue<float>(kScoreThresholdScalar),
                                        context->getInputValue<int32_t>(kMaxNumDetectionScalar),
                                        context->getInputValue<int32_t>(kNmsKernelScalar),
                                        context->getInputValue<float>(kIoUThresholdScalar),
                                        context->getInputValue<float>(kSigmaScalar),
                                        context->getInputValue<float>(kNmsScoreThresholdScalar),
                                        context->getOutputBuffer<int8_t>(kOutputScoreTensor),
                                        context->getOutputShape(kOutputScoreTensor),
                                        context->getOutputBuffer<uint16_t>(kOutputRoiTensor),
                                        context->getOutputShape(kOutputRoiTensor),
                                        context->getOutputBuffer<int32_t>(kOutputClassTensor),
                                        context->getOutputShape(kOutputClassTensor),
                                        context->getOutputBuffer<int32_t>(kOutputBatchesTensor),
                                        context->getOutputShape(kOutputBatchesTensor), context);
        }
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
911 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
912
913 } // namespace box_with_nms_limit
914
namespace generate_proposals {

constexpr char kOperationName[] = "GENERATE_PROPOSALS";

// Input operand indices for GENERATE_PROPOSALS.
constexpr uint32_t kNumInputs = 11;
constexpr uint32_t kScoreTensor = 0;      // objectness scores per anchor
constexpr uint32_t kDeltaTensor = 1;      // bounding box deltas per anchor
constexpr uint32_t kAnchorTensor = 2;     // anchor boxes
constexpr uint32_t kImageInfoTensor = 3;  // per-batch image info
// NOTE(review): "Salar" is a long-standing typo for "Scalar"; the name is
// kept as-is for source compatibility with existing references.
constexpr uint32_t kHeightStrideSalar = 4;
constexpr uint32_t kWidthStrideScalar = 5;
constexpr uint32_t kPreNmsMaxScalar = 6;   // max boxes kept before NMS
constexpr uint32_t kPostNmsMaxScalar = 7;  // max boxes kept after NMS
constexpr uint32_t kIoUThresholdScalar = 8;
constexpr uint32_t kMinSizeScalar = 9;
constexpr uint32_t kLayoutScalar = 10;  // BOOL layout flag (presumably NCHW vs NHWC — see op spec)

// Output operand indices.
constexpr uint32_t kNumOutputs = 3;
constexpr uint32_t kOutputScoreTensor = 0;
constexpr uint32_t kOutputRoiTensor = 1;
constexpr uint32_t kOutputBatchesTensor = 2;
936
937 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
938 namespace {
939
// Compacts |select| in place, keeping only indices of boxes that are strictly
// larger than |minSize| in both dimensions and whose center lies inside the
// image. Relative order of the surviving indices is preserved.
//
// roiBase:       flat array of boxes, 4 floats [x1, y1, x2, y2] per box.
// imageInfoBase: [image height, image width].
// minSize:       minimum box width/height (exclusive).
// select:        in/out list of box indices into roiBase.
void filterBoxes(const float* roiBase, const float* imageInfoBase, float minSize,
                 std::vector<uint32_t>* select) {
    constexpr uint32_t kRoiDim = 4;
    const auto rejected = [&](uint32_t boxIndex) {
        const float* box = roiBase + boxIndex * kRoiDim;
        const float width = box[2] - box[0];
        const float height = box[3] - box[1];
        const float centerX = box[0] + width / 2.0f;
        const float centerY = box[1] + height / 2.0f;
        const bool keep = width > minSize && height > minSize &&
                          centerX < imageInfoBase[1] && centerY < imageInfoBase[0];
        return !keep;
    };
    // Stable erase-remove keeps survivors in their original order, matching
    // the previous manual compaction loop.
    select->erase(std::remove_if(select->begin(), select->end(), rejected), select->end());
}
958
// Core GENERATE_PROPOSALS computation on NHWC float32 data.
//
// For every batch: shifts each anchor to every spatial location (scaled by
// the strides), applies the bbox deltas, keeps the top preNmsTopN scoring
// boxes, discards boxes that are too small or centered outside the image,
// applies hard NMS, and appends the surviving scores, rois and batch indices
// to the output vectors (which are cleared first).
//
// Scores are indexed as [numBatches, height, width, numAnchors] and the
// deltas as [numBatches, height, width, numAnchors * 4], per the dimension
// accessors below. Returns false if the bbox transform step fails.
bool generateProposalsNhwcFloat32Compute(const float* scoresData, const Shape& scoresShape,
                                         const float* bboxDeltasData, const Shape& bboxDeltasShape,
                                         const float* anchorsData, const Shape& anchorsShape,
                                         const float* imageInfoData, const Shape& imageInfoShape,
                                         float heightStride, float widthStride, int32_t preNmsTopN,
                                         int32_t postNmsTopN, float iouThreshold, float minSize,
                                         std::vector<float>* scoresOutData,
                                         std::vector<float>* roiOutData,
                                         std::vector<int32_t>* batchesOutData) {
    const uint32_t kRoiDim = 4;
    uint32_t numBatches = getSizeOfDimension(scoresShape, 0);
    uint32_t height = getSizeOfDimension(scoresShape, 1);
    uint32_t width = getSizeOfDimension(scoresShape, 2);
    uint32_t numAnchors = getSizeOfDimension(scoresShape, 3);
    uint32_t imageInfoLength = getSizeOfDimension(imageInfoShape, 1);

    // Number of candidate boxes per batch, and the size of its roi buffer.
    uint32_t batchSize = height * width * numAnchors;
    uint32_t roiBufferSize = batchSize * kRoiDim;
    std::vector<float> roiBuffer(roiBufferSize);
    std::vector<float> roiTransformedBuffer(roiBufferSize);
    scoresOutData->clear();
    roiOutData->clear();
    batchesOutData->clear();

    // Compute the roi region for each anchor: translate every anchor box by
    // the stride-scaled spatial position. This grid is identical for all
    // batches, so it is built once.
    float* roiBase = roiBuffer.data();
    for (uint32_t h = 0; h < height; h++) {
        float hShift = h * heightStride;
        for (uint32_t w = 0; w < width; w++) {
            const float* anchorsBase = anchorsData;
            float wShift = w * widthStride;
            for (uint32_t a = 0; a < numAnchors; a++, roiBase += kRoiDim, anchorsBase += kRoiDim) {
                roiBase[0] = anchorsBase[0] + wShift;
                roiBase[1] = anchorsBase[1] + hShift;
                roiBase[2] = anchorsBase[2] + wShift;
                roiBase[3] = anchorsBase[3] + hShift;
            }
        }
    }

    const float* scoresBase = scoresData;
    const float* bboxDeltasBase = bboxDeltasData;
    const float* imageInfoBase = imageInfoData;
    // Need to fake some data to satisfy bboxTransform: present the whole
    // anchor grid as a single batch of batchSize rois (batch indices all 0).
    Shape tempRoiShape = anchorsShape;
    tempRoiShape.dimensions = {batchSize, kRoiDim};
    Shape tempBBoxDeltasShape = bboxDeltasShape;
    tempBBoxDeltasShape.dimensions = {batchSize, kRoiDim};
    std::vector<int32_t> tempBatchSplitData(batchSize, 0);
    Shape tempbatchSplitShape = {.dimensions = {batchSize}};
    Shape tempImageInfoShape = imageInfoShape;
    tempImageInfoShape.dimensions = {1, imageInfoLength};

    for (uint32_t b = 0; b < numBatches; b++) {
        // Apply bboxDeltas to anchor locations.
        float tempImageInfo[] = {imageInfoBase[0], imageInfoBase[1]};
        if (!bboxTransformFloat32(roiBuffer.data(), tempRoiShape, bboxDeltasBase,
                                  tempBBoxDeltasShape, tempBatchSplitData.data(),
                                  tempbatchSplitShape, tempImageInfo, tempImageInfoShape,
                                  roiTransformedBuffer.data(), tempRoiShape)) {
            LOG(ERROR) << "BBoxTransform step failed in GENERATE_PROPOSALS op.";
            return false;
        }

        // Find the top preNmsTopN scores; a non-positive preNmsTopN keeps
        // every box.
        std::vector<uint32_t> select(batchSize);
        std::iota(select.begin(), select.end(), 0);
        if (preNmsTopN > 0 && preNmsTopN < select.size()) {
            std::sort(select.begin(), select.end(),
                      [&scoresBase](const uint32_t lhs, const uint32_t rhs) {
                          return scoresBase[lhs] > scoresBase[rhs];
                      });
            select.resize(preNmsTopN);
        }

        // Filter boxes, discard regions with height or width < minSize or
        // with a center falling outside the image.
        filterBoxes(roiTransformedBuffer.data(), imageInfoBase, minSize, &select);

        // Apply hard NMS, keeping at most postNmsTopN boxes.
        uint32_t* selectEnd = box_with_nms_limit::hardNmsSingleClass(
                scoresBase, iouThreshold, postNmsTopN,
                [&roiTransformedBuffer](uint32_t ind) {
                    return roiTransformedBuffer.data() + ind * kRoiDim;
                },
                select.data(), select.size());
        uint32_t selectSize = selectEnd - select.data();
        select.resize(selectSize);

        // Write output.
        for (auto i : select) {
            roiOutData->insert(roiOutData->end(), roiTransformedBuffer.begin() + i * kRoiDim,
                               roiTransformedBuffer.begin() + (i + 1) * kRoiDim);
            scoresOutData->push_back(scoresBase[i]);
            batchesOutData->push_back(b);
        }
        // Advance the per-batch base pointers.
        scoresBase += batchSize;
        bboxDeltasBase += roiBufferSize;
        imageInfoBase += imageInfoLength;
    }
    return true;
}
1060
generateProposalsFloat32Compute(const float * scoresData,const Shape & scoresShape,const float * bboxDeltasData,const Shape & bboxDeltasShape,const float * anchorsData,const Shape & anchorsShape,const float * imageInfoData,const Shape & imageInfoShape,float heightStride,float widthStride,int32_t preNmsTopN,int32_t postNmsTopN,float iouThreshold,float minSize,bool useNchw,std::vector<float> * scoresOutData,std::vector<float> * roiOutData,std::vector<int32_t> * batchesOutData)1061 bool generateProposalsFloat32Compute(const float* scoresData, const Shape& scoresShape,
1062 const float* bboxDeltasData, const Shape& bboxDeltasShape,
1063 const float* anchorsData, const Shape& anchorsShape,
1064 const float* imageInfoData, const Shape& imageInfoShape,
1065 float heightStride, float widthStride, int32_t preNmsTopN,
1066 int32_t postNmsTopN, float iouThreshold, float minSize,
1067 bool useNchw, std::vector<float>* scoresOutData,
1068 std::vector<float>* roiOutData,
1069 std::vector<int32_t>* batchesOutData) {
1070 InputWithLayout<float> score_nhwc(useNchw), delta_nhwc(useNchw);
1071 NN_RET_CHECK(score_nhwc.initialize(scoresData, scoresShape));
1072 NN_RET_CHECK(delta_nhwc.initialize(bboxDeltasData, bboxDeltasShape));
1073 return generateProposalsNhwcFloat32Compute(
1074 score_nhwc.getNhwcBuffer(), score_nhwc.getNhwcShape(), delta_nhwc.getNhwcBuffer(),
1075 delta_nhwc.getNhwcShape(), anchorsData, anchorsShape, imageInfoData, imageInfoShape,
1076 heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize,
1077 scoresOutData, roiOutData, batchesOutData);
1078 }
1079
generateProposalsFloat32(const float * scoresData,const Shape & scoresShape,const float * bboxDeltasData,const Shape & bboxDeltasShape,const float * anchorsData,const Shape & anchorsShape,const float * imageInfoData,const Shape & imageInfoShape,float heightStride,float widthStride,int32_t preNmsTopN,int32_t postNmsTopN,float iouThreshold,float minSize,bool useNchw,IOperationExecutionContext * context)1080 bool generateProposalsFloat32(const float* scoresData, const Shape& scoresShape,
1081 const float* bboxDeltasData, const Shape& bboxDeltasShape,
1082 const float* anchorsData, const Shape& anchorsShape,
1083 const float* imageInfoData, const Shape& imageInfoShape,
1084 float heightStride, float widthStride, int32_t preNmsTopN,
1085 int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
1086 IOperationExecutionContext* context) {
1087 std::vector<float> scoresOut_float32, roiOut_float32;
1088 std::vector<int32_t> batchesOut;
1089 NN_RET_CHECK(generateProposalsFloat32Compute(
1090 scoresData, scoresShape, bboxDeltasData, bboxDeltasShape, anchorsData, anchorsShape,
1091 imageInfoData, imageInfoShape, heightStride, widthStride, preNmsTopN, postNmsTopN,
1092 iouThreshold, minSize, useNchw, &scoresOut_float32, &roiOut_float32, &batchesOut));
1093
1094 // Set output dimensions.
1095 uint32_t numOutRois = scoresOut_float32.size();
1096 if (numOutRois == 0) return true;
1097 Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
1098 scoresOutShape.dimensions = {numOutRois};
1099 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
1100 Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
1101 roiOutShape.dimensions = {numOutRois, 4};
1102 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
1103 Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
1104 batchesOutShape.dimensions = {numOutRois};
1105 NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
1106
1107 // Write outputs.
1108 float* scoresOutData = context->getOutputBuffer<float>(kOutputScoreTensor);
1109 for (uint32_t i = 0; i < scoresOut_float32.size(); i++) {
1110 scoresOutData[i] = scoresOut_float32[i];
1111 }
1112 float* roiOutData = context->getOutputBuffer<float>(kOutputRoiTensor);
1113 for (uint32_t i = 0; i < roiOut_float32.size(); i++) {
1114 roiOutData[i] = roiOut_float32[i];
1115 }
1116 int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
1117 for (uint32_t i = 0; i < batchesOut.size(); i++) {
1118 batchesOutData[i] = batchesOut[i];
1119 }
1120 return true;
1121 }
1122
generateProposalsFloat16(const _Float16 * scoresData,const Shape & scoresShape,const _Float16 * bboxDeltasData,const Shape & bboxDeltasShape,const _Float16 * anchorsData,const Shape & anchorsShape,const _Float16 * imageInfoData,const Shape & imageInfoShape,float heightStride,float widthStride,int32_t preNmsTopN,int32_t postNmsTopN,float iouThreshold,float minSize,bool useNchw,IOperationExecutionContext * context)1123 bool generateProposalsFloat16(const _Float16* scoresData, const Shape& scoresShape,
1124 const _Float16* bboxDeltasData, const Shape& bboxDeltasShape,
1125 const _Float16* anchorsData, const Shape& anchorsShape,
1126 const _Float16* imageInfoData, const Shape& imageInfoShape,
1127 float heightStride, float widthStride, int32_t preNmsTopN,
1128 int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
1129 IOperationExecutionContext* context) {
1130 std::vector<float> score_float32(getNumberOfElements(scoresShape));
1131 convertFloat16ToFloat32(scoresData, &score_float32);
1132 std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
1133 convertFloat16ToFloat32(bboxDeltasData, &delta_float32);
1134 std::vector<float> anchors_float32(getNumberOfElements(anchorsShape));
1135 convertFloat16ToFloat32(anchorsData, &anchors_float32);
1136 std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoShape));
1137 convertFloat16ToFloat32(imageInfoData, &imageInfo_float32);
1138 std::vector<float> scoresOut_float32, roiOut_float32;
1139 std::vector<int32_t> batchesOut;
1140 NN_RET_CHECK(generateProposalsFloat32Compute(
1141 score_float32.data(), scoresShape, delta_float32.data(), bboxDeltasShape,
1142 anchors_float32.data(), anchorsShape, imageInfo_float32.data(), imageInfoShape,
1143 heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize, useNchw,
1144 &scoresOut_float32, &roiOut_float32, &batchesOut));
1145
1146 // Set output dimensions.
1147 uint32_t numOutRois = scoresOut_float32.size();
1148 if (numOutRois == 0) return true;
1149 Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
1150 scoresOutShape.dimensions = {numOutRois};
1151 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
1152 Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
1153 roiOutShape.dimensions = {numOutRois, 4};
1154 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
1155 Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
1156 batchesOutShape.dimensions = {numOutRois};
1157 NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
1158
1159 // Write outputs.
1160 _Float16* scoresOutData = context->getOutputBuffer<_Float16>(kOutputScoreTensor);
1161 convertFloat32ToFloat16(scoresOut_float32, scoresOutData);
1162 _Float16* roiOutData = context->getOutputBuffer<_Float16>(kOutputRoiTensor);
1163 convertFloat32ToFloat16(roiOut_float32, roiOutData);
1164 int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
1165 for (uint32_t i = 0; i < batchesOut.size(); i++) {
1166 batchesOutData[i] = batchesOut[i];
1167 }
1168 return true;
1169 }
1170
1171 template <typename T_8QInput>
generateProposalsQuant(const T_8QInput * scoresData,const Shape & scoresShape,const T_8QInput * bboxDeltasData,const Shape & bboxDeltasShape,const int16_t * anchorsData,const Shape & anchorsShape,const uint16_t * imageInfoData,const Shape & imageInfoShape,float heightStride,float widthStride,int32_t preNmsTopN,int32_t postNmsTopN,float iouThreshold,float minSize,bool useNchw,IOperationExecutionContext * context)1172 bool generateProposalsQuant(const T_8QInput* scoresData, const Shape& scoresShape,
1173 const T_8QInput* bboxDeltasData, const Shape& bboxDeltasShape,
1174 const int16_t* anchorsData, const Shape& anchorsShape,
1175 const uint16_t* imageInfoData, const Shape& imageInfoShape,
1176 float heightStride, float widthStride, int32_t preNmsTopN,
1177 int32_t postNmsTopN, float iouThreshold, float minSize, bool useNchw,
1178 IOperationExecutionContext* context) {
1179 std::vector<float> score_float32(getNumberOfElements(scoresShape));
1180 convertQuantToFloat32<T_8QInput>(scoresData, scoresShape.scale, scoresShape.offset,
1181 &score_float32);
1182 std::vector<float> delta_float32(getNumberOfElements(bboxDeltasShape));
1183 convertQuantToFloat32<T_8QInput>(bboxDeltasData, bboxDeltasShape.scale, bboxDeltasShape.offset,
1184 &delta_float32);
1185 std::vector<float> anchors_float32(getNumberOfElements(anchorsShape));
1186 convertQuantToFloat32(anchorsData, anchorsShape.scale, anchorsShape.offset, &anchors_float32);
1187 std::vector<float> imageInfo_float32(getNumberOfElements(imageInfoShape));
1188 convertQuantToFloat32(imageInfoData, imageInfoShape.scale, imageInfoShape.offset,
1189 &imageInfo_float32);
1190 std::vector<float> scoresOut_float32, roiOut_float32;
1191 std::vector<int32_t> batchesOut;
1192 NN_RET_CHECK(generateProposalsFloat32Compute(
1193 score_float32.data(), scoresShape, delta_float32.data(), bboxDeltasShape,
1194 anchors_float32.data(), anchorsShape, imageInfo_float32.data(), imageInfoShape,
1195 heightStride, widthStride, preNmsTopN, postNmsTopN, iouThreshold, minSize, useNchw,
1196 &scoresOut_float32, &roiOut_float32, &batchesOut));
1197
1198 // Set output dimensions.
1199 uint32_t numOutRois = scoresOut_float32.size();
1200 if (numOutRois == 0) return true;
1201 Shape scoresOutShape = context->getOutputShape(kOutputScoreTensor);
1202 scoresOutShape.dimensions = {numOutRois};
1203 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, scoresOutShape));
1204 Shape roiOutShape = context->getOutputShape(kOutputRoiTensor);
1205 roiOutShape.dimensions = {numOutRois, 4};
1206 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, roiOutShape));
1207 Shape batchesOutShape = context->getOutputShape(kOutputBatchesTensor);
1208 batchesOutShape.dimensions = {numOutRois};
1209 NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, batchesOutShape));
1210
1211 // Write outputs.
1212 T_8QInput* scoresOutData = context->getOutputBuffer<T_8QInput>(kOutputScoreTensor);
1213 convertFloat32ToQuant<T_8QInput>(scoresOut_float32, scoresOutShape.scale, scoresOutShape.offset,
1214 scoresOutData);
1215 uint16_t* roiOutData = context->getOutputBuffer<uint16_t>(kOutputRoiTensor);
1216 convertFloat32ToQuant(roiOut_float32, roiOutShape.scale, roiOutShape.offset, roiOutData);
1217 int32_t* batchesOutData = context->getOutputBuffer<int32_t>(kOutputBatchesTensor);
1218 for (uint32_t i = 0; i < batchesOut.size(); i++) {
1219 batchesOutData[i] = batchesOut[i];
1220 }
1221 return true;
1222 }
1223
1224 } // namespace
1225 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
1226
validate(const IOperationValidationContext * context)1227 Result<Version> validate(const IOperationValidationContext* context) {
1228 NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
1229 NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
1230 std::vector<OperandType> inExpectedTypes;
1231 std::vector<OperandType> outExpectedTypes;
1232 auto inputType = context->getInputType(kScoreTensor);
1233 if (inputType == OperandType::TENSOR_FLOAT16) {
1234 inExpectedTypes = {OperandType::TENSOR_FLOAT16,
1235 OperandType::TENSOR_FLOAT16,
1236 OperandType::TENSOR_FLOAT16,
1237 OperandType::TENSOR_FLOAT16,
1238 OperandType::FLOAT16,
1239 OperandType::FLOAT16,
1240 OperandType::INT32,
1241 OperandType::INT32,
1242 OperandType::FLOAT16,
1243 OperandType::FLOAT16,
1244 OperandType::BOOL};
1245 outExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
1246 OperandType::TENSOR_INT32};
1247 } else if (inputType == OperandType::TENSOR_FLOAT32) {
1248 inExpectedTypes = {OperandType::TENSOR_FLOAT32,
1249 OperandType::TENSOR_FLOAT32,
1250 OperandType::TENSOR_FLOAT32,
1251 OperandType::TENSOR_FLOAT32,
1252 OperandType::FLOAT32,
1253 OperandType::FLOAT32,
1254 OperandType::INT32,
1255 OperandType::INT32,
1256 OperandType::FLOAT32,
1257 OperandType::FLOAT32,
1258 OperandType::BOOL};
1259 outExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
1260 OperandType::TENSOR_INT32};
1261 } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM ||
1262 inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1263 inExpectedTypes = {inputType,
1264 inputType,
1265 OperandType::TENSOR_QUANT16_SYMM,
1266 OperandType::TENSOR_QUANT16_ASYMM,
1267 OperandType::FLOAT32,
1268 OperandType::FLOAT32,
1269 OperandType::INT32,
1270 OperandType::INT32,
1271 OperandType::FLOAT32,
1272 OperandType::FLOAT32,
1273 OperandType::BOOL};
1274 outExpectedTypes = {inputType, OperandType::TENSOR_QUANT16_ASYMM,
1275 OperandType::TENSOR_INT32};
1276 } else {
1277 NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1278 }
1279 NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
1280 NN_RET_CHECK(validateOutputTypes(context, outExpectedTypes));
1281 if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1282 return Version::ANDROID_R;
1283 } else {
1284 return Version::ANDROID_Q;
1285 }
1286 }
1287
1288 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
prepare(IOperationExecutionContext * context)1289 bool prepare(IOperationExecutionContext* context) {
1290 bool useNchw = context->getInputValue<bool>(kLayoutScalar);
1291 Shape scoreShape = context->getInputShape(kScoreTensor);
1292 Shape bboxDeltasShape = context->getInputShape(kDeltaTensor);
1293 Shape anchorsShape = context->getInputShape(kAnchorTensor);
1294 Shape imageInfoDataShape = context->getInputShape(kImageInfoTensor);
1295 Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
1296 Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
1297 Shape outputBatchSplitShape = context->getOutputShape(kOutputBatchesTensor);
1298
1299 NN_RET_CHECK_EQ(getNumberOfDimensions(scoreShape), 4);
1300 NN_RET_CHECK_EQ(getNumberOfDimensions(bboxDeltasShape), 4);
1301 NN_RET_CHECK_EQ(getNumberOfDimensions(anchorsShape), 2);
1302 NN_RET_CHECK_EQ(getNumberOfDimensions(imageInfoDataShape), 2);
1303
1304 const uint32_t kRoiDim = 4;
1305 uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
1306 uint32_t height = getSizeOfDimension(scoreShape, useNchw ? 2 : 1);
1307 uint32_t width = getSizeOfDimension(scoreShape, useNchw ? 3 : 2);
1308 uint32_t numAnchors = getSizeOfDimension(scoreShape, useNchw ? 1 : 3);
1309
1310 NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, 0), numBatches);
1311 NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 2 : 1), height);
1312 NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 3 : 2), width);
1313 NN_RET_CHECK_EQ(getSizeOfDimension(bboxDeltasShape, useNchw ? 1 : 3), numAnchors * kRoiDim);
1314 NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoDataShape, 0), numBatches);
1315 NN_RET_CHECK_EQ(getSizeOfDimension(imageInfoDataShape, 1), 2);
1316 NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 0), numAnchors);
1317 NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 1), kRoiDim);
1318
1319 if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
1320 NN_RET_CHECK_EQ(anchorsShape.scale, 0.125f);
1321 NN_RET_CHECK_EQ(imageInfoDataShape.scale, 0.125f);
1322 NN_RET_CHECK_EQ(imageInfoDataShape.offset, 0);
1323 }
1324
1325 outputScoreShape.type = scoreShape.type;
1326 outputScoreShape.dimensions = {0};
1327 outputScoreShape.scale = scoreShape.scale;
1328 outputScoreShape.offset = scoreShape.offset;
1329 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));
1330
1331 outputRoiShape.dimensions = {0, 4};
1332 if (scoreShape.type == OperandType::TENSOR_QUANT8_ASYMM) {
1333 outputRoiShape.scale = 0.125f;
1334 outputRoiShape.offset = 0;
1335 }
1336 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));
1337
1338 outputBatchSplitShape.dimensions = {0};
1339 NN_RET_CHECK(context->setOutputShape(kOutputBatchesTensor, outputBatchSplitShape));
1340 return true;
1341 }
1342
execute(IOperationExecutionContext * context)1343 bool execute(IOperationExecutionContext* context) {
1344 NNTRACE_TRANS("generateProposals");
1345 switch (context->getInputType(kScoreTensor)) {
1346 case OperandType::TENSOR_FLOAT16: {
1347 return generateProposalsFloat16(context->getInputBuffer<_Float16>(kScoreTensor),
1348 context->getInputShape(kScoreTensor),
1349 context->getInputBuffer<_Float16>(kDeltaTensor),
1350 context->getInputShape(kDeltaTensor),
1351 context->getInputBuffer<_Float16>(kAnchorTensor),
1352 context->getInputShape(kAnchorTensor),
1353 context->getInputBuffer<_Float16>(kImageInfoTensor),
1354 context->getInputShape(kImageInfoTensor),
1355 context->getInputValue<_Float16>(kHeightStrideSalar),
1356 context->getInputValue<_Float16>(kWidthStrideScalar),
1357 context->getInputValue<int32_t>(kPreNmsMaxScalar),
1358 context->getInputValue<int32_t>(kPostNmsMaxScalar),
1359 context->getInputValue<_Float16>(kIoUThresholdScalar),
1360 context->getInputValue<_Float16>(kMinSizeScalar),
1361 context->getInputValue<bool>(kLayoutScalar), context);
1362 }
1363 case OperandType::TENSOR_FLOAT32: {
1364 return generateProposalsFloat32(context->getInputBuffer<float>(kScoreTensor),
1365 context->getInputShape(kScoreTensor),
1366 context->getInputBuffer<float>(kDeltaTensor),
1367 context->getInputShape(kDeltaTensor),
1368 context->getInputBuffer<float>(kAnchorTensor),
1369 context->getInputShape(kAnchorTensor),
1370 context->getInputBuffer<float>(kImageInfoTensor),
1371 context->getInputShape(kImageInfoTensor),
1372 context->getInputValue<float>(kHeightStrideSalar),
1373 context->getInputValue<float>(kWidthStrideScalar),
1374 context->getInputValue<int32_t>(kPreNmsMaxScalar),
1375 context->getInputValue<int32_t>(kPostNmsMaxScalar),
1376 context->getInputValue<float>(kIoUThresholdScalar),
1377 context->getInputValue<float>(kMinSizeScalar),
1378 context->getInputValue<bool>(kLayoutScalar), context);
1379 }
1380 case OperandType::TENSOR_QUANT8_ASYMM: {
1381 return generateProposalsQuant(context->getInputBuffer<uint8_t>(kScoreTensor),
1382 context->getInputShape(kScoreTensor),
1383 context->getInputBuffer<uint8_t>(kDeltaTensor),
1384 context->getInputShape(kDeltaTensor),
1385 context->getInputBuffer<int16_t>(kAnchorTensor),
1386 context->getInputShape(kAnchorTensor),
1387 context->getInputBuffer<uint16_t>(kImageInfoTensor),
1388 context->getInputShape(kImageInfoTensor),
1389 context->getInputValue<float>(kHeightStrideSalar),
1390 context->getInputValue<float>(kWidthStrideScalar),
1391 context->getInputValue<int32_t>(kPreNmsMaxScalar),
1392 context->getInputValue<int32_t>(kPostNmsMaxScalar),
1393 context->getInputValue<float>(kIoUThresholdScalar),
1394 context->getInputValue<float>(kMinSizeScalar),
1395 context->getInputValue<bool>(kLayoutScalar), context);
1396 }
1397 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1398 return generateProposalsQuant(context->getInputBuffer<int8_t>(kScoreTensor),
1399 context->getInputShape(kScoreTensor),
1400 context->getInputBuffer<int8_t>(kDeltaTensor),
1401 context->getInputShape(kDeltaTensor),
1402 context->getInputBuffer<int16_t>(kAnchorTensor),
1403 context->getInputShape(kAnchorTensor),
1404 context->getInputBuffer<uint16_t>(kImageInfoTensor),
1405 context->getInputShape(kImageInfoTensor),
1406 context->getInputValue<float>(kHeightStrideSalar),
1407 context->getInputValue<float>(kWidthStrideScalar),
1408 context->getInputValue<int32_t>(kPreNmsMaxScalar),
1409 context->getInputValue<int32_t>(kPostNmsMaxScalar),
1410 context->getInputValue<float>(kIoUThresholdScalar),
1411 context->getInputValue<float>(kMinSizeScalar),
1412 context->getInputValue<bool>(kLayoutScalar), context);
1413 }
1414 default:
1415 NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1416 }
1417 }
1418 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
1419
1420 } // namespace generate_proposals
1421
1422 namespace detection_postprocess {
1423
constexpr char kOperationName[] = "DETECTION_POSTPROCESS";

// Input operand indices. Tensors come first, followed by scalar parameters.
constexpr uint32_t kNumInputs = 14;
constexpr uint32_t kScoreTensor = 0;   // class scores, [batches, anchors, classes]
constexpr uint32_t kDeltaTensor = 1;   // box deltas, [batches, anchors, boxEncodingLen >= 4]
constexpr uint32_t kAnchorTensor = 2;  // anchors, [anchors, 4]
// Divisors applied to the dy/dx/dh/dw components of each box delta.
constexpr uint32_t kScaleYScalar = 3;
constexpr uint32_t kScaleXScalar = 4;
constexpr uint32_t kScaleHScalar = 5;
constexpr uint32_t kScaleWScalar = 6;
// Selects multi-class ("regular") NMS vs. the faster single-class path.
constexpr uint32_t kUseRegularNmsScalar = 7;
constexpr uint32_t kMaxNumDetectionScalar = 8;
constexpr uint32_t kMaxClassesPerDetectionScalar = 9;   // fast-NMS path only
constexpr uint32_t kMaxNumDetectionPerClassScalar = 10;  // regular-NMS path only
constexpr uint32_t kScoreThresholdScalar = 11;
constexpr uint32_t kIoUThresholdScalar = 12;
// Whether class index 0 is a background label (affects class re-indexing).
constexpr uint32_t kIsBGInLabelScalar = 13;

// Output operand indices.
constexpr uint32_t kNumOutputs = 4;
constexpr uint32_t kOutputScoreTensor = 0;      // [batches, maxDetections]
constexpr uint32_t kOutputRoiTensor = 1;        // [batches, maxDetections, 4]
constexpr uint32_t kOutputClassTensor = 2;      // [batches, maxDetections]
constexpr uint32_t kOutputDetectionTensor = 3;  // [batches], valid-detection count
1447
1448 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
1449 namespace {
1450
detectionPostprocessFloat32(const float * scoreData,const Shape & scoreShape,const float * deltaData,const Shape & deltaShape,const float * anchorData,const Shape & anchorShape,float scaleY,float scaleX,float scaleH,float scaleW,bool useRegularNms,int32_t maxNumDetections,int32_t maxClassesPerDetection,int32_t maxNumDetectionsPerClass,float iouThreshold,float scoreThreshold,bool isBGInLabel,float * scoreOutData,const Shape & scoreOutShape,float * roiOutData,const Shape & roiOutShape,int32_t * classOutData,const Shape & classOutShape,int32_t * detectionOutData,const Shape & detectionOutShape)1451 bool detectionPostprocessFloat32(
1452 const float* scoreData, const Shape& scoreShape, const float* deltaData,
1453 const Shape& deltaShape, const float* anchorData, const Shape& anchorShape, float scaleY,
1454 float scaleX, float scaleH, float scaleW, bool useRegularNms, int32_t maxNumDetections,
1455 int32_t maxClassesPerDetection, int32_t maxNumDetectionsPerClass, float iouThreshold,
1456 float scoreThreshold, bool isBGInLabel, float* scoreOutData, const Shape& scoreOutShape,
1457 float* roiOutData, const Shape& roiOutShape, int32_t* classOutData,
1458 const Shape& classOutShape, int32_t* detectionOutData, const Shape& detectionOutShape) {
1459 const uint32_t kRoiDim = 4;
1460 uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
1461 uint32_t numAnchors = getSizeOfDimension(scoreShape, 1);
1462 uint32_t numClasses = getSizeOfDimension(scoreShape, 2);
1463 uint32_t lengthBoxEncoding = getSizeOfDimension(deltaShape, 2);
1464 uint32_t numOutDetection = getSizeOfDimension(scoreOutShape, 1);
1465
1466 memset(scoreOutData, 0, getNumberOfElements(scoreOutShape) * sizeof(float));
1467 memset(roiOutData, 0, getNumberOfElements(roiOutShape) * sizeof(float));
1468 memset(classOutData, 0, getNumberOfElements(classOutShape) * sizeof(int32_t));
1469 memset(detectionOutData, 0, getNumberOfElements(detectionOutShape) * sizeof(int32_t));
1470
1471 const float* scoreBase = scoreData;
1472 const float* deltaBase = deltaData;
1473 float* scoreOutBase = scoreOutData;
1474 float* roiOutBase = roiOutData;
1475 int32_t* classOutBase = classOutData;
1476 std::vector<float> roiBuffer(numAnchors * kRoiDim);
1477 std::vector<float> scoreBuffer(numAnchors);
1478 for (uint32_t b = 0; b < numBatches; b++) {
1479 const float* anchorBase = anchorData;
1480 for (uint32_t a = 0; a < numAnchors; a++) {
1481 float yCtr = anchorBase[0] + anchorBase[2] * deltaBase[0] / scaleY;
1482 float xCtr = anchorBase[1] + anchorBase[3] * deltaBase[1] / scaleX;
1483 float hHalf = anchorBase[2] * std::exp(deltaBase[2] / scaleH) * 0.5f;
1484 float wHalf = anchorBase[3] * std::exp(deltaBase[3] / scaleW) * 0.5f;
1485 roiBuffer[a * kRoiDim] = yCtr - hHalf;
1486 roiBuffer[a * kRoiDim + 1] = xCtr - wHalf;
1487 roiBuffer[a * kRoiDim + 2] = yCtr + hHalf;
1488 roiBuffer[a * kRoiDim + 3] = xCtr + wHalf;
1489 anchorBase += kRoiDim;
1490 deltaBase += lengthBoxEncoding;
1491 }
1492
1493 if (useRegularNms) {
1494 std::vector<uint32_t> select;
1495 box_with_nms_limit::hardNmsMultiClass(
1496 scoreBase, numClasses, numAnchors, scoreThreshold, iouThreshold,
1497 maxNumDetections, maxNumDetectionsPerClass,
1498 [&roiBuffer, numClasses](uint32_t ind) {
1499 return roiBuffer.data() + (ind / numClasses) * kRoiDim;
1500 },
1501 &select);
1502 for (uint32_t i = 0; i < select.size(); i++) {
1503 uint32_t ind = select[i];
1504 scoreOutBase[i] = scoreBase[ind];
1505 memcpy(roiOutBase + i * kRoiDim, &roiBuffer[(ind / numClasses) * kRoiDim],
1506 kRoiDim * sizeof(float));
1507 classOutBase[i] = (ind % numClasses) - (isBGInLabel ? 0 : 1);
1508 }
1509 *detectionOutData++ = select.size();
1510 } else {
1511 uint32_t numOutClasses = std::min<uint32_t>(numClasses - 1, maxClassesPerDetection);
1512 std::vector<float> maxScores(numAnchors);
1513 for (uint32_t a = 0; a < numAnchors; a++) {
1514 maxScores[a] = *std::max_element(scoreBase + a * numClasses + 1,
1515 scoreBase + (a + 1) * numClasses);
1516 }
1517 std::vector<uint32_t> select;
1518 for (uint32_t a = 0; a < numAnchors; a++) {
1519 if (maxScores[a] > scoreThreshold) {
1520 select.push_back(a);
1521 }
1522 }
1523 uint32_t* selectEnd = box_with_nms_limit::hardNmsSingleClass(
1524 maxScores.data(), iouThreshold, maxNumDetections,
1525 [&roiBuffer](uint32_t ind) { return roiBuffer.data() + ind * kRoiDim; },
1526 select.data(), select.size());
1527 select.resize(selectEnd - select.data());
1528 float* scoreOutPtr = scoreOutBase;
1529 float* roiOutPtr = roiOutBase;
1530 int32_t* classOutPtr = classOutBase;
1531 for (auto i : select) {
1532 const float* score = scoreBase + i * numClasses;
1533 std::vector<uint32_t> scoreInds(numClasses - 1);
1534 std::iota(scoreInds.begin(), scoreInds.end(), 1);
1535 std::sort(scoreInds.begin(), scoreInds.end(),
1536 [&score](const uint32_t lhs, const uint32_t rhs) {
1537 return score[lhs] > score[rhs];
1538 });
1539 for (uint32_t c = 0; c < numOutClasses; c++) {
1540 *scoreOutPtr++ = score[scoreInds[c]];
1541 memcpy(roiOutPtr, &roiBuffer[i * kRoiDim], kRoiDim * sizeof(float));
1542 roiOutPtr += kRoiDim;
1543 *classOutPtr++ = scoreInds[c] - (isBGInLabel ? 0 : 1);
1544 }
1545 }
1546 *detectionOutData++ = select.size() * numOutClasses;
1547 }
1548 scoreBase += numAnchors * numClasses;
1549 scoreOutBase += numOutDetection;
1550 roiOutBase += numOutDetection * kRoiDim;
1551 classOutBase += numOutDetection;
1552 }
1553 return true;
1554 }
1555
detectionPostprocessFloat16(const _Float16 * scoreData,const Shape & scoreShape,const _Float16 * deltaData,const Shape & deltaShape,const _Float16 * anchorData,const Shape & anchorShape,float scaleY,float scaleX,float scaleH,float scaleW,bool useRegularNms,int32_t maxNumDetections,int32_t maxClassesPerDetection,int32_t maxNumDetectionsPerClass,float iouThreshold,float scoreThreshold,bool isBGInLabel,_Float16 * scoreOutData,const Shape & scoreOutShape,_Float16 * roiOutData,const Shape & roiOutShape,int32_t * classOutData,const Shape & classOutShape,int32_t * detectionOutData,const Shape & detectionOutShape)1556 bool detectionPostprocessFloat16(
1557 const _Float16* scoreData, const Shape& scoreShape, const _Float16* deltaData,
1558 const Shape& deltaShape, const _Float16* anchorData, const Shape& anchorShape, float scaleY,
1559 float scaleX, float scaleH, float scaleW, bool useRegularNms, int32_t maxNumDetections,
1560 int32_t maxClassesPerDetection, int32_t maxNumDetectionsPerClass, float iouThreshold,
1561 float scoreThreshold, bool isBGInLabel, _Float16* scoreOutData, const Shape& scoreOutShape,
1562 _Float16* roiOutData, const Shape& roiOutShape, int32_t* classOutData,
1563 const Shape& classOutShape, int32_t* detectionOutData, const Shape& detectionOutShape) {
1564 std::vector<float> scores_float32(getNumberOfElements(scoreShape));
1565 convertFloat16ToFloat32(scoreData, &scores_float32);
1566 std::vector<float> delta_float32(getNumberOfElements(deltaShape));
1567 convertFloat16ToFloat32(deltaData, &delta_float32);
1568 std::vector<float> anchor_float32(getNumberOfElements(anchorShape));
1569 convertFloat16ToFloat32(anchorData, &anchor_float32);
1570 std::vector<float> outputScore_float32(getNumberOfElements(scoreOutShape));
1571 std::vector<float> outputRoi_float32(getNumberOfElements(roiOutShape));
1572 NN_RET_CHECK(detectionPostprocessFloat32(
1573 scores_float32.data(), scoreShape, delta_float32.data(), deltaShape,
1574 anchor_float32.data(), anchorShape, scaleY, scaleX, scaleH, scaleW, useRegularNms,
1575 maxNumDetections, maxClassesPerDetection, maxNumDetectionsPerClass, iouThreshold,
1576 scoreThreshold, isBGInLabel, outputScore_float32.data(), scoreOutShape,
1577 outputRoi_float32.data(), roiOutShape, classOutData, classOutShape, detectionOutData,
1578 detectionOutShape));
1579 convertFloat32ToFloat16(outputScore_float32, scoreOutData);
1580 convertFloat32ToFloat16(outputRoi_float32, roiOutData);
1581 return true;
1582 }
1583
1584 } // namespace
1585 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
1586
validate(const IOperationValidationContext * context)1587 Result<Version> validate(const IOperationValidationContext* context) {
1588 NN_RET_CHECK_EQ(context->getNumInputs(), kNumInputs);
1589 NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
1590 std::vector<OperandType> inExpectedTypes;
1591 std::vector<OperandType> outExpectedTypes;
1592 auto inputType = context->getInputType(kScoreTensor);
1593 if (inputType == OperandType::TENSOR_FLOAT16) {
1594 inExpectedTypes = {OperandType::TENSOR_FLOAT16, OperandType::TENSOR_FLOAT16,
1595 OperandType::TENSOR_FLOAT16, OperandType::FLOAT16,
1596 OperandType::FLOAT16, OperandType::FLOAT16,
1597 OperandType::FLOAT16, OperandType::BOOL,
1598 OperandType::INT32, OperandType::INT32,
1599 OperandType::INT32, OperandType::FLOAT16,
1600 OperandType::FLOAT16, OperandType::BOOL};
1601 } else if (inputType == OperandType::TENSOR_FLOAT32) {
1602 inExpectedTypes = {OperandType::TENSOR_FLOAT32, OperandType::TENSOR_FLOAT32,
1603 OperandType::TENSOR_FLOAT32, OperandType::FLOAT32,
1604 OperandType::FLOAT32, OperandType::FLOAT32,
1605 OperandType::FLOAT32, OperandType::BOOL,
1606 OperandType::INT32, OperandType::INT32,
1607 OperandType::INT32, OperandType::FLOAT32,
1608 OperandType::FLOAT32, OperandType::BOOL};
1609 } else {
1610 NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1611 }
1612 NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
1613 NN_RET_CHECK(validateOutputTypes(
1614 context, {inputType, inputType, OperandType::TENSOR_INT32, OperandType::TENSOR_INT32}));
1615 return Version::ANDROID_Q;
1616 }
1617
1618 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
prepare(IOperationExecutionContext * context)1619 bool prepare(IOperationExecutionContext* context) {
1620 Shape scoreShape = context->getInputShape(kScoreTensor);
1621 Shape deltasShape = context->getInputShape(kDeltaTensor);
1622 Shape anchorsShape = context->getInputShape(kAnchorTensor);
1623 Shape outputScoreShape = context->getOutputShape(kOutputScoreTensor);
1624 Shape outputRoiShape = context->getOutputShape(kOutputRoiTensor);
1625 Shape outputClassShape = context->getOutputShape(kOutputClassTensor);
1626 Shape outputDetectionShape = context->getOutputShape(kOutputDetectionTensor);
1627
1628 NN_RET_CHECK_EQ(getNumberOfDimensions(scoreShape), 3);
1629 NN_RET_CHECK_EQ(getNumberOfDimensions(deltasShape), 3);
1630 NN_RET_CHECK_EQ(getNumberOfDimensions(anchorsShape), 2);
1631
1632 const uint32_t kRoiDim = 4;
1633 uint32_t numBatches = getSizeOfDimension(scoreShape, 0);
1634 uint32_t numAnchors = getSizeOfDimension(scoreShape, 1);
1635 uint32_t numClasses = getSizeOfDimension(scoreShape, 2);
1636 uint32_t lengthBoxEncoding = getSizeOfDimension(deltasShape, 2);
1637 uint32_t maxNumDetections = context->getInputValue<int32_t>(kMaxNumDetectionScalar);
1638 uint32_t maxClassesPerDetection =
1639 context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar);
1640 uint32_t numOutDetections = maxNumDetections;
1641
1642 NN_RET_CHECK_EQ(getSizeOfDimension(deltasShape, 0), numBatches);
1643 NN_RET_CHECK_EQ(getSizeOfDimension(deltasShape, 1), numAnchors);
1644 NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 0), numAnchors);
1645 NN_RET_CHECK_EQ(getSizeOfDimension(anchorsShape, 1), kRoiDim);
1646
1647 if (scoreShape.type == OperandType::TENSOR_FLOAT32) {
1648 NN_RET_CHECK_GT(context->getInputValue<float>(kScaleYScalar), 0);
1649 NN_RET_CHECK_GT(context->getInputValue<float>(kScaleXScalar), 0);
1650 NN_RET_CHECK_GT(context->getInputValue<float>(kScaleHScalar), 0);
1651 NN_RET_CHECK_GT(context->getInputValue<float>(kScaleWScalar), 0);
1652 NN_RET_CHECK_GE(context->getInputValue<float>(kScoreThresholdScalar), 0);
1653 NN_RET_CHECK_GE(context->getInputValue<float>(kIoUThresholdScalar), 0);
1654 } else if (scoreShape.type == OperandType::TENSOR_FLOAT16) {
1655 NN_RET_CHECK(context->getInputValue<_Float16>(kScaleYScalar) > 0);
1656 NN_RET_CHECK(context->getInputValue<_Float16>(kScaleXScalar) > 0);
1657 NN_RET_CHECK(context->getInputValue<_Float16>(kScaleHScalar) > 0);
1658 NN_RET_CHECK(context->getInputValue<_Float16>(kScaleWScalar) > 0);
1659 NN_RET_CHECK(context->getInputValue<_Float16>(kScoreThresholdScalar) >= 0);
1660 NN_RET_CHECK(context->getInputValue<_Float16>(kIoUThresholdScalar) >= 0);
1661 } else {
1662 NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1663 }
1664 NN_RET_CHECK_GT(numClasses, 1);
1665 NN_RET_CHECK_GE(lengthBoxEncoding, 4);
1666 NN_RET_CHECK_GT(maxNumDetections, 0);
1667 if (context->getInputValue<bool>(kUseRegularNmsScalar)) {
1668 NN_RET_CHECK_GT(context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar), 0);
1669 } else {
1670 NN_RET_CHECK_GT(maxClassesPerDetection, 0);
1671 numOutDetections *= maxClassesPerDetection;
1672 }
1673
1674 outputScoreShape.type = scoreShape.type;
1675 outputScoreShape.dimensions = {numBatches, numOutDetections};
1676 NN_RET_CHECK(context->setOutputShape(kOutputScoreTensor, outputScoreShape));
1677
1678 outputRoiShape.type = anchorsShape.type;
1679 outputRoiShape.dimensions = {numBatches, numOutDetections, 4};
1680 NN_RET_CHECK(context->setOutputShape(kOutputRoiTensor, outputRoiShape));
1681
1682 outputClassShape.type = OperandType::TENSOR_INT32;
1683 outputClassShape.dimensions = {numBatches, numOutDetections};
1684 NN_RET_CHECK(context->setOutputShape(kOutputClassTensor, outputClassShape));
1685
1686 outputDetectionShape.type = OperandType::TENSOR_INT32;
1687 outputDetectionShape.dimensions = {numBatches};
1688 NN_RET_CHECK(context->setOutputShape(kOutputDetectionTensor, outputDetectionShape));
1689 return true;
1690 }
1691
execute(IOperationExecutionContext * context)1692 bool execute(IOperationExecutionContext* context) {
1693 NNTRACE_TRANS("detectionPostProcess");
1694 switch (context->getInputType(kScoreTensor)) {
1695 case OperandType::TENSOR_FLOAT16: {
1696 return detectionPostprocessFloat16(
1697 context->getInputBuffer<_Float16>(kScoreTensor),
1698 context->getInputShape(kScoreTensor),
1699 context->getInputBuffer<_Float16>(kDeltaTensor),
1700 context->getInputShape(kDeltaTensor),
1701 context->getInputBuffer<_Float16>(kAnchorTensor),
1702 context->getInputShape(kAnchorTensor),
1703 context->getInputValue<_Float16>(kScaleYScalar),
1704 context->getInputValue<_Float16>(kScaleXScalar),
1705 context->getInputValue<_Float16>(kScaleHScalar),
1706 context->getInputValue<_Float16>(kScaleWScalar),
1707 context->getInputValue<bool>(kUseRegularNmsScalar),
1708 context->getInputValue<int32_t>(kMaxNumDetectionScalar),
1709 context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar),
1710 context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar),
1711 context->getInputValue<_Float16>(kIoUThresholdScalar),
1712 context->getInputValue<_Float16>(kScoreThresholdScalar),
1713 context->getInputValue<bool>(kIsBGInLabelScalar),
1714 context->getOutputBuffer<_Float16>(kOutputScoreTensor),
1715 context->getOutputShape(kOutputScoreTensor),
1716 context->getOutputBuffer<_Float16>(kOutputRoiTensor),
1717 context->getOutputShape(kOutputRoiTensor),
1718 context->getOutputBuffer<int32_t>(kOutputClassTensor),
1719 context->getOutputShape(kOutputClassTensor),
1720 context->getOutputBuffer<int32_t>(kOutputDetectionTensor),
1721 context->getOutputShape(kOutputDetectionTensor));
1722 }
1723 case OperandType::TENSOR_FLOAT32: {
1724 return detectionPostprocessFloat32(
1725 context->getInputBuffer<float>(kScoreTensor),
1726 context->getInputShape(kScoreTensor),
1727 context->getInputBuffer<float>(kDeltaTensor),
1728 context->getInputShape(kDeltaTensor),
1729 context->getInputBuffer<float>(kAnchorTensor),
1730 context->getInputShape(kAnchorTensor),
1731 context->getInputValue<float>(kScaleYScalar),
1732 context->getInputValue<float>(kScaleXScalar),
1733 context->getInputValue<float>(kScaleHScalar),
1734 context->getInputValue<float>(kScaleWScalar),
1735 context->getInputValue<bool>(kUseRegularNmsScalar),
1736 context->getInputValue<int32_t>(kMaxNumDetectionScalar),
1737 context->getInputValue<int32_t>(kMaxClassesPerDetectionScalar),
1738 context->getInputValue<int32_t>(kMaxNumDetectionPerClassScalar),
1739 context->getInputValue<float>(kIoUThresholdScalar),
1740 context->getInputValue<float>(kScoreThresholdScalar),
1741 context->getInputValue<bool>(kIsBGInLabelScalar),
1742 context->getOutputBuffer<float>(kOutputScoreTensor),
1743 context->getOutputShape(kOutputScoreTensor),
1744 context->getOutputBuffer<float>(kOutputRoiTensor),
1745 context->getOutputShape(kOutputRoiTensor),
1746 context->getOutputBuffer<int32_t>(kOutputClassTensor),
1747 context->getOutputShape(kOutputClassTensor),
1748 context->getOutputBuffer<int32_t>(kOutputDetectionTensor),
1749 context->getOutputShape(kOutputDetectionTensor));
1750 }
1751 default:
1752 NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
1753 }
1754 }
1755 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
1756
1757 } // namespace detection_postprocess
1758
1759 } // namespace bbox_ops
1760
// Register the four bounding-box operations with the operation resolver.
// AXIS_ALIGNED_BBOX_TRANSFORM and BOX_WITH_NMS_LIMIT opt into zero-sized
// input support; GENERATE_PROPOSALS and DETECTION_POSTPROCESSING do not.
NN_REGISTER_OPERATION(AXIS_ALIGNED_BBOX_TRANSFORM,
                      bbox_ops::axis_aligned_bbox_transform::kOperationName,
                      bbox_ops::axis_aligned_bbox_transform::validate,
                      bbox_ops::axis_aligned_bbox_transform::prepare,
                      bbox_ops::axis_aligned_bbox_transform::execute, .allowZeroSizedInput = true);

NN_REGISTER_OPERATION(BOX_WITH_NMS_LIMIT, bbox_ops::box_with_nms_limit::kOperationName,
                      bbox_ops::box_with_nms_limit::validate, bbox_ops::box_with_nms_limit::prepare,
                      bbox_ops::box_with_nms_limit::execute, .allowZeroSizedInput = true);

NN_REGISTER_OPERATION(GENERATE_PROPOSALS, bbox_ops::generate_proposals::kOperationName,
                      bbox_ops::generate_proposals::validate, bbox_ops::generate_proposals::prepare,
                      bbox_ops::generate_proposals::execute);

NN_REGISTER_OPERATION(DETECTION_POSTPROCESSING, bbox_ops::detection_postprocess::kOperationName,
                      bbox_ops::detection_postprocess::validate,
                      bbox_ops::detection_postprocess::prepare,
                      bbox_ops::detection_postprocess::execute);
1779 } // namespace nn
1780 } // namespace android
1781