/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Operations"

#include <algorithm>
#include <cfloat>
#include <limits>
#include <vector>

#include "OperationResolver.h"
#include "Tracing.h"
#include "nnapi/Validation.h"

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>

#include "CpuOperationUtils.h"
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {

namespace softmax {

constexpr char kOperationName[] = "SOFTMAX";

constexpr uint32_t kNumInputs = 3;
constexpr uint32_t kInputTensor = 0;
constexpr uint32_t kBetaScalar = 1;
constexpr uint32_t kAxisScalar = 2;

constexpr uint32_t kNumOutputs = 1;
constexpr uint32_t kOutputTensor = 0;
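
// SOFTMAX takes an input tensor, a positive beta scalar, and an optional axis
// scalar; when the axis input is omitted, execute() below falls back to the
// last dimension (axis = -1).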

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

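// Reference (slow) softmax along an arbitrary axis:
//
//   out[i] = exp(beta * (in[i] - max)) / sum_j exp(beta * (in[j] - max))
//
// where the max and the sum run over the softmax axis. Subtracting the max
// keeps every exp() argument non-positive, which avoids overflow and cancels
// out in the normalized result.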
inline bool softmaxSlowFloat32(const float* inputData, const Shape& inputShape, const float beta,
                               int32_t axis, float* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxFloatSlow32");
    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
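    // The tensor is treated as [outerSize, axisSize, innerSize], and softmax is
    // computed over the middle (axis) dimension independently for each
    // (outer, inner) pair. For example, a [2, 3, 4, 5] input with axis = 1
    // yields outerSize = 2, axisSize = 3, innerSize = 20, so consecutive
    // elements along the axis are innerSize apart in memory (hence the
    // p += innerSize strides below).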
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const float* inputBeg = inputData + outer * axisSize * innerSize;
        const float* inputEnd = inputBeg + axisSize * innerSize;
        float* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            float maxValue = -FLT_MAX;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }
            // Compute sum
            float sum = 0.0f;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                sum += std::exp((*p - maxValue) * beta);
            }
            // Compute result
            float* pOut = outputBeg;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                *pOut = std::exp((*p - maxValue) * beta) / sum;
            }
        }
    }
    return true;
}

bool softmaxFloat32(const float* inputData, const Shape& inputShape, const float beta, int32_t axis,
                    float* outputData, const Shape& outputShape) {
    int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));
    // TFLite optimized implementation only supports computation along the last axis
    if (axis == ndim - 1) {
        NNTRACE_COMP("optimized_ops::Softmax::float");
        tflite::SoftmaxParams param = {.beta = beta};
        tflite::optimized_ops::Softmax(param, convertShapeToTflshape(inputShape), inputData,
                                       convertShapeToTflshape(outputShape), outputData);
        return true;
    } else {
        return softmaxSlowFloat32(inputData, inputShape, beta, axis, outputData, outputShape);
    }
}

bool softmaxFloat16(const _Float16* inputData, const Shape& inputShape, const float beta,
                    int32_t axis, _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxFloat16");
    std::vector<float> inputData_float32(getNumberOfElements(inputShape));
    convertFloat16ToFloat32(inputData, &inputData_float32);
    std::vector<float> outputData_float32(getNumberOfElements(outputShape));

    softmaxFloat32(inputData_float32.data(), inputShape, beta, axis, outputData_float32.data(),
                   outputShape);
    convertFloat32ToFloat16(outputData_float32, outputData);

    return true;
}

template <typename T>
bool softmaxQuant8Impl(const T* inputData, const Shape& inputShape, const float beta, int32_t axis,
                       int32_t inputMultiplier, int32_t inputLeftShift, float diffMin,
                       T* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxQuant8");
    // The representation chosen for the input to the exp() function is Q5.26.
    // We need to leave extra space since values that we skip might be as large as
    // -32 before multiplying by input_beta_multiplier, and therefore as large as
    // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
    // accumulation, but exp(-16) definitely is.
    static const int32_t kScaledDiffIntegerBits = 5;
    static const int kAccumulationIntegerBits = 12;
    using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
    using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
    using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
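    // Same [outerSize, axisSize, innerSize] decomposition as in
    // softmaxSlowFloat32() above; only the arithmetic differs, using gemmlowp
    // fixed-point instead of float.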
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const T* inputBeg = inputData + outer * axisSize * innerSize;
        const T* inputEnd = inputBeg + axisSize * innerSize;
        T* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            T maxValue = std::is_same_v<T, int8_t> ? -128 : 0;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }

            // Compute sum
            FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);
                    sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                                        exp_on_negative_values(scaled_diff_f8));
                }
            }

            uint32_t fixed_sum_of_exps = static_cast<uint32_t>(sum_of_exps.raw());
            int32_t headroom_plus_one = tflite::CountLeadingZeros(fixed_sum_of_exps);
            // This is the number of bits to the left of the binary point above 1.0.
            // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
            // no later adjustment will be needed.
            int32_t num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
            int32_t shifted_sum_minus_one = static_cast<int32_t>(
                    (fixed_sum_of_exps << headroom_plus_one) - (static_cast<uint32_t>(1) << 31));

            FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
                    FixedPoint0::FromRaw(shifted_sum_minus_one));
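            // Roughly: the fixed-point sum is normalized into [1, 2) by
            // shifting out its headroom, shifted_scale then approximates
            // 1 / (normalized sum), and the power of two removed here
            // (num_bits_over_unit) is folded back into the RoundingDivideByPOT
            // below, which also rescales shifted_scale * exp into the 8-bit
            // output range.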

            // Compute result
            constexpr int32_t q_min = std::numeric_limits<T>::min();
            constexpr int32_t q_max = std::numeric_limits<T>::max();
            T* pOut = outputBeg;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);

                    FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
                    int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
                            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
                    if (std::is_same_v<T, int8_t>) {
                        unsat_output -= 128;
                    }

                    *pOut = static_cast<T>(std::max(std::min(unsat_output, q_max), q_min));

                } else {
                    *pOut = std::is_same_v<T, int8_t> ? -128 : 0;
                }
            }
        }
    }
    return true;
}

template <typename T>
bool softmaxQuant8(const T* inputData, const Shape& inputShape, const float beta, int32_t axis,
                   T* outputData, const Shape& outputShape) {
    int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));

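    // The output must use scale = 1/256 with zero point 0 (unsigned) or -128
    // (signed) so that the softmax range [0, 1) maps onto the full set of 256
    // representable quantized values.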
    if ((inputShape.type == OperandType::TENSOR_QUANT8_ASYMM && outputShape.offset != 0) ||
        (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED &&
         outputShape.offset != -128) ||
        outputShape.scale != 1.f / 256) {
        LOG(ERROR) << "incorrect scale / offset for output";
        return false;
    }

    static const int32_t kScaledDiffIntegerBits = 5;
    const double input_beta_real_multiplier =
            std::min(1.0 * beta * inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)),
                     (1LL << 31) - 1.0);

    int32_t inputMultiplier = 0, inputLeftShift = 0;
    if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &inputMultiplier,
                                          &inputLeftShift)) {
        return false;
    }
    int32_t diffMin = -CalculateInputRadius(kScaledDiffIntegerBits, inputLeftShift);
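    // Differences below diffMin produce exp() terms too small to affect the
    // sum; softmaxQuant8Impl() skips them and writes the minimum quantized
    // value for those outputs.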

    return softmaxQuant8Impl(inputData, inputShape, beta, axis, inputMultiplier, inputLeftShift,
                             diffMin, outputData, outputShape);
}

}  // namespace
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

Result<Version> validate(const IOperationValidationContext* context) {
    NN_RET_CHECK(context->getNumInputs() == kNumInputs ||
                 context->getNumInputs() == kNumInputs - 1);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
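    // SOFTMAX accepts either two inputs (input tensor, beta) or three inputs
    // (input tensor, beta, explicit axis); the explicit-axis form is gated on
    // Android Q below.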
    auto inputType = context->getInputType(kInputTensor);
    std::vector<OperandType> inExpectedTypes;
    auto minSupportedVersion = Version::ANDROID_OC_MR1;
    if (inputType == OperandType::TENSOR_FLOAT32 || inputType == OperandType::TENSOR_QUANT8_ASYMM) {
        minSupportedVersion = Version::ANDROID_OC_MR1;
        inExpectedTypes = {inputType, OperandType::FLOAT32};
    } else if (inputType == OperandType::TENSOR_FLOAT16) {
        minSupportedVersion = Version::ANDROID_Q;
        inExpectedTypes = {inputType, OperandType::FLOAT16};
    } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        minSupportedVersion = Version::ANDROID_R;
        inExpectedTypes = {inputType, OperandType::FLOAT32};
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    const auto inputRank = getNumberOfDimensions(context->getInputShape(kInputTensor));
    if (inputRank != 0) {
        NN_RET_CHECK_LE(inputRank, 4);
    }
    if (context->getNumInputs() == kNumInputs) {
        minSupportedVersion = combineVersions(minSupportedVersion, Version::ANDROID_Q);
        inExpectedTypes.push_back(OperandType::INT32);
    } else {
        if (inputRank != 2 && inputRank != 4 && inputRank != 0) {
            minSupportedVersion = combineVersions(minSupportedVersion, Version::ANDROID_Q);
        }
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(context, {inputType}));
    return minSupportedVersion;
}

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
bool prepare(IOperationExecutionContext* context) {
    Shape input = context->getInputShape(kInputTensor);
    float beta = (input.type == OperandType::TENSOR_FLOAT16)
                         ? context->getInputValue<_Float16>(kBetaScalar)
                         : context->getInputValue<float>(kBetaScalar);
    NN_RET_CHECK_LE(getNumberOfDimensions(input), 4);
    NN_RET_CHECK_GT(beta, 0.0f);
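    // Softmax preserves the tensor shape: the output simply inherits the input
    // dimensions and keeps its own type and quantization parameters.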
    Shape output = context->getOutputShape(kOutputTensor);
    output.dimensions = input.dimensions;
    return context->setOutputShape(kOutputTensor, output);
}

bool execute(IOperationExecutionContext* context) {
    // Bypass execution in the case of zero-sized input.
    if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
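    // If the optional axis input is absent, default to the last dimension.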
    int32_t axis = (context->getNumInputs() == kNumInputs)
                           ? context->getInputValue<int32_t>(kAxisScalar)
                           : -1;
    switch (context->getInputType(kInputTensor)) {
        case OperandType::TENSOR_FLOAT16:
            return softmaxFloat16(context->getInputBuffer<_Float16>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<_Float16>(kBetaScalar), axis,
                                  context->getOutputBuffer<_Float16>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_FLOAT32:
            return softmaxFloat32(context->getInputBuffer<float>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<float>(kBetaScalar), axis,
                                  context->getOutputBuffer<float>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM:
            return softmaxQuant8(context->getInputBuffer<uint8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<uint8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
            return softmaxQuant8(context->getInputBuffer<int8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<int8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

}  // namespace softmax

NN_REGISTER_OPERATION(SOFTMAX, "SOFTMAX", softmax::validate, softmax::prepare, softmax::execute,
                      .allowZeroSizedInput = true);

}  // namespace nn
}  // namespace android