/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Operations"

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <limits>
#include <vector>

#include "OperationResolver.h"
#include "Tracing.h"
#include "nnapi/Validation.h"

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>

#include "CpuOperationUtils.h"
#endif // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {

namespace softmax {

constexpr char kOperationName[] = "SOFTMAX";

constexpr uint32_t kNumInputs = 3;
constexpr uint32_t kInputTensor = 0;
constexpr uint32_t kBetaScalar = 1;
constexpr uint32_t kAxisScalar = 2;

constexpr uint32_t kNumOutputs = 1;
constexpr uint32_t kOutputTensor = 0;
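
// Note on the operand layout above: SOFTMAX accepts either two inputs (input
// tensor and beta) or three (input tensor, beta, and an explicit axis). When
// the axis operand is omitted, execute() below defaults it to -1, i.e. the
// last dimension.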

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

inline bool softmaxSlowFloat32(const float* inputData, const Shape& inputShape, const float beta,
                               int32_t axis, float* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxSlowFloat32");
    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const float* inputBeg = inputData + outer * axisSize * innerSize;
        const float* inputEnd = inputBeg + axisSize * innerSize;
        float* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            float maxValue = -FLT_MAX;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }
            // Compute sum
            float sum = 0.0f;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                sum += std::exp((*p - maxValue) * beta);
            }
            // Compute result
            float* pOut = outputBeg;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                *pOut = std::exp((*p - maxValue) * beta) / sum;
            }
        }
    }
    return true;
}
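
// Illustrative example: for an input of shape [2, 3, 4] with axis = 1, the
// decomposition above gives outerSize = 2, axisSize = 3 and innerSize = 4.
// The elements reduced together along the axis are strided innerSize (= 4)
// apart, which is why the loops above advance p by innerSize between
// inputBeg and inputEnd.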

bool softmaxFloat32(const float* inputData, const Shape& inputShape, const float beta, int32_t axis,
                    float* outputData, const Shape& outputShape) {
    int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));
    // TFLite optimized implementation only supports computation along the last axis
    if (axis == ndim - 1) {
        NNTRACE_COMP("optimized_ops::Softmax::float");
        tflite::SoftmaxParams param = {.beta = beta};
        tflite::optimized_ops::Softmax(param, convertShapeToTflshape(inputShape), inputData,
                                       convertShapeToTflshape(outputShape), outputData);
        return true;
    } else {
        return softmaxSlowFloat32(inputData, inputShape, beta, axis, outputData, outputShape);
    }
}

bool softmaxFloat16(const _Float16* inputData, const Shape& inputShape, const float beta,
                    int32_t axis, _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxFloat16");
    std::vector<float> inputData_float32(getNumberOfElements(inputShape));
    convertFloat16ToFloat32(inputData, &inputData_float32);
    std::vector<float> outputData_float32(getNumberOfElements(outputShape));

    softmaxFloat32(inputData_float32.data(), inputShape, beta, axis, outputData_float32.data(),
                   outputShape);
    convertFloat32ToFloat16(outputData_float32, outputData);

    return true;
}

template <typename T>
bool softmaxQuant8Impl(const T* inputData, const Shape& inputShape, const float beta, int32_t axis,
                       int32_t inputMultiplier, int32_t inputLeftShift, int32_t diffMin,
                       T* outputData, const Shape& outputShape) {
119 NNTRACE_TRANS("softmaxQuant8");
120 // The representation chosen for the input to the exp() function is Q5.26.
121 // We need to leave extra space since values that we skip might be as large as
122 // -32 before multiplying by input_beta_multiplier, and therefore as large as
123 // -16 afterwards. Note that exp(-8) is definitely not insignificant to
124 // accumulation, but exp(-16) definitely is.
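    // Illustrative example: with kScaledDiffIntegerBits = 5,
    // FixedPointScaledDiff is a Q5.26 value, so a raw int32 r represents
    // r / 2^26 and the representable range is roughly [-32, 32). The rescaled
    // differences fed to exp_on_negative_values() below are always
    // non-positive, so they fall within [-32, 0].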
    static const int32_t kScaledDiffIntegerBits = 5;
    static const int kAccumulationIntegerBits = 12;
    using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
    using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
    using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const T* inputBeg = inputData + outer * axisSize * innerSize;
        const T* inputEnd = inputBeg + axisSize * innerSize;
        T* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            T maxValue = std::is_same_v<T, int8_t> ? -128 : 0;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }

            // Compute sum
            FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);
                    sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                                        exp_on_negative_values(scaled_diff_f8));
                }
            }

            uint32_t fixed_sum_of_exps = static_cast<uint32_t>(sum_of_exps.raw());
            int32_t headroom_plus_one = tflite::CountLeadingZeros(fixed_sum_of_exps);
            // This is the number of bits to the left of the binary point above 1.0.
            // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
            // no later adjustment will be needed.
            int32_t num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
            int32_t shifted_sum_minus_one = static_cast<int32_t>(
                    (fixed_sum_of_exps << headroom_plus_one) - (static_cast<uint32_t>(1) << 31));

            FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
                    FixedPoint0::FromRaw(shifted_sum_minus_one));

            // Compute result
            constexpr int32_t q_min = std::numeric_limits<T>::min();
            constexpr int32_t q_max = std::numeric_limits<T>::max();
            T* pOut = outputBeg;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);

                    FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
                    int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
                            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
                    if (std::is_same_v<T, int8_t>) {
                        unsat_output -= 128;
                    }

                    *pOut = static_cast<T>(std::max(std::min(unsat_output, q_max), q_min));
                } else {
                    *pOut = std::is_same_v<T, int8_t> ? -128 : 0;
                }
            }
        }
    }
    return true;
}
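
// Illustrative example: the wrapper below requires an output scale of 1/256
// (with zero point 0 for uint8_t, or -128 for int8_t), so a softmax
// probability p is stored as approximately round(p * 256), saturated to
// [q_min, q_max]. For instance, p = 0.5 becomes 128 for uint8_t, or 0 for
// int8_t after the -128 shift above.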

template <typename T>
bool softmaxQuant8(const T* inputData, const Shape& inputShape, const float beta, int32_t axis,
                   T* outputData, const Shape& outputShape) {
    NN_CHECK(handleNegativeAxis(inputShape, &axis));

    if ((inputShape.type == OperandType::TENSOR_QUANT8_ASYMM && outputShape.offset != 0) ||
        (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED &&
         outputShape.offset != -128) ||
        outputShape.scale != 1.f / 256) {
        LOG(ERROR) << "incorrect scale / offset for output";
        return false;
    }

    static const int32_t kScaledDiffIntegerBits = 5;
    const double input_beta_real_multiplier =
            std::min(1.0 * beta * inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)),
                     (1LL << 31) - 1.0);

    int32_t inputMultiplier = 0, inputLeftShift = 0;
    if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &inputMultiplier,
                                          &inputLeftShift)) {
        return false;
    }
    int32_t diffMin = -CalculateInputRadius(kScaledDiffIntegerBits, inputLeftShift);

    return softmaxQuant8Impl(inputData, inputShape, beta, axis, inputMultiplier, inputLeftShift,
                             diffMin, outputData, outputShape);
}
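
// Worked example (illustrative, assuming the usual frexp-based decomposition
// in QuantizeMultiplierGreaterThanOne): with beta = 1 and inputShape.scale =
// 1/256, input_beta_real_multiplier = (1/256) * 2^26 = 2^18. Since
// 2^18 = 0.5 * 2^19, this would yield inputMultiplier = 2^30 (0.5 in Q31
// format) and inputLeftShift = 19.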

} // namespace
#endif // NN_INCLUDE_CPU_IMPLEMENTATION

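// Summary of the checks below: TENSOR_FLOAT32 and TENSOR_QUANT8_ASYMM inputs
// are accepted from Android O-MR1, TENSOR_FLOAT16 from Android Q, and
// TENSOR_QUANT8_ASYMM_SIGNED from Android R. Passing the optional axis
// operand raises the minimum version to Android Q, as does any known input
// rank other than 2 or 4.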
Result<Version> validate(const IOperationValidationContext* context) {
    NN_RET_CHECK(context->getNumInputs() == kNumInputs ||
                 context->getNumInputs() == kNumInputs - 1);
    NN_RET_CHECK_EQ(context->getNumOutputs(), kNumOutputs);
    auto inputType = context->getInputType(kInputTensor);
    std::vector<OperandType> inExpectedTypes;
    auto minSupportedVersion = Version::ANDROID_OC_MR1;
    if (inputType == OperandType::TENSOR_FLOAT32 || inputType == OperandType::TENSOR_QUANT8_ASYMM) {
        minSupportedVersion = Version::ANDROID_OC_MR1;
        inExpectedTypes = {inputType, OperandType::FLOAT32};
    } else if (inputType == OperandType::TENSOR_FLOAT16) {
        minSupportedVersion = Version::ANDROID_Q;
        inExpectedTypes = {inputType, OperandType::FLOAT16};
    } else if (inputType == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
        minSupportedVersion = Version::ANDROID_R;
        inExpectedTypes = {inputType, OperandType::FLOAT32};
    } else {
        NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
    const auto inputRank = getNumberOfDimensions(context->getInputShape(kInputTensor));
    if (inputRank != 0) {
        NN_RET_CHECK_LE(inputRank, 4);
    }
    if (context->getNumInputs() == kNumInputs) {
        minSupportedVersion = combineVersions(minSupportedVersion, Version::ANDROID_Q);
        inExpectedTypes.push_back(OperandType::INT32);
    } else {
        if (inputRank != 2 && inputRank != 4 && inputRank != 0) {
            minSupportedVersion = combineVersions(minSupportedVersion, Version::ANDROID_Q);
        }
    }
    NN_RET_CHECK(validateInputTypes(context, inExpectedTypes));
    NN_RET_CHECK(validateOutputTypes(context, {inputType}));
    return minSupportedVersion;
}

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
bool prepare(IOperationExecutionContext* context) {
    Shape input = context->getInputShape(kInputTensor);
    float beta = (input.type == OperandType::TENSOR_FLOAT16)
                         ? context->getInputValue<_Float16>(kBetaScalar)
                         : context->getInputValue<float>(kBetaScalar);
    NN_RET_CHECK_LE(getNumberOfDimensions(input), 4);
    NN_RET_CHECK_GT(beta, 0.0f);
    Shape output = context->getOutputShape(kOutputTensor);
    output.dimensions = input.dimensions;
    return context->setOutputShape(kOutputTensor, output);
}

bool execute(IOperationExecutionContext* context) {
    // Bypass execution in the case of zero-sized input.
    if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
    int32_t axis = (context->getNumInputs() == kNumInputs)
                           ? context->getInputValue<int32_t>(kAxisScalar)
                           : -1;
    switch (context->getInputType(kInputTensor)) {
        case OperandType::TENSOR_FLOAT16:
            return softmaxFloat16(context->getInputBuffer<_Float16>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<_Float16>(kBetaScalar), axis,
                                  context->getOutputBuffer<_Float16>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_FLOAT32:
            return softmaxFloat32(context->getInputBuffer<float>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<float>(kBetaScalar), axis,
                                  context->getOutputBuffer<float>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM:
            return softmaxQuant8(context->getInputBuffer<uint8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<uint8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
            return softmaxQuant8(context->getInputBuffer<int8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<int8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
#endif // NN_INCLUDE_CPU_IMPLEMENTATION

} // namespace softmax

NN_REGISTER_OPERATION(SOFTMAX, "SOFTMAX", softmax::validate, softmax::prepare, softmax::execute,
                      .allowZeroSizedInput = true);

} // namespace nn
} // namespace android