1 /*
2 * Copyright (C) 2021 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <android-base/logging.h>
18 #include <android-base/unique_fd.h>
19 #include <android/hardware_buffer.h>
20 #include <gtest/gtest.h>
21 #include <vulkan/vulkan.h>
22 #include <vulkan/vulkan_android.h>
23
24 #include <algorithm>
#include <cctype>
25 #include <cmath>
26 #include <cstring>
27 #include <memory>
#include <optional>
28 #include <string>
29 #include <utility>
30 #include <vector>
31
32 #include "TestNeuralNetworksWrapper.h"
33
34 #ifndef NNTEST_ONLY_PUBLIC_API
35 #include "Manager.h"
36 #endif
37
38 namespace android::nn {
39 namespace {
40
41 using Type = test_wrapper::Type;
42 using OperandType = test_wrapper::OperandType;
43 using Result = test_wrapper::Result;
44
45 constexpr uint32_t kOperandSizeX = 256;
46 constexpr uint32_t kOperandSizeY = 256;
47 constexpr uint32_t kOperandLength = kOperandSizeX * kOperandSizeY;
48 constexpr uint32_t kNumberOfIterationsToTest = 100;
49 constexpr uint32_t kMaxNumberOfPrintedErrors = 10;
50
51 // This file implements a test suite that exercises a GPU -> NNAPI pipeline using AHardwareBuffers
52 // and sync fences. One pass of the pipeline involves the following three stages:
53 //
54 // - GPU: Invoke the compute shader to clear all elements in the output buffer to the value "1"
55 // of the corresponding element type. Because the GPU may not natively support
56 // float16/int8/uint8 data types, we pack each data type into a 4-byte chunk as uint32_t
57 // and pass it to the shader. E.g., float16 is packed as 0x3c003c00 -- the float16 value
58 // "1" (0x3c00) repeated twice. The compute shader uses this 4-byte chunk to clear
59 // the data in the output buffer (see CLEAR_DATA in the compute shader code).
60 //
61 // The GPU workload will output directly to an AHardwareBuffer and export an Android sync
62 // fence.
63 //
64 // - NNAPI: Execute a broadcast ADD operation
65 //
66 // output = ADD(input, const, act)
67 //
68 // where "input" and "output" are of size [kOperandSizeY, kOperandSizeX], "const" and
69 // "act" are model constant operands, "const" is of size [1] and value "1" of the
70 // corresponding element type, "act" = 0. The ADD operation will increment each element
71 // in the input tensor by 1.
72 //
73 // The NNAPI executor takes the GPU output AHardwareBuffer as its input memory,
74 // and directly outputs to another AHardwareBuffer. We use startComputeWithDependencies
75 // to wait on the sync fence from the GPU workload. If supported, the NNAPI executor will
76 // emit a sync fence; otherwise, it will wait until the workload has finished.
77 //
78 // - Check: Verify that each element in the resulting tensor is 1 + 1 = 2.
79 //
80 // We use the introspection API to run the pipeline with each individual driver. Because this test
81 // was added in NNAPI feature level 5, we exclude devices with a lower feature level. We expect
82 // that if a driver successfully prepares the model, it should finish execution without an error.
83 //
84 // The pipeline is tested with four data types: float32, float16, quant8_asymm, and
85 // quant8_asymm_signed. These data types are chosen to make sure that a driver is likely to
86 // support at least one of the data types.
87 //
88 // For each configuration, we run the pipeline for kNumberOfIterationsToTest iterations.
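//
// In short, the data flow of one iteration is (buffers are AHardwareBuffers, arrows carry sync fences):
//
//   Vulkan compute (clear to 1) --[GPU output buffer + fence]--> NNAPI ADD (+1) --[NNAPI output buffer + fence]--> CPU check (== 2)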
89
90 const std::vector<uint32_t> kComputeShader =
91 #include "shaders/TestGpuNnapi.comp.spv.inl"
92 ;
93
94 // The expected element value in the final NNAPI output AHardwareBuffer.
95 constexpr uint32_t kExpectedResultInInt = 2;
96
97 // Helper template for information related to a primary tensor data type. Only four specializations
98 // exist for this template: Type::TENSOR_FLOAT32, Type::TENSOR_FLOAT16, Type::TENSOR_QUANT8_ASYMM,
99 // and Type::TENSOR_QUANT8_ASYMM_SIGNED. Each specialization corresponds to a primary data type for
100 // the testing pipeline.
101 //
102 // Each template specialization defines the following fields:
103 // - ElementType: The corresponding C++ type. Use sizeof(ElementType) to get the element size.
104 // - kIsQuantized: Whether the data type is a quantized type or not.
105 // - kClearData: The CLEAR_DATA used in the compute shader.
106 // - kTolerance: The absolute tolerance used to check the computation result.
107 template <Type dataType>
108 struct TestTypeHelper;
109 template <>
110 struct TestTypeHelper<Type::TENSOR_FLOAT32> {
111 using ElementType = float;
112 static constexpr bool kIsQuantized = false;
113 // One float32 of value (1.0) packed into uint32_t
114 static constexpr uint32_t kClearData = 0x3f800000;
115 static constexpr double kTolerance = 1e-6;
116 };
117 template <>
118 struct TestTypeHelper<Type::TENSOR_FLOAT16> {
119 using ElementType = _Float16;
120 static constexpr bool kIsQuantized = false;
121 // Two float16 of value (1.0) packed into uint32_t
122 static constexpr uint32_t kClearData = 0x3c003c00;
123 static constexpr double kTolerance = 1e-3;
124 };
125 template <>
126 struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM> {
127 using ElementType = uint8_t;
128 static constexpr bool kIsQuantized = true;
129 // Four uint8_t of value (1) packed into uint32_t
130 static constexpr uint32_t kClearData = 0x01010101;
131 static constexpr double kTolerance = 0;
132 };
133 template <>
134 struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM_SIGNED> {
135 using ElementType = int8_t;
136 static constexpr bool kIsQuantized = true;
137 // Four int8_t of value (1) packed into uint32_t
138 static constexpr uint32_t kClearData = 0x01010101;
139 static constexpr double kTolerance = 0;
140 };
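// As an example of how these fields are consumed below: for TestTypeHelper<Type::TENSOR_FLOAT16>,
// sizeof(ElementType) == 2 determines the buffer size (kOperandLength * 2 bytes), kClearData makes
// each 32-bit store in the shader write two float16 values of 1.0, and kTolerance is the absolute
// error allowed by checkResults().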
141
142 bool isExtensionSupported(const std::vector<VkExtensionProperties>& supportedExtensions,
143 const char* requestedExtension) {
144 return std::any_of(supportedExtensions.begin(), supportedExtensions.end(),
145 [requestedExtension](const auto& extension) {
146 return strcmp(extension.extensionName, requestedExtension) == 0;
147 });
148 }
149
150 // Records the workgroup size and the group counts of dispatching the compute shader.
151 struct DispatchSize {
152 uint32_t workgroupSize;
153 uint32_t groupCountX;
154 uint32_t groupCountY;
155 };
156
157 // Choose an appropriate dispatch size. We are using a square workgroup size.
158 template <Type dataType>
159 DispatchSize chooseDispatchSize(const VkPhysicalDeviceLimits& limits) {
160 // Compute the number of invocations along each dimension.
161 const uint32_t elementSize = sizeof(typename TestTypeHelper<dataType>::ElementType);
162 const uint32_t numberOfElementsPerInvocation = sizeof(uint32_t) / elementSize;
163 const uint32_t workgroupInvocationsX = kOperandSizeX / numberOfElementsPerInvocation;
164 const uint32_t workgroupInvocationsY = kOperandSizeY;
165
166 // Make sure the workgroup size does not exceed the number of invocations along the X and Y
167 // dimensions.
168 uint32_t workgroupSize = std::min(workgroupInvocationsX, workgroupInvocationsY);
169
170 // Make sure the workgroup size does not exceed the device limit along the X and Y dimensions.
171 workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[0]);
172 workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[1]);
173
174 // Make sure the total number of invocations does not exceed the device limit.
175 uint32_t maxSquareWorkGroupSize =
176 static_cast<uint32_t>(std::sqrt(limits.maxComputeWorkGroupInvocations));
177 workgroupSize = std::min(workgroupSize, maxSquareWorkGroupSize);
178
179 // Round down to a power of 2. This is to make sure workgroupInvocationsX and
180 // workgroupInvocationsY are divisible by the workgroup size so that we don't need to apply
181 // bounds checks in the shader.
182 uint32_t power = static_cast<uint32_t>(std::log2(static_cast<float>(workgroupSize)));
183 workgroupSize = 1u << power;
184 CHECK(workgroupInvocationsX % workgroupSize == 0);
185 CHECK(workgroupInvocationsY % workgroupSize == 0);
186
187 return {
188 .workgroupSize = workgroupSize,
189 .groupCountX = workgroupInvocationsX / workgroupSize,
190 .groupCountY = workgroupInvocationsY / workgroupSize,
191 };
192 }
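// Worked example for TENSOR_FLOAT16 on a 256x256 operand: elementSize = 2, so each invocation handles
// two elements, giving workgroupInvocationsX = 128 and workgroupInvocationsY = 256. Assuming a device
// limit of maxComputeWorkGroupInvocations = 1024 (an illustrative value), the square cap is
// sqrt(1024) = 32, so workgroupSize = 32, groupCountX = 128 / 32 = 4, and groupCountY = 256 / 32 = 8.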
193
194 // Find the first memory type index that satisfies the requirements.
195 // See VkAndroidHardwareBufferPropertiesANDROID::memoryTypeBits for the semantics of
196 // "memoryTypeBitsRequirement".
197 std::optional<uint32_t> findMemoryType(const VkPhysicalDeviceMemoryProperties& properties,
198 uint32_t memoryTypeBitsRequirement,
199 VkDeviceSize sizeRequirement) {
200 for (uint32_t memoryIndex = 0; memoryIndex < VK_MAX_MEMORY_TYPES; ++memoryIndex) {
201 const uint32_t memoryTypeBits = (1 << memoryIndex);
202 const bool isRequiredMemoryType = memoryTypeBitsRequirement & memoryTypeBits;
203 const uint32_t heapIndex = properties.memoryTypes[memoryIndex].heapIndex;
204 const bool isLargeEnough = properties.memoryHeaps[heapIndex].size >= sizeRequirement;
205 if (isRequiredMemoryType && isLargeEnough) return memoryIndex;
206 }
207
208 // Failed to find a suitable memory type.
209 return std::nullopt;
210 }
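// Example of the bit semantics: memoryTypeBitsRequirement == 0b0110 means only memory types 1 and 2
// may back the imported AHardwareBuffer; the loop above returns the lowest acceptable index whose
// backing heap is large enough.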
211
212 void addBufferTransitionBarrier(VkCommandBuffer commandBuffer, VkBuffer buffer,
213 VkPipelineStageFlags srcStageMask,
214 VkPipelineStageFlags dstStageMask, VkAccessFlags srcAccessMask,
215 VkAccessFlags dstAccessMask, uint32_t srcQueue, uint32_t dstQueue) {
216 const VkBufferMemoryBarrier bufferBarrier = {
217 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
218 .pNext = nullptr,
219 .srcAccessMask = srcAccessMask,
220 .dstAccessMask = dstAccessMask,
221 .srcQueueFamilyIndex = srcQueue,
222 .dstQueueFamilyIndex = dstQueue,
223 .buffer = buffer,
224 .offset = 0,
225 .size = VK_WHOLE_SIZE,
226 };
227 vkCmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, 0, 0, nullptr, 1,
228 &bufferBarrier, 0, nullptr);
229 }
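// The call sites below pass VK_QUEUE_FAMILY_FOREIGN_EXT as one side of the transfer: the first barrier
// acquires the AHardwareBuffer-backed memory from the foreign (non-Vulkan) owner before the shader
// writes it, and the second releases it back so NNAPI can read the results.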
230
231 void allocateBlobAhwb(uint32_t size, uint64_t usage, AHardwareBuffer** outAhwb) {
232 AHardwareBuffer_Desc desc = {
233 .width = size,
234 .height = 1u,
235 .layers = 1u,
236 .format = AHARDWAREBUFFER_FORMAT_BLOB,
237 .usage = usage,
238 };
239 if (AHardwareBuffer_allocate(&desc, outAhwb) != 0) {
240 GTEST_SKIP() << "Device failed to allocate Android hardware buffer";
241 }
242 }
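// Note that for AHARDWAREBUFFER_FORMAT_BLOB, "width" is the buffer size in bytes and height/layers
// must be 1, so the descriptor above requests a linear blob of exactly "size" bytes.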
243
244 using NameAndDevice = std::pair<const char*, const ANeuralNetworksDevice*>;
245
246 void getNnapiDevices(std::vector<NameAndDevice>* outDevices) {
247 // Get the number of available NNAPI devices
248 uint32_t numDevices = 0;
249 ASSERT_EQ(ANeuralNetworks_getDeviceCount(&numDevices), ANEURALNETWORKS_NO_ERROR);
250
251 std::vector<NameAndDevice> devices;
252 for (uint32_t i = 0; i < numDevices; i++) {
253 // Get device
254 ANeuralNetworksDevice* device;
255 ASSERT_EQ(ANeuralNetworks_getDevice(/*devIndex=*/i, &device), ANEURALNETWORKS_NO_ERROR);
256
257 // Get device name
258 const char* deviceName = nullptr;
259 ASSERT_EQ(ANeuralNetworksDevice_getName(device, &deviceName), ANEURALNETWORKS_NO_ERROR);
260
261 // Check device feature level. This test is added in NNAPI feature level 5, so skip if the
262 // device is of a lower feature level.
263 int64_t featureLevel;
264 ASSERT_EQ(ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel),
265 ANEURALNETWORKS_NO_ERROR);
266 if (featureLevel < ANEURALNETWORKS_FEATURE_LEVEL_5) {
267 continue;
268 }
269
270 devices.emplace_back(deviceName, device);
271 }
272 *outDevices = std::move(devices);
273 }
274
275 std::vector<NameAndDevice> getNnapiDevices() {
276 std::vector<NameAndDevice> devices;
277 getNnapiDevices(&devices);
278 return devices;
279 }
280
281 std::string printGpuNnapiTest(const testing::TestParamInfo<NameAndDevice>& info) {
282 std::string name = info.param.first;
283 // gtest test names must only contain alphanumeric characters
284 std::replace_if(
285 name.begin(), name.end(), [](char c) { return !std::isalnum(c); }, '_');
286 return name;
287 }
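// For example, a device reported as, say, "nnapi-reference" would yield the test name suffix
// "nnapi_reference".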
288
289 template <Type dataType>
290 class VulkanComputePipeline {
291 public:
292 // Returns the created object on success, or nullptr on failure.
293 static std::unique_ptr<VulkanComputePipeline> create(AHardwareBuffer* output) {
294 auto pipeline = std::make_unique<VulkanComputePipeline>();
295 pipeline->initialize(output);
296 return pipeline->mIsValid ? std::move(pipeline) : nullptr;
297 }
298
299 ~VulkanComputePipeline() {
300 if (mDevice != VK_NULL_HANDLE) {
301 vkDestroyFence(mDevice, mFence, nullptr);
302 vkDestroyPipeline(mDevice, mPipeline, nullptr);
303 vkDestroyDescriptorSetLayout(mDevice, mDescriptorSetLayout, nullptr);
304 vkDestroyPipelineLayout(mDevice, mPipelineLayout, nullptr);
305 vkFreeMemory(mDevice, mOutputBufferMemory, nullptr);
306 vkDestroyBuffer(mDevice, mOutputBuffer, nullptr);
307 vkDestroyShaderModule(mDevice, mShaderModule, nullptr);
308 vkDestroyCommandPool(mDevice, mCommandPool, nullptr);
309 vkDestroyDescriptorPool(mDevice, mDescriptorPool, nullptr);
310 }
311 vkDestroyDevice(mDevice, nullptr);
312 vkDestroyInstance(mInstance, nullptr);
313 }
314
315 // Returns {success, sync_fd}
316 std::pair<bool, base::unique_fd> run() {
317 bool success = false;
318 base::unique_fd outSyncFd;
319 runInternal(&success, &outSyncFd);
320 return {success, std::move(outSyncFd)};
321 }
322
323 private:
324 void initialize(AHardwareBuffer* output) {
325 // Create instance
326 const VkApplicationInfo applicationDesc = {
327 .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
328 .pApplicationName = "TestGpuNnapi",
329 .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
330 .apiVersion = VK_API_VERSION_1_1,
331 };
332 const VkInstanceCreateInfo instanceDesc = {
333 .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
334 .pApplicationInfo = &applicationDesc,
335 .enabledLayerCount = 0,
336 .ppEnabledLayerNames = nullptr,
337 .enabledExtensionCount = 0,
338 .ppEnabledExtensionNames = nullptr,
339 };
340 ASSERT_EQ(vkCreateInstance(&instanceDesc, nullptr, &mInstance), VK_SUCCESS);
341
342 // Enumerate physical devices
343 uint32_t numberOfDevices = 0;
344 ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, nullptr), VK_SUCCESS);
345 std::vector<VkPhysicalDevice> physicalDevices(numberOfDevices);
346 ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, physicalDevices.data()),
347 VK_SUCCESS);
348
349 // Pick the first device with a compute queue
350 for (const auto& physicalDevice : physicalDevices) {
351 uint32_t numberOfQueueFamilies = 0;
352 vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
353 nullptr);
354 std::vector<VkQueueFamilyProperties> queueFamilies(numberOfQueueFamilies);
355 vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
356 queueFamilies.data());
357
358 uint32_t pickedQueueFamilyIndex = 0;
359 bool hasComputeQueue = false;
360 for (uint32_t i = 0; i < queueFamilies.size(); i++) {
361 if (queueFamilies[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
362 pickedQueueFamilyIndex = i;
363 hasComputeQueue = true;
364 break;
365 }
366 }
367 if (!hasComputeQueue) continue;
368 mPhysicalDevice = physicalDevice;
369 mQueueFamilyIndex = pickedQueueFamilyIndex;
370 break;
371 }
372 if (mPhysicalDevice == VK_NULL_HANDLE) {
373 GTEST_SKIP() << "No device can handle a compute queue";
374 }
375
376 // Get physical device properties
377 vkGetPhysicalDeviceProperties(mPhysicalDevice, &mPhysicalDeviceProperties);
378 vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mPhysicalDeviceMemoryProperties);
379
380 // Check physical device version
381 if (mPhysicalDeviceProperties.apiVersion < VK_API_VERSION_1_1) {
382 GTEST_SKIP() << "Device API version too low";
383 }
384
385 // Check if the physical device is able to handle the compute work
386 const auto dispatchSize = chooseDispatchSize<dataType>(mPhysicalDeviceProperties.limits);
387 if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[0] <
388 dispatchSize.groupCountX) {
389 GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountX
390 << " workgroups for the X dimension";
391 }
392 if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[1] <
393 dispatchSize.groupCountY) {
394 GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountY
395 << " workgroups for the Y dimension";
396 }
397
398 // Enumerate device extensions
399 uint32_t numberOfExtensions = 0;
400 ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
401 &numberOfExtensions, nullptr),
402 VK_SUCCESS);
403 std::vector<VkExtensionProperties> extensions(numberOfExtensions);
404 ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
405 &numberOfExtensions, extensions.data()),
406 VK_SUCCESS);
407
408 // Required device extensions
409 std::vector<const char*> requiredDeviceExtensions = {
410 // The following extensions are required to import an AHardwareBuffer to Vulkan
411 VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME,
412 VK_EXT_QUEUE_FAMILY_FOREIGN_EXTENSION_NAME,
413 VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
414 VK_KHR_BIND_MEMORY_2_EXTENSION_NAME,
415 VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
416 // The following extensions are required to export a sync fence
417 VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME,
418 VK_KHR_MAINTENANCE1_EXTENSION_NAME,
419 };
420 for (const char* requiredDeviceExtension : requiredDeviceExtensions) {
421 if (!isExtensionSupported(extensions, requiredDeviceExtension)) {
422 GTEST_SKIP() << "Device extension " << requiredDeviceExtension
423 << " is not supported";
424 }
425 }
426
427 // Check external memory properties
428 const VkPhysicalDeviceExternalBufferInfo externalBufferInfo = {
429 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
430 .pNext = nullptr,
431 .flags = 0u,
432 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
433 .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
434 };
435 VkExternalBufferProperties externalBufferProperties;
436 vkGetPhysicalDeviceExternalBufferProperties(mPhysicalDevice, &externalBufferInfo,
437 &externalBufferProperties);
438 if (!(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
439 VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT)) {
440 GTEST_SKIP() << "Device is not able to import Android hardware buffer";
441 }
442 ASSERT_FALSE(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
443 VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT);
444
445 // Check external fence properties
446 const VkPhysicalDeviceExternalFenceInfo externalFenceInfo = {
447 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO,
448 .pNext = nullptr,
449 .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
450 };
451 VkExternalFenceProperties externalFenceProperties;
452 vkGetPhysicalDeviceExternalFenceProperties(mPhysicalDevice, &externalFenceInfo,
453 &externalFenceProperties);
454 if (!(externalFenceProperties.externalFenceFeatures &
455 VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT)) {
456 GTEST_SKIP() << "Device is not able to export Android sync fence FD";
457 }
458
459 // Create logical device
460 const float queuePriority = 1.0f;
461 const VkDeviceQueueCreateInfo queueDesc = {
462 .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
463 .queueFamilyIndex = mQueueFamilyIndex,
464 .queueCount = 1,
465 .pQueuePriorities = &queuePriority,
466 };
467 const VkDeviceCreateInfo deviceDesc = {
468 .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
469 .queueCreateInfoCount = 1,
470 .pQueueCreateInfos = &queueDesc,
471 .enabledExtensionCount = static_cast<uint32_t>(requiredDeviceExtensions.size()),
472 .ppEnabledExtensionNames = requiredDeviceExtensions.data(),
473 .pEnabledFeatures = nullptr,
474 };
475 ASSERT_EQ(vkCreateDevice(mPhysicalDevice, &deviceDesc, nullptr, &mDevice), VK_SUCCESS);
476 vkGetDeviceQueue(mDevice, mQueueFamilyIndex, 0, &mQueue);
477
478 // Get extension function pointers
479 mPfnVkGetFenceFdKHR = reinterpret_cast<PFN_vkGetFenceFdKHR>(
480 vkGetDeviceProcAddr(mDevice, "vkGetFenceFdKHR"));
481 ASSERT_NE(mPfnVkGetFenceFdKHR, nullptr);
482
483 // Create descriptor pool
484 const std::vector<VkDescriptorPoolSize> descriptorPoolSizes = {
485 {
486 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
487 .descriptorCount = 1,
488 },
489 };
490 const VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
491 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
492 .maxSets = 1,
493 .poolSizeCount = static_cast<uint32_t>(descriptorPoolSizes.size()),
494 .pPoolSizes = descriptorPoolSizes.data(),
495 };
496 ASSERT_EQ(vkCreateDescriptorPool(mDevice, &descriptorPoolCreateInfo, nullptr,
497 &mDescriptorPool),
498 VK_SUCCESS);
499
500 // Create descriptor set layout
501 const std::vector<VkDescriptorSetLayoutBinding> descriptorsetLayoutBinding = {
502 {
503 .binding = 0, // output buffer
504 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
505 .descriptorCount = 1,
506 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
507 },
508
509 };
510 const VkDescriptorSetLayoutCreateInfo descriptorsetLayoutDesc = {
511 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
512 .bindingCount = static_cast<uint32_t>(descriptorsetLayoutBinding.size()),
513 .pBindings = descriptorsetLayoutBinding.data(),
514 };
515 ASSERT_EQ(vkCreateDescriptorSetLayout(mDevice, &descriptorsetLayoutDesc, nullptr,
516 &mDescriptorSetLayout),
517 VK_SUCCESS);
518
519 // Allocate descriptor set
520 const VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
521 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
522 .descriptorPool = mDescriptorPool,
523 .descriptorSetCount = 1,
524 .pSetLayouts = &mDescriptorSetLayout,
525 };
526 ASSERT_EQ(vkAllocateDescriptorSets(mDevice, &descriptorSetAllocateInfo, &mDescriptorSet),
527 VK_SUCCESS);
528
529 // Check the output AHardwareBuffer format and usage bits
530 AHardwareBuffer_Desc desc;
531 AHardwareBuffer_describe(output, &desc);
532 ASSERT_EQ(desc.format, AHARDWAREBUFFER_FORMAT_BLOB);
533 ASSERT_TRUE(desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER);
534
535 // Get AHardwareBuffer properties
536 VkAndroidHardwareBufferPropertiesANDROID properties = {
537 .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID,
538 .pNext = nullptr,
539 };
540 ASSERT_EQ(vkGetAndroidHardwareBufferPropertiesANDROID(mDevice, output, &properties),
541 VK_SUCCESS);
542
543 // Create the output buffer with AHardwareBuffer memory
544 const VkExternalMemoryBufferCreateInfo externalMemoryBufferCreateInfo = {
545 .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
546 .pNext = nullptr,
547 .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
548 };
549 const VkBufferCreateInfo bufferCreateInfo = {
550 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
551 .pNext = &externalMemoryBufferCreateInfo,
552 .flags = 0u,
553 .size = desc.width,
554 .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
555 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
556 .queueFamilyIndexCount = 0u,
557 .pQueueFamilyIndices = nullptr,
558 };
559 ASSERT_EQ(vkCreateBuffer(mDevice, &bufferCreateInfo, nullptr, &mOutputBuffer), VK_SUCCESS);
560
561 // Find a proper memory type
562 const auto maybeMemoryTypeIndex =
563 findMemoryType(mPhysicalDeviceMemoryProperties, properties.memoryTypeBits,
564 properties.allocationSize);
565 if (!maybeMemoryTypeIndex.has_value()) {
566 GTEST_SKIP() << "None of the memory types is suitable for allocation";
567 }
568
569 // Import the AHardwareBuffer memory
570 const VkImportAndroidHardwareBufferInfoANDROID importMemoryAllocateInfo = {
571 .sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID,
572 .pNext = nullptr,
573 .buffer = output,
574 };
575 const VkMemoryAllocateInfo memoryAllocInfo = {
576 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
577 .pNext = &importMemoryAllocateInfo,
578 .allocationSize = properties.allocationSize,
579 .memoryTypeIndex = maybeMemoryTypeIndex.value(),
580 };
581 const auto allocationResult =
582 vkAllocateMemory(mDevice, &memoryAllocInfo, nullptr, &mOutputBufferMemory);
583 // Memory allocation may fail if the size exceeds the upper limit of a single allocation
584 // that the platform supports
585 if (allocationResult == VK_ERROR_OUT_OF_DEVICE_MEMORY) {
586 GTEST_SKIP() << "Unable to allocate device memory of " << properties.allocationSize
587 << " bytes";
588 }
589 ASSERT_EQ(allocationResult, VK_SUCCESS);
590
591 // Bind the memory with the buffer
592 ASSERT_EQ(vkBindBufferMemory(mDevice, mOutputBuffer, mOutputBufferMemory, 0), VK_SUCCESS);
593
594 // Update the descriptor sets
595 const VkDescriptorBufferInfo outputBufferDesc = {
596 .buffer = mOutputBuffer,
597 .offset = 0,
598 .range = VK_WHOLE_SIZE,
599 };
600 const std::vector<VkWriteDescriptorSet> writeDst = {
601 {
602 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
603 .pNext = nullptr,
604 .dstSet = mDescriptorSet,
605 .dstBinding = 0, // output buffer
606 .dstArrayElement = 0,
607 .descriptorCount = 1,
608 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
609 .pImageInfo = nullptr,
610 .pBufferInfo = &outputBufferDesc,
611 .pTexelBufferView = nullptr,
612 },
613 };
614 vkUpdateDescriptorSets(mDevice, writeDst.size(), writeDst.data(), 0, nullptr);
615
616 // Create shader module
617 const VkShaderModuleCreateInfo shaderDesc = {
618 .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
619 .flags = 0,
620 .codeSize = kComputeShader.size() * sizeof(uint32_t),
621 .pCode = kComputeShader.data(),
622 };
623 ASSERT_EQ(vkCreateShaderModule(mDevice, &shaderDesc, nullptr, &mShaderModule), VK_SUCCESS);
624
625 // Create pipeline layout
626 const VkPipelineLayoutCreateInfo layoutDesc = {
627 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
628 .setLayoutCount = 1,
629 .pSetLayouts = &mDescriptorSetLayout,
630 .pushConstantRangeCount = 0,
631 .pPushConstantRanges = nullptr,
632 };
633 ASSERT_EQ(vkCreatePipelineLayout(mDevice, &layoutDesc, nullptr, &mPipelineLayout),
634 VK_SUCCESS);
635
636 // Create compute pipeline
637 const uint32_t specializationData[] = {
638 dispatchSize.workgroupSize, // local_size_x
639 dispatchSize.workgroupSize, // local_size_y
640 TestTypeHelper<dataType>::kClearData, // CLEAR_DATA
641 };
642 const std::vector<VkSpecializationMapEntry> specializationMap = {
643 // {constantID, offset, size}
644 {0, 0 * sizeof(uint32_t), sizeof(uint32_t)},
645 {1, 1 * sizeof(uint32_t), sizeof(uint32_t)},
646 {2, 2 * sizeof(uint32_t), sizeof(uint32_t)},
647 };
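// Constant IDs 0 and 1 set the shader's workgroup local size and constant ID 2 supplies CLEAR_DATA,
// so the single SPIR-V module in kComputeShader is reused unchanged for every data type and dispatch
// size under test.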
648 const VkSpecializationInfo specializationInfo = {
649 .mapEntryCount = static_cast<uint32_t>(specializationMap.size()),
650 .pMapEntries = specializationMap.data(),
651 .dataSize = sizeof(specializationData),
652 .pData = specializationData,
653 };
654 const VkComputePipelineCreateInfo pipelineDesc = {
655 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
656 .stage =
657 {
658 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
659 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
660 .module = mShaderModule,
661 .pName = "main",
662 .pSpecializationInfo = &specializationInfo,
663 },
664 .layout = mPipelineLayout,
665 };
666 ASSERT_EQ(vkCreateComputePipelines(mDevice, VK_NULL_HANDLE, 1, &pipelineDesc, nullptr,
667 &mPipeline),
668 VK_SUCCESS);
669
670 // Create command pool
671 const VkCommandPoolCreateInfo cmdpoolDesc = {
672 .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
673 .flags = 0u,
674 .queueFamilyIndex = mQueueFamilyIndex,
675 };
676 ASSERT_EQ(vkCreateCommandPool(mDevice, &cmdpoolDesc, nullptr, &mCommandPool), VK_SUCCESS);
677
678 // Create a command buffer
679 const VkCommandBufferAllocateInfo cmdBufferCreateInfo = {
680 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
681 .pNext = nullptr,
682 .commandPool = mCommandPool,
683 .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
684 .commandBufferCount = 1,
685 };
686 ASSERT_EQ(vkAllocateCommandBuffers(mDevice, &cmdBufferCreateInfo, &mCommandBuffer),
687 VK_SUCCESS);
688
689 // Record command buffer
690 const VkCommandBufferBeginInfo commandBufferBeginInfo = {
691 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
692 .pNext = nullptr,
693 .flags = 0,
694 .pInheritanceInfo = nullptr,
695 };
696 ASSERT_EQ(vkBeginCommandBuffer(mCommandBuffer, &commandBufferBeginInfo), VK_SUCCESS);
697
698 // Buffer barrier to acquire the ownership of the output buffer
699 addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
700 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
701 VK_ACCESS_SHADER_WRITE_BIT, VK_QUEUE_FAMILY_FOREIGN_EXT,
702 mQueueFamilyIndex);
703
704 // Setup resources
705 vkCmdBindPipeline(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
706 vkCmdBindDescriptorSets(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipelineLayout, 0,
707 1, &mDescriptorSet, 0, nullptr);
708
709 // Dispatch compute
710 vkCmdDispatch(mCommandBuffer, dispatchSize.groupCountX, dispatchSize.groupCountY, 1);
711
712 // Buffer barrier to release the ownership of the output buffer
713 addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer,
714 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
715 VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
716 0, mQueueFamilyIndex, VK_QUEUE_FAMILY_FOREIGN_EXT);
717
718 // Finish recording the command buffer
719 ASSERT_EQ(vkEndCommandBuffer(mCommandBuffer), VK_SUCCESS);
720
721 // Create fence
722 const VkExportFenceCreateInfo exportFenceCreateInfo = {
723 .sType = VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO,
724 .pNext = nullptr,
725 .handleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
726 };
727 const VkFenceCreateInfo fenceCreateInfo = {
728 .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
729 .pNext = &exportFenceCreateInfo,
730 .flags = 0,
731 };
732 ASSERT_EQ(vkCreateFence(mDevice, &fenceCreateInfo, nullptr, &mFence), VK_SUCCESS);
733
734 mIsValid = true;
735 }
736
737 void runInternal(bool* outSuccess, base::unique_fd* outSyncFd) {
738 *outSuccess = false;
739
740 // Submit to queue
741 const VkSubmitInfo submitInfo = {
742 .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
743 .waitSemaphoreCount = 0,
744 .pWaitSemaphores = nullptr,
745 .pWaitDstStageMask = nullptr,
746 .commandBufferCount = 1,
747 .pCommandBuffers = &mCommandBuffer,
748 .signalSemaphoreCount = 0,
749 .pSignalSemaphores = nullptr,
750 };
751 ASSERT_EQ(vkResetFences(mDevice, 1, &mFence), VK_SUCCESS);
752 ASSERT_EQ(vkQueueSubmit(mQueue, 1, &submitInfo, mFence), VK_SUCCESS);
753
754 // Export an Android sync fence FD
755 int syncFd = -1;
756 const VkFenceGetFdInfoKHR fenceGetFdInfo = {
757 .sType = VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR,
758 .pNext = nullptr,
759 .fence = mFence,
760 .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
761 };
762 ASSERT_EQ(mPfnVkGetFenceFdKHR(mDevice, &fenceGetFdInfo, &syncFd), VK_SUCCESS);
763 *outSyncFd = base::unique_fd(syncFd);
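// Note: per VK_KHR_external_fence_fd, the exported FD may be -1 if the fence has already signaled;
// the NNAPI stage treats that case as "no dependency" rather than as an error.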
764
765 *outSuccess = true;
766 }
767
768 // Instance
769 VkInstance mInstance = VK_NULL_HANDLE;
770
771 // Physical device and queue family
772 VkPhysicalDevice mPhysicalDevice = VK_NULL_HANDLE;
773 VkPhysicalDeviceProperties mPhysicalDeviceProperties{};
774 VkPhysicalDeviceMemoryProperties mPhysicalDeviceMemoryProperties{};
775 uint32_t mQueueFamilyIndex = 0;
776
777 // Logical device and queue
778 VkDevice mDevice = VK_NULL_HANDLE;
779 VkQueue mQueue = VK_NULL_HANDLE;
780
781 // Extension functions
782 PFN_vkGetFenceFdKHR mPfnVkGetFenceFdKHR = nullptr;
783
784 // Resource descriptors
785 VkDescriptorPool mDescriptorPool = VK_NULL_HANDLE;
786 VkDescriptorSetLayout mDescriptorSetLayout = VK_NULL_HANDLE;
787 VkDescriptorSet mDescriptorSet = VK_NULL_HANDLE;
788
789 // Output buffer
790 VkBuffer mOutputBuffer = VK_NULL_HANDLE;
791 VkDeviceMemory mOutputBufferMemory = VK_NULL_HANDLE;
792
793 // Compute pipeline
794 VkShaderModule mShaderModule = VK_NULL_HANDLE;
795 VkPipelineLayout mPipelineLayout = VK_NULL_HANDLE;
796 VkPipeline mPipeline = VK_NULL_HANDLE;
797
798 // Command buffer
799 VkCommandPool mCommandPool = VK_NULL_HANDLE;
800 VkCommandBuffer mCommandBuffer = VK_NULL_HANDLE;
801 VkFence mFence = VK_NULL_HANDLE;
802
803 bool mIsValid = false;
804 };
805
806 template <Type dataType>
807 class NnapiExecutor {
808 public:
809 // Returns the created object on success, or nullptr on failure.
810 static std::unique_ptr<NnapiExecutor> create(const ANeuralNetworksDevice* device,
811 AHardwareBuffer* input, AHardwareBuffer* output) {
812 auto nnapi = std::make_unique<NnapiExecutor>(input, output);
813 nnapi->initialize(device);
814 return nnapi->mIsValid ? std::move(nnapi) : nullptr;
815 }
816
817 // Prefer NnapiExecutor::create
818 NnapiExecutor(AHardwareBuffer* input, AHardwareBuffer* output)
819 : mInputMemory(input), mOutputMemory(output) {}
820
821 // Returns {success, sync_fd}
822 std::pair<bool, base::unique_fd> run(const base::unique_fd& inSyncFd) {
823 bool success = false;
824 base::unique_fd outSyncFd;
825 runInternal(inSyncFd, &success, &outSyncFd);
826 return {success, std::move(outSyncFd)};
827 }
828
829 private:
830 using ElementType = typename TestTypeHelper<dataType>::ElementType;
831
832 void initialize(const ANeuralNetworksDevice* device) {
833 ASSERT_TRUE(mInputMemory.isValid());
834 ASSERT_TRUE(mOutputMemory.isValid());
835
836 // Model input
837 const float scale = TestTypeHelper<dataType>::kIsQuantized ? 1.0f : 0.0f;
838 const OperandType tensorType(dataType, {kOperandSizeY, kOperandSizeX}, scale,
839 /*zeroPoint=*/0);
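// With scale = 1.0 and zeroPoint = 0, the quantized types represent small integers exactly, so the
// expected output value of 2 can be compared directly in checkResults() without dequantization.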
840 uint32_t inputTensor = mModel.addOperand(&tensorType);
841
842 // Constant tensor
843 const OperandType constTensorType(dataType, {1}, scale, /*zeroPoint=*/0);
844 const ElementType constTensorData = static_cast<ElementType>(1);
845 uint32_t constTensor =
846 mModel.addConstantOperand<ElementType>(&constTensorType, constTensorData);
847
848 // Activation (NONE)
849 const OperandType activationType(Type::INT32, {});
850 uint32_t activationScalar = mModel.addConstantOperand<int32_t>(&activationType, 0);
851
852 // Model output
853 uint32_t outputTensor = mModel.addOperand(&tensorType);
854
855 // Model operation
856 mModel.addOperation(ANEURALNETWORKS_ADD, {inputTensor, constTensor, activationScalar},
857 {outputTensor});
858
859 // Finish model
860 mModel.identifyInputsAndOutputs({inputTensor}, {outputTensor});
861 mModel.relaxComputationFloat32toFloat16(/*isRelax=*/true);
862 ASSERT_TRUE(mModel.isValid());
863 ASSERT_EQ(mModel.finish(), Result::NO_ERROR);
864
865 // Create compilation for the target device
866 Result result;
867 std::tie(result, mCompilation) =
868 test_wrapper::Compilation::createForDevice(&mModel, device);
869 ASSERT_EQ(result, Result::NO_ERROR);
870
871 // Finish the compilation
872 result = mCompilation.finish();
873 if (result != Result::NO_ERROR) {
874 GTEST_SKIP() << "Model is not supported by the device";
875 }
876
877 mIsValid = true;
878 }
879
880 void runInternal(const base::unique_fd& inSyncFd, bool* outSuccess,
881 base::unique_fd* outSyncFd) {
882 *outSuccess = false;
883
884 // Setup execution
885 mExecution = std::make_unique<test_wrapper::Execution>(&mCompilation);
886 ASSERT_EQ(mExecution->setInputFromMemory(/*index=*/0, &mInputMemory, /*offset=*/0,
887 kOperandLength * sizeof(ElementType)),
888 Result::NO_ERROR);
889 ASSERT_EQ(mExecution->setOutputFromMemory(/*index=*/0, &mOutputMemory, /*offset=*/0,
890 kOperandLength * sizeof(ElementType)),
891 Result::NO_ERROR);
892
893 // Setup dependencies
894 std::vector<const test_wrapper::Event*> dependencies;
895 test_wrapper::Event start;
896 // The sync fence from Vulkan may not be valid if the GPU workload has already finished
897 // prior to exporting the fence.
898 if (inSyncFd.ok()) {
899 start = test_wrapper::Event(inSyncFd.get());
900 ASSERT_TRUE(start.isValid());
901 dependencies = {&start};
902 }
903
904 // Fenced compute
905 test_wrapper::Event finished;
906 mExecution->startComputeWithDependencies(dependencies, /*infinite timeout*/ 0, &finished);
907
908 // Get the output sync fence if supported; otherwise, wait until the execution has finished
909 int syncFd = -1;
910 finished.getSyncFenceFd(&syncFd);
911 if (syncFd == -1) {
912 ASSERT_EQ(finished.wait(), Result::NO_ERROR);
913 }
914 *outSyncFd = base::unique_fd(syncFd);
915 *outSuccess = true;
916 }
917
918 test_wrapper::Model mModel;
919 test_wrapper::Compilation mCompilation;
920 std::unique_ptr<test_wrapper::Execution> mExecution;
921 test_wrapper::Memory mInputMemory, mOutputMemory;
922 bool mIsValid = false;
923 };
924
925 class GpuNnapiTest : public testing::TestWithParam<NameAndDevice> {
926 protected:
927 void TearDown() override {
928 if (mGpuOutput) {
929 AHardwareBuffer_release(mGpuOutput);
930 }
931 if (mNnapiOutput) {
932 AHardwareBuffer_release(mNnapiOutput);
933 }
934 }
935
936 template <Type dataType>
937 void runTest() {
938 #ifndef NNTEST_ONLY_PUBLIC_API
939 if (DeviceManager::get()->getUseCpuOnly()) {
940 GTEST_SKIP();
941 }
942 #endif
943
944 // Allocate hardware buffers for GPU and NNAPI outputs
945 const size_t size = kOperandLength * sizeof(typename TestTypeHelper<dataType>::ElementType);
946 allocateBlobAhwb(
947 size, AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER | AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
948 &mGpuOutput);
949 allocateBlobAhwb(
950 size, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN,
951 &mNnapiOutput);
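// The usage bits matter: mGpuOutput must carry AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER so the Vulkan
// pipeline can import it as a storage buffer, while mNnapiOutput only needs the CPU usage bits so
// that checkResults() can lock and read it on the host.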
952 if (mGpuOutput == nullptr || mNnapiOutput == nullptr) return;
953
954 // Create Vulkan compute pipeline
955 auto vulkan = VulkanComputePipeline<dataType>::create(mGpuOutput);
956 if (vulkan == nullptr) return;
957
958 // Create NNAPI executor
959 auto nnapi = NnapiExecutor<dataType>::create(kDevice, mGpuOutput, mNnapiOutput);
960 if (nnapi == nullptr) return;
961
962 // Run the test repeatedly for kNumberOfIterationsToTest iterations
963 for (uint32_t i = 0; i < kNumberOfIterationsToTest; i++) {
964 auto [gpuSuccess, gpuSyncFd] = vulkan->run();
965 ASSERT_TRUE(gpuSuccess);
966
967 auto [nnapiSuccess, nnapiSyncFd] = nnapi->run(gpuSyncFd);
968 ASSERT_TRUE(nnapiSuccess);
969
970 const double tolerance = TestTypeHelper<dataType>::kTolerance;
971 checkResults<dataType>(std::move(nnapiSyncFd), tolerance);
972 }
973 }
974
975 template <Type dataType>
976 void checkResults(base::unique_fd syncFd, double tolerance) {
977 using ElementType = typename TestTypeHelper<dataType>::ElementType;
978
979 // Lock the buffer with the sync fence
980 // AHardwareBuffer_lock takes ownership of the sync fence and closes it even on errors
981 void* data;
982 ASSERT_EQ(AHardwareBuffer_lock(mNnapiOutput, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
983 syncFd.release(), /*rect=*/nullptr, &data),
984 0);
985
986 // Compare the actual results with the expected value
987 uint32_t numberOfErrors = 0;
988 const ElementType expected = static_cast<ElementType>(kExpectedResultInInt);
989 for (uint32_t i = 0; i < kOperandLength; i++) {
990 const ElementType actual = reinterpret_cast<ElementType*>(data)[i];
991
992 // We expect the absolute difference in double is within the tolerance.
993 const double expected_f64 = static_cast<double>(expected);
994 const double actual_f64 = static_cast<double>(actual);
995 const double diff = std::abs(expected_f64 - actual_f64);
996 if (diff > tolerance) {
997 // Print at most kMaxNumberOfPrintedErrors errors with EXPECT_NEAR
998 if (numberOfErrors < kMaxNumberOfPrintedErrors) {
999 EXPECT_NEAR(actual_f64, expected_f64, tolerance)
1000 << "When comparing element [" << i / kOperandSizeX << ", "
1001 << i % kOperandSizeX << "]";
1002 }
1003 numberOfErrors++;
1004 }
1005 }
1006 EXPECT_EQ(numberOfErrors, 0u);
1007 ASSERT_EQ(AHardwareBuffer_unlock(mNnapiOutput, /*fence=*/nullptr), 0);
1008 }
1009
1010 // The NNAPI device under test
1011 const ANeuralNetworksDevice* kDevice = GetParam().second;
1012
1013 AHardwareBuffer* mGpuOutput = nullptr;
1014 AHardwareBuffer* mNnapiOutput = nullptr;
1015 };
1016
1017 TEST_P(GpuNnapiTest, Float32) {
1018 runTest<Type::TENSOR_FLOAT32>();
1019 }
1020 TEST_P(GpuNnapiTest, Float16) {
1021 runTest<Type::TENSOR_FLOAT16>();
1022 }
1023 TEST_P(GpuNnapiTest, Quant8Asymm) {
1024 runTest<Type::TENSOR_QUANT8_ASYMM>();
1025 }
1026 TEST_P(GpuNnapiTest, Quant8AsymmSigned) {
1027 runTest<Type::TENSOR_QUANT8_ASYMM_SIGNED>();
1028 }
1029
1030 INSTANTIATE_TEST_SUITE_P(TestGpuNnapi, GpuNnapiTest, testing::ValuesIn(getNnapiDevices()),
1031 printGpuNnapiTest);
1032
1033 } // namespace
1034 } // namespace android::nn
1035