1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <ExecutionBurstServer.h>
18 #include <HalInterfaces.h>
19 #include <SampleDriver.h>
20 #include <ValidateHal.h>
21 #include <gtest/gtest.h>
22
23 #include <algorithm>
24 #include <chrono>
25 #include <iterator>
26 #include <map>
27 #include <queue>
28 #include <set>
29 #include <string>
30 #include <thread>
31 #include <tuple>
32 #include <utility>
33 #include <vector>
34
35 #include "CompilationBuilder.h"
36 #include "HalUtils.h"
37 #include "Manager.h"
38 #include "NeuralNetworks.h"
39 #include "NeuralNetworksOEM.h"
40 #include "TestNeuralNetworksWrapper.h"
41
42 namespace {
43
44 using namespace ::android;
45 namespace V1_0 = ::android::hardware::neuralnetworks::V1_0;
46 namespace V1_1 = ::android::hardware::neuralnetworks::V1_1;
47 namespace V1_2 = ::android::hardware::neuralnetworks::V1_2;
48 namespace V1_3 = ::android::hardware::neuralnetworks::V1_3;
49
50 using CompilationBuilder = nn::CompilationBuilder;
51 using Device = nn::Device;
52 using DeviceManager = nn::DeviceManager;
53 using ExecutePreference = nn::test_wrapper::ExecutePreference;
54 using ExecutionBurstServer = nn::ExecutionBurstServer;
55 using HidlModel = V1_3::Model;
56 using Result = nn::test_wrapper::Result;
57 using SampleDriver = nn::sample_driver::SampleDriver;
58 using SamplePreparedModel = nn::sample_driver::SamplePreparedModel;
59 using SampleFencedExecutionCallback = nn::sample_driver::SampleFencedExecutionCallback;
60 using WrapperModel = nn::test_wrapper::Model;
61 using WrapperOperandType = nn::test_wrapper::OperandType;
62 using WrapperType = nn::test_wrapper::Type;
63 using nn::convertToV1_0;
64 using nn::convertToV1_3;
65
66 template <typename T>
67 using MQDescriptorSync = hardware::MQDescriptorSync<T>;
68
// Sentinel timing (both fields UINT64_MAX) meaning "no timing information available".
constexpr V1_2::Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
// Distinct, recognizable values so tests can verify that unfenced and fenced
// timings are each reported through the correct channel.
constexpr V1_2::Timing kGoodUnfencedTiming = {.timeOnDevice = 123, .timeInDriver = 456};
constexpr V1_2::Timing kGoodFencedTiming = {.timeOnDevice = 23, .timeInDriver = 56};
72
73 // This is an IDevice for testing purposes. The test driver has customized
74 // getCapabilities_1_3 and getSupportedOperations_1_3.
75 class TestDriver : public SampleDriver {
76 public:
TestDriver(const char * name,V1_3::Capabilities capabilities,const std::vector<bool> & supportedOps)77 TestDriver(const char* name, V1_3::Capabilities capabilities,
78 const std::vector<bool>& supportedOps)
79 : SampleDriver(name), mCapabilities(capabilities), mSupportedOps(supportedOps) {}
~TestDriver()80 ~TestDriver() override {}
81
getCapabilities_1_3(getCapabilities_1_3_cb cb)82 hardware::Return<void> getCapabilities_1_3(getCapabilities_1_3_cb cb) override {
83 cb(V1_3::ErrorStatus::NONE, mCapabilities);
84 return hardware::Void();
85 }
86
getSupportedOperations_1_3(const V1_3::Model & model,getSupportedOperations_1_3_cb cb)87 hardware::Return<void> getSupportedOperations_1_3(const V1_3::Model& model,
88 getSupportedOperations_1_3_cb cb) override {
89 if (!android::nn::validateModel(model)) {
90 cb(V1_3::ErrorStatus::INVALID_ARGUMENT, std::vector<bool>());
91 return hardware::Void();
92 }
93 const size_t count = model.main.operations.size();
94 std::vector<bool> supported(count);
95 std::transform(model.main.operations.begin(), model.main.operations.end(),
96 supported.begin(), [this](V1_3::Operation op) {
97 return mSupportedOps[static_cast<int32_t>(op.type)];
98 });
99 cb(V1_3::ErrorStatus::NONE, supported);
100 return hardware::Void();
101 }
102
103 private:
104 V1_3::Capabilities mCapabilities;
105 std::vector<bool> mSupportedOps;
106 };
107
108 class IntrospectionControlTest : public ::testing::Test {
109 protected:
SetUp()110 virtual void SetUp() {}
TearDown()111 virtual void TearDown() {
112 if (mEvent) {
113 ANeuralNetworksEvent_free(mEvent);
114 }
115 if (mExecution) {
116 ANeuralNetworksExecution_free(mExecution);
117 }
118 if (mCompilation) {
119 ANeuralNetworksCompilation_free(mCompilation);
120 }
121 DeviceManager::get()->forTest_reInitializeDeviceList();
122 }
123
124 struct DeviceSpecification {
DeviceSpecification__anon138cc3190110::IntrospectionControlTest::DeviceSpecification125 DeviceSpecification(const std::string& name, float perf, std::vector<bool>& supportedOps)
126 : mName(name), mSupportedOps(supportedOps) {
127 V1_0::PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
128 mCapabilities = {
129 .relaxedFloat32toFloat16PerformanceScalar = perfInfo,
130 .relaxedFloat32toFloat16PerformanceTensor = perfInfo,
131 .operandPerformance =
132 nn::nonExtensionOperandPerformance<nn::HalVersion::V1_3>(perfInfo),
133 .ifPerformance = perfInfo,
134 .whilePerformance = perfInfo};
135 }
136 std::string mName;
137 V1_3::Capabilities mCapabilities;
138 std::vector<bool> mSupportedOps;
139 };
140
141 // From a vector of DeviceSpecification, register new Devices.
registerDevices(std::vector<DeviceSpecification> specifications)142 void registerDevices(std::vector<DeviceSpecification> specifications) {
143 for (const auto& specification : specifications) {
144 DeviceManager::get()->forTest_registerDevice(nn::makeSharedDevice(
145 specification.mName.c_str(),
146 new TestDriver(specification.mName.c_str(), specification.mCapabilities,
147 specification.mSupportedOps)));
148 }
149 }
150
selectDeviceByName(const std::string & name)151 bool selectDeviceByName(const std::string& name) {
152 uint32_t numDevices = 0;
153 EXPECT_EQ(ANeuralNetworks_getDeviceCount(&numDevices), ANEURALNETWORKS_NO_ERROR);
154 EXPECT_GE(numDevices, (uint32_t)1);
155
156 for (uint32_t i = 0; i < numDevices; i++) {
157 ANeuralNetworksDevice* device = nullptr;
158 EXPECT_EQ(ANeuralNetworks_getDevice(i, &device), ANEURALNETWORKS_NO_ERROR);
159 const char* buffer = nullptr;
160 int result = ANeuralNetworksDevice_getName(device, &buffer);
161 if (result == ANEURALNETWORKS_NO_ERROR && name.compare(buffer) == 0) {
162 mDevices.push_back(device);
163 return true;
164 }
165 }
166 return false;
167 }
168
isSupportedOpListExpected(const std::vector<bool> & expected)169 bool isSupportedOpListExpected(const std::vector<bool>& expected) {
170 const uint32_t kMaxNumberOperations = 256;
171 EXPECT_LE(expected.size(), kMaxNumberOperations);
172 ANeuralNetworksModel* modelHandle = mModel.getHandle();
173 bool supported[kMaxNumberOperations] = {false};
174 EXPECT_EQ(ANeuralNetworksModel_getSupportedOperationsForDevices(
175 modelHandle, mDevices.data(), mDevices.size(), supported),
176 ANEURALNETWORKS_NO_ERROR);
177 return std::equal(expected.begin(), expected.end(), supported);
178 }
179
prepareForExecution(bool measureTiming=false)180 int prepareForExecution(bool measureTiming = false) {
181 ANeuralNetworksModel* modelHandle = mModel.getHandle();
182 int result = ANeuralNetworksCompilation_createForDevices(modelHandle, mDevices.data(),
183 mDevices.size(), &mCompilation);
184 if (result != ANEURALNETWORKS_NO_ERROR) {
185 return result;
186 }
187 EXPECT_EQ(ANeuralNetworksCompilation_finish(mCompilation), ANEURALNETWORKS_NO_ERROR);
188 EXPECT_EQ(ANeuralNetworksExecution_create(mCompilation, &mExecution),
189 ANEURALNETWORKS_NO_ERROR);
190 if (measureTiming) {
191 // Don't call setMeasureTiming unless we need to -- cannot call this
192 // API unless there is exactly one device.
193 EXPECT_EQ(ANeuralNetworksExecution_setMeasureTiming(mExecution, true),
194 ANEURALNETWORKS_NO_ERROR);
195 }
196 return ANEURALNETWORKS_NO_ERROR;
197 }
198
199 std::vector<ANeuralNetworksDevice*> mDevices;
200 ANeuralNetworksEvent* mEvent = nullptr;
201 ANeuralNetworksExecution* mExecution = nullptr;
202 ANeuralNetworksCompilation* mCompilation = nullptr;
203 WrapperModel mModel;
204 };
205
createSimpleAddModel(WrapperModel * model)206 void createSimpleAddModel(WrapperModel* model) {
207 WrapperOperandType type0(WrapperType::TENSOR_FLOAT32, {2});
208 WrapperOperandType type1(WrapperType::INT32, {});
209 // Phase 1, operands
210 auto op1 = model->addOperand(&type0);
211 auto op2 = model->addOperand(&type0);
212 auto act = model->addOperand(&type1);
213 auto op3 = model->addOperand(&type0);
214 // Phase 2, operations
215 static int32_t act_init[] = {0};
216 model->setOperandValue(act, act_init, sizeof(act_init));
217 model->addOperation(ANEURALNETWORKS_ADD, {op1, op2, act}, {op3});
218 // Phase 3, inputs and outputs
219 model->identifyInputsAndOutputs({op1, op2}, {op3});
220 model->finish();
221 ASSERT_TRUE(model->isValid());
222 }
223
// This test verifies that a simple ADD model is able to run on a single device that claims being
// able to handle all operations.
TEST_F(IntrospectionControlTest, SimpleAddModel) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    // TODO(miaowang): remove once b/72506261 is fixed.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    createSimpleAddModel(&mModel);

    // Register a driver claiming support for every operation type.
    std::string driverName = "test-all";
    std::vector<bool> ops(android::nn::kNumberOfOperationTypes, true);
    registerDevices({{driverName, 0.9, ops}});

    EXPECT_TRUE(selectDeviceByName(driverName));
    EXPECT_TRUE(isSupportedOpListExpected({true}));
    EXPECT_EQ(prepareForExecution(), ANEURALNETWORKS_NO_ERROR);

    // Verify that the mCompilation is actually using the "test-all" device.
    CompilationBuilder* c = reinterpret_cast<CompilationBuilder*>(mCompilation);
    const std::string& deviceNameBuffer =
            c->forTest_getExecutionPlan().forTest_simpleGetDevice()->getName();
    EXPECT_EQ(driverName, deviceNameBuffer);

    // Run the execution asynchronously and verify the element-wise sum.
    float input1[2] = {1.0f, 2.0f};
    float input2[2] = {3.0f, 4.0f};
    float output[2];
    EXPECT_EQ(ANeuralNetworksExecution_setInput(mExecution, 0, nullptr, input1, sizeof(input1)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setInput(mExecution, 1, nullptr, input2, sizeof(input2)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setOutput(mExecution, 0, nullptr, output, sizeof(output)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setMeasureTiming(mExecution, true),
              ANEURALNETWORKS_NO_ERROR);

    EXPECT_EQ(ANeuralNetworksExecution_startCompute(mExecution, &mEvent), ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksEvent_wait(mEvent), ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(output[0], input1[0] + input2[0]);
    EXPECT_EQ(output[1], input1[1] + input2[1]);

    // A duration of UINT64_MAX means the value is unavailable; only when both
    // durations are available must on-hardware time not exceed in-driver time.
    uint64_t timeOnHardware, timeInDriver;
    EXPECT_EQ(ANeuralNetworksExecution_getDuration(mExecution, ANEURALNETWORKS_DURATION_ON_HARDWARE,
                                                   &timeOnHardware),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_getDuration(mExecution, ANEURALNETWORKS_DURATION_IN_DRIVER,
                                                   &timeInDriver),
              ANEURALNETWORKS_NO_ERROR);
    if (timeOnHardware != UINT64_MAX && timeInDriver != UINT64_MAX) {
        EXPECT_LE(timeOnHardware, timeInDriver);
    }
}
277
278 /*-- Begin test drivers -------------------------------------------------------------------------*/
279
280 namespace test_drivers {
281
// Encodes how a simulated execution should behave and, for passing
// executions, which of the four timing fields should be reported as
// available. Represented as a uint32_t bitmask (see the PASS_*_BIT values).
enum class Success : uint32_t {
    // ASYNC: Return ErrorStatus::GENERAL_FAILURE; notify ErrorStatus::GENERAL_FAILURE and
    // kBadTiming
    // SYNC, BURST: Return ErrorStatus::GENERAL_FAILURE and kBadTiming
    // FENCED: Return ErrorStatus::GENERAL_FAILURE, empty hidl_handle, and a nullptr callback
    FAIL_LAUNCH,

    // ASYNC: Return ErrorStatus::NONE; notify ErrorStatus::GENERAL_FAILURE and kBadTiming
    FAIL_WAIT,

    // Bit representation for PASS: One bit set to indicate PASS rather than
    // FAIL, one bit for each of the four timing fields (Unfenced, Fenced) x
    // (OnDevice, InDriver) to distinguish between unavailable timing (bit is
    // clear) and available timing (bit is set), and one bit to call out the
    // special case of CPU.
    PASS_BIT = 1 << 4,
    PASS_UNFENCED_DEVICE_BIT = 1 << 5,
    PASS_UNFENCED_DRIVER_BIT = 1 << 6,
    PASS_FENCED_DEVICE_BIT = 1 << 7,
    PASS_FENCED_DRIVER_BIT = 1 << 8,
    PASS_CPU_BIT = 1 << 9,

    // Each of the four timing fields may be either unavailable or 0
    PASS_CPU = PASS_BIT | PASS_CPU_BIT,

    // ASYNC: Return ErrorStatus::NONE; notify ErrorStatus::NONE and timing
    // SYNC, BURST: Return ErrorStatus::NONE and timing
    // FENCED: Return ErrorStatus::NONE, empty hidl_handle, and a callback with timing.
    //
    // For each PASS other than PASS_CPU, an enum name has the form
    // PASS_${UNFENCED_TIME}_${FENCED_TIME}. For example, PASS_NEITHER_BOTH
    // means that only fenced timing is available (both timeOnDevice and
    // timeInDriver). If _${FENCED_TIME} is omitted, it is equivalent to
    // _NEITHER; so PASS_BOTH means that only unfenced timing is available (both
    // timeOnDevice and timeInDriver).
    PASS_NEITHER = PASS_BIT,
    PASS_DEVICE = PASS_BIT | PASS_UNFENCED_DEVICE_BIT,
    PASS_DRIVER = PASS_BIT | PASS_UNFENCED_DRIVER_BIT,
    PASS_BOTH = PASS_BIT | PASS_UNFENCED_DEVICE_BIT | PASS_UNFENCED_DRIVER_BIT,
    PASS_NEITHER_DEVICE = PASS_BIT | PASS_FENCED_DEVICE_BIT,
    PASS_NEITHER_DRIVER = PASS_BIT | PASS_FENCED_DRIVER_BIT,
    PASS_NEITHER_BOTH = PASS_BIT | PASS_FENCED_DEVICE_BIT | PASS_FENCED_DRIVER_BIT,
    PASS_DEVICE_DEVICE = PASS_DEVICE | PASS_NEITHER_DEVICE,
    PASS_DEVICE_DRIVER = PASS_DEVICE | PASS_NEITHER_DRIVER,
    PASS_DEVICE_BOTH = PASS_DEVICE | PASS_NEITHER_BOTH,
    PASS_DRIVER_DEVICE = PASS_DRIVER | PASS_NEITHER_DEVICE,
    PASS_DRIVER_DRIVER = PASS_DRIVER | PASS_NEITHER_DRIVER,
    PASS_DRIVER_BOTH = PASS_DRIVER | PASS_NEITHER_BOTH,
    PASS_BOTH_DEVICE = PASS_BOTH | PASS_NEITHER_DEVICE,
    PASS_BOTH_DRIVER = PASS_BOTH | PASS_NEITHER_DRIVER,
    PASS_BOTH_BOTH = PASS_BOTH | PASS_NEITHER_BOTH,
};
334
hasBit(Success mask,Success bit)335 bool hasBit(Success mask, Success bit) {
336 const uint32_t bitAsInt = static_cast<uint32_t>(bit);
337 CHECK(bitAsInt && (bitAsInt & (bitAsInt - 1)) == 0)
338 << "second argument must be a single bit rather than " << static_cast<uint32_t>(bit);
339 return static_cast<uint32_t>(mask) & bitAsInt;
340 }
341
clearBit(Success mask,Success bit)342 Success clearBit(Success mask, Success bit) {
343 const uint32_t bitAsInt = static_cast<uint32_t>(bit);
344 CHECK(bitAsInt && (bitAsInt & (bitAsInt - 1)) == 0)
345 << "second argument must be a single bit rather than " << static_cast<uint32_t>(bit);
346 return static_cast<Success>(static_cast<uint32_t>(mask) & ~bitAsInt);
347 }
348
operator <<(std::ostream & os,Success success)349 std::ostream& operator<<(std::ostream& os, Success success) {
350 switch (success) {
351 case Success::FAIL_LAUNCH:
352 return os << "FAIL_LAUNCH";
353 case Success::FAIL_WAIT:
354 return os << "FAIL_WAIT";
355 case Success::PASS_CPU:
356 return os << "PASS_CPU";
357 default:
358 break;
359 }
360
361 static const std::vector<std::pair<Success, const char*>> bits = {
362 {Success::PASS_BIT, "PASS"},
363 {Success::PASS_UNFENCED_DEVICE_BIT, "UNFENCED_DEVICE"},
364 {Success::PASS_UNFENCED_DRIVER_BIT, "UNFENCED_DRIVER"},
365 {Success::PASS_FENCED_DEVICE_BIT, "FENCED_DEVICE"},
366 {Success::PASS_FENCED_DRIVER_BIT, "FENCED_DRIVER"},
367 };
368 bool gotOutput = false;
369 for (const auto& b : bits) {
370 if (hasBit(success, b.first)) {
371 if (gotOutput) {
372 os << '|';
373 } else {
374 gotOutput = true;
375 }
376 os << b.second;
377 success = clearBit(success, b.first);
378 }
379 }
380 if (uint32_t successAsInt = static_cast<uint32_t>(success)) {
381 if (gotOutput) {
382 os << '|';
383 }
384 os << successAsInt;
385 }
386 return os;
387 }
388
389 // Returns (unfenced timing, fenced timing).
390 // Not for PASS_CPU.
getExpectedTiming(Success s,bool fencedExecution)391 std::pair<V1_2::Timing, V1_2::Timing> getExpectedTiming(Success s, bool fencedExecution) {
392 CHECK_NE(s, Success::PASS_CPU);
393
394 if (!hasBit(s, Success::PASS_BIT)) {
395 return {kBadTiming, kBadTiming};
396 }
397
398 std::pair<V1_2::Timing, V1_2::Timing> result;
399 result.first.timeOnDevice = hasBit(s, Success::PASS_UNFENCED_DEVICE_BIT)
400 ? kGoodUnfencedTiming.timeOnDevice
401 : UINT64_MAX;
402 result.first.timeInDriver = hasBit(s, Success::PASS_UNFENCED_DRIVER_BIT)
403 ? kGoodUnfencedTiming.timeInDriver
404 : UINT64_MAX;
405 if (fencedExecution) {
406 result.second.timeOnDevice = hasBit(s, Success::PASS_FENCED_DEVICE_BIT)
407 ? kGoodFencedTiming.timeOnDevice
408 : UINT64_MAX;
409 result.second.timeInDriver = hasBit(s, Success::PASS_FENCED_DRIVER_BIT)
410 ? kGoodFencedTiming.timeInDriver
411 : UINT64_MAX;
412 } else {
413 result.second = result.first;
414 }
415 return result;
416 }
417
// For these tests we don't care about actually running an inference -- we
// just want to mock up execution status and timing results, and control
// when the execution finishes.
class TestPreparedModelLatest : public SamplePreparedModel {
   public:
    TestPreparedModelLatest(const HidlModel& model, const SampleDriver* driver, Success success)
        : SamplePreparedModel(model, driver, V1_1::ExecutionPreference::FAST_SINGLE_ANSWER, uid_t{},
                              nn::kDefaultPriority13),
          mSuccess(success) {}

    // 1.0 asynchronous execution path. Passing and FAIL_WAIT cases complete on
    // a detached thread so that dummyExecution() can block while the test
    // holds executions paused; FAIL_LAUNCH fails immediately on this thread.
    hardware::Return<V1_0::ErrorStatus> execute(
            const V1_0::Request&, const sp<V1_0::IExecutionCallback>& callback) override {
        switch (mSuccess) {
            case Success::PASS_NEITHER:
                std::thread([callback] {
                    dummyExecution();
                    callback->notify(V1_0::ErrorStatus::NONE);
                }).detach();
                return V1_0::ErrorStatus::NONE;
            case Success::FAIL_LAUNCH:
                dummyExecution();
                callback->notify(V1_0::ErrorStatus::GENERAL_FAILURE);
                return V1_0::ErrorStatus::GENERAL_FAILURE;
            case Success::FAIL_WAIT:
                // Launch reports success; the failure is delivered to the waiter.
                std::thread([callback] {
                    dummyExecution();
                    callback->notify(V1_0::ErrorStatus::GENERAL_FAILURE);
                }).detach();
                return V1_0::ErrorStatus::NONE;
            default:
                ADD_FAILURE() << "Unexpected Success kind";
                return V1_0::ErrorStatus::GENERAL_FAILURE;
        }
    }

    // 1.2 asynchronous execution path. Passing cases report the unfenced
    // timing encoded in mSuccess via notify_1_2.
    hardware::Return<V1_0::ErrorStatus> execute_1_2(
            const V1_0::Request&, V1_2::MeasureTiming measure,
            const sp<V1_2::IExecutionCallback>& callback) override {
        EXPECT_EQ(measure, V1_2::MeasureTiming::YES);
        switch (mSuccess) {
            case Success::PASS_NEITHER:
            case Success::PASS_DEVICE:
            case Success::PASS_DRIVER:
            case Success::PASS_BOTH:
                std::thread([this, callback] {
                    dummyExecution();
                    callback->notify_1_2(V1_0::ErrorStatus::NONE, {},
                                         getExpectedTiming(mSuccess, false).first);
                }).detach();
                return V1_0::ErrorStatus::NONE;
            case Success::FAIL_LAUNCH:
                dummyExecution();
                callback->notify(V1_0::ErrorStatus::GENERAL_FAILURE);
                return V1_0::ErrorStatus::GENERAL_FAILURE;
            case Success::FAIL_WAIT:
                std::thread([callback] {
                    dummyExecution();
                    callback->notify(V1_0::ErrorStatus::GENERAL_FAILURE);
                }).detach();
                return V1_0::ErrorStatus::NONE;
            default:
                ADD_FAILURE() << "Unexpected Success kind";
                return V1_0::ErrorStatus::GENERAL_FAILURE;
        }
    }

    // 1.3 asynchronous execution path -- delegates to execute_1_2.
    hardware::Return<V1_3::ErrorStatus> execute_1_3(
            const V1_3::Request&, V1_2::MeasureTiming measure, const V1_3::OptionalTimePoint&,
            const V1_3::OptionalTimeoutDuration&,
            const sp<V1_3::IExecutionCallback>& callback) override {
        // Use a placeholder V1_0::Request because execute_1_2 ignores request entirely.
        const V1_0::ErrorStatus status = execute_1_2(V1_0::Request{}, measure, callback);
        return convertToV1_3(status);
    }

    // 1.2 synchronous execution path.
    hardware::Return<void> executeSynchronously(const V1_0::Request&, V1_2::MeasureTiming measure,
                                                executeSynchronously_cb cb) override {
        EXPECT_EQ(measure, V1_2::MeasureTiming::YES);
        switch (mSuccess) {
            case Success::PASS_NEITHER:
            case Success::PASS_DEVICE:
            case Success::PASS_DRIVER:
            case Success::PASS_BOTH:
                dummyExecution();
                cb(V1_0::ErrorStatus::NONE, {}, getExpectedTiming(mSuccess, false).first);
                return hardware::Void();
            case Success::FAIL_WAIT:
                // While this is a synchronous execution method, the NNAPI
                // runtime may call it even for asynchronous execution, so we
                // need to tolerate Success::FAIL_WAIT here, not just
                // Success::FAIL_LAUNCH.
                FALLTHROUGH_INTENDED;
            case Success::FAIL_LAUNCH:
                dummyExecution();
                cb(V1_0::ErrorStatus::GENERAL_FAILURE, {}, kBadTiming);
                return hardware::Void();
            default:
                ADD_FAILURE() << "Unexpected Success kind";
                cb(V1_0::ErrorStatus::GENERAL_FAILURE, {}, kBadTiming);
                return hardware::Void();
        }
    }

    // 1.3 synchronous execution path -- adapts the 1.3 callback to the 1.2
    // signature (upgrading the error status) and delegates.
    hardware::Return<void> executeSynchronously_1_3(const V1_3::Request&,
                                                    V1_2::MeasureTiming measure,
                                                    const V1_3::OptionalTimePoint&,
                                                    const V1_3::OptionalTimeoutDuration&,
                                                    executeSynchronously_1_3_cb cb) override {
        const auto wrappedCb = [&cb](V1_0::ErrorStatus status,
                                     const hardware::hidl_vec<V1_2::OutputShape>& outputShapes,
                                     V1_2::Timing timing) {
            cb(convertToV1_3(status), outputShapes, timing);
        };
        // Use a placeholder V1_0::Request because executeSynchronously ignores request entirely.
        return executeSynchronously(V1_0::Request{}, measure, wrappedCb);
    }

    // ExecutionBurstServer::create has an overload that will use
    // IPreparedModel::executeSynchronously(), so we can rely on that, rather
    // than having to implement ExecutionBurstServer::IExecutorWithCache.
    hardware::Return<void> configureExecutionBurst(
            const sp<V1_2::IBurstCallback>& callback,
            const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
            const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
            configureExecutionBurst_cb cb) override {
        const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(
                callback, requestChannel, resultChannel, this, std::chrono::microseconds{0});

        cb(burst == nullptr ? V1_0::ErrorStatus::GENERAL_FAILURE : V1_0::ErrorStatus::NONE, burst);
        return hardware::Void();
    }

    // Fenced execution path. Passing cases return an empty handle (no real
    // sync fence) plus a callback carrying both unfenced and fenced timing.
    hardware::Return<void> executeFenced(const V1_3::Request&,
                                         const hardware::hidl_vec<hardware::hidl_handle>&,
                                         V1_2::MeasureTiming measure,
                                         const V1_3::OptionalTimePoint&,
                                         const V1_3::OptionalTimeoutDuration&,
                                         const V1_3::OptionalTimeoutDuration&,
                                         executeFenced_cb callback) override {
        EXPECT_EQ(measure, V1_2::MeasureTiming::YES);
        if (hasBit(mSuccess, Success::PASS_BIT)) {
            dummyExecution();
            const auto expectedTiming = getExpectedTiming(mSuccess, true);
            sp<SampleFencedExecutionCallback> fencedExecutionCallback =
                    new SampleFencedExecutionCallback(expectedTiming.first, expectedTiming.second,
                                                     V1_3::ErrorStatus::NONE);
            callback(V1_3::ErrorStatus::NONE, hardware::hidl_handle(nullptr),
                     fencedExecutionCallback);
            return hardware::Void();
        }
        switch (mSuccess) {
            case Success::FAIL_WAIT:
                // Due to the limitation of the SampleDriver,
                // FAIL_WAIT behaves the same as FAIL_LAUNCH.
                // If the SampleDriver is updated to return real
                // sync fences, this must be updated.
                FALLTHROUGH_INTENDED;
            case Success::FAIL_LAUNCH:
                dummyExecution();
                callback(V1_3::ErrorStatus::GENERAL_FAILURE, hardware::hidl_handle(nullptr),
                         nullptr);
                return hardware::Void();
            default:
                ADD_FAILURE() << "Unexpected Success kind";
                return hardware::Void();
        }
    }

    // We can place the TestPreparedModelLatest system in a "pause" mode where
    // no execution will complete until the system is taken out of that mode.
    // Initially, the system is not in that mode.
    static void pauseExecutions(bool v) { mPauseExecutions.store(v); }

    // This function is only guaranteed to work in the following pattern:
    // Consider thread A as primary thread
    // - thread A: pauseExecutions(true);
    // - thread A: launch execution (as thread B)
    // - thread A: waitForExecutionToBegin(), block until call to dummyExecution by
    //             thread B makes mExecutionsInFlight nonzero
    // - thread B: dummyExecution(), which makes mExecutionsInFlight nonzero and blocks
    //             until thread A calls pauseExecutions(false)
    // - thread A: waitForExecutionToBegin() returns
    // - thread A: pauseExecutions(false), allowing dummyExecution() on thread B to continue
    // - thread B: dummyExecution() zeroes mExecutionsInFlight and returns
    // - thread B: thread exits
    static void waitForExecutionToBegin() {
        CHECK(mPauseExecutions.load());
        // Busy-wait; executions are expected to begin promptly.
        while (mExecutionsInFlight.load() == 0) {
        }
    }

   private:
    Success mSuccess;

    static std::atomic<bool> mPauseExecutions;
    static std::atomic<unsigned int> mExecutionsInFlight;

    // Simulates the body of an execution: registers this execution as in
    // flight, spins while executions are paused, then deregisters it. At most
    // one execution may be in flight at a time.
    static void dummyExecution() {
        CHECK_EQ(mExecutionsInFlight.fetch_add(1), 0u) << "We do not support concurrent executions";
        while (mPauseExecutions.load()) {
        }
        mExecutionsInFlight.fetch_sub(1);
    }
};
std::atomic<bool> TestPreparedModelLatest::mPauseExecutions = false;
std::atomic<unsigned int> TestPreparedModelLatest::mExecutionsInFlight = 0;

// 1.3 is the latest HAL version here, so the latest prepared model serves as
// the 1.3 implementation unchanged.
using TestPreparedModel13 = TestPreparedModelLatest;
626
627 // Like TestPreparedModelLatest, but implementing 1.2
628 class TestPreparedModel12 : public V1_2::IPreparedModel {
629 public:
TestPreparedModel12(const HidlModel & model,const SampleDriver * driver,Success success)630 TestPreparedModel12(const HidlModel& model, const SampleDriver* driver, Success success)
631 : mLatestPreparedModel(new TestPreparedModelLatest(model, driver, success)) {}
632
execute(const V1_0::Request & request,const sp<V1_0::IExecutionCallback> & callback)633 hardware::Return<V1_0::ErrorStatus> execute(
634 const V1_0::Request& request, const sp<V1_0::IExecutionCallback>& callback) override {
635 return mLatestPreparedModel->execute(request, callback);
636 }
637
execute_1_2(const V1_0::Request & request,V1_2::MeasureTiming measure,const sp<V1_2::IExecutionCallback> & callback)638 hardware::Return<V1_0::ErrorStatus> execute_1_2(
639 const V1_0::Request& request, V1_2::MeasureTiming measure,
640 const sp<V1_2::IExecutionCallback>& callback) override {
641 return mLatestPreparedModel->execute_1_2(request, measure, callback);
642 }
643
executeSynchronously(const V1_0::Request & request,V1_2::MeasureTiming measure,executeSynchronously_cb cb)644 hardware::Return<void> executeSynchronously(const V1_0::Request& request,
645 V1_2::MeasureTiming measure,
646 executeSynchronously_cb cb) override {
647 return mLatestPreparedModel->executeSynchronously(request, measure, cb);
648 }
649
configureExecutionBurst(const sp<V1_2::IBurstCallback> & callback,const MQDescriptorSync<V1_2::FmqRequestDatum> & requestChannel,const MQDescriptorSync<V1_2::FmqResultDatum> & resultChannel,configureExecutionBurst_cb cb)650 hardware::Return<void> configureExecutionBurst(
651 const sp<V1_2::IBurstCallback>& callback,
652 const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
653 const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
654 configureExecutionBurst_cb cb) override {
655 return mLatestPreparedModel->configureExecutionBurst(callback, requestChannel,
656 resultChannel, cb);
657 }
658
659 private:
660 const sp<V1_3::IPreparedModel> mLatestPreparedModel;
661 };
662
663 // Like TestPreparedModelLatest, but implementing 1.0
664 class TestPreparedModel10 : public V1_0::IPreparedModel {
665 public:
TestPreparedModel10(const HidlModel & model,const SampleDriver * driver,Success success)666 TestPreparedModel10(const HidlModel& model, const SampleDriver* driver, Success success)
667 : mLatestPreparedModel(new TestPreparedModelLatest(model, driver, success)) {}
668
execute(const V1_0::Request & request,const sp<V1_0::IExecutionCallback> & callback)669 hardware::Return<V1_0::ErrorStatus> execute(
670 const V1_0::Request& request, const sp<V1_0::IExecutionCallback>& callback) override {
671 return mLatestPreparedModel->execute(request, callback);
672 }
673
674 private:
675 const sp<V1_3::IPreparedModel> mLatestPreparedModel;
676 };
677
678 // Behaves like SampleDriver, except that it produces customized IPrepareModel.
679 class TestDriver13 : public SampleDriver {
680 public:
TestDriver13(const std::string & name,Success success)681 TestDriver13(const std::string& name, Success success)
682 : SampleDriver(name.c_str()), mSuccess(success) {}
683
getCapabilities_1_3(getCapabilities_1_3_cb _hidl_cb)684 hardware::Return<void> getCapabilities_1_3(getCapabilities_1_3_cb _hidl_cb) override {
685 android::nn::initVLogMask();
686 V1_3::Capabilities capabilities = nn::makeCapabilities(0.75f);
687 _hidl_cb(V1_3::ErrorStatus::NONE, capabilities);
688 return hardware::Void();
689 }
690
getSupportedOperations_1_3(const HidlModel & model,getSupportedOperations_1_3_cb cb)691 hardware::Return<void> getSupportedOperations_1_3(const HidlModel& model,
692 getSupportedOperations_1_3_cb cb) override {
693 if (nn::validateModel(model)) {
694 std::vector<bool> supported(model.main.operations.size(), true);
695 cb(V1_3::ErrorStatus::NONE, supported);
696 } else {
697 cb(V1_3::ErrorStatus::INVALID_ARGUMENT, {});
698 }
699 return hardware::Void();
700 }
701
getSupportedOperations_1_2(const V1_2::Model & model,getSupportedOperations_1_2_cb cb)702 hardware::Return<void> getSupportedOperations_1_2(const V1_2::Model& model,
703 getSupportedOperations_1_2_cb cb) override {
704 if (nn::validateModel(model)) {
705 std::vector<bool> supported(model.operations.size(), true);
706 cb(V1_0::ErrorStatus::NONE, supported);
707 } else {
708 std::vector<bool> supported;
709 cb(V1_0::ErrorStatus::INVALID_ARGUMENT, supported);
710 }
711 return hardware::Void();
712 }
713
prepareModel_1_3(const HidlModel & model,V1_1::ExecutionPreference,V1_3::Priority,const V1_3::OptionalTimePoint &,const hardware::hidl_vec<hardware::hidl_handle> &,const hardware::hidl_vec<hardware::hidl_handle> &,const nn::HalCacheToken &,const sp<V1_3::IPreparedModelCallback> & callback)714 hardware::Return<V1_3::ErrorStatus> prepareModel_1_3(
715 const HidlModel& model, V1_1::ExecutionPreference, V1_3::Priority,
716 const V1_3::OptionalTimePoint&, const hardware::hidl_vec<hardware::hidl_handle>&,
717 const hardware::hidl_vec<hardware::hidl_handle>&, const nn::HalCacheToken&,
718 const sp<V1_3::IPreparedModelCallback>& callback) override {
719 callback->notify_1_3(V1_3::ErrorStatus::NONE,
720 new TestPreparedModel13(model, this, mSuccess));
721 return V1_3::ErrorStatus::NONE;
722 }
723
prepareModel_1_2(const V1_2::Model & model,V1_1::ExecutionPreference,const hardware::hidl_vec<hardware::hidl_handle> &,const hardware::hidl_vec<hardware::hidl_handle> &,const nn::HalCacheToken &,const sp<V1_2::IPreparedModelCallback> & callback)724 hardware::Return<V1_0::ErrorStatus> prepareModel_1_2(
725 const V1_2::Model& model, V1_1::ExecutionPreference,
726 const hardware::hidl_vec<hardware::hidl_handle>&,
727 const hardware::hidl_vec<hardware::hidl_handle>&, const nn::HalCacheToken&,
728 const sp<V1_2::IPreparedModelCallback>& callback) override {
729 callback->notify_1_2(V1_0::ErrorStatus::NONE,
730 new TestPreparedModel12(nn::convertToV1_3(model), this, mSuccess));
731 return V1_0::ErrorStatus::NONE;
732 }
733
prepareModel_1_1(const V1_1::Model & model,V1_1::ExecutionPreference,const sp<V1_0::IPreparedModelCallback> & callback)734 hardware::Return<V1_0::ErrorStatus> prepareModel_1_1(
735 const V1_1::Model& model, V1_1::ExecutionPreference,
736 const sp<V1_0::IPreparedModelCallback>& callback) override {
737 callback->notify(V1_0::ErrorStatus::NONE,
738 new TestPreparedModel10(nn::convertToV1_3(model), this, mSuccess));
739 return V1_0::ErrorStatus::NONE;
740 }
741
prepareModel(const V1_0::Model & model,const sp<V1_0::IPreparedModelCallback> & callback)742 hardware::Return<V1_0::ErrorStatus> prepareModel(
743 const V1_0::Model& model, const sp<V1_0::IPreparedModelCallback>& callback) override {
744 return prepareModel_1_1(nn::convertToV1_1(model),
745 V1_1::ExecutionPreference::FAST_SINGLE_ANSWER, callback);
746 }
747
748 private:
749 Success mSuccess;
750 };
751
752 // Like TestDriver, but implementing 1.1
753 class TestDriver11 : public V1_1::IDevice {
754 public:
TestDriver11(const std::string & name,Success success)755 TestDriver11(const std::string& name, Success success)
756 : mLatestDriver(new TestDriver13(name, success)) {}
getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb)757 hardware::Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
758 return mLatestDriver->getCapabilities_1_1(_hidl_cb);
759 }
getSupportedOperations_1_1(const V1_1::Model & model,getSupportedOperations_1_1_cb _hidl_cb)760 hardware::Return<void> getSupportedOperations_1_1(
761 const V1_1::Model& model, getSupportedOperations_1_1_cb _hidl_cb) override {
762 return mLatestDriver->getSupportedOperations_1_1(model, _hidl_cb);
763 }
prepareModel_1_1(const V1_1::Model & model,V1_1::ExecutionPreference preference,const sp<V1_0::IPreparedModelCallback> & actualCallback)764 hardware::Return<V1_0::ErrorStatus> prepareModel_1_1(
765 const V1_1::Model& model, V1_1::ExecutionPreference preference,
766 const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
767 return mLatestDriver->prepareModel_1_1(model, preference, actualCallback);
768 }
getStatus()769 hardware::Return<V1_0::DeviceStatus> getStatus() override { return mLatestDriver->getStatus(); }
getCapabilities(getCapabilities_cb _hidl_cb)770 hardware::Return<void> getCapabilities(getCapabilities_cb _hidl_cb) override {
771 return mLatestDriver->getCapabilities(_hidl_cb);
772 }
getSupportedOperations(const V1_0::Model & model,getSupportedOperations_cb _hidl_cb)773 hardware::Return<void> getSupportedOperations(const V1_0::Model& model,
774 getSupportedOperations_cb _hidl_cb) override {
775 return mLatestDriver->getSupportedOperations(model, _hidl_cb);
776 }
prepareModel(const V1_0::Model & model,const sp<V1_0::IPreparedModelCallback> & actualCallback)777 hardware::Return<V1_0::ErrorStatus> prepareModel(
778 const V1_0::Model& model,
779 const sp<V1_0::IPreparedModelCallback>& actualCallback) override {
780 return mLatestDriver->prepareModel(model, actualCallback);
781 }
782
783 private:
784 const sp<V1_3::IDevice> mLatestDriver;
785 };
786
787 } // namespace test_drivers
788
789 /*-- End test drivers -------------------------------------------------------------------------*/
790
791 /*-- Begin timing tests -------------------------------------------------------------------------*/
792
793 namespace timing_tests {
794
795 using namespace test_drivers;
796
// Which kind of driver a TimingTest instance registers and selects.
enum class DriverKind {
    CPU,
    OLD,  // too old to support timing (1.1 or earlier)
    NEW   // new enough to support timing (1.2 or later)
};
802
operator <<(std::ostream & os,DriverKind kind)803 std::ostream& operator<<(std::ostream& os, DriverKind kind) {
804 const char* names[] = {"CPU", "OLD", "NEW"};
805 const uint32_t index = static_cast<uint32_t>(kind);
806 CHECK(index < std::size(names));
807 return os << names[index];
808 }
809
// Which NDK execution path a TimingTest instance exercises.
enum class Compute { ASYNC, SYNC, BURST, FENCED };
811
operator <<(std::ostream & os,Compute compute)812 std::ostream& operator<<(std::ostream& os, Compute compute) {
813 const char* names[] = {"ASYNC", "SYNC", "BURST", "FENCED"};
814 const uint32_t index = static_cast<uint32_t>(compute);
815 CHECK(index < std::size(names));
816 return os << names[index];
817 }
818
// Parameterized over (driver kind, expected success mode, execution path).
// IntrospectionControlTest supplies the model/compilation/execution plumbing.
class TimingTest : public IntrospectionControlTest,
                   public ::testing::WithParamInterface<std::tuple<DriverKind, Success, Compute>> {
   public:
    TimingTest()
        : kDriverKind(std::get<0>(GetParam())),
          kSuccess(std::get<1>(GetParam())),
          kCompute(std::get<2>(GetParam())) {}

   protected:
    const DriverKind kDriverKind;  // which driver gets registered/selected
    const Success kSuccess;        // how the test driver is told to behave
    const Compute kCompute;        // which NDK execution path is exercised
};
832
// Verifies ANeuralNetworksExecution_getDuration() across driver kinds (CPU,
// 1.1, 1.3), execution paths (async, sync, burst, fenced), and success/failure
// modes: querying a duration must fail with BAD_STATE while an execution is in
// flight or after it fails, and successful executions must report durations
// consistent with what the test driver injected.
TEST_P(TimingTest, Test) {
    // There's no straightforward way to force CPU execution to fail.
    ASSERT_EQ(kDriverKind == DriverKind::CPU, kSuccess == Success::PASS_CPU);

    // FAIL_WAIT only makes sense for ASYNC and FENCED.
    ASSERT_TRUE(kCompute == Compute::ASYNC || kCompute == Compute::FENCED ||
                kSuccess != Success::FAIL_WAIT);

    if (DeviceManager::get()->getUseCpuOnly() != (kDriverKind == DriverKind::CPU)) {
        // We don't have an elegant way to request the CPU driver. Therefore,
        // we rely on our test framework to make the choice between CPU and
        // non-CPU.
        GTEST_SKIP();
    }

    createSimpleAddModel(&mModel);

    switch (kDriverKind) {
        case DriverKind::CPU: {
            // There should be only one driver -- the CPU
            const std::string& name = DeviceManager::get()->getDrivers()[0]->getName();
            ASSERT_TRUE(selectDeviceByName(name));
            break;
        }
        case DriverKind::OLD: {
            static const char name[] = "old";
            DeviceManager::get()->forTest_registerDevice(
                    nn::makeSharedDevice(name, new TestDriver11(name, kSuccess)));
            ASSERT_TRUE(selectDeviceByName(name));
            break;
        }
        case DriverKind::NEW: {
            static const char name[] = "new";
            DeviceManager::get()->forTest_registerDevice(
                    nn::makeSharedDevice(name, new TestDriver13(name, kSuccess)));
            ASSERT_TRUE(selectDeviceByName(name));
            break;
        }
        default:
            FAIL() << "Unexpected DriverKind";
    }

    EXPECT_EQ(prepareForExecution(true /*measureTiming*/), ANEURALNETWORKS_NO_ERROR);

    float input1[2] = {1.0f, 2.0f};
    float input2[2] = {3.0f, 4.0f};
    float output[2];
    EXPECT_EQ(ANeuralNetworksExecution_setInput(mExecution, 0, nullptr, input1, sizeof(input1)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setInput(mExecution, 1, nullptr, input2, sizeof(input2)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setOutput(mExecution, 0, nullptr, output, sizeof(output)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setMeasureTiming(mExecution, true),
              ANEURALNETWORKS_NO_ERROR);

    // Asserts that `result` is (or is not) ANEURALNETWORKS_NO_ERROR,
    // depending on whether this step is expected to pass.
    auto Check = [](bool expectPass, int result) {
        if (expectPass) {
            ASSERT_EQ(result, ANEURALNETWORKS_NO_ERROR);
        } else {
            ASSERT_NE(result, ANEURALNETWORKS_NO_ERROR);
        }
    };

    const bool isPass = hasBit(kSuccess, Success::PASS_BIT);
    const int expectedGetDurationResultCode =
            isPass ? ANEURALNETWORKS_NO_ERROR : ANEURALNETWORKS_BAD_STATE;

    // While the (paused) execution is still running, every duration query must
    // report BAD_STATE.
    const auto getDurationWhileRunning = [this] {
        if (kDriverKind == DriverKind::CPU) {
            // Testing DriverKind::CPU would require modifying the CPU execution
            // path to control execution completion, similarly to how this test
            // case does with TestPreparedModel::dummyExecution(). This does not
            // seem worthwhile -- it's intrusive into the runtime code solely
            // for the sake of testing, and we do not expect that the code paths
            // needed to ensure correct behavior of
            // ANeuralNetworksExecution_getDuration() on a running execution
            // would be any different for CPU than for actual drivers.
            return;
        }
        TestPreparedModelLatest::waitForExecutionToBegin();
        for (int durationCode :
             std::vector{ANEURALNETWORKS_DURATION_ON_HARDWARE, ANEURALNETWORKS_DURATION_IN_DRIVER,
                         ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE,
                         ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER}) {
            uint64_t time;
            // Cannot query duration while execution is running
            EXPECT_EQ(ANeuralNetworksExecution_getDuration(mExecution, durationCode, &time),
                      ANEURALNETWORKS_BAD_STATE);
        }
    };

    switch (kCompute) {
        case Compute::ASYNC: {
            // Ideally what we'd like to do here is
            //
            //     Check(kSuccess != Success::FAIL_LAUNCH,
            //           ANeuralNetworksExecution_startCompute(mExecution, &mEvent));
            //     Check(isPass, ANeuralNetworksEvent_wait(mEvent));
            //
            // However, in the current implementation of the runtime, a launch
            // failure at the HAL level does not show up as a launch failure at
            // the NDK level ("startCompute"): The NNAPI runtime does not call a
            // driver until it (the runtime) begins execution, so a launch
            // failure at the HAL level looks like an execution failure at the
            // NDK level ("wait").
            SCOPED_TRACE("ASYNC startCompute");
            TestPreparedModelLatest::pauseExecutions(true);
            Check(true,  // rather than kSuccess != Success::FAIL_LAUNCH
                  ANeuralNetworksExecution_startCompute(mExecution, &mEvent));
            getDurationWhileRunning();
            TestPreparedModelLatest::pauseExecutions(false);
            SCOPED_TRACE("ASYNC wait");
            Check(isPass, ANeuralNetworksEvent_wait(mEvent));
            break;
        }
        case Compute::SYNC: {
            SCOPED_TRACE("SYNC");
            TestPreparedModelLatest::pauseExecutions(true);
            std::thread run([this, Check, isPass] {
                Check(isPass, ANeuralNetworksExecution_compute(mExecution));
            });
            getDurationWhileRunning();
            TestPreparedModelLatest::pauseExecutions(false);
            run.join();
            break;
        }
        case Compute::BURST: {
            SCOPED_TRACE("BURST");
            ANeuralNetworksBurst* burst;
            ASSERT_EQ(ANeuralNetworksBurst_create(mCompilation, &burst), ANEURALNETWORKS_NO_ERROR);
            TestPreparedModelLatest::pauseExecutions(true);
            std::thread run([this, Check, isPass, burst] {
                Check(isPass, ANeuralNetworksExecution_burstCompute(mExecution, burst));
            });
            getDurationWhileRunning();
            TestPreparedModelLatest::pauseExecutions(false);
            run.join();
            ANeuralNetworksBurst_free(burst);
            break;
        }
        case Compute::FENCED: {
            SCOPED_TRACE("FENCED startComputeWithDependencies");
            TestPreparedModelLatest::pauseExecutions(true);

            // Note, due to the limitation of SampleDriver implementation, the call is synchronous.
            // If the SampleDriver is updated to return real sync fence, this must be updated.
            std::thread run([this, Check, isPass] {
                Check(isPass, ANeuralNetworksExecution_startComputeWithDependencies(
                                      mExecution, nullptr, 0, 0, &mEvent));
            });
            getDurationWhileRunning();
            TestPreparedModelLatest::pauseExecutions(false);
            run.join();
            SCOPED_TRACE("FENCED wait");
            Check(isPass, ANeuralNetworksEvent_wait(mEvent));
            break;
        }
        default:
            FAIL() << "unreachable";
    }

    // ANeuralNetworksExecution_getDuration() is documented to set the duration
    // to UINT64_MAX on failure; initialize defensively so the checks below
    // never read an indeterminate value even if that contract were violated.
    uint64_t timeOnHardware = UINT64_MAX, timeInDriver = UINT64_MAX,
             timeOnHardwareFenced = UINT64_MAX, timeInDriverFenced = UINT64_MAX;
    EXPECT_EQ(ANeuralNetworksExecution_getDuration(mExecution, ANEURALNETWORKS_DURATION_ON_HARDWARE,
                                                   &timeOnHardware),
              expectedGetDurationResultCode);
    EXPECT_EQ(ANeuralNetworksExecution_getDuration(mExecution, ANEURALNETWORKS_DURATION_IN_DRIVER,
                                                   &timeInDriver),
              expectedGetDurationResultCode);
    EXPECT_EQ(
            ANeuralNetworksExecution_getDuration(
                    mExecution, ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE, &timeOnHardwareFenced),
            expectedGetDurationResultCode);
    EXPECT_EQ(ANeuralNetworksExecution_getDuration(
                      mExecution, ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER, &timeInDriverFenced),
              expectedGetDurationResultCode);
    switch (kDriverKind) {
        case DriverKind::CPU: {
            // TODO: Should we require timing to be reported as 0?
            EXPECT_TRUE(timeOnHardware == 0 || timeOnHardware == UINT64_MAX)
                    << "timeOnHardware = " << timeOnHardware;
            // Fixed: the failure message previously streamed timeOnHardware here.
            EXPECT_TRUE(timeInDriver == 0 || timeInDriver == UINT64_MAX)
                    << "timeInDriver = " << timeInDriver;
            EXPECT_TRUE(timeOnHardwareFenced == 0 || timeOnHardwareFenced == UINT64_MAX)
                    << "timeOnHardwareFenced = " << timeOnHardwareFenced;
            // Fixed: the failure message previously used the label "timeInDriver".
            EXPECT_TRUE(timeInDriverFenced == 0 || timeInDriverFenced == UINT64_MAX)
                    << "timeInDriverFenced = " << timeInDriverFenced;
            break;
        }
        case DriverKind::OLD: {
            EXPECT_EQ(timeOnHardware, UINT64_MAX);
            EXPECT_EQ(timeInDriver, UINT64_MAX);
            EXPECT_EQ(timeOnHardwareFenced, UINT64_MAX);
            EXPECT_EQ(timeInDriverFenced, UINT64_MAX);
            break;
        }
        case DriverKind::NEW: {
            // The driver reports timing in microseconds; the NDK API reports
            // it in nanoseconds (UINT64_MAX means "unavailable" in both).
            auto microsToNanos = [](uint64_t micros) {
                constexpr uint64_t kNanosPerMicro = 1000;
                return micros == UINT64_MAX ? UINT64_MAX : kNanosPerMicro * micros;
            };
            auto expectedTiming = getExpectedTiming(kSuccess, kCompute == Compute::FENCED);
            EXPECT_EQ(timeOnHardware, microsToNanos(expectedTiming.first.timeOnDevice));
            EXPECT_EQ(timeInDriver, microsToNanos(expectedTiming.first.timeInDriver));
            EXPECT_EQ(timeOnHardwareFenced, microsToNanos(expectedTiming.second.timeOnDevice));
            EXPECT_EQ(timeInDriverFenced, microsToNanos(expectedTiming.second.timeInDriver));
            break;
        }
        default:
            FAIL() << "unreachable";
    }
    if (kCompute != Compute::FENCED) {
        // For unfenced paths the fenced queries must mirror the unfenced ones.
        EXPECT_EQ(timeOnHardware, timeOnHardwareFenced);
        EXPECT_EQ(timeInDriver, timeInDriverFenced);
    }
    // Compares two durations only when both are available.
    auto expectTimingLe = [](uint64_t a, const char* aName, uint64_t b, const char* bName) {
        if (a != UINT64_MAX && b != UINT64_MAX) {
            EXPECT_LE(a, b) << aName << " exceeds " << bName;
        }
    };
#define EXPECT_TIMING_LE(a, b) expectTimingLe(a, #a, b, #b)
    // Hardware time is a subset of driver time.
    EXPECT_TIMING_LE(timeOnHardware, timeInDriver);
    EXPECT_TIMING_LE(timeOnHardwareFenced, timeInDriverFenced);

    // Fenced durations cover at most the unfenced work.
    EXPECT_TIMING_LE(timeOnHardwareFenced, timeOnHardware);
    EXPECT_TIMING_LE(timeInDriverFenced, timeInDriver);
#undef EXPECT_TIMING_LE
}
1061
// Parameter tuples (driver kind, success mode, compute path) for the unfenced
// execution paths (ASYNC, SYNC, BURST).
auto kTimingTestUnfencedValues = ::testing::Values(
        // NOTE: We cannot force CPU execution to fail
        std::make_tuple(DriverKind::CPU, Success::PASS_CPU, Compute::ASYNC),
        std::make_tuple(DriverKind::CPU, Success::PASS_CPU, Compute::SYNC),
        std::make_tuple(DriverKind::CPU, Success::PASS_CPU, Compute::BURST),

        // NOTE: OLD driver does not provide timing
        std::make_tuple(DriverKind::OLD, Success::PASS_NEITHER, Compute::ASYNC),
        std::make_tuple(DriverKind::OLD, Success::PASS_NEITHER, Compute::SYNC),
        std::make_tuple(DriverKind::OLD, Success::PASS_NEITHER, Compute::BURST),

        std::make_tuple(DriverKind::OLD, Success::FAIL_LAUNCH, Compute::ASYNC),
        std::make_tuple(DriverKind::OLD, Success::FAIL_LAUNCH, Compute::SYNC),
        std::make_tuple(DriverKind::OLD, Success::FAIL_LAUNCH, Compute::BURST),

        // NOTE: Only ASYNC is paired with a wait
        std::make_tuple(DriverKind::OLD, Success::FAIL_WAIT, Compute::ASYNC),

        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER, Compute::ASYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER, Compute::SYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER, Compute::BURST),

        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE, Compute::ASYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE, Compute::SYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE, Compute::BURST),

        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER, Compute::ASYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER, Compute::SYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER, Compute::BURST),

        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH, Compute::ASYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH, Compute::SYNC),
        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH, Compute::BURST),

        std::make_tuple(DriverKind::NEW, Success::FAIL_LAUNCH, Compute::ASYNC),
        std::make_tuple(DriverKind::NEW, Success::FAIL_LAUNCH, Compute::SYNC),
        std::make_tuple(DriverKind::NEW, Success::FAIL_LAUNCH, Compute::BURST),

        // NOTE: Only ASYNC is paired with a wait
        std::make_tuple(DriverKind::NEW, Success::FAIL_WAIT, Compute::ASYNC));

// Parameter tuples for the FENCED path, including the Success modes that
// carry distinct unfenced/fenced timing combinations.
auto kTimingTestFencedValues = ::testing::Values(
        // NOTE: We cannot force CPU execution to fail
        std::make_tuple(DriverKind::CPU, Success::PASS_CPU, Compute::FENCED),

        // NOTE: OLD driver does not provide timing
        std::make_tuple(DriverKind::OLD, Success::PASS_NEITHER, Compute::FENCED),

        std::make_tuple(DriverKind::OLD, Success::FAIL_LAUNCH, Compute::FENCED),

        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER_DEVICE, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER_DRIVER, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_NEITHER_BOTH, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE_DEVICE, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE_DRIVER, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DEVICE_BOTH, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER_DEVICE, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER_DRIVER, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_DRIVER_BOTH, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH_DEVICE, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH_DRIVER, Compute::FENCED),
        std::make_tuple(DriverKind::NEW, Success::PASS_BOTH_BOTH, Compute::FENCED),

        std::make_tuple(DriverKind::NEW, Success::FAIL_LAUNCH, Compute::FENCED));

// Instantiate TimingTest once per parameter list.
INSTANTIATE_TEST_SUITE_P(Unfenced, TimingTest, kTimingTestUnfencedValues);
INSTANTIATE_TEST_SUITE_P(Fenced, TimingTest, kTimingTestFencedValues);
1133
1134 } // namespace timing_tests
1135
1136 /*-- End timing tests -------------------------------------------------------------------------*/
1137
// Ceiling fed to the MAXIMUM operand of createAddMaxModel().
const float kSimpleCeiling = 2.0f;
1139
createAddMaxModel(WrapperModel * model,bool reverseOrder)1140 void createAddMaxModel(WrapperModel* model, bool reverseOrder) {
1141 WrapperOperandType type0(WrapperType::TENSOR_FLOAT32, {2});
1142 WrapperOperandType type1(WrapperType::INT32, {});
1143 // Phase 1, operands
1144 auto op1 = model->addOperand(&type0);
1145 auto op2 = model->addOperand(&type0);
1146 auto act = model->addOperand(&type1);
1147 auto op3 = model->addOperand(&type0);
1148 auto op4 = model->addOperand(&type0);
1149 auto op5 = model->addOperand(&type0);
1150 // Phase 2, operations
1151 static int32_t act_init[] = {0};
1152 model->setOperandValue(act, act_init, sizeof(act_init));
1153 static float ceiling[] = {kSimpleCeiling, kSimpleCeiling};
1154 model->setOperandValue(op4, ceiling, sizeof(ceiling));
1155 if (reverseOrder) {
1156 // In this case, add MAXIMUM first, but the execution order is still ADD -> MAXIMUM.
1157 model->addOperation(ANEURALNETWORKS_MAXIMUM, {op3, op4}, {op5});
1158 model->addOperation(ANEURALNETWORKS_ADD, {op1, op2, act}, {op3});
1159 } else {
1160 model->addOperation(ANEURALNETWORKS_ADD, {op1, op2, act}, {op3});
1161 model->addOperation(ANEURALNETWORKS_MAXIMUM, {op3, op4}, {op5});
1162 }
1163 // Phase 3, inputs and outputs
1164 model->identifyInputsAndOutputs({op1, op2}, {op5});
1165 model->finish();
1166 ASSERT_TRUE(model->isValid());
1167 }
1168
// A 1.1 driver is expected to report ADD supported but MAXIMUM unsupported
// when the runtime slices the ADD->MAXIMUM model down to its HAL version.
TEST_F(IntrospectionControlTest, SlicingAddMax) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    using namespace test_drivers;

    static const char kDriverName[] = "driver11";
    DeviceManager::get()->forTest_registerDevice(
            nn::makeSharedDevice(kDriverName, new TestDriver11(kDriverName, Success::PASS_BOTH)));
    ASSERT_TRUE(selectDeviceByName(kDriverName));

    createAddMaxModel(&mModel, /*reverseOrder=*/false);
    EXPECT_TRUE(isSupportedOpListExpected({true, false}));
}
1185
// Same as SlicingAddMax, but the operations are added in reverse order; the
// supported-op list must follow the user's add order (MAXIMUM first here).
TEST_F(IntrospectionControlTest, SlicingMaxAdd) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    using namespace test_drivers;

    static const char kDriverName[] = "driver11";
    DeviceManager::get()->forTest_registerDevice(
            nn::makeSharedDevice(kDriverName, new TestDriver11(kDriverName, Success::PASS_BOTH)));
    ASSERT_TRUE(selectDeviceByName(kDriverName));

    createAddMaxModel(&mModel, /*reverseOrder=*/true);
    EXPECT_TRUE(isSupportedOpListExpected({false, true}));
}
1202
// Multiplier fed to the MUL operand of createAddMulModel().
const float kSimpleMultiplier = 2.0f;
1204
createAddMulModel(WrapperModel * model,bool reverseOrder)1205 void createAddMulModel(WrapperModel* model, bool reverseOrder) {
1206 WrapperOperandType type0(WrapperType::TENSOR_FLOAT32, {2});
1207 WrapperOperandType type1(WrapperType::INT32, {});
1208 // Phase 1, operands
1209 auto op1 = model->addOperand(&type0);
1210 auto op2 = model->addOperand(&type0);
1211 auto act = model->addOperand(&type1);
1212 auto op3 = model->addOperand(&type0);
1213 auto op4 = model->addOperand(&type0);
1214 auto op5 = model->addOperand(&type0);
1215 // Phase 2, operations
1216 static int32_t act_init[] = {0};
1217 model->setOperandValue(act, act_init, sizeof(act_init));
1218 static float multiplier[] = {kSimpleMultiplier, kSimpleMultiplier};
1219 model->setOperandValue(op4, multiplier, sizeof(multiplier));
1220 if (reverseOrder) {
1221 // In this case, add MUL first, but the execution order is still ADD -> MUL.
1222 model->addOperation(ANEURALNETWORKS_MUL, {op3, op4, act}, {op5});
1223 model->addOperation(ANEURALNETWORKS_ADD, {op1, op2, act}, {op3});
1224 } else {
1225 model->addOperation(ANEURALNETWORKS_ADD, {op1, op2, act}, {op3});
1226 model->addOperation(ANEURALNETWORKS_MUL, {op3, op4, act}, {op5});
1227 }
1228 // Phase 3, inputs and outputs
1229 model->identifyInputsAndOutputs({op1, op2}, {op5});
1230 model->finish();
1231 ASSERT_TRUE(model->isValid());
1232 }
1233
// Both ADD and MUL exist in HAL 1.1, so a 1.1 driver must report the whole
// ADD->MUL model as supported.
TEST_F(IntrospectionControlTest, SlicingFullySupported) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    using namespace test_drivers;

    static const char kDriverName[] = "driver11";
    DeviceManager::get()->forTest_registerDevice(
            nn::makeSharedDevice(kDriverName, new TestDriver11(kDriverName, Success::PASS_BOTH)));
    ASSERT_TRUE(selectDeviceByName(kDriverName));

    createAddMulModel(&mModel, /*reverseOrder=*/false);
    EXPECT_TRUE(isSupportedOpListExpected({true, true}));
}
1250
createCondModel(WrapperModel * model,bool dynamicRank)1251 void createCondModel(WrapperModel* model, bool dynamicRank) {
1252 const auto dimensions = dynamicRank ? std::vector<uint32_t>{} : std::vector<uint32_t>{1};
1253 WrapperOperandType floatType(WrapperType::TENSOR_FLOAT32, dimensions);
1254 WrapperOperandType boolType(WrapperType::TENSOR_BOOL8, {1});
1255 // Phase 1, operands
1256 auto op1 = model->addOperand(&floatType);
1257 auto op2 = model->addOperand(&boolType);
1258 // Phase 2, operations
1259 model->addOperation(ANEURALNETWORKS_LESS, {op1, op1}, {op2});
1260 // Phase 3, inputs and outputs
1261 model->identifyInputsAndOutputs({op1}, {op2});
1262 model->finish();
1263 }
1264
addReluOperation(WrapperModel * model,std::vector<uint32_t> * modelInputIndexes,std::vector<uint32_t> * modelOutputIndexes,bool dynamicRank)1265 void addReluOperation(WrapperModel* model, std::vector<uint32_t>* modelInputIndexes,
1266 std::vector<uint32_t>* modelOutputIndexes, bool dynamicRank) {
1267 const auto dimensions = dynamicRank ? std::vector<uint32_t>{} : std::vector<uint32_t>{1};
1268 WrapperOperandType type(WrapperType::TENSOR_FLOAT32, dimensions);
1269 // Phase 1, operands
1270 auto op1 = model->addOperand(&type);
1271 auto op2 = model->addOperand(&type);
1272 // Phase 2, operations
1273 model->addOperation(ANEURALNETWORKS_RELU, {op1}, {op2});
1274 // Phase 3, inputs and outputs
1275 modelInputIndexes->push_back(op1);
1276 modelOutputIndexes->push_back(op2);
1277 }
1278
createReluModel(WrapperModel * model,bool dynamicRank)1279 void createReluModel(WrapperModel* model, bool dynamicRank) {
1280 std::vector<uint32_t> modelInputIndexes, modelOutputIndexes;
1281 addReluOperation(model, &modelInputIndexes, &modelOutputIndexes, dynamicRank);
1282 model->identifyInputsAndOutputs(modelInputIndexes, modelOutputIndexes);
1283 model->finish();
1284 }
1285
addWhileOperation(std::vector<WrapperModel> * extraModels,WrapperModel * mainModel,std::vector<uint32_t> * modelInputIndexes,std::vector<uint32_t> * modelOutputIndexes,bool dynamicRank)1286 void addWhileOperation(std::vector<WrapperModel>* extraModels, WrapperModel* mainModel,
1287 std::vector<uint32_t>* modelInputIndexes,
1288 std::vector<uint32_t>* modelOutputIndexes, bool dynamicRank) {
1289 const auto dimensions = dynamicRank ? std::vector<uint32_t>{} : std::vector<uint32_t>{1};
1290 WrapperOperandType floatType(WrapperType::TENSOR_FLOAT32, dimensions);
1291 WrapperOperandType modelType(WrapperType::MODEL, {});
1292
1293 extraModels->emplace_back();
1294 extraModels->emplace_back();
1295 WrapperModel* condModel = &extraModels->at(extraModels->size() - 2);
1296 WrapperModel* bodyModel = &extraModels->at(extraModels->size() - 1);
1297 createCondModel(condModel, dynamicRank);
1298 createReluModel(bodyModel, dynamicRank);
1299 ASSERT_TRUE(condModel->isValid());
1300 ASSERT_TRUE(bodyModel->isValid());
1301
1302 // Phase 1, operands
1303 const uint32_t op1 = mainModel->addOperand(&modelType);
1304 const uint32_t op2 = mainModel->addOperand(&modelType);
1305 const uint32_t op3 = mainModel->addOperand(&floatType);
1306 const uint32_t op4 = mainModel->addOperand(&floatType);
1307 mainModel->setOperandValueFromModel(op1, condModel);
1308 mainModel->setOperandValueFromModel(op2, bodyModel);
1309 // Phase 2, operations
1310 mainModel->addOperation(ANEURALNETWORKS_WHILE, {op1, op2, op3}, {op4});
1311 // Phase 3, inputs and outputs
1312 modelInputIndexes->push_back(op3);
1313 modelOutputIndexes->push_back(op4);
1314 }
1315
createReluStaticWhileModel(std::vector<WrapperModel> * extraModels,WrapperModel * mainModel)1316 void createReluStaticWhileModel(std::vector<WrapperModel>* extraModels, WrapperModel* mainModel) {
1317 std::vector<uint32_t> modelInputIndexes, modelOutputIndexes;
1318
1319 // Operation supported in Android API level 27
1320 addReluOperation(mainModel, &modelInputIndexes, &modelOutputIndexes, /*dynamicRank=*/false);
1321 // Operation supported in Android API level 30
1322 addWhileOperation(extraModels, mainModel, &modelInputIndexes, &modelOutputIndexes,
1323 /*dynamicRank=*/false);
1324
1325 mainModel->identifyInputsAndOutputs(modelInputIndexes, modelOutputIndexes);
1326 mainModel->finish();
1327 ASSERT_TRUE(mainModel->isValid());
1328 }
1329
// A 1.1 driver is expected to support the RELU but not the 1.3 control-flow
// WHILE operation.
TEST_F(IntrospectionControlTest, ControlFlowNotSupported) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    using namespace test_drivers;

    static const char kDriverName[] = "driver11";
    DeviceManager::get()->forTest_registerDevice(
            nn::makeSharedDevice(kDriverName, new TestDriver11(kDriverName, Success::PASS_BOTH)));
    ASSERT_TRUE(selectDeviceByName(kDriverName));

    std::vector<WrapperModel> extraModels;
    createReluStaticWhileModel(&extraModels, &mModel);
    EXPECT_TRUE(isSupportedOpListExpected({true, false}));

    // Clear mModel early because it may reference `extraModels`.
    mModel = WrapperModel{};
}
1350
// A 1.3 driver is expected to support both the RELU and the control-flow
// WHILE operation.
TEST_F(IntrospectionControlTest, ControlFlowSupported) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    using namespace test_drivers;

    static const char kDriverName[] = "driver13";
    DeviceManager::get()->forTest_registerDevice(
            nn::makeSharedDevice(kDriverName, new TestDriver13(kDriverName, Success::PASS_BOTH)));
    ASSERT_TRUE(selectDeviceByName(kDriverName));

    std::vector<WrapperModel> extraModels;
    createReluStaticWhileModel(&extraModels, &mModel);
    EXPECT_TRUE(isSupportedOpListExpected({true, true}));

    // Clear mModel early because it may reference `extraModels`.
    mModel = WrapperModel{};
}
1371
createStaticWhileDynamicWhileModel(std::vector<WrapperModel> * extraModels,WrapperModel * mainModel)1372 void createStaticWhileDynamicWhileModel(std::vector<WrapperModel>* extraModels,
1373 WrapperModel* mainModel) {
1374 std::vector<uint32_t> modelInputIndexes, modelOutputIndexes;
1375
1376 // Operation supported in Android API level 30
1377 addWhileOperation(extraModels, mainModel, &modelInputIndexes, &modelOutputIndexes,
1378 /*dynamicRank=*/false);
1379 // Operation supported only by NNAPI runtime
1380 addWhileOperation(extraModels, mainModel, &modelInputIndexes, &modelOutputIndexes,
1381 /*dynamicRank=*/true);
1382
1383 mainModel->identifyInputsAndOutputs(modelInputIndexes, modelOutputIndexes);
1384 mainModel->finish();
1385 ASSERT_TRUE(mainModel->isValid());
1386 }
1387
// When the model contains an operation only the runtime can run (dynamic-rank
// WHILE), slicing is expected to report neither WHILE as driver-supported.
TEST_F(IntrospectionControlTest, ControlFlowFailedToSlice) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    using namespace test_drivers;

    static const char kDriverName[] = "driver13";
    DeviceManager::get()->forTest_registerDevice(
            nn::makeSharedDevice(kDriverName, new TestDriver13(kDriverName, Success::PASS_BOTH)));
    ASSERT_TRUE(selectDeviceByName(kDriverName));

    std::vector<WrapperModel> extraModels;
    createStaticWhileDynamicWhileModel(&extraModels, &mModel);
    EXPECT_TRUE(isSupportedOpListExpected({false, false}));

    // Clear mModel early because it may reference `extraModels`.
    mModel = WrapperModel{};
}
1408
1409 // TODO(miaowang): add a test to make sure ANNCompilation_create() has CPU
1410 // fallback.
1411 // This test verifies that a device that could only handle ADD would correctly report that an
1412 // ADD->MUL model could not be fully supported.
// An ADD-only device must report the MUL of an ADD->MUL model as unsupported,
// and compiling for that single device must then fail (no CPU fallback with
// the Introspection API).
TEST_F(IntrospectionControlTest, PartialModelNotSupported) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    createAddMulModel(&mModel, /*reverseOrder=*/false);

    // Register a driver that claims support for nothing but ADD.
    const std::string addOnlyDriver = "test-onlyAdd";
    std::vector<bool> addOnlyOp(android::nn::kNumberOfOperationTypes, false);
    addOnlyOp[ANEURALNETWORKS_ADD] = true;
    registerDevices({{addOnlyDriver, 0.9, addOnlyOp}});

    EXPECT_TRUE(selectDeviceByName(addOnlyDriver));
    EXPECT_TRUE(isSupportedOpListExpected({true, false}));

    ANeuralNetworksModel* modelHandle = mModel.getHandle();
    EXPECT_EQ(ANeuralNetworksCompilation_createForDevices(modelHandle, mDevices.data(),
                                                          mDevices.size(), &mCompilation),
              ANEURALNETWORKS_NO_ERROR);
    // The compilation must fail as there is no fallback when using
    // Introspection API.
    EXPECT_NE(ANeuralNetworksCompilation_finish(mCompilation), ANEURALNETWORKS_NO_ERROR);
}
1438
1439 // This test verifies that a device that could only handle ADD would correctly report that an
1440 // ADD->MUL model could not be fully supported. Also verifies that the indices of returned
1441 // supported op list correctly map to the order of operations being added by the user.
// Same scenario as PartialModelNotSupported, but with the operations added in
// reversed order, confirming that the supported-op list indices follow the
// order in which the user added the operations (here MUL first, then ADD).
TEST_F(IntrospectionControlTest, PartialModelNotSupportedOrder) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    createAddMulModel(&mModel, true);

    const std::string driverName = "test-onlyAdd";
    std::vector<bool> supportedOps(android::nn::kNumberOfOperationTypes, false);
    supportedOps[ANEURALNETWORKS_ADD] = true;

    registerDevices({{driverName, 0.9, supportedOps}});

    EXPECT_TRUE(selectDeviceByName(driverName));
    // With reversed insertion order, index 0 is the unsupported op and
    // index 1 is the supported ADD.
    EXPECT_TRUE(isSupportedOpListExpected({false, true}));
}
1459
1460 // TODO(miaowang): update the test to make sure the model is actually running on the test devices.
1461 // This test verifies that an ADD->MUL model is able to run on two selected devices that together
1462 // can handle all operations.
// Verifies that an ADD->MUL model runs end-to-end when two selected devices
// (one supporting only ADD, one supporting only MUL) together cover all
// operations, and that the computed outputs match MUL(ADD(in1, in2), k).
TEST_F(IntrospectionControlTest, ModelNeedTwoDevices) {
    // This is needed before we have the CPU fallback path being treated as a Device.
    if (DeviceManager::get()->getUseCpuOnly()) {
        GTEST_SKIP();
    }

    createAddMulModel(&mModel, false);

    const std::string addDriverName = "test-onlyAdd";
    std::vector<bool> addSupport(android::nn::kNumberOfOperationTypes, false);
    addSupport[ANEURALNETWORKS_ADD] = true;

    const std::string mulDriverName = "test-onlyMul";
    std::vector<bool> mulSupport(android::nn::kNumberOfOperationTypes, false);
    mulSupport[ANEURALNETWORKS_MUL] = true;

    registerDevices({
            {addDriverName, 0.9, addSupport},
            {mulDriverName, 0.9, mulSupport},
    });

    EXPECT_TRUE(selectDeviceByName(addDriverName));
    EXPECT_TRUE(selectDeviceByName(mulDriverName));
    // Together the two devices cover both operations.
    EXPECT_TRUE(isSupportedOpListExpected({true, true}));
    EXPECT_EQ(prepareForExecution(), ANEURALNETWORKS_NO_ERROR);

    float input1[2] = {1.0f, 2.0f};
    float input2[2] = {3.0f, 4.0f};
    float output[2];
    EXPECT_EQ(ANeuralNetworksExecution_setInput(mExecution, 0, nullptr, input1, sizeof(input1)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setInput(mExecution, 1, nullptr, input2, sizeof(input2)),
              ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksExecution_setOutput(mExecution, 0, nullptr, output, sizeof(output)),
              ANEURALNETWORKS_NO_ERROR);

    EXPECT_EQ(ANeuralNetworksExecution_startCompute(mExecution, &mEvent), ANEURALNETWORKS_NO_ERROR);
    EXPECT_EQ(ANeuralNetworksEvent_wait(mEvent), ANEURALNETWORKS_NO_ERROR);
    // Each output element is the elementwise sum scaled by the MUL constant.
    for (int i = 0; i < 2; ++i) {
        EXPECT_EQ(output[i], kSimpleMultiplier * (input1[i] + input2[i]));
    }
}
1504 } // namespace
1505