1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ANDROID_SOUND_TRIGGER_H 18 #define ANDROID_SOUND_TRIGGER_H 19 20 #include <stdbool.h> 21 #include <system/audio.h> 22 23 #define SOUND_TRIGGER_MAX_STRING_LEN 64 /* max length of strings in properties or 24 descriptor structs */ 25 #define SOUND_TRIGGER_MAX_LOCALE_LEN 6 /* max length of locale string. e.g en_US */ 26 #define SOUND_TRIGGER_MAX_USERS 10 /* max number of concurrent users */ 27 #define SOUND_TRIGGER_MAX_PHRASES 10 /* max number of concurrent phrases */ 28 29 typedef enum { 30 SOUND_TRIGGER_STATE_NO_INIT = -1, /* The sound trigger service is not initialized */ 31 SOUND_TRIGGER_STATE_ENABLED = 0, /* The sound trigger service is enabled */ 32 SOUND_TRIGGER_STATE_DISABLED = 1 /* The sound trigger service is disabled */ 33 } sound_trigger_service_state_t; 34 35 #define RECOGNITION_MODE_VOICE_TRIGGER 0x1 /* simple voice trigger */ 36 #define RECOGNITION_MODE_USER_IDENTIFICATION 0x2 /* trigger only if one user in model identified */ 37 #define RECOGNITION_MODE_USER_AUTHENTICATION 0x4 /* trigger only if one user in mode 38 authenticated */ 39 #define RECOGNITION_MODE_GENERIC_TRIGGER 0x8 /* generic sound trigger */ 40 41 #define RECOGNITION_STATUS_SUCCESS 0 42 #define RECOGNITION_STATUS_ABORT 1 43 #define RECOGNITION_STATUS_FAILURE 2 44 #define RECOGNITION_STATUS_GET_STATE_RESPONSE 3 /* Indicates that the recognition event is in 45 response to a state request and was not 46 triggered by a real DSP recognition */ 47 48 #define SOUND_MODEL_STATUS_UPDATED 0 49 50 typedef enum { 51 SOUND_MODEL_TYPE_UNKNOWN = -1, /* use for unspecified sound model type */ 52 SOUND_MODEL_TYPE_KEYPHRASE = 0, /* use for key phrase sound models */ 53 SOUND_MODEL_TYPE_GENERIC = 1 /* use for all models other than keyphrase */ 54 } sound_trigger_sound_model_type_t; 55 56 /** 57 * AudioCapabilities supported by the implemented HAL 58 * driver. 59 */ 60 typedef enum AudioCapabilities : uint32_t { 61 /** 62 * If set the underlying module supports AEC. 63 */ 64 SOUND_TRIGGER_ECHO_CANCELLATION = 1 << 0, 65 /** 66 * If set, the underlying module supports noise suppression. 67 */ 68 SOUND_TRIGGER_NOISE_SUPPRESSION = 1 << 1, 69 } sound_trigger_audio_capabilities_t; 70 71 typedef audio_uuid_t sound_trigger_uuid_t; 72 73 /* 74 * sound trigger implementation descriptor read by the framework via get_properties(). 75 * Used by SoundTrigger service to report to applications and manage concurrency and policy. 76 */ 77 struct sound_trigger_properties { 78 char implementor[SOUND_TRIGGER_MAX_STRING_LEN]; /* implementor name */ 79 char description[SOUND_TRIGGER_MAX_STRING_LEN]; /* implementation description */ 80 unsigned int version; /* implementation version */ 81 sound_trigger_uuid_t uuid; /* unique implementation ID. 82 Must change with version each version */ 83 unsigned int max_sound_models; /* maximum number of concurrent sound models 84 loaded */ 85 unsigned int max_key_phrases; /* maximum number of key phrases */ 86 unsigned int max_users; /* maximum number of concurrent users detected */ 87 unsigned int recognition_modes; /* all supported modes. 88 e.g RECOGNITION_MODE_VOICE_TRIGGER */ 89 bool capture_transition; /* supports seamless transition from detection 90 to capture */ 91 unsigned int max_buffer_ms; /* maximum buffering capacity in ms if 92 capture_transition is true*/ 93 bool concurrent_capture; /* supports capture by other use cases while 94 detection is active */ 95 bool trigger_in_event; /* returns the trigger capture in event */ 96 unsigned int power_consumption_mw; /* Rated power consumption when detection is active 97 with TDB silence/sound/speech ratio */ 98 }; 99 100 /* 101 * Properties header used to describe the version and size of extended properties. 102 * A header struct can be passed as a polymorphic struct (see usage below). 103 * 104 * Ex. cast to access properties: 105 * if (header->version >= SOUND_TRIGGER_DEVICE_API_VERSION_1_3) { 106 * sound_trigger_properties_extended_1_3 *properties = 107 * (sound_trigger_properties_extended_1_3*)header; 108 * } 109 * 110 * Ex. copy based on total size: 111 * void* buffer = malloc(header->size); 112 * memcpy(buffer, header, header->size); 113 * 114 * Each new version update must append to the previous one. This allows higher 115 * versioned extended properties structs to be cast down to previous versions. 116 */ 117 struct sound_trigger_properties_header { 118 uint32_t version; 119 size_t size; 120 }; 121 122 /* 123 * extended soundtrigger implementation descriptor containing verbose implementation 124 * properties. This is an extension of the base sound_trigger_properties struct. 125 * sound_trigger_properties_extended_1_3.header.version is expected to be 126 * SOUND_TRIGGER_DEVICE_API_VERSION_1_3. 127 */ 128 struct sound_trigger_properties_extended_1_3 { 129 /** header descriptor defining the struct's version */ 130 struct sound_trigger_properties_header header; 131 /** base properties */ 132 struct sound_trigger_properties base; 133 /** 134 * String naming the architecture used for running the supported models. 135 * (eg. DSP architecture) 136 */ 137 char supported_model_arch[SOUND_TRIGGER_MAX_STRING_LEN]; 138 /** 139 * Bit field encoding of the 140 * sound_trigger_audio_capabilities_t supported by the firmware. 141 */ 142 uint32_t audio_capabilities; 143 }; 144 145 typedef int sound_trigger_module_handle_t; 146 147 struct sound_trigger_module_descriptor { 148 sound_trigger_module_handle_t handle; 149 struct sound_trigger_properties properties; 150 }; 151 152 typedef int sound_model_handle_t; 153 154 /* 155 * Base sound model descriptor. This struct is the header of a larger block passed to 156 * load_sound_model() and containing the binary data of the sound model. 157 * Proprietary representation of users in binary data must match information indicated 158 * by users field 159 */ 160 struct sound_trigger_sound_model { 161 sound_trigger_sound_model_type_t type; /* model type. e.g. SOUND_MODEL_TYPE_KEYPHRASE */ 162 sound_trigger_uuid_t uuid; /* unique sound model ID. */ 163 sound_trigger_uuid_t vendor_uuid; /* unique vendor ID. Identifies the engine the 164 sound model was build for */ 165 unsigned int data_size; /* size of opaque model data */ 166 unsigned int data_offset; /* offset of opaque data start from head of struct 167 (e.g sizeof struct sound_trigger_sound_model) */ 168 }; 169 170 /* key phrase descriptor */ 171 struct sound_trigger_phrase { 172 unsigned int id; /* keyphrase ID */ 173 unsigned int recognition_mode; /* recognition modes supported by this key phrase */ 174 unsigned int num_users; /* number of users in the key phrase */ 175 unsigned int users[SOUND_TRIGGER_MAX_USERS]; /* users ids: (not uid_t but sound trigger 176 specific IDs */ 177 char locale[SOUND_TRIGGER_MAX_LOCALE_LEN]; /* locale - JAVA Locale style (e.g. en_US) */ 178 char text[SOUND_TRIGGER_MAX_STRING_LEN]; /* phrase text in UTF-8 format. */ 179 }; 180 181 /* 182 * Specialized sound model for key phrase detection. 183 * Proprietary representation of key phrases in binary data must match information indicated 184 * by phrases field 185 */ 186 struct sound_trigger_phrase_sound_model { 187 struct sound_trigger_sound_model common; 188 unsigned int num_phrases; /* number of key phrases in model */ 189 struct sound_trigger_phrase phrases[SOUND_TRIGGER_MAX_PHRASES]; 190 }; 191 192 193 /* 194 * Generic sound model, used for all cases except key phrase detection. 195 */ 196 struct sound_trigger_generic_sound_model { 197 struct sound_trigger_sound_model common; 198 }; 199 200 /* 201 * Model specific parameters to be used with parameter set and get APIs 202 */ 203 typedef enum { 204 /* 205 * Controls the sensitivity threshold adjustment factor for a given model. 206 * Negative value corresponds to less sensitive model (high threshold) and 207 * a positive value corresponds to a more sensitive model (low threshold). 208 * Default value is 0. 209 */ 210 MODEL_PARAMETER_THRESHOLD_FACTOR = 0, 211 212 /* 213 * Placeholder for invalid model parameter used for returning error or 214 * passing an invalid value. 215 */ 216 MODEL_PARAMETER_INVALID = -1, 217 } sound_trigger_model_parameter_t; 218 219 /** 220 * Model specific support for a given parameter 221 */ 222 typedef struct { 223 /** 224 * Boolean flag to determine if the parameter is supported by the hardware. 225 * The value of this parameter must be true to consider the start and end 226 * fields to be valid values. 227 */ 228 bool is_supported; 229 /** 230 * start of supported value range inclusive 231 */ 232 int32_t start; 233 /** 234 * end of supported value range inclusive 235 */ 236 int32_t end; 237 } sound_trigger_model_parameter_range_t; 238 239 /* 240 * Generic recognition event sent via recognition callback 241 * Must be aligned to transmit as raw memory through Binder. 242 */ 243 struct __attribute__((aligned(8))) sound_trigger_recognition_event { 244 int status; /* recognition status e.g. 245 RECOGNITION_STATUS_SUCCESS */ 246 sound_trigger_sound_model_type_t type; /* event type, same as sound model type. 247 e.g. SOUND_MODEL_TYPE_KEYPHRASE */ 248 sound_model_handle_t model; /* loaded sound model that triggered the 249 event */ 250 bool capture_available; /* it is possible to capture audio from this 251 utterance buffered by the 252 implementation */ 253 int capture_session; /* audio session ID. framework use */ 254 int capture_delay_ms; /* delay in ms between end of model 255 detection and start of audio available 256 for capture. A negative value is possible 257 (e.g. if key phrase is also available for 258 capture */ 259 int capture_preamble_ms; /* duration in ms of audio captured 260 before the start of the trigger. 261 0 if none. */ 262 bool trigger_in_data; /* the opaque data is the capture of 263 the trigger sound */ 264 audio_config_t audio_config; /* audio format of either the trigger in 265 event data or to use for capture of the 266 rest of the utterance */ 267 unsigned int data_size; /* size of opaque event data */ 268 unsigned int data_offset; /* offset of opaque data start from start of 269 this struct (e.g sizeof struct 270 sound_trigger_phrase_recognition_event) */ 271 }; 272 273 /* 274 * Confidence level for each user in struct sound_trigger_phrase_recognition_extra 275 */ 276 struct sound_trigger_confidence_level { 277 unsigned int user_id; /* user ID */ 278 unsigned int level; /* confidence level in percent (0 - 100). 279 - min level for recognition configuration 280 - detected level for recognition event */ 281 }; 282 283 /* 284 * Specialized recognition event for key phrase detection 285 */ 286 struct sound_trigger_phrase_recognition_extra { 287 unsigned int id; /* keyphrase ID */ 288 unsigned int recognition_modes; /* recognition modes used for this keyphrase */ 289 unsigned int confidence_level; /* confidence level for mode RECOGNITION_MODE_VOICE_TRIGGER */ 290 unsigned int num_levels; /* number of user confidence levels */ 291 struct sound_trigger_confidence_level levels[SOUND_TRIGGER_MAX_USERS]; 292 }; 293 294 struct sound_trigger_phrase_recognition_event { 295 struct sound_trigger_recognition_event common; 296 unsigned int num_phrases; 297 struct sound_trigger_phrase_recognition_extra phrase_extras[SOUND_TRIGGER_MAX_PHRASES]; 298 }; 299 300 struct sound_trigger_generic_recognition_event { 301 struct sound_trigger_recognition_event common; 302 }; 303 304 /* 305 * configuration for sound trigger capture session passed to start_recognition() 306 */ 307 struct sound_trigger_recognition_config { 308 audio_io_handle_t capture_handle; /* IO handle that will be used for capture. 309 N/A if capture_requested is false */ 310 audio_devices_t capture_device; /* input device requested for detection capture */ 311 bool capture_requested; /* capture and buffer audio for this recognition 312 instance */ 313 unsigned int num_phrases; /* number of key phrases recognition extras */ 314 struct sound_trigger_phrase_recognition_extra phrases[SOUND_TRIGGER_MAX_PHRASES]; 315 /* configuration for each key phrase */ 316 unsigned int data_size; /* size of opaque capture configuration data */ 317 unsigned int data_offset; /* offset of opaque data start from start of this struct 318 (e.g sizeof struct sound_trigger_recognition_config) */ 319 }; 320 321 /* 322 * Recognition config header used to describe the version and size of extended struct. 323 * A header struct can be passed as a polymorphic struct (see usage below). 324 * 325 * Ex. cast to access properties: 326 * if (header->version >= SOUND_TRIGGER_DEVICE_API_VERSION_1_3) { 327 * sound_trigger_recognition_config_extended_1_3 *config = 328 * (sound_trigger_recognition_config_extended_1_3*)header; 329 * } 330 * 331 * Ex. copy based on total size: 332 * void* buffer = malloc(header->size); 333 * memcpy(buffer, header, header->size); 334 * 335 * Each new version update must append to the previous one. This allows higher 336 * versioned extended properties structs to be cast down to previous versions. 337 */ 338 struct sound_trigger_recognition_config_header { 339 uint32_t version; 340 size_t size; 341 }; 342 343 /* 344 * Configuration for sound trigger capture session. 345 * This is an extension of the base sound_trigger_recognition_config struct. 346 * sound_trigger_recognition_config_extended_1_3.header.version is expected to be 347 * SOUND_TRIGGER_DEVICE_API_VERSION_1_3. 348 */ 349 struct sound_trigger_recognition_config_extended_1_3 { 350 /** header descriptor defining the struct's version */ 351 struct sound_trigger_recognition_config_header header; 352 /** base config */ 353 struct sound_trigger_recognition_config base; 354 /** 355 * Bit field encoding of the 356 * sound_trigger_audio_capabilities_t supported by the firmware. 357 */ 358 uint32_t audio_capabilities; 359 }; 360 361 /* 362 * Event sent via load sound model callback 363 */ 364 struct sound_trigger_model_event { 365 int status; /* sound model status e.g. SOUND_MODEL_STATUS_UPDATED */ 366 sound_model_handle_t model; /* loaded sound model that triggered the event */ 367 unsigned int data_size; /* size of event data if any. Size of updated sound model if 368 status is SOUND_MODEL_STATUS_UPDATED */ 369 unsigned int data_offset; /* offset of data start from start of this struct 370 (e.g sizeof struct sound_trigger_model_event) */ 371 }; 372 373 374 #endif // ANDROID_SOUND_TRIGGER_H 375