1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_SOUND_TRIGGER_H
18 #define ANDROID_SOUND_TRIGGER_H
19 
20 #include <stdbool.h>
21 #include <system/audio.h>
22 
/* Max length of strings in properties or descriptor structs. */
#define SOUND_TRIGGER_MAX_STRING_LEN 64
/* Max length of a locale string, e.g. en_US. */
#define SOUND_TRIGGER_MAX_LOCALE_LEN 6
/* Max number of concurrent users. */
#define SOUND_TRIGGER_MAX_USERS 10
/* Max number of concurrent phrases. */
#define SOUND_TRIGGER_MAX_PHRASES 10
28 
/* State of the sound trigger service. */
typedef enum {
    /* The sound trigger service is not initialized */
    SOUND_TRIGGER_STATE_NO_INIT = -1,
    /* The sound trigger service is enabled */
    SOUND_TRIGGER_STATE_ENABLED = 0,
    /* The sound trigger service is disabled */
    SOUND_TRIGGER_STATE_DISABLED = 1
} sound_trigger_service_state_t;
34 
/*
 * Recognition mode flags. Each mode is a distinct bit so that several modes
 * can be combined in one unsigned int field.
 */
/* simple voice trigger */
#define RECOGNITION_MODE_VOICE_TRIGGER 0x1
/* trigger only if one user in model identified */
#define RECOGNITION_MODE_USER_IDENTIFICATION 0x2
/* trigger only if one user in model authenticated */
#define RECOGNITION_MODE_USER_AUTHENTICATION 0x4
/* generic sound trigger */
#define RECOGNITION_MODE_GENERIC_TRIGGER 0x8
40 
/* Recognition status values. */
#define RECOGNITION_STATUS_SUCCESS 0
#define RECOGNITION_STATUS_ABORT 1
#define RECOGNITION_STATUS_FAILURE 2
/*
 * Indicates that the recognition event is in response to a state request and
 * was not triggered by a real DSP recognition.
 */
#define RECOGNITION_STATUS_GET_STATE_RESPONSE 3

/* Sound model status value, e.g. for the status field of struct sound_trigger_model_event. */
#define SOUND_MODEL_STATUS_UPDATED 0
49 
/* Type of a sound model. */
typedef enum {
    /* use for unspecified sound model type */
    SOUND_MODEL_TYPE_UNKNOWN = -1,
    /* use for key phrase sound models */
    SOUND_MODEL_TYPE_KEYPHRASE = 0,
    /* use for all models other than keyphrase */
    SOUND_MODEL_TYPE_GENERIC = 1
} sound_trigger_sound_model_type_t;
55 
/**
 * AudioCapabilities supported by the implemented HAL
 * driver.
 *
 * Each enumerator is a single-bit flag; flags are OR-ed together into the
 * uint32_t audio_capabilities bit fields of the extended structs in this
 * header.
 *
 * A fixed underlying type ("enum X : uint32_t") is only standard in C++11 and
 * C23. Older C compilers reject it, so it is applied only where the language
 * guarantees support. In the fallback case both values fit in a plain enum,
 * and the bit fields that store them are declared as uint32_t regardless.
 */
#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
typedef enum AudioCapabilities : uint32_t {
#else
typedef enum AudioCapabilities {
#endif
    /**
     * If set the underlying module supports AEC.
     */
    SOUND_TRIGGER_ECHO_CANCELLATION = 1 << 0,
    /**
     * If set, the underlying module supports noise suppression.
     */
    SOUND_TRIGGER_NOISE_SUPPRESSION = 1 << 1,
} sound_trigger_audio_capabilities_t;
70 
/*
 * UUID type used in this header for implementation, sound model and vendor IDs.
 * Alias of audio_uuid_t (presumably declared by <system/audio.h> - confirm).
 */
typedef audio_uuid_t sound_trigger_uuid_t;
72 
73 /*
74  * sound trigger implementation descriptor read by the framework via get_properties().
75  * Used by SoundTrigger service to report to applications and manage concurrency and policy.
76  */
77 struct sound_trigger_properties {
78     char                 implementor[SOUND_TRIGGER_MAX_STRING_LEN]; /* implementor name */
79     char                 description[SOUND_TRIGGER_MAX_STRING_LEN]; /* implementation description */
80     unsigned int         version;               /* implementation version */
81     sound_trigger_uuid_t uuid;                  /* unique implementation ID.
82                                                    Must change with version each version */
83     unsigned int         max_sound_models;      /* maximum number of concurrent sound models
84                                                    loaded */
85     unsigned int         max_key_phrases;       /* maximum number of key phrases */
86     unsigned int         max_users;             /* maximum number of concurrent users detected */
87     unsigned int         recognition_modes;     /* all supported modes.
88                                                    e.g RECOGNITION_MODE_VOICE_TRIGGER */
89     bool                 capture_transition;    /* supports seamless transition from detection
90                                                    to capture */
91     unsigned int         max_buffer_ms;         /* maximum buffering capacity in ms if
92                                                    capture_transition is true*/
93     bool                 concurrent_capture;    /* supports capture by other use cases while
94                                                    detection is active */
95     bool                 trigger_in_event;      /* returns the trigger capture in event */
96     unsigned int         power_consumption_mw;  /* Rated power consumption when detection is active
97                                                    with TDB silence/sound/speech ratio */
98 };
99 
/*
 * Header placed at the start of extended properties structs. It records the version and
 * the total size of the struct, so a pointer to the header can be handled polymorphically
 * (see the usage examples below).
 *
 * Ex. cast to access properties:
 * if (header->version >= SOUND_TRIGGER_DEVICE_API_VERSION_1_3) {
 *   struct sound_trigger_properties_extended_1_3 *properties =
 *       (struct sound_trigger_properties_extended_1_3*)header;
 * }
 *
 * Ex. copy based on total size:
 * void* buffer = malloc(header->size);
 * memcpy(buffer, header, header->size);
 *
 * Every new version must append to the previous one, so that higher versioned extended
 * properties structs can be cast down to previous versions.
 */
struct sound_trigger_properties_header {
    /* version of the extended struct this header begins */
    uint32_t version;
    /* total size of the extended struct, as used by the copy example above */
    size_t size;
};
121 
122 /*
123  * extended soundtrigger implementation descriptor containing verbose implementation
124  * properties. This is an extension of the base sound_trigger_properties struct.
125  * sound_trigger_properties_extended_1_3.header.version is expected to be
126  * SOUND_TRIGGER_DEVICE_API_VERSION_1_3.
127  */
128 struct sound_trigger_properties_extended_1_3 {
129     /** header descriptor defining the struct's version */
130     struct sound_trigger_properties_header header;
131     /** base properties */
132     struct sound_trigger_properties base;
133     /**
134      * String naming the architecture used for running the supported models.
135      * (eg. DSP architecture)
136      */
137     char supported_model_arch[SOUND_TRIGGER_MAX_STRING_LEN];
138     /**
139      * Bit field encoding of the
140      * sound_trigger_audio_capabilities_t supported by the firmware.
141      */
142     uint32_t audio_capabilities;
143 };
144 
/* Integer handle type for a sound trigger module (see struct sound_trigger_module_descriptor). */
typedef int sound_trigger_module_handle_t;
146 
147 struct sound_trigger_module_descriptor {
148     sound_trigger_module_handle_t   handle;
149     struct sound_trigger_properties properties;
150 };
151 
/* Integer handle type for a loaded sound model, as carried in the event structs of this header. */
typedef int sound_model_handle_t;
153 
154 /*
155  * Base sound model descriptor. This struct is the header of a larger block passed to
156  * load_sound_model() and containing the binary data of the sound model.
157  * Proprietary representation of users in binary data must match information indicated
158  * by users field
159  */
160 struct sound_trigger_sound_model {
161     sound_trigger_sound_model_type_t type;        /* model type. e.g. SOUND_MODEL_TYPE_KEYPHRASE */
162     sound_trigger_uuid_t             uuid;        /* unique sound model ID. */
163     sound_trigger_uuid_t             vendor_uuid; /* unique vendor ID. Identifies the engine the
164                                                   sound model was build for */
165     unsigned int                     data_size;   /* size of opaque model data */
166     unsigned int                     data_offset; /* offset of opaque data start from head of struct
167                                                     (e.g sizeof struct sound_trigger_sound_model) */
168 };
169 
170 /* key phrase descriptor */
171 struct sound_trigger_phrase {
172     unsigned int id;                /* keyphrase ID */
173     unsigned int recognition_mode;  /* recognition modes supported by this key phrase */
174     unsigned int num_users;         /* number of users in the key phrase */
175     unsigned int users[SOUND_TRIGGER_MAX_USERS]; /* users ids: (not uid_t but sound trigger
176                                                  specific IDs */
177     char         locale[SOUND_TRIGGER_MAX_LOCALE_LEN]; /* locale - JAVA Locale style (e.g. en_US) */
178     char         text[SOUND_TRIGGER_MAX_STRING_LEN];   /* phrase text in UTF-8 format. */
179 };
180 
181 /*
182  * Specialized sound model for key phrase detection.
183  * Proprietary representation of key phrases in binary data must match information indicated
184  * by phrases field
185  */
186 struct sound_trigger_phrase_sound_model {
187     struct sound_trigger_sound_model common;
188     unsigned int                     num_phrases;   /* number of key phrases in model */
189     struct sound_trigger_phrase      phrases[SOUND_TRIGGER_MAX_PHRASES];
190 };
191 
192 
193 /*
194  * Generic sound model, used for all cases except key phrase detection.
195  */
196 struct sound_trigger_generic_sound_model {
197     struct sound_trigger_sound_model common;
198 };
199 
/*
 * Model specific parameters, for use with the parameter set and get APIs.
 */
typedef enum {
    /*
     * Placeholder for an invalid model parameter; used for returning an error
     * or passing an invalid value.
     */
    MODEL_PARAMETER_INVALID = -1,

    /*
     * Sensitivity threshold adjustment factor for a given model.
     * A negative value makes the model less sensitive (high threshold), a
     * positive value makes it more sensitive (low threshold).
     * Default value is 0.
     */
    MODEL_PARAMETER_THRESHOLD_FACTOR = 0
} sound_trigger_model_parameter_t;
218 
/**
 * Describes whether a given model supports a parameter and, if so, the
 * range of values it accepts.
 */
typedef struct {
    /**
     * True if the parameter is supported by the hardware. The start and end
     * fields are only valid when this flag is true.
     */
    bool is_supported;

    /** First value of the supported range (inclusive). */
    int32_t start;

    /** Last value of the supported range (inclusive). */
    int32_t end;
} sound_trigger_model_parameter_range_t;
238 
239 /*
240  * Generic recognition event sent via recognition callback
241  * Must be aligned to transmit as raw memory through Binder.
242  */
243 struct __attribute__((aligned(8))) sound_trigger_recognition_event {
244     int                              status;            /* recognition status e.g.
245                                                            RECOGNITION_STATUS_SUCCESS */
246     sound_trigger_sound_model_type_t type;              /* event type, same as sound model type.
247                                                            e.g. SOUND_MODEL_TYPE_KEYPHRASE */
248     sound_model_handle_t             model;             /* loaded sound model that triggered the
249                                                            event */
250     bool                             capture_available; /* it is possible to capture audio from this
251                                                            utterance buffered by the
252                                                            implementation */
253     int                              capture_session;   /* audio session ID. framework use */
254     int                              capture_delay_ms;  /* delay in ms between end of model
255                                                            detection and start of audio available
256                                                            for capture. A negative value is possible
257                                                            (e.g. if key phrase is also available for
258                                                            capture */
259     int                              capture_preamble_ms; /* duration in ms of audio captured
260                                                             before the start of the trigger.
261                                                             0 if none. */
262     bool                             trigger_in_data; /* the opaque data is the capture of
263                                                             the trigger sound */
264     audio_config_t                   audio_config;        /* audio format of either the trigger in
265                                                              event data or to use for capture of the
266                                                              rest of the utterance */
267     unsigned int                     data_size;         /* size of opaque event data */
268     unsigned int                     data_offset;       /* offset of opaque data start from start of
269                                                           this struct (e.g sizeof struct
270                                                           sound_trigger_phrase_recognition_event) */
271 };
272 
/*
 * Per-user confidence level, used in struct sound_trigger_phrase_recognition_extra.
 */
struct sound_trigger_confidence_level {
    /* user ID */
    unsigned int user_id;
    /*
     * confidence level in percent (0 - 100).
     * - min level for recognition configuration
     * - detected level for recognition event
     */
    unsigned int level;
};
282 
283 /*
284  * Specialized recognition event for key phrase detection
285  */
286 struct sound_trigger_phrase_recognition_extra {
287     unsigned int id;                /* keyphrase ID */
288     unsigned int recognition_modes; /* recognition modes used for this keyphrase */
289     unsigned int confidence_level;  /* confidence level for mode RECOGNITION_MODE_VOICE_TRIGGER */
290     unsigned int num_levels;        /* number of user confidence levels */
291     struct sound_trigger_confidence_level levels[SOUND_TRIGGER_MAX_USERS];
292 };
293 
294 struct sound_trigger_phrase_recognition_event {
295     struct sound_trigger_recognition_event common;
296     unsigned int                           num_phrases;
297     struct sound_trigger_phrase_recognition_extra phrase_extras[SOUND_TRIGGER_MAX_PHRASES];
298 };
299 
300 struct sound_trigger_generic_recognition_event {
301     struct sound_trigger_recognition_event common;
302 };
303 
304 /*
305  * configuration for sound trigger capture session passed to start_recognition()
306  */
307 struct sound_trigger_recognition_config {
308     audio_io_handle_t    capture_handle;    /* IO handle that will be used for capture.
309                                             N/A if capture_requested is false */
310     audio_devices_t      capture_device;    /* input device requested for detection capture */
311     bool                 capture_requested; /* capture and buffer audio for this recognition
312                                             instance */
313     unsigned int         num_phrases;   /* number of key phrases recognition extras */
314     struct sound_trigger_phrase_recognition_extra phrases[SOUND_TRIGGER_MAX_PHRASES];
315                                            /* configuration for each key phrase */
316     unsigned int        data_size;         /* size of opaque capture configuration data */
317     unsigned int        data_offset;       /* offset of opaque data start from start of this struct
318                                            (e.g sizeof struct sound_trigger_recognition_config) */
319 };
320 
/*
 * Header placed at the start of extended recognition config structs. It records the
 * version and the total size of the struct, so a pointer to the header can be handled
 * polymorphically (see the usage examples below).
 *
 * Ex. cast to access the extended config:
 * if (header->version >= SOUND_TRIGGER_DEVICE_API_VERSION_1_3) {
 *   struct sound_trigger_recognition_config_extended_1_3 *config =
 *       (struct sound_trigger_recognition_config_extended_1_3*)header;
 * }
 *
 * Ex. copy based on total size:
 * void* buffer = malloc(header->size);
 * memcpy(buffer, header, header->size);
 *
 * Every new version must append to the previous one, so that higher versioned extended
 * config structs can be cast down to previous versions.
 */
struct sound_trigger_recognition_config_header {
    /* version of the extended struct this header begins */
    uint32_t version;
    /* total size of the extended struct, as used by the copy example above */
    size_t size;
};
342 
343 /*
344  * Configuration for sound trigger capture session.
345  * This is an extension of the base sound_trigger_recognition_config struct.
346  * sound_trigger_recognition_config_extended_1_3.header.version is expected to be
347  * SOUND_TRIGGER_DEVICE_API_VERSION_1_3.
348  */
349 struct sound_trigger_recognition_config_extended_1_3 {
350     /** header descriptor defining the struct's version */
351     struct sound_trigger_recognition_config_header header;
352     /** base config */
353     struct sound_trigger_recognition_config base;
354     /**
355      * Bit field encoding of the
356      * sound_trigger_audio_capabilities_t supported by the firmware.
357      */
358     uint32_t audio_capabilities;
359 };
360 
361 /*
362  * Event sent via load sound model callback
363  */
364 struct sound_trigger_model_event {
365     int                  status;      /* sound model status e.g. SOUND_MODEL_STATUS_UPDATED */
366     sound_model_handle_t model;       /* loaded sound model that triggered the event */
367     unsigned int         data_size;   /* size of event data if any. Size of updated sound model if
368                                        status is SOUND_MODEL_STATUS_UPDATED */
369     unsigned int         data_offset; /* offset of data start from start of this struct
370                                        (e.g sizeof struct sound_trigger_model_event) */
371 };
372 
373 
374 #endif  // ANDROID_SOUND_TRIGGER_H
375