diff --git a/build/modules.conf.in b/build/modules.conf.in
index 8851ded303c..c2e0d7c08b4 100755
--- a/build/modules.conf.in
+++ b/build/modules.conf.in
@@ -47,6 +47,8 @@ applications/mod_voicemail
#asr_tts/mod_flite
#asr_tts/mod_pocketsphinx
#asr_tts/mod_tts_commandline
+#asr_tts/mod_google_asr
+#asr_tts/mod_openai_asr
codecs/mod_amr
#codecs/mod_amrwb
codecs/mod_b64
diff --git a/configure.ac b/configure.ac
index 2141e5e3c0f..c9c807f8d4b 100755
--- a/configure.ac
+++ b/configure.ac
@@ -2016,6 +2016,8 @@ AC_CONFIG_FILES([Makefile
src/mod/asr_tts/mod_flite/Makefile
src/mod/asr_tts/mod_pocketsphinx/Makefile
src/mod/asr_tts/mod_tts_commandline/Makefile
+ src/mod/asr_tts/mod_google_asr/Makefile
+ src/mod/asr_tts/mod_openai_asr/Makefile
src/mod/codecs/mod_amr/Makefile
src/mod/codecs/mod_amrwb/Makefile
src/mod/codecs/mod_b64/Makefile
diff --git a/src/mod/asr_tts/mod_google_asr/Makefile.am b/src/mod/asr_tts/mod_google_asr/Makefile.am
new file mode 100644
index 00000000000..a829a855c9c
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/Makefile.am
@@ -0,0 +1,12 @@
+
+include $(top_srcdir)/build/modmake.rulesam
+
+MODNAME=mod_google_asr
+mod_LTLIBRARIES = mod_google_asr.la
+mod_google_asr_la_SOURCES = mod_google_asr.c utils.c curl.c
+mod_google_asr_la_CFLAGS = $(AM_CFLAGS) -I. -Wno-pointer-arith
+mod_google_asr_la_LIBADD = $(switch_builddir)/libfreeswitch.la
+mod_google_asr_la_LDFLAGS = -avoid-version -module -no-undefined -shared
+
+$(am_mod_google_asr_la_OBJECTS): mod_google_asr.h
+
diff --git a/src/mod/asr_tts/mod_google_asr/conf/autoload_configs/google_asr.conf.xml b/src/mod/asr_tts/mod_google_asr/conf/autoload_configs/google_asr.conf.xml
new file mode 100644
index 00000000000..76772fa7776
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/conf/autoload_configs/google_asr.conf.xml
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/mod/asr_tts/mod_google_asr/conf/dialplan/dialplan.xml b/src/mod/asr_tts/mod_google_asr/conf/dialplan/dialplan.xml
new file mode 100644
index 00000000000..121cc2b13d4
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/conf/dialplan/dialplan.xml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/mod/asr_tts/mod_google_asr/curl.c b/src/mod/asr_tts/mod_google_asr/curl.c
new file mode 100644
index 00000000000..636a49c59c6
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/curl.c
@@ -0,0 +1,129 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_google_asr.h"
+
+/* libcurl write callback: appends the received bytes to asr_ctx->curl_recv_buffer_ref (when set) and always reports them as consumed. */
+static size_t curl_io_write_callback(char *buffer, size_t size, size_t nitems, void *user_data) {
+    asr_ctx_t *ctx = (asr_ctx_t *)user_data;
+    const size_t total = size * nitems;
+
+    if(total == 0 || !ctx->curl_recv_buffer_ref) { return total; }
+    switch_buffer_write(ctx->curl_recv_buffer_ref, buffer, total);
+
+    return total;
+}
+
+/* libcurl read callback: copies up to size*nitems bytes of the pending request body, advances the send cursor and returns the count (0 once drained). */
+static size_t curl_io_read_callback(char *buffer, size_t size, size_t nitems, void *user_data) {
+    asr_ctx_t *ctx = (asr_ctx_t *)user_data;
+    const size_t capacity = size * nitems;
+    size_t chunk = ctx->curl_send_buffer_len;
+
+    if(chunk > capacity) { chunk = capacity; }
+    if(chunk == 0) { return 0; }
+    memmove(buffer, ctx->curl_send_buffer_ref, chunk);
+    ctx->curl_send_buffer_ref += chunk;
+    ctx->curl_send_buffer_len -= chunk;
+    return chunk;
+}
+
+/*
+ * Performs one blocking HTTP POST to the configured endpoint. The body is streamed from
+ * asr_ctx->curl_send_buffer_ref/_len (consumed by the read callback) and the response is
+ * appended to asr_ctx->curl_recv_buffer_ref, NUL-terminated when anything was received.
+ * Returns SWITCH_STATUS_SUCCESS for HTTP 200 only, SWITCH_STATUS_FALSE in every other case.
+ */
+switch_status_t curl_perform(asr_ctx_t *asr_ctx, globals_t *globals) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    CURL *curl_handle = NULL;
+    switch_curl_slist_t *headers = NULL;
+    char *epurl = NULL;
+    switch_CURLcode curl_ret = 0;
+    long http_resp = 0;
+
+    /* strdup() and strncasecmp() below must never be handed a NULL url */
+    if(!globals->api_url) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "api-url is not configured\n");
+        return SWITCH_STATUS_FALSE;
+    }
+    if(asr_ctx->api_key) {
+        epurl = switch_string_replace(globals->api_url, "${api-key}", asr_ctx->api_key);
+    } else {
+        epurl = strdup(globals->api_url);
+    }
+    if(epurl) { curl_handle = switch_curl_easy_init(); }
+    if(!curl_handle) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to prepare request (%s)\n", globals->api_url);
+        switch_safe_free(epurl);
+        return SWITCH_STATUS_FALSE;
+    }
+
+    /* libcurl fetches numeric options as long from its varargs, hence the L suffixes and casts */
+    headers = switch_curl_slist_append(headers, "Content-Type: application/json; charset=utf-8");
+    switch_curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, headers);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_POST, 1L);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1L);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_READFUNCTION, curl_io_read_callback);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_READDATA, (void *)asr_ctx);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, curl_io_write_callback);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)asr_ctx);
+    if(globals->connect_timeout > 0) { switch_curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT, (long)globals->connect_timeout); }
+    if(globals->request_timeout > 0) { switch_curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, (long)globals->request_timeout); }
+    if(globals->user_agent) { switch_curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, globals->user_agent); }
+    if(strncasecmp(epurl, "https", 5) == 0) {
+        /* NOTE(review): peer and host verification are switched off here, so the server is not authenticated */
+        switch_curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 0L);
+        switch_curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 0L);
+    }
+    if(globals->proxy) {
+        if(globals->proxy_credentials != NULL) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYUSERPWD, globals->proxy_credentials);
+        }
+        if(strncasecmp(globals->proxy, "https", 5) == 0) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY_SSL_VERIFYPEER, 0L);
+        }
+        switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY, globals->proxy);
+    }
+    switch_curl_easy_setopt(curl_handle, CURLOPT_URL, epurl);
+
+    curl_ret = switch_curl_easy_perform(curl_handle);
+    if(!curl_ret) {
+        switch_curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &http_resp);
+        if(!http_resp) { switch_curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CONNECTCODE, &http_resp); }
+    } else {
+        http_resp = curl_ret; /* transfer failed: the libcurl code is reported in place of an HTTP status */
+    }
+    if(http_resp != 200) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "http-error=[%ld] (%s)\n", http_resp, globals->api_url);
+        status = SWITCH_STATUS_FALSE;
+    }
+
+    /* terminate the received body so the caller can read it as a C string */
+    if(asr_ctx->curl_recv_buffer_ref && switch_buffer_inuse(asr_ctx->curl_recv_buffer_ref) > 0) {
+        switch_buffer_write(asr_ctx->curl_recv_buffer_ref, "\0", 1);
+    }
+    switch_curl_easy_cleanup(curl_handle);
+    if(headers) { switch_curl_slist_free_all(headers); }
+    switch_safe_free(epurl);
+    return status;
+}
diff --git a/src/mod/asr_tts/mod_google_asr/mod_google_asr.c b/src/mod/asr_tts/mod_google_asr/mod_google_asr.c
new file mode 100644
index 00000000000..736577b26be
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/mod_google_asr.c
@@ -0,0 +1,792 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ * Google Speech-To-Text service for FreeSWITCH.
+ * https://cloud.google.com/speech-to-text/docs/reference/rest
+ *
+ * Development repository:
+ * https://github.com/akscf/mod_google_asr
+ *
+ */
+#include "mod_google_asr.h"
+
+globals_t globals; /* module-wide settings and counters; zeroed at the start of mod_google_asr_load() */
+/* Module registration: load and shutdown handlers are supplied, the last argument is NULL. */
+SWITCH_MODULE_LOAD_FUNCTION(mod_google_asr_load);
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_google_asr_shutdown);
+SWITCH_MODULE_DEFINITION(mod_google_asr, mod_google_asr_load, mod_google_asr_shutdown, NULL);
+
+
+/*
+ * Per-handle worker thread: drains the audio queue into a chunk buffer, decides from the VAD
+ * state and the silence timeout when an utterance is complete, posts it (base64 inside JSON) via
+ * curl_perform() and pushes the parsed text to q_text ("[transcription failed]" on any error).
+ * Runs until globals.fl_shutdown or asr_ctx->fl_destroyed is set; holds a reference on the
+ * context (asr_ctx->refs) for its whole lifetime and decrements globals.active_threads on exit.
+ */
+static void *SWITCH_THREAD_FUNC transcribe_thread(switch_thread_t *thread, void *obj) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)obj;
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    switch_byte_t *base64_buffer = NULL, *curl_send_buffer = NULL;
+    switch_buffer_t *chunk_buffer = NULL, *curl_recv_buffer = NULL;
+    switch_memory_pool_t *pool = NULL;
+    time_t speech_timeout = 0;
+    uint32_t base64_buffer_size = 0, chunk_buffer_size = 0, recv_len = 0, schunks = 0;
+    uint8_t fl_cbuff_overflow = SWITCH_FALSE;
+    const void *curl_recv_buffer_ptr = NULL;
+    void *pop = NULL;
+
+    switch_mutex_lock(asr_ctx->mutex);
+    asr_ctx->refs++;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(switch_core_new_memory_pool(&pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_core_new_memory_pool()\n");
+        goto out;
+    }
+    if(switch_buffer_create_dynamic(&curl_recv_buffer, 1024, 4096, 32648) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create_dynamic()\n");
+        goto out;
+    }
+
+    while(SWITCH_TRUE) {
+        if(globals.fl_shutdown || asr_ctx->fl_destroyed ) {
+            break;
+        }
+
+        /* the chunk size is published by asr_feed() together with the first frame; poll until it is known */
+        if(chunk_buffer_size == 0) {
+            switch_mutex_lock(asr_ctx->mutex);
+            chunk_buffer_size = asr_ctx->chunk_buffer_size;
+            switch_mutex_unlock(asr_ctx->mutex);
+            if(chunk_buffer_size > 0) {
+                if(switch_buffer_create(pool, &chunk_buffer, chunk_buffer_size) != SWITCH_STATUS_SUCCESS) {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_buffer_create()\n");
+                    break;
+                }
+                switch_buffer_zero(chunk_buffer);
+            }
+            goto timer_next;
+        }
+
+        fl_cbuff_overflow = SWITCH_FALSE;
+        while(switch_queue_trypop(asr_ctx->q_audio, &pop) == SWITCH_STATUS_SUCCESS) {
+            xdata_buffer_t *audio_buffer = (xdata_buffer_t *)pop;
+            if(globals.fl_shutdown || asr_ctx->fl_destroyed ) {
+                xdata_buffer_free(&audio_buffer);
+                break;
+            }
+            if(audio_buffer && audio_buffer->len) {
+                if(switch_buffer_write(chunk_buffer, audio_buffer->data, audio_buffer->len) >= chunk_buffer_size) {
+                    fl_cbuff_overflow = SWITCH_TRUE;
+                    xdata_buffer_free(&audio_buffer); /* this early exit used to leak the popped buffer */
+                    break;
+                }
+                schunks++;
+            }
+            xdata_buffer_free(&audio_buffer);
+        }
+
+        if(fl_cbuff_overflow) {
+            speech_timeout = 1; /* a deadline in the past: the full chunk is flushed right away */
+        } else {
+            if(schunks && asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+                if(!speech_timeout) {
+                    speech_timeout = asr_ctx->silence_sec + switch_epoch_time_now(NULL);
+                }
+            }
+            if(speech_timeout && (asr_ctx->vad_state == SWITCH_VAD_STATE_START_TALKING || asr_ctx->vad_state == SWITCH_VAD_STATE_TALKING)) {
+                speech_timeout = 0;
+            }
+        }
+
+        if(speech_timeout && speech_timeout <= switch_epoch_time_now(NULL)) {
+            const void *chunk_buffer_ptr = NULL;
+            uint32_t buf_len = switch_buffer_peek_zerocopy(chunk_buffer, &chunk_buffer_ptr);
+            uint32_t b64_len = BASE64_ENC_SZ(buf_len) + 1;
+            uint32_t stt_failed = 0;
+            if(base64_buffer_size == 0 || base64_buffer_size < b64_len) {
+                if(base64_buffer_size > 0) { switch_safe_free(base64_buffer); }
+                switch_zmalloc(base64_buffer, b64_len);
+                base64_buffer_size = b64_len;
+            } else {
+                memset(base64_buffer, 0x0, b64_len);
+            }
+            if(switch_b64_encode((uint8_t *)chunk_buffer_ptr, buf_len, base64_buffer, base64_buffer_size) == SWITCH_STATUS_SUCCESS) {
+                int max_tries = (asr_ctx->retries_on_error > 0 ? asr_ctx->retries_on_error : 1); /* always try at least once */
+                size_t send_len = 0;
+                curl_send_buffer = (switch_byte_t *)switch_mprintf( "{'config':{" \
+                    "'languageCode':'%s', 'encoding':'%s', 'sampleRateHertz':'%u', 'audioChannelCount':'%u', 'maxAlternatives':'%u', " \
+                    "'profanityFilter':'%s', 'enableWordTimeOffsets':'%s', 'enableWordConfidence':'%s', 'enableAutomaticPunctuation':'%s', " \
+                    "'enableSpokenPunctuation':'%s', 'enableSpokenEmojis':'%s', 'model':'%s', 'useEnhanced':'%s', " \
+                    " 'diarizationConfig':{'enableSpeakerDiarization': '%s', 'minSpeakerCount': '%u', 'maxSpeakerCount': '%u'}, " \
+                    "'metadata':{'interactionType':'%s', 'microphoneDistance':'%s', 'recordingDeviceType':'%s'}}, 'audio':{'content':'%s'}}",
+                    asr_ctx->lang,
+                    globals.opt_encoding,
+                    asr_ctx->samplerate,
+                    asr_ctx->channels,
+                    asr_ctx->opt_max_alternatives,
+                    BOOL2STR(asr_ctx->opt_enable_profanity_filter),
+                    BOOL2STR(asr_ctx->opt_enable_word_time_offsets),
+                    BOOL2STR(asr_ctx->opt_enable_word_confidence),
+                    BOOL2STR(asr_ctx->opt_enable_automatic_punctuation),
+                    BOOL2STR(asr_ctx->opt_enable_spoken_punctuation),
+                    BOOL2STR(asr_ctx->opt_enable_spoken_emojis),
+                    asr_ctx->opt_speech_model,
+                    BOOL2STR(asr_ctx->opt_use_enhanced_model),
+                    BOOL2STR(asr_ctx->opt_enable_speaker_diarization),
+                    asr_ctx->opt_diarization_min_speaker_count,
+                    asr_ctx->opt_diarization_max_speaker_count,
+                    asr_ctx->opt_meta_interaction_type,
+                    asr_ctx->opt_meta_microphone_distance,
+                    asr_ctx->opt_meta_recording_device_type,
+                    base64_buffer
+                );
+                /* if the request could not be formatted, no attempt is made and the failure path below reports it */
+                send_len = (curl_send_buffer ? strlen((const char *)curl_send_buffer) : 0);
+                asr_ctx->curl_recv_buffer_ref = curl_recv_buffer;
+                status = SWITCH_STATUS_FALSE;
+                for(int rqtry = 0; curl_send_buffer && rqtry < max_tries; rqtry++) {
+                    /* the read callback consumes the send cursor, so it is rewound for every attempt */
+                    asr_ctx->curl_send_buffer_ref = curl_send_buffer;
+                    asr_ctx->curl_send_buffer_len = send_len;
+                    switch_buffer_zero(curl_recv_buffer);
+                    status = curl_perform(asr_ctx, &globals);
+                    if(status == SWITCH_STATUS_SUCCESS || globals.fl_shutdown || asr_ctx->fl_destroyed) { break; }
+                    switch_yield(1000);
+                }
+                recv_len = switch_buffer_peek_zerocopy(curl_recv_buffer, &curl_recv_buffer_ptr);
+                if(status == SWITCH_STATUS_SUCCESS) {
+                    if(curl_recv_buffer_ptr && recv_len) {
+                        char *txt = parse_response((char *)curl_recv_buffer_ptr, NULL);
+#ifdef MOD_GOOGLE_ASR_DEBUG
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Service response [%s]\n", (char *)curl_recv_buffer_ptr);
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Text [%s]\n", txt ? txt : "null");
+#endif
+                        if(!txt) txt = strdup("");
+                        if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+                            switch_mutex_lock(asr_ctx->mutex);
+                            asr_ctx->transcription_results++;
+                            switch_mutex_unlock(asr_ctx->mutex);
+                        } else {
+                            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+                            switch_safe_free(txt);
+                        }
+                    } else {
+                        stt_failed = 1;
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Empty service response!\n");
+                    }
+                } else {
+                    stt_failed = 1;
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to perform request!\n");
+                }
+                switch_safe_free(curl_send_buffer);
+            } else {
+                stt_failed = 1;
+                switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_b64_encode() failed\n");
+            }
+
+            if(stt_failed) {
+                char *txt = strdup("[transcription failed]");
+                if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+                    switch_mutex_lock(asr_ctx->mutex);
+                    asr_ctx->transcription_results++;
+                    switch_mutex_unlock(asr_ctx->mutex);
+                } else {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+                    switch_safe_free(txt);
+                }
+            }
+            schunks = 0;
+            speech_timeout = 0;
+            switch_buffer_zero(chunk_buffer);
+        }
+        timer_next:
+        switch_yield(10000);
+    }
+
+out:
+    switch_safe_free(base64_buffer);
+    switch_safe_free(curl_send_buffer);
+    if(curl_recv_buffer) {
+        switch_buffer_destroy(&curl_recv_buffer);
+    }
+    if(chunk_buffer) {
+        switch_buffer_destroy(&chunk_buffer);
+    }
+    if(pool) {
+        switch_core_destroy_memory_pool(&pool);
+    }
+
+    switch_mutex_lock(asr_ctx->mutex);
+    if(asr_ctx->refs > 0) asr_ctx->refs--;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    switch_mutex_lock(globals.mutex);
+    if(globals.active_threads) globals.active_threads--;
+    switch_mutex_unlock(globals.mutex);
+    return NULL;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+// asr interface
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+/* ASR "open" callback: builds the per-handle context (module defaults, audio/text queues, VAD) in ah->memory_pool
+ * and starts the detached worker thread. Only "L16" is accepted; returns SUCCESS, or FALSE/GENERR when setup fails. */
+static switch_status_t asr_open(switch_asr_handle_t *ah, const char *codec, int samplerate, const char *dest, switch_asr_flag_t *flags) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    switch_threadattr_t *attr = NULL;
+    switch_thread_t *thread = NULL;
+    asr_ctx_t *asr_ctx = NULL;
+
+    if(strcmp(codec, "L16") !=0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unsupported encoding: %s\n", codec);
+        switch_goto_status(SWITCH_STATUS_FALSE, out);
+    }
+    if((asr_ctx = switch_core_alloc(ah->memory_pool, sizeof(asr_ctx_t))) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_core_alloc()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    asr_ctx->channels = 1;
+    asr_ctx->chunk_buffer_size = 0;
+    asr_ctx->samplerate = samplerate;
+    asr_ctx->silence_sec = globals.speech_silence_sec;
+    asr_ctx->lang = (char *)globals.default_lang;
+    asr_ctx->api_key = globals.api_key;
+    asr_ctx->retries_on_error = globals.retries_on_error;
+    asr_ctx->opt_max_alternatives = globals.opt_max_alternatives;
+    asr_ctx->opt_enable_profanity_filter = globals.opt_enable_profanity_filter;
+    asr_ctx->opt_enable_word_time_offsets = globals.opt_enable_word_time_offsets;
+    asr_ctx->opt_enable_word_confidence = globals.opt_enable_word_confidence;
+    asr_ctx->opt_enable_automatic_punctuation = globals.opt_enable_automatic_punctuation;
+    asr_ctx->opt_enable_spoken_punctuation = globals.opt_enable_spoken_punctuation;
+    asr_ctx->opt_enable_spoken_emojis = globals.opt_enable_spoken_emojis;
+    asr_ctx->opt_meta_interaction_type = globals.opt_meta_interaction_type;
+    asr_ctx->opt_meta_microphone_distance = globals.opt_meta_microphone_distance;
+    asr_ctx->opt_meta_recording_device_type = globals.opt_meta_recording_device_type;
+    asr_ctx->opt_speech_model = globals.opt_speech_model;
+    asr_ctx->opt_use_enhanced_model = globals.opt_use_enhanced_model;
+    asr_ctx->opt_enable_speaker_diarization = SWITCH_FALSE;
+    asr_ctx->opt_diarization_min_speaker_count = 1;
+    asr_ctx->opt_diarization_max_speaker_count = 1;
+    if((status = switch_mutex_init(&asr_ctx->mutex, SWITCH_MUTEX_NESTED, ah->memory_pool)) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_mutex_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+    switch_queue_create(&asr_ctx->q_audio, QUEUE_SIZE, ah->memory_pool);
+    switch_queue_create(&asr_ctx->q_text, QUEUE_SIZE, ah->memory_pool);
+    asr_ctx->vad_buffer = NULL;
+    asr_ctx->frame_len = 0;
+    asr_ctx->vad_buffer_size = 0;
+    asr_ctx->vad_stored_frames = 0;
+    asr_ctx->fl_vad_first_cycle = SWITCH_TRUE;
+    if((asr_ctx->vad = switch_vad_init(asr_ctx->samplerate, asr_ctx->channels)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_vad_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+    switch_vad_set_mode(asr_ctx->vad, -1);
+    switch_vad_set_param(asr_ctx->vad, "debug", globals.fl_vad_debug);
+    if(globals.vad_silence_ms > 0) { switch_vad_set_param(asr_ctx->vad, "silence_ms", globals.vad_silence_ms); }
+    if(globals.vad_voice_ms > 0) { switch_vad_set_param(asr_ctx->vad, "voice_ms", globals.vad_voice_ms); }
+    if(globals.vad_threshold > 0) { switch_vad_set_param(asr_ctx->vad, "thresh", globals.vad_threshold); }
+
+    ah->private_info = asr_ctx;
+    switch_mutex_lock(globals.mutex);
+    globals.active_threads++;
+    switch_mutex_unlock(globals.mutex);
+    switch_threadattr_create(&attr, ah->memory_pool);
+    switch_threadattr_detach_set(attr, 1);
+    switch_threadattr_stacksize_set(attr, SWITCH_THREAD_STACKSIZE);
+    if(switch_thread_create(&thread, attr, transcribe_thread, asr_ctx, ah->memory_pool) != SWITCH_STATUS_SUCCESS) { /* no worker will undo these */
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_thread_create()\n");
+        switch_mutex_lock(globals.mutex);
+        if(globals.active_threads) globals.active_threads--;
+        switch_mutex_unlock(globals.mutex);
+        switch_vad_destroy(&asr_ctx->vad);
+        status = SWITCH_STATUS_GENERR;
+    }
+out:
+    return status;
+}
+
+/*
+ * ASR "close" callback: flags the context as aborted/destroyed, polls until no worker holds a
+ * reference, then cleans and terminates both queues, frees the VAD and marks the handle closed.
+ */
+static switch_status_t asr_close(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+    uint8_t busy = SWITCH_TRUE;
+
+    assert(ctx != NULL);
+
+    ctx->fl_abort = SWITCH_TRUE;
+    ctx->fl_destroyed = SWITCH_TRUE;
+
+    switch_mutex_lock(ctx->mutex);
+    busy = (ctx->refs != 0);
+    switch_mutex_unlock(ctx->mutex);
+
+    if(busy) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Waiting for unlock (refs=%u)...\n", ctx->refs);
+        do {
+            switch_mutex_lock(ctx->mutex);
+            busy = (ctx->refs != 0);
+            switch_mutex_unlock(ctx->mutex);
+            switch_yield(100000);
+        } while(busy);
+    }
+
+    /* past this point the reference count has been observed at zero */
+    if(ctx->q_audio) {
+        xdata_buffer_queue_clean(ctx->q_audio);
+        switch_queue_term(ctx->q_audio);
+    }
+    if(ctx->q_text) {
+        text_queue_clean(ctx->q_text);
+        switch_queue_term(ctx->q_text);
+    }
+    if(ctx->vad) { switch_vad_destroy(&ctx->vad); }
+    if(ctx->vad_buffer) { switch_buffer_destroy(&ctx->vad_buffer); }
+
+    switch_set_flag(ah, SWITCH_ASR_FLAG_CLOSED);
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/*
+ * ASR "feed" callback: runs voice-activity detection on the incoming frame and queues audio for
+ * the worker thread while speech is detected. While not talking, frames are kept in a small
+ * history buffer so that audio from just before the start of speech is prepended to the first chunk.
+ * Returns BREAK when the handle is closed/destroyed/aborted or the input is empty, else SUCCESS.
+ */
+static switch_status_t asr_feed(switch_asr_handle_t *ah, void *data, unsigned int data_len, switch_asr_flag_t *flags) {
+ asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+ switch_vad_state_t vad_state = 0;
+ uint8_t fl_has_audio = SWITCH_FALSE;
+ assert(asr_ctx != NULL);
+ if(switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED)) {
+ return SWITCH_STATUS_BREAK;
+ }
+ if(asr_ctx->fl_destroyed || asr_ctx->fl_abort) {
+ return SWITCH_STATUS_BREAK;
+ }
+ if(asr_ctx->fl_pause) {
+ return SWITCH_STATUS_SUCCESS;
+ }
+ if(!data || !data_len) {
+ return SWITCH_STATUS_BREAK;
+ }
+ /* first frame only: derive the buffer sizes from its length and publish them under the mutex */
+ if(data_len > 0 && asr_ctx->frame_len == 0) {
+ switch_mutex_lock(asr_ctx->mutex);
+ asr_ctx->frame_len = data_len;
+ asr_ctx->vad_buffer_size = asr_ctx->frame_len * VAD_STORE_FRAMES;
+ asr_ctx->chunk_buffer_size = asr_ctx->samplerate * globals.speech_max_sec; /* NOTE(review): rate * seconds is a sample count while audio is written in bytes (L16) - confirm the intended capacity */
+ switch_mutex_unlock(asr_ctx->mutex);
+
+ if(switch_buffer_create(ah->memory_pool, &asr_ctx->vad_buffer, asr_ctx->vad_buffer_size) != SWITCH_STATUS_SUCCESS) {
+ asr_ctx->vad_buffer_size = 0;
+ switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create()\n");
+ }
+ }
+ /* with a history buffer: store the frame while not talking (the buffer restarts once VAD_STORE_FRAMES frames are held), then run the VAD; without one every frame counts as audio */
+ if(asr_ctx->vad_buffer_size) {
+ if(asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING || (asr_ctx->vad_state == vad_state && vad_state == SWITCH_VAD_STATE_NONE)) {
+ if(data_len <= asr_ctx->frame_len) {
+ if(asr_ctx->vad_stored_frames >= VAD_STORE_FRAMES) {
+ switch_buffer_zero(asr_ctx->vad_buffer);
+ asr_ctx->vad_stored_frames = 0;
+ asr_ctx->fl_vad_first_cycle = SWITCH_FALSE;
+ }
+ switch_buffer_write(asr_ctx->vad_buffer, data, MIN(asr_ctx->frame_len, data_len));
+ asr_ctx->vad_stored_frames++;
+ }
+ }
+
+ vad_state = switch_vad_process(asr_ctx->vad, (int16_t *)data, (data_len / sizeof(int16_t)));
+ if(vad_state == SWITCH_VAD_STATE_START_TALKING) {
+ asr_ctx->vad_state = vad_state;
+ fl_has_audio = SWITCH_TRUE;
+ } else if (vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+ asr_ctx->vad_state = vad_state;
+ fl_has_audio = SWITCH_FALSE;
+ switch_vad_reset(asr_ctx->vad);
+ } else if (vad_state == SWITCH_VAD_STATE_TALKING) {
+ asr_ctx->vad_state = vad_state;
+ fl_has_audio = SWITCH_TRUE;
+ }
+ } else {
+ fl_has_audio = SWITCH_TRUE;
+ }
+
+ if(fl_has_audio) {
+ if(vad_state == SWITCH_VAD_STATE_START_TALKING && asr_ctx->vad_stored_frames > 0) {
+ xdata_buffer_t *tau_buf = NULL;
+ const void *ptr = NULL;
+ switch_size_t vblen = 0;
+ uint32_t rframes = 0, rlen = 0;
+ int ofs = 0;
+ if((vblen = switch_buffer_peek_zerocopy(asr_ctx->vad_buffer, &ptr)) && ptr && vblen > 0) {
+ rframes = (asr_ctx->vad_stored_frames >= VAD_RECOVERY_FRAMES ? VAD_RECOVERY_FRAMES : (asr_ctx->fl_vad_first_cycle ? asr_ctx->vad_stored_frames : VAD_RECOVERY_FRAMES));
+ rlen = (rframes * asr_ctx->frame_len);
+ ofs = (vblen - rlen);
+ /* ofs < 0: more recovery bytes are wanted than the buffer holds; the rest is read from the tail of its storage (presumably frames left from the previous fill cycle - TODO confirm) */
+ if(ofs < 0) {
+ uint32_t hdr_sz = -ofs;
+ uint32_t hdr_ofs = (asr_ctx->vad_buffer_size - hdr_sz);
+ switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+ tau_buf->len = (hdr_sz + vblen + data_len);
+ switch_malloc(tau_buf->data, tau_buf->len);
+ /* layout: [hdr_sz bytes from the storage tail][current buffer content][this frame]; hdr_sz + vblen == rlen */
+ memcpy(tau_buf->data, (void *)(ptr + hdr_ofs), hdr_sz);
+ memcpy(tau_buf->data + hdr_sz , (void *)(ptr + 0), vblen);
+ memcpy(tau_buf->data + rlen, data, data_len);
+ if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+ xdata_buffer_free(&tau_buf);
+ }
+
+ switch_buffer_zero(asr_ctx->vad_buffer);
+ asr_ctx->vad_stored_frames = 0;
+ } else {
+ switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+ tau_buf->len = (rlen + data_len);
+ switch_malloc(tau_buf->data, tau_buf->len);
+ /* layout: [last rlen bytes of the buffer content][this frame] */
+ memcpy(tau_buf->data, (void *)(ptr + ofs), rlen);
+ memcpy(tau_buf->data + rlen, data, data_len);
+
+ if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+ xdata_buffer_free(&tau_buf);
+ }
+
+ switch_buffer_zero(asr_ctx->vad_buffer);
+ asr_ctx->vad_stored_frames = 0;
+ }
+ }
+ } else {
+ xdata_buffer_push(asr_ctx->q_audio, data, data_len);
+ }
+ }
+
+ return SWITCH_STATUS_SUCCESS;
+}
+
+/* ASR "check results" callback: SUCCESS when the handle is not paused and at least one transcription is pending, FALSE otherwise. */
+static switch_status_t asr_check_results(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+
+    if(!ctx->fl_pause && ctx->transcription_results > 0) {
+        return SWITCH_STATUS_SUCCESS;
+    }
+    return SWITCH_STATUS_FALSE;
+}
+
+/*
+ * ASR "get results" callback: pops the next transcription from the text queue into *xmlstr (the
+ * string is handed over as is) and decrements the pending counter; FALSE when nothing is queued.
+ */
+static switch_status_t asr_get_results(switch_asr_handle_t *ah, char **xmlstr, switch_asr_flag_t *flags) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+    void *item = NULL;
+
+    assert(ctx != NULL);
+
+    if(switch_queue_trypop(ctx->q_text, &item) != SWITCH_STATUS_SUCCESS || !item) {
+        return SWITCH_STATUS_FALSE;
+    }
+
+    *xmlstr = (char *)item;
+    switch_mutex_lock(ctx->mutex);
+    if(ctx->transcription_results > 0) ctx->transcription_results--;
+    switch_mutex_unlock(ctx->mutex);
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* ASR "start input timers" callback: only records the request in fl_start_timers (no reader of the flag appears in this part of the file). */
+static switch_status_t asr_start_input_timers(switch_asr_handle_t *ah) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+
+    ctx->fl_start_timers = SWITCH_TRUE;
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* ASR "pause" callback: sets fl_pause, which makes asr_feed() drop incoming audio and asr_check_results() report nothing. */
+static switch_status_t asr_pause(switch_asr_handle_t *ah) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+
+    ctx->fl_pause = SWITCH_TRUE;
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* ASR "resume" callback: clears fl_pause so that fed audio is processed and results are reported again. */
+static switch_status_t asr_resume(switch_asr_handle_t *ah) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+
+    ctx->fl_pause = SWITCH_FALSE;
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* ASR "text param" callback: overrides one per-handle option (strings are copied into ah->memory_pool); a NULL name/value or an unknown name is ignored. */
+static void asr_text_param(switch_asr_handle_t *ah, char *param, const char *val) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+    assert(asr_ctx != NULL);
+    if(!param || !val) { return; } /* strcasecmp() must not see NULL; every option needs a value */
+    if(!strcasecmp(param, "lang")) {
+        asr_ctx->lang = switch_core_strdup(ah->memory_pool, gcp_get_language(val));
+    } else if(!strcasecmp(param, "silence")) {
+        asr_ctx->silence_sec = atoi(val);
+    } else if(!strcasecmp(param, "key")) {
+        asr_ctx->api_key = switch_core_strdup(ah->memory_pool, val);
+    } else if(!strcasecmp(param, "speech-model")) {
+        asr_ctx->opt_speech_model = switch_core_strdup(ah->memory_pool, val);
+    } else if(!strcasecmp(param, "use-enhanced-model")) {
+        asr_ctx->opt_use_enhanced_model = switch_true(val);
+    } else if(!strcasecmp(param, "max-alternatives")) {
+        asr_ctx->opt_max_alternatives = atoi(val);
+    } else if(!strcasecmp(param, "enable-word-time-offsets")) {
+        asr_ctx->opt_enable_word_time_offsets = switch_true(val);
+    } else if(!strcasecmp(param, "enable-word-confidence")) { /* same name as the module setting; the former literal was misspelled and never matched */
+        asr_ctx->opt_enable_word_confidence = switch_true(val);
+    } else if(!strcasecmp(param, "enable-profanity-filter")) {
+        asr_ctx->opt_enable_profanity_filter = switch_true(val);
+    } else if(!strcasecmp(param, "enable-automatic-punctuation")) {
+        asr_ctx->opt_enable_automatic_punctuation = switch_true(val);
+    } else if(!strcasecmp(param, "enable-spoken-punctuation")) {
+        asr_ctx->opt_enable_spoken_punctuation = switch_true(val);
+    } else if(!strcasecmp(param, "enable-spoken-emojis")) {
+        asr_ctx->opt_enable_spoken_emojis = switch_true(val);
+    } else if(!strcasecmp(param, "microphone-distance")) {
+        asr_ctx->opt_meta_microphone_distance = switch_core_strdup(ah->memory_pool, gcp_get_microphone_distance(val));
+    } else if(!strcasecmp(param, "recording-device-type")) {
+        asr_ctx->opt_meta_recording_device_type = switch_core_strdup(ah->memory_pool, gcp_get_recording_device(val));
+    } else if(!strcasecmp(param, "interaction-type")) {
+        asr_ctx->opt_meta_interaction_type = switch_core_strdup(ah->memory_pool, gcp_get_interaction(val));
+    } else if(!strcasecmp(param, "enable-speaker-diarization")) { /* the former literal lacked the final 'n' */
+        asr_ctx->opt_enable_speaker_diarization = switch_true(val);
+    } else if(!strcasecmp(param, "diarization-min-speakers")) {
+        asr_ctx->opt_diarization_min_speaker_count = atoi(val);
+    } else if(!strcasecmp(param, "diarization-max-speakers")) {
+        asr_ctx->opt_diarization_max_speaker_count = atoi(val);
+    }
+}
+
+static void asr_numeric_param(switch_asr_handle_t *ah, char *param, int val) { /* ASR "numeric param" callback: intentionally empty, the arguments are ignored */
+}
+
+static void asr_float_param(switch_asr_handle_t *ah, char *param, double val) { /* ASR "float param" callback: intentionally empty, the arguments are ignored */
+}
+
+static switch_status_t asr_load_grammar(switch_asr_handle_t *ah, const char *grammar, const char *name) { /* ASR "load grammar" callback: the grammar is ignored, always succeeds */
+ return SWITCH_STATUS_SUCCESS;
+}
+
+static switch_status_t asr_unload_grammar(switch_asr_handle_t *ah, const char *name) { /* ASR "unload grammar" callback: nothing to unload, always succeeds */
+ return SWITCH_STATUS_SUCCESS;
+}
+
+#define CMD_SYNTAX "path_to/filename.(mp3|wav) []\n"
+
+/*
+ * API command handler (placeholder): the command line is split on spaces into at most 10
+ * arguments; without arguments the usage text is printed, otherwise "-ERR: not yet implemented".
+ * Always returns SWITCH_STATUS_SUCCESS.
+ */
+SWITCH_STANDARD_API(google_asr_cmd_handler) {
+    char *args[10] = { 0 };
+    char *cmd_copy = NULL;
+    int nargs = 0;
+
+    if(!zstr(cmd)) {
+        cmd_copy = strdup(cmd);
+        switch_assert(cmd_copy);
+        nargs = switch_separate_string(cmd_copy, ' ', args, (sizeof(args) / sizeof(args[0])));
+    }
+
+    if(nargs == 0) {
+        stream->write_function(stream, "-ERR:\nUsage: %s\n", CMD_SYNTAX);
+    } else {
+        /* todo: the command itself is still to be written */
+        stream->write_function(stream, "-ERR: not yet implemented\n");
+    }
+
+    switch_safe_free(cmd_copy);
+    return SWITCH_STATUS_SUCCESS;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+// main
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+/**
+ * Module entry point.
+ *
+ * Reads MOD_CONFIG_NAME, fills in `globals` (applying defaults for anything
+ * left unset), makes sure the temporary cache directory exists and registers
+ * the `google_asr_transcript` API command and the "google" ASR interface.
+ *
+ * Returns SWITCH_STATUS_GENERR when the configuration cannot be opened or the
+ * required `api-url` parameter is missing/empty, SWITCH_STATUS_SUCCESS otherwise.
+ */
+SWITCH_MODULE_LOAD_FUNCTION(mod_google_asr_load) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    switch_xml_t cfg, xml, settings, param;
+    switch_api_interface_t *commands_interface;
+    switch_asr_interface_t *asr_interface;
+
+    memset(&globals, 0, sizeof(globals));
+    switch_mutex_init(&globals.mutex, SWITCH_MUTEX_NESTED, pool);
+
+    if((xml = switch_xml_open_cfg(MOD_CONFIG_NAME, &cfg, NULL)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to open configuration: %s\n", MOD_CONFIG_NAME);
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    if((settings = switch_xml_child(cfg, "settings"))) {
+        for (param = switch_xml_child(settings, "param"); param; param = param->next) {
+            // switch_xml_attr_soft() yields "" (never NULL) for a missing attribute,
+            // so no NULL checks are needed on var/val below.
+            const char *var = switch_xml_attr_soft(param, "name");
+            const char *val = switch_xml_attr_soft(param, "value");
+
+            if(!strcasecmp(var, "vad-silence-ms")) {
+                globals.vad_silence_ms = atoi(val);
+            } else if(!strcasecmp(var, "vad-voice-ms")) {
+                globals.vad_voice_ms = atoi(val);
+            } else if(!strcasecmp(var, "vad-threshold")) {
+                globals.vad_threshold = atoi(val);
+            } else if(!strcasecmp(var, "vad-debug")) {
+                globals.fl_vad_debug = switch_true(val);
+            } else if(!strcasecmp(var, "api-key")) {
+                globals.api_key = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "api-url")) {
+                globals.api_url = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "user-agent")) {
+                globals.user_agent = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "proxy")) {
+                globals.proxy = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "proxy-credentials")) {
+                globals.proxy_credentials = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "default-language")) {
+                globals.default_lang = switch_core_strdup(pool, gcp_get_language(val));
+            } else if(!strcasecmp(var, "encoding")) {
+                globals.opt_encoding = switch_core_strdup(pool, gcp_get_encoding(val));
+            } else if(!strcasecmp(var, "speech-max-sec")) {
+                globals.speech_max_sec = atoi(val);
+            } else if(!strcasecmp(var, "speech-silence-sec")) {
+                globals.speech_silence_sec = atoi(val);
+            } else if(!strcasecmp(var, "request-timeout")) {
+                globals.request_timeout = atoi(val);
+            } else if(!strcasecmp(var, "connect-timeout")) {
+                globals.connect_timeout = atoi(val);
+            } else if(!strcasecmp(var, "retries-on-error")) {
+                globals.retries_on_error = atoi(val);
+            } else if(!strcasecmp(var, "speech-model")) {
+                globals.opt_speech_model = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "use-enhanced-model")) {
+                globals.opt_use_enhanced_model = switch_true(val);
+            } else if(!strcasecmp(var, "max-alternatives")) {
+                globals.opt_max_alternatives = atoi(val);
+            } else if(!strcasecmp(var, "enable-word-time-offsets")) {
+                globals.opt_enable_word_time_offsets = switch_true(val);
+            } else if(!strcasecmp(var, "enable-word-confidence")) {
+                globals.opt_enable_word_confidence = switch_true(val);
+            } else if(!strcasecmp(var, "enable-profanity-filter")) {
+                globals.opt_enable_profanity_filter = switch_true(val);
+            } else if(!strcasecmp(var, "enable-automatic-punctuation")) {
+                globals.opt_enable_automatic_punctuation = switch_true(val);
+            } else if(!strcasecmp(var, "enable-spoken-punctuation")) {
+                globals.opt_enable_spoken_punctuation = switch_true(val);
+            } else if(!strcasecmp(var, "enable-spoken-emojis")) {
+                globals.opt_enable_spoken_emojis = switch_true(val);
+            } else if(!strcasecmp(var, "microphone-distance")) {
+                globals.opt_meta_microphone_distance = switch_core_strdup(pool, gcp_get_microphone_distance(val));
+            } else if(!strcasecmp(var, "recording-device-type")) {
+                globals.opt_meta_recording_device_type = switch_core_strdup(pool, gcp_get_recording_device(val));
+            } else if(!strcasecmp(var, "interaction-type")) {
+                globals.opt_meta_interaction_type = switch_core_strdup(pool, gcp_get_interaction(val));
+            }
+        }
+    }
+
+    // An empty value is as useless as a missing one: treat both as "not configured".
+    if(zstr(globals.api_url)) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Missing required parameter: api-url\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    // Defaults for everything that was left unset (or set to an empty string).
+    if(!globals.speech_max_sec) { globals.speech_max_sec = 35; }
+    if(!globals.speech_silence_sec) { globals.speech_silence_sec = 3; }
+    if(!globals.retries_on_error) { globals.retries_on_error = 1; }
+    if(!globals.opt_max_alternatives) { globals.opt_max_alternatives = 1; }
+    if(zstr(globals.opt_encoding)) { globals.opt_encoding = gcp_get_encoding("l16"); }
+    if(zstr(globals.opt_speech_model)) { globals.opt_speech_model = "phone_call"; }
+    if(zstr(globals.opt_meta_microphone_distance)) { globals.opt_meta_microphone_distance = gcp_get_microphone_distance("unspecified"); }
+    if(zstr(globals.opt_meta_recording_device_type)) { globals.opt_meta_recording_device_type = gcp_get_recording_device("unspecified"); }
+    if(zstr(globals.opt_meta_interaction_type)) { globals.opt_meta_interaction_type = gcp_get_interaction("unspecified"); }
+
+    globals.tmp_path = switch_core_sprintf(pool, "%s%sgoogle-asr-cache", SWITCH_GLOBAL_dirs.temp_dir, SWITCH_PATH_SEPARATOR);
+    if(switch_directory_exists(globals.tmp_path, NULL) != SWITCH_STATUS_SUCCESS) {
+        // Not fatal for loading, but worth reporting instead of failing silently later.
+        if(switch_dir_make(globals.tmp_path, SWITCH_FPROT_OS_DEFAULT, NULL) != SWITCH_STATUS_SUCCESS) {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Unable to create directory: %s\n", globals.tmp_path);
+        }
+    }
+
+    *module_interface = switch_loadable_module_create_module_interface(pool, modname);
+    SWITCH_ADD_API(commands_interface, "google_asr_transcript", "Google speech-to-text", google_asr_cmd_handler, CMD_SYNTAX);
+
+    asr_interface = switch_loadable_module_create_interface(*module_interface, SWITCH_ASR_INTERFACE);
+    asr_interface->interface_name = "google";
+    asr_interface->asr_open = asr_open;
+    asr_interface->asr_close = asr_close;
+    asr_interface->asr_feed = asr_feed;
+    asr_interface->asr_pause = asr_pause;
+    asr_interface->asr_resume = asr_resume;
+    asr_interface->asr_check_results = asr_check_results;
+    asr_interface->asr_get_results = asr_get_results;
+    asr_interface->asr_start_input_timers = asr_start_input_timers;
+    asr_interface->asr_text_param = asr_text_param;
+    asr_interface->asr_numeric_param = asr_numeric_param;
+    asr_interface->asr_float_param = asr_float_param;
+    asr_interface->asr_load_grammar = asr_load_grammar;
+    asr_interface->asr_unload_grammar = asr_unload_grammar;
+
+    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Google-ASR (%s)\n", MOD_VERSION);
+out:
+    if(xml) {
+        switch_xml_free(xml);
+    }
+    return status;
+}
+
+/**
+ * Module shutdown hook: raises globals.fl_shutdown and then polls
+ * globals.active_threads (every 100 ms) until it has dropped to zero.
+ * Always returns SWITCH_STATUS_SUCCESS.
+ */
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_google_asr_shutdown) {
+    uint32_t pending = 0;
+
+    globals.fl_shutdown = SWITCH_TRUE;
+
+    // Snapshot the counter under the mutex so the value logged below is the one that was tested.
+    switch_mutex_lock(globals.mutex);
+    pending = globals.active_threads;
+    switch_mutex_unlock(globals.mutex);
+
+    if(pending > 0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Waiting for termination (%d) threads...\n", pending);
+        while(pending > 0) {
+            // Sleep first, re-check afterwards: no extra delay once the count reaches zero.
+            switch_yield(100000);
+            switch_mutex_lock(globals.mutex);
+            pending = globals.active_threads;
+            switch_mutex_unlock(globals.mutex);
+        }
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
diff --git a/src/mod/asr_tts/mod_google_asr/mod_google_asr.h b/src/mod/asr_tts/mod_google_asr/mod_google_asr.h
new file mode 100644
index 00000000000..ec56f244876
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/mod_google_asr.h
@@ -0,0 +1,149 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ *
+ */
+#ifndef MOD_GOOGLE_ASR_H
+#define MOD_GOOGLE_ASR_H
+
+#include
+#include
+#include
+
+// MIN/MAX are commonly provided by system headers; only define them when absent
+// so that an incompatible redefinition cannot break a -Werror build.
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+#ifndef MAX
+#define MAX(a,b) (((a)>(b))?(a):(b))
+#endif
+
+#define MOD_CONFIG_NAME "google_asr.conf"
+#define MOD_VERSION "1.0.4"
+#define QUEUE_SIZE 128
+#define VAD_STORE_FRAMES 64
+#define VAD_RECOVERY_FRAMES 20
+// Size of the base64 encoding of n bytes (without the terminating NUL).
+// The argument is parenthesized so that expressions such as `c ? a : b` expand correctly.
+#define BASE64_ENC_SZ(n) (4*(((n)+2)/3))
+#define BOOL2STR(v) ((v) ? "true" : "false")
+
+//#define MOD_GOOGLE_ASR_DEBUG
+
+// Module-wide settings and state; filled in by the module load function from
+// google_asr.conf (the quoted names below are the configuration parameter names).
+typedef struct {
+ switch_mutex_t *mutex; // taken around accesses to active_threads
+ uint32_t active_threads; // module shutdown waits until this reaches 0
+ uint32_t speech_max_sec; // "speech-max-sec"; 35 when unset
+ uint32_t speech_silence_sec; // "speech-silence-sec"; 3 when unset
+ uint32_t vad_silence_ms; // "vad-silence-ms"
+ uint32_t vad_voice_ms; // "vad-voice-ms"
+ uint32_t vad_threshold; // "vad-threshold"
+ uint32_t request_timeout; // "request-timeout", seconds
+ uint32_t connect_timeout; // "connect-timeout", seconds
+ uint32_t retries_on_error; // "retries-on-error"; 1 when unset
+ uint8_t fl_vad_debug; // "vad-debug"
+ uint8_t fl_shutdown; // set to SWITCH_TRUE by the module shutdown function
+ char *tmp_path; // <temp_dir>/google-asr-cache, created at load time if missing
+ char *api_key; // "api-key"
+ char *api_url; // "api-url" (required)
+ char *user_agent; // "user-agent"
+ char *default_lang; // "default-language", passed through gcp_get_language()
+ char *proxy; // "proxy"
+ char *proxy_credentials; // "proxy-credentials"
+ char *opt_encoding; // "encoding", passed through gcp_get_encoding(); LINEAR16 when unset
+ char *opt_speech_model; // "speech-model"; "phone_call" when unset
+ char *opt_meta_microphone_distance; // "microphone-distance"; MICROPHONE_DISTANCE_UNSPECIFIED when unset
+ char *opt_meta_recording_device_type; // "recording-device-type"; RECORDING_DEVICE_TYPE_UNSPECIFIED when unset
+ char *opt_meta_interaction_type; // "interaction-type"; INTERACTION_TYPE_UNSPECIFIED when unset
+ uint32_t opt_max_alternatives; // "max-alternatives"; 1 when unset
+ uint32_t opt_use_enhanced_model; // "use-enhanced-model" (boolean)
+ uint32_t opt_enable_word_time_offsets; // "enable-word-time-offsets" (boolean)
+ uint32_t opt_enable_word_confidence; // "enable-word-confidence" (boolean)
+ uint32_t opt_enable_profanity_filter; // "enable-profanity-filter" (boolean)
+ uint32_t opt_enable_automatic_punctuation; // "enable-automatic-punctuation" (boolean)
+ uint32_t opt_enable_spoken_punctuation; // "enable-spoken-punctuation" (boolean)
+ uint32_t opt_enable_spoken_emojis; // "enable-spoken-emojis" (boolean)
+} globals_t;
+
+
+// Recognition context passed to curl_perform() together with globals_t.
+// NOTE(review): presumably one instance per ASR handle, owned by mod_google_asr.c;
+// the code that fills and uses these fields is not part of this header - confirm there.
+typedef struct {
+ switch_memory_pool_t *pool;
+ switch_vad_t *vad;
+ switch_buffer_t *vad_buffer;
+ switch_mutex_t *mutex;
+ switch_queue_t *q_audio;
+ switch_queue_t *q_text;
+ switch_buffer_t *curl_recv_buffer_ref;
+ switch_byte_t *curl_send_buffer_ref;
+ char *api_key;
+ char *lang;
+ switch_vad_state_t vad_state;
+ uint32_t retries_on_error;
+ uint32_t curl_send_buffer_len;
+ uint32_t transcription_results;
+ uint32_t vad_buffer_size;
+ uint32_t vad_stored_frames;
+ uint32_t chunk_buffer_size;
+ uint32_t refs;
+ uint32_t samplerate;
+ uint32_t channels;
+ uint32_t frame_len;
+ uint32_t silence_sec;
+ uint8_t fl_start_timers;
+ uint8_t fl_pause;
+ uint8_t fl_vad_first_cycle;
+ uint8_t fl_destroyed;
+ uint8_t fl_abort;
+ // recognition options; the first twelve mirror the opt_* fields of globals_t,
+ // the diarization ones exist only here
+ char *opt_speech_model;
+ char *opt_meta_microphone_distance;
+ char *opt_meta_recording_device_type;
+ char *opt_meta_interaction_type;
+ uint32_t opt_max_alternatives;
+ uint32_t opt_use_enhanced_model;
+ uint32_t opt_enable_word_time_offsets;
+ uint32_t opt_enable_word_confidence;
+ uint32_t opt_enable_profanity_filter;
+ uint32_t opt_enable_automatic_punctuation;
+ uint32_t opt_enable_spoken_punctuation;
+ uint32_t opt_enable_spoken_emojis;
+ uint32_t opt_enable_speaker_diarization;
+ uint32_t opt_diarization_min_speaker_count;
+ uint32_t opt_diarization_max_speaker_count;
+} asr_ctx_t;
+
+// Heap-allocated byte block carried through a switch_queue_t
+// (created by xdata_buffer_alloc(), released by xdata_buffer_free()).
+typedef struct {
+ uint32_t len; // number of bytes in data (0 when data is NULL)
+ switch_byte_t *data; // private copy of the payload
+} xdata_buffer_t;
+
+
+/* curl.c */
+switch_status_t curl_perform(asr_ctx_t *asr_ctx, globals_t *globals);
+
+/* utils.c */
+// Copies data_len bytes of data into a new xdata_buffer_t and tries to enqueue it;
+// returns SWITCH_STATUS_FALSE (after freeing the copy) when the queue does not accept it.
+switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len);
+// Allocates an xdata_buffer_t holding a copy of data; *out receives the new buffer.
+switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len);
+// Frees the payload and the buffer itself; NULL-safe.
+void xdata_buffer_free(xdata_buffer_t **buf);
+// Pops and frees every xdata_buffer_t still waiting in queue.
+void xdata_buffer_queue_clean(switch_queue_t *queue);
+// Pops and frees every heap-allocated string still waiting in queue.
+void text_queue_clean(switch_queue_t *queue);
+// Returns a strdup()'ed copy of results[0].alternatives[0].transcript from the JSON
+// in data, or NULL when it is absent; the caller frees it. stream is not used.
+char *parse_response(char *data, switch_stream_handle_t *stream);
+
+// Each gcp_get_*() maps a (case-insensitive) configuration keyword to the constant
+// expected by the service and returns val itself when the keyword is unknown.
+char *gcp_get_language(const char *val);
+char *gcp_get_encoding(const char *val);
+char *gcp_get_microphone_distance(const char *val);
+char *gcp_get_recording_device(const char *val);
+char *gcp_get_interaction(const char *val);
+
+
+#endif
diff --git a/src/mod/asr_tts/mod_google_asr/utils.c b/src/mod/asr_tts/mod_google_asr/utils.c
new file mode 100644
index 00000000000..9842da557d3
--- /dev/null
+++ b/src/mod/asr_tts/mod_google_asr/utils.c
@@ -0,0 +1,171 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_google_asr.h"
+
+extern globals_t globals;
+
+/**
+ ** https://cloud.google.com/speech-to-text/docs/reference/rest/v1/RecognitionConfig
+ ** https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
+ **
+ **/
+// Expands a two-letter language code (matched case-insensitively) into the
+// locale tag used by the service; any other value is handed back untouched.
+char *gcp_get_language(const char *val) {
+    static const char *const lang_map[][2] = {
+        { "en", "en-US" },
+        { "de", "de-DE" },
+        { "es", "es-US" },
+        { "it", "it-IT" },
+        { "ru", "ru-RU" }
+    };
+    unsigned int idx;
+
+    for(idx = 0; idx < sizeof(lang_map) / sizeof(lang_map[0]); idx++) {
+        if(!strcasecmp(val, lang_map[idx][0])) {
+            return (char *)lang_map[idx][1];
+        }
+    }
+    return (char *)val;
+}
+
+// Translates an audio-encoding keyword (matched case-insensitively) into the
+// corresponding service constant; unknown keywords are returned as they are.
+char *gcp_get_encoding(const char *val) {
+    static const char *const enc_map[][2] = {
+        { "unspecified", "ENCODING_UNSPECIFIED" },
+        { "l16",         "LINEAR16" },
+        { "flac",        "FLAC" },
+        { "ulaw",        "MULAW" },
+        { "amr",         "AMR" }
+    };
+    unsigned int idx;
+
+    for(idx = 0; idx < sizeof(enc_map) / sizeof(enc_map[0]); idx++) {
+        if(!strcasecmp(val, enc_map[idx][0])) {
+            return (char *)enc_map[idx][1];
+        }
+    }
+    return (char *)val;
+}
+
+// Translates a microphone-distance keyword (matched case-insensitively) into the
+// corresponding service constant; unknown keywords are returned as they are.
+char *gcp_get_microphone_distance(const char *val) {
+    static const char *const dist_map[][2] = {
+        { "unspecified", "MICROPHONE_DISTANCE_UNSPECIFIED" },
+        { "nearfield",   "NEARFIELD" },
+        { "midfield",    "MIDFIELD" },
+        { "farfield",    "FARFIELD" }
+    };
+    unsigned int idx;
+
+    for(idx = 0; idx < sizeof(dist_map) / sizeof(dist_map[0]); idx++) {
+        if(!strcasecmp(val, dist_map[idx][0])) {
+            return (char *)dist_map[idx][1];
+        }
+    }
+    return (char *)val;
+}
+
+// Translates a recording-device keyword (matched case-insensitively) into the
+// corresponding service constant; unknown keywords are returned as they are.
+char *gcp_get_recording_device(const char *val) {
+    static const char *const dev_map[][2] = {
+        { "unspecified",          "RECORDING_DEVICE_TYPE_UNSPECIFIED" },
+        { "smartphone",           "SMARTPHONE" },
+        { "pc",                   "PC" },
+        { "phone_line",           "PHONE_LINE" },
+        { "vehicle",              "VEHICLE" },
+        { "other_outdoor_device", "OTHER_OUTDOOR_DEVICE" },
+        { "other_indoor_device",  "OTHER_INDOOR_DEVICE" }
+    };
+    unsigned int idx;
+
+    for(idx = 0; idx < sizeof(dev_map) / sizeof(dev_map[0]); idx++) {
+        if(!strcasecmp(val, dev_map[idx][0])) {
+            return (char *)dev_map[idx][1];
+        }
+    }
+    return (char *)val;
+}
+
+// Translates an interaction-type keyword (matched case-insensitively) into the
+// corresponding service constant; unknown keywords are returned as they are.
+char *gcp_get_interaction(const char *val) {
+    if(strcasecmp(val, "unspecified") == 0) { return "INTERACTION_TYPE_UNSPECIFIED"; }
+    if(strcasecmp(val, "discussion") == 0) { return "DISCUSSION"; }
+    if(strcasecmp(val, "presentation") == 0) { return "PRESENTATION"; }
+    if(strcasecmp(val, "phone_call") == 0) { return "PHONE_CALL"; }
+    if(strcasecmp(val, "voicemail") == 0) { return "VOICEMAIL"; }
+    // historical misspelling, still accepted so existing configurations keep working
+    if(strcasecmp(val, "voicemal") == 0) { return "VOICEMAIL"; }
+    if(strcasecmp(val, "professionally_produced") == 0) { return "PROFESSIONALLY_PRODUCED"; }
+    if(strcasecmp(val, "voice_search") == 0) { return "VOICE_SEARCH"; }
+    if(strcasecmp(val, "voice_command") == 0) { return "VOICE_COMMAND"; }
+    if(strcasecmp(val, "dictation") == 0) { return "DICTATION"; }
+    return (char *)val;
+}
+
+// Creates a zero-initialised xdata_buffer_t and, when data_len is non-zero,
+// stores a private copy of data in it. The new buffer is returned through *out;
+// the status is always SWITCH_STATUS_SUCCESS.
+switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len) {
+    xdata_buffer_t *item = NULL;
+
+    switch_zmalloc(item, sizeof(xdata_buffer_t));
+
+    if(data_len != 0) {
+        switch_malloc(item->data, data_len);
+        switch_assert(item->data);
+
+        memcpy(item->data, data, data_len);
+        item->len = data_len;
+    }
+
+    *out = item;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+// Releases the payload and the buffer itself, then clears the caller's pointer
+// so that it cannot be dereferenced or freed a second time. NULL-safe.
+void xdata_buffer_free(xdata_buffer_t **buf) {
+    if(buf && *buf) {
+        switch_safe_free((*buf)->data);
+        free(*buf);
+        *buf = NULL;
+    }
+}
+
+// Drains queue without blocking and frees every xdata_buffer_t found in it.
+void xdata_buffer_queue_clean(switch_queue_t *queue) {
+    xdata_buffer_t *item = NULL;
+
+    if(queue == NULL || switch_queue_size(queue) == 0) {
+        return;
+    }
+
+    for(;;) {
+        if(switch_queue_trypop(queue, (void *) &item) != SWITCH_STATUS_SUCCESS) {
+            break;
+        }
+        if(item != NULL) {
+            xdata_buffer_free(&item);
+        }
+    }
+}
+
+// Wraps a copy of data in an xdata_buffer_t and enqueues it without blocking.
+// Returns SWITCH_STATUS_SUCCESS once queued; otherwise the copy is released and
+// SWITCH_STATUS_FALSE is returned.
+switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len) {
+    xdata_buffer_t *item = NULL;
+
+    if(xdata_buffer_alloc(&item, data, data_len) != SWITCH_STATUS_SUCCESS) {
+        return SWITCH_STATUS_FALSE;
+    }
+    if(switch_queue_trypush(queue, item) != SWITCH_STATUS_SUCCESS) {
+        xdata_buffer_free(&item);
+        return SWITCH_STATUS_FALSE;
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+// Drains queue without blocking and frees every heap-allocated entry found in it.
+void text_queue_clean(switch_queue_t *queue) {
+    void *entry = NULL;
+
+    if(queue == NULL || switch_queue_size(queue) == 0) {
+        return;
+    }
+
+    for(;;) {
+        if(switch_queue_trypop(queue, (void *)&entry) != SWITCH_STATUS_SUCCESS) {
+            break;
+        }
+        switch_safe_free(entry);
+    }
+}
+
+// Extracts results[0].alternatives[0].transcript from the JSON text in data.
+// Returns a strdup()'ed string owned by the caller, or NULL when data is NULL,
+// is not valid JSON, or does not contain that element. stream is not used.
+char *parse_response(char *data, switch_stream_handle_t *stream) {
+    char *text = NULL;
+    cJSON *root = NULL;
+    cJSON *list = NULL;
+    cJSON *first = NULL;
+    cJSON *transcript = NULL;
+
+    if(data == NULL) {
+        return NULL;
+    }
+    if((root = cJSON_Parse(data)) == NULL) {
+        return NULL;
+    }
+
+    list = cJSON_GetObjectItem(root, "results");
+    if(list == NULL || cJSON_GetArraySize(list) <= 0) {
+        goto done;
+    }
+    if((first = cJSON_GetArrayItem(list, 0)) == NULL) {
+        goto done;
+    }
+
+    list = cJSON_GetObjectItem(first, "alternatives");
+    if(list == NULL || cJSON_GetArraySize(list) <= 0) {
+        goto done;
+    }
+    if((first = cJSON_GetArrayItem(list, 0)) == NULL) {
+        goto done;
+    }
+
+    transcript = cJSON_GetObjectItem(first, "transcript");
+    if(transcript != NULL && transcript->valuestring != NULL) {
+        text = strdup(transcript->valuestring);
+    }
+
+done:
+    cJSON_Delete(root);
+    return text;
+}
diff --git a/src/mod/asr_tts/mod_openai_asr/Makefile.am b/src/mod/asr_tts/mod_openai_asr/Makefile.am
new file mode 100644
index 00000000000..2213c833904
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/Makefile.am
@@ -0,0 +1,12 @@
+
+include $(top_srcdir)/build/modmake.rulesam
+
+MODNAME=mod_openai_asr
+# The module is a single libtool library built from the three sources below.
+mod_LTLIBRARIES = mod_openai_asr.la
+mod_openai_asr_la_SOURCES = mod_openai_asr.c utils.c curl.c
+# -I. adds this directory to the include path (mod_openai_asr.h lives here).
+mod_openai_asr_la_CFLAGS = $(AM_CFLAGS) -I. -Wno-pointer-arith
+mod_openai_asr_la_LIBADD = $(switch_builddir)/libfreeswitch.la
+mod_openai_asr_la_LDFLAGS = -avoid-version -module -no-undefined -shared
+
+# Every object depends on the module header, so a header change rebuilds them all.
+$(am_mod_openai_asr_la_OBJECTS): mod_openai_asr.h
+
diff --git a/src/mod/asr_tts/mod_openai_asr/conf/autoload_configs/openai_asr.conf.xml b/src/mod/asr_tts/mod_openai_asr/conf/autoload_configs/openai_asr.conf.xml
new file mode 100644
index 00000000000..99b35e99dfd
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/conf/autoload_configs/openai_asr.conf.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/mod/asr_tts/mod_openai_asr/conf/dialplan/dialplan.xml b/src/mod/asr_tts/mod_openai_asr/conf/dialplan/dialplan.xml
new file mode 100644
index 00000000000..e72bd6f5927
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/conf/dialplan/dialplan.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/src/mod/asr_tts/mod_openai_asr/curl.c b/src/mod/asr_tts/mod_openai_asr/curl.c
new file mode 100644
index 00000000000..622aafe9334
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/curl.c
@@ -0,0 +1,127 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_openai_asr.h"
+
+// libcurl write callback: appends the received bytes to the switch_buffer_t
+// passed as user_data (when there is one) and always reports the whole chunk
+// as consumed.
+static size_t curl_io_write_callback(char *buffer, size_t size, size_t nitems, void *user_data) {
+    const size_t total = size * nitems;
+    switch_buffer_t *sink = (switch_buffer_t *)user_data;
+
+    if(sink != NULL && total > 0) {
+        switch_buffer_write(sink, buffer, total);
+    }
+
+    return total;
+}
+
+/**
+ * Uploads an audio file to the transcription endpoint as multipart/form-data.
+ *
+ * recv_buffer  receives the raw response body; a terminating NUL is appended when
+ *              anything was received
+ * api_key      sent as OAuth2 bearer token when not NULL
+ * model_name   value of the "model" form field
+ * filename     path of the file sent as the "file" form field
+ * globals      supplies api_url, timeouts, user agent and proxy settings
+ *
+ * Returns SWITCH_STATUS_SUCCESS only for an HTTP 200 response; every other
+ * outcome (bad arguments, libcurl failure, unreadable file, non-200 status)
+ * yields SWITCH_STATUS_FALSE.
+ */
+switch_status_t curl_perform(switch_buffer_t *recv_buffer, char *api_key, char *model_name, char *filename, globals_t *globals) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    CURL *curl_handle = NULL;
+    curl_mime *form = NULL;
+    curl_mimepart *field1 = NULL, *field2 = NULL;
+    switch_curl_slist_t *headers = NULL;
+    switch_CURLcode curl_ret = 0;
+    long http_resp = 0;
+
+    if(!globals || zstr(globals->api_url) || zstr(filename)) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Invalid arguments (api_url / filename)\n");
+        return SWITCH_STATUS_FALSE;
+    }
+
+    if((curl_handle = switch_curl_easy_init()) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_curl_easy_init()\n");
+        return SWITCH_STATUS_FALSE;
+    }
+
+    headers = switch_curl_slist_append(headers, "Content-Type: multipart/form-data");
+    headers = switch_curl_slist_append(headers, "Expect:");
+
+    // Integer options are read by libcurl as 'long' through varargs, hence the explicit L / casts.
+    switch_curl_easy_setopt(curl_handle, CURLOPT_HTTPHEADER, headers);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_POST, 1L);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1L);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, curl_io_write_callback);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *) recv_buffer);
+
+    if(globals->connect_timeout > 0) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT, (long) globals->connect_timeout);
+    }
+    if(globals->request_timeout > 0) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, (long) globals->request_timeout);
+    }
+    if(globals->user_agent) {
+        switch_curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, globals->user_agent);
+    }
+    if(strncasecmp(globals->api_url, "https", 5) == 0) {
+        // NOTE(review): certificate and host verification are switched off although the
+        // API key is sent over this connection; kept as-is for compatibility, but this
+        // should become configurable and default to verification enabled.
+        switch_curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 0L);
+        switch_curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 0L);
+    }
+    if(globals->proxy) {
+        if(globals->proxy_credentials != NULL) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYAUTH, (long) CURLAUTH_ANY);
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXYUSERPWD, globals->proxy_credentials);
+        }
+        if(strncasecmp(globals->proxy, "https", 5) == 0) {
+            switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY_SSL_VERIFYPEER, 0L);
+        }
+        switch_curl_easy_setopt(curl_handle, CURLOPT_PROXY, globals->proxy);
+    }
+
+    if(api_key) {
+        curl_easy_setopt(curl_handle, CURLOPT_XOAUTH2_BEARER, api_key);
+        curl_easy_setopt(curl_handle, CURLOPT_HTTPAUTH, (long) CURLAUTH_BEARER);
+    }
+
+    // Build the form; a request without the audio part is pointless, so any failure aborts.
+    if((form = curl_mime_init(curl_handle)) == NULL ||
+       (field1 = curl_mime_addpart(form)) == NULL ||
+       (field2 = curl_mime_addpart(form)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to build multipart form\n");
+        status = SWITCH_STATUS_FALSE;
+        goto out;
+    }
+    curl_mime_name(field1, "model");
+    curl_mime_data(field1, model_name, CURL_ZERO_TERMINATED);
+    curl_mime_name(field2, "file");
+    if(curl_mime_filedata(field2, filename) != CURLE_OK) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to attach file: %s\n", filename);
+        status = SWITCH_STATUS_FALSE;
+        goto out;
+    }
+    switch_curl_easy_setopt(curl_handle, CURLOPT_MIMEPOST, form);
+    switch_curl_easy_setopt(curl_handle, CURLOPT_URL, globals->api_url);
+
+    curl_ret = switch_curl_easy_perform(curl_handle);
+    if(!curl_ret) {
+        switch_curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &http_resp);
+        if(!http_resp) { switch_curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CONNECTCODE, &http_resp); }
+    } else {
+        // transfer failed: report the libcurl error code in place of an HTTP status
+        http_resp = curl_ret;
+    }
+
+    if(http_resp != 200) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "http-error=[%ld] (%s)\n", http_resp, globals->api_url);
+        status = SWITCH_STATUS_FALSE;
+    }
+
+    // NUL-terminate the body so that the caller can treat it as a C string.
+    if(recv_buffer) {
+        if(switch_buffer_inuse(recv_buffer) > 0) {
+            switch_buffer_write(recv_buffer, "\0", 1);
+        }
+    }
+
+out:
+    // The easy handle goes first: the mime structure and header list must outlive it.
+    switch_curl_easy_cleanup(curl_handle);
+    if(form) {
+        curl_mime_free(form);
+    }
+    if(headers) {
+        switch_curl_slist_free_all(headers);
+    }
+
+    return status;
+}
diff --git a/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.c b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.c
new file mode 100644
index 00000000000..88923db07a0
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.c
@@ -0,0 +1,729 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ * OpenAI Speech-To-Text service for the Freeswitch.
+ * https://platform.openai.com/docs/guides/speech-to-text
+ *
+ * Development respository:
+ * https://github.com/akscf/mod_openai_asr
+ *
+ */
+#include "mod_openai_asr.h"
+
+globals_t globals;
+
+SWITCH_MODULE_LOAD_FUNCTION(mod_openai_asr_load);
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_asr_shutdown);
+SWITCH_MODULE_DEFINITION(mod_openai_asr, mod_openai_asr_load, mod_openai_asr_shutdown, NULL);
+
+/**
+ * Worker thread started by asr_open(); obj is the asr_ctx_t of the handle.
+ *
+ * Collects audio blocks from asr_ctx->q_audio into a chunk buffer. Once the
+ * buffer is full, or the VAD has reported end of speech and silence_sec seconds
+ * have passed, the chunk is written to a file (chunk_write), sent to the service
+ * (curl_perform, up to retries_on_error attempts) and the resulting text - or
+ * "[transcription failed]" - is pushed to asr_ctx->q_text.
+ *
+ * The thread holds a reference on asr_ctx (refs) while running and leaves when
+ * globals.fl_shutdown or asr_ctx->fl_destroyed is set; on exit it drops the
+ * reference and decrements globals.active_threads.
+ */
+static void *SWITCH_THREAD_FUNC transcribe_thread(switch_thread_t *thread, void *obj) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)obj;
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    switch_buffer_t *chunk_buffer = NULL;
+    switch_buffer_t *curl_recv_buffer = NULL;
+    switch_memory_pool_t *pool = NULL;
+    time_t sentence_timeout = 0;
+    uint32_t schunks = 0;
+    uint32_t chunk_buffer_size = 0;
+    uint8_t fl_cbuff_overflow = SWITCH_FALSE;
+    void *pop = NULL;
+
+    switch_mutex_lock(asr_ctx->mutex);
+    asr_ctx->refs++;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(switch_core_new_memory_pool(&pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_core_new_memory_pool()\n");
+        goto out;
+    }
+    if(switch_buffer_create_dynamic(&curl_recv_buffer, 1024, 2048, 8192) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create_dynamic()\n");
+        goto out;
+    }
+
+    while(SWITCH_TRUE) {
+        if(globals.fl_shutdown || asr_ctx->fl_destroyed) {
+            break;
+        }
+
+        // The chunk buffer can only be sized after asr_feed() has seen the first frame.
+        if(chunk_buffer_size == 0) {
+            switch_mutex_lock(asr_ctx->mutex);
+            chunk_buffer_size = asr_ctx->chunk_buffer_size;
+            switch_mutex_unlock(asr_ctx->mutex);
+
+            if(chunk_buffer_size > 0) {
+                if(switch_buffer_create(pool, &chunk_buffer, chunk_buffer_size) != SWITCH_STATUS_SUCCESS) {
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "switch_buffer_create()\n");
+                    break;
+                }
+                switch_buffer_zero(chunk_buffer);
+            }
+            goto timer_next;
+        }
+
+        fl_cbuff_overflow = SWITCH_FALSE;
+        while(switch_queue_trypop(asr_ctx->q_audio, &pop) == SWITCH_STATUS_SUCCESS) {
+            xdata_buffer_t *audio_buffer = (xdata_buffer_t *)pop;
+            if(globals.fl_shutdown || asr_ctx->fl_destroyed ) {
+                xdata_buffer_free(&audio_buffer);
+                break;
+            }
+            if(audio_buffer && audio_buffer->len) {
+                if(switch_buffer_write(chunk_buffer, audio_buffer->data, audio_buffer->len) >= chunk_buffer_size) {
+                    // the block has been popped off the queue, so it must be released
+                    // here as well - leaving the loop without freeing it leaks it
+                    xdata_buffer_free(&audio_buffer);
+                    fl_cbuff_overflow = SWITCH_TRUE;
+                    break;
+                }
+                schunks++;
+            }
+            xdata_buffer_free(&audio_buffer);
+        }
+
+        if(fl_cbuff_overflow) {
+            // buffer is full: force an immediate flush
+            sentence_timeout = 1;
+        } else {
+            if(schunks && asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+                if(!sentence_timeout) {
+                    sentence_timeout = asr_ctx->silence_sec + switch_epoch_time_now(NULL);
+                }
+            }
+            // speech resumed before the silence window elapsed: keep collecting
+            if(sentence_timeout && (asr_ctx->vad_state == SWITCH_VAD_STATE_START_TALKING || asr_ctx->vad_state == SWITCH_VAD_STATE_TALKING)) {
+                sentence_timeout = 0;
+            }
+        }
+
+        if(sentence_timeout && sentence_timeout <= switch_epoch_time_now(NULL)) {
+            const void *chunk_buffer_ptr = NULL;
+            const void *http_response_ptr = NULL;
+            uint32_t buf_len = 0, http_recv_len = 0, stt_failed = 0;
+            char *chunk_fname = NULL;
+
+            if((buf_len = switch_buffer_peek_zerocopy(chunk_buffer, &chunk_buffer_ptr)) > 0 && chunk_buffer_ptr) {
+                chunk_fname = chunk_write((switch_byte_t *)chunk_buffer_ptr, buf_len, asr_ctx->channels, asr_ctx->samplerate, globals.opt_encoding);
+            }
+            if(chunk_fname) {
+                // never let the outcome of a previous chunk leak into this one
+                status = SWITCH_STATUS_FALSE;
+                switch_buffer_zero(curl_recv_buffer);
+
+                for(uint32_t rqtry = 0; rqtry < asr_ctx->retries_on_error; rqtry++) {
+                    switch_buffer_zero(curl_recv_buffer);
+                    status = curl_perform(curl_recv_buffer, asr_ctx->opt_api_key, asr_ctx->opt_model, chunk_fname, &globals);
+                    if(status == SWITCH_STATUS_SUCCESS || globals.fl_shutdown || asr_ctx->fl_destroyed) { break; }
+                    switch_yield(1000);
+                }
+
+                http_recv_len = switch_buffer_peek_zerocopy(curl_recv_buffer, &http_response_ptr);
+                if(status == SWITCH_STATUS_SUCCESS) {
+                    if(http_response_ptr && http_recv_len) {
+                        char *txt = parse_response((char *)http_response_ptr, NULL);
+#ifdef MOD_OPENAI_ASR_DEBUG
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Service response [%s]\n", (char *)http_response_ptr);
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Text [%s]\n", txt ? txt : "null");
+#endif
+                        if(!txt) txt = strdup("");
+                        if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+                            switch_mutex_lock(asr_ctx->mutex);
+                            asr_ctx->transcription_results++;
+                            switch_mutex_unlock(asr_ctx->mutex);
+                        } else {
+                            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+                            switch_safe_free(txt);
+                        }
+                    } else {
+                        stt_failed = 1;
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Empty service response!\n");
+                    }
+                } else {
+                    stt_failed = 1;
+                    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to perform request!\n");
+                }
+
+                if(stt_failed) {
+                    // publish a placeholder so that the consumer still gets a result for this chunk
+                    char *txt = strdup("[transcription failed]");
+                    if(switch_queue_trypush(asr_ctx->q_text, txt) == SWITCH_STATUS_SUCCESS) {
+                        switch_mutex_lock(asr_ctx->mutex);
+                        asr_ctx->transcription_results++;
+                        switch_mutex_unlock(asr_ctx->mutex);
+                    } else {
+                        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Queue is full!\n");
+                        switch_safe_free(txt);
+                    }
+                }
+
+                schunks = 0;
+                sentence_timeout = 0;
+                unlink(chunk_fname);
+                switch_safe_free(chunk_fname);
+                switch_buffer_zero(chunk_buffer);
+            }
+        }
+
+        timer_next:
+        switch_yield(10000);
+    }
+
+out:
+    if(curl_recv_buffer) {
+        switch_buffer_destroy(&curl_recv_buffer);
+    }
+    if(chunk_buffer) {
+        switch_buffer_destroy(&chunk_buffer);
+    }
+    if(pool) {
+        switch_core_destroy_memory_pool(&pool);
+    }
+
+    switch_mutex_lock(asr_ctx->mutex);
+    if(asr_ctx->refs > 0) asr_ctx->refs--;
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    switch_mutex_lock(globals.mutex);
+    if(globals.active_threads) globals.active_threads--;
+    switch_mutex_unlock(globals.mutex);
+
+    return NULL;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+/**
+ * ASR interface "open" callback.
+ *
+ * Accepts only the "L16" codec. Allocates the asr_ctx_t from the handle's pool,
+ * seeds it from `globals`, creates its mutex, queues and VAD, stores it in
+ * ah->private_info and starts the detached transcribe_thread for it.
+ *
+ * Returns SWITCH_STATUS_FALSE for an unsupported codec, SWITCH_STATUS_GENERR
+ * when any resource (including the worker thread) cannot be created, and
+ * SWITCH_STATUS_SUCCESS otherwise.
+ */
+static switch_status_t asr_open(switch_asr_handle_t *ah, const char *codec, int samplerate, const char *dest, switch_asr_flag_t *flags) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    switch_threadattr_t *attr = NULL;
+    switch_thread_t *thread = NULL;
+    asr_ctx_t *asr_ctx = NULL;
+
+    if(strcmp(codec, "L16") !=0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unsupported encoding (%s)\n", codec);
+        switch_goto_status(SWITCH_STATUS_FALSE, out);
+    }
+
+    if((asr_ctx = switch_core_alloc(ah->memory_pool, sizeof(asr_ctx_t))) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_core_alloc()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    asr_ctx->channels = 1;
+    asr_ctx->chunk_buffer_size = 0;
+    asr_ctx->samplerate = samplerate;
+    asr_ctx->silence_sec = globals.speech_silence_sec;
+    asr_ctx->retries_on_error = globals.retries_on_error;
+
+    asr_ctx->opt_model = globals.opt_model;
+    asr_ctx->opt_api_key = globals.api_key;
+
+    if((status = switch_mutex_init(&asr_ctx->mutex, SWITCH_MUTEX_NESTED, ah->memory_pool)) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_mutex_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    if(switch_queue_create(&asr_ctx->q_audio, QUEUE_SIZE, ah->memory_pool) != SWITCH_STATUS_SUCCESS ||
+       switch_queue_create(&asr_ctx->q_text, QUEUE_SIZE, ah->memory_pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_queue_create()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    asr_ctx->vad_buffer = NULL;
+    asr_ctx->frame_len = 0;
+    asr_ctx->vad_buffer_size = 0;
+    asr_ctx->vad_stored_frames = 0;
+    asr_ctx->fl_vad_first_cycle = SWITCH_TRUE;
+
+    if((asr_ctx->vad = switch_vad_init(asr_ctx->samplerate, asr_ctx->channels)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_vad_init()\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+    switch_vad_set_mode(asr_ctx->vad, -1);
+    switch_vad_set_param(asr_ctx->vad, "debug", globals.fl_vad_debug);
+    if(globals.vad_silence_ms > 0) { switch_vad_set_param(asr_ctx->vad, "silence_ms", globals.vad_silence_ms); }
+    if(globals.vad_voice_ms > 0) { switch_vad_set_param(asr_ctx->vad, "voice_ms", globals.vad_voice_ms); }
+    if(globals.vad_threshold > 0) { switch_vad_set_param(asr_ctx->vad, "thresh", globals.vad_threshold); }
+
+    ah->private_info = asr_ctx;
+
+    // Counted before the thread starts; the thread itself decrements on exit.
+    switch_mutex_lock(globals.mutex);
+    globals.active_threads++;
+    switch_mutex_unlock(globals.mutex);
+
+    switch_threadattr_create(&attr, ah->memory_pool);
+    switch_threadattr_detach_set(attr, 1);
+    switch_threadattr_stacksize_set(attr, SWITCH_THREAD_STACKSIZE);
+    if(switch_thread_create(&thread, attr, transcribe_thread, asr_ctx, ah->memory_pool) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_thread_create()\n");
+
+        // No thread will ever decrement the counter: undo it here, otherwise module
+        // shutdown would wait forever. Also release what the pool does not own.
+        switch_mutex_lock(globals.mutex);
+        if(globals.active_threads) globals.active_threads--;
+        switch_mutex_unlock(globals.mutex);
+
+        switch_vad_destroy(&asr_ctx->vad);
+        ah->private_info = NULL;
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+out:
+    return status;
+}
+
+/**
+ * ASR interface "close" callback.
+ *
+ * Flags the context as aborted/destroyed, waits (polling every 100 ms) until
+ * nothing references it any more, then empties and terminates both queues,
+ * destroys the VAD and its buffer and marks the handle as closed.
+ * Always returns SWITCH_STATUS_SUCCESS.
+ */
+static switch_status_t asr_close(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+    uint8_t busy = SWITCH_TRUE;
+
+    assert(asr_ctx != NULL);
+
+    asr_ctx->fl_abort = SWITCH_TRUE;
+    asr_ctx->fl_destroyed = SWITCH_TRUE;
+
+    switch_mutex_lock(asr_ctx->mutex);
+    busy = (asr_ctx->refs != 0);
+    switch_mutex_unlock(asr_ctx->mutex);
+
+    if(busy) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Waiting for unlock (refs=%d)...\n", asr_ctx->refs);
+        do {
+            switch_mutex_lock(asr_ctx->mutex);
+            busy = (asr_ctx->refs != 0);
+            switch_mutex_unlock(asr_ctx->mutex);
+            switch_yield(100000);
+        } while(busy);
+    }
+
+    // drop whatever is still queued before terminating the queues
+    if(asr_ctx->q_audio != NULL) {
+        xdata_buffer_queue_clean(asr_ctx->q_audio);
+        switch_queue_term(asr_ctx->q_audio);
+    }
+    if(asr_ctx->q_text != NULL) {
+        text_queue_clean(asr_ctx->q_text);
+        switch_queue_term(asr_ctx->q_text);
+    }
+
+    if(asr_ctx->vad != NULL) {
+        switch_vad_destroy(&asr_ctx->vad);
+    }
+    if(asr_ctx->vad_buffer != NULL) {
+        switch_buffer_destroy(&asr_ctx->vad_buffer);
+    }
+
+    switch_set_flag(ah, SWITCH_ASR_FLAG_CLOSED);
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/*
+ * Feed one audio frame into the recognizer.
+ *
+ * On the first non-empty frame the frame length is learned and the VAD pre-roll buffer is created
+ * (frame_len * VAD_STORE_FRAMES bytes). While the VAD buffer exists, every frame is run through
+ * switch_vad_process(); frames seen while not talking are kept in the pre-roll buffer, and when the
+ * VAD reports START_TALKING up to VAD_RECOVERY_FRAMES of that pre-roll are prepended to the frame
+ * and pushed to q_audio as one chunk. Frames in START_TALKING/TALKING state are pushed to q_audio.
+ * If the VAD buffer could not be created, every frame is pushed to q_audio unfiltered.
+ *
+ * Returns SWITCH_STATUS_BREAK when the handle is closed/destroyed/aborted or when there is no data,
+ * SWITCH_STATUS_SUCCESS otherwise (a paused handle silently drops the frame).
+ */
+static switch_status_t asr_feed(switch_asr_handle_t *ah, void *data, unsigned int data_len, switch_asr_flag_t *flags) {
+ asr_ctx_t *asr_ctx = (asr_ctx_t *) ah->private_info;
+ switch_vad_state_t vad_state = 0;
+ uint8_t fl_has_audio = SWITCH_FALSE;
+
+ assert(asr_ctx != NULL);
+
+ if(switch_test_flag(ah, SWITCH_ASR_FLAG_CLOSED)) {
+ return SWITCH_STATUS_BREAK;
+ }
+ if(asr_ctx->fl_destroyed || asr_ctx->fl_abort) {
+ return SWITCH_STATUS_BREAK;
+ }
+ if(asr_ctx->fl_pause) {
+ return SWITCH_STATUS_SUCCESS;
+ }
+ if(!data || !data_len) {
+ return SWITCH_STATUS_BREAK;
+ }
+
+ /* first frame: remember its length and size the dependent buffers from it */
+ if(data_len > 0 && asr_ctx->frame_len == 0) {
+ switch_mutex_lock(asr_ctx->mutex);
+ asr_ctx->frame_len = data_len;
+ asr_ctx->vad_buffer_size = asr_ctx->frame_len * VAD_STORE_FRAMES;
+ asr_ctx->chunk_buffer_size = asr_ctx->samplerate * globals.speech_max_sec;
+ switch_mutex_unlock(asr_ctx->mutex);
+
+ /* on failure vad_buffer_size is reset to 0, which disables the VAD branch below for this handle */
+ if(switch_buffer_create(ah->memory_pool, &asr_ctx->vad_buffer, asr_ctx->vad_buffer_size) != SWITCH_STATUS_SUCCESS) {
+ asr_ctx->vad_buffer_size = 0;
+ switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_buffer_create()\n");
+ }
+ }
+
+ if(asr_ctx->vad_buffer_size) {
+ /* the local vad_state still holds its initial value 0 here */
+ /* NOTE(review): presumably 0 == SWITCH_VAD_STATE_NONE, so the second test means "stored state is NONE" - confirm */
+ if(asr_ctx->vad_state == SWITCH_VAD_STATE_STOP_TALKING || (asr_ctx->vad_state == vad_state && vad_state == SWITCH_VAD_STATE_NONE)) {
+ /* frames longer than the first one are not stored in the pre-roll buffer */
+ if(data_len <= asr_ctx->frame_len) {
+ /* buffer is full: start over from the beginning and remember that it wrapped at least once */
+ if(asr_ctx->vad_stored_frames >= VAD_STORE_FRAMES) {
+ switch_buffer_zero(asr_ctx->vad_buffer);
+ asr_ctx->vad_stored_frames = 0;
+ asr_ctx->fl_vad_first_cycle = SWITCH_FALSE;
+ }
+ switch_buffer_write(asr_ctx->vad_buffer, data, MIN(asr_ctx->frame_len, data_len));
+ asr_ctx->vad_stored_frames++;
+ }
+ }
+
+ /* data is handed to the VAD as 16-bit samples; NONE leaves both asr_ctx->vad_state and fl_has_audio untouched */
+ vad_state = switch_vad_process(asr_ctx->vad, (int16_t *)data, (data_len / sizeof(int16_t)));
+ if(vad_state == SWITCH_VAD_STATE_START_TALKING) {
+ asr_ctx->vad_state = vad_state;
+ fl_has_audio = SWITCH_TRUE;
+ } else if (vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
+ asr_ctx->vad_state = vad_state;
+ fl_has_audio = SWITCH_FALSE;
+ switch_vad_reset(asr_ctx->vad);
+ } else if (vad_state == SWITCH_VAD_STATE_TALKING) {
+ asr_ctx->vad_state = vad_state;
+ fl_has_audio = SWITCH_TRUE;
+ }
+ } else {
+ /* no VAD buffer: every frame counts as audio */
+ fl_has_audio = SWITCH_TRUE;
+ }
+
+ if(fl_has_audio) {
+ /* speech has just started: emit the pre-roll kept in vad_buffer together with the current frame */
+ /* NOTE(review): the current frame was normally appended to vad_buffer above as well, so it looks like it ends up */
+ /* in the pushed chunk twice (once as the last pre-roll frame, once as `data`) - confirm whether this is intended */
+ if(vad_state == SWITCH_VAD_STATE_START_TALKING && asr_ctx->vad_stored_frames > 0) {
+ xdata_buffer_t *tau_buf = NULL;
+ const void *ptr = NULL;
+ switch_size_t vblen = 0;
+ uint32_t rframes = 0, rlen = 0;
+ int ofs = 0;
+
+ /* NOTE(review): if the peek yields nothing, the current frame is not pushed at all - confirm */
+ if((vblen = switch_buffer_peek_zerocopy(asr_ctx->vad_buffer, &ptr)) && ptr && vblen > 0) {
+ /* number of pre-roll frames: VAD_RECOVERY_FRAMES, or only the stored ones while the buffer has never wrapped */
+ rframes = (asr_ctx->vad_stored_frames >= VAD_RECOVERY_FRAMES ? VAD_RECOVERY_FRAMES : (asr_ctx->fl_vad_first_cycle ? asr_ctx->vad_stored_frames : VAD_RECOVERY_FRAMES));
+ rlen = (rframes * asr_ctx->frame_len);
+ /* offset of the pre-roll inside the stored bytes; negative when fewer than rlen bytes are stored */
+ ofs = (vblen - rlen);
+
+ if(ofs < 0) {
+ /* the missing hdr_sz bytes are taken from the last hdr_sz bytes of the buffer area */
+ /* NOTE(review): this reads past the vblen bytes reported by the peek; it presumably relies on */
+ /* switch_buffer_zero() leaving the bytes of the previous cycle in place - confirm */
+ uint32_t hdr_sz = -ofs;
+ uint32_t hdr_ofs = (asr_ctx->vad_buffer_size - hdr_sz);
+
+ switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+ /* hdr_sz + vblen equals rlen, so the frame is appended right behind the pre-roll */
+ tau_buf->len = (hdr_sz + vblen + data_len);
+ switch_malloc(tau_buf->data, tau_buf->len);
+
+ memcpy(tau_buf->data, (void *)(ptr + hdr_ofs), hdr_sz);
+ memcpy(tau_buf->data + hdr_sz , (void *)(ptr + 0), vblen);
+ memcpy(tau_buf->data + rlen, data, data_len);
+
+ /* the chunk is dropped when the queue does not accept it */
+ if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+ xdata_buffer_free(&tau_buf);
+ }
+
+ switch_buffer_zero(asr_ctx->vad_buffer);
+ asr_ctx->vad_stored_frames = 0;
+ } else {
+ /* enough bytes stored: take the last rlen bytes, followed by the current frame */
+ switch_zmalloc(tau_buf, sizeof(xdata_buffer_t));
+
+ tau_buf->len = (rlen + data_len);
+ switch_malloc(tau_buf->data, tau_buf->len);
+
+ memcpy(tau_buf->data, (void *)(ptr + ofs), rlen);
+ memcpy(tau_buf->data + rlen, data, data_len);
+
+ /* the chunk is dropped when the queue does not accept it */
+ if(switch_queue_trypush(asr_ctx->q_audio, tau_buf) != SWITCH_STATUS_SUCCESS) {
+ xdata_buffer_free(&tau_buf);
+ }
+
+ switch_buffer_zero(asr_ctx->vad_buffer);
+ asr_ctx->vad_stored_frames = 0;
+ }
+ }
+ } else {
+ /* ongoing speech (or VAD disabled): queue a copy of the frame as is */
+ xdata_buffer_push(asr_ctx->q_audio, data, data_len);
+ }
+ }
+
+ return SWITCH_STATUS_SUCCESS;
+}
+
+/*
+ * Tell the caller whether a transcription can be fetched.
+ * Returns SWITCH_STATUS_SUCCESS when the handle is not paused and the result
+ * counter is positive, SWITCH_STATUS_FALSE in every other case.
+ */
+static switch_status_t asr_check_results(switch_asr_handle_t *ah, switch_asr_flag_t *flags) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+
+    if(!ctx->fl_pause && ctx->transcription_results > 0) {
+        return SWITCH_STATUS_SUCCESS;
+    }
+
+    return SWITCH_STATUS_FALSE;
+}
+
+/*
+ * Hand the oldest queued transcription to the caller.
+ * On success *xmlstr receives the string popped from q_text and the result
+ * counter is decremented (never below zero). Returns SWITCH_STATUS_FALSE,
+ * leaving *xmlstr untouched, when nothing could be popped.
+ */
+static switch_status_t asr_get_results(switch_asr_handle_t *ah, char **xmlstr, switch_asr_flag_t *flags) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+    void *item = NULL;
+
+    assert(ctx != NULL);
+
+    if(switch_queue_trypop(ctx->q_text, &item) != SWITCH_STATUS_SUCCESS || !item) {
+        return SWITCH_STATUS_FALSE;
+    }
+
+    *xmlstr = (char *)item;
+
+    switch_mutex_lock(ctx->mutex);
+    if(ctx->transcription_results > 0) {
+        ctx->transcription_results--;
+    }
+    switch_mutex_unlock(ctx->mutex);
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* Record on the context that input timers were requested; always succeeds. */
+static switch_status_t asr_start_input_timers(switch_asr_handle_t *ah) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+    ctx->fl_start_timers = SWITCH_TRUE;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* Pause the handle: while the flag is set asr_feed() drops frames and asr_check_results() reports nothing. */
+static switch_status_t asr_pause(switch_asr_handle_t *ah) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+    ctx->fl_pause = SWITCH_TRUE;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* Resume a paused handle by clearing the pause flag; always succeeds. */
+static switch_status_t asr_resume(switch_asr_handle_t *ah) {
+    asr_ctx_t *ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(ctx != NULL);
+    ctx->fl_pause = SWITCH_FALSE;
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/**
+ * Apply a textual per-handle parameter.
+ *
+ * Recognized names (case-insensitive): "lang", "model", "key" (copied into the
+ * handle's memory pool) and "silence" (seconds, parsed as an integer).
+ * Unknown names and NULL names/values are ignored.
+ *
+ * @param ah    ASR handle; ah->private_info must hold the asr_ctx_t
+ * @param param parameter name
+ * @param val   parameter value
+ */
+static void asr_text_param(switch_asr_handle_t *ah, char *param, const char *val) {
+    asr_ctx_t *asr_ctx = (asr_ctx_t *)ah->private_info;
+
+    assert(asr_ctx != NULL);
+
+    /* strcasecmp() must not be called with NULL, and no parameter does anything without a value */
+    if(!param || !val) {
+        return;
+    }
+
+    if(strcasecmp(param, "lang") == 0) {
+        asr_ctx->opt_lang = switch_core_strdup(ah->memory_pool, val);
+    } else if(strcasecmp(param, "model") == 0) {
+        asr_ctx->opt_model = switch_core_strdup(ah->memory_pool, val);
+    } else if(strcasecmp(param, "key") == 0) {
+        asr_ctx->opt_api_key = switch_core_strdup(ah->memory_pool, val);
+    } else if(strcasecmp(param, "silence") == 0) {
+        int sec = atoi(val);
+
+        /* silence_sec is unsigned: a negative input would wrap to a huge timeout, so it is ignored */
+        if(sec >= 0) {
+            asr_ctx->silence_sec = (uint32_t)sec;
+        }
+    }
+}
+
+/* Numeric parameters are accepted and ignored. */
+static void asr_numeric_param(switch_asr_handle_t *ah, char *param, int val)
+{
+    /* intentionally empty */
+}
+
+/* Floating-point parameters are accepted and ignored. */
+static void asr_float_param(switch_asr_handle_t *ah, char *param, double val)
+{
+    /* intentionally empty */
+}
+
+/* Grammar loading is a no-op here; the call always reports success. */
+static switch_status_t asr_load_grammar(switch_asr_handle_t *ah, const char *grammar, const char *name)
+{
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/* Grammar unloading is a no-op here; the call always reports success. */
+static switch_status_t asr_unload_grammar(switch_asr_handle_t *ah, const char *name)
+{
+    return SWITCH_STATUS_SUCCESS;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+#define CMD_SYNTAX "path_to/filename.(mp3|wav) [key=altkey model=altModel]\n"
+/**
+ * API command handler: transcribe an audio file.
+ *
+ * Syntax: see CMD_SYNTAX. The first argument is the path of an existing
+ * .mp3/.wav file; the optional "key=..." and "model=..." arguments override
+ * the configured API key and model for this request only.
+ *
+ * Writes "+OK: <text>" or a "-ERR: ..." line to the stream.
+ * Always returns SWITCH_STATUS_SUCCESS (errors are reported through the stream).
+ */
+SWITCH_STANDARD_API(openai_asr_cmd_handler) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    char *mycmd = NULL, *argv[10] = { 0 };
+    int argc = 0;
+    switch_buffer_t *recv_buf = NULL;
+    const void *response_ptr = NULL;
+    char *opt_api_key = globals.api_key;
+    char *opt_model = globals.opt_model;
+    char *file_name = NULL, *file_ext = NULL;
+    switch_size_t recv_len = 0;
+
+    if (!zstr(cmd)) {
+        mycmd = strdup(cmd);
+        switch_assert(mycmd);
+        argc = switch_separate_string(mycmd, ' ', argv, (sizeof(argv) / sizeof(argv[0])));
+    }
+    if(argc == 0) {
+        goto usage;
+    }
+
+    file_name = argv[0];
+    if(switch_file_exists(file_name, NULL) != SWITCH_STATUS_SUCCESS) {
+        stream->write_function(stream, "-ERR: file not found (%s)\n", file_name);
+        goto out;
+    }
+
+    /* only the extension is checked, not the actual file content */
+    file_ext = strrchr(file_name, '.');
+    if(!file_ext) {
+        stream->write_function(stream, "-ERR: unsupported file encoding (null)\n");
+        goto out;
+    }
+
+    file_ext++;
+    if(strcasecmp("mp3", file_ext) && strcasecmp("wav", file_ext)) {
+        stream->write_function(stream, "-ERR: unsupported file encoding (%s)\n", file_ext);
+        goto out;
+    }
+
+    if(switch_buffer_create_dynamic(&recv_buf, 1024, 2048, 8192) != SWITCH_STATUS_SUCCESS) {
+        stream->write_function(stream, "-ERR: switch_buffer_create_dynamic()\n");
+        goto out;
+    }
+
+    /* optional key=value overrides; the values point into mycmd, which lives until the end of the call */
+    for(int i = 1; i < argc; i++) {
+        char *kvp[2] = { 0 };
+        if(switch_separate_string(argv[i], '=', kvp, 2) >= 2) {
+            if(strcasecmp(kvp[0], "key") == 0) {
+                if(kvp[1]) opt_api_key = kvp[1];
+            } else if(strcasecmp(kvp[0], "model") == 0) {
+                if(kvp[1]) opt_model = kvp[1];
+            }
+        }
+    }
+
+    status = curl_perform(recv_buf, opt_api_key, opt_model, file_name, &globals);
+
+    recv_len = switch_buffer_peek_zerocopy(recv_buf, &response_ptr);
+    if(status == SWITCH_STATUS_SUCCESS && response_ptr && recv_len) {
+        char *response = NULL;
+        char *txt = NULL;
+
+        /* the buffer holds recv_len raw bytes; the JSON parser needs a NUL-terminated string, so parse a terminated copy */
+        switch_malloc(response, recv_len + 1);
+        memcpy(response, response_ptr, recv_len);
+        response[recv_len] = '\0';
+
+        txt = parse_response(response, stream);
+        if(txt) {
+            stream->write_function(stream, "+OK: %s\n", txt);
+        }
+        switch_safe_free(txt);
+        switch_safe_free(response);
+    } else {
+        stream->write_function(stream, "-ERR: unable to perform request\n");
+    }
+
+    goto out;
+usage:
+    stream->write_function(stream, "-ERR:\nUsage: %s\n", CMD_SYNTAX);
+
+out:
+    if(recv_buf) {
+        switch_buffer_destroy(&recv_buf);
+    }
+
+    switch_safe_free(mycmd);
+    return SWITCH_STATUS_SUCCESS;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+// main
+// ---------------------------------------------------------------------------------------------------------------------------------------------
+/**
+ * Module load hook.
+ *
+ * Resets the module globals, reads MOD_CONFIG_NAME, applies defaults
+ * (encoding "wav", speech-max-sec 35, speech-silence-sec 3, retries-on-error 1),
+ * makes sure the temporary cache directory exists and registers the API
+ * command and the "openai" ASR interface.
+ *
+ * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_GENERR when the configuration
+ *         cannot be opened or the required "api-url" parameter is missing/empty
+ */
+SWITCH_MODULE_LOAD_FUNCTION(mod_openai_asr_load) {
+    switch_status_t status = SWITCH_STATUS_SUCCESS;
+    switch_xml_t cfg, xml, settings, param;
+    switch_api_interface_t *commands_interface;
+    switch_asr_interface_t *asr_interface;
+
+    memset(&globals, 0, sizeof(globals));
+    switch_mutex_init(&globals.mutex, SWITCH_MUTEX_NESTED, pool);
+
+    if((xml = switch_xml_open_cfg(MOD_CONFIG_NAME, &cfg, NULL)) == NULL) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to open configuration: %s\n", MOD_CONFIG_NAME);
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    if((settings = switch_xml_child(cfg, "settings"))) {
+        for (param = switch_xml_child(settings, "param"); param; param = param->next) {
+            char *var = (char *) switch_xml_attr_soft(param, "name");
+            char *val = (char *) switch_xml_attr_soft(param, "value");
+
+            if(!strcasecmp(var, "vad-silence-ms")) {
+                if(val) globals.vad_silence_ms = atoi (val);
+            } else if(!strcasecmp(var, "vad-voice-ms")) {
+                if(val) globals.vad_voice_ms = atoi (val);
+            } else if(!strcasecmp(var, "vad-threshold")) {
+                if(val) globals.vad_threshold = atoi (val);
+            } else if(!strcasecmp(var, "vad-debug")) {
+                if(val) globals.fl_vad_debug = switch_true(val);
+            } else if(!strcasecmp(var, "api-key")) {
+                if(val) globals.api_key = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "api-url")) {
+                if(val) globals.api_url = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "user-agent")) {
+                if(val) globals.user_agent = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "proxy")) {
+                if(val) globals.proxy = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "proxy-credentials")) {
+                if(val) globals.proxy_credentials = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "encoding")) {
+                if(val) globals.opt_encoding = switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "model")) {
+                if(val) globals.opt_model= switch_core_strdup(pool, val);
+            } else if(!strcasecmp(var, "speech-max-sec")) {
+                if(val) globals.speech_max_sec = atoi(val);
+            } else if(!strcasecmp(var, "speech-silence-sec")) {
+                if(val) globals.speech_silence_sec = atoi(val);
+            } else if(!strcasecmp(var, "request-timeout")) {
+                if(val) globals.request_timeout = atoi(val);
+            } else if(!strcasecmp(var, "connect-timeout")) {
+                if(val) globals.connect_timeout = atoi(val);
+            } else if(!strcasecmp(var, "log-http-errors")) {
+                if(val) globals.fl_log_http_errors = switch_true(val);
+            } else if(!strcasecmp(var, "retries-on-error")) {
+                if(val) globals.retries_on_error = atoi(val);
+            }
+        }
+    }
+
+    /* an empty value is stored as "" above, so test for an empty string as well as for NULL */
+    if(zstr(globals.api_url)) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Missing required parameter: api-url\n");
+        switch_goto_status(SWITCH_STATUS_GENERR, out);
+    }
+
+    /* defaults for everything that was left unset (or set to 0) */
+    globals.opt_encoding = globals.opt_encoding ? globals.opt_encoding : "wav";
+    globals.speech_max_sec = !globals.speech_max_sec ? 35 : globals.speech_max_sec;
+    globals.speech_silence_sec = !globals.speech_silence_sec ? 3 : globals.speech_silence_sec;
+    globals.retries_on_error = !globals.retries_on_error ? 1 : globals.retries_on_error;
+
+    globals.tmp_path = switch_core_sprintf(pool, "%s%sopenai-asr-cache", SWITCH_GLOBAL_dirs.temp_dir, SWITCH_PATH_SEPARATOR);
+    if(switch_directory_exists(globals.tmp_path, NULL) != SWITCH_STATUS_SUCCESS) {
+        /* loading continues on failure (as before), but the problem is no longer silent */
+        if(switch_dir_make(globals.tmp_path, SWITCH_FPROT_OS_DEFAULT, NULL) != SWITCH_STATUS_SUCCESS) {
+            switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Unable to create directory: %s\n", globals.tmp_path);
+        }
+    }
+
+    *module_interface = switch_loadable_module_create_module_interface(pool, modname);
+    SWITCH_ADD_API(commands_interface, "openai_asr_transcript", "OpenAI speech-to-text", openai_asr_cmd_handler, CMD_SYNTAX);
+
+    asr_interface = switch_loadable_module_create_interface(*module_interface, SWITCH_ASR_INTERFACE);
+    asr_interface->interface_name = "openai";
+    asr_interface->asr_open = asr_open;
+    asr_interface->asr_close = asr_close;
+    asr_interface->asr_feed = asr_feed;
+    asr_interface->asr_pause = asr_pause;
+    asr_interface->asr_resume = asr_resume;
+    asr_interface->asr_check_results = asr_check_results;
+    asr_interface->asr_get_results = asr_get_results;
+    asr_interface->asr_start_input_timers = asr_start_input_timers;
+    asr_interface->asr_text_param = asr_text_param;
+    asr_interface->asr_numeric_param = asr_numeric_param;
+    asr_interface->asr_float_param = asr_float_param;
+    asr_interface->asr_load_grammar = asr_load_grammar;
+    asr_interface->asr_unload_grammar = asr_unload_grammar;
+
+    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "OpenAI-ASR (%s)\n", MOD_VERSION);
+out:
+    if(xml) {
+        switch_xml_free(xml);
+    }
+    return status;
+}
+
+/**
+ * Module shutdown hook: raises the global shutdown flag and blocks until the
+ * active-thread counter has dropped to zero.
+ *
+ * @return always SWITCH_STATUS_SUCCESS
+ */
+SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_asr_shutdown) {
+    uint32_t running = 0;
+
+    globals.fl_shutdown = SWITCH_TRUE;
+
+    /* snapshot the counter under the lock, so the value that is tested and logged is consistent */
+    switch_mutex_lock(globals.mutex);
+    running = globals.active_threads;
+    switch_mutex_unlock(globals.mutex);
+
+    if(running > 0) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Waiting for termination (%u) threads...\n", running);
+        while(running > 0) {
+            switch_mutex_lock(globals.mutex);
+            running = globals.active_threads;
+            switch_mutex_unlock(globals.mutex);
+            /* the pause after every poll (including the last one) is kept on purpose */
+            /* NOTE(review): presumably it lets a thread that just decremented the counter finish returning - confirm */
+            switch_yield(100000);
+        }
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
diff --git a/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.h b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.h
new file mode 100644
index 00000000000..230cc68212c
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/mod_openai_asr.h
@@ -0,0 +1,110 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#ifndef MOD_OPENAI_ASR_H
+#define MOD_OPENAI_ASR_H
+
+#include
+#include
+#include
+
+/* Smaller / larger of two values; arguments are evaluated twice, so avoid side effects. */
+/* Guarded because system or library headers may already provide these macros. */
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+#ifndef MAX
+#define MAX(a,b) (((a)>(b))?(a):(b))
+#endif
+
+/* name of the configuration opened at module load */
+#define MOD_CONFIG_NAME "openai_asr.conf"
+/* version string printed in the load notice */
+#define MOD_VERSION "1.0.4"
+/* capacity (entries) of the per-handle audio and text queues */
+#define QUEUE_SIZE 128
+/* capacity of the VAD pre-roll buffer, in frames */
+#define VAD_STORE_FRAMES 64
+/* number of pre-roll frames prepended to the audio when speech starts */
+#define VAD_RECOVERY_FRAMES 20
+
+//#define MOD_OPENAI_ASR_DEBUG
+
+/* Module-wide state: configuration values read at load time plus run-time counters. */
+typedef struct {
+ switch_mutex_t *mutex; // taken around accesses to active_threads
+ uint32_t active_threads; // incremented when a handle is opened; shutdown waits until it is 0
+ uint32_t speech_max_sec; // "speech-max-sec", defaults to 35
+ uint32_t speech_silence_sec; // "speech-silence-sec", defaults to 3
+ uint32_t vad_silence_ms; // "vad-silence-ms", applied to the VAD when > 0
+ uint32_t vad_voice_ms; // "vad-voice-ms", applied to the VAD when > 0
+ uint32_t vad_threshold; // "vad-threshold", applied to the VAD when > 0
+ uint32_t request_timeout; // seconds
+ uint32_t connect_timeout; // seconds
+ uint32_t retries_on_error; // "retries-on-error", defaults to 1
+ uint8_t fl_vad_debug; // "vad-debug", passed to the VAD as its "debug" parameter
+ uint8_t fl_shutdown; // set when the module is shutting down
+ uint8_t fl_log_http_errors; // "log-http-errors"
+ char *tmp_path; // <temp_dir>/openai-asr-cache
+ char *api_key; // "api-key"
+ char *api_url; // "api-url" (required)
+ char *user_agent; // "user-agent"
+ char *proxy; // "proxy"
+ char *proxy_credentials; // "proxy-credentials"
+ char *opt_encoding; // "encoding", defaults to "wav"
+ char *opt_model; // "model"
+} globals_t;
+
+/* Per-handle recognizer state, stored in switch_asr_handle_t.private_info. */
+typedef struct {
+ switch_memory_pool_t *pool; // presumably the memory pool of the handle - TODO confirm
+ switch_vad_t *vad; // voice activity detector, created on open and destroyed on close
+ switch_buffer_t *vad_buffer; // pre-roll of recent non-speech frames, created on the first fed frame
+ switch_mutex_t *mutex; // taken around refs, transcription_results and the frame-size setup
+ switch_queue_t *q_audio; // audio chunks (xdata_buffer_t *) queued by asr_feed()
+ switch_queue_t *q_text; // transcription strings handed out by asr_get_results()
+ switch_buffer_t *curl_recv_buffer_ref; // NOTE(review): not used in the code shown here - purpose to be confirmed
+ switch_vad_state_t vad_state; // last START_TALKING/TALKING/STOP_TALKING state reported by the VAD
+ char *opt_lang; // "lang" text parameter
+ char *opt_model; // "model" text parameter
+ char *opt_api_key; // "key" text parameter
+ int32_t transcription_results; // results available; decremented by asr_get_results()
+ uint32_t retries_on_error; // presumably a per-handle copy of the global setting - TODO confirm
+ uint32_t vad_buffer_size; // frame_len * VAD_STORE_FRAMES; 0 means the VAD path is disabled
+ uint32_t vad_stored_frames; // frames currently held in vad_buffer
+ uint32_t chunk_buffer_size; // samplerate * speech_max_sec
+ uint32_t refs; // asr_close() waits until this is 0
+ uint32_t samplerate; // passed to switch_vad_init()
+ uint32_t channels; // passed to switch_vad_init()
+ uint32_t frame_len; // byte length of the first frame fed; 0 until then
+ uint32_t silence_sec; // "silence" text parameter
+ uint8_t fl_start_timers; // set by asr_start_input_timers()
+ uint8_t fl_pause; // set by asr_pause(), cleared by asr_resume()
+ uint8_t fl_vad_first_cycle; // true until vad_buffer has been filled and reset once
+ uint8_t fl_destroyed; // set by asr_close()
+ uint8_t fl_abort; // set by asr_close()
+} asr_ctx_t;
+
+/* Heap-allocated byte chunk as carried by the audio queue. */
+typedef struct {
+ uint32_t len; // number of bytes in data
+ switch_byte_t *data; // malloc'ed payload, released by xdata_buffer_free()
+} xdata_buffer_t;
+
+/* curl.c */
+/* Performs the transcription request for `filename`; the caller reads the response from recv_buffer. */
+/* NOTE(review): defined in curl.c, not shown here - exact contract to be confirmed there. */
+switch_status_t curl_perform(switch_buffer_t *recv_buffer, char *api_key, char *model_name, char *filename, globals_t *globals);
+
+/* utils.c */
+/* Writes 16-bit samples to a new uuid-named file under globals.tmp_path; returns the heap-allocated file name (caller frees) or NULL. */
+char *chunk_write(switch_byte_t *buf, uint32_t buf_len, uint32_t channels, uint32_t samplerate, const char *file_ext);
+/* Copies data into a new xdata_buffer_t and try-pushes it; SWITCH_STATUS_FALSE (copy released) when the queue does not take it. */
+switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len);
+/* Allocates an xdata_buffer_t holding a copy of data; the result is stored in *out. */
+switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len);
+/* Releases a buffer and its payload; NULL-safe. */
+void xdata_buffer_free(xdata_buffer_t **buf);
+/* Pops and frees every xdata_buffer_t left in the queue. */
+void xdata_buffer_queue_clean(switch_queue_t *queue);
+/* Pops and frees every string left in the queue. */
+void text_queue_clean(switch_queue_t *queue);
+/* Extracts the "text" member of a JSON response as a strdup'ed string (caller frees); NULL on error, which is also reported to stream when given. */
+char *parse_response(char *data, switch_stream_handle_t *stream);
+
+#endif
diff --git a/src/mod/asr_tts/mod_openai_asr/utils.c b/src/mod/asr_tts/mod_openai_asr/utils.c
new file mode 100644
index 00000000000..ed9a620dc2c
--- /dev/null
+++ b/src/mod/asr_tts/mod_openai_asr/utils.c
@@ -0,0 +1,149 @@
+/*
+ * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
+ * Copyright (C) 2005-2014, Anthony Minessale II
+ *
+ * Version: MPL 1.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Module Contributor(s):
+ * Konstantin Alexandrin
+ *
+ *
+ */
+#include "mod_openai_asr.h"
+
+extern globals_t globals;
+
+/**
+ * Allocate an xdata_buffer_t that owns a private copy of `data`.
+ *
+ * @param out      receives the new buffer (set to NULL on failure)
+ * @param data     bytes to copy; may be NULL only when data_len is 0
+ * @param data_len number of bytes to copy; 0 yields a buffer without payload
+ * @return SWITCH_STATUS_SUCCESS, or SWITCH_STATUS_FALSE on invalid arguments
+ */
+switch_status_t xdata_buffer_alloc(xdata_buffer_t **out, switch_byte_t *data, uint32_t data_len) {
+    xdata_buffer_t *buf = NULL;
+
+    if(!out) {
+        return SWITCH_STATUS_FALSE;
+    }
+    /* memcpy() from NULL is undefined, so reject a length without a source */
+    if(data_len && !data) {
+        *out = NULL;
+        return SWITCH_STATUS_FALSE;
+    }
+
+    switch_zmalloc(buf, sizeof(xdata_buffer_t));
+
+    if(data_len) {
+        switch_malloc(buf->data, data_len);
+        switch_assert(buf->data);
+
+        buf->len = data_len;
+        memcpy(buf->data, data, data_len);
+    }
+
+    *out = buf;
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/**
+ * Release a buffer together with its payload.
+ *
+ * @param buf address of the buffer pointer; NULL and pointers to NULL are ignored.
+ *            The pointer is reset to NULL so the caller is not left with a dangling reference.
+ */
+void xdata_buffer_free(xdata_buffer_t **buf) {
+    if(buf && *buf) {
+        switch_safe_free((*buf)->data);
+        free(*buf);
+        *buf = NULL;
+    }
+}
+
+/* Drain a queue of xdata_buffer_t items, freeing each one; a NULL or empty queue is left alone. */
+void xdata_buffer_queue_clean(switch_queue_t *queue) {
+    xdata_buffer_t *item = NULL;
+
+    if(queue && switch_queue_size(queue)) {
+        while(switch_queue_trypop(queue, (void *) &item) == SWITCH_STATUS_SUCCESS) {
+            if(!item) {
+                continue;
+            }
+            xdata_buffer_free(&item);
+        }
+    }
+}
+
+/*
+ * Queue a copy of `data`.
+ * Returns SWITCH_STATUS_SUCCESS when the copy was accepted by the queue;
+ * otherwise the copy is released again and SWITCH_STATUS_FALSE is returned.
+ */
+switch_status_t xdata_buffer_push(switch_queue_t *queue, switch_byte_t *data, uint32_t data_len) {
+    xdata_buffer_t *chunk = NULL;
+
+    if(xdata_buffer_alloc(&chunk, data, data_len) != SWITCH_STATUS_SUCCESS) {
+        return SWITCH_STATUS_FALSE;
+    }
+
+    if(switch_queue_trypush(queue, chunk) != SWITCH_STATUS_SUCCESS) {
+        xdata_buffer_free(&chunk);
+        return SWITCH_STATUS_FALSE;
+    }
+
+    return SWITCH_STATUS_SUCCESS;
+}
+
+/**
+ * Write a block of 16-bit samples to a new file in the module's temporary directory.
+ *
+ * The file is named <uuid>.<file_ext> under globals.tmp_path.
+ *
+ * @param buf        sample data
+ * @param buf_len    length of buf in bytes
+ * @param channels   channel count used to open the file
+ * @param samplerate sample rate used to open the file
+ * @param file_ext   file extension without the dot; NULL selects "wav"
+ * @return heap-allocated file name (the caller frees it and removes the file),
+ *         or NULL on failure, in which case no file is left behind
+ */
+char *chunk_write(switch_byte_t *buf, uint32_t buf_len, uint32_t channels, uint32_t samplerate, const char *file_ext) {
+    switch_status_t status = SWITCH_STATUS_FALSE;
+    switch_size_t len = (buf_len / sizeof(int16_t));
+    switch_file_handle_t fh = { 0 };
+    char *file_name = NULL;
+    char name_uuid[SWITCH_UUID_FORMATTED_LENGTH + 1] = { 0 };
+    int flags = (SWITCH_FILE_FLAG_WRITE | SWITCH_FILE_DATA_SHORT);
+    uint8_t fl_opened = SWITCH_FALSE;
+
+    switch_uuid_str((char *)name_uuid, sizeof(name_uuid));
+    file_name = switch_mprintf("%s%s%s.%s", globals.tmp_path, SWITCH_PATH_SEPARATOR, name_uuid, (file_ext == NULL ? "wav" : file_ext) );
+    if(!file_name) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_mprintf()\n");
+        return NULL;
+    }
+
+    if((status = switch_core_file_open(&fh, file_name, channels, samplerate, flags, NULL)) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to open file (%s)\n", file_name);
+        goto out;
+    }
+    fl_opened = SWITCH_TRUE;
+
+    if((status = switch_core_file_write(&fh, buf, &len)) != SWITCH_STATUS_SUCCESS) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to write (%s)\n", file_name);
+        goto out;
+    }
+
+out:
+    /* close on every path, so a failed write does not leak the handle; must happen before the unlink */
+    if(fl_opened) {
+        switch_core_file_close(&fh);
+    }
+
+    if(status != SWITCH_STATUS_SUCCESS) {
+        unlink(file_name);
+        switch_safe_free(file_name);
+        return NULL;
+    }
+
+    return file_name;
+}
+
+/* Drain a queue of heap-allocated strings, freeing each one; a NULL or empty queue is left alone. */
+void text_queue_clean(switch_queue_t *queue) {
+    void *item = NULL;
+
+    if(queue && switch_queue_size(queue)) {
+        while(switch_queue_trypop(queue, (void *)&item) == SWITCH_STATUS_SUCCESS) {
+            switch_safe_free(item);
+        }
+    }
+}
+
+/**
+ * Extract the transcription from a JSON response.
+ *
+ * @param data   NUL-terminated JSON document; NULL yields NULL
+ * @param stream optional stream that receives a short "-ERR" line on failure
+ * @return strdup'ed value of the "text" member (the caller frees it), or NULL when
+ *         the document cannot be parsed, contains an "error" member, or has no
+ *         string-valued "text" member
+ */
+char *parse_response(char *data, switch_stream_handle_t *stream) {
+    char *result = NULL;
+    cJSON *json = NULL;
+
+    if(!data) {
+        return NULL;
+    }
+
+    if(!(json = cJSON_Parse(data))) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to parse json (%s)\n", data);
+        if(stream) stream->write_function(stream, "-ERR: Unable to parse json (see log)\n");
+        return NULL;
+    }
+
+    if(cJSON_GetObjectItem(json, "error")) {
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Service returns error (%s)\n", data);
+        if(stream) stream->write_function(stream, "-ERR: Service returns error (see log)\n");
+    } else {
+        cJSON *jtext = cJSON_GetObjectItem(json, "text");
+
+        /* valuestring is NULL when "text" is not a string; strdup(NULL) would crash */
+        if(jtext && jtext->valuestring) {
+            result = strdup(jtext->valuestring);
+        }
+    }
+
+    cJSON_Delete(json);
+
+    return result;
+}
diff --git a/src/switch_hashtable.c b/src/switch_hashtable.c
index 66669acb63d..107d539c4f4 100644
--- a/src/switch_hashtable.c
+++ b/src/switch_hashtable.c
@@ -126,7 +126,7 @@ hashtable_expand(switch_hashtable_t *h)
realloc(h->table, newsize * sizeof(struct entry *));
if (NULL == newtable) { (h->primeindex)--; return 0; }
h->table = newtable;
- memset(newtable[h->tablelength], 0, newsize - h->tablelength);
+ memset(&newtable[h->tablelength], 0, (newsize - h->tablelength) * sizeof(struct entry*));
for (i = 0; i < h->tablelength; i++) {
for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) {
index = indexFor(newsize,e->h);