Support the EndOfUtterance message

JamesG-Speechmatics · web-flow · commit 6be8ea91c3a9 · 2025-05-20T10:25:40.000+01:00
* Prints [EndOfUtterance] in text mode, prints the full message in json mode.
* Adds a command line parameter for setting the end of utterance silence trigger.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.6] - 2025-05-20
+
+### Added
+
+- Support end-of-utterance messages (DEL-24982)
+
 ## [3.0.5] - 2025-05-15
 
 - cli: fix some config options not being set when defined in a config file: `topic_detection_config` and `speaker_diarization_config`
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.0.5
+3.0.6
diff --git a/speechmatics/cli.py b/speechmatics/cli.py
@@ -281,6 +281,13 @@ def get_transcription_config(
     ]:
         config[option] = True if args.get(option) else config.get(option)
 
+    if args.get("end_of_utterance_silence_trigger") is not None:
+        config["conversation_config"] = {
+            "end_of_utterance_silence_trigger": args.get(
+                "end_of_utterance_silence_trigger"
+            )
+        }
+
     if args.get("volume_threshold") is not None:
         config["audio_filtering_config"] = {
             "volume_threshold": args.get("volume_threshold")
@@ -556,6 +563,13 @@ def audio_event_handler(message):
         sys.stdout.write(f"{escape_seq}[{event_name}]\n")
         transcripts.text += f"[{event_name}] "
 
+    def end_of_utterance_handler(message):
+        if print_json:  # JSON mode: emit the raw server message and nothing else
+            print(json.dumps(message))
+            return
+        sys.stdout.write("[EndOfUtterance]\n")
+        transcripts.text += "[EndOfUtterance] "  # trailing space, like audio events
+
     def partial_translation_handler(message):
         if print_json:
             print(json.dumps(message))
@@ -590,6 +604,8 @@ def end_of_transcript_handler(_):
     # print transcription (if text was requested without translation)
 
     api.add_event_handler(ServerMessageType.AudioEventStarted, audio_event_handler)
+    api.add_event_handler(ServerMessageType.EndOfUtterance, end_of_utterance_handler)
+
     if print_json:
         if enable_partials or enable_translation_partials:
             api.add_event_handler(
diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py
@@ -218,6 +218,15 @@ def get_arg_parser():
         default=None,
         help=("Filter out quiet audio which falls below this threshold (0.0-100.0)"),
     )
+    config_parser.add_argument(
+        "--end-of-utterance-silence-trigger",
+        dest="end_of_utterance_silence_trigger",
+        type=float,
+        default=None,
+        help=(
+            "Generate an EndOfUtterance message from the server after this many seconds of silence (0.0-2.0)"
+        ),
+    )
     config_parser.add_argument(
         "--remove-disfluencies",
         default=None,
diff --git a/speechmatics/models.py b/speechmatics/models.py
@@ -166,6 +166,14 @@ def asdict(self):
         return asdict(self)
 
 
+@dataclass
+class ConversationConfig:
+    """Conversation config: settings for end-of-utterance detection."""
+
+    end_of_utterance_silence_trigger: Optional[float] = None
+    """Seconds of silence required to trigger end-of-utterance detection."""
+
+
 @dataclass
 class RTTranslationConfig(TranslationConfig):
     """Real-time mode: Translation config."""
@@ -287,6 +295,9 @@ class TranscriptionConfig(_TranscriptionConfig):
     """Indicates if partial translation, where words are produced
     immediately, is enabled."""
 
+    conversation_config: Optional[ConversationConfig] = None
+    """Optional configuration for end-of-utterance detection."""
+
     translation_config: Optional[TranslationConfig] = None
     """Optional configuration for translation."""
 
@@ -550,6 +561,9 @@ class ServerMessageType(str, Enum):
     AddTranscript = "AddTranscript"
     """Indicates the final transcript of a part of the audio."""
 
+    EndOfUtterance = "EndOfUtterance"
+    """Indicates that an utterance has ended, based on silence."""
+
     AudioEventStarted = "AudioEventStarted"
     """Indicates the start of an audio event."""
 
diff --git a/tests/mock_rt_server.py b/tests/mock_rt_server.py
@@ -120,6 +120,15 @@ def dummy_add_partial_transcript():
     }
 
 
+def dummy_end_of_utterance():
+    """Returns a dummy EndOfUtterance message with equal start and end times."""
+    return {
+        "message": "EndOfUtterance",
+        "format": "2.1",
+        "metadata": {"start_time": 3.0, "end_time": 3.0},
+    }
+
+
 def dummy_add_transcript():
     """Returns a dummy AddTranscript message."""
     return {
@@ -194,9 +203,10 @@ def get_responses(message, is_binary=False):
             )
             mock_server_handler.next_audio_seq_no += 1
 
-            # Answer immediately with a partial and a final.
+            # Answer immediately with a partial, a final, and an end of utterance.
             responses.append(dummy_add_partial_transcript())
             responses.append(dummy_add_transcript())
+            responses.append(dummy_end_of_utterance())
         else:
             msg_name = message.get("message")
             if not msg_name:
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -182,6 +182,10 @@
             {"enable_translation_partials": True},
         ),
         (["rt", "transcribe", "--enable-entities"], {"enable_entities": True}),
+        (
+            ["rt", "transcribe", "--end-of-utterance-silence-trigger=1.8"],
+            {"end_of_utterance_silence_trigger": 1.8},
+        ),
         (["batch", "transcribe", "--enable-entities"], {"enable_entities": True}),
         (
             ["batch", "transcribe", "--diarization=speaker"],
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -220,3 +220,17 @@ def test_notification_config(params, want):
 def test_audio_events_config_config(params, want):
     audio_events_config = models.AudioEventsConfig(**params)
     assert audio_events_config.asdict() == want
+
+
+@mark.parametrize(
+    "params, want",
+    [
+        param(  # explicit trigger value should come back unchanged in dict form
+            {"end_of_utterance_silence_trigger": 1.8},
+            {"end_of_utterance_silence_trigger": 1.8},
+        ),
+    ],
+)
+def test_conversation_config(params, want):
+    conversation_config = models.ConversationConfig(**params)
+    assert asdict(conversation_config) == want  # compare the dict form to `want`