Skip to content

Commit 6be8ea9

Browse files
Support the EndOfUtterance message
* Prints [EndOfUtterance] in text mode, prints the full message in json mode.
* Adds a command line parameter for setting the end of utterance silence trigger.
1 parent 3638fd2 commit 6be8ea9

File tree

8 files changed

+75
-2
lines changed

8 files changed

+75
-2
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.0.6] - 2025-05-20
9+
10+
### Added
11+
12+
- Support end-of-utterance messages (DEL-24982)
13+
814
## [3.0.5] - 2025-05-15
915

1016
- cli: fix some config options not being set when defined in a config file: `topic_detection_config` and `speaker_diarization_config`

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.0.5
1+
3.0.6

speechmatics/cli.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,13 @@ def get_transcription_config(
281281
]:
282282
config[option] = True if args.get(option) else config.get(option)
283283

284+
if args.get("end_of_utterance_silence_trigger") is not None:
285+
config["conversation_config"] = {
286+
"end_of_utterance_silence_trigger": args.get(
287+
"end_of_utterance_silence_trigger"
288+
)
289+
}
290+
284291
if args.get("volume_threshold") is not None:
285292
config["audio_filtering_config"] = {
286293
"volume_threshold": args.get("volume_threshold")
@@ -556,6 +563,13 @@ def audio_event_handler(message):
556563
sys.stdout.write(f"{escape_seq}[{event_name}]\n")
557564
transcripts.text += f"[{event_name}] "
558565

566+
def end_of_utterance_handler(message):
567+
if print_json:
568+
print(json.dumps(message))
569+
return
570+
sys.stdout.write("[EndOfUtterance]\n")
571+
transcripts.text += "[EndOfUtterance]"
572+
559573
def partial_translation_handler(message):
560574
if print_json:
561575
print(json.dumps(message))
@@ -590,6 +604,8 @@ def end_of_transcript_handler(_):
590604
# print transcription (if text was requested without translation)
591605

592606
api.add_event_handler(ServerMessageType.AudioEventStarted, audio_event_handler)
607+
api.add_event_handler(ServerMessageType.EndOfUtterance, end_of_utterance_handler)
608+
593609
if print_json:
594610
if enable_partials or enable_translation_partials:
595611
api.add_event_handler(

speechmatics/cli_parser.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,15 @@ def get_arg_parser():
218218
default=None,
219219
help=("Filter out quiet audio which falls below this threshold (0.0-100.0)"),
220220
)
221+
config_parser.add_argument(
222+
"--end-of-utterance-silence-trigger",
223+
dest="end_of_utterance_silence_trigger",
224+
type=float,
225+
default=None,
226+
help=(
227+
"Generate an EndOfUtterance message from the server after this many seconds of silence (0.0-2.0)"
228+
),
229+
)
221230
config_parser.add_argument(
222231
"--remove-disfluencies",
223232
default=None,

speechmatics/models.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,14 @@ def asdict(self):
166166
return asdict(self)
167167

168168

169+
@dataclass
170+
class ConversationConfig:
171+
"""Conversation config."""
172+
173+
end_of_utterance_silence_trigger: Optional[float] = None
174+
"""How much silence in seconds is required to trigger end of utterance detection."""
175+
176+
169177
@dataclass
170178
class RTTranslationConfig(TranslationConfig):
171179
"""Real-time mode: Translation config."""
@@ -287,6 +295,9 @@ class TranscriptionConfig(_TranscriptionConfig):
287295
"""Indicates if partial translation, where words are produced
288296
immediately, is enabled."""
289297

298+
conversation_config: Optional[ConversationConfig] = None
299+
"""Optional configuration for end-of-utterance detection."""
300+
290301
translation_config: Optional[TranslationConfig] = None
291302
"""Optional configuration for translation."""
292303

@@ -550,6 +561,9 @@ class ServerMessageType(str, Enum):
550561
AddTranscript = "AddTranscript"
551562
"""Indicates the final transcript of a part of the audio."""
552563

564+
EndOfUtterance = "EndOfUtterance"
565+
"""Indicates that an utterance has ended, based on silence"""
566+
553567
AudioEventStarted = "AudioEventStarted"
554568
"""Indicates the start of an audio event."""
555569

tests/mock_rt_server.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,15 @@ def dummy_add_partial_transcript():
120120
}
121121

122122

123+
def dummy_end_of_utterance():
124+
"""Returns a dummy EndOfUtterance message."""
125+
return {
126+
"message": "EndOfUtterance",
127+
"format": "2.1",
128+
"metadata": {"start_time": 3.0, "end_time": 3.0},
129+
}
130+
131+
123132
def dummy_add_transcript():
124133
"""Returns a dummy AddTranscript message."""
125134
return {
@@ -194,9 +203,10 @@ def get_responses(message, is_binary=False):
194203
)
195204
mock_server_handler.next_audio_seq_no += 1
196205

197-
# Answer immediately with a partial and a final.
206+
# Answer immediately with a partial and a final and an end of utterance.
198207
responses.append(dummy_add_partial_transcript())
199208
responses.append(dummy_add_transcript())
209+
responses.append(dummy_end_of_utterance())
200210
else:
201211
msg_name = message.get("message")
202212
if not msg_name:

tests/test_cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,10 @@
182182
{"enable_translation_partials": True},
183183
),
184184
(["rt", "transcribe", "--enable-entities"], {"enable_entities": True}),
185+
(
186+
["rt", "transcribe", "--end-of-utterance-silence-trigger=1.8"],
187+
{"end_of_utterance_silence_trigger": 1.8},
188+
),
185189
(["batch", "transcribe", "--enable-entities"], {"enable_entities": True}),
186190
(
187191
["batch", "transcribe", "--diarization=speaker"],

tests/test_models.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,17 @@ def test_notification_config(params, want):
220220
def test_audio_events_config_config(params, want):
221221
audio_events_config = models.AudioEventsConfig(**params)
222222
assert audio_events_config.asdict() == want
223+
224+
225+
@mark.parametrize(
226+
"params, want",
227+
[
228+
param(
229+
{"end_of_utterance_silence_trigger": 1.8},
230+
{"end_of_utterance_silence_trigger": 1.8},
231+
),
232+
],
233+
)
234+
def test_conversation_config(params, want):
235+
conversation_config = models.ConversationConfig(**params)
236+
assert asdict(conversation_config) == want

0 commit comments

Comments
 (0)