
Commit c62652e

biyachuev and claude committed
feat: Preserve speaker labels when refining transcript
Added _select_speaker_for_interval function to maintain speaker attribution when LLM groups segments into paragraphs. Uses overlap calculation to assign the dominant speaker to each refined paragraph.

- New helper function for speaker label preservation
- Test coverage for speaker label retention
- Verified working with OpenAI Whisper API and GPT-4o refinement

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
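
To make the "overlap calculation" mentioned above concrete, here is a rough, hypothetical walk-through of the dominant-speaker rule using the same timings as the new test in this commit. The tuple representation and the overlap helper below are illustrative only; the committed code operates on TranscriptionSegment objects, as shown in the diff further down.

# Four 5-second diarized segments, refined into two paragraphs that each
# receive an equal 10-second slice of the 20-second recording (per the
# proportional split visible in update_segments_from_text).
segments = [  # (start, end, speaker)
    (0.0, 5.0, "SPEAKER_00"),
    (5.0, 10.0, "SPEAKER_00"),
    (10.0, 15.0, "SPEAKER_01"),
    (15.0, 20.0, "SPEAKER_01"),
]

def overlap(seg, start, end):
    # Length of the intersection between one segment and a paragraph interval.
    return max(0.0, min(end, seg[1]) - max(start, seg[0]))

# Paragraph 1 covers 0.0-10.0 s: each SPEAKER_00 segment overlaps it by 5 s,
# the SPEAKER_01 segments not at all, so the paragraph keeps SPEAKER_00.
print(max(segments, key=lambda s: overlap(s, 0.0, 10.0))[2])   # SPEAKER_00
# Paragraph 2 covers 10.0-20.0 s and resolves to SPEAKER_01 the same way.
print(max(segments, key=lambda s: overlap(s, 10.0, 20.0))[2])  # SPEAKER_01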
1 parent 3d2635a commit c62652e

2 files changed (+76, -1 lines)


src/transcriber.py

Lines changed: 47 additions & 1 deletion
@@ -1518,6 +1518,52 @@ def update_segments_from_text(
         if not segments:
             return []

+        def _select_speaker_for_interval(
+            original_segments: List[TranscriptionSegment],
+            interval_start: float,
+            interval_end: float,
+        ) -> Optional[str]:
+            """
+            Pick the speaker that overlaps most with the provided time range.
+
+            Args:
+                original_segments: Segments with existing speaker labels.
+                interval_start: Start of the interval.
+                interval_end: End of the interval.
+
+            Returns:
+                Speaker label or None if unavailable.
+            """
+            best_speaker = None
+            best_overlap = 0.0
+
+            for seg in original_segments:
+                if not seg.speaker:
+                    continue
+
+                overlap_start = max(interval_start, seg.start)
+                overlap_end = min(interval_end, seg.end)
+                overlap = overlap_end - overlap_start
+
+                if overlap > best_overlap:
+                    best_overlap = overlap
+                    best_speaker = seg.speaker
+
+            if best_speaker:
+                return best_speaker
+
+            # Fallback: use the closest preceding speaker label.
+            for seg in reversed(original_segments):
+                if seg.start <= interval_start and seg.speaker:
+                    return seg.speaker
+
+            # Final fallback: return the first available speaker label.
+            for seg in original_segments:
+                if seg.speaker:
+                    return seg.speaker
+
+            return None
+
         total_duration = segments[-1].end - segments[0].start
         segment_duration = total_duration / len(paragraphs) if paragraphs else 0

@@ -1536,7 +1582,7 @@ def update_segments_from_text(
                     start=start_time,
                     end=end_time,
                     text=paragraph,
-                    speaker=None,
+                    speaker=_select_speaker_for_interval(segments, start_time, end_time),
                 )
             )
             start_time = end_time
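
The diff above also shows two fallbacks for intervals that overlap no labeled segment: first the closest preceding speaker label, then the first label anywhere in the transcript. A minimal, self-contained sketch of that selection order follows; Segment and pick_speaker are hypothetical stand-ins for TranscriptionSegment and the committed helper, not the repository's code.

from collections import namedtuple

# Hypothetical stand-in for TranscriptionSegment (start/end in seconds).
Segment = namedtuple("Segment", "start end speaker")

def pick_speaker(segments, start, end):
    # Dominant-speaker rule: keep the single segment with the largest overlap.
    best_speaker, best_overlap = None, 0.0
    for seg in segments:
        if not seg.speaker:
            continue
        overlap = min(end, seg.end) - max(start, seg.start)
        if overlap > best_overlap:
            best_speaker, best_overlap = seg.speaker, overlap
    if best_speaker:
        return best_speaker
    # Fallback 1: closest preceding labeled segment.
    for seg in reversed(segments):
        if seg.start <= start and seg.speaker:
            return seg.speaker
    # Fallback 2: first labeled segment, wherever it is.
    return next((s.speaker for s in segments if s.speaker), None)

segs = [Segment(0.0, 5.0, "SPEAKER_00"), Segment(5.0, 12.0, "SPEAKER_01")]
print(pick_speaker(segs, 3.0, 12.0))   # SPEAKER_01: 7 s of overlap beats 2 s
print(pick_speaker(segs, 20.0, 25.0))  # SPEAKER_01: no overlap, preceding label wins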

tests/test_transcriber.py

Lines changed: 29 additions & 0 deletions
@@ -143,6 +143,35 @@ def test_segments_to_text_with_timestamps_and_speakers(self, mock_whisper, mock_
         assert "[00:00]" in result
         assert "[00:05]" in result

+    @patch('src.transcriber.torch')
+    @patch('src.transcriber.whisper')
+    def test_update_segments_from_text_preserves_speakers(self, mock_whisper, mock_torch):
+        """Refined text should retain dominant speaker labels for merged paragraphs."""
+        mock_torch.cuda.is_available.return_value = False
+        mock_torch.backends.mps.is_available.return_value = False
+
+        transcriber = Transcriber()
+
+        segments = [
+            TranscriptionSegment(0.0, 5.0, "First sentence.", speaker="SPEAKER_00"),
+            TranscriptionSegment(5.0, 10.0, "Second sentence.", speaker="SPEAKER_00"),
+            TranscriptionSegment(10.0, 15.0, "Third sentence.", speaker="SPEAKER_01"),
+            TranscriptionSegment(15.0, 20.0, "Fourth sentence.", speaker="SPEAKER_01"),
+        ]
+
+        refined_text = (
+            "Combined text for speaker zero.\n\n"
+            "Combined text for speaker one."
+        )
+
+        updated_segments = transcriber.update_segments_from_text(segments, refined_text)
+
+        assert len(updated_segments) == 2
+        assert updated_segments[0].speaker == "SPEAKER_00"
+        assert updated_segments[1].speaker == "SPEAKER_01"
+        assert updated_segments[0].start == pytest.approx(0.0)
+        assert updated_segments[1].end == pytest.approx(20.0)
+

 class TestChunking:
     """Tests for audio chunking logic."""
