Skip to content

Commit a30c789

Browse files
committed
feat: Integrate with and default to ElevenLabs Scribe v1
1 parent 488da82 commit a30c789

File tree

6 files changed

+89
-28
lines changed

6 files changed

+89
-28
lines changed

apps/web/src/lib/components/Recorder.svelte

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import { onMount } from 'svelte';
44
import WaveSurfer from 'wavesurfer.js';
55
import RecordPlugin from 'wavesurfer.js/dist/plugins/record.esm.js';
6+
import { ElevenLabsClient } from 'elevenlabs';
67
78
import Button from './Button.svelte';
89
import ButtonPause from './ButtonPause.svelte';
@@ -18,13 +19,15 @@
1819
recordingUrl?: string;
1920
saveRecording: () => void;
2021
scrollingWaveform?: boolean;
22+
transcriptionModel?: 'elevenlabs-scribe-v1' | 'openai-whisper';
2123
}
2224
2325
let {
2426
discardRecording,
2527
recordingUrl = $bindable(''),
2628
saveRecording,
27-
scrollingWaveform = true
29+
scrollingWaveform = true,
30+
transcriptionModel = 'elevenlabs-scribe-v1'
2831
}: RecorderProps = $props();
2932
3033
let defaultDeviceId: string | undefined = $state(undefined);

apps/web/src/lib/components/RecordingTile/RecordingTile.svelte

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@
1515
name,
1616
savedRecordings = $bindable([]),
1717
titleSlot,
18-
transcription
18+
transcription,
19+
transcriptionModel = $bindable('elevenlabs-scribe-v1')
1920
}: RecordingTileProps = $props();
2021
22+
$inspect(transcription);
23+
2124
let blob: Blob = $state(base64ToBlob(data));
2225
2326
let isPlaying = $state(false);
@@ -80,7 +83,7 @@
8083
<div class="transcription">
8184
<div class="inner">
8285
<!-- svelte-ignore a11y_click_events_have_key_events -->
83-
{#each transcription.words as { word, start, end }, i}
86+
{#each transcription.words as { word, text, start, end }, i}
8487
<!-- svelte-ignore a11y_click_events_have_key_events -->
8588
<!-- svelte-ignore a11y_no_static_element_interactions -->
8689
<span
@@ -91,11 +94,21 @@
9194
class:read={currentTime >= start}
9295
onclick={() => playFromTime(start)}
9396
>
94-
{#if i !== transcription.words.length - 1}
95-
<span class="word">{word}</span>&nbsp;
96-
{:else}
97-
<span class="word">{word}</span>
98-
{/if}
97+
<span class="word">
98+
{#if text}
99+
{#if text === ' '}
100+
&nbsp;
101+
{:else}
102+
{text}
103+
{/if}
104+
{:else if word}
105+
{#if i !== transcription.words.length - 1}
106+
{word}&nbsp;
107+
{:else}
108+
{word}
109+
{/if}
110+
{/if}
111+
</span>
99112
</span>
100113
{/each}
101114
</div>
@@ -109,6 +122,11 @@
109122
{:else}
110123
<span> You haven't transcribed this recording yet. </span>
111124

125+
<select bind:value={transcriptionModel}>
126+
<option value="elevenlabs-scribe-v1">ElevenLabs Scribe v1</option>
127+
<option value="openai-whisper">OpenAI Whisper</option>
128+
</select>
129+
112130
<Button
113131
kind="secondary"
114132
label="Transcribe"
@@ -119,7 +137,7 @@
119137

120138
isTranscribing = true;
121139

122-
const transcription = await transcribeRecording(blob);
140+
const transcription = await transcribeRecording(blob, transcriptionModel);
123141

124142
isTranscribing = false;
125143

apps/web/src/lib/components/RecordingTile/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ export interface RecordingTileProps {
99
savedRecordings?: any[];
1010
titleSlot?: Snippet;
1111
transcription?: Transcription;
12+
transcriptionModel?: string;
1213
}

apps/web/src/lib/methods/transcribe-recording.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
export default async function transcribeRecording(
2-
audioBlob: Blob
2+
audioBlob: Blob,
3+
transcriptionModel = 'elevenlabs-scribe-v1'
34
): Promise<{ text: string } | null> {
45
try {
56
const formData = new FormData();
67
const newBlob = new Blob([audioBlob], { type: 'audio/webm' });
78

89
formData.append('audio', newBlob, 'audio.webm');
10+
formData.append('model', transcriptionModel);
911

1012
const response = await fetch('/api/transcribe', {
1113
method: 'POST',

apps/web/src/lib/types/transcription.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ export interface Transcription {
22
text: string;
33
vtt: string;
44
word_count: number;
5-
words: { word: string; start: number; end: number }[];
5+
words: { word?: string; text?: string; start: number; end: number }[];
66
}

apps/web/src/routes/api/transcribe/+server.ts

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { error, json } from '@sveltejs/kit';
22
import type { RequestHandler } from './$types';
33
import { env as envPrivate } from '$env/dynamic/private';
4+
import { ElevenLabsClient } from 'elevenlabs';
45

56
const CLOUDFLARE_API_URL = `https://api.cloudflare.com/client/v4/accounts/${envPrivate.CLOUDFLARE_ACCOUNT_ID}/ai/run/@cf/openai/whisper`;
67

@@ -16,33 +17,69 @@ export const POST: RequestHandler = async ({ request, url }) => {
1617
try {
1718
const data = await request.formData();
1819
const audioFile = data.get('audio') as File;
20+
const model = data.get('model') as string;
1921

2022
if (!audioFile) {
2123
return json({ error: 'No audio file provided' }, { status: 400 });
2224
}
2325

24-
const controller = new AbortController();
25-
const timeoutId = setTimeout(() => controller.abort(), 60000);
26+
switch (model) {
27+
case 'openai-whisper': {
28+
const controller = new AbortController();
29+
const timeoutId = setTimeout(() => controller.abort(), 60000);
2630

27-
const response = await fetch(CLOUDFLARE_API_URL, {
28-
method: 'POST',
29-
headers: {
30-
Authorization: `Bearer ${envPrivate.CLOUDFLARE_WORKERS_AI_API_TOKEN}`,
31-
'Content-Type': 'application/octet-stream'
32-
},
33-
body: audioFile,
34-
signal: controller.signal
35-
});
31+
const response = await fetch(CLOUDFLARE_API_URL, {
32+
method: 'POST',
33+
headers: {
34+
Authorization: `Bearer ${envPrivate.CLOUDFLARE_WORKERS_AI_API_TOKEN}`,
35+
'Content-Type': 'application/octet-stream'
36+
},
37+
body: audioFile,
38+
signal: controller.signal
39+
});
3640

37-
clearTimeout(timeoutId);
41+
clearTimeout(timeoutId);
3842

39-
if (!response.ok) {
40-
throw new Error(`Cloudflare API error: ${response.statusText}`);
41-
}
43+
if (!response.ok) {
44+
throw new Error(`Cloudflare API error: ${response.statusText}`);
45+
}
46+
47+
const res = await response.json();
48+
49+
return json({ ...res.result });
50+
}
51+
52+
case 'elevenlabs-scribe-v1':
53+
default: {
54+
if (!envPrivate.ELEVENLABS_API_KEY) {
55+
return error(500, 'ElevenLabs API key not provided');
56+
}
4257

43-
const res = await response.json();
58+
try {
59+
const client = new ElevenLabsClient({
60+
apiKey: envPrivate.ELEVENLABS_API_KEY
61+
});
4462

45-
return json({ ...res.result });
63+
const audioBlob = new Blob([await audioFile.arrayBuffer()], { type: audioFile.type });
64+
65+
const transcription = await client.speechToText.convert({
66+
file: audioBlob,
67+
model_id: 'scribe_v1',
68+
tag_audio_events: true,
69+
language_code: 'en',
70+
diarize: true
71+
});
72+
73+
return json({
74+
text: transcription.words.map(({ text }) => text).toString(),
75+
words: transcription.words
76+
});
77+
} catch (elevenLabsError: any) {
78+
console.error('ElevenLabs API error:', elevenLabsError);
79+
return error(500, `ElevenLabs API error: ${elevenLabsError.message}`);
80+
}
81+
}
82+
}
4683
} catch (err: any) {
4784
if (err.name === 'AbortError') {
4885
console.error('Request timed out:', err);

0 commit comments

Comments (0)