diff --git a/packages/tasks/src/tasks/audio-text-to-text/about.md b/packages/tasks/src/tasks/audio-text-to-text/about.md
new file mode 100644
index 0000000000..e47ab84669
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-text-to-text/about.md
@@ -0,0 +1,133 @@
+## Audio Text to Text
+
+The audio-text-to-text task (also referred to as speech-to-text, speech recognition, or speech translation, depending on the specifics) converts audio input into textual output. It is a versatile task with many applications.
+
+### Use Cases
+
+* **Speech Recognition:** Transcribing spoken language from an audio clip into text. This is foundational for voice assistants, dictation software, and transcribing meetings or interviews.
+* **Speech Translation:** Directly translating spoken language in one language into text in another language. This is useful for real-time translation applications or for translating audio content.
+* **Voice Command Interfaces:** Converting spoken commands into text that a system can then interpret to perform actions (e.g., "Play music," "Set a timer").
+* **Audio Event Description/Captioning:** Generating textual descriptions of sounds or events occurring in an audio stream (this can overlap with audio tagging).
+
+### Python Examples
+
+You can use the `transformers` library for many audio-text-to-text tasks.
+
+**1. Automatic Speech Recognition (ASR):**
+
+```python
+from transformers import pipeline
+
+# Initialize the ASR pipeline
+# Replace "openai/whisper-base" with any ASR model of your choice
+asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+
+# Transcribe a local audio file (ensure the file exists and is in a supported format)
+# text_output = asr_pipeline("path/to/your/sample_audio.flac")
+# print(text_output)  # {'text': '...transcription of the audio...'}
+
+# Transcribe a publicly accessible URL (the pipeline downloads and decodes it)
+text_output_url = asr_pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+print(text_output_url)  # {'text': '...transcription of the audio...'}
+```
+*Note: For local audio files, you might need to load and preprocess them into the format expected by the pipeline (e.g., a NumPy array or Torch tensor of raw audio samples). Ensure the sampling rate matches the model's requirements.*
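+
+Building on the note above, here is a minimal sketch of loading and resampling a local file yourself and passing the raw samples to the pipeline. It assumes `torchaudio` is installed and uses a hypothetical file path:
+
+```python
+import torchaudio
+from transformers import pipeline
+
+asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+
+# Load a local file and resample it to the rate the model expects (16 kHz for Whisper)
+waveform, sample_rate = torchaudio.load("path/to/your/sample_audio.flac")  # hypothetical path
+target_rate = asr_pipeline.feature_extractor.sampling_rate
+if sample_rate != target_rate:
+    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform)
+
+# The pipeline accepts a dict with the raw 1-D samples and their sampling rate
+audio = waveform.mean(dim=0).numpy()  # downmix to mono
+print(asr_pipeline({"raw": audio, "sampling_rate": target_rate}))
+```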
+
+**2. Speech Translation (Example with a model fine-tuned for S2T):**
+
+Speech-to-Text (S2T) models can directly translate audio in one language into text in another.
+
+```python
+from transformers import pipeline
+
+# Initialize the speech-to-text translation pipeline
+# Replace the model ID with a speech translation model for your language pair;
+# this example translates English audio into French text
+translator_pipeline = pipeline("automatic-speech-recognition", model="facebook/s2t-small-mustc-en-fr-st")
+
+# Process an audio file (same input formats as ASR)
+# audio_input = "path/to/your/english_audio.wav"
+# translated_text = translator_pipeline(audio_input)
+# print(translated_text)
+# Expected output: {'text': '...French translation of the audio...'}
+
+# Example with a publicly accessible URL (model/pipeline must support it)
+# Ensure the audio is in the source language the model expects (English here)
+# translated_text_url = translator_pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+# print(translated_text_url)
+# Expected output: {'text': '...French translation of the English audio...'}
+```
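+
+Multilingual Whisper checkpoints can also translate speech directly into English text. Here is a minimal sketch, assuming a non-English audio file at a hypothetical path; `generate_kwargs` corresponds to the optional generation parameters described in this task's input spec:
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+
+# generate_kwargs are forwarded to the model's generate() call;
+# "task": "translate" asks Whisper to output English text regardless of the spoken language
+result = pipe("path/to/non_english_audio.wav", generate_kwargs={"task": "translate"})  # hypothetical path
+print(result["text"])
+```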
+
+### JavaScript Example
+
+You can use [`@huggingface/inference`](https://github.com/huggingface/huggingface.js) to perform audio-text-to-text tasks with models on the Hugging Face Hub.
+
+```javascript
+import { InferenceClient } from "@huggingface/inference";
+
+const inference = new InferenceClient(HF_TOKEN); // HF_TOKEN is your Hugging Face access token
+
+async function transcribeAudio(audioBlob) {
+  try {
+    const result = await inference.automaticSpeechRecognition({
+      model: "openai/whisper-base", // Or your preferred ASR/S2T model
+      data: audioBlob,
+    });
+    console.log(result.text);
+    return result.text;
+  } catch (error) {
+    console.error("Error during transcription:", error);
+  }
+}
+
+// Example usage:
+// Assumes you have an audio file as a Blob object (e.g., from a file input)
+// const audioFile = new File(["...audio data..."], "audio.wav", { type: "audio/wav" });
+// transcribeAudio(audioFile);
+
+// Example: fetch a remote audio file and then transcribe it
+async function transcribeRemoteAudio(audioUrl) {
+  try {
+    const response = await fetch(audioUrl);
+    if (!response.ok) {
+      throw new Error(`HTTP error! status: ${response.status}`);
+    }
+    const audioBlob = await response.blob();
+
+    const result = await inference.automaticSpeechRecognition({
+      model: "openai/whisper-base", // Or your preferred ASR/S2T model
+      data: audioBlob,
+    });
+    console.log("Transcription:", result.text);
+    return result.text;
+  } catch (error) {
+    console.error("Error during remote audio transcription:", error);
+  }
+}
+
+// transcribeRemoteAudio("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac");
+```
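+
+To evaluate ASR output, you can compute the Word Error Rate (WER) between predictions and reference transcripts; lower is better. Here is a minimal sketch using the `evaluate` library (assumed to be installed together with `jiwer`):
+
+```python
+import evaluate
+
+wer_metric = evaluate.load("wer")
+
+predictions = ["hello my name is anna", "set a timer for ten minutes"]
+references = ["hello my name is hannah", "set a timer for ten minutes"]
+
+# WER = (substitutions + insertions + deletions) / number of reference words
+print(wer_metric.compute(predictions=predictions, references=references))
+```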
+
+Here are some useful resources:
+
+- [Ultravox, a fast multimodal large language model designed for real-time voice interactions.](https://github.com/fixie-ai/ultravox)
+- [Qwen2-Audio, an open-source large-scale audio-language model by Alibaba Cloud, supporting voice chat and audio analysis in multiple languages.](https://github.com/QwenLM/Qwen2-Audio)
+- [WhisperSpeech, a compact, open-source speech tokenizer enhancing multilingual performance with minimal impact on English capabilities.](https://github.com/janhq/WhisperSpeech)
+- [PhiCookBook, a guide to Microsoft's open-source Phi models, which offer capable and cost-effective small language models.](https://github.com/microsoft/PhiCookBook)
+- [FastRTC, a library that turns any Python function into a real-time audio and video stream over WebRTC or WebSockets.](https://huggingface.co/fastrtc)
diff --git a/packages/tasks/src/tasks/audio-text-to-text/data.ts b/packages/tasks/src/tasks/audio-text-to-text/data.ts
new file mode 100644
index 0000000000..b54ca0e33e
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-text-to-text/data.ts
@@ -0,0 +1,65 @@
+import type { TaskDataCustom } from "../index.js";
+
+const taskData: TaskDataCustom = {
+	datasets: [
+		{
+			description: "A massively multilingual speech corpus, well suited for training speech recognition models.",
+			id: "mozilla-foundation/common_voice_11_0", // Mozilla Common Voice (example)
+		},
+		{
+			description: "A benchmark dataset for speech translation.",
+			id: "facebook/covost2", // CoVoST 2 (example for speech translation)
+		},
+	],
+	demo: {
+		inputs: [
+			{
+				filename: "input.flac",
+				type: "audio",
+			},
+		],
+		outputs: [
+			{
+				label: "Output", // Generic label; in practice "Transcription" or "Translation"
+				content: "This is a sample transcription or translation from the audio.",
+				type: "text",
+			},
+		],
+	},
+	metrics: [
+		{
+			description: "Word Error Rate (WER) is a common metric for the accuracy of an automatic speech recognition system. The lower the WER, the better.",
+			id: "wer",
+		},
+		{
+			description: "BLEU (Bilingual Evaluation Understudy) measures the quality of machine translation from one language to another.",
+			id: "bleu",
+		},
+	],
+	models: [
+		{
+			description: "A popular multilingual model for automatic speech recognition.",
+			id: "openai/whisper-base",
+		},
+		{
+			description: "A model for translating English speech to German text (example of a speech translation model).",
+			id: "facebook/s2t-medium-mustc-en-de-st",
+		},
+	],
+	spaces: [
+		{
+			description: "A demonstration of the Whisper model for speech recognition.",
+			id: "openai/whisper",
+		},
+		{
+			description: "An ESPnet demo that can perform speech recognition and translation.",
+			id: "espnet/espnet_asr_demo",
+		},
+	],
+	summary:
+		"Audio Text to Text tasks convert audio input into textual output. This primarily includes automatic speech recognition (transcribing audio to text in the same language) and speech translation (translating audio in one language to text in another).",
+	widgetModels: ["openai/whisper-base"],
+	youtubeId: "SqE7xeyjBFg", // Example: a video about Whisper
+};
+
+export default taskData;
diff --git a/packages/tasks/src/tasks/audio-text-to-text/spec/input.json b/packages/tasks/src/tasks/audio-text-to-text/spec/input.json
new file mode 100644
index 0000000000..e674796f21
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-text-to-text/spec/input.json
@@ -0,0 +1,24 @@
+{
+	"$schema": "http://json-schema.org/draft-07/schema#",
+	"title": "AudioTextToTextInput",
+	"type": "object",
+	"properties": {
+		"inputs": {
+			"type": "string",
+			"format": "binary",
+			"description": "The audio input to be processed."
+		},
+		"parameters": {
+			"type": "object",
+			"properties": {
+				"generate_kwargs": {
+					"type": "object",
+					"description": "Keyword arguments to control generation. Varies by model."
+				}
+			}
+		}
+	},
+	"required": [
+		"inputs"
+	]
+}
diff --git a/packages/tasks/src/tasks/audio-text-to-text/spec/output.json b/packages/tasks/src/tasks/audio-text-to-text/spec/output.json
new file mode 100644
index 0000000000..ca87f39212
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-text-to-text/spec/output.json
@@ -0,0 +1,17 @@
+{
+	"$schema": "http://json-schema.org/draft-07/schema#",
+	"title": "AudioTextToTextOutput",
+	"type": "array",
+	"items": {
+		"type": "object",
+		"properties": {
+			"text": {
+				"type": "string",
+				"description": "The generated text from the audio input."
+			}
+		},
+		"required": [
+			"text"
+		]
+	}
+}
diff --git a/packages/tasks/src/tasks/index.ts b/packages/tasks/src/tasks/index.ts
index 19c25ccfce..3faed80cc5 100644
--- a/packages/tasks/src/tasks/index.ts
+++ b/packages/tasks/src/tasks/index.ts
@@ -45,6 +45,7 @@ import imageTo3D from "./image-to-3d/data.js";
 import textTo3D from "./text-to-3d/data.js";
 import keypointDetection from "./keypoint-detection/data.js";
 import videoTextToText from "./video-text-to-text/data.js";
+import audioTextToText from "./audio-text-to-text/data.js";

 export type * from "./audio-classification/inference.js";
 export type * from "./automatic-speech-recognition/inference.js";
@@ -121,7 +122,7 @@ export const TASKS_MODEL_LIBRARIES: Record = {
 	"audio-classification": ["speechbrain", "transformers", "transformers.js"],
 	"audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
 	"automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
-	"audio-text-to-text": [],
+	"audio-text-to-text": ["transformers", "speechbrain", "espnet", "nemo"],
 	"depth-estimation": ["transformers", "transformers.js"],
 	"document-question-answering": ["transformers", "transformers.js"],
 	"feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -205,7 +206,7 @@ export const TASKS_DATA: Record = {
 	"any-to-any": getData("any-to-any", anyToAny),
 	"audio-classification": getData("audio-classification", audioClassification),
 	"audio-to-audio": getData("audio-to-audio", audioToAudio),
-	"audio-text-to-text": getData("audio-text-to-text", placeholder),
+	"audio-text-to-text": getData("audio-text-to-text", audioTextToText),
 	"automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
 	"depth-estimation": getData("depth-estimation", depthEstimation),
 	"document-question-answering": getData("document-question-answering", documentQuestionAnswering),