diff --git a/.github/workflows/build-and-push-to-ghcr.yml b/.github/workflows/build-and-push-to-ghcr.yml
index 4a7bdca..3e6267f 100644
--- a/.github/workflows/build-and-push-to-ghcr.yml
+++ b/.github/workflows/build-and-push-to-ghcr.yml
@@ -4,7 +4,7 @@ on:
     branches: [main]
   pull_request:
 jobs:
-  build-and-push-to-ghcr-cuda117:
+  build-and-push-to-ghcr-cuda118:
     runs-on: ubuntu-22.04
     steps:
       -
@@ -27,7 +27,7 @@ jobs:
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"

-      - name: Build only for PR Cuda 11.7
+      - name: Build only for PR Cuda 11.8
        if: github.ref != 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
@@ -37,7 +37,7 @@ jobs:
          cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }}
          cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }}

-      - name: Build and Push image Cuda 11.7
+      - name: Build and Push image Cuda 11.8
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
@@ -49,7 +49,7 @@ jobs:
          tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }}
          #build-args:

-  build-and-push-to-ghcr-cuda118:
+  build-and-push-to-ghcr-cuda121:
     runs-on: ubuntu-22.04
     steps:
       -
@@ -72,29 +72,28 @@ jobs:
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"

-      - name: Build only for PR cuda 11.8
+      - name: Build only for PR cuda 12.1
        if: github.ref != 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda118
+         file: Dockerfile.cuda121
          push: false # Do not push image for PR
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}

-      - name: Build and Push image cuda 11.8
+      - name: Build and Push image cuda 12.1
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda118
+         file: Dockerfile.cuda121
          push: true # Push if merged
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
-         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda118-${{ github.sha }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
+         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-${{ github.sha }}
          #build-args:
-
-  build-and-push-to-ghcr-cuda121:
+  build-and-push-to-ghcr-cpu:
     runs-on: ubuntu-22.04
     steps:
       -
@@ -117,24 +116,24 @@ jobs:
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"

-      - name: Build only for PR cuda 12.1
+      - name: Build only for PR CPU
        if: github.ref != 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda121
+         file: Dockerfile.cpu
          push: false # Do not push image for PR
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }}
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }}

-      - name: Build and Push image cuda 12.1
+      - name: Build and Push image CPU
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda121
+         file: Dockerfile.cpu
          push: true # Push if merged
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
-         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-${{ github.sha }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu
+         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu, ghcr.io/coqui-ai/xtts-streaming-server:main-cpu-${{ github.sha }}
          #build-args:
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..181fd98
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+demo_outputs
\ No newline at end of file
diff --git a/README.md b/README.md
index 886931d..31d7e73 100644
--- a/README.md
+++ b/README.md
@@ -1,53 +1,83 @@
 # XTTS streaming server

+*Warning: XTTS-streaming-server doesn't support concurrent streaming requests; it's a demo server, not meant for production.*

-## Running the server
+https://github.com/coqui-ai/xtts-streaming-server/assets/17219561/7220442a-e88a-4288-8a73-608c4b39d06c

-To run a pre-built container (CUDA 11.7):
+
+## 1) Run the server
+
+### Use a pre-built image
+
+CUDA 12.1:
+
+```bash
+$ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121
+```
+
+CUDA 11.8 (for older cards):

 ```bash
 $ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
 ```

-CUDA 11.8 version (for newer cards, tested on 4060 and L4 instance)
+CPU (not recommended):
+
 ```bash
-$ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118
+$ docker run -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu
 ```

-If you have already downloaded v1.1 model and like to use this server, and using Ubuntu, change your /home/YOUR_USER_NAME
+Run with a fine-tuned model:
+
+Make sure the model folder `/path/to/model/folder` contains the following files:
+- `config.json`
+- `model.pth`
+- `vocab.json`
+
 ```bash
-$ docker run -v /home/YOUR_USER_NAME/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v1.1:/root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v1.1 --env NVIDIA_DISABLE_REQUIRE=1 --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118`
+$ docker run -v /path/to/model/folder:/app/tts_models --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
 ```
+
 Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to
-the terms of the [CPML license](https://coqui.ai/cpml).
+the terms of the [CPML license](https://coqui.ai/cpml). (Fine-tuned XTTS models are also under the [CPML license](https://coqui.ai/cpml).)
+
+### Build the image yourself

-## Testing the server
+The default `Dockerfile` targets Pytorch 2.1 and CUDA 11.8.

-1. Generate audio with the test script:
+`DOCKERFILE` may be `Dockerfile`, `Dockerfile.cpu`, `Dockerfile.cuda121`, or your own custom Dockerfile.

 ```bash
-$ cd test
-$ python -m pip install -r requirements.txt
-$ python test_streaming.py
+$ git clone git@github.com:coqui-ai/xtts-streaming-server.git
+$ cd xtts-streaming-server/server
+$ docker build -t xtts-stream . -f DOCKERFILE
+$ docker run --gpus all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 xtts-stream
 ```

-## Building the container
+Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to
+the terms of the [CPML license](https://coqui.ai/cpml). (Fine-tuned XTTS models are also under the [CPML license](https://coqui.ai/cpml).)
+
+## 2) Test the running server
+
+Once your Docker container is running, you can check that it is working properly. Run the following commands from a fresh terminal.

-1. To build the Docker container (Pytorch 2.01 Cuda 11.7) :
+### Clone `xtts-streaming-server` if you haven't already

 ```bash
-$ cd server
-$ docker build -t xtts-stream .
+$ git clone git@github.com:coqui-ai/xtts-streaming-server.git
 ```
-For Pytorch 2.1 and CUDA 11.8 version (when running set NVIDIA_DISABLE_REQUIRE=1 if you have Cuda < 11.8 drivers)
+
+### Using the Gradio demo
+
 ```bash
-$ cd server
-# docker build -t xtts-stream . -f Dockerfile.cuda118
+$ cd xtts-streaming-server
+$ python -m pip install -r test/requirements.txt
+$ python demo.py
 ```
-2. Run the server container:
+
+### Using the test script

 ```bash
-$ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 xtts-stream
+$ cd xtts-streaming-server/test
+$ python -m pip install -r requirements.txt
+$ python test_streaming.py
 ```
-
-Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to
-the terms of the [CPML license](https://coqui.ai/cpml).
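As a quick sanity check once a container is up, the metadata endpoints that `demo.py` (added below) queries at startup can also be hit directly. This is a minimal sketch, assuming the default `-p 8000:80` mapping from the README above; the file name is only illustrative:

```python
# check_server.py -- minimal sketch; assumes the container is running with -p 8000:80
import requests

SERVER_URL = "http://localhost:8000"

# /languages and /studio_speakers are the endpoints demo.py queries at startup.
languages = requests.get(SERVER_URL + "/languages").json()
studio_speakers = requests.get(SERVER_URL + "/studio_speakers").json()

print("Available languages:", ", ".join(languages))
print("Available studio speakers:", ", ".join(studio_speakers.keys()))
```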
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..2ed6251
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,121 @@
+import gradio as gr
+import requests
+import base64
+import tempfile
+import json
+import os
+
+
+SERVER_URL = 'http://localhost:8000'
+OUTPUT = "./demo_outputs"
+cloned_speakers = {}
+
+print("Preparing file structure...")
+if not os.path.exists(OUTPUT):
+    os.mkdir(OUTPUT)
+    os.mkdir(os.path.join(OUTPUT, "cloned_speakers"))
+    os.mkdir(os.path.join(OUTPUT, "generated_audios"))
+elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")):
+    print("Loading existing cloned speakers...")
+    for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")):
+        if file.endswith(".json"):
+            with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp:
+                cloned_speakers[file[:-5]] = json.load(fp)
+    print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))
+
+try:
+    print("Getting metadata from server ...")
+    LANGUAGES = requests.get(SERVER_URL + "/languages").json()
+    print("Available languages:", ", ".join(LANGUAGES))
+    STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json()
+    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
+except Exception:
+    raise Exception("Please make sure the server is running first.")
+
+
+def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
+    files = {"wav_file": ("reference.wav", open(upload_file, "rb"))}
+    embeddings = requests.post(SERVER_URL + "/clone_speaker", files=files).json()
+    with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp:
+        json.dump(embeddings, fp)
+    cloned_speakers[clone_speaker_name] = embeddings
+    cloned_speaker_names.append(clone_speaker_name)
+    return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown.update(choices=cloned_speaker_names)
+
+def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang):
+    embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
+    generated_audio = requests.post(
+        SERVER_URL + "/tts",
+        json={
+            "text": text,
+            "language": lang,
+            "speaker_embedding": embeddings["speaker_embedding"],
+            "gpt_cond_latent": embeddings["gpt_cond_latent"]
+        }
+    ).content
+    generated_audio_path = os.path.join("demo_outputs", "generated_audios", next(tempfile._get_candidate_names()) + ".wav")
+    with open(generated_audio_path, "wb") as fp:
+        fp.write(base64.b64decode(generated_audio))
+    return fp.name
+
+with gr.Blocks() as demo:
+    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
+    with gr.Tab("TTS"):
+        with gr.Column() as row4:
+            with gr.Row() as col4:
+                speaker_name_studio = gr.Dropdown(
+                    label="Studio speaker",
+                    choices=STUDIO_SPEAKERS.keys(),
+                    value="Asya Anara" if "Asya Anara" in STUDIO_SPEAKERS.keys() else None,
+                )
+                speaker_name_custom = gr.Dropdown(
+                    label="Cloned speaker",
+                    choices=cloned_speaker_names.value,
+                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
+                )
+                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
+            with gr.Column() as col2:
+                lang = gr.Dropdown(label="Language", choices=LANGUAGES, value="en")
+                text = gr.Textbox(label="text", value="A quick brown fox jumps over the lazy dog.")
+                tts_button = gr.Button(value="TTS")
+            with gr.Column() as col3:
+                generated_audio = gr.Audio(label="Generated audio", autoplay=True)
+    with gr.Tab("Clone a new speaker"):
+        with gr.Column() as col1:
+            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
+            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
+            clone_button = gr.Button(value="Clone speaker")
+
+    clone_button.click(
+        fn=clone_speaker,
+        inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
+        outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
+    )
+
+    tts_button.click(
+        fn=tts,
+        inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang],
+        outputs=[generated_audio],
+    )
+
+if __name__ == "__main__":
+    print("Warming up server...")
+    with open("test/default_speaker.json", "r") as fp:
+        warmup_speaker = json.load(fp)
+    resp = requests.post(
+        SERVER_URL + "/tts",
+        json={
+            "text": "This is a warmup request.",
+            "language": "en",
+            "speaker_embedding": warmup_speaker["speaker_embedding"],
+            "gpt_cond_latent": warmup_speaker["gpt_cond_latent"],
+        }
+    )
+    resp.raise_for_status()
+    print("Starting the demo...")
+    demo.launch(
+        share=False,
+        debug=False,
+        server_port=3009,
+        server_name="0.0.0.0",
+    )
diff --git a/server/Dockerfile.cpu b/server/Dockerfile.cpu
new file mode 100644
index 0000000..57dc3a7
--- /dev/null
+++ b/server/Dockerfile.cpu
@@ -0,0 +1,20 @@
+FROM python:3.11.7
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \
+    apt-get clean && apt-get -y autoremove
+
+WORKDIR /app
+COPY requirements_cpu.txt .
+RUN python -m pip install --use-deprecated=legacy-resolver -r requirements_cpu.txt \
+    && python -m pip cache purge
+
+RUN python -m unidic download
+RUN mkdir -p /app/tts_models
+
+COPY main.py .
+ENV USE_CPU=1
+
+EXPOSE 80
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
diff --git a/server/requirements_cpu.txt b/server/requirements_cpu.txt
new file mode 100644
index 0000000..0bb7c39
--- /dev/null
+++ b/server/requirements_cpu.txt
@@ -0,0 +1,11 @@
+TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
+uvicorn[standard]==0.23.2
+fastapi==0.95.2
+pydantic==1.10.13
+python-multipart==0.0.6
+typing-extensions>=4.8.0
+numpy==1.24.3
+cutlet
+mecab-python3==1.0.6
+unidic-lite==1.0.8
+unidic==1.1.0
diff --git a/test/requirements.txt b/test/requirements.txt
index 2c24336..7a68a27 100644
--- a/test/requirements.txt
+++ b/test/requirements.txt
@@ -1 +1,2 @@
 requests==2.31.0
+gradio==3.50.2
diff --git a/test/test_streaming.py b/test/test_streaming.py
index 3e35409..8fe8dc5 100644
--- a/test/test_streaming.py
+++ b/test/test_streaming.py
@@ -38,11 +38,10 @@ def stream_ffplay(audio_stream, output_file, save=True):
         ffplay_proc.wait()


-def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]:
+def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
     start = time.perf_counter()
     speaker["text"] = text
     speaker["language"] = language
-    speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0
     speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
     res = requests.post(
         f"{server_url}/tts_stream",
@@ -86,7 +85,6 @@ def get_speaker(ref_audio,server_url):
         default="en",
         help="Language to use default is 'en' (English)"
     )
-
     parser.add_argument(
         "--output_file",
         default=None,
@@ -102,18 +100,11 @@ def get_speaker(ref_audio,server_url):
         default="http://localhost:8000",
         help="Server url http://localhost:8000 default, change to your server location "
     )
-    parser.add_argument(
-        "--decoder",
-        default="ne_hifigan",
-        help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan"
-    )
-
     parser.add_argument(
         "--stream_chunk_size",
         default="20",
         help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality"
     )
-
     args = parser.parse_args()

     with open("./default_speaker.json", "r") as file:
@@ -121,6 +112,16 @@ def get_speaker(ref_audio,server_url):

     if args.ref_file is not None:
         print("Computing the latents for a new reference...")
-        speaker = get_speaker(args.ref_file,args.server_url)
-
-    audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file))
+        speaker = get_speaker(args.ref_file, args.server_url)
+
+    audio = stream_ffplay(
+        tts(
+            args.text,
+            speaker,
+            args.language,
+            args.server_url,
+            args.stream_chunk_size
+        ),
+        args.output_file,
+        save=bool(args.output_file)
+    )
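For completeness, the non-streaming `/tts` route that `demo.py` calls can also be exercised without Gradio. The sketch below mirrors the request shape shown in `demo.py` above; it assumes the server is running on `localhost:8000`, that `test/default_speaker.json` is present, and that the response body is base64-encoded WAV data (as `demo.py` treats it). The file name is only illustrative:

```python
# tts_once.py -- minimal sketch of a single /tts request, mirroring demo.py
import base64
import json

import requests

SERVER_URL = "http://localhost:8000"

# Speaker embedding and GPT conditioning latent shipped with the test script.
with open("test/default_speaker.json", "r") as fp:
    speaker = json.load(fp)

resp = requests.post(
    SERVER_URL + "/tts",
    json={
        "text": "Hello from the XTTS streaming server.",
        "language": "en",
        "speaker_embedding": speaker["speaker_embedding"],
        "gpt_cond_latent": speaker["gpt_cond_latent"],
    },
)
resp.raise_for_status()

# demo.py base64-decodes the response body before writing a .wav file.
with open("output.wav", "wb") as fp:
    fp.write(base64.b64decode(resp.content))
print("Wrote output.wav")
```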