diff --git a/.github/workflows/build-and-push-to-ghcr.yml b/.github/workflows/build-and-push-to-ghcr.yml
index 4a7bdca..3e6267f 100644
--- a/.github/workflows/build-and-push-to-ghcr.yml
+++ b/.github/workflows/build-and-push-to-ghcr.yml
@@ -4,7 +4,7 @@ on:
     branches: [main]
   pull_request:
 jobs:
-  build-and-push-to-ghcr-cuda117:
+  build-and-push-to-ghcr-cuda118:
     runs-on: ubuntu-22.04
     steps:
       -
@@ -27,7 +27,7 @@ jobs:
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"

-      - name: Build only for PR Cuda 11.7
+      - name: Build only for PR Cuda 11.8
        if: github.ref != 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
@@ -37,7 +37,7 @@ jobs:
          cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }}
          cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-${{ github.event.number }}

-      - name: Build and Push image Cuda 11.7
+      - name: Build and Push image Cuda 11.8
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
@@ -49,7 +49,7 @@ jobs:
          tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }}
          #build-args:

-  build-and-push-to-ghcr-cuda118:
+  build-and-push-to-ghcr-cuda121:
     runs-on: ubuntu-22.04
     steps:
       -
@@ -72,29 +72,28 @@ jobs:
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"

-      - name: Build only for PR cuda 11.8
+      - name: Build only for PR cuda 12.1
        if: github.ref != 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda118
+         file: Dockerfile.cuda121
          push: false # Do not push image for PR
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}

-      - name: Build and Push image cuda 11.8
+      - name: Build and Push image cuda 12.1
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda118
+         file: Dockerfile.cuda121
          push: true # Push if merged
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
-         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda118-${{ github.sha }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
+         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-${{ github.sha }}
          #build-args:
-
-  build-and-push-to-ghcr-cuda121:
+  build-and-push-to-ghcr-cpu:
     runs-on: ubuntu-22.04
     steps:
       -
@@ -117,24 +116,24 @@ jobs:
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"

-      - name: Build only for PR cuda 12.1
+      - name: Build only for PR CPU
        if: github.ref != 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda121
+         file: Dockerfile.cpu
          push: false # Do not push image for PR
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda121-${{ github.event.number }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }}
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cpu-${{ github.event.number }}

-      - name: Build and Push image cuda 12.1
+      - name: Build and Push image CPU
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v5
        with:
          context: "{{defaultContext}}:server"
-         file: Dockerfile.cuda121
+         file: Dockerfile.cpu
          push: true # Push if merged
-         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
-         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda121
-         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda121-${{ github.sha }}
+         cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu
+         cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cpu
+         tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu, ghcr.io/coqui-ai/xtts-streaming-server:main-cpu-${{ github.sha }}
          #build-args:
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..181fd98
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+demo_outputs
\ No newline at end of file
diff --git a/README.md b/README.md
index 886931d..31d7e73 100644
--- a/README.md
+++ b/README.md
@@ -1,53 +1,83 @@
 # XTTS streaming server

+*Warning: XTTS-streaming-server doesn't support concurrent streaming requests; it's a demo server, not meant for production.*

-## Running the server
+https://github.com/coqui-ai/xtts-streaming-server/assets/17219561/7220442a-e88a-4288-8a73-608c4b39d06c

-To run a pre-built container (CUDA 11.7):
+
+## 1) Run the server
+
+### Use a pre-built image
+
+CUDA 12.1:
+
+```bash
+$ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda121
+```
+
+CUDA 11.8 (for older cards):

 ```bash
 $ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
 ```

-CUDA 11.8 version (for newer cards, tested on 4060 and L4 instance)
+CPU (not recommended):
+
 ```bash
-$ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118
+$ docker run -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cpu
 ```

-If you have already downloaded v1.1 model and like to use this server, and using Ubuntu, change your /home/YOUR_USER_NAME
+Run with a fine-tuned model:
+
+Make sure the model folder `/path/to/model/folder` contains the following files:
+- `config.json`
+- `model.pth`
+- `vocab.json`
+
 ```bash
-$ docker run -v /home/YOUR_USER_NAME/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v1.1:/root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v1.1 --env NVIDIA_DISABLE_REQUIRE=1 --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118`
+$ docker run -v /path/to/model/folder:/app/tts_models --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 ghcr.io/coqui-ai/xtts-streaming-server:latest
 ```
+
 Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to
-the terms of the [CPML license](https://coqui.ai/cpml).
+the terms of the [CPML license](https://coqui.ai/cpml). (Fine-tuned XTTS models are also under the [CPML license](https://coqui.ai/cpml).)
+
+### Build the image yourself

-## Testing the server
+The default `Dockerfile` targets Pytorch 2.1 and CUDA 11.8.

-1. Generate audio with the test script:
+`DOCKERFILE` may be `Dockerfile`, `Dockerfile.cpu`, `Dockerfile.cuda121`, or your own custom Dockerfile.

 ```bash
-$ cd test
-$ python -m pip install -r requirements.txt
-$ python test_streaming.py
+$ git clone git@github.com:coqui-ai/xtts-streaming-server.git
+$ cd xtts-streaming-server/server
+$ docker build -t xtts-stream . -f DOCKERFILE
+$ docker run --gpus all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 xtts-stream
 ```

-## Building the container
+Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to
+the terms of the [CPML license](https://coqui.ai/cpml). (Fine-tuned XTTS models are also under the [CPML license](https://coqui.ai/cpml).)
+
+## 2) Test the running server
+
+Once your Docker container is running, you can check that it is working properly. Run the following commands from a fresh terminal.

-1. To build the Docker container (Pytorch 2.01 Cuda 11.7) :
+### Clone `xtts-streaming-server` if you haven't already

 ```bash
-$ cd server
-$ docker build -t xtts-stream .
+$ git clone git@github.com:coqui-ai/xtts-streaming-server.git
 ```
-For Pytorch 2.1 and CUDA 11.8 version (when running set NVIDIA_DISABLE_REQUIRE=1 if you have Cuda < 11.8 drivers)
+
+### Using the Gradio demo
+
 ```bash
-$ cd server
-# docker build -t xtts-stream . -f Dockerfile.cuda118
+$ cd xtts-streaming-server
+$ python -m pip install -r test/requirements.txt
+$ python demo.py
 ```
-2. Run the server container:
+
+### Using the test script

 ```bash
-$ docker run --gpus=all -e COQUI_TOS_AGREED=1 --rm -p 8000:80 xtts-stream
+$ cd xtts-streaming-server/test
+$ python -m pip install -r requirements.txt
+$ python test_streaming.py
 ```
-
-Setting the `COQUI_TOS_AGREED` environment variable to `1` indicates you have read and agreed to
-the terms of the [CPML license](https://coqui.ai/cpml).
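As a quick sanity check once a container is up, the metadata endpoints that `demo.py` (added below) queries at startup can also be hit directly. This is a minimal sketch, assuming the default `-p 8000:80` mapping from the README above; the file name is only illustrative:

```python
# check_server.py -- minimal sketch; assumes the container is running with -p 8000:80
import requests

SERVER_URL = "http://localhost:8000"

# /languages and /studio_speakers are the endpoints demo.py queries at startup.
languages = requests.get(SERVER_URL + "/languages").json()
studio_speakers = requests.get(SERVER_URL + "/studio_speakers").json()

print("Available languages:", ", ".join(languages))
print("Available studio speakers:", ", ".join(studio_speakers.keys()))
```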
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..2ed6251
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,121 @@
+import gradio as gr
+import requests
+import base64
+import tempfile
+import json
+import os
+
+
+SERVER_URL = 'http://localhost:8000'
+OUTPUT = "./demo_outputs"
+cloned_speakers = {}
+
+print("Preparing file structure...")
+if not os.path.exists(OUTPUT):
+    os.mkdir(OUTPUT)
+    os.mkdir(os.path.join(OUTPUT, "cloned_speakers"))
+    os.mkdir(os.path.join(OUTPUT, "generated_audios"))
+elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")):
+    print("Loading existing cloned speakers...")
+    for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")):
+        if file.endswith(".json"):
+            with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp:
+                cloned_speakers[file[:-5]] = json.load(fp)
+    print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))
+
+try:
+    print("Getting metadata from server ...")
+    LANGUAGES = requests.get(SERVER_URL + "/languages").json()
+    print("Available languages:", ", ".join(LANGUAGES))
+    STUDIO_SPEAKERS = requests.get(SERVER_URL + "/studio_speakers").json()
+    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
+except Exception:
+    raise Exception("Please make sure the server is running first.")
+
+
+def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
+    files = {"wav_file": ("reference.wav", open(upload_file, "rb"))}
+    embeddings = requests.post(SERVER_URL + "/clone_speaker", files=files).json()
+    with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp:
+        json.dump(embeddings, fp)
+    cloned_speakers[clone_speaker_name] = embeddings
+    cloned_speaker_names.append(clone_speaker_name)
+    return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown.update(choices=cloned_speaker_names)
+
+def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang):
+    embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]
+    generated_audio = requests.post(
+        SERVER_URL + "/tts",
+        json={
+            "text": text,
+            "language": lang,
+            "speaker_embedding": embeddings["speaker_embedding"],
+            "gpt_cond_latent": embeddings["gpt_cond_latent"]
+        }
+    ).content
+    generated_audio_path = os.path.join("demo_outputs", "generated_audios", next(tempfile._get_candidate_names()) + ".wav")
+    with open(generated_audio_path, "wb") as fp:
+        fp.write(base64.b64decode(generated_audio))
+    return fp.name
+
+with gr.Blocks() as demo:
+    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
+    with gr.Tab("TTS"):
+        with gr.Column() as row4:
+            with gr.Row() as col4:
+                speaker_name_studio = gr.Dropdown(
+                    label="Studio speaker",
+                    choices=STUDIO_SPEAKERS.keys(),
+                    value="Asya Anara" if "Asya Anara" in STUDIO_SPEAKERS.keys() else None,
+                )
+                speaker_name_custom = gr.Dropdown(
+                    label="Cloned speaker",
+                    choices=cloned_speaker_names.value,
+                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
+                )
+                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
+            with gr.Column() as col2:
+                lang = gr.Dropdown(label="Language", choices=LANGUAGES, value="en")
+                text = gr.Textbox(label="text", value="A quick brown fox jumps over the lazy dog.")
+                tts_button = gr.Button(value="TTS")
+            with gr.Column() as col3:
+                generated_audio = gr.Audio(label="Generated audio", autoplay=True)
+    with gr.Tab("Clone a new speaker"):
+        with gr.Column() as col1:
+            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
+            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
+            clone_button = gr.Button(value="Clone speaker")
+
+    clone_button.click(
+        fn=clone_speaker,
+        inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
+        outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
+    )
+
+    tts_button.click(
+        fn=tts,
+        inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang],
+        outputs=[generated_audio],
+    )
+
+if __name__ == "__main__":
+    print("Warming up server...")
+    with open("test/default_speaker.json", "r") as fp:
+        warmup_speaker = json.load(fp)
+    resp = requests.post(
+        SERVER_URL + "/tts",
+        json={
+            "text": "This is a warmup request.",
+            "language": "en",
+            "speaker_embedding": warmup_speaker["speaker_embedding"],
+            "gpt_cond_latent": warmup_speaker["gpt_cond_latent"],
+        }
+    )
+    resp.raise_for_status()
+    print("Starting the demo...")
+    demo.launch(
+        share=False,
+        debug=False,
+        server_port=3009,
+        server_name="0.0.0.0",
+    )
diff --git a/server/Dockerfile.cpu b/server/Dockerfile.cpu
new file mode 100644
index 0000000..57dc3a7
--- /dev/null
+++ b/server/Dockerfile.cpu
@@ -0,0 +1,20 @@
+FROM python:3.11.7
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \
+    apt-get clean && apt-get -y autoremove
+
+WORKDIR /app
+COPY requirements_cpu.txt .
+RUN python -m pip install --use-deprecated=legacy-resolver -r requirements_cpu.txt \
+    && python -m pip cache purge
+
+RUN python -m unidic download
+RUN mkdir -p /app/tts_models
+
+COPY main.py .
+ENV USE_CPU=1
+
+EXPOSE 80
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
diff --git a/server/requirements_cpu.txt b/server/requirements_cpu.txt
new file mode 100644
index 0000000..0bb7c39
--- /dev/null
+++ b/server/requirements_cpu.txt
@@ -0,0 +1,11 @@
+TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
+uvicorn[standard]==0.23.2
+fastapi==0.95.2
+pydantic==1.10.13
+python-multipart==0.0.6
+typing-extensions>=4.8.0
+numpy==1.24.3
+cutlet
+mecab-python3==1.0.6
+unidic-lite==1.0.8
+unidic==1.1.0
diff --git a/test/requirements.txt b/test/requirements.txt
index 2c24336..7a68a27 100644
--- a/test/requirements.txt
+++ b/test/requirements.txt
@@ -1 +1,2 @@
 requests==2.31.0
+gradio==3.50.2
diff --git a/test/test_streaming.py b/test/test_streaming.py
index 3e35409..8fe8dc5 100644
--- a/test/test_streaming.py
+++ b/test/test_streaming.py
@@ -38,11 +38,10 @@ def stream_ffplay(audio_stream, output_file, save=True):
         ffplay_proc.wait()


-def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]:
+def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
     start = time.perf_counter()
     speaker["text"] = text
     speaker["language"] = language
-    speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0
     speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
     res = requests.post(
         f"{server_url}/tts_stream",
@@ -86,7 +85,6 @@ def get_speaker(ref_audio,server_url):
         default="en",
         help="Language to use default is 'en' (English)"
     )
-
     parser.add_argument(
         "--output_file",
         default=None,
@@ -102,18 +100,11 @@ def get_speaker(ref_audio,server_url):
         default="http://localhost:8000",
         help="Server url http://localhost:8000 default, change to your server location "
     )
-    parser.add_argument(
-        "--decoder",
-        default="ne_hifigan",
-        help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan"
-    )
-
     parser.add_argument(
         "--stream_chunk_size",
         default="20",
         help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality"
     )
-
     args = parser.parse_args()

     with open("./default_speaker.json", "r") as file:
@@ -121,6 +112,16 @@ def get_speaker(ref_audio,server_url):

     if args.ref_file is not None:
         print("Computing the latents for a new reference...")
-        speaker = get_speaker(args.ref_file,args.server_url)
-
-    audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file))
+        speaker = get_speaker(args.ref_file, args.server_url)
+
+    audio = stream_ffplay(
+        tts(
+            args.text,
+            speaker,
+            args.language,
+            args.server_url,
+            args.stream_chunk_size
+        ),
+        args.output_file,
+        save=bool(args.output_file)
+    )
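For completeness, the non-streaming `/tts` route that `demo.py` calls can also be exercised without Gradio. The sketch below mirrors the request shape shown in `demo.py` above; it assumes the server is running on `localhost:8000`, that `test/default_speaker.json` is present, and that the response body is base64-encoded WAV data (as `demo.py` treats it). The file name is only illustrative:

```python
# tts_once.py -- minimal sketch of a single /tts request, mirroring demo.py
import base64
import json

import requests

SERVER_URL = "http://localhost:8000"

# Speaker embedding and GPT conditioning latent shipped with the test script.
with open("test/default_speaker.json", "r") as fp:
    speaker = json.load(fp)

resp = requests.post(
    SERVER_URL + "/tts",
    json={
        "text": "Hello from the XTTS streaming server.",
        "language": "en",
        "speaker_embedding": speaker["speaker_embedding"],
        "gpt_cond_latent": speaker["gpt_cond_latent"],
    },
)
resp.raise_for_status()

# demo.py base64-decodes the response body before writing a .wav file.
with open("output.wav", "wb") as fp:
    fp.write(base64.b64decode(resp.content))
print("Wrote output.wav")
```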