
Commit 85191fb

update readme
1 parent 2b2f33f commit 85191fb

File tree

2 files changed (+23 additions, -5 deletions)


examples/s2s/README.md

Lines changed: 19 additions & 1 deletion
@@ -46,6 +46,7 @@ We also support JSONL format for its concise structure. Below is an example:
 We reproduced the single-stage fine-tuning results of SLAM-Omni with a group size of **3**. The following checkpoints are available for download:
 - [Single-Round Dialogue (English)](https://drive.google.com/drive/folders/1ZmM1h5ZTvS-piuN-msmctmZdi51GWLAu?usp=sharing): Trained on VoiceAssistant-400K.
 - [Multi-Round Dialogue (English)](https://drive.google.com/drive/folders/1xBNrqR2LWC0uEjezjx4aUgdsbstisboS?usp=sharing): Trained on VoiceAssistant-400K and UltraChat-300K.
+- [Multi-Round Dialogue (Chinese)](https://drive.google.com/drive/folders/1sExIp-UDdL37gb-mh9YlhuDIib0-wUVP?usp=sharing): Trained on Belle_1.4M.
 
 
 ## Training
@@ -114,4 +115,21 @@ bash ./examples/s2s/scripts/inference/mini-omni/inference_s2s_batch.sh
 
 ## Acknowledgement
 - We borrow some code from [Mini-Omni](https://github.com/gpt-omni/mini-omni) for SNAC-based modeling.
-- We borrow some code from [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) for the vocoder.
+- We borrow some code from [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) for the vocoder.
+
+## Citation
+<!-- ```bibtex
+
+``` -->
+
+```bibtex
+@article{xie2024mini,
+  title={Mini-omni: Language models can hear, talk while thinking in streaming},
+  author={Xie, Zhifei and Wu, Changqiao},
+  journal={arXiv preprint arXiv:2408.16725},
+  year={2024}
+}
+```
+
+## License
+Our code is released under MIT License. The Chinese dialogue model is licensed under GPL-3.0 due to its use of Belle data and is intended for research purposes only.

examples/s2s/scripts/inference/inference_s2s_online_multi-round.sh

Lines changed: 4 additions & 4 deletions
@@ -33,7 +33,7 @@ num_latency_tokens=0 # number of latency tokens (same as the numb
 do_layershift=false  # if false, tokens in each layers use the same codebook, otherwise, use different codebooks
 
 # load the backbone model
-ckpt_path=/valleblob/v-wenxichen/exp/s2s/en-mix/s2s_train_v4-Qwen2-0.5b-gpu4-btz3-lr1e-4-fp16-epochs10-whisper_small-latency0-group3-multiround-from_pretrained/s2s_epoch_2_step_23152
+ckpt_path=/valleblob/v-wenxichen/exp/s2s/zh-single/s2s_train_v4-Qwen2-0.5b-gpu4-btz3-lr1e-4-fp16-epochs10-whisper_small-latency0-group3-chinese-multiround-from_scratch/s2s_epoch_2_step_82467
 
 # model settings
 group_decode=true
@@ -56,9 +56,9 @@ output_text_only=false
 speech_sample_rate=22050 # 22050 for CosyVoice, 24000 for SNAC
 inference_online=true
 multi_round=true
-online_output_dir=/home/v-wenxichen/exp/cosyvoice/multi-round-en
-# audio_prompt_path=./examples/s2s/audio_prompt/zh/prompt_6.wav # replace this with your own audio prompt path or our provided audio prompt path
-audio_prompt_path=./examples/s2s/audio_prompt/en/prompt_6.wav # replace this with your own audio prompt path or our provided audio prompt path
+online_output_dir=/home/v-wenxichen/exp/cosyvoice/multi-round-zh
+audio_prompt_path=./examples/s2s/audio_prompt/zh/prompt_6.wav # replace this with your own audio prompt path or our provided audio prompt path
+# audio_prompt_path=./examples/s2s/audio_prompt/en/prompt_6.wav # replace this with your own audio prompt path or our provided audio prompt path
 
 decode_log=$ckpt_path/s2s_decode_${split}_trp${text_repetition_penalty}_arp${audio_repetition_penalty}_seed${dataset_sample_seed}_greedy
 if [ "$do_sample" = true ] ; then
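As a side note, the unchanged `decode_log` line at the bottom of this hunk builds the decode-log path by interpolating the script's decoding parameters, so each run's log name records its sampling settings. A minimal sketch of that interpolation, using hypothetical values (the real ones are set elsewhere in the script):

```shell
# Hypothetical values standing in for variables defined earlier in the script
ckpt_path=/tmp/ckpt
split=test
text_repetition_penalty=1.2
audio_repetition_penalty=1.2
dataset_sample_seed=42

# Same parameter expansion as the script's decode_log assignment
decode_log=$ckpt_path/s2s_decode_${split}_trp${text_repetition_penalty}_arp${audio_repetition_penalty}_seed${dataset_sample_seed}_greedy
echo "$decode_log"   # → /tmp/ckpt/s2s_decode_test_trp1.2_arp1.2_seed42_greedy
```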
