Skip to content

Commit 41d6185

Browse files
committed
update README
1 parent 3a729b9 commit 41d6185

File tree

3 files changed

+372
-4
lines changed

3 files changed

+372
-4
lines changed

README.md

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ developers to train custom multimodal large language model (MLLM), focusing on <
2828
6. [Citation](#citation)
2929

3030
# News
31-
- [Update Oct. 12, 2024] Recipes for [SLAM-AAC](examples/slam_aac/README.md) have been supported.
31+
- [Update Nov. 5, 2024] Recipes for [speech emotion captioning (SEC)](examples/sec_emotioncaps/README.md) with [emotion2vec](https://github.com/ddlBoJack/emotion2vec) as the encoder have been supported.
32+
- [Update Oct. 12, 2024] Recipes for [SLAM-AAC](examples/slam_aac/README.md) with [EAT](https://github.com/cwx-worst-one/EAT) as the encoder have been supported.
3233
- [Update Sep. 28, 2024] Recipes for [CoT-ST](examples/st_covost2/README.md) have been supported.
3334
- [Update Sep. 25, 2024] Recipes for [DRCap](examples/drcap_zeroshot_aac/README.md) have been supported.
3435
- [Update Jun. 12, 2024] Recipes for [MaLa-ASR](examples/mala_asr_slidespeech/README.md) have been supported.
@@ -90,6 +91,7 @@ We provide reference implementations of various LLM-based speech, audio, and mus
9091

9192
- Text-to-Speech (TTS)
9293
- [VALL-E-X](examples/vallex/README.md)
94+
- [Speech Emotion Captioning (SEC)](examples/sec_emotioncaps/README.md)
9395

9496
- **Audio Task**
9597
- [Automated Audio Captioning (AAC)](examples/aac_audiocaps/README.md)
@@ -118,7 +120,10 @@ command-line (shell file) > Hydra configuration (yaml file) > dataclass configur
118120
- We borrow code from [Fairseq](https://github.com/facebookresearch/fairseq) for deepspeed configuration.
119121
- We thank the contributors for providing diverse recipes.
120122

121-
## Citation
123+
# Citation
124+
125+
## Speech Task
126+
122127
SLAM-ASR:
123128
```
124129
@article{ma2024embarrassingly,
@@ -128,7 +133,27 @@ SLAM-ASR:
128133
year={2024}
129134
}
130135
```
136+
MaLa-ASR:
137+
```
138+
@article{yang2024mala,
139+
title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR},
140+
author={Yang, Guanrou and Ma, Ziyang and Yu, Fan and Gao, Zhifu and Zhang, Shiliang and Chen, Xie},
141+
journal={Proc. INTERSPEECH},
142+
year={2024}
143+
}
144+
```
145+
CoT-ST:
146+
```
147+
@article{du2024cot,
148+
title={CoT-ST: Enhancing LLM-based Speech Translation with Multimodal Chain-of-Thought},
149+
author={Du, Yexing and Ma, Ziyang and Yang, Yifan and Deng, Keqi and Chen, Xie and Yang, Bo and Xiang, Yang and Liu, Ming and Qin, Bing},
150+
journal={arXiv preprint arXiv:2409.19510},
151+
year={2024}
152+
}
153+
```
131154

155+
156+
## Audio Task
132157
SLAM-AAC:
133158
```
134159
@article{chen2024slam,
@@ -138,5 +163,21 @@ SLAM-AAC:
138163
year={2024}
139164
}
140165
```
141-
142-
166+
DRCap:
167+
```
168+
@article{li2024drcap,
169+
title={DRCap: Decoding CLAP Latents with Retrieval-augmented Generation for Zero-shot Audio Captioning},
170+
author={Li, Xiquan and Chen, Wenxi and Ma, Ziyang and Xu, Xuenan and Liang, Yuzhe and Zheng, Zhisheng and Kong, Qiuqiang and Chen, Xie},
171+
journal={arXiv preprint arXiv:2410.09472},
172+
year={2024}
173+
}
174+
```
175+
BAT:
176+
```
177+
@article{zheng2024bat,
178+
title={BAT: Learning to Reason about Spatial Sounds with Large Language Models},
179+
author={Zheng, Zhisheng and Peng, Puyuan and Ma, Ziyang and Chen, Xie and Choi, Eunsol and Harwath, David},
180+
journal={Proc. ICML},
181+
year={2024}
182+
}
183+
```
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
import re
2+
import json
3+
from zhon.hanzi import punctuation as zh_punctuation
4+
from opencc import OpenCC
5+
import string
6+
from tqdm import tqdm
7+
8+
# OpenCC converter used to turn Traditional Chinese into Simplified Chinese.
cc = OpenCC('t2s')

# Punctuation accepted in cleaned text: ASCII punctuation, Chinese punctuation,
# plus a few arithmetic and unit symbols.
all_punctuation = ''.join((string.punctuation, zh_punctuation, "×÷=", "°℃"))
13+
14+
# Regular expressions that flag content unsuitable for the cleaned corpus.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
email_pattern = re.compile(r'\S+@\S+\.\S+')
code_pattern = re.compile(r'`{1,3}.*?`{1,3}', flags=re.DOTALL)

# Code point ranges that together form the emoji character class.
_EMOJI_RANGES = (
    '\U0001F600-\U0001F64F',  # emoticons
    '\U0001F300-\U0001F5FF',  # symbols and pictographs
    '\U0001F680-\U0001F6FF',  # transport and map symbols
    '\U0001F1E0-\U0001F1FF',  # flags
    '\U0001F700-\U0001F77F',  # other pictographic symbols
    '\U0001F780-\U0001F7FF',  # more symbols
    '\U0001F800-\U0001F8FF',  # more symbols
    '\U0001F900-\U0001F9FF',  # more emoticons
    '\U0001FA00-\U0001FA6F',  # faces, masks, etc.
    '\U0001FA70-\U0001FAFF',  # newer symbols common on modern devices
    '\U00002702-\U000027B0',  # dingbat-style symbols (check marks, crosses)
)
emoji_pattern = re.compile('[' + ''.join(_EMOJI_RANGES) + ']', re.UNICODE)

# A run of "|...|" segments, i.e. markdown-style table rows.
table_pattern = re.compile(r'(\|.*\|(\n|\r\n)*)+')

# Anything that looks like "<name>.<known file extension>".
file_patten = re.compile(
    r'\S+\.(txt|pdf|docx|doc|xlsx|xls|csv|pptx|ppt|png|jpg|jpeg|gif|mp3|mp4|wav|avi|mov|mkv|flv|wmv|zip|rar|tar|gz|7z|iso|dmg|pkg)'
)

# Phoneme notation wrapped in slashes, such as /i/ or /ɪ/.
phonetic_pattern = re.compile(r'/[a-zA-Zɪɛæʌʊɔəʌɜɑɔr]+/')

# Six or more consecutive filler symbols, such as ______.
repeating_pattern = re.compile(r'[_\-*]{6,}')
42+
43+
# Mapping from common English abbreviations to their spoken-out expansions.
# Each key may appear only once: a dict literal silently keeps the last
# duplicate, which makes repeated entries misleading.
abbreviation_dict = {
    "Dr.": "Doctor",
    "e.g.": "for example",
    "i.e.": "that is",  # used to introduce a clarification
    "Mr.": "Mister",
    "Mrs.": "Misses",
    "St.": "Saint",  # place names or saints' titles
    "vs.": "versus",  # comparisons, e.g. matches or court cases
    "Prof.": "Professor",
    "Ave.": "Avenue",
    "Dept.": "Department",
    "etc.": "and so on",
    "Inc.": "Incorporated",  # common in company names
    "Ltd.": "Limited",  # limited company
    "Jr.": "Junior",
    "Sr.": "Senior",
    "approx.": "approximately",
    # NOTE: "min." is ambiguous (minute / minimum). A dict can hold only one
    # expansion per key; "minimum" is the one kept here.
    "min.": "minimum",
    "sec.": "second",
    "Fri.": "Friday",
    "Sat.": "Saturday",
    "Sun.": "Sunday",
    "Mon.": "Monday",
    "Tue.": "Tuesday",
    "Wed.": "Wednesday",
    "Thu.": "Thursday",
    "no.": "number",  # ordinal prefix
    "No.": "number",
    "Jan.": "January",
    "Feb.": "February",
    "Mar.": "March",
    "Apr.": "April",
    "Jun.": "June",
    "Jul.": "July",
    "Aug.": "August",
    "Sept.": "September",
    "Oct.": "October",
    "Nov.": "November",
    "Dec.": "December",
    "est.": "established",  # e.g. a company's founding year
    "max.": "maximum",
}


def normalize_abbreviations(text):
    """Expand every abbreviation listed in ``abbreviation_dict``.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with each abbreviation replaced by its expansion. An
        abbreviation is only replaced when it starts at a word boundary,
        so ``"XDr."`` is left untouched.
    """
    for abbr, full in abbreviation_dict.items():
        pattern = r'\b' + re.escape(abbr)
        # A trailing \b is only meaningful when the abbreviation ends in a
        # word character. After a final "." it would demand that a word
        # character follows, so "Dr. Smith" or a sentence-final "etc."
        # would never match.
        if abbr[-1:].isalnum() or abbr[-1:] == '_':
            pattern += r'\b'
        # A callable replacement keeps backslashes in `full` literal.
        text = re.sub(pattern, lambda _match: full, text)
    return text
93+
94+
# Words to strip from the text. Intentionally left empty here; add entries
# to this set to enable filtering.
sensitive_words = set()


def filter_inappropriate_content(text):
    """Return ``text`` with every entry of ``sensitive_words`` removed.

    This is a placeholder implementation: an external library or a custom
    filter could be plugged in here instead of plain substring removal.
    """
    cleaned = text
    for banned in sensitive_words:
        cleaned = cleaned.replace(banned, '')
    return cleaned
103+
104+
def process_text(text):
    """Normalize one piece of conversation text.

    The text goes through three stages, in order: Traditional-to-Simplified
    Chinese conversion, abbreviation expansion, and removal of inappropriate
    content.
    """
    simplified = cc.convert(text)
    expanded = normalize_abbreviations(simplified)
    return filter_inappropriate_content(expanded)
115+
116+
def is_valid_char(text):
    """Return True when every character of ``text`` is allowed.

    Allowed characters are CJK ideographs in U+4E00..U+9FFF, letters,
    digits, whitespace (including newlines) and anything listed in
    ``all_punctuation``. An empty string is valid.
    """
    for ch in text:
        if ch.isalpha() or ch.isdigit() or ch.isspace():
            continue
        if '\u4e00' <= ch <= '\u9fff':
            continue
        if ch == '\n' or ch in all_punctuation:
            continue
        # First character outside every allowed class: reject the text.
        return False
    return True
126+
127+
def is_valid_sentence(sentence):
    """Check ``sentence`` against every content filter.

    Returns:
        A ``(is_valid, reason)`` tuple. ``reason`` names the first filter
        that matched ("URL", "Email", "Code", "Emoji", "Table", "File",
        "Phonetic" or "Repeating"), or is "Valid" when none matched.
    """
    # Filters are tried in this order; the first hit decides the reason.
    checks = (
        (url_pattern, "URL"),
        (email_pattern, "Email"),
        (code_pattern, "Code"),
        (emoji_pattern, "Emoji"),
        (table_pattern, "Table"),
        (file_patten, "File"),
        (phonetic_pattern, "Phonetic"),
        (repeating_pattern, "Repeating"),
    )
    for pattern, reason in checks:
        if pattern.search(sentence):
            return False, reason
    return True, "Valid"
161+
162+
def suitable_for_tts(data, max_turn_num=10, max_text_length=200):
    """Tell whether a conversation record is short enough for TTS.

    Args:
        data: A record with a "conversations" list whose items carry a
            "value" string.
        max_turn_num: Maximum number of turns allowed.
        max_text_length: Maximum length allowed for any single turn.

    Returns:
        True when the record has at most ``max_turn_num`` turns and no turn
        is longer than ``max_text_length``; False otherwise.
    """
    turns = data["conversations"]
    if len(turns) > max_turn_num:
        return False
    return all(len(turn["value"]) <= max_text_length for turn in turns)
173+
174+
def _write_invalid_record(data, reason, invalidfile):
    """Tag ``data`` with ``reason`` and append it as one JSON line."""
    data["invalid_reason"] = reason
    json.dump(data, invalidfile, ensure_ascii=False)
    invalidfile.write('\n')


def _clean_record(data):
    """Validate and normalize every turn of one raw record.

    Returns:
        ``(processed_data, None)`` when all turns pass, or ``(None, reason)``
        naming the first check that failed.
    """
    processed_data = {
        "conversations": [],
        "id": data.get("id", "")
    }
    for conversation in data.get("conversations", []):
        text = conversation.get("value", "").strip()

        if not is_valid_char(text):
            return None, "Character"
        valid_sentence, reason = is_valid_sentence(text)
        if not valid_sentence:
            return None, reason

        processed_text = process_text(text)

        # Re-validate after normalization. is_valid_sentence returns a
        # (bool, reason) tuple, so it must be unpacked: negating the tuple
        # itself is always False and would let every text through.
        if not is_valid_char(processed_text):
            return None, "Character"
        valid_sentence, reason = is_valid_sentence(processed_text)
        if not valid_sentence:
            return None, reason

        processed_data["conversations"].append({
            "from": conversation.get("from", ""),
            "value": processed_text
        })
    return processed_data, None


def process_conversations(input_file, output_file, invalid_file, tts_file):
    """Clean a JSON-lines conversation corpus and split it into three files.

    The input is streamed line by line. Each record is validated and
    normalized, then routed as follows:

    * valid records go to ``output_file``;
    * valid records that also satisfy ``suitable_for_tts`` are additionally
      written to ``tts_file``;
    * rejected records are written unmodified to ``invalid_file``, with an
      added "invalid_reason" field.

    Summary counts are printed at the end.

    Args:
        input_file: Path of the raw JSON-lines file (UTF-8).
        output_file: Path for the cleaned records.
        invalid_file: Path for the rejected records.
        tts_file: Path for the cleaned records suitable for TTS.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    valid_num = 0
    invalid_num = 0
    tts_num = 0

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile, \
            open(invalid_file, 'w', encoding='utf-8') as invalidfile, \
            open(tts_file, 'w', encoding='utf-8') as ttsfile:

        for line in tqdm(infile):
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue
            data = json.loads(line)

            processed_data, reason = _clean_record(data)
            if reason is not None:
                invalid_num += 1
                _write_invalid_record(data, reason, invalidfile)
                continue

            valid_num += 1
            json.dump(processed_data, outfile, ensure_ascii=False)
            outfile.write('\n')

            if suitable_for_tts(processed_data, 10, 200):
                tts_num += 1
                json.dump(processed_data, ttsfile, ensure_ascii=False)
                ttsfile.write('\n')

    print(f"Total number of valid conversations: {valid_num}")
    print(f"Total number of invalid conversations: {invalid_num}")
    print(f"Total number of conversations suitable for TTS: {tts_num}")
243+
244+
245+
if __name__ == "__main__":
    import argparse

    # Every path can be overridden on the command line. The defaults are the
    # locations this script was originally run against.
    parser = argparse.ArgumentParser(
        description="Clean a JSON-lines conversation corpus."
    )
    parser.add_argument(
        '--input-file',
        default='/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN.json',
        help="raw conversation data (JSON lines)",
    )
    parser.add_argument(
        '--output-file',
        default='/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN_cleaned.json',
        help="where to write the cleaned conversations",
    )
    parser.add_argument(
        '--invalid-file',
        default='/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN_invalid.json',
        help="where to write the rejected conversations",
    )
    parser.add_argument(
        '--tts-file',
        default='/mnt/bn/dev-mzy/data/corpus/belle_raw/train_3.5M_CN_ready4cosy.json',
        help="where to write the conversations suitable for TTS",
    )
    args = parser.parse_args()

    process_conversations(
        args.input_file, args.output_file, args.invalid_file, args.tts_file
    )

0 commit comments

Comments
 (0)