Skip to content

Commit 4aa296c

Browse files
committed
feat: support for mp3 format in audio processing and export
1 parent 2e28726 commit 4aa296c

File tree

3 files changed

+40
-29
lines changed

3 files changed

+40
-29
lines changed

cvat/apps/dataset_manager/task.py

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,34 +1015,35 @@ def create_gt_annotation_clips_zip(annotation_audio_chunk_file_paths, json_data,
10151015
shutil.move(zip_filename, dst_file)
10161016

10171017
def get_np_audio_array_from_job(job_id):
1018-
10191018
with transaction.atomic():
10201019
job = JobAnnotation(job_id)
10211020
job.init_from_db()
10221021

1023-
job_data_chunk_size = job.db_job.segment.task.data.chunk_size
1024-
task_dimension = job.db_job.segment.task.dimension
1025-
1026-
start = job.start_frame / job_data_chunk_size
1027-
stop = job.stop_frame / job_data_chunk_size
1028-
1029-
audio_array_buffer = []
1030-
for i in range(math.trunc(start), math.trunc(stop) + 1):
1031-
db_job = job.db_job
1032-
data_num = i
1033-
data_quality = 'compressed'
1034-
1035-
chunk_path = jobChunkPathGetter(job.db_job.segment.task.data, job.start_frame, job.stop_frame, task_dimension, data_quality, data_num, db_job)
1036-
1037-
audio = AudioSegment.from_file(chunk_path) # Handles both MP3 and WAV
1038-
audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
1039-
1040-
audio_array_buffer.append(audio_data)
1041-
1042-
# Concatenate all audio data into a single numpy array
1043-
concat_array = np.concatenate(audio_array_buffer, axis=0)
1044-
1045-
return concat_array
1022+
db_job = job.db_job
1023+
task_jobs = Job.objects.filter(segment__task__id=db_job.segment.task_id).order_by('id')
1024+
1025+
data_quality = 'compressed'
1026+
data_num = (
1027+
(len(task_jobs)-1) if (db_job.type == JobType.GROUND_TRUTH) # len(task_jobs) - 1 because of one GT job
1028+
else job_id - task_jobs.first().id
1029+
)
1030+
1031+
task_dimension = db_job.segment.task.dimension
1032+
chunk_path = jobChunkPathGetter(
1033+
db_job.segment.task.data,
1034+
job.start_frame,
1035+
job.stop_frame,
1036+
task_dimension,
1037+
data_quality,
1038+
data_num,
1039+
db_job
1040+
)
1041+
1042+
audio = AudioSegment.from_file(chunk_path) # Handles both MP3 and WAV
1043+
audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
1044+
audio_data_int16 = np.array(audio_data, dtype=np.int16)
1045+
1046+
return audio_data_int16
10461047

10471048
def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
10481049

cvat/apps/engine/background_operations.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,10 @@ def save_concatenated_gt_audio_chunks(time_stamps, upload_dir):
821821
num_jobs = len(time_stamps) // 2
822822
output_path = os.path.join(upload_dir, f"{num_jobs}.wav")
823823

824+
# Fall back to the .mp3 file when the .wav file does not exist
825+
if not os.path.exists(output_path):
826+
output_path = os.path.splitext(output_path)[0] + '.mp3'
827+
824828
if os.path.exists(output_path):
825829
os.remove(output_path)
826830

@@ -829,22 +833,26 @@ def save_concatenated_gt_audio_chunks(time_stamps, upload_dir):
829833
for job_index in range(num_jobs):
830834
input_wav_path = os.path.join(upload_dir, f"{job_index}.wav")
831835

836+
# Fall back to the .mp3 file when the .wav file does not exist
837+
if not os.path.exists(input_wav_path):
838+
input_wav_path = os.path.splitext(input_wav_path)[0] + '.mp3'
839+
832840
if not os.path.exists(input_wav_path):
833841
print(f"Warning: Input file {input_wav_path} not found, skipping job {job_index}.")
834842
continue
835843

836844
duration_ms = time_stamps[job_index * 2 + 1]
837845

838846
try:
839-
audio_segment = AudioSegment.from_wav(input_wav_path)
847+
audio_segment = AudioSegment.from_file(input_wav_path)
840848
audio_segment = audio_segment[:duration_ms]
841849
concatenated_audio += audio_segment
842850
except Exception as e:
843851
print(f"Error processing {input_wav_path}: {e}")
844852

845853
if len(concatenated_audio) > 0:
846854
try:
847-
concatenated_audio.export(output_path, format="wav")
855+
concatenated_audio.export(output_path, format="mp3")
848856
except Exception as e:
849857
print(f"Error writing {output_path}: {e}")
850858
else:

cvat/apps/engine/utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -462,15 +462,17 @@ def get_audio_segments(self, index, min_duration=0.5):
462462
self._update_model()
463463

464464
audio_file_path = f"{self.data_dir}/original/{index}.wav"
465+
if not os.path.exists(audio_file_path):
466+
audio_file_path = f"{self.data_dir}/original/{index}.mp3"
465467

466468
try:
467-
wav = self.read_audio(audio_file_path, sampling_rate=16000)
469+
audio_data = self.read_audio(audio_file_path, sampling_rate=16000)
468470
except FileNotFoundError:
469471
print(f"Audio file {audio_file_path} not found.")
470472
return []
471473

472-
speech_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=16000)
473-
total_duration = len(wav) / 16000
474+
speech_timestamps = self.get_speech_timestamps(audio_data, self.model, sampling_rate=16000)
475+
total_duration = len(audio_data) / 16000
474476

475477
if not speech_timestamps:
476478
return []

0 commit comments

Comments
 (0)