feat: support for mp3 format in audio processing and export

rohan220217 · rohan220217 · commit 4aa296c36c33 · 2025-05-01T18:35:35.000+05:30
diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py
@@ -1015,34 +1015,35 @@ def create_gt_annotation_clips_zip(annotation_audio_chunk_file_paths, json_data,
     shutil.move(zip_filename, dst_file)
 
 def get_np_audio_array_from_job(job_id):
-
     with transaction.atomic():
         job = JobAnnotation(job_id)
         job.init_from_db()
 
-    job_data_chunk_size = job.db_job.segment.task.data.chunk_size
-    task_dimension = job.db_job.segment.task.dimension
-
-    start = job.start_frame / job_data_chunk_size
-    stop = job.stop_frame / job_data_chunk_size
-
-    audio_array_buffer = []
-    for i in range(math.trunc(start), math.trunc(stop) + 1):
-        db_job = job.db_job
-        data_num = i
-        data_quality = 'compressed'
-
-        chunk_path = jobChunkPathGetter(job.db_job.segment.task.data, job.start_frame, job.stop_frame, task_dimension, data_quality, data_num, db_job)
-
-        audio = AudioSegment.from_file(chunk_path)  # Handles both MP3 and WAV
-        audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
-
-        audio_array_buffer.append(audio_data)
-
-    # Concatenate all audio data into a single numpy array
-    concat_array = np.concatenate(audio_array_buffer, axis=0)
-
-    return concat_array
+    db_job = job.db_job
+    task_jobs = Job.objects.filter(segment__task__id=db_job.segment.task_id).order_by('id')
+
+    data_quality = 'compressed'
+    data_num = (
+        (len(task_jobs)-1) if (db_job.type == JobType.GROUND_TRUTH) # len(task_jobs) - 1 because of one GT job
+        else job_id - task_jobs.first().id
+    )
+
+    task_dimension = db_job.segment.task.dimension
+    chunk_path = jobChunkPathGetter(
+        db_job.segment.task.data,
+        job.start_frame,
+        job.stop_frame,
+        task_dimension,
+        data_quality,
+        data_num,
+        db_job
+    )
+
+    audio = AudioSegment.from_file(chunk_path) # Handles both MP3 and WAV
+    audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
+    audio_data_int16 = np.array(audio_data, dtype=np.int16)
+
+    return audio_data_int16
 
 def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
 
diff --git a/cvat/apps/engine/background_operations.py b/cvat/apps/engine/background_operations.py
@@ -821,6 +821,10 @@ def save_concatenated_gt_audio_chunks(time_stamps, upload_dir):
                             num_jobs = len(time_stamps) // 2
                             output_path = os.path.join(upload_dir, f"{num_jobs}.wav")
 
+                            # Use the .wav path if that file exists; otherwise fall back to the .mp3 path
+                            if not os.path.exists(output_path):
+                                output_path = os.path.splitext(output_path)[0] + '.mp3'
+
                             if os.path.exists(output_path):
                                 os.remove(output_path)
 
@@ -829,22 +833,26 @@ def save_concatenated_gt_audio_chunks(time_stamps, upload_dir):
                             for job_index in range(num_jobs):
                                 input_wav_path = os.path.join(upload_dir, f"{job_index}.wav")
 
+                                # Use the .wav chunk if it exists; otherwise fall back to the .mp3 chunk
+                                if not os.path.exists(input_wav_path):
+                                    input_wav_path = os.path.splitext(input_wav_path)[0] + '.mp3'
+
                                 if not os.path.exists(input_wav_path):
                                     print(f"Warning: Input file {input_wav_path} not found, skipping job {job_index}.")
                                     continue
 
                                 duration_ms = time_stamps[job_index * 2 + 1]
 
                                 try:
-                                    audio_segment = AudioSegment.from_wav(input_wav_path)
+                                    audio_segment = AudioSegment.from_file(input_wav_path)
                                     audio_segment = audio_segment[:duration_ms]
                                     concatenated_audio += audio_segment
                                 except Exception as e:
                                     print(f"Error processing {input_wav_path}: {e}")
 
                             if len(concatenated_audio) > 0:
                                 try:
-                                    concatenated_audio.export(output_path, format="wav")
+                                    concatenated_audio.export(output_path, format="mp3")
                                 except Exception as e:
                                     print(f"Error writing {output_path}: {e}")
                             else:
diff --git a/cvat/apps/engine/utils.py b/cvat/apps/engine/utils.py
@@ -462,15 +462,17 @@ def get_audio_segments(self, index, min_duration=0.5):
             self._update_model()
 
         audio_file_path = f"{self.data_dir}/original/{index}.wav"
+        if not os.path.exists(audio_file_path):
+            audio_file_path = f"{self.data_dir}/original/{index}.mp3"
 
         try:
-            wav = self.read_audio(audio_file_path, sampling_rate=16000)
+            audio_data = self.read_audio(audio_file_path, sampling_rate=16000)
         except FileNotFoundError:
             print(f"Audio file {audio_file_path} not found.")
             return []
 
-        speech_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=16000)
-        total_duration = len(wav) / 16000
+        speech_timestamps = self.get_speech_timestamps(audio_data, self.model, sampling_rate=16000)
+        total_duration = len(audio_data) / 16000
 
         if not speech_timestamps:
             return []