feat: added mp3 format instead of wav

rohan220217 · rohan220217 · commit 2e28726c0fdb · 2025-04-29T16:14:10.000+05:30
diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py
@@ -11,7 +11,6 @@
 import uuid
 import zipfile
 from pydub import AudioSegment
-from scipy.io import wavfile
 import numpy as np
 import os
 from collections import OrderedDict
@@ -920,9 +919,10 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_
             if data_quality == 'compressed' else FrameProvider.Quality.ORIGINAL
 
     path = os.path.realpath(frame_provider.get_chunk(number, quality))
-    # pylint: disable=superfluous-parens
 
-    # return {"start_chunk" : start_chunk, "stop_chunk" : stop_chunk}
+    # Chunks are now named .mp3; if that file does not exist, fall back to the .wav file with the same base name
+    if not os.path.exists(path):
+        path = os.path.splitext(path)[0] + '.wav'
 
     return path
 
@@ -1015,34 +1015,34 @@ def create_gt_annotation_clips_zip(annotation_audio_chunk_file_paths, json_data,
     shutil.move(zip_filename, dst_file)
 
 def get_np_audio_array_from_job(job_id):
+
     with transaction.atomic():
         job = JobAnnotation(job_id)
         job.init_from_db()
 
-    db_job = job.db_job
-    task_jobs = Job.objects.filter(segment__task__id=db_job.segment.task_id).order_by('id')
-
-    data_quality = 'compressed'
-    data_num = (
-        (len(task_jobs)-1) if (db_job.type == JobType.GROUND_TRUTH) # len(task_jobs) - 1 because of one GT job
-        else job_id - task_jobs.first().id
-    )
-
-    task_dimension = db_job.segment.task.dimension
-    chunk_path = jobChunkPathGetter(
-        db_job.segment.task.data,
-        job.start_frame,
-        job.stop_frame,
-        task_dimension,
-        data_quality,
-        data_num,
-        db_job
-    )
-
-    _, audio_data = wavfile.read(chunk_path)
-    audio_data_int16 = np.array(audio_data, dtype=np.int16)
-
-    return audio_data_int16
+    job_data_chunk_size = job.db_job.segment.task.data.chunk_size
+    task_dimension = job.db_job.segment.task.dimension
+
+    start = job.start_frame / job_data_chunk_size
+    stop = job.stop_frame / job_data_chunk_size
+
+    audio_array_buffer = []
+    for i in range(math.trunc(start), math.trunc(stop) + 1):
+        db_job = job.db_job
+        data_num = i
+        data_quality = 'compressed'
+
+        chunk_path = jobChunkPathGetter(job.db_job.segment.task.data, job.start_frame, job.stop_frame, task_dimension, data_quality, data_num, db_job)
+
+        audio = AudioSegment.from_file(chunk_path)  # Handles both MP3 and WAV
+        audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
+
+        audio_array_buffer.append(audio_data)
+
+    # Concatenate all audio data into a single numpy array
+    concat_array = np.concatenate(audio_array_buffer, axis=0)
+
+    return concat_array
 
 def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
 
diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
@@ -907,29 +907,33 @@ def _encode_images(images, container, stream):
             container.mux(packet)
 
 class AudioChunkWriter(IChunkWriter):
-    FORMAT = 'wav'
+    FORMAT = 'mp3'
 
     def __init__(self, quality=67):
         # translate inversed range [1:100] to [0:51]
         quality = round(51 * (100 - quality) / 99)
         super().__init__(quality)
         self.rate = 44100
 
-        codec = av.codec.Codec('pcm_s16le', 'w')
+        codec = av.codec.Codec('libmp3lame', 'w')  # Use MP3 codec (libmp3lame)
         self._codec_name = codec.name
         self._codec_opts = {
+            'bit_rate': '192000',  # Set a bitrate for MP3 (e.g., 192kbps)
+            'vbr': 'true',         # Variable Bitrate (optional, depending on your needs)
         }
 
     def _add_audio_stream(self, container, rate, options):
-
         audio_stream = container.add_stream(self._codec_name, rate=rate, layout="stereo")
-        # audio_stream.options = options
+
+        # additional codec options, like bitrate
+        for key, value in options.items():
+            audio_stream.options[key] = value
 
         return audio_stream
 
     def save_as_chunk(self, images, chunk_path):
         if not images:
-            raise Exception('no images to save')
+            raise Exception('no audios to save')
 
         with av.open(chunk_path, 'w', format=self.FORMAT) as output_container:
             output_v_stream = self._add_audio_stream(
@@ -942,8 +946,8 @@ def save_as_chunk(self, images, chunk_path):
         return [(0, 0)]
 
     @staticmethod
-    def _encode_audio_frames(images, container, stream):
-        for frame, _, _ in images:
+    def _encode_audio_frames(audios, container, stream):
+        for frame, _, _ in audios:
             # let libav set the correct pts and time_base
             frame.pts = None
             frame.time_base = None
diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py
@@ -275,7 +275,7 @@ def _get_chunk_name(chunk_number, chunk_type):
         elif chunk_type == DataChoice.IMAGESET:
             ext = 'zip'
         elif chunk_type == DataChoice.AUDIO:
-            ext = 'wav'
+            ext = 'mp3'
         else:
             ext = 'list'
 
diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py
@@ -1336,8 +1336,4 @@ def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]):
                 "success" : False,
                 "message" : "No match found."
             }
-            slogger.glob.error(response)
-
-        # f = open( '/home/vignesh/Desktop/Desktop/IIITD/BTP.02/cvat/cvat/apps/engine/chunks.txt', 'w' )
-        # f.write( 'dict = ' + repr(response) + '\n' )
-        # f.close()
+            slogger.glob.error(response)
diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py
@@ -717,6 +717,10 @@ def __call__(self, request, start: int, stop: int, db_data: Optional[Data]):
                 # Follow symbol links if the chunk is a link on a real image otherwise
                 # mimetype detection inside sendfile will work incorrectly.
                 path = os.path.realpath(frame_provider.get_chunk(self.number, self.quality))
+                # Chunks are now named .mp3; if that file does not exist, fall back to the .wav file with the same base name
+                if not os.path.exists(path):
+                    path = os.path.splitext(path)[0] + '.wav'
+
                 return sendfile(request, path)
             elif self.type == 'frame' or self.type == 'preview':
                 self._check_frame_range(self.number)
diff --git a/cvat/apps/notifications/views.py b/cvat/apps/notifications/views.py
@@ -238,7 +238,7 @@ def FetchUserNotifications(self, request: Request):
                             },
                             "error": None
                         },
-                        status = status.HTTP_400_BAD_REQUEST
+                        status = status.HTTP_200_OK
                     )
 
                 serialized_notifications = [UserNotificationDetailSerializer(noti_status.notification).data for noti_status in paginated_notifications]

| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | | `@@ -238,7 +238,7 @@ def FetchUserNotifications(self, request: Request):` |
| `238` | `238` | `},` |
| `239` | `239` | `"error": None` |
| `240` | `240` | `},` |
| `241` | | `- status = status.HTTP_400_BAD_REQUEST` |
| | `241` | `+ status = status.HTTP_200_OK` |
| `242` | `242` | `)` |
| `243` | `243` | |
| `244` | `244` | `serialized_notifications = [UserNotificationDetailSerializer(noti_status.notification).data for noti_status in paginated_notifications]` |