Skip to content

Commit 2e28726

Browse files
committed
feat: added mp3 format instead of wav
1 parent b60bb92 commit 2e28726

File tree

6 files changed

+45
-41
lines changed

6 files changed

+45
-41
lines changed

cvat/apps/dataset_manager/task.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import uuid
1212
import zipfile
1313
from pydub import AudioSegment
14-
from scipy.io import wavfile
1514
import numpy as np
1615
import os
1716
from collections import OrderedDict
@@ -920,9 +919,10 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_
920919
if data_quality == 'compressed' else FrameProvider.Quality.ORIGINAL
921920

922921
path = os.path.realpath(frame_provider.get_chunk(number, quality))
923-
# pylint: disable=superfluous-parens
924922

925-
# return {"start_chunk" : start_chunk, "stop_chunk" : stop_chunk}
923+
# If no file exists at the resolved chunk path, fall back to the same path with a .wav extension
924+
if not os.path.exists(path):
925+
path = os.path.splitext(path)[0] + '.wav'
926926

927927
return path
928928

@@ -1015,34 +1015,34 @@ def create_gt_annotation_clips_zip(annotation_audio_chunk_file_paths, json_data,
10151015
shutil.move(zip_filename, dst_file)
10161016

10171017
def get_np_audio_array_from_job(job_id):
1018+
10181019
with transaction.atomic():
10191020
job = JobAnnotation(job_id)
10201021
job.init_from_db()
10211022

1022-
db_job = job.db_job
1023-
task_jobs = Job.objects.filter(segment__task__id=db_job.segment.task_id).order_by('id')
1024-
1025-
data_quality = 'compressed'
1026-
data_num = (
1027-
(len(task_jobs)-1) if (db_job.type == JobType.GROUND_TRUTH) # len(task_jobs) - 1 because of one GT job
1028-
else job_id - task_jobs.first().id
1029-
)
1030-
1031-
task_dimension = db_job.segment.task.dimension
1032-
chunk_path = jobChunkPathGetter(
1033-
db_job.segment.task.data,
1034-
job.start_frame,
1035-
job.stop_frame,
1036-
task_dimension,
1037-
data_quality,
1038-
data_num,
1039-
db_job
1040-
)
1041-
1042-
_, audio_data = wavfile.read(chunk_path)
1043-
audio_data_int16 = np.array(audio_data, dtype=np.int16)
1044-
1045-
return audio_data_int16
1023+
job_data_chunk_size = job.db_job.segment.task.data.chunk_size
1024+
task_dimension = job.db_job.segment.task.dimension
1025+
1026+
start = job.start_frame / job_data_chunk_size
1027+
stop = job.stop_frame / job_data_chunk_size
1028+
1029+
audio_array_buffer = []
1030+
for i in range(math.trunc(start), math.trunc(stop) + 1):
1031+
db_job = job.db_job
1032+
data_num = i
1033+
data_quality = 'compressed'
1034+
1035+
chunk_path = jobChunkPathGetter(job.db_job.segment.task.data, job.start_frame, job.stop_frame, task_dimension, data_quality, data_num, db_job)
1036+
1037+
audio = AudioSegment.from_file(chunk_path) # Handles both MP3 and WAV
1038+
audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)
1039+
1040+
audio_array_buffer.append(audio_data)
1041+
1042+
# Concatenate all audio data into a single numpy array
1043+
concat_array = np.concatenate(audio_array_buffer, axis=0)
1044+
1045+
return concat_array
10461046

10471047
def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
10481048

cvat/apps/engine/media_extractors.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -907,29 +907,33 @@ def _encode_images(images, container, stream):
907907
container.mux(packet)
908908

909909
class AudioChunkWriter(IChunkWriter):
910-
FORMAT = 'wav'
910+
FORMAT = 'mp3'
911911

912912
def __init__(self, quality=67):
913913
# translate the inverted range [1:100] to [0:51]
914914
quality = round(51 * (100 - quality) / 99)
915915
super().__init__(quality)
916916
self.rate = 44100
917917

918-
codec = av.codec.Codec('pcm_s16le', 'w')
918+
codec = av.codec.Codec('libmp3lame', 'w') # Use MP3 codec (libmp3lame)
919919
self._codec_name = codec.name
920920
self._codec_opts = {
921+
'bit_rate': '192000', # Set a bitrate for MP3 (e.g., 192kbps)
922+
'vbr': 'true', # Variable Bitrate (optional, depending on your needs)
921923
}
922924

923925
def _add_audio_stream(self, container, rate, options):
924-
925926
audio_stream = container.add_stream(self._codec_name, rate=rate, layout="stereo")
926-
# audio_stream.options = options
927+
928+
# additional codec options, like bitrate
929+
for key, value in options.items():
930+
audio_stream.options[key] = value
927931

928932
return audio_stream
929933

930934
def save_as_chunk(self, images, chunk_path):
931935
if not images:
932-
raise Exception('no images to save')
936+
raise Exception('no audios to save')
933937

934938
with av.open(chunk_path, 'w', format=self.FORMAT) as output_container:
935939
output_v_stream = self._add_audio_stream(
@@ -942,8 +946,8 @@ def save_as_chunk(self, images, chunk_path):
942946
return [(0, 0)]
943947

944948
@staticmethod
945-
def _encode_audio_frames(images, container, stream):
946-
for frame, _, _ in images:
949+
def _encode_audio_frames(audios, container, stream):
950+
for frame, _, _ in audios:
947951
# let libav set the correct pts and time_base
948952
frame.pts = None
949953
frame.time_base = None

cvat/apps/engine/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ def _get_chunk_name(chunk_number, chunk_type):
275275
elif chunk_type == DataChoice.IMAGESET:
276276
ext = 'zip'
277277
elif chunk_type == DataChoice.AUDIO:
278-
ext = 'wav'
278+
ext = 'mp3'
279279
else:
280280
ext = 'list'
281281

cvat/apps/engine/task.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1336,8 +1336,4 @@ def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]):
13361336
"success" : False,
13371337
"message" : "No match found."
13381338
}
1339-
slogger.glob.error(response)
1340-
1341-
# f = open( '/home/vignesh/Desktop/Desktop/IIITD/BTP.02/cvat/cvat/apps/engine/chunks.txt', 'w' )
1342-
# f.write( 'dict = ' + repr(response) + '\n' )
1343-
# f.close()
1339+
slogger.glob.error(response)

cvat/apps/engine/views.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,10 @@ def __call__(self, request, start: int, stop: int, db_data: Optional[Data]):
717717
# Follow symbolic links if the chunk is a link to a real image; otherwise
718718
# mimetype detection inside sendfile will work incorrectly.
719719
path = os.path.realpath(frame_provider.get_chunk(self.number, self.quality))
720+
# If no file exists at the resolved chunk path, fall back to the same path with a .wav extension
721+
if not os.path.exists(path):
722+
path = os.path.splitext(path)[0] + '.wav'
723+
720724
return sendfile(request, path)
721725
elif self.type == 'frame' or self.type == 'preview':
722726
self._check_frame_range(self.number)

cvat/apps/notifications/views.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ def FetchUserNotifications(self, request: Request):
238238
},
239239
"error": None
240240
},
241-
status = status.HTTP_400_BAD_REQUEST
241+
status = status.HTTP_200_OK
242242
)
243243

244244
serialized_notifications = [UserNotificationDetailSerializer(noti_status.notification).data for noti_status in paginated_notifications]

0 commit comments

Comments
 (0)