Skip to content

Commit eb538d5

Browse files
committed
Fix: task export, data upload status, overlap
1 parent 7ff1faa commit eb538d5

File tree

5 files changed

+71
-64
lines changed

5 files changed

+71
-64
lines changed

cvat/apps/dataset_manager/task.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pydub import AudioSegment
1414
import numpy as np
1515
import os
16-
from collections import OrderedDict
16+
from collections import OrderedDict, defaultdict
1717
from copy import deepcopy
1818
from enum import Enum
1919
from tempfile import TemporaryDirectory
@@ -1359,6 +1359,30 @@ def export_audino_task(task_id, dst_file, format_name, server_url=None, save_ima
13591359
task_jobs = task.db_jobs
13601360
audio_files_mapping = sorted(os.listdir(os.path.join(task.db_task.data.get_data_dirname(), "raw")), key=str.lower)
13611361

1362+
job_duration_without_overlap: list[float] = []
1363+
for audio_idx in range(len(task.db_task.audio_total_duration)):
1364+
duration_ms = task.db_task.audio_total_duration[audio_idx]
1365+
frame_count = task.db_task.total_frames_count[audio_idx]
1366+
if duration_ms > 0 and frame_count > 0:
1367+
frames_per_ms = frame_count / duration_ms
1368+
overlap_ms = task.db_task.overlap / frames_per_ms if frames_per_ms > 0 else 0
1369+
effective_segment_duration = task.db_task.segment_duration - overlap_ms
1370+
job_duration_without_overlap.append(effective_segment_duration / 1000.0)
1371+
else:
1372+
job_duration_without_overlap.append(0.0)
1373+
1374+
job_start_time_mapping: dict[int, float] = {}
1375+
audio_job_counters: dict[int, int] = defaultdict(int)
1376+
for job in task_jobs:
1377+
job_id = job.id
1378+
audio_index = job.segment.audio_file_index
1379+
if audio_index is not None and audio_index < len(job_duration_without_overlap):
1380+
duration_without_overlap = job_duration_without_overlap[audio_index]
1381+
job_start_time_mapping[job_id] = audio_job_counters[audio_index] * duration_without_overlap
1382+
audio_job_counters[audio_index] += 1
1383+
else:
1384+
job_start_time_mapping[job_id] = 0.0
1385+
13621386
final_task_data = []
13631387
final_annotation_chunk_paths = []
13641388
with TemporaryDirectory(dir=temp_dir_base) as temp_dir:
@@ -1373,6 +1397,16 @@ def export_audino_task(task_id, dst_file, format_name, server_url=None, save_ima
13731397
# Convert the data into a format
13741398
final_data = convert_annotation_data_format(job, final_data, format_name)
13751399

1400+
# Modify start and end for each annotation to make time stamps absolute
1401+
for annotation in final_data:
1402+
job_id = annotation.get("job_id")
1403+
if job_id is not None:
1404+
job_time_offset = job_start_time_mapping.get(job_id)
1405+
if "start" in annotation:
1406+
annotation["start"] += job_time_offset
1407+
if "end" in annotation:
1408+
annotation["end"] += job_time_offset
1409+
13761410
final_task_data.append(final_data)
13771411
final_annotation_chunk_paths.append(annotation_audio_chunk_file_paths)
13781412

cvat/apps/engine/background_operations.py

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -793,54 +793,38 @@ def create_gt_job_task(task_id, validated_data):
793793
job_segments[index] = used_segments
794794

795795

796-
def save_concatenated_gt_audio_chunks(time_stamps, upload_dir):
796+
def save_concatenated_gt_audio_chunks(time_stamps: list, output_directory: str):
797797
"""
798-
Concatenates audio chunks from multiple WAV files in the same directory based on time_stamps into a single WAV file, saving it in the same directory.
799-
800-
Args:
801-
time_stamps: A list where odd indices contain durations for each job (in milliseconds).
802-
upload_dir: The directory containing the input WAV files (0.wav, 1.wav, etc.) and where the output will be saved.
798+
Efficiently concatenates MP3 audio chunks into a single MP3, minimizing RAM usage
799+
by appending incrementally. Assumes input files are named '{job_index}.mp3'.
803800
"""
801+
number_of_jobs = len(time_stamps) // 2
802+
output_file_path = os.path.join(output_directory, f"{number_of_jobs}.mp3")
804803

805-
num_jobs = len(time_stamps) // 2
806-
output_path = os.path.join(upload_dir, f"{num_jobs}.wav")
807-
808-
# Check if the file with .mp3 exist or .wav
809-
if not os.path.exists(output_path):
810-
output_path = os.path.splitext(output_path)[0] + '.mp3'
811-
812-
if os.path.exists(output_path):
813-
os.remove(output_path)
814-
815-
concatenated_audio = AudioSegment.empty()
816-
817-
for job_index in range(num_jobs):
818-
input_wav_path = os.path.join(upload_dir, f"{job_index}.wav")
819-
820-
# Check if the file with .mp3 exist or .wav
821-
if not os.path.exists(input_wav_path):
822-
input_wav_path = os.path.splitext(input_wav_path)[0] + '.mp3'
804+
if os.path.exists(output_file_path):
805+
os.remove(output_file_path)
823806

824-
if not os.path.exists(input_wav_path):
825-
print(f"Warning: Input file {input_wav_path} not found, skipping job {job_index}.")
807+
output_file_created = False
808+
for job_index in range(number_of_jobs):
809+
input_file_path = os.path.join(output_directory, f"{job_index}.mp3")
810+
if not os.path.exists(input_file_path):
826811
continue
827812

828813
duration_ms = time_stamps[job_index * 2 + 1]
829-
830814
try:
831-
audio_segment = AudioSegment.from_file(input_wav_path)
832-
audio_segment = audio_segment[:duration_ms]
833-
concatenated_audio += audio_segment
815+
audio_segment = AudioSegment.from_file(input_file_path)[:duration_ms]
816+
if not output_file_created:
817+
audio_segment.export(output_file_path, format="mp3")
818+
output_file_created = True
819+
else:
820+
existing_audio = AudioSegment.from_file(output_file_path)
821+
combined_audio = existing_audio + audio_segment
822+
combined_audio.export(output_file_path, format="mp3")
834823
except Exception as e:
835-
print(f"Error processing {input_wav_path}: {e}")
824+
print(f"Error processing {input_file_path}: {e}")
836825

837-
if len(concatenated_audio) > 0:
838-
try:
839-
concatenated_audio.export(output_path, format="mp3")
840-
except Exception as e:
841-
print(f"Error writing {output_path}: {e}")
842-
else:
843-
print("Warning: No valid audio data to concatenate.")
826+
if not output_file_created:
827+
print("Warning: No valid audio segments found for concatenation.")
844828

845829
save_concatenated_gt_audio_chunks(time_stamps, task.data.get_compressed_cache_dirname())
846830
else:

cvat/apps/engine/media_extractors.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -508,17 +508,6 @@ def _has_frame(self, i):
508508

509509
return False
510510

511-
def get_total_frames(self):
512-
total_frame = 0
513-
with self._get_av_container() as container:
514-
stream = container.streams.audio[0]
515-
stream.thread_type = 'AUTO'
516-
for packet in container.demux(stream):
517-
for _ in packet.decode():
518-
total_frame += 1
519-
520-
return total_frame
521-
522511
def get_file_encoding(self, file_path):
523512

524513
with open(file_path, 'rb') as f:
@@ -539,8 +528,8 @@ def __iter__(self):
539528
if self._has_frame(frame_num - 1):
540529
yield (image, self._source_path[0], image.pts)
541530

542-
def get_progress(self, pos):
543-
duration = self._get_duration()
531+
def get_progress(self, pos, audio_idx=0):
532+
duration = self._get_duration(audio_idx)
544533
return pos / duration if duration else None
545534

546535
def _get_av_container(self, file_path_idx = 0):
@@ -553,8 +542,8 @@ def _get_av_container(self, file_path_idx = 0):
553542
else:
554543
return av.open(self._source_path[file_path_idx])
555544

556-
def _get_duration(self):
557-
with self._get_av_container() as container:
545+
def _get_duration(self, audio_idx=0):
546+
with self._get_av_container(audio_idx) as container:
558547
stream = container.streams.audio[0]
559548
duration = None
560549
if stream.duration:

cvat/apps/engine/task.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,7 @@ def get_audio_duration_and_frame_count(file_path):
10701070
db_task.audio_total_duration.append(duration_milliseconds)
10711071
db_task.total_frames_count.append(total_frame)
10721072

1073-
db_task.overlap = db_task.overlap if db_task.overlap is not None else 0 # we want to hardcode overlap for audio
1073+
db_task.overlap = db_task.overlap if db_task.overlap else 191 # we want to hardcode overlap for audio (191 frames approx 5 sec)
10741074

10751075
# Default chunk size = entire frames
10761076
db_data.chunk_size = db_task.segment_size #db_task.data.size
@@ -1243,11 +1243,11 @@ def generate_chunks_with_overlap(
12431243
chunk.append((image, file_path, image.pts))
12441244

12451245
if frame_num == frame_count:
1246-
yield chunk_idx, chunk
1246+
yield chunk_idx, idx, chunk
12471247
chunk_idx += 1
12481248
chunk = []
12491249
elif len(chunk) == segment_size:
1250-
yield chunk_idx, chunk[:segment_size]
1250+
yield chunk_idx, idx, chunk[:segment_size]
12511251
chunk_idx += 1
12521252
chunk = chunk[segment_size - overlap:]
12531253

@@ -1264,7 +1264,7 @@ def generate_chunks_with_overlap(
12641264
db_task.overlap,
12651265
db_task.total_jobs_count
12661266
)
1267-
generator = ((idx, list(chunk_data)) for idx, chunk_data in generator)
1267+
generator = ((idx, audio_idx, list(chunk_data)) for idx, audio_idx, chunk_data in generator)
12681268

12691269
def save_chunks(
12701270
executor: concurrent.futures.ThreadPoolExecutor,
@@ -1296,7 +1296,7 @@ def save_chunks(
12961296
# (path, frame, size)
12971297
return list((i[0][1], i[0][2], i[1]) for i in zip(chunk_data, image_sizes))
12981298

1299-
def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]):
1299+
def process_results(audio_idx: int, img_meta: list[tuple[str, int, tuple[int, int]]]):
13001300
nonlocal db_images, db_data, video_path, video_size
13011301

13021302
if db_task.mode == 'annotation':
@@ -1312,7 +1312,7 @@ def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]):
13121312
video_size = img_meta[0][2]
13131313
video_path = img_meta[0][0]
13141314

1315-
progress = extractor.get_progress(img_meta[-1][1])
1315+
progress = extractor.get_progress(img_meta[-1][1], audio_idx)
13161316
update_progress(progress)
13171317

13181318
futures = queue.Queue(maxsize=settings.CVAT_CONCURRENT_CHUNK_PROCESSING)
@@ -1322,14 +1322,14 @@ def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]):
13221322
for frames in db_task.total_frames_count:
13231323
db_data.size += frames
13241324

1325-
for chunk_idx, chunk_data in generator:
1325+
for chunk_idx, audio_idx, chunk_data in generator:
13261326
slogger.glob.debug("Chunk {} with {} frames".format(chunk_idx, len(chunk_data)))
13271327
if futures.full():
1328-
process_results(futures.get().result())
1328+
process_results(audio_idx, futures.get().result())
13291329
futures.put(executor.submit(save_chunks, executor, chunk_idx, chunk_data))
13301330

13311331
while not futures.empty():
1332-
process_results(futures.get().result())
1332+
process_results(audio_idx, futures.get().result())
13331333

13341334
if db_task.mode == 'annotation':
13351335
models.Image.objects.bulk_create(db_images)

cvat/apps/quality_control/quality_reports.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2208,7 +2208,7 @@ def _match_annotations(self, ds_annotations, gt_annotations):
22082208
gt_annotations = [
22092209
gt_ann
22102210
for gt_ann in gt_annotations
2211-
if job_start_time <= gt_ann["points"][0] and gt_ann["points"][3] <= job_end_time
2211+
if (job_start_time - 1.5) <= gt_ann["points"][0] and gt_ann["points"][3] <= (job_end_time + 1.5)
22122212
]
22132213

22142214
ds_annotations = [

0 commit comments

Comments (0)