@@ -1031,6 +1031,7 @@ def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
10311031 labels_list = list (labels_queryset .values ())
10321032
10331033 labels_mapping = {}
1034+ gt_jobs = []
10341035
10351036 for label in labels_list :
10361037 labels_mapping [label ["id" ]] = label
@@ -1044,36 +1045,128 @@ def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
10441045 for attribute in attributes_list :
10451046 labels_mapping [label ["id" ]]["attributes" ][attribute ["id" ]] = attribute
10461047
1047- slogger .glob .debug ("JOB LABELS ATTRIBUTES" )
1048- slogger .glob .debug (json .dumps (attributes_list ))
1049-
1050-
1051- slogger .glob .debug ("JOB LABELS" )
1052- slogger .glob .debug (json .dumps (labels_list ))
1053-
1054- # audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav")
1055- # with wave.open(audio_file_path, 'wb') as wave_file:
1056- # wave_file.setnchannels(1)
1057- # wave_file.setsampwidth(4)
1058- # wave_file.setframerate(44100)
1059- # wave_file.writeframes(concat_array)
1060-
10611048 annotation_audio_chunk_file_paths = chunk_annotation_audio (concat_array , temp_dir , annotations )
10621049
1063- for i , annotation in enumerate (annotations ):
1064- entry = {
1065- "path" : os .path .basename (annotation_audio_chunk_file_paths [i ]),
1050+ # handle Gt jobs
1051+ if job_details .segment .type == "specific_frames" :
1052+ frames = job_details .segment .frames
1053+ start_frame = frames [0 ]
1054+ segment_size = job_details .segment .task .segment_size
1055+ overlap = job_details .segment .task .overlap
1056+
def generate_jobs(included_frames: list[int], segment_size: int, overlap: int) -> list[dict]:
    """Partition *included_frames* into ground-truth job frame ranges.

    Each emitted job spans ``segment_size`` frames, and consecutive jobs
    share ``overlap`` frames.  A shorter trailing job covers any remainder.

    Args:
        included_frames: frame numbers belonging to the GT segment
            (assumed sorted ascending — TODO confirm with caller).
        segment_size: number of frames per job.
        overlap: frames shared between consecutive jobs.

    Returns:
        A list of ``{"start_frame": int, "end_frame": int}`` dicts,
        both bounds inclusive; empty list for empty input.
    """
    if not included_frames:
        return []

    # Build the set once: O(1) membership tests instead of O(n) list
    # scans on every loop iteration.
    frame_set = set(included_frames)
    last_frame = included_frames[-1]

    jobs = []
    start_frame = included_frames[0]

    while start_frame + segment_size <= last_frame:
        end_frame = start_frame + segment_size - 1

        # Only emit a job when both boundaries are real frames.
        if start_frame in frame_set and end_frame in frame_set:
            jobs.append({
                "start_frame": start_frame,
                "end_frame": end_frame
            })

        # Next job starts `overlap` frames before the previous end.
        start_frame = end_frame - overlap + 1

        # Advance to the next frame that actually exists.
        while start_frame not in frame_set and start_frame <= last_frame:
            start_frame += 1

        if start_frame not in frame_set:
            break  # ran past the end without finding a valid start

    # Emit the remainder as a final (possibly short) job.
    if start_frame in frame_set and start_frame < last_frame:
        jobs.append({
            "start_frame": start_frame,
            "end_frame": last_frame
        })

    return jobs
1097+
1098+ gt_jobs = generate_jobs (included_frames = frames , segment_size = segment_size , overlap = overlap )
1099+ # fetch all jobs of this task
1100+ task_jobs = Job .objects .filter (segment__task__id = job_details .segment .task_id ).order_by ('id' )
1101+ start = 0
1102+ for job_index , job in enumerate (task_jobs ):
1103+ for i , gt_job in enumerate (gt_jobs ):
1104+ if job .segment .start_frame == gt_job ['start_frame' ] and job .segment .stop_frame == gt_job ['end_frame' ]:
1105+ diff_in_frame = gt_job ['end_frame' ] - gt_job ['start_frame' ] + 1
1106+ duration = int (((job_details .segment .task .audio_total_duration / job_details .segment .task .data .size ) * diff_in_frame )/ 1000 )
1107+ gt_jobs [i ]['job_index' ] = job_index
1108+ gt_jobs [i ]['start' ] = start
1109+ gt_jobs [i ]['end' ] = start + duration
1110+ start = start + duration
1111+ break
1112+
def process_annotations(annotations, gt_jobs, job_details, labels_mapping):
    """Build one export row per annotation (or per overlapped GT job).

    For "specific_frames" (ground-truth) segments, each annotation is
    mapped onto the GT job(s) whose time window it intersects and its
    start/end are re-expressed relative to that job.  For normal
    segments the annotation times are kept as-is.
    """
    rows = []
    is_gt_segment = job_details.segment.type == "specific_frames"

    for index, annotation in enumerate(annotations):
        clip_start = annotation["points"][0]
        clip_end = annotation["points"][3]

        if not is_gt_segment:
            # Normal job: times are already relative to this job.
            row = create_entry(annotation, job_details, labels_mapping, index)
            row['job_id'] = job_details.id
            row['start'] = clip_start
            row['end'] = clip_end
            add_attributes(row, annotation, labels_mapping)
            rows.append(row)
            continue

        # GT segment: every GT job whose window intersects the clip.
        hits = [g for g in gt_jobs
                if clip_end > g['start'] and clip_start < g['end']]

        if len(hits) > 1:
            # Clip spans several jobs: emit one row per job, clipped to
            # that job's window and shifted into its local timeline.
            for hit in hits:
                row = create_entry(annotation, job_details, labels_mapping, index)
                row['job_id'] = hit['job_index']
                row['start'] = max(0, clip_start - hit['start'])
                row['end'] = min(clip_end, hit['end']) - hit['start']
                add_attributes(row, annotation, labels_mapping)
                rows.append(row)
        else:
            row = create_entry(annotation, job_details, labels_mapping, index)
            # NOTE(review): if no GT job fully contains the clip, the row
            # is emitted without job_id/start/end keys — preserved as-is.
            for g in gt_jobs:
                if g['start'] <= clip_start and g['end'] >= clip_end:
                    row['job_id'] = g['job_index']
                    row['start'] = clip_start - g['start']
                    row['end'] = clip_end - g['start']
                    break
            add_attributes(row, annotation, labels_mapping)
            rows.append(row)

    return rows
1154+
def create_entry(annotation, job_details, labels_mapping, index):
    """Return the base metadata row for *annotation*.

    The chunked-audio filename comes from the enclosing scope's
    ``annotation_audio_chunk_file_paths`` list, indexed by *index*.
    """
    label = labels_mapping[annotation["label_id"]]
    chunk_path = annotation_audio_chunk_file_paths[index]
    return {
        "project_id": job_details.segment.task.project_id,
        "task_id": job_details.segment.task_id,
        "path": os.path.basename(chunk_path),
        "sentence": annotation.get("transcript", ""),
        "age": annotation.get("age", ""),
        "gender": annotation.get("gender", ""),
        "accents": annotation.get("accent", ""),
        "locale": annotation.get("locale", ""),
        "emotion": annotation.get("emotion", ""),
        "label": label["name"],
    }
10761168
1169+ def add_attributes (entry , annotation , labels_mapping ):
10771170 attributes = annotation .get ("attributes" , [])
10781171 for idx , attr in enumerate (attributes ):
10791172 annotation_attribute_id = attr .get ("spec_id" , "" )
@@ -1085,12 +1178,7 @@ def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
10851178 entry [f"attribute_{ idx + 1 } _name" ] = attribute_name
10861179 entry [f"attribute_{ idx + 1 } _value" ] = attribute_val
10871180
1088- final_data .append (entry )
1089-
1090- slogger .glob .debug ("JOB ANNOTATION DATA" )
1091- slogger .glob .debug (json .dumps (final_data ))
1092- slogger .glob .debug ("All ANNOTATIONs DATA" )
1093- slogger .glob .debug (json .dumps (annotations ))
1181+ final_data = process_annotations (annotations , gt_jobs , job_details , labels_mapping )
10941182 return final_data , annotation_audio_chunk_file_paths
10951183
10961184def convert_annotation_data_format (data , format_name ):
@@ -1247,7 +1335,8 @@ def export_audino_job(job_id, dst_file, format_name, server_url=None, save_image
12471335 df = pd .DataFrame (final_data )
12481336
12491337 # sorting by start column in ascending order
1250- df = df .sort_values (by = 'start' )
1338+ if 'job_id' in df .columns :
1339+ df = df .sort_values (by = 'job_id' )
12511340
12521341 # Saving the metadata file
12531342 meta_data_file_path = os .path .join (temp_dir_base , str (job_id ) + ".tsv" )
0 commit comments