|
2 | 2 | #
3 | 3 | # SPDX-License-Identifier: MIT
4 | 4 |
|
| 5 | +import os |
5 | 6 | import textwrap |
| 7 | +import requests |
| 8 | +import json |
| 9 | +import tempfile |
| 10 | +import shutil |
| 11 | +import zipfile |
| 12 | +import io |
6 | 13 | import random |
7 | 14 |
|
| 15 | +from collections import namedtuple, Counter, defaultdict |
| 16 | +from django.http import FileResponse |
8 | 17 | from django.db import transaction |
9 | 18 | from django.db.models import Q, QuerySet |
10 | 19 | from django.http import HttpResponse |
|
16 | 25 | extend_schema_view, |
17 | 26 | ) |
18 | 27 | from rest_framework import mixins, status, viewsets |
19 | | -from rest_framework.decorators import action |
| 28 | +from rest_framework.permissions import AllowAny |
| 29 | +from rest_framework.decorators import action, api_view, permission_classes |
20 | 30 | from rest_framework.exceptions import NotFound, ValidationError |
21 | 31 | from rest_framework.response import Response |
22 | 32 |
|
42 | 52 | QualityReportCreateSerializer, |
43 | 53 | QualityReportSerializer, |
44 | 54 | QualitySettingsSerializer, |
| 55 | + ConsensusCreateSerializer, |
45 | 56 | ) |
46 | 57 | from rest_framework.permissions import IsAuthenticated |
47 | 58 |
|
@@ -584,6 +595,217 @@ def calculate_score(gt_samples, ds_samples, start_time=0): |
584 | 595 | except Exception as e: |
585 | 596 | raise ValidationError(f"An internal server error occurred: {str(e)}") |
586 | 597 |
|
| 598 | + @extend_schema( |
 | 599 | +        summary='Build a consensus result from multiple annotation ZIP files and return it as a ZIP bundle.',
| 600 | + description=textwrap.dedent( |
| 601 | + """ |
 | 602 | +            Accepts references to multiple ZIP files containing JSON annotations, downloads them, and builds a consensus result.
 | 603 | +            Returns a ZIP bundle containing segment-level ('consensus.json') and audio-level ('audio_consensus.json') consensus files.
 | 604 | +            Each source ZIP file is expected to contain a single JSON file with annotations.
 | 605 | +            The 'files' key in the request body must hold a list of file objects, each with 'bucket_name', 'chain_id', 'escrow_address', and 'file_name' fields, from which the download URL is constructed.
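 |  | +
 |  | +            Example request body (illustrative values):
 |  | +            {"files": [{"bucket_name": "my-bucket", "chain_id": 80002,
 |  | +                        "escrow_address": "0x1234...abcd", "file_name": "annotations.zip"}]}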
| 606 | + """ |
| 607 | + ), |
| 608 | + request=ConsensusCreateSerializer, |
| 609 | + responses={ |
| 610 | + "200": OpenApiResponse( |
 | 611 | +                response={"type": "string", "format": "binary"},
 | 612 | +                description="ZIP bundle containing 'consensus.json' and 'audio_consensus.json' built from the provided files."
| 613 | + ), |
| 614 | + "400": OpenApiResponse(description="Bad Request: Missing or invalid data."), |
| 615 | + }, |
| 616 | + ) |
| 617 | + @action(detail=False, methods=['POST'], url_path='consensus-reports', permission_classes=[IsAuthenticated]) |
| 618 | + def process_json_from_url(self, request, *args, **kwargs): |
 | 619 | +        """Process multiple ZIP files containing JSON annotations and return the consensus results as a ZIP bundle"""
| 620 | + files = request.data.get('files', []) |
| 621 | + if not files or not isinstance(files, list): |
 | 622 | +            return Response({"error": "'files' must be a non-empty list"}, status=status.HTTP_400_BAD_REQUEST)
| 623 | + |
| 624 | + temp_dir = None |
| 625 | + try: |
| 626 | + temp_dir = tempfile.mkdtemp() |
| 627 | + datasets = [] |
| 628 | + |
| 629 | + def download_and_extract_json(url, temp_dir, file_prefix): |
 | 630 | +                """Downloads a ZIP file from a URL, extracts it, and returns the parsed JSON content."""
 | 631 | +                response = requests.get(url, stream=True, timeout=60)
| 632 | + response.raise_for_status() |
| 633 | + |
| 634 | + zip_path = os.path.join(temp_dir, f"{file_prefix}.zip") |
| 635 | + with open(zip_path, 'wb') as f: |
| 636 | + for chunk in response.iter_content(chunk_size=8192): |
| 637 | + if chunk: |
| 638 | + f.write(chunk) |
| 639 | + |
| 640 | + extract_dir = os.path.join(temp_dir, f"extract_{file_prefix}") |
| 641 | + os.makedirs(extract_dir, exist_ok=True) |
| 642 | + |
| 643 | + with zipfile.ZipFile(zip_path, 'r') as zip_ref: |
| 644 | + zip_ref.extractall(extract_dir) |
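 |  | +                # NOTE: extractall() does not sanitize member paths (zip-slip); this is assumed
 |  | +                # acceptable here only because the source buckets are trusted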
| 645 | + |
| 646 | + json_files = [f for f in os.listdir(extract_dir) if f.endswith('.json')] |
| 647 | + if not json_files: |
| 648 | + raise ValueError(f"No JSON file found in ZIP from {url}") |
| 649 | + |
| 650 | + json_path = os.path.join(extract_dir, json_files[0]) |
| 651 | + with open(json_path, 'r', encoding='utf-8') as f: |
| 652 | + return json.load(f) |
| 653 | + |
| 654 | + def build_consensus(raw_labels): |
| 655 | + """ |
| 656 | + Builds consensus from a list of raw labels |
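 |  | +                e.g. ['Positive_Mild', 'Positive_Strong', 'Negative_Mild'] yields consensus
 |  | +                emotion ['Positive'] and intensity ['Mild']; ties keep every top label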
| 657 | + """ |
| 658 | + emotion_counts = Counter() |
| 659 | + intensity_counts = Counter() |
| 660 | + total = 0 |
| 661 | + |
| 662 | + def split_label(label): |
| 663 | + """ |
| 664 | + Splits a label like 'Positive_Mild' into emotion and intensity |
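 |  | +                    e.g. 'Positive_Mild' -> ('Positive', 'Mild'); "Can't Predict" -> (None, None)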
| 665 | + """ |
| 666 | + if label == "Can't Predict": |
| 667 | + return None, None |
| 668 | + parts = label.split("_", 1) |
| 669 | + if len(parts) == 2: |
| 670 | + return parts[0], parts[1] |
| 671 | + return label, None |
| 672 | + |
| 673 | + for lbl in raw_labels: |
| 674 | + emotion, intensity = split_label(lbl) |
| 675 | + if emotion is None: # Can't Predict |
| 676 | + continue |
| 677 | + emotion_counts[emotion] += 1 |
| 678 | + if intensity: |
| 679 | + intensity_counts[intensity] += 1 |
| 680 | + total += 1 |
| 681 | + |
| 682 | + result = { |
| 683 | + "raw_labels": raw_labels, |
| 684 | + "combined_label_emotion": [], |
| 685 | + "combined_label_intensity": [], |
| 686 | + "consensus_label_emotion": [], |
| 687 | + "consensus_label_intensity": [] |
| 688 | + } |
| 689 | + |
| 690 | + if total == 0: # all were Can't Predict |
| 691 | + result["combined_label_emotion"] = [{"label": "Can't Predict", "confidence": 1.0}] |
| 692 | + result["combined_label_intensity"] = [] |
| 693 | + result["consensus_label_emotion"] = ["Can't Predict"] |
| 694 | + result["consensus_label_intensity"] = [] |
| 695 | + return result |
| 696 | + |
| 697 | + # Combined distributions |
| 698 | + for emo, count in emotion_counts.items(): |
| 699 | + result["combined_label_emotion"].append({ |
| 700 | + "label": emo, |
| 701 | + "confidence": count / total |
| 702 | + }) |
| 703 | + |
| 704 | + for inten, count in intensity_counts.items(): |
| 705 | + result["combined_label_intensity"].append({ |
| 706 | + "label": inten, |
| 707 | + "confidence": count / total |
| 708 | + }) |
| 709 | + |
| 710 | + # Consensus = majority or tie |
| 711 | + if emotion_counts: |
| 712 | + max_emotion = max(emotion_counts.values()) |
| 713 | + result["consensus_label_emotion"] = [emo for emo, c in emotion_counts.items() if c == max_emotion] |
| 714 | + |
| 715 | + if intensity_counts: |
| 716 | + max_intensity = max(intensity_counts.values()) |
| 717 | + result["consensus_label_intensity"] = [inten for inten, c in intensity_counts.items() if c == max_intensity] |
| 718 | + |
| 719 | + return result |
| 720 | + |
| 721 | + def apply_consensus_logic(datasets): |
| 722 | + """ |
| 723 | + Applies consensus logic to multiple datasets |
| 724 | + """ |
| 725 | + if not datasets: |
| 726 | + return [] |
| 727 | + |
| 728 | + # Build consensus for each annotation across datasets |
| 729 | + consensus = [] |
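 |  | +                # NOTE: zip(*datasets) assumes every dataset lists the same segments in the
 |  | +                # same order; if lengths differ, unmatched trailing segments are dropped silently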
| 730 | + for items in zip(*datasets): |
| 731 | + raw_labels = [item["label"] for item in items] |
| 732 | + |
| 733 | + consensus.append({ |
| 734 | + "audio_file": items[0]["audio_file"], |
| 735 | + "start": items[0]["start"], |
| 736 | + "end": items[0]["end"], |
| 737 | + **build_consensus(raw_labels) |
| 738 | + }) |
| 739 | + |
| 740 | + return consensus |
| 741 | + |
| 742 | + def apply_consensus_logic_audio(consensus): |
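 |  | +                """
 |  | +                Aggregates segment-level consensus labels into a single consensus per audio file
 |  | +                """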
| 743 | + audio_groups = defaultdict(lambda: {"emotions": [], "intensities": []}) |
| 744 | + |
| 745 | + for ann in consensus: |
| 746 | + audio_file = ann["audio_file"] |
| 747 | + audio_groups[audio_file]["emotions"].extend(ann["consensus_label_emotion"]) |
| 748 | + audio_groups[audio_file]["intensities"].extend(ann["consensus_label_intensity"]) |
| 749 | + |
| 750 | + audio_consensus = [] |
| 751 | + for audio_file, vals in audio_groups.items(): |
| 752 | + emotion_result = build_consensus([f"{e}_Mild" for e in vals["emotions"]]) # dummy intensity to reuse logic |
| 753 | + intensity_result = build_consensus([f"Positive_{i}" for i in vals["intensities"]]) # dummy emotion to reuse logic |
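 |  | +                    # NOTE: a segment-level "Can't Predict" consensus survives here as the emotion
 |  | +                    # "Can't Predict": the dummy '_Mild' suffix keeps split_label() from filtering it out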
| 754 | + |
| 755 | + audio_consensus.append({ |
| 756 | + "audio_file": audio_file, |
| 757 | + "combined_label_emotion": emotion_result["combined_label_emotion"], |
| 758 | + "combined_label_intensity": intensity_result["combined_label_intensity"], |
| 759 | + "consensus_label_emotion": emotion_result["consensus_label_emotion"], |
| 760 | + "consensus_label_intensity": intensity_result["consensus_label_intensity"] |
| 761 | + }) |
| 762 | + return audio_consensus |
| 763 | + |
| 764 | + for index, file_obj in enumerate(files): |
| 765 | + # Extract required fields |
| 766 | + required_fields = ['bucket_name', 'chain_id', 'escrow_address', 'file_name'] |
 | 767 | +                missing = [field for field in required_fields if not file_obj.get(field)]
 | 768 | +                if missing:
 |  | +                    return Response({"error": f"Missing required fields: {', '.join(missing)}"}, status=status.HTTP_400_BAD_REQUEST)
| 769 | + |
| 770 | + # Build URL and download JSON |
| 771 | + url = f"https://{file_obj['bucket_name']}.s3.ap-south-1.amazonaws.com/{file_obj['escrow_address']}%40{file_obj['chain_id']}/{file_obj['file_name']}" |
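 |  | +                # NOTE: the S3 region (ap-south-1) is hard-coded; '%40' is the URL-encoded '@'
 |  | +                # separating the escrow address from the chain id in the object key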
| 772 | + json_data = download_and_extract_json(url, temp_dir, f"file{index+1}") |
| 773 | + datasets.append(json_data) |
| 774 | + |
| 775 | + if not datasets: |
| 776 | + return Response({"error": "No data found"}, status=status.HTTP_400_BAD_REQUEST) |
| 777 | + |
| 778 | + # Process and return result |
| 779 | + consensus_data = apply_consensus_logic(datasets) |
| 780 | + audio_consensus_data = apply_consensus_logic_audio(consensus_data) |
| 789 | + |
| 790 | + # Create in-memory ZIP file |
| 791 | + zip_buffer = io.BytesIO() |
| 792 | + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf: |
| 793 | + zipf.writestr("consensus.json", json.dumps(consensus_data, ensure_ascii=False)) |
| 794 | + zipf.writestr("audio_consensus.json", json.dumps(audio_consensus_data, ensure_ascii=False)) |
| 795 | + |
| 796 | + zip_buffer.seek(0) # Go to the start of the BytesIO buffer |
| 797 | + |
| 798 | + # Create response |
| 799 | + response = HttpResponse(zip_buffer, content_type='application/zip') |
| 800 | + response['Content-Disposition'] = 'attachment; filename=consensus_bundle.zip' |
| 801 | + return response |
| 802 | + |
| 803 | + except Exception as e: |
| 804 | + return Response({"error": str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR) |
| 805 | + finally: |
| 806 | + if temp_dir and os.path.exists(temp_dir): |
| 807 | + shutil.rmtree(temp_dir) |
| 808 | + |
587 | 809 | @extend_schema( |
588 | 810 | operation_id="quality_retrieve_report_data", |
589 | 811 | summary="Get quality report contents", |