83 commits
ea482d5
added a feature to remove user both as worskpace member and workspace…
ishangujarathi Apr 19, 2024
8219ef7
disabling user from daily emails when user is marked inactive at orga…
ishangujarathi Apr 23, 2024
7b05c4d
stopping_task_pr_1
ch20b063 Apr 25, 2024
e8c8edc
fixed the bug regarding status not getting changed in the Active stat…
ishangujarathi Apr 29, 2024
74b55b9
Merge branch 'dev' into stopping_task2
ishvindersethi22 Apr 30, 2024
3faec98
updated the email for user analytics report
Pursottam6003 May 2, 2024
99254eb
Merge branch 'dev' of https://github.com/ai4Bharat/Shoonya-Backend in…
Pursottam6003 May 2, 2024
5f44c3f
updated the black linting
Pursottam6003 May 2, 2024
2b6d819
deleting and resuming task
ch20b063 May 3, 2024
7caee97
Merge branch 'stopping_task2' of https://github.com/AI4Bharat/Shoonya…
ch20b063 May 3, 2024
76ae455
Merge branch 'dev' into stopping_task2
KunalTiwary May 6, 2024
37dab45
updated the email template for the backend code
Pursottam6003 May 6, 2024
42966df
changed the time formatting
Pursottam6003 May 6, 2024
bf70ea0
Merge branch 'dev' of https://github.com/ai4Bharat/Shoonya-Backend in…
Pursottam6003 May 6, 2024
d64f55b
Merge pull request #1062 from AI4Bharat/remove_user_workspaces
aparna-aa May 7, 2024
40a96be
added a feature to remove user from frozen users list when user is ma…
ishangujarathi May 8, 2024
7cd1089
Merge pull request #1074 from AI4Bharat/user_marked_active
aparna-aa May 11, 2024
f0d05bc
updated the changes for new project creation
Pursottam6003 May 13, 2024
ef047a7
made changes to create project
Pursottam6003 May 18, 2024
5bd41d8
Merge branch 'dev' of https://github.com/AI4Bharat/Shoonya-Backend in…
KunalTiwary May 21, 2024
bcd73eb
Merge branch 'new_proj_creation' of https://github.com/AI4Bharat/Shoo…
KunalTiwary May 21, 2024
cf8c386
added changes for StandardizedTranscriptionEditing project type
KunalTiwary May 27, 2024
9dfdb52
black linting
KunalTiwary May 27, 2024
3b93f2d
black changes
KunalTiwary May 27, 2024
86967b6
modified the download endpoint
KunalTiwary May 27, 2024
3216c3e
removed commented lines
KunalTiwary May 27, 2024
8926ff3
export changes
KunalTiwary May 30, 2024
19386b2
modified the flow
KunalTiwary Jun 4, 2024
a91547f
added total duration
KunalTiwary Jun 5, 2024
ba7a4f6
changes for transcribed_json
KunalTiwary Jun 6, 2024
c8a3072
added ac enabled stage
KunalTiwary Jun 6, 2024
bef8b14
minor fix
KunalTiwary Jun 6, 2024
0b4d126
Merge branch 'dev' into invitation_email_formatting
ishvindersethi22 Jun 7, 2024
0c10aae
Merge pull request #1068 from AI4Bharat/invitation_email_formatting
ishvindersethi22 Jun 7, 2024
57f53dd
updated the code for creating new logging functionality
Pursottam6003 Jun 18, 2024
f00db18
Merge branch 'dev' of https://github.com/AI4Bharat/Shoonya-Backend in…
KunalTiwary Jun 19, 2024
57778db
Initial changes
KunalTiwary Jun 19, 2024
fe34f7b
Merge branch 'dev' into StandardizedTranscriptionEditing
KunalTiwary Jun 20, 2024
7784f38
Merge pull request #1082 from AI4Bharat/StandardizedTranscriptionEditing
ishvindersethi22 Jun 20, 2024
cf0b1c9
Merge branch 'dev' of https://github.com/AI4Bharat/Shoonya-Backend in…
KunalTiwary Jun 20, 2024
8528e8b
Merge branch 'dev' of https://github.com/AI4Bharat/Shoonya-Backend in…
KunalTiwary Jun 20, 2024
45d5746
added changes to download and export
KunalTiwary Jun 21, 2024
c37a807
ressolve the blocking api issue part 2
Pursottam6003 Jun 22, 2024
a8de12c
formatted the code with black
Pursottam6003 Jun 22, 2024
c648508
Merge branch 'dev' into new-logging-branch
Pursottam6003 Jun 22, 2024
c8d27fd
ressolved the logging error
Pursottam6003 Jun 22, 2024
19b1aba
updated the code for backend to create a new log file with respect to…
Pursottam6003 Jun 24, 2024
fec9f3f
small_fix
KunalTiwary Jun 27, 2024
f056eb0
added duplicate annotation message
KunalTiwary Jun 27, 2024
e2b47a7
Merge pull request #1088 from AI4Bharat/duplicate_fix
ishvindersethi22 Jun 28, 2024
7dae3f2
Merge branch 'dev' into OCRSegmentCategorisationRelationMappingEditing
ishvindersethi22 Jun 28, 2024
6649625
Merge pull request #1087 from AI4Bharat/OCRSegmentCategorisationRelat…
ishvindersethi22 Jun 28, 2024
3215774
Merge branch 'dev' into logging-functionality
ishvindersethi22 Jun 28, 2024
0e5007b
Merge pull request #1084 from AI4Bharat/logging-functionality
ishvindersethi22 Jun 28, 2024
959e404
Merge branch 'dev' into new-logging-branch
ishvindersethi22 Jun 28, 2024
dfb5a93
Merge pull request #1085 from AI4Bharat/new-logging-branch
ishvindersethi22 Jun 28, 2024
759e0da
celery fix
KunalTiwary Jun 28, 2024
45cc3a9
Merge pull request #1089 from AI4Bharat/celery_fix
ishvindersethi22 Jul 1, 2024
5ccc43c
changes for setting back parent result to revised/rejected task result
kartikvirendrar Jul 8, 2024
4bc843c
changes for setting back parent result to revised/rejected task result
kartikvirendrar Jul 8, 2024
807b3c5
Merge pull request #1094 from AI4Bharat/revised-rejected
aparna-aa Jul 9, 2024
38a2133
substituted lang_choices in constants
KunalTiwary Jul 9, 2024
5d51fcb
Merge pull request #1095 from AI4Bharat/lang_fix
aparna-aa Jul 9, 2024
ec10755
Merge branch 'dev' into master
KunalTiwary Jul 26, 2024
c6ed2fe
Merge pull request #1099 from AI4Bharat/master
aparna-aa Jul 26, 2024
da3bb7d
added filtering for datasets
KunalTiwary Aug 21, 2024
4e83bcd
Merge pull request #1109 from AI4Bharat/dataset_filtering
ishvindersethi22 Aug 21, 2024
c103054
sup_cumulative_tasks_count
KunalTiwary Sep 6, 2024
f851b15
Update .env.example
ishvindersethi22 Sep 19, 2024
1726670
Merge branch 'master' into cumulative_fix
KunalTiwary Sep 20, 2024
64a9b1b
Merge pull request #1111 from AI4Bharat/cumulative_fix
ishvindersethi22 Sep 23, 2024
bac533f
added count fix
KunalTiwary Sep 23, 2024
36ae80b
Merge pull request #1115 from AI4Bharat/sup_cumulative_changes
ishvindersethi22 Sep 23, 2024
5504122
Merge pull request #1112 from AI4Bharat/env-update
ishvindersethi22 Sep 23, 2024
5a55d01
added changes for azure keys
KunalTiwary Sep 23, 2024
037cc20
Merge branch 'master' into azure__key_fix
ishvindersethi22 Sep 23, 2024
7ea175c
Merge pull request #1116 from AI4Bharat/azure__key_fix
ishvindersethi22 Sep 23, 2024
8443826
resolved key error
KunalTiwary Sep 23, 2024
fb6b0c4
Merge branch 'master' into azure_key_fix_master
ishvindersethi22 Sep 23, 2024
87c0ffd
Merge pull request #1117 from AI4Bharat/azure_key_fix_master
ishvindersethi22 Sep 23, 2024
748a6ef
minor_fix in acoustic_enabled_stage
KunalTiwary Oct 25, 2024
6227184
Merge pull request #1124 from AI4Bharat/minor_fix_ac_en_stage
aparna-aa Oct 25, 2024
f33852b
bulk_add_members_to_projects
munishmangla98 Jun 3, 2025
56 changes: 44 additions & 12 deletions .env.example
@@ -1,21 +1,53 @@
 SECRET_KEY='<-- YOUR SECRET KEY HERE -->'
 
-DB_NAME='postgres' # Insert your database name here
-DB_USER='postgres' # Insert your PostgreSQL username here
-DB_PASSWORD='password' #Insert your PostgreSQL password here.
-DB_HOST='db'
-DB_PORT='5432'
+DB_NAME='citus' # Insert your database name here
+DB_USER='citus' # Insert your PostgreSQL username here
+DB_PASSWORD='' #Insert your PostgreSQL password here.
+DB_HOST=''
 
-SMTP_USERNAME = ""
-SMTP_PASSWORD = ""
-API_URL=''
+API_URL='http://localhost:8000'
 
-LOGGING='false'
-LOG_LEVEL='INFO'
-ENV='dev'
+DB_PORT='5432'
 
-FRONTEND_URL=''
+ENV='dev'
+DEFAULT_FROM_EMAIL=""
+EMAIL_HOST=""
+SMTP_USERNAME=""
+SMTP_PASSWORD=""
+
+INDIC_TRANS_V2_KEY=''
+INDIC_TRANS_V2_URL=''
+
+LOGGING='true'
+LOG_LEVEL='WARNING'
+
+GOOGLE_APPLICATION_CREDENTIALS = ''
+
+FRONTEND_URL_FOR_RESET_PASSWORD = 'https://dev.shoonya.ai4bharat.org'
+SECRET_KEY_RESET_PASSWORD = ''
+
+ASR_DHRUVA_URL = ''
+ASR_DHRUVA_AUTHORIZATION = ''
+
+INDEX_NAME= 'django_logs_dev'
+ELASTICSEARCH_URL=''
+
+AZURE_CONNECTION_STRING = ''
+
+STORAGE_ACCOUNT_CONNECTION_STRING=''
+
+CONTAINER_NAME_FOR_DOWNLOAD_ALL_PROJECTS=''
+
+LOGS_CONTAINER_NAME='logs'
+
+
+FLOWER_ADDRESS="localhost"
+FLOWER_PORT="5555"
+FLOWER_USERNAME="shoonya"
+FLOWER_PASSWORD="flower123"
+FRONTEND_URL=''
+CELERY_BROKER_URL="redis://redis:6379"
+REDIS_HOST="127.0.0.1"
+REDIS_PORT="6379"
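The updated `.env.example` above introduces several new settings (Citus credentials, Flower, Redis, Azure storage). As a rough illustration of how such keys are typically consumed in Django settings, here is a hedged sketch; the `env` helper is hypothetical (the real code may read `os.environ` directly), and only the key names and defaults are taken from the example file:

```python
import os


def env(name, default=None, required=False):
    """Read one .env-style setting from the process environment."""
    value = os.environ.get(name, default)
    if required and not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


# Key names and defaults taken from the .env.example in this diff
DB_NAME = env("DB_NAME", "citus")
CELERY_BROKER_URL = env("CELERY_BROKER_URL", "redis://redis:6379")
FLOWER_PORT = int(env("FLOWER_PORT", "5555"))
```

Settings left empty in the example (e.g. `DB_PASSWORD`, `AZURE_CONNECTION_STRING`) would be candidates for `required=True` in a real deployment.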
22 changes: 22 additions & 0 deletions backend/dataset/migrations/0047_speechconversation_final_transcribed_json.py
@@ -0,0 +1,22 @@
# Generated by Django 3.2.14 on 2024-05-21 06:02

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("dataset", "0046_merge_20240416_2233"),
    ]

    operations = [
        migrations.AddField(
            model_name="speechconversation",
            name="final_transcribed_json",
            field=models.JSONField(
                blank=True,
                help_text="Field where data from this standardised_transcription_editing type will be exported.",
                null=True,
                verbose_name="final_transcribed_json",
            ),
        ),
    ]
19 changes: 19 additions & 0 deletions backend/dataset/migrations/0048_ocrdocument_bboxes_relation_prediction_json.py
@@ -0,0 +1,19 @@
# Generated by Django 3.2.14 on 2024-06-19 11:15

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("dataset", "0047_speechconversation_final_transcribed_json"),
    ]

    operations = [
        migrations.AddField(
            model_name="ocrdocument",
            name="bboxes_relation_prediction_json",
            field=models.JSONField(
                blank=True, null=True, verbose_name="bboxes_relation_prediction_json"
            ),
        ),
    ]
12 changes: 12 additions & 0 deletions backend/dataset/models.py
@@ -311,6 +311,10 @@ class OCRDocument(DatasetBase):
        verbose_name="bboxes_relation_json", null=True, blank=True
    )

    bboxes_relation_prediction_json = models.JSONField(
        verbose_name="bboxes_relation_prediction_json", null=True, blank=True
    )

    annotated_document_details_json = models.JSONField(
        verbose_name="annotated_document_details_json", null=True, blank=True
    )
@@ -484,6 +488,14 @@ class SpeechConversation(DatasetBase):
        blank=True,
        help_text=("Prepopulated prediction for the implemented models"),
    )
    final_transcribed_json = models.JSONField(
        verbose_name="final_transcribed_json",
        null=True,
        blank=True,
        help_text=(
            "Field where data from this standardised_transcription_editing type will be exported."
        ),
    )

    def __str__(self):
        return str(self.id)
10 changes: 4 additions & 6 deletions backend/dataset/tasks.py
@@ -12,11 +12,9 @@
 #### CELERY SHARED TASKS
 
 
-@shared_task(
-    bind=True,
-)
+@shared_task(queue="default")
 def upload_data_to_data_instance(
-    self, dataset_string, pk, dataset_type, content_type, deduplicate=False
+    dataset_string, pk, dataset_type, content_type, deduplicate=False
 ):
     # sourcery skip: raise-specific-error
     """Celery background task to upload the data to the dataset instance through file upload.
@@ -102,8 +100,8 @@ def upload_data_to_data_instance(
     raise Exception(f"Upload failed for lines: {failed_rows}")
 
 
-@shared_task(bind=True)
-def deduplicate_dataset_instance_items(self, pk, deduplicate_field_list):
+@shared_task(queue="default")
+def deduplicate_dataset_instance_items(pk, deduplicate_field_list):
     if len(deduplicate_field_list) == 0:
         return "Field list cannot be empty"
     try:
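Both tasks in this file switch from `@shared_task(bind=True)` to `@shared_task(queue="default")`; with `bind=True`, Celery passes the task instance as the first argument, which is why the `self` parameter also disappears from both signatures. The stub below is purely illustrative (it is not Celery's implementation) and just mimics that calling-convention difference:

```python
# Illustrative stand-in for celery.shared_task, showing why dropping
# bind=True also removes `self` from the task signature.
def shared_task(bind=False, queue="celery"):
    def decorator(fn):
        class Task:
            name = fn.__name__

            def __call__(self, *args, **kwargs):
                if bind:
                    # A bound task receives the task instance first.
                    return fn(self, *args, **kwargs)
                return fn(*args, **kwargs)

        return Task()

    return decorator


@shared_task(bind=True)
def bound_upload(self, pk):
    return (type(self).__name__, pk)


@shared_task(queue="default")
def unbound_upload(pk):
    return ("no-self", pk)
```

Calling `bound_upload(7)` yields `("Task", 7)` while `unbound_upload(7)` yields `("no-self", 7)`, mirroring how the refactored tasks no longer receive `self`.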
112 changes: 105 additions & 7 deletions backend/functions/tasks.py
@@ -29,7 +29,7 @@
ANNOTATED,
)
from tasks.views import SentenceOperationViewSet
-from users.models import User, LANG_CHOICES
+from users.models import User
from django.core.mail import EmailMessage

from utils.blob_functions import (
@@ -56,7 +56,11 @@
import tempfile

from shoonya_backend.locks import Lock

from utils.constants import LANG_CHOICES
from projects.tasks import filter_data_items
from projects.models import BATCH
from dataset import models as dataset_models
from projects.registry_helper import ProjectRegistry
import logging

logger = logging.getLogger(__name__)
@@ -72,6 +76,10 @@ def sentence_text_translate_and_save_translation_pairs(
    input_dataset_instance_id,
    output_dataset_instance_id,
    batch_size,
    filter_string,
    sampling_mode,
    sampling_parameters,
    variable_parameters,
    api_type="indic-trans-v2",
    checks_for_particular_languages=False,
    automate_missing_data_items=True,
@@ -87,6 +95,10 @@
Allowed - [indic-trans, google, indic-trans-v2, azure, blank]
checks_for_particular_languages (bool): If True, checks for the particular languages in the translations.
automate_missing_data_items (bool): If True, consider only those data items that are missing in the target dataset instance.
filter_string (str): string to filter input data.
sampling_mode (str): can be batch or full.
sampling_parameters (json): a json that contains the batch number and batch size.

"""
task_name = "sentence_text_translate_and_save_translation_pairs"
output_sentences = list(
@@ -113,6 +125,14 @@
"metadata_json",
)
)
    if filter_string and sampling_mode and sampling_parameters:
        input_sentences = get_filtered_items(
            "SentenceText",
            input_dataset_instance_id,
            filter_string,
            sampling_mode,
            sampling_parameters,
        )

# Convert the input_sentences list into a dataframe
input_sentences_complete_df = pd.DataFrame(
@@ -403,7 +423,15 @@

@shared_task(bind=True)
def generate_ocr_prediction_json(
-    self, dataset_instance_id, user_id, api_type, automate_missing_data_items
+    self,
+    dataset_instance_id,
+    user_id,
+    api_type,
+    automate_missing_data_items,
+    filter_string,
+    sampling_mode,
+    sampling_parameters,
+    variable_parameters,
 ):
"""Function to generate OCR prediction data and to save to the same data item.
Args:
@@ -436,7 +464,14 @@ def generate_ocr_prediction_json(
)
except Exception as e:
ocr_data_items = []

    if filter_string and sampling_mode and sampling_parameters:
        ocr_data_items = get_filtered_items(
            "OCRDocument",
            dataset_instance_id,
            filter_string,
            sampling_mode,
            sampling_parameters,
        )
# converting the dataset_instance to pandas dataframe.
ocr_data_items_df = pd.DataFrame(
ocr_data_items,
@@ -555,7 +590,15 @@ def generate_ocr_prediction_json(

@shared_task(bind=True)
def generate_asr_prediction_json(
-    self, dataset_instance_id, user_id, api_type, automate_missing_data_items
+    self,
+    dataset_instance_id,
+    user_id,
+    api_type,
+    automate_missing_data_items,
+    filter_string,
+    sampling_mode,
+    sampling_parameters,
+    variable_parameters,
 ):
"""Function to generate ASR prediction data and to save to the same data item.
Args:
@@ -589,7 +632,14 @@ def generate_asr_prediction_json(
)
except Exception as e:
asr_data_items = []

    if filter_string and sampling_mode and sampling_parameters:
        asr_data_items = get_filtered_items(
            "SpeechConversation",
            dataset_instance_id,
            filter_string,
            sampling_mode,
            sampling_parameters,
        )
# converting the dataset_instance to pandas dataframe.
asr_data_items_df = pd.DataFrame(
asr_data_items,
@@ -703,7 +753,16 @@ def generate_asr_prediction_json(


@shared_task(bind=True)
-def populate_draft_data_json(self, pk, user_id, fields_list):
+def populate_draft_data_json(
+    self,
+    pk,
+    user_id,
+    fields_list,
+    filter_string,
+    sampling_mode,
+    sampling_parameters,
+    variable_parameters,
+):
task_name = "populate_draft_data_json"
try:
dataset_instance = DatasetInstance.objects.get(pk=pk)
@@ -712,6 +771,10 @@ def populate_draft_data_json(
dataset_type = dataset_instance.dataset_type
dataset_model = apps.get_model("dataset", dataset_type)
dataset_items = dataset_model.objects.filter(instance_id=dataset_instance)
    if filter_string and sampling_mode and sampling_parameters:
        dataset_items = get_filtered_items(
            dataset_type, pk, filter_string, sampling_mode, sampling_parameters
        )
cnt = 0
for dataset_item in dataset_items:
new_draft_data_json = {}
@@ -1695,3 +1758,38 @@ def upload_all_projects_to_blob_and_get_url(csv_files_directory):
return "Error in generating url"
blob_url = f"https://{account_name}.blob.{endpoint_suffix}/{CONTAINER_NAME_FOR_DOWNLOAD_ALL_PROJECTS}/{blob_client.blob_name}?{sas_token}"
return blob_url


def get_filtered_items(
    dataset_model,
    dataset_instance_id,
    filter_string,
    sampling_mode,
    sampling_parameters,
):
    registry_helper = ProjectRegistry.get_instance()
    project_type = registry_helper.get_project_name_from_dataset(dataset_model)
    if not isinstance(dataset_instance_id, list):
        dataset_instance_id = [dataset_instance_id]
    filtered_items = filter_data_items(
        project_type=project_type,
        dataset_instance_ids=dataset_instance_id,
        filter_string=filter_string,
    )
    # Apply sampling
    if sampling_mode == BATCH:
        batch_size = sampling_parameters["batch_size"]
        try:
            batch_number = sampling_parameters["batch_number"]
            if len(batch_number) == 0:
                batch_number = [1]
        except KeyError:
            batch_number = [1]
        sampled_items = []
        for batch_num in batch_number:
            sampled_items += filtered_items[
                batch_size * (batch_num - 1) : batch_size * batch_num
            ]
    else:
        sampled_items = filtered_items
    return sampled_items
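The `BATCH` branch of `get_filtered_items` concatenates the 1-indexed batches requested in `batch_number`, falling back to batch 1 when the key is absent or the list is empty. That slicing logic can be sketched in isolation (pure Python, no Django or project dependencies; `sample_batches` is a hypothetical name for illustration):

```python
def sample_batches(items, batch_size, batch_number=None):
    """Mirror of the BATCH sampling in get_filtered_items: concatenate
    the 1-indexed batches listed in batch_number (default: batch 1)."""
    if not batch_number:  # covers both a missing key and an empty list
        batch_number = [1]
    sampled = []
    for n in batch_number:
        # Batch n spans indices [batch_size*(n-1), batch_size*n)
        sampled += items[batch_size * (n - 1) : batch_size * n]
    return sampled
```

For example, `sample_batches(list(range(10)), 3, [1, 3])` picks items 0-2 and 6-8; out-of-range batch numbers simply contribute empty slices, as with the original list slicing.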