From e6d3a141ca70842035cfe9900a819ae7a015785a Mon Sep 17 00:00:00 2001
From: Ratin Kumar <ratin.kumar@platform.sh>
Date: Sat, 12 Jun 2021 23:55:22 +0200
Subject: [PATCH] allow adding files using url

---
 backend/requirements.txt            |   1 +
 backend/routes/data.py              | 102 ++++++++++++++++++++++++++--
 docs/tutorials/upload-data.md       |  21 ++++--
 examples/upload_data/upload_data.py |  45 ++++++++----
 4 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index c891c81..b34bc38 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,3 +1,4 @@
+requests
 alembic==1.2.1
 Flask==1.1.1
 Flask-Login==0.4.1
diff --git a/backend/routes/data.py b/backend/routes/data.py
index 1d65ef8..923cabc 100644
--- a/backend/routes/data.py
+++ b/backend/routes/data.py
@@ -1,6 +1,7 @@
 import json
 import sqlalchemy as sa
 import uuid
+import requests
 
 from pathlib import Path
 
@@ -25,8 +26,7 @@ def send_audio_file(file_name):
 
 
 def validate_segmentation(segment):
-    """Validate the segmentation before accepting the annotation's upload from users
-    """
+    """Validate the segmentation before accepting the annotation's upload from users"""
     required_key = {"start_time", "end_time", "transcription"}
 
     if set(required_key).issubset(segment.keys()):
@@ -44,8 +44,7 @@ def generate_segmentation(
     data_id,
     segmentation_id=None,
 ):
-    """Generate a Segmentation from the required segment information
-    """
+    """Generate a Segmentation from the required segment information"""
     if segmentation_id is None:
         segmentation = Segmentation(
             data_id=data_id,
@@ -190,3 +189,98 @@ def add_data():
         ),
         201,
     )
+
+
+def download_file(url, save_path=None, timeout=30):
+    """Download `url` to `save_path` (default: last URL segment) and return the path."""
+    local_filename = url.split("/")[-1] if save_path is None else save_path
+    with requests.get(url, stream=True, timeout=timeout) as r:  # fail on a dead host
+        r.raise_for_status()
+        with open(local_filename, "wb") as f:
+            f.writelines(r.iter_content(chunk_size=8192))
+    return local_filename
+
+
+@api.route("/dataWithUrl", methods=["POST"])
+def add_data_from_url():
+    """Create a datapoint whose audio file the server downloads from `data_url`.
+
+    Requires the project API key in the `Authorization` header and the form fields
+    `username` and `data_url`. Optional form fields: `segmentations` (JSON list),
+    `reference_transcription`, `is_marked_for_review`. Returns 201 with `data_id`;
+    raises BadRequest on invalid input and NotFound for an unknown API key or user.
+    """
+    api_key = request.headers.get("Authorization", None)
+    if not api_key:
+        raise BadRequest(description="API Key missing from `Authorization` Header")
+
+    project = Project.query.filter_by(api_key=api_key).first()
+    if not project:
+        raise NotFound(description="No project exist with given API Key")
+
+    username = request.form.get("username", None)
+    user = User.query.filter_by(username=username).first()
+    if not user:
+        raise NotFound(description="No user found with given username")
+
+    reference_transcription = request.form.get("reference_transcription", None)
+    # Form values are strings and bool("False") is True, so match false-like text.
+    review_flag = request.form.get("is_marked_for_review", "").strip().lower()
+    is_marked_for_review = review_flag not in {"", "0", "false", "no", "off"}
+    data_url = request.form.get("data_url", None)
+    if not data_url:
+        raise BadRequest(description="`data_url` missing from request")
+
+    # Validate the segmentations before downloading so bad input leaves no stray file.
+    try:
+        segmentations = json.loads(request.form.get("segmentations", "[]"))
+    except ValueError as err:
+        raise BadRequest(description="Segmentations are not valid JSON.") from err
+    if not all(validate_segmentation(segment) for segment in segmentations):
+        raise BadRequest(description="Segmentations have missing keys.")
+
+    # Ignore the query string and fragment when deriving the file name and extension.
+    url_path = data_url.split("#")[0].split("?")[0]
+    original_filename = secure_filename(url_path.rstrip("/").split("/")[-1])
+    extension = Path(original_filename).suffix.lower()
+    if len(extension) > 1 and extension[1:] not in ALLOWED_EXTENSIONS:
+        raise BadRequest(description="File format is not supported")
+
+    filename = f"{str(uuid.uuid4().hex)}{extension}"
+    file_path = Path(app.config["UPLOAD_FOLDER"]).joinpath(filename)
+    # NOTE(security): fetching a caller-supplied URL from the server is an SSRF risk.
+    try:
+        download_file(data_url, file_path.as_posix())
+    except requests.RequestException as err:
+        if file_path.exists():
+            file_path.unlink()  # drop the partially written download
+        raise BadRequest(description="Could not download `data_url`") from err
+
+    data = Data(
+        project_id=project.id,
+        filename=filename,
+        original_filename=original_filename,
+        reference_transcription=reference_transcription,
+        is_marked_for_review=is_marked_for_review,
+        assigned_user_id=user.id,
+    )
+    db.session.add(data)
+    db.session.flush()
+
+    new_segmentations = [
+        generate_segmentation(
+            data_id=data.id,
+            project_id=project.id,
+            end_time=float(segment["end_time"]),
+            start_time=float(segment["start_time"]),
+            annotations=segment.get("annotations", {}),
+            transcription=segment["transcription"],
+        )
+        for segment in segmentations
+    ]
+    data.set_segmentations(new_segmentations)
+    db.session.commit()
+    db.session.refresh(data)
+
+    message = "Data uploaded, created and assigned successfully"
+    return jsonify(data_id=data.id, message=message, type="DATA_CREATED"), 201
diff --git a/docs/tutorials/upload-data.md b/docs/tutorials/upload-data.md
index 7e3626e..9f076a5 100644
--- a/docs/tutorials/upload-data.md
+++ b/docs/tutorials/upload-data.md
@@ -2,15 +2,18 @@
 
 The tool provides an end point to upload datapoints. You would need an API Key which can be found on the admin dashboard for all projects. To upload datapoints for a project, you would need to make a `POST` request to `/api/data` end point. API Key should be passed in `Authorization` header. Labels for data can also be uploaded.
 
-For every datapoint, we need to provide the following required information:
+Datapoints can be created in either of the following ways, each with its own required fields:
 
-1. `audio_file`: The audio binary file of `mp3`, `wav` or `ogg` format along with filename.
-2. `username`: The username to whom this audio needs to be assigned for annotation. It should be one of the users created.
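+1. Using a remote audio file URL (send the `POST` request to `/api/dataWithUrl`; `username`, described below, is also required)
+   1. `data_url`: The URL of the audio file, which must be accessible to the server via a plain HTTP `GET` request.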
+2. Using local audio files
+   1. `audio_file`: The audio binary file of `mp3`, `wav` or `ogg` format along with filename.
+   2. `username`: The username to whom this audio needs to be assigned for annotation. It should be one of the users created.
 
 You can also provide the following optional information:
 
 1. `reference_transcription`: Transcription of audio for reference.
-2. `is_marked_for_review`:  Whether this audio should be marked for review or not.
+2. `is_marked_for_review`: Whether this audio should be marked for review or not.
 3. `segmentations` : The list of segmentation values for the given audio.
 
 We provide an [example CLI script](../../examples/upload_data/upload_data.py) to show how to upload the datapoints.
@@ -18,13 +21,21 @@ We provide an [example CLI script](../../examples/upload_data/upload_data.py) to
 For example, you can add data with reference transcripts:
 
 ```sh
+# Create a datapoint from a local audio file
 API_KEY=4369e45d3a94466b8fe1efb86b8a4392 python upload_data.py  --username admin --is_marked_for_review True --audio_file OSR_us_000_0010_8k.wav --host localhost --port 80 --reference_transcription "The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. It's easy to tell the depth of a well. These days a chicken leg is a rare dish. Rice is often served in round bowls. The juice of lemons makes fine punch. The box was thrown beside the parked truck. The hogs were fed chopped corn and garbage. Four hours of steady work faced us. Large size in stockings is hard to sell."
 ```
 
 or
 
+```sh
+# Create a datapoint from a remote audio file URL
+API_KEY=67cf63744f0f444f98a4326f37b53b93 python3 upload_data.py  --username admin --is_marked_for_review True --host localhost --port 3000 --reference_transcription "Glue the sheet to the dark blue background." --data_url "https://static.wikia.nocookie.net/soundeffects/images/3/31/Bird_Singing_Chirp_Sound.ogg/revision/latest?cb=20210122103806"
+```
+
+or
+
 add data with segmentation values:
 
 ```sh
 API_KEY=cb0ac22ca0404fd19e89162bee8c462b python upload_data.py  --username admin --is_marked_for_review True --audio_file OSR_us_000_0010_8k.wav --host localhost --port 5000 --segmentations '[ { "annotations": { "testing this": { "values": [ "4", "5" ] } }, "end_time": 7.7407, "start_time": 3.8604, "transcription": "Sample transcription data" }, { "end_time": 17.7407, "start_time": 13.8604, "transcription": "Sample transcription data" }]'
-```
\ No newline at end of file
+```
diff --git a/examples/upload_data/upload_data.py b/examples/upload_data/upload_data.py
index ff7f07f..05f438b 100644
--- a/examples/upload_data/upload_data.py
+++ b/examples/upload_data/upload_data.py
@@ -39,6 +39,12 @@
     help="List of segmentations for the audio",
     default=[],
 )
+parser.add_argument(
+    "--data_url",
+    type=str,
+    help="URL of a remote audio file; when given, it is used instead of --audio_file",
+    default="",
+)
 parser.add_argument("--port", type=int, help="Port to make request to", default=80)
 
 args = parser.parse_args()
@@ -46,20 +52,12 @@
 api_key = os.getenv("API_KEY", None)
 headers = {"Authorization": api_key}
 
-audio_path = Path(args.audio_file)
-audio_filename = audio_path.name
-if audio_path.is_file():
-    audio_obj = open(audio_path.resolve(), "rb")
-else:
-    print("Audio file does not exist")
-    exit()
-
+data_url = args.data_url
 reference_transcription = args.reference_transcription
 username = args.username
 is_marked_for_review = args.is_marked_for_review
 segmentations = args.segmentations
 
-file = {"audio_file": (audio_filename, audio_obj)}
 
 values = {
     "reference_transcription": reference_transcription,
@@ -68,10 +66,31 @@
     "is_marked_for_review": is_marked_for_review,
 }
 
-print("Creating datapoint")
-response = requests.post(
-    f"http://{args.host}:{args.port}/api/data", files=file, data=values, headers=headers
-)
+
+base_url = f"http://{args.host}:{args.port}/api"
+
+if data_url:
+    # Remote file: the server downloads the audio, so only form fields are sent.
+    print(f"Creating datapoint from url: {data_url}")
+    values["data_url"] = data_url
+    response = requests.post(f"{base_url}/dataWithUrl", data=values, headers=headers)
+else:
+    audio_path = Path(args.audio_file)
+    if not audio_path.is_file():
+        # Exit with a non-zero status so calling scripts can detect the failure.
+        raise SystemExit("Audio file does not exist")
+
+    print("Creating datapoint")
+    # Local file: upload it as multipart form data. The context manager closes the
+    # file handle once the request has completed.
+    with audio_path.open("rb") as audio_obj:
+        response = requests.post(
+            f"{base_url}/data",
+            files={"audio_file": (audio_path.name, audio_obj)},
+            data=values,
+            headers=headers,
+        )
+
 
 if response.status_code == 201:
     response_json = response.json()