Skip to content

Commit e6d3a14

Browse files
author
Ratin Kumar
committed
allow adding files using url
1 parent 10d26c4 commit e6d3a14

File tree

4 files changed

+147
-22
lines changed

4 files changed

+147
-22
lines changed

backend/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
requests
12
alembic==1.2.1
23
Flask==1.1.1
34
Flask-Login==0.4.1

backend/routes/data.py

Lines changed: 98 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import sqlalchemy as sa
33
import uuid
4+
import requests
45

56
from pathlib import Path
67

@@ -25,8 +26,7 @@ def send_audio_file(file_name):
2526

2627

2728
def validate_segmentation(segment):
28-
"""Validate the segmentation before accepting the annotation's upload from users
29-
"""
29+
"""Validate the segmentation before accepting the annotation's upload from users"""
3030
required_key = {"start_time", "end_time", "transcription"}
3131

3232
if set(required_key).issubset(segment.keys()):
@@ -44,8 +44,7 @@ def generate_segmentation(
4444
data_id,
4545
segmentation_id=None,
4646
):
47-
"""Generate a Segmentation from the required segment information
48-
"""
47+
"""Generate a Segmentation from the required segment information"""
4948
if segmentation_id is None:
5049
segmentation = Segmentation(
5150
data_id=data_id,
@@ -190,3 +189,98 @@ def add_data():
190189
),
191190
201,
192191
)
192+
193+
194+
def download_file(url, save_path=None, timeout=30):
    """Stream the resource at ``url`` into a local file and return its path.

    Args:
        url: Address of the resource, fetched with an HTTP GET request.
        save_path: Destination path. When ``None``, the last segment of the
            URL's path component is used as the file name.
        timeout: Value forwarded to ``requests.get`` as ``timeout``. Without
            a timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The path the downloaded content was written to.

    Raises:
        ValueError: If ``save_path`` is ``None`` and ``url`` does not end in
            a usable file name.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    if save_path is None:
        # Local import keeps this helper self-contained.
        from urllib.parse import urlsplit

        # Only the path component names the file; the query string and
        # fragment must not end up in the local file name.
        local_filename = urlsplit(url).path.rsplit("/", 1)[-1]
        if not local_filename:
            raise ValueError(f"Cannot derive a file name from url: {url}")
    else:
        local_filename = save_path

    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating the file when the server reports an error.
        r.raise_for_status()
        with open(local_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                # Skip empty keep-alive chunks.
                if chunk:
                    f.write(chunk)
    return local_filename
202+
203+
204+
@api.route("/dataWithUrl", methods=["POST"])
def add_data_from_url():
    """Create a datapoint whose audio file is downloaded from a remote URL.

    Request:
        ``Authorization`` header: API key of the target project.
        Form fields: ``username`` and ``data_url`` are required;
        ``reference_transcription``, ``is_marked_for_review`` and
        ``segmentations`` (a JSON-encoded list) are optional.

    Returns:
        A ``(response, 201)`` tuple whose JSON body carries the new
        datapoint's id.

    Raises:
        BadRequest: If the API key or ``data_url`` is missing, the URL or
            file format is not acceptable, the segmentations are malformed,
            or the download fails.
        NotFound: If no project matches the API key or no user matches the
            username.
    """
    # Local import: the module does not import urllib at file level.
    from urllib.parse import urlsplit

    api_key = request.headers.get("Authorization", None)
    if not api_key:
        raise BadRequest(description="API Key missing from `Authorization` Header")

    project = Project.query.filter_by(api_key=api_key).first()
    if not project:
        raise NotFound(description="No project exist with given API Key")

    username = request.form.get("username", None)
    user = User.query.filter_by(username=username).first()
    if not user:
        raise NotFound(description="No user found with given username")

    segmentations = request.form.get("segmentations", "[]")
    reference_transcription = request.form.get("reference_transcription", None)
    data_url = request.form.get("data_url", None)

    # Form values are strings, and bool("False") is True, so the flag has to
    # be parsed explicitly instead of being passed through bool().
    review_flag = request.form.get("is_marked_for_review", "")
    is_marked_for_review = review_flag.strip().lower() in {
        "true",
        "1",
        "yes",
        "y",
        "on",
    }

    # A view must not return a bare int; report the missing field as an
    # HTTP error like the other validation failures.
    if not data_url:
        raise BadRequest(description="`data_url` is missing from the request")

    try:
        url_parts = urlsplit(data_url)
    except ValueError:
        raise BadRequest(description="`data_url` is not a valid URL")

    # NOTE(review): the server fetches a caller-supplied URL; consider also
    # restricting the reachable hosts if the API is exposed to untrusted users.
    if url_parts.scheme not in ("http", "https"):
        raise BadRequest(description="`data_url` must be an http or https URL")

    # Use only the URL path so a query string does not corrupt the file name
    # or its extension.
    original_filename = secure_filename(url_parts.path.split("/")[-1])
    extension = Path(original_filename).suffix.lower()

    if len(extension) > 1 and extension[1:] not in ALLOWED_EXTENSIONS:
        raise BadRequest(description="File format is not supported")

    # Validate the segmentations before downloading anything, so a bad
    # request does not leave a downloaded file or a pending row behind.
    try:
        segmentations = json.loads(segmentations)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        raise BadRequest(description="Segmentations are not valid JSON.")

    if not isinstance(segmentations, list):
        raise BadRequest(description="Segmentations must be a list.")

    checked_segments = []
    for segment in segmentations:
        if not isinstance(segment, dict) or not validate_segmentation(segment):
            raise BadRequest(description="Segmentations have missing keys.")
        try:
            start_time = float(segment["start_time"])
            end_time = float(segment["end_time"])
        except (TypeError, ValueError):
            raise BadRequest(description="Segmentation times must be numbers.")
        checked_segments.append((segment, start_time, end_time))

    # Store the file under a random name; the original name is kept in the
    # database record only.
    filename = f"{uuid.uuid4().hex}{extension}"
    file_path = Path(app.config["UPLOAD_FOLDER"]).joinpath(filename)

    try:
        download_file(data_url, file_path.as_posix())
    except requests.RequestException:
        # Remove a partially written file before reporting the failure.
        if file_path.exists():
            file_path.unlink()
        raise BadRequest(description="Could not download the file from `data_url`")

    data = Data(
        project_id=project.id,
        filename=filename,
        original_filename=original_filename,
        reference_transcription=reference_transcription,
        is_marked_for_review=is_marked_for_review,
        assigned_user_id=user.id,
    )
    db.session.add(data)
    # Flush so that data.id is available for the segmentations below.
    db.session.flush()

    new_segmentations = [
        generate_segmentation(
            data_id=data.id,
            project_id=project.id,
            end_time=end_time,
            start_time=start_time,
            annotations=segment.get("annotations", {}),
            transcription=segment["transcription"],
        )
        for segment, start_time, end_time in checked_segments
    ]

    data.set_segmentations(new_segmentations)

    db.session.commit()
    db.session.refresh(data)

    return (
        jsonify(
            data_id=data.id,
            message="Data uploaded, created and assigned successfully",
            type="DATA_CREATED",
        ),
        201,
    )

docs/tutorials/upload-data.md

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,40 @@
22

33
The tool provides endpoints to upload datapoints. You need an API key, which can be found on the admin dashboard for every project. To upload datapoints for a project, make a `POST` request to the `/api/data` endpoint (local audio file) or to the `/api/dataWithUrl` endpoint (remote audio file URL). The API key must be passed in the `Authorization` header. Labels for the data can also be uploaded.
44

5-
For every datapoint, we need to provide the following required information:
5+
Datapoints can be created in the following ways, each with its own requirements:
66

7-
1. `audio_file`: The audio binary file of `mp3`, `wav` or `ogg` format along with filename.
8-
2. `username`: The username to whom this audio needs to be assigned for annotation. It should be one of the users created.
7+
1. Using remote audio file url
8+
1. `data_url`: The URL of the audio file, which must be accessible via a plain HTTP GET request.
9+
2. Using local audio files
10+
1. `audio_file`: The audio binary file of `mp3`, `wav` or `ogg` format along with filename.
11+
2. `username`: The username to whom this audio needs to be assigned for annotation. It should be one of the users created.
912

1013
You can also provide the following optional information:
1114

1215
1. `reference_transcription`: Transcription of audio for reference.
13-
2. `is_marked_for_review`: Whether this audio should be marked for review or not.
16+
2. `is_marked_for_review`: Whether this audio should be marked for review or not.
1417
3. `segmentations` : The list of segmentation values for the given audio.
1518

1619
We provide an [example CLI script](../../examples/upload_data/upload_data.py) to show how to upload the datapoints.
1720

1821
For example, you can add data with reference transcripts:
1922

2023
```sh
24+
# Create a datapoint from a local audio file
2125
API_KEY=4369e45d3a94466b8fe1efb86b8a4392 python upload_data.py --username admin --is_marked_for_review True --audio_file OSR_us_000_0010_8k.wav --host localhost --port 80 --reference_transcription "The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. It's easy to tell the depth of a well. These days a chicken leg is a rare dish. Rice is often served in round bowls. The juice of lemons makes fine punch. The box was thrown beside the parked truck. The hogs were fed chopped corn and garbage. Four hours of steady work faced us. Large size in stockings is hard to sell."
2226
```
2327

2428
or
2529

30+
```sh
31+
# Create a datapoint from a remote audio file URL
32+
API_KEY=67cf63744f0f444f98a4326f37b53b93 python3 upload_data.py --username admin --is_marked_for_review True --host localhost --port 3000 --reference_transcription "Glue the sheet to the dark blue background." --data_url "https://static.wikia.nocookie.net/soundeffects/images/3/31/Bird_Singing_Chirp_Sound.ogg/revision/latest?cb=20210122103806"
33+
```
34+
35+
or
36+
2637
add data with segmentation values:
2738

2839
```sh
2940
API_KEY=cb0ac22ca0404fd19e89162bee8c462b python upload_data.py --username admin --is_marked_for_review True --audio_file OSR_us_000_0010_8k.wav --host localhost --port 5000 --segmentations '[ { "annotations": { "testing this": { "values": [ "4", "5" ] } }, "end_time": 7.7407, "start_time": 3.8604, "transcription": "Sample transcription data" }, { "end_time": 17.7407, "start_time": 13.8604, "transcription": "Sample transcription data" }]'
30-
```
41+
```

examples/upload_data/upload_data.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,27 +39,25 @@
3939
help="List of segmentations for the audio",
4040
default=[],
4141
)
42+
# Optional alternative to --audio_file: when set, the server downloads the
# audio from this URL instead of receiving an uploaded file.
parser.add_argument(
    "--data_url",
    type=str,
    help="URL of a remote audio file to create the datapoint from (used instead of --audio_file)",
    default="",
)
4248
parser.add_argument("--port", type=int, help="Port to make request to", default=80)
4349

4450
args = parser.parse_args()
4551

4652
api_key = os.getenv("API_KEY", None)
4753
headers = {"Authorization": api_key}
4854

49-
audio_path = Path(args.audio_file)
50-
audio_filename = audio_path.name
51-
if audio_path.is_file():
52-
audio_obj = open(audio_path.resolve(), "rb")
53-
else:
54-
print("Audio file does not exist")
55-
exit()
56-
55+
data_url = args.data_url
5756
reference_transcription = args.reference_transcription
5857
username = args.username
5958
is_marked_for_review = args.is_marked_for_review
6059
segmentations = args.segmentations
6160

62-
file = {"audio_file": (audio_filename, audio_obj)}
6361

6462
values = {
6563
"reference_transcription": reference_transcription,
@@ -68,10 +66,31 @@
6866
"is_marked_for_review": is_marked_for_review,
6967
}
7068

71-
print("Creating datapoint")
72-
response = requests.post(
73-
f"http://{args.host}:{args.port}/api/data", files=file, data=values, headers=headers
74-
)
69+
70+
# Mention the source in the progress message only for URL uploads.
source_note = f"from url: {data_url}" if data_url else ""
print(f"Creating datapoint {source_note}")

if data_url:
    # Remote audio: send only the form values; the server downloads the file.
    values.update({"data_url": data_url})
    response = requests.post(
        f"http://{args.host}:{args.port}/api/dataWithUrl",
        data=values,
        headers=headers,
    )
else:
    # Local audio: upload the file contents as multipart form data.
    audio_path = Path(args.audio_file)
    audio_filename = audio_path.name
    if not audio_path.is_file():
        print("Audio file does not exist")
        # Exit with a non-zero status so calling scripts can detect the
        # failure; SystemExit also works where the site `exit` helper is absent.
        raise SystemExit(1)

    # The context manager closes the file handle once the request is done.
    with open(audio_path.resolve(), "rb") as audio_obj:
        file = {"audio_file": (audio_filename, audio_obj)}
        response = requests.post(
            f"http://{args.host}:{args.port}/api/data",
            files=file,
            data=values,
            headers=headers,
        )
93+
7594

7695
if response.status_code == 201:
7796
response_json = response.json()

0 commit comments

Comments
 (0)