Skip to content

Commit 3ca48ed

Browse files
authored
Add support for Pangaea datasets (#45)
1 parent c892377 commit 3ca48ed

File tree

7 files changed

+52
-4
lines changed

7 files changed

+52
-4
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Datahugger is a tool to download scientific datasets, software, and code from a
88

99
## Supported repositories
1010

11-
Datahugger offers support for more than [<!-- count -->376<!-- count --> generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!).
11+
Datahugger offers support for more than [<!-- count -->377<!-- count --> generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!).
1212

1313
[![Datahugger support Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](https://github.com/J535D165/datahugger/raw/main/docs/images/logos.png)](https://j535d165.github.io/datahugger/repositories)
1414

datahugger/base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,10 @@ def _get(
303303
):
304304
if (
305305
len(self.files) == 1
306-
and self.files[0]["link"].endswith(".zip")
306+
and (
307+
self.files[0]["link"].endswith(".zip")
308+
or self.files[0]["name"].endswith(".zip")
309+
)
307310
and self.unzip
308311
):
309312
self._unpack_single_folder(self.files[0]["link"], output_folder)

datahugger/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from datahugger.services import HuggingFaceDataset
1010
from datahugger.services import MendeleyDataset
1111
from datahugger.services import OSFDataset
12+
from datahugger.services import PangaeaDataset
1213
from datahugger.services import ZenodoDataset
1314

1415
# fast lookup
@@ -40,7 +41,7 @@
4041
"get.iedadata.org": DataOneDataset,
4142
"usap-dc.org": DataOneDataset,
4243
"iys.hakai.org": DataOneDataset,
43-
# "doi.pangaea.de": DataOneDataset,
44+
"doi.pangaea.de": PangaeaDataset,
4445
"rvdata.us": DataOneDataset,
4546
"sead-published.ncsa.illinois.edu": DataOneDataset,
4647
# DataVerse repositories (extracted from re3data)

datahugger/services.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
import re
23
import xml.etree.ElementTree as ET
34
import zipfile
45
from pathlib import Path
@@ -142,6 +143,45 @@ def files(self):
142143
return self._files
143144

144145

146+
class PangaeaDataset(DatasetDownloader):
    """Downloader for datasets hosted on the PANGAEA repository.

    The record's JSON-LD metadata is requested from ``API_URL`` and the
    entries under its ``distribution`` key are turned into file
    descriptions. Only distributions whose ``encodingFormat`` is listed in
    ``_DOWNLOADABLE_FORMATS`` are kept.
    """

    REGEXP_ID = r"doi\.pangaea\.de/(?P<record_id>.*)"

    # the base entry point of the REST API
    API_URL = "https://doi.pangaea.de/"

    # distribution formats that are exposed as downloadable files
    _DOWNLOADABLE_FORMATS = ("text/tab-separated-values", "application/zip")

    # captures the (optionally quoted) filename of a Content-Disposition
    # value, stopping at the closing quote or at the next ";" parameter
    _FILENAME_PATTERN = re.compile(r'filename="?([^";]+)"?', re.IGNORECASE)

    @classmethod
    def _filename_from_header(cls, content_disposition, url):
        """Return the filename for a distribution.

        Parameters
        ----------
        content_disposition: str or None
            Value of the Content-Disposition response header, if any.
        url: str
            The distribution's content URL, used as a fallback.

        Returns
        -------
        str
            The filename announced in the header. When the header is missing
            or carries no usable ``filename=`` parameter, the last path
            segment of ``url`` (query string removed) is returned instead.
        """
        match = cls._FILENAME_PATTERN.search(content_disposition or "")
        if match:
            name = match.group(1).strip()
            if name:
                return name

        # no usable header: derive a name from the URL rather than failing
        return url.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1]

    @property
    def files(self):
        """List of downloadable files of the record.

        Each item is a dict with the keys ``link``, ``name``, ``size``,
        ``hash`` and ``hash_type``; the last three are always ``None``
        because they are not read from the metadata here.

        The list is built on first access and stored in ``self._files``, so
        repeated accesses do not repeat the HTTP requests.

        Raises
        ------
        requests.HTTPError
            If the metadata request returns an error status.
        """
        cached = getattr(self, "_files", None)
        if cached is not None:
            return cached

        r = requests.get(
            f"{self.API_URL}{self._params['record_id']}?format=metadata_jsonld"
        )
        r.raise_for_status()

        # a record without a "distribution" entry simply has no files
        dists = r.json().get("distribution") or []

        # a single distribution is given as a dict instead of a list
        if isinstance(dists, dict):
            dists = [dists]

        files = []
        for d in dists:
            if d.get("encodingFormat") not in self._DOWNLOADABLE_FORMATS:
                continue

            # the filename is only announced in the response headers of
            # the content URL, so ask for the headers without the body
            r_filename = requests.head(d["contentUrl"])
            content_d = r_filename.headers.get("content-disposition")

            files.append(
                {
                    "link": d["contentUrl"],
                    "name": self._filename_from_header(content_d, d["contentUrl"]),
                    "size": None,
                    "hash": None,
                    "hash_type": None,
                }
            )

        self._files = files
        return self._files
183+
184+
145185
class DSpaceDataset(DatasetDownloader):
146186
"""Downloader for DSpaceDataset repositories."""
147187

docs/repositories.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Supported repositories
22

3-
Datahugger offers support for more than <!-- count -->376<!-- count --> generic and specific (scientific) repositories (and more to come!).
3+
Datahugger offers support for more than <!-- count -->377<!-- count --> generic and specific (scientific) repositories (and more to come!).
44

55
![Datahugger support Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](images/logos.png)
66

scripts/estimate_repos_supported.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def count_repos():
1616
"dryad": 1,
1717
"github": 1,
1818
"huggingface": 1,
19+
"pangaea": 1,
1920
}
2021

2122
print(counts)

tests/test_repositories.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@
5555
),
5656
# huggingface
5757
# ("10.57967/hf/0034", "test.csv"),
58+
# Pangaea
59+
("https://doi.org/10.1594/PANGAEA.954547", "Gubbio_age.tab"),
60+
("https://doi.pangaea.de/10.1594/PANGAEA.954543", "AA_age.tab"),
5861
]
5962

6063

0 commit comments

Comments
 (0)