Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Changelog for oda_reader

## 1.4.1 (2025-12-19)
- Extends bulk download auto-detection to support `.csv` files in addition to `.txt` files.

## 1.4.0 (2025-12-19)
- Adds `bulk_download_dac2a()` function for bulk downloading the full DAC2A dataset.
- Auto-detects file types (parquet vs txt/csv) in bulk downloads, removing the need for the `is_txt` parameter.
- Auto-detects file types (parquet or txt) in bulk downloads, removing the need for the `is_txt` parameter.
- Auto-detects CSV delimiters (comma, pipe, tab, semicolon) when reading txt files from bulk downloads.
- Deprecates the `is_txt` parameter in `bulk_download_parquet()`. The parameter is still accepted for backward compatibility but emits a deprecation warning and will be removed in a future major release.
- Adds pytest and pytest-mock to dev dependencies for improved testing support.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "oda_reader"
version = "1.4.0"
version = "1.4.1"
description = "A simple package to import ODA data from the OECD's API and AidData's database"
readme = "README.md"
license = "MIT"
Expand Down
48 changes: 31 additions & 17 deletions src/oda_reader/download/download_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,16 +226,16 @@ def _save_or_return_parquet_files_from_content(
*,
as_iterator: bool = False,
) -> list[pd.DataFrame] | None | typing.Iterator[pd.DataFrame]:
"""Extract parquet or txt (CSV) files from a zip archive.
"""Extract parquet, csv, or txt files from a zip archive.

If `save_to_path` is provided the files are extracted and written
to disk. Otherwise the contents are returned either as a list of
`DataFrame` objects or, when `as_iterator` is `True`, as an iterator
yielding one `DataFrame` per row group (parquet only).

The function auto-detects whether the zip contains parquet or txt files.
Txt files have their delimiter auto-detected (comma or pipe) and are
converted to parquet when saving.
The function auto-detects whether the zip contains parquet, csv, or txt files.
CSV/txt files have their delimiter auto-detected (comma, pipe, tab, etc.) and
are converted to parquet when saving.

Args:
response_content: Bytes or `Path` pointing to the zipped file.
Expand All @@ -252,7 +252,11 @@ def _save_or_return_parquet_files_from_content(

with _open_zip(response_content=response_content) as z:
parquet_files = [name for name in z.namelist() if name.endswith(".parquet")]
txt_files = [name for name in z.namelist() if name.endswith(".txt")]
csv_files = [
name
for name in z.namelist()
if name.endswith(".txt") or name.endswith(".csv")
]

# Determine which file type we're dealing with
if parquet_files:
Expand All @@ -276,15 +280,18 @@ def _save_or_return_parquet_files_from_content(
logger.info(f"Reading {len(parquet_files)} parquet files.")
return [pd.read_parquet(z.open(file)) for file in parquet_files]

elif txt_files:
elif csv_files:
if as_iterator:
raise ValueError("Streaming not supported for txt files.")
raise ValueError("Streaming not supported for csv/txt files.")

if save_to_path:
save_to_path.mkdir(parents=True, exist_ok=True)
for file_name in txt_files:
for file_name in csv_files:
clean_name = (
file_name.replace(".txt", ".parquet").lower().replace(" ", "_")
file_name.replace(".txt", ".parquet")
.replace(".csv", ".parquet")
.lower()
.replace(" ", "_")
)
logger.info(f"Saving {clean_name}")
with z.open(file_name) as f_in:
Expand All @@ -299,9 +306,9 @@ def _save_or_return_parquet_files_from_content(
).to_parquet(save_to_path / clean_name)
return None

logger.info(f"Reading {len(txt_files)} txt files.")
logger.info(f"Reading {len(csv_files)} csv/txt files.")
dfs = []
for file_name in txt_files:
for file_name in csv_files:
with z.open(file_name) as f_in:
delimiter = _detect_delimiter(f_in)
logger.info(f"Detected delimiter for {file_name}: '{delimiter}'")
Expand All @@ -317,14 +324,14 @@ def _save_or_return_parquet_files_from_content(
return dfs

else:
raise ValueError("No parquet or txt files found in the zip archive.")
raise ValueError("No parquet, csv, or txt files found in the zip archive.")


def _save_or_return_parquet_files_from_txt_in_zip(
response_content: bytes | Path,
save_to_path: Path | str | None = None,
) -> list[pd.DataFrame] | None:
"""Extract a `.txt` file from a zipped archive supplied as bytes or a file path.
"""Extract csv or txt files from a zipped archive supplied as bytes or a file path.

The file is read as CSV (with auto-detected delimiter) and optionally saved
as a parquet file.
Expand All @@ -341,15 +348,22 @@ def _save_or_return_parquet_files_from_txt_in_zip(
save_to_path = Path(save_to_path).expanduser().resolve() if save_to_path else None

with _open_zip(response_content=response_content) as z:
# Find all txt files in the zip archive
files = [name for name in z.namelist() if name.endswith(".txt")]
# Find all csv/txt files in the zip archive
files = [
name
for name in z.namelist()
if name.endswith(".txt") or name.endswith(".csv")
]

# If save_to_path is provided, save the files to the path
if save_to_path:
save_to_path.mkdir(parents=True, exist_ok=True)
for file_name in files:
clean_name = (
file_name.replace(".txt", ".parquet").lower().replace(" ", "_")
file_name.replace(".txt", ".parquet")
.replace(".csv", ".parquet")
.lower()
.replace(" ", "_")
)
logger.info(f"Saving {clean_name}")
with z.open(file_name) as f_in:
Expand Down Expand Up @@ -501,7 +515,7 @@ def bulk_download_parquet(
"""Download data from the stats.oecd.org file download service.

Certain data files are available as a bulk download. This function
downloads the files (parquet or txt/csv) and returns a single DataFrame.
downloads the files (parquet, csv, or txt) and returns a single DataFrame.
The file type is auto-detected from the zip contents.

Args:
Expand Down
58 changes: 56 additions & 2 deletions tests/download/unit/test_download_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def _create_zip_with_parquet(self) -> bytes:
return zip_buffer.getvalue()

def _create_zip_with_txt(self, delimiter: str = ",") -> bytes:
"""Create a zip file containing a txt/CSV file."""
"""Create a zip file containing a txt file."""
if delimiter == "|":
csv_content = "col1|col2|col3\n1|2|3\n4|5|6"
else:
Expand All @@ -151,6 +151,18 @@ def _create_zip_with_txt(self, delimiter: str = ",") -> bytes:
z.writestr("test_data.txt", csv_content.encode("utf-8"))
return zip_buffer.getvalue()

def _create_zip_with_csv(self, delimiter: str = ",") -> bytes:
"""Create a zip file containing a .csv file."""
if delimiter == "|":
csv_content = "col1|col2|col3\n1|2|3\n4|5|6"
else:
csv_content = "col1,col2,col3\n1,2,3\n4,5,6"

zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w") as z:
z.writestr("test_data.csv", csv_content.encode("utf-8"))
return zip_buffer.getvalue()

def test_auto_detect_parquet_files(self):
"""Test that parquet files are auto-detected and read correctly."""
zip_content = self._create_zip_with_parquet()
Expand Down Expand Up @@ -187,6 +199,48 @@ def test_auto_detect_txt_files_pipe(self):
assert list(df.columns) == ["col1", "col2", "col3"]
assert len(df) == 2

def test_auto_detect_csv_files(self):
    """A comma-delimited ``.csv`` member is detected and parsed into a frame."""
    archive_bytes = self._create_zip_with_csv(delimiter=",")

    frames = _save_or_return_parquet_files_from_content(archive_bytes)

    # Without a save path the function returns the parsed frames.
    assert frames is not None
    assert len(frames) == 1

    frame = frames[0]
    assert isinstance(frame, pd.DataFrame)
    assert list(frame.columns) == ["col1", "col2", "col3"]
    assert len(frame) == 2

def test_auto_detect_csv_files_pipe(self):
    """A pipe-delimited ``.csv`` member is detected and parsed into a frame."""
    archive_bytes = self._create_zip_with_csv(delimiter="|")

    frames = _save_or_return_parquet_files_from_content(archive_bytes)

    assert frames is not None
    assert len(frames) == 1

    # The header must be split on the pipe character into three columns.
    parsed = frames[0]
    assert isinstance(parsed, pd.DataFrame)
    assert list(parsed.columns) == ["col1", "col2", "col3"]
    assert len(parsed) == 2

def test_save_csv_as_parquet_to_path(self, tmp_path):
    """A ``.csv`` member is written to disk as a parquet file when saving.

    Args:
        tmp_path: Directory fixture used as the save target.
    """
    zip_content = self._create_zip_with_csv()

    result = _save_or_return_parquet_files_from_content(
        zip_content, save_to_path=tmp_path
    )

    # When a save path is given nothing is returned; output goes to disk.
    assert result is None
    saved_files = list(tmp_path.glob("*.parquet"))
    assert len(saved_files) == 1
    # Pin the exact output name: the ``.csv`` extension must be swapped for
    # ``.parquet``. A suffix check would be vacuous after the glob above.
    assert saved_files[0].name == "test_data.parquet"
    df = pd.read_parquet(saved_files[0])
    assert len(df) == 2

def test_save_parquet_to_path(self, tmp_path):
"""Test saving parquet files to a path."""
zip_content = self._create_zip_with_parquet()
Expand Down Expand Up @@ -224,7 +278,7 @@ def test_raises_on_empty_zip(self):
with zipfile.ZipFile(zip_buffer, "w") as z:
z.writestr("readme.md", "Not a data file")

with pytest.raises(ValueError, match="No parquet or txt files"):
with pytest.raises(ValueError, match="No parquet, csv, or txt files"):
_save_or_return_parquet_files_from_content(zip_buffer.getvalue())

def test_txt_iterator_raises(self):
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.