diff --git a/CHANGELOG.md b/CHANGELOG.md index 48ffad8..de2fc44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ # Changelog for oda_reader +## 1.4.1 (2025-12-19) +- Extends bulk download auto-detection to support `.csv` files in addition to `.txt` files. + ## 1.4.0 (2025-12-19) - Adds `bulk_download_dac2a()` function for bulk downloading the full DAC2A dataset. -- Auto-detects file types (parquet vs txt/csv) in bulk downloads, removing the need for the `is_txt` parameter. +- Auto-detects file types (parquet or txt) in bulk downloads, removing the need for the `is_txt` parameter. - Auto-detects CSV delimiters (comma, pipe, tab, semicolon) when reading txt files from bulk downloads. - Deprecates the `is_txt` parameter in `bulk_download_parquet()`. The parameter is still accepted for backward compatibility but emits a deprecation warning and will be removed in a future major release. - Adds pytest and pytest-mock to dev dependencies for improved testing support. diff --git a/pyproject.toml b/pyproject.toml index a153af4..f97a4e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "oda_reader" -version = "1.4.0" +version = "1.4.1" description = "A simple package to import ODA data from the OECD's API and AidData's database" readme = "README.md" license = "MIT" diff --git a/src/oda_reader/download/download_tools.py b/src/oda_reader/download/download_tools.py index 09c2edb..c3b0130 100644 --- a/src/oda_reader/download/download_tools.py +++ b/src/oda_reader/download/download_tools.py @@ -226,16 +226,16 @@ def _save_or_return_parquet_files_from_content( *, as_iterator: bool = False, ) -> list[pd.DataFrame] | None | typing.Iterator[pd.DataFrame]: - """Extract parquet or txt (CSV) files from a zip archive. + """Extract parquet, csv, or txt files from a zip archive. If `save_to_path` is provided the files are extracted and written to disk. Otherwise the contents are returned either as a list of `DataFrame` objects or, when `as_iterator` is `True`, as an iterator yielding one `DataFrame` per row group (parquet only). - The function auto-detects whether the zip contains parquet or txt files. - Txt files have their delimiter auto-detected (comma or pipe) and are - converted to parquet when saving. + The function auto-detects whether the zip contains parquet, csv, or txt files. + CSV/txt files have their delimiter auto-detected (comma, pipe, tab, etc.) and + are converted to parquet when saving. Args: response_content: Bytes or `Path` pointing to the zipped file. @@ -252,7 +252,11 @@ def _save_or_return_parquet_files_from_content( with _open_zip(response_content=response_content) as z: parquet_files = [name for name in z.namelist() if name.endswith(".parquet")] - txt_files = [name for name in z.namelist() if name.endswith(".txt")] + csv_files = [ + name + for name in z.namelist() + if name.endswith(".txt") or name.endswith(".csv") + ] # Determine which file type we're dealing with if parquet_files: @@ -276,15 +280,18 @@ def _save_or_return_parquet_files_from_content( logger.info(f"Reading {len(parquet_files)} parquet files.") return [pd.read_parquet(z.open(file)) for file in parquet_files] - elif txt_files: + elif csv_files: if as_iterator: - raise ValueError("Streaming not supported for txt files.") + raise ValueError("Streaming not supported for csv/txt files.") if save_to_path: save_to_path.mkdir(parents=True, exist_ok=True) - for file_name in txt_files: + for file_name in csv_files: clean_name = ( - file_name.replace(".txt", ".parquet").lower().replace(" ", "_") + file_name.replace(".txt", ".parquet") + .replace(".csv", ".parquet") + .lower() + .replace(" ", "_") ) logger.info(f"Saving {clean_name}") with z.open(file_name) as f_in: @@ -299,9 +306,9 @@ def _save_or_return_parquet_files_from_content( ).to_parquet(save_to_path / clean_name) return None - logger.info(f"Reading {len(txt_files)} txt files.") + logger.info(f"Reading {len(csv_files)} csv/txt files.") dfs = [] - for file_name in txt_files: + for file_name in csv_files: with z.open(file_name) as f_in: delimiter = _detect_delimiter(f_in) logger.info(f"Detected delimiter for {file_name}: '{delimiter}'") @@ -317,14 +324,14 @@ def _save_or_return_parquet_files_from_content( return dfs else: - raise ValueError("No parquet or txt files found in the zip archive.") + raise ValueError("No parquet, csv, or txt files found in the zip archive.") def _save_or_return_parquet_files_from_txt_in_zip( response_content: bytes | Path, save_to_path: Path | str | None = None, ) -> list[pd.DataFrame] | None: - """Extract a `.txt` file from a zipped archive supplied as bytes or a file path. + """Extract csv or txt files from a zipped archive supplied as bytes or a file path. The file is read as CSV (with auto-detected delimiter) and optionally saved as a parquet file. @@ -341,15 +348,22 @@ def _save_or_return_parquet_files_from_txt_in_zip( save_to_path = Path(save_to_path).expanduser().resolve() if save_to_path else None with _open_zip(response_content=response_content) as z: - # Find all txt files in the zip archive - files = [name for name in z.namelist() if name.endswith(".txt")] + # Find all csv/txt files in the zip archive + files = [ + name + for name in z.namelist() + if name.endswith(".txt") or name.endswith(".csv") + ] # If save_to_path is provided, save the files to the path if save_to_path: save_to_path.mkdir(parents=True, exist_ok=True) for file_name in files: clean_name = ( - file_name.replace(".txt", ".parquet").lower().replace(" ", "_") + file_name.replace(".txt", ".parquet") + .replace(".csv", ".parquet") + .lower() + .replace(" ", "_") ) logger.info(f"Saving {clean_name}") with z.open(file_name) as f_in: @@ -501,7 +515,7 @@ def bulk_download_parquet( """Download data from the stats.oecd.org file download service. Certain data files are available as a bulk download. This function - downloads the files (parquet or txt/csv) and returns a single DataFrame. + downloads the files (parquet, csv, or txt) and returns a single DataFrame. The file type is auto-detected from the zip contents. Args: diff --git a/tests/download/unit/test_download_tools.py b/tests/download/unit/test_download_tools.py index 9191eed..9e638c8 100644 --- a/tests/download/unit/test_download_tools.py +++ b/tests/download/unit/test_download_tools.py @@ -140,7 +140,7 @@ def _create_zip_with_parquet(self) -> bytes: return zip_buffer.getvalue() def _create_zip_with_txt(self, delimiter: str = ",") -> bytes: - """Create a zip file containing a txt/CSV file.""" + """Create a zip file containing a txt file.""" if delimiter == "|": csv_content = "col1|col2|col3\n1|2|3\n4|5|6" else: @@ -151,6 +151,18 @@ def _create_zip_with_txt(self, delimiter: str = ",") -> bytes: z.writestr("test_data.txt", csv_content.encode("utf-8")) return zip_buffer.getvalue() + def _create_zip_with_csv(self, delimiter: str = ",") -> bytes: + """Create a zip file containing a .csv file.""" + if delimiter == "|": + csv_content = "col1|col2|col3\n1|2|3\n4|5|6" + else: + csv_content = "col1,col2,col3\n1,2,3\n4,5,6" + + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, "w") as z: + z.writestr("test_data.csv", csv_content.encode("utf-8")) + return zip_buffer.getvalue() + def test_auto_detect_parquet_files(self): """Test that parquet files are auto-detected and read correctly.""" zip_content = self._create_zip_with_parquet() @@ -187,6 +199,48 @@ def test_auto_detect_txt_files_pipe(self): assert list(df.columns) == ["col1", "col2", "col3"] assert len(df) == 2 + def test_auto_detect_csv_files(self): + """Test that .csv files are auto-detected and read correctly.""" + zip_content = self._create_zip_with_csv(delimiter=",") + + result = _save_or_return_parquet_files_from_content(zip_content) + + assert result is not None + assert len(result) == 1 + assert isinstance(result[0], pd.DataFrame) + assert list(result[0].columns) == ["col1", "col2", "col3"] + assert len(result[0]) == 2 + + def test_auto_detect_csv_files_pipe(self): + """Test that pipe-delimited .csv files are auto-detected.""" + zip_content = self._create_zip_with_csv(delimiter="|") + + result = _save_or_return_parquet_files_from_content(zip_content) + + assert result is not None + assert len(result) == 1 + df = result[0] + assert isinstance(df, pd.DataFrame) + assert list(df.columns) == ["col1", "col2", "col3"] + assert len(df) == 2 + + def test_save_csv_as_parquet_to_path(self, tmp_path): + """Test that .csv files are converted to parquet when saving.""" + zip_content = self._create_zip_with_csv() + + result = _save_or_return_parquet_files_from_content( + zip_content, save_to_path=tmp_path + ) + + assert result is None + saved_files = list(tmp_path.glob("*.parquet")) + assert len(saved_files) == 1 + # Verify conversion to parquet with correct name + assert saved_files[0].suffix == ".parquet" + assert "test_data" in saved_files[0].name + df = pd.read_parquet(saved_files[0]) + assert len(df) == 2 + def test_save_parquet_to_path(self, tmp_path): """Test saving parquet files to a path.""" zip_content = self._create_zip_with_parquet() @@ -224,7 +278,7 @@ def test_raises_on_empty_zip(self): with zipfile.ZipFile(zip_buffer, "w") as z: z.writestr("readme.md", "Not a data file") - with pytest.raises(ValueError, match="No parquet or txt files"): + with pytest.raises(ValueError, match="No parquet, csv, or txt files"): _save_or_return_parquet_files_from_content(zip_buffer.getvalue()) def test_txt_iterator_raises(self): diff --git a/uv.lock b/uv.lock index 9512dd9..41eae9f 100644 --- a/uv.lock +++ b/uv.lock @@ -789,7 +789,7 @@ wheels = [ [[package]] name = "oda-reader" -version = "1.4.0" +version = "1.4.1" source = { editable = "." } dependencies = [ { name = "filelock" },