Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 25 additions & 21 deletions oda_reader/aiddata.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,39 +57,43 @@ def download_aiddata(
pre_process: bool = True,
) -> pd.DataFrame | None:
"""
Download the AidData from the website.
Download AidData from the AidData website. If save_to_path is not specified, a dataframe will be returned with the
corresponding settings (end_year, start_year and pre_process). If save_to_path is specified, the raw AidData Excel
file will be saved as a parquet file to the specified path ignoring the settings (end_year, start_year and
pre_process).

Args:
save_to_path (Path | str): Path to save the raw data to.
start_year (int): The start year of the data to return. This will filter based on commitment year. Optional
end_year (int): The end year of the data to return. This will filter based on commitment year. Optional
pre_process (bool): Whether to preprocess the data. Defaults to True.
start_year (int): Optional parameter indicating the start year of the data to return. This will filter based on
commitment year. If save_to_path is specified, the saved parquet file won't take into account start_year.
end_year (int): Optional parameter indicating the end year of the data to return. This will filter based on
commitment year. If save_to_path is specified, the saved parquet file won't take into account end_year.
pre_process (bool): Whether to preprocess the data. Defaults to True. If save_to_path is specified, the saved
parquet file won't be preprocessed.
Returns:
pd.DataFrame: The AidData data.
pd.DataFrame: The AidData data if no save_to_path is specified.

"""

# Get data
df = bulk_download_aiddata()
df = bulk_download_aiddata(save_to_path=save_to_path)

# Filter years, if needed
df = filter_years(df=df, start_year=start_year, end_year=end_year)
if not save_to_path:

# get scheme for dtypes and column names
schema = read_schema_translation(version="aidData")
# Filter years, if needed
df = filter_years(df=df, start_year=start_year, end_year=end_year)

# Convert dtypes
df = convert_dtypes(df, schema=schema)
# get scheme for dtypes and column names
schema = read_schema_translation(version="aidData")

# rename/remove columns, convert bool columns
if pre_process:
df = preprocess(df, schema)
# Convert dtypes
df = convert_dtypes(df, schema=schema)

# remove columns where all rows are NaN
df = df.dropna(axis=1, how="all")
# rename/remove columns, convert bool columns
if pre_process:
df = preprocess(df, schema)

if save_to_path:
df.to_parquet(save_to_path)
return None
# remove columns where all rows are NaN
df = df.dropna(axis=1, how="all")

return df
return df
7 changes: 6 additions & 1 deletion oda_reader/download/download_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def _save_or_return_excel_files_from_content(

if save_to_path:
save_to_path.mkdir(parents=True, exist_ok=True)
output_file = save_to_path / Path(excel_file).name
output_file = save_to_path / Path(excel_file).with_suffix(".parquet").name
logger.info(f"Saving {excel_file} as parquet to {output_file}")
df = df.astype(
{
Expand Down Expand Up @@ -539,3 +539,8 @@ def get_bulk_file_id(
parquet_link = match.group(1).strip()

return parquet_link.split("=")[-1]


if __name__ == "__main__":

bulk_download_aiddata(save_to_path='/Users/miguelharoruiz/Desktop')