From bb9412a7589d3939830cea1dec82ecb8029c35a6 Mon Sep 17 00:00:00 2001 From: Harish Chandrashekar Date: Mon, 2 Feb 2026 05:16:32 +0000 Subject: [PATCH 1/4] Modified manifest for better execution --- scripts/us_census/pep/annual_population/manifest.json | 3 ++- scripts/us_census/pep/annual_population/preprocess.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/us_census/pep/annual_population/manifest.json b/scripts/us_census/pep/annual_population/manifest.json index bb555bf4a3..3f1cef3508 100644 --- a/scripts/us_census/pep/annual_population/manifest.json +++ b/scripts/us_census/pep/annual_population/manifest.json @@ -8,7 +8,8 @@ "provenance_url": "https://www2.census.gov/programs-surveys/popest/", "provenance_description": "U.S. Census Bureau PEP Annual Population", "scripts": [ - "preprocess.py" + "preprocess.py --mode=download", + "preprocess.py --mode=process" ], "source_files": [ "input_files/*" diff --git a/scripts/us_census/pep/annual_population/preprocess.py b/scripts/us_census/pep/annual_population/preprocess.py index 8e092989d3..de4a1321a8 100644 --- a/scripts/us_census/pep/annual_population/preprocess.py +++ b/scripts/us_census/pep/annual_population/preprocess.py @@ -1054,6 +1054,7 @@ def process(input_path, cleaned_csv_file_path: str, mcf_file_path: str, except Exception as e: logging.fatal(f"Error while processing files {e}") + logging.info(f"No of files to be processed {total_files_to_process}") logging.info(f"No of files processed {processed_count}") if processed_count >= total_files_to_process & total_files_to_process > 0: final_df["Year"] = final_df["Year"].astype("int") From 5eeb4829e552627a8ffa1fecf7558e8669e66e99 Mon Sep 17 00:00:00 2001 From: Harish Chandrashekar Date: Tue, 3 Feb 2026 18:53:28 +0000 Subject: [PATCH 2/4] script exits in case download fails --- .../pep/annual_population/preprocess.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git 
a/scripts/us_census/pep/annual_population/preprocess.py b/scripts/us_census/pep/annual_population/preprocess.py index de4a1321a8..dab5c2215c 100644 --- a/scripts/us_census/pep/annual_population/preprocess.py +++ b/scripts/us_census/pep/annual_population/preprocess.py @@ -1142,7 +1142,15 @@ def download_files(): with open(os.path.join(_INPUT_FILE_PATH, file_name_to_save), 'wb') as f: f.write(response.content) - file_to_dowload['is_downloaded'] = True + file_to_dowload['is_downloaded'] = True + else: + logging.error( + f"Failed to download {url} with status code {response.status_code}" + ) + file_to_dowload['is_downloaded'] = False + for file_to_dowload in _FILES_TO_DOWNLOAD: + if not file_to_dowload['is_downloaded']: + raise Exception("Failed to download all files.") except Exception as e: logging.fatal(f"Error occurred in download method {e}") @@ -1161,7 +1169,11 @@ def main(_): if mode == "" or mode == "download": add_future_year_urls() - download_files() + try: + download_files() + except Exception as e: + logging.fatal(f"Download failed: {e}") + sys.exit(1) if mode == "" or mode == "process": process(_INPUT_FILE_PATH, cleaned_csv_path, mcf_path, tmcf_path, is_summary_levels) From 4561600fa5768b6bd5d9e43745bffe479c6ebd3c Mon Sep 17 00:00:00 2001 From: Harish Chandrashekar Date: Mon, 16 Feb 2026 08:24:00 +0000 Subject: [PATCH 3/4] resolved internal comments --- scripts/us_census/pep/annual_population/preprocess.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/scripts/us_census/pep/annual_population/preprocess.py b/scripts/us_census/pep/annual_population/preprocess.py index dab5c2215c..e31608e397 100644 --- a/scripts/us_census/pep/annual_population/preprocess.py +++ b/scripts/us_census/pep/annual_population/preprocess.py @@ -30,8 +30,7 @@ 2000 - 2009 Data is available in State File in the year 2000-2009 2010 - 2020 Processed As Is -Before running this module, run download.sh script, it downloads required -input files, creates 
necessary folders for processing. +This module downloads and processes the input files. Folder information input_files - downloaded files (from US census website) are placed here output_files - output files (mcf, tmcf and csv are written here) @@ -1169,11 +1168,8 @@ def main(_): if mode == "" or mode == "download": add_future_year_urls() - try: - download_files() - except Exception as e: - logging.fatal(f"Download failed: {e}") - sys.exit(1) + download_files() + if mode == "" or mode == "process": process(_INPUT_FILE_PATH, cleaned_csv_path, mcf_path, tmcf_path, is_summary_levels) From 39654fba560930beba6847ff3460910ab45c011a Mon Sep 17 00:00:00 2001 From: Harish Chandrashekar Date: Mon, 16 Feb 2026 12:18:01 +0000 Subject: [PATCH 4/4] resolved core team comments --- .../pep/annual_population/preprocess.py | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/scripts/us_census/pep/annual_population/preprocess.py b/scripts/us_census/pep/annual_population/preprocess.py index e31608e397..79a8f13dbc 100644 --- a/scripts/us_census/pep/annual_population/preprocess.py +++ b/scripts/us_census/pep/annual_population/preprocess.py @@ -1118,22 +1118,22 @@ def download_files(): if not os.path.exists(_INPUT_FILE_PATH): os.makedirs(_INPUT_FILE_PATH) try: - for file_to_dowload in _FILES_TO_DOWNLOAD: + for file_to_download in _FILES_TO_DOWNLOAD: file_name_to_save = None - url = file_to_dowload['download_path'] - if 'file_name' in file_to_dowload and len( - file_to_dowload['file_name'] > 5): - file_name_to_save = file_to_dowload['file_name'] + url = file_to_download['download_path'] + if 'file_name' in file_to_download and len( + file_to_download['file_name']) > 5: + file_name_to_save = file_to_download['file_name'] else: file_name_to_save = url.split('/')[-1] - if 'file_path' in file_to_dowload: + if 'file_path' in file_to_download: if not os.path.exists( os.path.join(_INPUT_FILE_PATH, - file_to_dowload['file_path'])): + 
file_to_download['file_path'])): os.makedirs( os.path.join(_INPUT_FILE_PATH, - file_to_dowload['file_path'])) - file_name_to_save = file_to_dowload[ + file_to_download['file_path'])) + file_name_to_save = file_to_download[ 'file_path'] + file_name_to_save response = download_with_retry(url, file_name_to_save) @@ -1141,15 +1141,21 @@ def download_files(): with open(os.path.join(_INPUT_FILE_PATH, file_name_to_save), 'wb') as f: f.write(response.content) - file_to_dowload['is_downloaded'] = True + file_to_download['is_downloaded'] = True else: logging.error( f"Failed to download {url} with status code {response.status_code}" ) - file_to_dowload['is_downloaded'] = False - for file_to_dowload in _FILES_TO_DOWNLOAD: - if not file_to_dowload['is_downloaded']: - raise Exception("Failed to download all files.") + file_to_download['is_downloaded'] = False + failed_downloads = [ + file_to_download['download_path'] + for file_to_download in _FILES_TO_DOWNLOAD + if not file_to_download.get('is_downloaded', False) + ] + if failed_downloads: + raise Exception( + f"Failed to download {len(failed_downloads)} files: {failed_downloads}" + ) except Exception as e: logging.fatal(f"Error occurred in download method {e}")