diff --git a/.gitignore b/.gitignore index 0ae86e0..e888a89 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ .ipynb_checkpoints build/ *.egg-info/ +__pycache__/ .pyc diff --git a/dist/neonutilities-1.1.1-py3-none-any.whl b/dist/neonutilities-1.1.1-py3-none-any.whl index 070b5b4..28885e5 100644 Binary files a/dist/neonutilities-1.1.1-py3-none-any.whl and b/dist/neonutilities-1.1.1-py3-none-any.whl differ diff --git a/dist/neonutilities-1.1.1.tar.gz b/dist/neonutilities-1.1.1.tar.gz index 43819ea..e9ded00 100644 Binary files a/dist/neonutilities-1.1.1.tar.gz and b/dist/neonutilities-1.1.1.tar.gz differ diff --git a/requirements.txt b/requirements.txt index a9cd6f4..874cd35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ pyarrow>=16.0.0 pyproj>=3.6.1 requests>=2.31.0 tqdm>=4.66.2 +google_crc32c>=1.7.0 \ No newline at end of file diff --git a/src/neonutilities/__init__.py b/src/neonutilities/__init__.py index 7132757..4ee09e9 100644 --- a/src/neonutilities/__init__.py +++ b/src/neonutilities/__init__.py @@ -12,5 +12,4 @@ stack_by_table, load_by_product, dataset_query, -) - +) \ No newline at end of file diff --git a/src/neonutilities/__pycache__/__init__.cpython-311.pyc b/src/neonutilities/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index b304059..0000000 Binary files a/src/neonutilities/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/aop_download.cpython-311.pyc b/src/neonutilities/__pycache__/aop_download.cpython-311.pyc deleted file mode 100644 index 45a7e0d..0000000 Binary files a/src/neonutilities/__pycache__/aop_download.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/citation.cpython-311.pyc b/src/neonutilities/__pycache__/citation.cpython-311.pyc deleted file mode 100644 index b5d53d2..0000000 Binary files a/src/neonutilities/__pycache__/citation.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/files_by_uri.cpython-311.pyc b/src/neonutilities/__pycache__/files_by_uri.cpython-311.pyc deleted file mode 100644 index a61e63f..0000000 Binary files a/src/neonutilities/__pycache__/files_by_uri.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/get_issue_log.cpython-311.pyc b/src/neonutilities/__pycache__/get_issue_log.cpython-311.pyc deleted file mode 100644 index c4eb540..0000000 Binary files a/src/neonutilities/__pycache__/get_issue_log.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/read_table_neon.cpython-311.pyc b/src/neonutilities/__pycache__/read_table_neon.cpython-311.pyc deleted file mode 100644 index fdd4465..0000000 Binary files a/src/neonutilities/__pycache__/read_table_neon.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/tabular_download.cpython-311.pyc b/src/neonutilities/__pycache__/tabular_download.cpython-311.pyc deleted file mode 100644 index 424b40a..0000000 Binary files a/src/neonutilities/__pycache__/tabular_download.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/__pycache__/unzip_and_stack.cpython-311.pyc b/src/neonutilities/__pycache__/unzip_and_stack.cpython-311.pyc deleted file mode 100644 index a3ed8da..0000000 Binary files a/src/neonutilities/__pycache__/unzip_and_stack.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/aop_download.py b/src/neonutilities/aop_download.py index b575e55..ef5b047 100644 --- a/src/neonutilities/aop_download.py +++ 
b/src/neonutilities/aop_download.py @@ -15,23 +15,29 @@ @author: Claire Lunch (clunch@battelleecology.org) @author: Christine Laney (claney@battelleecology.org) +Updated Oct 2025 to include options to skip files if they already exist locally and +to validate existing files against files published on the Data Portal with checksums. + """ # %% imports from datetime import datetime +import glob import importlib_resources import logging import pandas as pd import numpy as np import os import re + +# import time from time import sleep from tqdm import tqdm # local imports from . import __resources__ from .helper_mods.api_helpers import get_api -from .helper_mods.api_helpers import download_file +from .helper_mods.api_helpers import download_file, calculate_crc32c from .helper_mods.metadata_helpers import convert_byte_size from .get_issue_log import get_issue_log from .citation import get_citation @@ -104,8 +110,8 @@ def get_file_urls(urls, token=None): Notes -------- - The function makes API calls to each URL in the 'urls' list and retrieves the file information. - It also retrieves the release information from the response JSON. + The function makes API calls to each URL in the 'urls' list and retrieves the file + information. It also retrieves the release information from the response JSON. If the API call fails, it prints a warning message and continues with the next URL. """ @@ -206,7 +212,9 @@ def get_neon_sites(): neon_sites_df = pd.read_csv(neon_sites_file) neon_sites_list = list(neon_sites_df["field_site_id"]) - neon_sites_list.append("CHEQ") + neon_sites_list.append( + "CHEQ" + ) # append the CHEQ site, a separate flight area than STEI-TREE return neon_sites_list @@ -217,6 +225,52 @@ def get_data_product_name(dpid): return product_name +def check_exists_and_checksum(row, download_path): + """ + Check if a file exists locally and if its CRC32C checksum matches the expected value. + Used in the skip_if_exists option of by_tile_aop and by_file_aop. + + Parameters + ---------- + row : pandas.Series + A row from a DataFrame containing at least a 'url' and optionally a 'crc32c' field. + download_path : str + The root directory where files are downloaded. + + Returns + ------- + pandas.Series + A Series of two boolean values: + - exists_locally: True if the file exists locally, False otherwise. + - checksum_matches: True if the local file's CRC32C matches the expected value, False otherwise. + + Example + ------- + >>> file_url_df[["exists_locally", "checksum_matches"]] = file_url_df.apply( + ... lambda row: check_exists_and_checksum(row, download_path), axis=1 + ... 
) """ # print(row["url"]) pathparts = row["url"].split("/") file_path = os.path.join(download_path, *pathparts[3:]) # file_path = os.path.join(download_path,"\\".join(pathparts[3:])) # print(file_path) exists = os.path.exists(file_path) # print(exists) if not exists: # print(f'{pathparts[-1:]} does not exist!') return pd.Series([False, False]) expected_crc32c = row.get("crc32c") # print(expected_crc32c) if expected_crc32c: local_crc32c = calculate_crc32c(file_path) matches = local_crc32c.zfill(8) == str(expected_crc32c).zfill(8) # return pd.Series({"exists_locally": True, "checksum_matches": matches}) return pd.Series([True, matches]) return pd.Series([True, False]) # return pd.Series({"exists_locally": True, "checksum_matches": False}) + + # %% functions to validate inputs for by_file_aop and by_tile_aop @@ -356,7 +410,7 @@ def validate_aop_l3_dpid(dpid): def check_field_spectra_dpid(dpid): if dpid == "DP1.30012.001": raise ValueError( - f"NEON {dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data." + f"NEON {dpid} is the Field spectral data product, which is published as tabular data. Use zips_by_product() or load_by_product() to download these data." ) @@ -402,6 +456,28 @@ def validate_year(year): ) +def validate_overwrite(overwrite): + """ + Validates that overwrite is one of the accepted options: 'yes', 'no', or 'prompt'. + Raises a ValueError if not. + """ + valid_options = {"yes", "no", "prompt"} + if overwrite not in valid_options: + raise ValueError(f"overwrite must be one of {valid_options}. ") + + +def validate_skip_if_exists(skip_if_exists): + """ + Validates that skip_if_exists is a boolean (True or False). + Raises a ValueError if not. + """ + if not isinstance(skip_if_exists, bool): + raise ValueError( + f"skip_if_exists must be a boolean (True or False). " + f"Received: '{skip_if_exists}' of type {type(skip_if_exists)}" + ) + + def check_aop_dpid(response_dict, dpid): if response_dict["data"]["productScienceTeamAbbr"] != "AOP": logging.info( @@ -678,6 +754,9 @@ def by_file_aop( savepath=None, chunk_size=1024, token=None, + verbose=False, + skip_if_exists=False, + overwrite="prompt", ): """ This function queries the NEON API for AOP data by site, year, and product, and downloads all @@ -712,23 +791,51 @@ def by_file_aop( Size in bytes of chunk for chunked download. Defaults to 1024. token: str, optional - User-specific API token from data.neonscience.org user account. See - https://data.neonscience.org/data-api/rate-limiting/ for details about - API rate limits and user tokens. + User-specific API token from data.neonscience.org user account. Defaults to None. + See https://data.neonscience.org/data-api/rate-limiting/ for details about API + rate limits and user tokens. + + verbose: bool, optional + If set to True, the function will print more detailed information about the download process. + + skip_if_exists: bool, optional + If set to True, the function will skip downloading files that already exist in the + savepath and are valid (local checksums match the checksums of the published file). + Defaults to False. If any local file checksums don't match those of files published + on the NEON Data Portal, the user will be prompted to skip these files or overwrite + the existing files with the new ones (see overwrite input).
+ + overwrite: str, optional + Must be one of: + 'yes' - overwrite mismatched files without prompting, + 'no' - don't overwrite mismatched files (skip them, no prompt), + 'prompt' - prompt the user (y/n) to overwrite mismatched files after displaying them (default). + If skip_if_exists is False, this parameter is ignored, and any existing files in + the savepath will be overwritten according to the function's default behavior. Returns -------- - None; data are downloaded to the local directory specified. + None; data are downloaded to the directory specified (savepath) or the current working directory. + If data already exist in the expected path, they will be overwritten by default. To check for + existing files before downloading, set skip_if_exists=True along with an overwrite option (y/n/prompt). Examples -------- - >>> by_file_aop(dpid="DP3.30015.001", site="MCRA", year="2021", savepath="./test_download") - # This will download 2021 canopy height model data from McRae Creek to the './test_download' directory. + >>> by_file_aop(dpid="DP3.30015.001", + site="MCRA", + year="2021", + savepath="./test_download", + skip_if_exists=True) + # This downloads the 2021 Canopy Height Model data from McRae Creek to the './test_download' directory. + # If any files already exist in the savepath, they will be checked and skipped if they are valid. + # The user will be prompted to overwrite or skip downloading any existing files that do not match + # the latest published data on the NEON Data Portal. Notes -------- - The function creates a folder in the 'savepath' directory, containing all AOP files meeting the query criteria. - If 'savepath' is not provided, data are downloaded to the working directory. + This function creates a folder named by the Data Product ID (DPID; e.g. DP3.30015.001) in the + 'savepath' directory, containing all AOP files meeting the query criteria. If 'savepath' is + not provided, data are downloaded to the working directory, in a folder named by the DPID. """ # raise value error and print message if dpid isn't formatted as expected @@ -751,6 +858,19 @@ def by_file_aop( year = str(year) # cast year to string (if it's not already) validate_year(year) + # raise value error and print message if skip_if_exists input is not valid (boolean) + validate_skip_if_exists(skip_if_exists) + + # raise value error and print message if overwrite input is not valid (yes, no, prompt) + validate_overwrite(overwrite) + + # warn if overwrite is set but skip_if_exists is False + if not skip_if_exists and overwrite != "prompt": + logging.info( + "WARNING: overwrite option only applies if skip_if_exists=True. " + "By default, any existing files will be overwritten unless you select skip_if_exists=True and overwrite='no' or 'prompt' (default)." + ) + + # if token is an empty string, set to None if token == "": token = None @@ -766,8 +886,6 @@ def by_file_aop( # check that token was used if token and "x-ratelimit-limit" in response.headers: check_token(response) - # if response.headers['x-ratelimit-limit'] == '200': - # print('API token was not recognized.
Public rate limit applied.\n') # get the request response dictionary response_dict = response.json() @@ -794,7 +912,6 @@ def by_file_aop( # get the number of files in the dataframe, if there are no files to download, return if len(file_url_df) == 0: - # print("No data files found.") logging.info("No NEON data files found.") return @@ -806,12 +923,10 @@ def by_file_aop( ) else: # log provisional not included message and filter to the released data - # logging.info( - # "Provisional data are not included. To download provisional data, use input parameter include_provisional=True.") file_url_df = file_url_df[file_url_df["release"] != "PROVISIONAL"] if len(file_url_df) == 0: logging.info( - "NEON Provisional data are not included. To download provisional data, use input parameter include_provisional=True." + "Provisional NEON data are not included. To download provisional data, use input parameter include_provisional=True." ) num_files = len(file_url_df) @@ -834,9 +949,9 @@ def by_file_aop( f"Continuing will download {num_files} NEON data files totaling approximately {download_size}. Do you want to proceed? (y/n) " ) .strip() - .lower() + .lower() # lower or upper case 'y' will work != "y" - ): # lower or upper case 'y' will work + ): print("Download halted.") return @@ -849,15 +964,212 @@ def by_file_aop( # serially download all files, with progress bar files = list(file_url_df["url"]) - print( - f"Downloading {num_files} NEON data file(s) totaling approximately {download_size}\n" - ) - sleep(1) - for file in tqdm(files): - download_file( - url=file, savepath=download_path, chunk_size=chunk_size, token=token + + # different messages depending on whether skip_if_exists is True or False + if skip_if_exists: + logging.info( + f"Found {num_files} NEON data files totaling approximately {download_size}.\n" + "Files in savepath will be checked and skipped if they exist and match the latest version." + ) + else: + logging.info( + f"Downloading {num_files} NEON data files totaling approximately {download_size}\n" + ) + # verbose option to list all files being downloaded + if verbose: + logging.info("Downloading the following data and metadata files:") + for file in files: + logging.info(file.replace("https://storage.googleapis.com", f"{dpid}")) + + if skip_if_exists: + # Existence and checksum check + file_url_df[["exists_locally", "checksum_matches"]] = file_url_df.apply( + lambda row: check_exists_and_checksum(row, download_path), axis=1 + ) + + # Handle the various cases + # 1. Skip files that exist locally and checksums match those on the GCS (exists_locally and checksum_matches are both True). + # 2. Prompt the user to decide whether to download (overwrite) files that exist locally but checksums don't match (exists_locally is True and checksum_matches is False). + # 3. Download files if they don't already exist locally (exists_locally is False). + # 4. If there are extra files locally that don't exist on the GCS, display a warning message. 
+ + # Skipped files + files_to_skip = file_url_df[ + (file_url_df["exists_locally"]) & (file_url_df["checksum_matches"]) + ] + + # Files to prompt for overwrite + mismatched_files = file_url_df[ + (file_url_df["exists_locally"]) & (~file_url_df["checksum_matches"]) + ] + + # Files to download (do not exist locally) + files_to_download = file_url_df[~file_url_df["exists_locally"]] + + # Identify README files (case-insensitive, .txt) + readme_files = file_url_df[ + file_url_df["name"].str.contains("readme", case=False, na=False) + ] + + # If any files are missing or mismatched, download the README file + if ( + not files_to_download.empty or not mismatched_files.empty + ) and not readme_files.empty: + logging.info("Downloading README file") + for idx, row in tqdm(readme_files.iterrows(), total=len(readme_files)): + download_file( + url=row["url"], + savepath=download_path, + chunk_size=chunk_size, + token=token, + ) + + # display a warning message if there are extra files locally that + # are not published on the Portal as part of the Data Product + + # get all expected file paths (relative to download_path) + expected_files = set() + for _, row in file_url_df.iterrows(): + pathparts = row["url"].split("/") + expected_files.add( + os.path.normpath(os.path.join(download_path, *pathparts[3:])) + ) + + # get the set of all AOP bucket names from the URLs + gcs_bucket = { + url.split("/")[3] + for url in file_url_df["url"] + if len(url.split("/")) > 3 + and url.split("/")[3] + in ["neon-aop-products", "neon-aop-provisional-products"] + } + + # get the domain from the URLs + domain = { + val + for url in file_url_df["url"] + for val in [url.split("/")[6]] + if re.fullmatch(r"D\d{2}", val) + } + + # get the Year_Site_Visit from the URLs + ysv = { + val + for url in file_url_df["url"] + for val in [url.split("/")[7]] + if re.fullmatch(r"\d{4}_[A-Z]{4}_\d+", val) + } + + # recursively find all files in the expected path + local_path = os.path.normpath( + os.path.join( + download_path, + gcs_bucket.pop(), + str(year), + "FullSite", + domain.pop(), + ysv.pop(), + ) + ) + + # get the set of all local files (normalized paths) + all_local_files = set( + os.path.normpath(f) + for f in glob.glob(os.path.join(local_path, "**"), recursive=True) + if os.path.isfile(f) ) + # find extra files and display warning message if any exist + extra_files = sorted(all_local_files - expected_files) + if extra_files: + logging.info( + f"WARNING: Files found in the local folder that do not exist in the latest version of data for {dpid}. " + f"\nYou will need to delete the extra files below if you want the download folder to match the latest available data contents," + f"\nor to be safe, delete the folder {local_path} and re-download." + "\nExtra Files Found:" + ) + for f in sorted(extra_files): + logging.info(f" {os.path.abspath(f)}") + + # files that do not exist locally + if not files_to_download.empty: + logging.info( + "The following files will be downloaded (they do not already exist locally):" + ) + for f in files_to_download["name"].sort_values(): + logging.info(f" {f}") + + # Download files that do not exist locally + for _, row in tqdm( + files_to_download.iterrows(), total=len(files_to_download) + ): + download_file( + url=row["url"], + savepath=download_path, + chunk_size=chunk_size, + token=token, + ) + + if not files_to_skip.empty: + logging.info( + "The remainder of the files in savepath will not be downloaded. " + "They already exist locally and match the latest available data." 
+ ) + + # mismatched files (local checksums are not the same as those on the Data Portal) + # filter out files where the name contains "readme" (case-insensitive) + # readme files do not have a checksum so would always fail, these should be downloaded by default + mismatched_files_no_readme = mismatched_files[ + ~mismatched_files["name"].str.contains("readme", case=False, na=False) + ] + + # message if there is nothing to download - all files exist and they all match the checksums + if files_to_download.empty and mismatched_files_no_readme.empty: + logging.info( + "All files already exist locally and match the latest available data. Skipping download." + ) + + if not mismatched_files_no_readme.empty: + logging.info( + "The following files exist locally but have a different checksum than the remote files:" + ) + for f in sorted(mismatched_files_no_readme["name"]): + logging.info(f" {f}") + # determine whether to overwrite mismatched files + if overwrite == "yes": + response = "y" + elif overwrite == "no": + response = "n" + # logging.info("WARNING: Files where the checksum doesn't match the latest version will not be overwritten.") + else: # 'prompt' or any other value + response = ( + input( + "Do you want to overwrite these files with the latest version? (y/n) " + ) + .strip() + .lower() + ) + if response.lower() == "y": + logging.info("Overwriting these files with the latest available data.") + # download including the readme + for _, row in tqdm( + mismatched_files.iterrows(), total=len(mismatched_files) + ): + download_file( + url=row["url"], + savepath=download_path, + chunk_size=chunk_size, + token=token, + ) + else: + logging.info("Skipped overwriting files with mismatched checksums.") + + else: + for file in tqdm(files): + download_file( + url=file, savepath=download_path, chunk_size=chunk_size, token=token + ) + # download issue log table ilog = get_issue_log(dpid=dpid, token=None) if ilog is not None: @@ -911,6 +1223,8 @@ def by_tile_aop( chunk_size=1024, token=None, verbose=False, + skip_if_exists=False, + overwrite="prompt", ): """ This function queries the NEON API for AOP data by site, year, product, and @@ -929,13 +1243,13 @@ def by_tile_aop( year: str or int The four-digit year of data collection. - easting: int or list of int + easting: float/int or list of float/int A number or list containing the easting UTM coordinate(s) of the locations to download. - northing: int or list of int + northing: float/int or list of float/int A number or list containing the northing UTM coordinate(s) of the locations to download. - buffer: int, optional + buffer: float/int, optional Size, in meters, of the buffer to be included around the coordinates when determining which tiles to download. Defaults to 0. include_provisional: bool, optional @@ -956,26 +1270,55 @@ def by_tile_aop( Size in bytes of chunk for chunked download. Defaults to 1024. token: str, optional - User-specific API token from data.neonscience.org user account. See - https://data.neonscience.org/data-api/rate-limiting/ for details about + User-specific API token from data.neonscience.org user account. Defaults to None. + See https://data.neonscience.org/data-api/rate-limiting/ for details about API rate limits and user tokens. verbose: bool, optional - If set to True, the function will print out a list of the downloaded tiles before downloading. + If set to True, the function will print out a list of the tiles to be downloaded before downloading. Defaults to False. 
- Return + skip_if_exists: bool, optional + If set to True, the function will skip downloading files that already exist in the + savepath and are valid (local checksums match the checksums of the published file). + Defaults to False. If any local file checksums don't match those of files published + on the NEON Data Portal, the user will be prompted to skip these files or overwrite + the existing files with the new ones (see overwrite input). + + overwrite: str, optional + Must be one of: + 'yes' - overwrite mismatched files without prompting, + 'no' - don't overwrite mismatched files (skip them, no prompt), + 'prompt' - prompt the user (y/n) to overwrite mismatched files after displaying them (default). + If skip_if_exists is False, this parameter is ignored, and any existing files in + the savepath will be overwritten according to the function's default behavior. + + Returns -------- - None; data are downloaded to the local directory specified. + None; data are downloaded to the directory specified (savepath) or the current working directory. + If data already exist in the expected path, they will be overwritten by default. To check for + existing files before downloading, set skip_if_exists=True along with an overwrite option (y/n/prompt). Example -------- - >>> by_tile_aop(dpid="DP3.30015.001", site="MCRA", - easting=[566456, 566639], northing=[4900783, 4901094], - year="2021", savepath="../../test_download") - # This will download any tiles overlapping the specified UTM coordinates for + >>> by_tile_aop(dpid="DP3.30015.001", + site="MCRA", + easting=[566456, 566639], + northing=[4900783, 4901094], + year="2021", + savepath="./test_download", + skip_if_exists=True) + # This downloads any tiles overlapping the specified UTM coordinates for # 2021 canopy height model data from McRae Creek to the './test_download' directory. + # If any files already exist in the savepath, they will be checked and skipped if they are valid. + # The user will be prompted to overwrite or skip downloading any existing files that do not match + # the latest published data on the NEON Data Portal. + Notes + -------- + This function creates a folder named by the Data Product ID (DPID; e.g. DP3.30015.001) in the + 'savepath' directory, containing all AOP files meeting the query criteria. If 'savepath' is + not provided, data are downloaded to the working directory, in a folder named by the DPID. """ # raise value error and print message if dpid isn't formatted as expected @@ -998,6 +1341,20 @@ def by_tile_aop( year = str(year) # cast year to string (if it's not already) validate_year(year) + # raise value error and print message if skip_if_exists input is not valid (boolean) + validate_skip_if_exists(skip_if_exists) + + # raise value error and print message if overwrite input is not valid (yes, no, prompt) + validate_overwrite(overwrite) + + # warn if overwrite is set to yes or no, but skip_if_exists is False + if not skip_if_exists and overwrite != "prompt": + logging.info( + "WARNING: overwrite option only applies if skip_if_exists=True. By default, " + "any existing files in the expected directory will be overwritten unless " + "you select skip_if_exists=True and overwrite='no' or 'prompt' (default)." + ) + + # convert easting and northing to lists, if they are not already if type(easting) is not list: easting = [easting] @@ -1021,8 +1378,7 @@ def by_tile_aop( ) print(e) - # link easting and northing coordinates - as a list of tuples ?
- # coord_tuples = [(easting[i], northing[i]) for i in range(0, len(easting))] + # link easting and northing coordinates # error message if easting and northing vector lengths don't match (also handles empty/NA cases) # there should not be any strings now that everything has been converted to a float @@ -1047,15 +1403,18 @@ def by_tile_aop( logging.info("No response from NEON API. Check internet connection") return - # # check that token was used + # check that token was used if token and "x-ratelimit-limit" in response.headers: check_token(response) # get the request response dictionary response_dict = response.json() + # error message if dpid is not an AOP data product if response_dict["data"]["productScienceTeamAbbr"] != "AOP": - print(f"NEON {dpid} is not a remote sensing product. Use zipsByProduct()") + logging.info( + f"NEON {dpid} is not a remote sensing product. Use zips_by_product()" + ) return # replace collocated site with the site name it's published under @@ -1108,8 +1467,6 @@ def by_tile_aop( # check that pyproj is installed try: from pyproj import Proj, CRS - - # importlib.import_module('pyproj') except ImportError: logging.info( "Package pyproj is required for this function to work at the NEON BLAN site. Install and re-try." @@ -1171,8 +1528,6 @@ def get_buffer_coords(easting, northing, buffer): return new_coords # get the tiles corresponding to the new coordinates (mins and maxes) - # if verbose: - # print('getting coordinates of tiles, including the buffer') buffer_coords = [] for e, n in zip(easting, northing): buffer_coords.extend(get_buffer_coords(e, n, buffer)) @@ -1207,9 +1562,16 @@ def get_buffer_coords(easting, northing, buffer): file_url_df["name"].str.contains("|".join(coord_strs)) ] + # remove .txt files (README) and create a copy to use for the checksum/existence checks file_url_df_subset2 = file_url_df_subset[ ~file_url_df_subset["name"].str.endswith(".txt") - ] + ].copy() + + # file_url_df_subset2.reset_index(drop=True,inplace=True) + file_url_df_subset2 = file_url_df_subset2.reset_index(drop=True) + + # print(list(file_url_df_subset2.head(1))) + # print(file_url_df_subset2.head()) # if any coordinates were not included in the data, print a warning message unique_coords_to_download = set( @@ -1223,7 +1585,7 @@ def get_buffer_coords(easting, northing, buffer): coords_not_found = list(set(coord_strs).difference(list(unique_coords_to_download))) if len(coords_not_found) > 0: logging.info( - "Warning: the following coordinates fall outside the bounds of the NEON site, so will not be downloaded:" + "WARNING: the following coordinates fall outside the bounds of the NEON site, so will not be downloaded:" ) for coord in coords_not_found: print(",".join(coord.split("_"))) @@ -1231,7 +1593,7 @@ def get_buffer_coords(easting, northing, buffer): # get the number of files in the dataframe, if there are no files to download, return num_files = len(file_url_df_subset) if num_files == 0: - logging.info(f"Warning: No NEON {dpid} files found.") + logging.info(f"WARNING: No NEON {dpid} files found.") return # get the total size of all the files found @@ -1261,22 +1623,140 @@ def get_buffer_coords(easting, northing, buffer): # print('download path', download_path) os.makedirs(download_path, exist_ok=True) - # serially download all files, with progress bar + # Get the sorted list of the files to download # use the files from the subsetted dataframe files = list(file_url_df_subset["url"]) files.sort() # sort the files for consistent download order - print( - f"Downloading 
{num_files} NEON data file(s) totaling approximately {download_size}\n" - ) - if verbose: - logging.info("Downloading the following data and metadata files:") - for file in files: - logging.info(file.replace("https://storage.googleapis.com", f"{dpid}")) - sleep(1) - for file in tqdm(files): - download_file( - url=file, savepath=download_path, chunk_size=chunk_size, token=token + + # different messages depending on whether skip_if_exists is True or False + if skip_if_exists: + logging.info( + f"Found {num_files} NEON data files totaling approximately {download_size}.\n" + "Files in savepath will be checked and skipped if they exist and match the latest version." + ) + else: + logging.info( + f"Downloading {num_files} NEON data files totaling approximately {download_size}\n" ) + if verbose: + logging.info("Downloading the following data and metadata files:") + for file in files: + logging.info(file.replace("https://storage.googleapis.com", f"{dpid}")) + + if skip_if_exists: + # Check which files already exist locally and if their checksums match + file_url_df_subset2[ + ["exists_locally", "checksum_matches"] + ] = file_url_df_subset2.apply( + lambda row: check_exists_and_checksum(row, download_path), axis=1 + ) + + # Files to skip (already exist locally and checksums match) + files_to_skip = file_url_df_subset2[ + (file_url_df_subset2["exists_locally"]) + & (file_url_df_subset2["checksum_matches"]) + ] + + # Files to prompt for overwrite (exists locally but checksums don't match) + mismatched_files = file_url_df_subset2[ + (file_url_df_subset2["exists_locally"]) + & (~file_url_df_subset2["checksum_matches"]) + ] + + # Files to download (do not exist locally) + files_to_download = file_url_df_subset2[~file_url_df_subset2["exists_locally"]] + # print(file_url_df_subset2[["name","exists_locally","checksum_matches"]]) + + # if verbose: # print these even in non-verbose mode + if not files_to_download.empty: + logging.info( + "The following files will be downloaded (they do not already exist locally):" + ) + for f in files_to_download["name"].sort_values(): + logging.info(f" {f}") + + # Download files that do not exist locally + for _, row in tqdm( + files_to_download.iterrows(), total=len(files_to_download) + ): + download_file( + url=row["url"], + savepath=download_path, + chunk_size=chunk_size, + token=token, + ) + + if not files_to_skip.empty: + logging.info( + "The remainder of the files in savepath will not be downloaded. " + "They already exist locally and match the latest available data." + ) + + # prompt for mismatched files (excluding readme.txt) + mismatched_files_no_readme = mismatched_files[ + ~mismatched_files["name"].str.contains("readme", case=False, na=False) + ] + + # identify README files (case-insensitive, usually .txt), there should only be one of these + readme_files = mismatched_files[ + mismatched_files["name"].str.contains("readme", case=False, na=False) + ] + + # download README files + if not readme_files.empty: + logging.info("Downloading README file") + for _, row in tqdm(readme_files.iterrows(), total=len(readme_files)): + download_file( + url=row["url"], + savepath=download_path, + chunk_size=chunk_size, + token=token, + ) + + # message if there is nothing to download - all files exist and they all match the checksums + if files_to_download.empty and mismatched_files_no_readme.empty: + logging.info( + "All files already exist locally and match the latest available data. Skipping download." 
+ ) + + if not mismatched_files_no_readme.empty: + logging.info( + "The following files exist locally but have a different checksum than the remote files:" + ) + for f in sorted(mismatched_files_no_readme["name"]): + logging.info(f" {f}") + # determine whether to overwrite mismatched files + if overwrite == "yes": + response = "y" + elif overwrite == "no": + response = "n" + else: # 'prompt' or any other value + response = ( + input( + "Do you want to overwrite these files with the latest version? (y/n) " + ) + .strip() + .lower() + ) + + if response.lower() == "y": + logging.info("Overwriting these files with the latest available data.") + for idx, row in tqdm( + mismatched_files.iterrows(), total=len(mismatched_files) + ): + download_file( + url=row["url"], + savepath=download_path, + chunk_size=chunk_size, + token=token, + ) + else: + logging.info("Skipped overwriting files with mismatched checksums.") + else: # if skip_if_exists=False (default behavior) + for file in tqdm(files): + download_file( + url=file, savepath=download_path, chunk_size=chunk_size, token=token + ) # download issue log table ilog = get_issue_log(dpid=dpid, token=None) diff --git a/src/neonutilities/helper_mods/__pycache__/__init__.cpython-311.pyc b/src/neonutilities/helper_mods/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 7d1c64e..0000000 Binary files a/src/neonutilities/helper_mods/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/helper_mods/__pycache__/api_helpers.cpython-311.pyc b/src/neonutilities/helper_mods/__pycache__/api_helpers.cpython-311.pyc deleted file mode 100644 index c3b09cc..0000000 Binary files a/src/neonutilities/helper_mods/__pycache__/api_helpers.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/helper_mods/__pycache__/metadata_helpers.cpython-311.pyc b/src/neonutilities/helper_mods/__pycache__/metadata_helpers.cpython-311.pyc deleted file mode 100644 index 0f35667..0000000 Binary files a/src/neonutilities/helper_mods/__pycache__/metadata_helpers.cpython-311.pyc and /dev/null differ diff --git a/src/neonutilities/helper_mods/api_helpers.py b/src/neonutilities/helper_mods/api_helpers.py index 0c7cc2e..c09d1eb 100644 --- a/src/neonutilities/helper_mods/api_helpers.py +++ b/src/neonutilities/helper_mods/api_helpers.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# import crcmod +import google_crc32c import requests import re import os @@ -626,6 +628,45 @@ def download_urls(url_set, outpath, token=None, progress=True): return None +# def calculate_crc32c(filename): +# """ +# This function calculates the CRC32C checksum of a file. This is designed to use +# in the check_exists_and_checksum function to . + +# Parameters +# ---------- +# filename : str +# Path to the file for which to calculate the CRC32C checksum. + +# Returns +# ------- +# str +# The CRC32C checksum as an 8-character hexadecimal string (zero-padded if necessary). + +# Example +# ------- +# >>> calculate_crc32c("myfile.tif") +# '036d153e' +# """ +# crc32c_func = crcmod.predefined.mkPredefinedCrcFun('crc-32c') +# with open(filename, 'rb') as f: +# return format(crc32c_func(f.read()), '08x') + + +def calculate_crc32c(filename, chunk_size=1024 * 1024): + """ + Faster CRC32C checksum using google-crc32c. 
+ """ + crc32c = google_crc32c.Checksum() + with open(filename, "rb") as f: + while True: + chunk = f.read(chunk_size) + if not chunk: + break + crc32c.update(chunk) + return crc32c.digest().hex().zfill(8) + + def download_file(url, savepath, chunk_size=1024, token=None): """ This function downloads a single file from a Google Cloud Storage URL to a user-specified directory. diff --git a/tests/test_aop_download.py b/tests/test_aop_download.py index 2e350fc..0fc0b84 100644 --- a/tests/test_aop_download.py +++ b/tests/test_aop_download.py @@ -264,7 +264,23 @@ def test_provisional_included_and_data_available_message(self, input_mock): cm.output, ) - # other scenarios- check messages but don't download the data ? + # overwrite warning when skip_if_exists is False + @patch("builtins.input", return_value="n") + def test_by_file_aop_overwrite_warning_when_skip_if_exists_false(self, input_mock): + with self.assertLogs(level="INFO") as cm: + by_file_aop( + dpid="DP3.30015.001", + site="MCRA", + year="2022", + skip_if_exists=False, + overwrite="no", + ) + self.assertIn( + "WARNING: overwrite option only applies if skip_if_exists=True. By default, any existing files will be overwritten unless you select skip_if_exists=True and overwrite='no' or 'prompt' (default).", + "\n".join(cm.output), + ) + + # other scenarios- check messages but don't download the data # provisional not included, and data available # provisional included, and data available