From 053568cd536b9c13c3c62a0bed0c5905a383bd3d Mon Sep 17 00:00:00 2001
From: yuzie007
Date: Wed, 12 Apr 2023 00:42:02 +0200
Subject: [PATCH 1/3] Modify `-DCMAKE_INSTALL_RPATH` for macOS

---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fae64ed..11d6154 100644
--- a/setup.py
+++ b/setup.py
@@ -71,7 +71,10 @@ def build_extension(self, ext):
 
         # Pile all .so in one place and use $ORIGIN as RPATH
         cmake_args += ["-DCMAKE_BUILD_WITH_INSTALL_RPATH=TRUE"]
-        cmake_args += ["-DCMAKE_INSTALL_RPATH={}".format("$ORIGIN")]
+        if platform.system() == "Darwin":  # macOS
+            cmake_args += ["-DCMAKE_INSTALL_RPATH={}".format("@loader_path")]
+        else:
+            cmake_args += ["-DCMAKE_INSTALL_RPATH={}".format("$ORIGIN")]
         cmake_args += ["-DBUILD_SHARED_LIBS=ON"]
         cmake_args += ["-DYAML_BUILD_SHARED_LIBS=ON"]
 

From df88e9ecd0a75c2aa32afd6323ed27b0770fb4e7 Mon Sep 17 00:00:00 2001
From: yuzie007
Date: Wed, 12 Apr 2023 14:17:36 +0200
Subject: [PATCH 2/3] Remove *protocol* from `pd.read_pickle`

This argument exists for `df.to_pickle` but not for `pd.read_pickle`.
---
 docs/pacemaker/quickstart.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/pacemaker/quickstart.md b/docs/pacemaker/quickstart.md
index 30fa786..32d955e 100644
--- a/docs/pacemaker/quickstart.md
+++ b/docs/pacemaker/quickstart.md
@@ -45,7 +45,7 @@ An example DataFrame can be red as:
 
 ```python
 import pandas as pd
-df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip", protocol=4)
+df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip")
 ```
 
 And it contains the following entries:

From d0e5670e5677b48630755835e07743fe7a0eebb0 Mon Sep 17 00:00:00 2001
From: yuzie007
Date: Wed, 12 Apr 2023 15:28:32 +0200
Subject: [PATCH 3/3] Use the `.pkl.gz` extension rather than the current
 `.pckl.gzip` extension for dataset files.

---
 bin/pace_activeset.py                         |  4 ++--
 bin/pace_collect.py                           |  6 ++---
 bin/pacemaker.py                              | 16 ++++++-------
 data/{exmpl_df.pckl.gzip => exmpl_df.pkl.gz}  | Bin
 docs/pacemaker/active_learning.md             |  8 +++----
 docs/pacemaker/faq.md                         |  6 ++---
 docs/pacemaker/inputfile.md                   | 10 ++++----
 docs/pacemaker/quickstart.md                  | 14 +++++------
 docs/pacemaker/utilities.md                   | 12 +++++-----
 ...p => Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz} | Bin
 examples/Cu-I/input.yaml                      |  6 ++---
 .../{Cu_df2_1k.pkl.gzip => Cu_df2_1k.pkl.gz}  | Bin
 examples/Cu-II/input.yaml                     |  4 ++--
 .../{ethanol.pckl.gzip => ethanol.pkl.gz}     | Bin
 examples/Ethanol/input.yaml                   |  4 ++--
 ...le.pckl.gzip => HEA_randII_example.pkl.gz} | Bin
 examples/HEA/input.yaml                       |  4 ++--
 .../custom-weights/data_custom_weights.ipynb  | 21 ++++++++++-------
 examples/data_selection/data_selection.ipynb  |  8 +++----
 setup.py                                      |  2 +-
 ...pckl => mus_ns_uni_to_rawlsLS_np_rank.pkl} | Bin
 ... 
=> pyace_selected_bbasis_funcspec.pkl.gz} | Bin src/pyace/generalfit.py | 3 +-- src/pyace/multispecies_basisextension.py | 2 +- src/pyace/preparedata.py | 22 +++++++----------- 25 files changed, 73 insertions(+), 79 deletions(-) rename data/{exmpl_df.pckl.gzip => exmpl_df.pkl.gz} (100%) rename examples/Cu-I/{Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip => Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz} (100%) rename examples/Cu-II/{Cu_df2_1k.pkl.gzip => Cu_df2_1k.pkl.gz} (100%) rename examples/Ethanol/{ethanol.pckl.gzip => ethanol.pkl.gz} (100%) rename examples/HEA/{HEA_randII_example.pckl.gzip => HEA_randII_example.pkl.gz} (100%) rename src/pyace/data/{mus_ns_uni_to_rawlsLS_np_rank.pckl => mus_ns_uni_to_rawlsLS_np_rank.pkl} (100%) rename src/pyace/data/{pyace_selected_bbasis_funcspec.pckl.gzip => pyace_selected_bbasis_funcspec.pkl.gz} (100%) diff --git a/bin/pace_activeset.py b/bin/pace_activeset.py index 0df7c57..160b58b 100644 --- a/bin/pace_activeset.py +++ b/bin/pace_activeset.py @@ -24,7 +24,7 @@ parser.add_argument("potential_file", help="B-basis file name (.yaml)", type=str) parser.add_argument("-d", "--dataset", action='append', - help="Dataset file name(s), ex.: -d filename.pckl.gzip [-d filename2.pckl.gzip]", type=str, + help="Dataset file name(s), ex.: -d filename.pkl.gz [-d filename2.pkl.gz]", type=str, required=True) parser.add_argument("-f", "--full", help="Compute active set on full (linearized) design matrix", @@ -71,7 +71,7 @@ else: raise RuntimeError("File {} not found".format(dsfn)) log.info("Loading dataset #{}/{} from {}".format(i + 1, len(dataset_filename), dsfn)) - df = pd.read_pickle(dsfn, compression="gzip") + df = pd.read_pickle(dsfn) log.info("Number of structures: {}".format(len(df))) df_list.append(df) df = pd.concat(df_list, axis=0) diff --git a/bin/pace_collect.py b/bin/pace_collect.py index 24b09da..e821daa 100644 --- a/bin/pace_collect.py +++ b/bin/pace_collect.py @@ -173,8 +173,8 @@ def main(args): parser.add_argument("-wd", "--working-dir", help="top directory where keep calculations", type=str, default='.', dest="working_dir") - parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pckl.gzip", - type=str, default="collected.pckl.gzip", dest="output_dataset_filename") + parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pkl.gz", + type=str, default="collected.pkl.gz", dest="output_dataset_filename") parser.add_argument('--free-atom-energy', help="dictionary of reference energies (auto for extraction from dataset), i.e. 
`Al:-0.123 Cu:-0.456 Zn:auto`," @@ -268,7 +268,7 @@ def main(args): ####### df.drop(columns=n_el_cols + ['comp_dict', 'volume', 'volume_per_atom', 'NUMBER_OF_ATOMS'], inplace=True) - df.to_pickle('{}'.format(output_dataset_filename), compression='gzip', protocol=4) + df.to_pickle('{}'.format(output_dataset_filename), protocol=4) logger.info('Store dataset into {}'.format(output_dataset_filename)) ###### df['absolute_energy_collected_per_atom'] = df['energy_corrected_per_atom'].abs() diff --git a/bin/pacemaker.py b/bin/pacemaker.py index 7c772c9..c38258c 100644 --- a/bin/pacemaker.py +++ b/bin/pacemaker.py @@ -29,11 +29,11 @@ from pyace.atomicenvironment import calculate_minimal_nn_atomic_env, calculate_minimal_nn_tp_atoms from pyace.validate import plot_analyse_error_distributions -files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pckl.gzip", "log.txt", "nohup.out", +files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pkl.gz", "log.txt", "nohup.out", "target_potential.yaml", "current_extended_potential.yaml", "output_potential.yaml", "ladder_metrics.txt", "cycle_metrics.txt", "metrics.txt", "test_ladder_metrics.txt", "test_cycle_metrics.txt", "test_metrics.txt", - "train_pred.pckl.gzip", "test_pred.pckl.gzip", + "train_pred.pkl.gz", "test_pred.pkl.gz", "test_ef-distributions.png", "train_ef-distributions.png", "report" ] @@ -297,7 +297,7 @@ def main(args): if general_fit.fitting_data is not None: log.info("For train data") pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.fitting_data, - fname="train_pred.pckl.gzip") + fname="train_pred.pkl.gz") log.info("Ploting validation graphs") plot_analyse_error_distributions(pred_data, fig_prefix="train_", fig_path="report", imagetype=backend_config.get("imagetype", "png")) @@ -305,7 +305,7 @@ def main(args): if general_fit.test_data is not None: log.info("For test data") pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.test_data, - fname="test_pred.pckl.gzip") + fname="test_pred.pkl.gz") log.info("Ploting validation graphs") plot_analyse_error_distributions(pred_data, fig_prefix="test_", fig_path="report", imagetype=backend_config.get("imagetype", "png")) @@ -316,7 +316,7 @@ def generate_template_input(): readline.parse_and_bind("tab: complete") # 1. Training set size - train_filename = input("Enter training dataset filename (ex.: data.pckl.gzip, [TAB] - autocompletion): ") + train_filename = input("Enter training dataset filename (ex.: data.pkl.gz, [TAB] - autocompletion): ") testset_size_inp = float(input("Enter test set fraction or size (ex.: 0.05 or [ENTER] - no test set): ") or 0) # 2. 
Elements @@ -333,7 +333,7 @@ def generate_template_input(): # checking dataset print("Trying to load {}".format(train_filename)) - df = pd.read_pickle(train_filename, compression="gzip") + df = pd.read_pickle(train_filename) if determine_elements_from_dataset: if 'ase_atoms' in df.columns: print("Determining available elements...") @@ -350,7 +350,7 @@ def generate_template_input(): if resp == "yes": df["energy_corrected"] = df["energy"] print("Saving upgraded dataset into {}...".format(train_filename), end="") - df.to_pickle(train_filename, compression="gzip") + df.to_pickle(train_filename) print("done") @@ -429,7 +429,7 @@ def predict_and_save(general_fit, target_bbasisconfig, structures_dataframe, fna columns_to_drop = [column for column in columns_to_drop if column in structures_dataframe] pred_data = pd.merge(structures_dataframe.drop(columns=columns_to_drop), pred_data, left_index=True, right_index=True) - pred_data.to_pickle(fname, compression="gzip", protocol=4) + pred_data.to_pickle(fname, protocol=4) log.info("Predictions are saved into {} ({})".format(fname, sizeof_fmt(fname))) return pred_data diff --git a/data/exmpl_df.pckl.gzip b/data/exmpl_df.pkl.gz similarity index 100% rename from data/exmpl_df.pckl.gzip rename to data/exmpl_df.pkl.gz diff --git a/docs/pacemaker/active_learning.md b/docs/pacemaker/active_learning.md index ca82aa0..19fc989 100644 --- a/docs/pacemaker/active_learning.md +++ b/docs/pacemaker/active_learning.md @@ -1,7 +1,7 @@ # Extrapolation grade and active learning For any fitted ACE potential and corresponding training set -(usually stored by `pacemaker` into `fitting_data_info.pckl.gzip` file in working directory) +(usually stored by `pacemaker` into `fitting_data_info.pkl.gz` file in working directory) one can generate corresponding active set for linear B-projections (default) of full non-linear embedding. Practice shows that linear active set is enough for extrapolation grade estimation. However, if you want more sensitive (and "over-secure") extrapolation grade, then full active set could be used. @@ -23,7 +23,7 @@ potential_file B-basis file name (.yaml) optional arguments: -h, --help show this help message and exit -d DATASET, --dataset DATASET - Dataset file name, ex.: filename.pckl.gzip + Dataset file name, ex.: filename.pkl.gz -f, --full Compute active set on full (linearized) design matrix -b BATCH_SIZE, --batch_size BATCH_SIZE Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered @@ -40,14 +40,14 @@ optional arguments: Example of usage: ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml ``` that will generate **linear** active set and store it into `output_potential.asi` file. or ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f ``` that will generate **full** active set (including linearized part of non-linear embedding function) and store it into `output_potential.asi.nonlinear` file. 
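
The pattern in the hunks above repeats through the rest of this patch: every explicit `compression="gzip"` argument is dropped once a file name ends in `.gz`. This works because pandas defaults to `compression="infer"`, which resolves a `.gz` suffix to gzip for both `to_pickle` and `read_pickle`; the old `.pckl.gzip` suffix is not in pandas' inference table, which is why it had to be spelled out. A minimal round-trip sketch of that behaviour (the file name and columns are made up for illustration):

```python
import pandas as pd

# With a ".gz" suffix, the default compression="infer" resolves to gzip,
# so no explicit compression argument is needed on either side.
df = pd.DataFrame({"energy": [-3.69, -3.71], "n_atoms": [1, 2]})
df.to_pickle("demo_df.pkl.gz", protocol=4)  # gzip inferred from ".gz"
df2 = pd.read_pickle("demo_df.pkl.gz")      # gzip inferred again on read
assert df.equals(df2)
```
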
diff --git a/docs/pacemaker/faq.md b/docs/pacemaker/faq.md index 1bacc11..e85dfa4 100644 --- a/docs/pacemaker/faq.md +++ b/docs/pacemaker/faq.md @@ -134,8 +134,8 @@ Alternatively, you can provide train and test datasets separately: ```yaml data: - filename: /path/to/train_data.pckl.gzip - test_filename: /path/to/test_data.pckl.gzip + filename: /path/to/train_data.pkl.gz + test_filename: /path/to/test_data.pkl.gz ``` ## I want to change the cutoff, what should I do ? @@ -147,7 +147,7 @@ If you change cutoff, i.e. from `rcut: 7` to `rcut: 6.5`, then potential should ## How better to organize my dataset files ? -It is recommended to store all dataset files (i.e. `df*.pckl.gzip`) in one folder and +It is recommended to store all dataset files (i.e. `df*.pkl.gz`) in one folder and specify the environment variable `$PACEMAKERDATAPATH` (exectue it in terminal or add to for example `.bashrc`) ``` diff --git a/docs/pacemaker/inputfile.md b/docs/pacemaker/inputfile.md index 754eaaf..6c31b49 100644 --- a/docs/pacemaker/inputfile.md +++ b/docs/pacemaker/inputfile.md @@ -34,7 +34,7 @@ Dataset could be saved into file as a pickled `pandas` dataframe with special na ```YAML data: - filename: some_stored_dataset.pckl.gzip + filename: some_stored_dataset.pkl.gz # cache_ref_df: False # whether to store the queried or modified dataset into file, default - True # ignore_weights: False # whether to ignore energy and force weighting columns in dataframe # datapath: ../data # path to folder with cache files with pickled dataframes @@ -49,11 +49,11 @@ Example of generating **custom energy/forces weights** is given in `examples/cus ### Test set You could provide test set either as a fraction or certain number of samples from the train set (option `test_size`) or -as a separate pckl.gzip file (option `test_filename`) +as a separate pkl.gz file (option `test_filename`) ```yaml data: - test_filename: my_test_dataset.pckl.gzip + test_filename: my_test_dataset.pkl.gz ``` or @@ -231,8 +231,8 @@ fit: } ## Custom weights: corresponding to main dataset index and `w_energy` and `w_forces` columns should - ## be provided in pckl.gzip file - #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pckl.gzip} + ## be provided in pkl.gz file + #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz} ## OPTIMIZATION OPTIONS ## optimizer: BFGS # BFGS, L-BFGS-B, Nelder-Mead, etc. : scipy minimization algorithm diff --git a/docs/pacemaker/quickstart.md b/docs/pacemaker/quickstart.md index 32d955e..d088a41 100644 --- a/docs/pacemaker/quickstart.md +++ b/docs/pacemaker/quickstart.md @@ -20,7 +20,7 @@ If you have free atom calculations (single atom in large volume) in subfolders, pace_collect -wd path/to/my_dft_calculation --free-atom-energy auto ``` Both commands will scan through all folders and subfolders and collect DFT free energies (that are force-consistent) and forces -and make a single atom corrections. Resulting dataset will be stored into `collected.pckl.gzip` file. +and make a single atom corrections. Resulting dataset will be stored into `collected.pkl.gz` file. If you need more flexibility for DFT dataset manipulation, please check [Manual fitting dataset preparation](#manual_fitting_dataset_preparation). 
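
After `pace_collect` finishes, the collected dataset can be sanity-checked with plain pandas before fitting. A short sketch, assuming the command was run in the current directory (the exact columns depend on the calculations, but `energy_corrected`, `forces` and `ase_atoms` are the ones `pacemaker` consumes):

```python
import pandas as pd

df = pd.read_pickle("collected.pkl.gz")
print("structures:", len(df))
print(df.columns.tolist())  # expect e.g. energy_corrected, forces, ase_atoms
```
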
@@ -45,7 +45,7 @@ An example DataFrame can be red as: ```python import pandas as pd -df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip") +df = pd.read_pickle("../data/exmpl_df.pkl.gz") ``` And it contains the following entries: @@ -120,24 +120,22 @@ data = {'energy': [e1, e2], # create a DataFrame df = pd.DataFrame(data) # and save it -df.to_pickle('my_data.pckl.gzip', compression='gzip', protocol=4) +df.to_pickle('my_data.pkl.gz', protocol=4) ``` or use the utility `pace_collect` from a top-level directory to collect VASP calculations and store them in a -`collected.pckl.gzip` file. +`collected.pkl.gz` file. The resulting dataframe can be used for fitting with `pacemaker`. ### Creating an input file In this example we will use template as it is, however one would need to provide a path to the -example dataset `exmpl_df.pckl.gzip`. This can be done by changing `filename` parameter in the `data` section of the +example dataset `exmpl_df.pkl.gz`. This can be done by changing `filename` parameter in the `data` section of the `input.yaml`: ```yaml - data: - filename: /path/to/the/pyace/data/exmpl_df.pckl.gzip - + filename: /path/to/the/pyace/data/exmpl_df.pkl.gz ``` Please check [examples folder](https://github.com/ICAMS/python-ace/tree/master/examples) for more examples of input file. diff --git a/docs/pacemaker/utilities.md b/docs/pacemaker/utilities.md index 7935620..525f387 100644 --- a/docs/pacemaker/utilities.md +++ b/docs/pacemaker/utilities.md @@ -46,7 +46,7 @@ pace_info [-h] potential_file ## Collect and store VASP data in pickle file -Utility to collect VASP calculations from a top-level directory and store them in a `*.pckl.gzip` file that can be used for fitting with `pacemaker`. +Utility to collect VASP calculations from a top-level directory and store them in a `*.pkl.gz` file that can be used for fitting with `pacemaker`. The reference energies could be provided for each element (default value is zero) or extracted automatically from the calculation with single atom and large enough (>500 Ang^3/atom) volume. Usage: @@ -59,7 +59,7 @@ optional arguments: -wd WORKING_DIR, --working-dir WORKING_DIR top directory where keep calculations --output-dataset-filename OUTPUT_DATASET_FILENAME - pickle filename, default is collected.pckl.gzip + pickle filename, default is collected.pkl.gz --free-atom-energy [FREE_ATOM_ENERGY [FREE_ATOM_ENERGY ...]] dictionary of reference energies (auto for extraction from dataset), i.e. `Al:-0.123 Cu:-0.456 Zn:auto`, default is zero. If option is `auto`, then it will be extracted from dataset --selection SELECTION @@ -81,7 +81,7 @@ potential_file B-basis file name (.yaml) optional arguments: -h, --help show this help message and exit -d DATASET, --dataset DATASET - Dataset file name, ex.: filename.pckl.gzip + Dataset file name, ex.: filename.pkl.gz -f, --full Compute active set on full (linearized) design matrix -b BATCH_SIZE, --batch_size BATCH_SIZE Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered @@ -98,14 +98,14 @@ optional arguments: Example of usage: ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml ``` that will generate **linear** active set and store it into `output_potential.asi` file. 
or ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f ``` that will generate **full** active set (including linearized part of non-linear embedding function) -and store it into `output_potential.asi.nonlinear` file. \ No newline at end of file +and store it into `output_potential.asi.nonlinear` file. diff --git a/examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip b/examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz similarity index 100% rename from examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip rename to examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz diff --git a/examples/Cu-I/input.yaml b/examples/Cu-I/input.yaml index 15e525b..5044b26 100644 --- a/examples/Cu-I/input.yaml +++ b/examples/Cu-I/input.yaml @@ -42,7 +42,7 @@ potential: NameOfCutoffFunction: cos, } - + ## possible keywords: ALL, UNARY, BINARY, TERNARY, QUATERNARY, QUINARY, ## element combinations as (Al,Al), (Al, Ni), (Al, Ni, Zn), etc... @@ -58,8 +58,8 @@ potential: ## Dataset specification section ################################################################# data: -### Option 1: pandas dataframe in pckl.gzip - filename: Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip # force to read reference pickled dataframe from given file +### Option 1: pandas dataframe in pkl.gz + filename: Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz # force to read reference pickled dataframe from given file ################################################################# diff --git a/examples/Cu-II/Cu_df2_1k.pkl.gzip b/examples/Cu-II/Cu_df2_1k.pkl.gz similarity index 100% rename from examples/Cu-II/Cu_df2_1k.pkl.gzip rename to examples/Cu-II/Cu_df2_1k.pkl.gz diff --git a/examples/Cu-II/input.yaml b/examples/Cu-II/input.yaml index 3c4fb29..2e155f2 100644 --- a/examples/Cu-II/input.yaml +++ b/examples/Cu-II/input.yaml @@ -55,8 +55,8 @@ potential: ## Dataset specification section ################################################################# data: -### Option 1: pandas dataframe in pckl.gzip - filename: Cu_df2_1k.pkl.gzip # force to read reference pickled dataframe from given file +### Option 1: pandas dataframe in pkl.gz + filename: Cu_df2_1k.pkl.gz # force to read reference pickled dataframe from given file diff --git a/examples/Ethanol/ethanol.pckl.gzip b/examples/Ethanol/ethanol.pkl.gz similarity index 100% rename from examples/Ethanol/ethanol.pckl.gzip rename to examples/Ethanol/ethanol.pkl.gz diff --git a/examples/Ethanol/input.yaml b/examples/Ethanol/input.yaml index b4bebef..3d00085 100644 --- a/examples/Ethanol/input.yaml +++ b/examples/Ethanol/input.yaml @@ -49,7 +49,7 @@ potential: ## Dataset specification section ################################################################# data: - filename: ethanol.pckl.gzip # force to read reference pickled dataframe from given file + filename: ethanol.pkl.gz # force to read reference pickled dataframe from given file ################################################################# @@ -76,6 +76,6 @@ fit: backend: evaluator: tensorpot # tensorpot backend (recommended) batch_size: 1000 - + ## frequency of detailed metric calculation and printing display_step: 50 diff --git a/examples/HEA/HEA_randII_example.pckl.gzip b/examples/HEA/HEA_randII_example.pkl.gz similarity index 100% rename from examples/HEA/HEA_randII_example.pckl.gzip rename to examples/HEA/HEA_randII_example.pkl.gz diff --git a/examples/HEA/input.yaml b/examples/HEA/input.yaml index e590438..fc270c9 100644 --- a/examples/HEA/input.yaml +++ 
b/examples/HEA/input.yaml @@ -58,8 +58,8 @@ potential: ## Dataset specification section ################################################################# data: - ### Option 1: pandas dataframe in pckl.gzip - filename: HEA_randII_example.pckl.gzip + ### Option 1: pandas dataframe in pkl.gz + filename: HEA_randII_example.pkl.gz ################################################################# ## Fit settings section diff --git a/examples/custom-weights/data_custom_weights.ipynb b/examples/custom-weights/data_custom_weights.ipynb index b176db3..9cf0939 100644 --- a/examples/custom-weights/data_custom_weights.ipynb +++ b/examples/custom-weights/data_custom_weights.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "df=pd.read_pickle(\"/some/fit/data/Al-Li/data.pckl.gzip\", compression=\"gzip\")" + "df=pd.read_pickle(\"/some/fit/data/Al-Li/data.pkl.gz\")" ] }, { @@ -312,7 +312,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_pickle(\"df_weights.pckl.gzip\", compression=\"gzip\", protocol=4)" + "df.to_pickle(\"df_weights.pkl.gz\", protocol=4)" ] }, { @@ -554,7 +554,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_pickle(\"df_weights_elastic_x10.pckl.gzip\", compression=\"gzip\", protocol=4)" + "df.to_pickle(\"df_weights_elastic_x10.pkl.gz\", protocol=4)" ] }, { @@ -635,7 +635,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_pickle(\"df_weights_Li_x5.pckl.gzip\", compression=\"gzip\", protocol=4)" + "df.to_pickle(\"df_weights_Li_x5.pkl.gz\", protocol=4)" ] }, { @@ -646,6 +646,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -655,7 +656,7 @@ "...\n", "\n", "data:\n", - " filename: df_weights.pckl.gzip\n", + " filename: df_weights.pkl.gz\n", "\n", "...\n", "``` " @@ -669,6 +670,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -678,7 +680,7 @@ "...\n", "\n", "data:\n", - " filename: df_weights.pckl.gzip\n", + " filename: df_weights.pkl.gz\n", " ignore_weights: True\n", "\n", "...\n", @@ -837,7 +839,7 @@ "metadata": {}, "outputs": [], "source": [ - "weights_only.to_pickle(\"custom_weights_only.pckl.gzip\", compression=\"gzip\", protocol=4)" + "weights_only.to_pickle(\"custom_weights_only.pkl.gz\", protocol=4)" ] }, { @@ -848,6 +850,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -857,11 +860,11 @@ "...\n", "# load data as usual\n", "data:\n", - " filename: /some/fit/data/Al-Li/data.pckl.gzip\n", + " filename: /some/fit/data/Al-Li/data.pkl.gz\n", "\n", "fit:\n", " # use ExternalWeightingPolicy weighting scheme\n", - " weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pckl.gzip}\n", + " weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}\n", "\n", "...\n", "``` " diff --git a/examples/data_selection/data_selection.ipynb b/examples/data_selection/data_selection.ipynb index ec76b6a..7b34244 100644 --- a/examples/data_selection/data_selection.ipynb +++ b/examples/data_selection/data_selection.ipynb @@ -53,7 +53,7 @@ "metadata": {}, "outputs": [], "source": [ - "df=pd.read_pickle(\"../Cu-II/Cu_df2_1k.pkl.gzip\",compression=\"gzip\")" + "df=pd.read_pickle(\"../Cu-II/Cu_df2_1k.pkl.gz\")" ] }, { @@ -444,7 +444,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_selected.to_pickle(\"Cu-fcc-only.pckl.gzip\",compression=\"gzip\")" + "df_selected.to_pickle(\"Cu-fcc-only.pkl.gz\")" ] }, { @@ -521,7 +521,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_selected.to_pickle(\"Cu-0.5eV.pckl.gzip\", 
compression=\"gzip\")" + "df_selected.to_pickle(\"Cu-0.5eV.pkl.gz\")" ] }, { @@ -577,7 +577,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_selected.to_pickle(\"Cu-no-shake.pckl.gzip\", compression=\"gzip\")" + "df_selected.to_pickle(\"Cu-no-shake.pkl.gz\")" ] }, { diff --git a/setup.py b/setup.py index 11d6154..d0d44c3 100644 --- a/setup.py +++ b/setup.py @@ -161,7 +161,7 @@ def build_extension(self, ext): 'Programming Language :: Python :: 3', ], package_data={"pyace.data": [ - "mus_ns_uni_to_rawlsLS_np_rank.pckl", + "mus_ns_uni_to_rawlsLS_np_rank.pkl", "input_template.yaml" ]}, scripts=["bin/pacemaker", "bin/pace_yaml2yace", "bin/pace_update_ace", diff --git a/src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pckl b/src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pkl similarity index 100% rename from src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pckl rename to src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pkl diff --git a/src/pyace/data/pyace_selected_bbasis_funcspec.pckl.gzip b/src/pyace/data/pyace_selected_bbasis_funcspec.pkl.gz similarity index 100% rename from src/pyace/data/pyace_selected_bbasis_funcspec.pckl.gzip rename to src/pyace/data/pyace_selected_bbasis_funcspec.pkl.gz diff --git a/src/pyace/generalfit.py b/src/pyace/generalfit.py index 30d8de3..e83104f 100644 --- a/src/pyace/generalfit.py +++ b/src/pyace/generalfit.py @@ -26,7 +26,7 @@ __username = None -FITTING_DATA_INFO_FILENAME = "fitting_data_info.pckl.gzip" +FITTING_DATA_INFO_FILENAME = "fitting_data_info.pkl.gz" def get_username(): @@ -382,7 +382,6 @@ def save_fitting_data_info(self): columns_to_save = [col for col in fitting_data_columns if col not in columns_to_drop] self.fitting_data[columns_to_save].to_pickle(FITTING_DATA_INFO_FILENAME, - compression="gzip", protocol=4) log.info("Fitting dataset info saved into {}".format(FITTING_DATA_INFO_FILENAME)) diff --git a/src/pyace/multispecies_basisextension.py b/src/pyace/multispecies_basisextension.py index 82bb41e..4774590 100644 --- a/src/pyace/multispecies_basisextension.py +++ b/src/pyace/multispecies_basisextension.py @@ -45,7 +45,7 @@ 'Lv', 'Ts', 'Og'] default_mus_ns_uni_to_rawlsLS_np_rank_filename = pkg_resources.resource_filename('pyace.data', - 'mus_ns_uni_to_rawlsLS_np_rank.pckl') + 'mus_ns_uni_to_rawlsLS_np_rank.pkl') def clean_bbasisconfig(initial_bbasisconfig): for block in initial_bbasisconfig.funcspecs_blocks: diff --git a/src/pyace/preparedata.py b/src/pyace/preparedata.py index 124302f..bc8b6e2 100644 --- a/src/pyace/preparedata.py +++ b/src/pyace/preparedata.py @@ -230,25 +230,19 @@ def generate_weights(self, df): def save_dataframe(df: pd.DataFrame, filename: str, protocol: int = 4): filename = os.path.abspath(filename) log.info("Writing fit pickle file: {}".format(filename)) - if filename.endswith("gzip"): - compression = "gzip" - else: - compression = "infer" dirname = os.path.dirname(filename) os.makedirs(dirname, exist_ok=True) if not isinstance(df, DataFrameWithMetadata): log.info("Transforming to DataFrameWithMetadata") df = DataFrameWithMetadata(df) - df.to_pickle(filename, protocol=protocol, compression=compression) + df.to_pickle(filename, protocol=protocol) log.info("Saved to file {} ({})".format(filename, sizeof_fmt(filename))) -def load_dataframe(filename: str, compression: str = "infer") -> pd.DataFrame: +def load_dataframe(filename: str) -> pd.DataFrame: filesize = os.path.getsize(filename) log.info("Loading dataframe from pickle file {} ({})".format(filename, sizeof_fmt(filesize))) - if filename.endswith(".gzip"): - compression = "gzip" - 
df = pd.read_pickle(filename, compression=compression) + df = pd.read_pickle(filename) return df @@ -346,7 +340,7 @@ def add_ase_atoms_transformer(self, result_column_name, transformer_func, **kwar def get_default_ref_filename(self): try: - return "df-{calculator}-{element}-{suffix}.pckl.gzip".format( + return "df-{calculator}-{element}-{suffix}.pkl.gz".format( calculator=self.config["calculator"], element=self.config["element"], suffix="ref").replace("/", "_") @@ -562,7 +556,7 @@ def load_or_query_ref_structures_dataframe(self, force_query=None): self.df = self.raw_df elif file_to_load is not None and os.path.isfile(file_to_load) and not force_query: log.info(file_to_load + " found, try to load") - self.df = load_dataframe(file_to_load, compression="infer") + self.df = load_dataframe(file_to_load) else: # if ref_df is still not loaded, try to query from DB if not force_query: log.info("Cache not found, querying database") @@ -610,7 +604,7 @@ def get_ref_dataframe(self, force_query=None, cache_ref_df=False): if cache_ref_df or self.cache_ref_df: if self.ref_df_changed: # generate filename to save df: if name is provided - try to put it into datapath - filename = self.get_actual_filename() or "df_ref.pckl.gzip" + filename = self.get_actual_filename() or "df_ref.pkl.gz" log.info("Saving processed raw dataframe into " + filename) save_dataframe(self.df, filename=filename) else: @@ -1051,7 +1045,7 @@ class ExternalWeightingPolicy(StructuresDatasetWeightingPolicy): def __init__(self, filename: str): """ - :param filename: .pckl.gzip filename of dataframe with index and `w_energy` and `w_forces` columns + :param filename: .pkl.gz filename of dataframe with index and `w_energy` and `w_forces` columns """ self.filename = filename @@ -1060,7 +1054,7 @@ def __str__(self): def generate_weights(self, df): log.info("Loading external weights dataframe {}".format(self.filename)) - self.weights_df = pd.read_pickle(self.filename, compression="gzip") + self.weights_df = pd.read_pickle(self.filename) log.info("External weights dataframe loaded, it contains {} entries".format(len(self.weights_df))) # check that columns are presented
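
For reference, the external-weights file that `ExternalWeightingPolicy.generate_weights` loads above can be produced with plain pandas. A sketch with uniform placeholder weights, assuming the main dataset is `my_data.pkl.gz` (see `examples/custom-weights/data_custom_weights.ipynb` for a realistic, per-structure weighting scheme):

```python
import pandas as pd

# The index must match the main fitting dataset; w_energy and w_forces are
# the column names ExternalWeightingPolicy expects. Uniform scalars are only
# placeholders; see the custom-weights notebook for a realistic scheme.
df = pd.read_pickle("my_data.pkl.gz")  # main dataset (hypothetical name)
weights_only = pd.DataFrame(index=df.index)
weights_only["w_energy"] = 1.0 / len(df)
weights_only["w_forces"] = 1.0 / len(df)
weights_only.to_pickle("custom_weights_only.pkl.gz", protocol=4)
```

The file is then referenced from `input.yaml` as `weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}`.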