Modify to use the .pkl.gz extension #31

Open · wants to merge 3 commits into master
4 changes: 2 additions & 2 deletions bin/pace_activeset.py
@@ -24,7 +24,7 @@
 parser.add_argument("potential_file", help="B-basis file name (.yaml)", type=str)

 parser.add_argument("-d", "--dataset", action='append',
-                    help="Dataset file name(s), ex.: -d filename.pckl.gzip [-d filename2.pckl.gzip]", type=str,
+                    help="Dataset file name(s), ex.: -d filename.pkl.gz [-d filename2.pkl.gz]", type=str,
                     required=True)

 parser.add_argument("-f", "--full", help="Compute active set on full (linearized) design matrix",
@@ -71,7 +71,7 @@
     else:
         raise RuntimeError("File {} not found".format(dsfn))
     log.info("Loading dataset #{}/{} from {}".format(i + 1, len(dataset_filename), dsfn))
-    df = pd.read_pickle(dsfn, compression="gzip")
+    df = pd.read_pickle(dsfn)
     log.info("Number of structures: {}".format(len(df)))
     df_list.append(df)
 df = pd.concat(df_list, axis=0)
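Note: dropping `compression="gzip"` here is safe because pandas defaults to `compression="infer"`, which selects gzip from a `.gz` file suffix. A minimal sketch of the behavior this change relies on (the file name is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"energy": [-3.71, -3.69]})

# With a .pkl.gz suffix, pandas infers gzip on both the write and read paths,
# so no explicit compression argument is needed.
df.to_pickle("dataset.pkl.gz")
df_back = pd.read_pickle("dataset.pkl.gz")
assert df.equals(df_back)
```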
6 changes: 3 additions & 3 deletions bin/pace_collect.py
@@ -173,8 +173,8 @@ def main(args):
     parser.add_argument("-wd", "--working-dir", help="top directory where keep calculations",
                         type=str, default='.', dest="working_dir")

-    parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pckl.gzip",
-                        type=str, default="collected.pckl.gzip", dest="output_dataset_filename")
+    parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pkl.gz",
+                        type=str, default="collected.pkl.gz", dest="output_dataset_filename")

     parser.add_argument('--free-atom-energy',
                         help="dictionary of reference energies (auto for extraction from dataset), i.e. `Al:-0.123 Cu:-0.456 Zn:auto`,"
@@ -268,7 +268,7 @@ def main(args):

     #######
     df.drop(columns=n_el_cols + ['comp_dict', 'volume', 'volume_per_atom', 'NUMBER_OF_ATOMS'], inplace=True)
-    df.to_pickle('{}'.format(output_dataset_filename), compression='gzip', protocol=4)
+    df.to_pickle('{}'.format(output_dataset_filename), protocol=4)
     logger.info('Store dataset into {}'.format(output_dataset_filename))
     ######
     df['absolute_energy_collected_per_atom'] = df['energy_corrected_per_atom'].abs()
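The same inference applies on the write path. An illustrative check that the file produced by the new `to_pickle` call is still gzip-compressed (gzip streams begin with the magic bytes `0x1f 0x8b`):

```python
import pandas as pd

df = pd.DataFrame({"energy_corrected": [-1.0, -2.0]})
df.to_pickle("collected.pkl.gz", protocol=4)  # gzip inferred from the .gz suffix

with open("collected.pkl.gz", "rb") as f:
    assert f.read(2) == b"\x1f\x8b"  # gzip magic bytes
```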
16 changes: 8 additions & 8 deletions bin/pacemaker.py
@@ -29,11 +29,11 @@
 from pyace.atomicenvironment import calculate_minimal_nn_atomic_env, calculate_minimal_nn_tp_atoms
 from pyace.validate import plot_analyse_error_distributions

-files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pckl.gzip", "log.txt", "nohup.out",
+files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pkl.gz", "log.txt", "nohup.out",
                    "target_potential.yaml", "current_extended_potential.yaml", "output_potential.yaml",
                    "ladder_metrics.txt", "cycle_metrics.txt", "metrics.txt",
                    "test_ladder_metrics.txt", "test_cycle_metrics.txt", "test_metrics.txt",
-                   "train_pred.pckl.gzip", "test_pred.pckl.gzip",
+                   "train_pred.pkl.gz", "test_pred.pkl.gz",
                    "test_ef-distributions.png", "train_ef-distributions.png", "report"
                    ]

@@ -297,15 +297,15 @@ def main(args):
     if general_fit.fitting_data is not None:
         log.info("For train data")
         pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.fitting_data,
-                                     fname="train_pred.pckl.gzip")
+                                     fname="train_pred.pkl.gz")
         log.info("Ploting validation graphs")
         plot_analyse_error_distributions(pred_data, fig_prefix="train_", fig_path="report",
                                          imagetype=backend_config.get("imagetype", "png"))

     if general_fit.test_data is not None:
         log.info("For test data")
         pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.test_data,
-                                     fname="test_pred.pckl.gzip")
+                                     fname="test_pred.pkl.gz")
         log.info("Ploting validation graphs")
         plot_analyse_error_distributions(pred_data, fig_prefix="test_", fig_path="report",
                                          imagetype=backend_config.get("imagetype", "png"))
@@ -316,7 +316,7 @@ def generate_template_input():
     readline.parse_and_bind("tab: complete")

     # 1. Training set size
-    train_filename = input("Enter training dataset filename (ex.: data.pckl.gzip, [TAB] - autocompletion): ")
+    train_filename = input("Enter training dataset filename (ex.: data.pkl.gz, [TAB] - autocompletion): ")
     testset_size_inp = float(input("Enter test set fraction or size (ex.: 0.05 or [ENTER] - no test set): ") or 0)

     # 2. Elements
@@ -333,7 +333,7 @@

     # checking dataset
     print("Trying to load {}".format(train_filename))
-    df = pd.read_pickle(train_filename, compression="gzip")
+    df = pd.read_pickle(train_filename)
     if determine_elements_from_dataset:
         if 'ase_atoms' in df.columns:
             print("Determining available elements...")
@@ -350,7 +350,7 @@
     if resp == "yes":
         df["energy_corrected"] = df["energy"]
         print("Saving upgraded dataset into {}...".format(train_filename), end="")
-        df.to_pickle(train_filename, compression="gzip")
+        df.to_pickle(train_filename)
         print("done")


@@ -429,7 +429,7 @@ def predict_and_save(general_fit, target_bbasisconfig, structures_dataframe, fna
     columns_to_drop = [column for column in columns_to_drop if column in structures_dataframe]
     pred_data = pd.merge(structures_dataframe.drop(columns=columns_to_drop), pred_data,
                          left_index=True, right_index=True)
-    pred_data.to_pickle(fname, compression="gzip", protocol=4)
+    pred_data.to_pickle(fname, protocol=4)
     log.info("Predictions are saved into {} ({})".format(fname, sizeof_fmt(fname)))
     return pred_data
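One caveat worth noting: pandas does not infer gzip from the legacy `.pckl.gzip` suffix, so existing files must still be read with an explicit `compression="gzip"`. A hypothetical migration helper (not part of this PR) might look like:

```python
import pandas as pd

def migrate(old_path: str) -> str:
    """Re-save a legacy *.pckl.gzip dataframe under the new *.pkl.gz name."""
    # '.gzip' is not in pandas' compression-inference table, so be explicit here.
    df = pd.read_pickle(old_path, compression="gzip")
    new_path = old_path.replace(".pckl.gzip", ".pkl.gz")
    df.to_pickle(new_path, protocol=4)  # gzip inferred from the new suffix
    return new_path

# e.g. migrate("fitting_data_info.pckl.gzip") -> "fitting_data_info.pkl.gz"
```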
File renamed without changes.
8 changes: 4 additions & 4 deletions docs/pacemaker/active_learning.md
@@ -1,7 +1,7 @@
 # Extrapolation grade and active learning

 For any fitted ACE potential and corresponding training set
-(usually stored by `pacemaker` into `fitting_data_info.pckl.gzip` file in working directory)
+(usually stored by `pacemaker` into `fitting_data_info.pkl.gz` file in working directory)
 one can generate corresponding active set for linear B-projections (default) of full non-linear embedding.
 Practice shows that linear active set is enough for extrapolation grade estimation.
 However, if you want more sensitive (and "over-secure") extrapolation grade, then full active set could be used.
@@ -23,7 +23,7 @@ potential_file B-basis file name (.yaml)
 optional arguments:
   -h, --help show this help message and exit
   -d DATASET, --dataset DATASET
-                        Dataset file name, ex.: filename.pckl.gzip
+                        Dataset file name, ex.: filename.pkl.gz
   -f, --full Compute active set on full (linearized) design matrix
   -b BATCH_SIZE, --batch_size BATCH_SIZE
                         Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered
@@ -40,14 +40,14 @@ optional arguments:
 Example of usage:

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml
 ```
 that will generate **linear** active set and store it into `output_potential.asi` file.

 or

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f
 ```
 that will generate **full** active set (including linearized part of non-linear embedding function)
 and store it into `output_potential.asi.nonlinear` file.
6 changes: 3 additions & 3 deletions docs/pacemaker/faq.md
@@ -134,8 +134,8 @@ Alternatively, you can provide train and test datasets separately:

 ```yaml
 data:
-  filename: /path/to/train_data.pckl.gzip
-  test_filename: /path/to/test_data.pckl.gzip
+  filename: /path/to/train_data.pkl.gz
+  test_filename: /path/to/test_data.pkl.gz
 ```

 ## I want to change the cutoff, what should I do ?
@@ -147,7 +147,7 @@ If you change cutoff, i.e. from `rcut: 7` to `rcut: 6.5`, then potential should

 ## How better to organize my dataset files ?

-It is recommended to store all dataset files (i.e. `df*.pckl.gzip`) in one folder and
+It is recommended to store all dataset files (i.e. `df*.pkl.gz`) in one folder and
 specify the environment variable `$PACEMAKERDATAPATH` (exectue it in terminal or add to for example `.bashrc`)

 ```
10 changes: 5 additions & 5 deletions docs/pacemaker/inputfile.md
@@ -34,7 +34,7 @@ Dataset could be saved into file as a pickled `pandas` dataframe with special na

 ```YAML
 data:
-  filename: some_stored_dataset.pckl.gzip
+  filename: some_stored_dataset.pkl.gz
   # cache_ref_df: False # whether to store the queried or modified dataset into file, default - True
   # ignore_weights: False # whether to ignore energy and force weighting columns in dataframe
   # datapath: ../data # path to folder with cache files with pickled dataframes
@@ -49,11 +49,11 @@ Example of generating **custom energy/forces weights** is given in `examples/cus
 ### Test set

 You could provide test set either as a fraction or certain number of samples from the train set (option `test_size`) or
-as a separate pckl.gzip file (option `test_filename`)
+as a separate pkl.gz file (option `test_filename`)

 ```yaml
 data:
-  test_filename: my_test_dataset.pckl.gzip
+  test_filename: my_test_dataset.pkl.gz
 ```

 or
@@ -231,8 +231,8 @@ fit:
     }

     ## Custom weights: corresponding to main dataset index and `w_energy` and `w_forces` columns should
-    ## be provided in pckl.gzip file
-    #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pckl.gzip}
+    ## be provided in pkl.gz file
+    #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}

 ## OPTIMIZATION OPTIONS ##
 optimizer: BFGS # BFGS, L-BFGS-B, Nelder-Mead, etc. : scipy minimization algorithm
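For the `ExternalWeightingPolicy` mentioned above, the weights file is just another pickled dataframe. A sketch using the column names the doc states (`w_energy`, `w_forces`); the index must match the main dataset's index:

```python
import pandas as pd

# Illustrative: full weight for the first structure, reduced for the second.
weights = pd.DataFrame(
    {"w_energy": [1.0, 0.1], "w_forces": [1.0, 0.1]},
    index=[0, 1],  # must correspond to the main dataset index
)
weights.to_pickle("custom_weights_only.pkl.gz", protocol=4)
```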
14 changes: 6 additions & 8 deletions docs/pacemaker/quickstart.md
@@ -20,7 +20,7 @@ If you have free atom calculations (single atom in large volume) in subfolders,
 pace_collect -wd path/to/my_dft_calculation --free-atom-energy auto
 ```
 Both commands will scan through all folders and subfolders and collect DFT free energies (that are force-consistent) and forces
-and make a single atom corrections. Resulting dataset will be stored into `collected.pckl.gzip` file.
+and make a single atom corrections. Resulting dataset will be stored into `collected.pkl.gz` file.

 If you need more flexibility for DFT dataset manipulation,
 please check [Manual fitting dataset preparation](#manual_fitting_dataset_preparation).
@@ -45,7 +45,7 @@ An example DataFrame can be red as:

 ```python
 import pandas as pd
-df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip", protocol=4)
+df = pd.read_pickle("../data/exmpl_df.pkl.gz")
 ```
 And it contains the following entries:

@@ -120,24 +120,22 @@ data = {'energy': [e1, e2],
 # create a DataFrame
 df = pd.DataFrame(data)
 # and save it
-df.to_pickle('my_data.pckl.gzip', compression='gzip', protocol=4)
+df.to_pickle('my_data.pkl.gz', protocol=4)
 ```

 or use the utility `pace_collect` from a top-level directory to collect VASP calculations and store them in a
-`collected.pckl.gzip` file.
+`collected.pkl.gz` file.
 The resulting dataframe can be used for fitting with `pacemaker`.

 ### Creating an input file

 In this example we will use template as it is, however one would need to provide a path to the
-example dataset `exmpl_df.pckl.gzip`. This can be done by changing `filename` parameter in the `data` section of the
+example dataset `exmpl_df.pkl.gz`. This can be done by changing `filename` parameter in the `data` section of the
 `input.yaml`:

 ```yaml
-
 data:
-    filename: /path/to/the/pyace/data/exmpl_df.pckl.gzip
-
+    filename: /path/to/the/pyace/data/exmpl_df.pkl.gz
 ```

 Please check [examples folder](https://github.com/ICAMS/python-ace/tree/master/examples) for more examples of input file.
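As a quick sanity check after preparing `my_data.pkl.gz` as in the quickstart, one can reload it and inspect the columns (the column names shown are those used elsewhere in these docs; the exact set depends on how the dataset was built):

```python
import pandas as pd

df = pd.read_pickle("my_data.pkl.gz")
print(df.columns.tolist())  # e.g. ['energy', 'forces', 'ase_atoms', 'energy_corrected']
print(len(df), "structures")
```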
12 changes: 6 additions & 6 deletions docs/pacemaker/utilities.md
@@ -46,7 +46,7 @@ pace_info [-h] potential_file

 ## Collect and store VASP data in pickle file

-Utility to collect VASP calculations from a top-level directory and store them in a `*.pckl.gzip` file that can be used for fitting with `pacemaker`.
+Utility to collect VASP calculations from a top-level directory and store them in a `*.pkl.gz` file that can be used for fitting with `pacemaker`.
 The reference energies could be provided for each element (default value is zero)
 or extracted automatically from the calculation with single atom and large enough (>500 Ang^3/atom) volume. Usage:

@@ -59,7 +59,7 @@ optional arguments:
   -wd WORKING_DIR, --working-dir WORKING_DIR
                         top directory where keep calculations
   --output-dataset-filename OUTPUT_DATASET_FILENAME
-                        pickle filename, default is collected.pckl.gzip
+                        pickle filename, default is collected.pkl.gz
   --free-atom-energy [FREE_ATOM_ENERGY [FREE_ATOM_ENERGY ...]]
                         dictionary of reference energies (auto for extraction from dataset), i.e. `Al:-0.123 Cu:-0.456 Zn:auto`, default is zero. If option is `auto`, then it will be extracted from dataset
   --selection SELECTION
@@ -81,7 +81,7 @@ potential_file B-basis file name (.yaml)
 optional arguments:
   -h, --help show this help message and exit
   -d DATASET, --dataset DATASET
-                        Dataset file name, ex.: filename.pckl.gzip
+                        Dataset file name, ex.: filename.pkl.gz
   -f, --full Compute active set on full (linearized) design matrix
   -b BATCH_SIZE, --batch_size BATCH_SIZE
                         Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered
@@ -98,14 +98,14 @@
 Example of usage:

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml
 ```
 that will generate **linear** active set and store it into `output_potential.asi` file.

 or

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f
 ```
 that will generate **full** active set (including linearized part of non-linear embedding function)
-and store it into `output_potential.asi.nonlinear` file.
+and store it into `output_potential.asi.nonlinear` file.
6 changes: 3 additions & 3 deletions examples/Cu-I/input.yaml
@@ -42,7 +42,7 @@ potential:
     NameOfCutoffFunction: cos,
   }

-
+
 ## possible keywords: ALL, UNARY, BINARY, TERNARY, QUATERNARY, QUINARY,
 ## element combinations as (Al,Al), (Al, Ni), (Al, Ni, Zn), etc...
@@ -58,8 +58,8 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  ### Option 1: pandas dataframe in pckl.gzip
-  filename: Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip # force to read reference pickled dataframe from given file
+  ### Option 1: pandas dataframe in pkl.gz
+  filename: Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz # force to read reference pickled dataframe from given file


 #################################################################
File renamed without changes.
4 changes: 2 additions & 2 deletions examples/Cu-II/input.yaml
@@ -55,8 +55,8 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  ### Option 1: pandas dataframe in pckl.gzip
-  filename: Cu_df2_1k.pkl.gzip # force to read reference pickled dataframe from given file
+  ### Option 1: pandas dataframe in pkl.gz
+  filename: Cu_df2_1k.pkl.gz # force to read reference pickled dataframe from given file



File renamed without changes.
4 changes: 2 additions & 2 deletions examples/Ethanol/input.yaml
@@ -49,7 +49,7 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  filename: ethanol.pckl.gzip # force to read reference pickled dataframe from given file
+  filename: ethanol.pkl.gz # force to read reference pickled dataframe from given file


 #################################################################
@@ -76,6 +76,6 @@ fit:
 backend:
   evaluator: tensorpot # tensorpot backend (recommended)
   batch_size: 1000
-
+  ## frequency of detailed metric calculation and printing
   display_step: 50
4 changes: 2 additions & 2 deletions examples/HEA/input.yaml
@@ -58,8 +58,8 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  ### Option 1: pandas dataframe in pckl.gzip
-  filename: HEA_randII_example.pckl.gzip
+  ### Option 1: pandas dataframe in pkl.gz
+  filename: HEA_randII_example.pkl.gz

 #################################################################
 ## Fit settings section