From 053568cd536b9c13c3c62a0bed0c5905a383bd3d Mon Sep 17 00:00:00 2001
From: yuzie007
Date: Wed, 12 Apr 2023 00:42:02 +0200
Subject: [PATCH 1/3] Modify `-DCMAKE_INSTALL_RPATH` for macOS

---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fae64ed..11d6154 100644
--- a/setup.py
+++ b/setup.py
@@ -71,7 +71,10 @@ def build_extension(self, ext):
 
         # Pile all .so in one place and use $ORIGIN as RPATH
         cmake_args += ["-DCMAKE_BUILD_WITH_INSTALL_RPATH=TRUE"]
-        cmake_args += ["-DCMAKE_INSTALL_RPATH={}".format("$ORIGIN")]
+        if platform.system() == "Darwin":  # macOS
+            cmake_args += ["-DCMAKE_INSTALL_RPATH={}".format("@loader_path")]
+        else:
+            cmake_args += ["-DCMAKE_INSTALL_RPATH={}".format("$ORIGIN")]
         cmake_args += ["-DBUILD_SHARED_LIBS=ON"]
         cmake_args += ["-DYAML_BUILD_SHARED_LIBS=ON"]
 

From df88e9ecd0a75c2aa32afd6323ed27b0770fb4e7 Mon Sep 17 00:00:00 2001
From: yuzie007
Date: Wed, 12 Apr 2023 14:17:36 +0200
Subject: [PATCH 2/3] Remove *protocol* from `pd.read_pickle`

This argument exists for `df.to_pickle` but not for `pd.read_pickle`.
---
 docs/pacemaker/quickstart.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/pacemaker/quickstart.md b/docs/pacemaker/quickstart.md
index 30fa786..32d955e 100644
--- a/docs/pacemaker/quickstart.md
+++ b/docs/pacemaker/quickstart.md
@@ -45,7 +45,7 @@ An example DataFrame can be red as:
 
 ```python
 import pandas as pd
-df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip", protocol=4)
+df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip")
 ```
 
 And it contains the following entries:

From d0e5670e5677b48630755835e07743fe7a0eebb0 Mon Sep 17 00:00:00 2001
From: yuzie007
Date: Wed, 12 Apr 2023 15:28:32 +0200
Subject: [PATCH 3/3] Use the `.pkl.gz` extension rather than the current
 `.pckl.gzip` extension for dataset files.

---
 bin/pace_activeset.py                         |  4 ++--
 bin/pace_collect.py                           |  6 ++---
 bin/pacemaker.py                              | 16 ++++++-------
 data/{exmpl_df.pckl.gzip => exmpl_df.pkl.gz}  | Bin
 docs/pacemaker/active_learning.md             |  8 +++----
 docs/pacemaker/faq.md                         |  6 ++---
 docs/pacemaker/inputfile.md                   | 10 ++++----
 docs/pacemaker/quickstart.md                  | 14 +++++------
 docs/pacemaker/utilities.md                   | 12 +++++-----
 ...p => Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz} | Bin
 examples/Cu-I/input.yaml                      |  6 ++---
 .../{Cu_df2_1k.pkl.gzip => Cu_df2_1k.pkl.gz}  | Bin
 examples/Cu-II/input.yaml                     |  4 ++--
 .../{ethanol.pckl.gzip => ethanol.pkl.gz}     | Bin
 examples/Ethanol/input.yaml                   |  4 ++--
 ...le.pckl.gzip => HEA_randII_example.pkl.gz} | Bin
 examples/HEA/input.yaml                       |  4 ++--
 .../custom-weights/data_custom_weights.ipynb  | 21 ++++++++++-------
 examples/data_selection/data_selection.ipynb  |  8 +++----
 setup.py                                      |  2 +-
 ...pckl => mus_ns_uni_to_rawlsLS_np_rank.pkl} | Bin
 ... 
=> pyace_selected_bbasis_funcspec.pkl.gz} | Bin src/pyace/generalfit.py | 3 +-- src/pyace/multispecies_basisextension.py | 2 +- src/pyace/preparedata.py | 22 +++++++----------- 25 files changed, 73 insertions(+), 79 deletions(-) rename data/{exmpl_df.pckl.gzip => exmpl_df.pkl.gz} (100%) rename examples/Cu-I/{Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip => Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz} (100%) rename examples/Cu-II/{Cu_df2_1k.pkl.gzip => Cu_df2_1k.pkl.gz} (100%) rename examples/Ethanol/{ethanol.pckl.gzip => ethanol.pkl.gz} (100%) rename examples/HEA/{HEA_randII_example.pckl.gzip => HEA_randII_example.pkl.gz} (100%) rename src/pyace/data/{mus_ns_uni_to_rawlsLS_np_rank.pckl => mus_ns_uni_to_rawlsLS_np_rank.pkl} (100%) rename src/pyace/data/{pyace_selected_bbasis_funcspec.pckl.gzip => pyace_selected_bbasis_funcspec.pkl.gz} (100%) diff --git a/bin/pace_activeset.py b/bin/pace_activeset.py index 0df7c57..160b58b 100644 --- a/bin/pace_activeset.py +++ b/bin/pace_activeset.py @@ -24,7 +24,7 @@ parser.add_argument("potential_file", help="B-basis file name (.yaml)", type=str) parser.add_argument("-d", "--dataset", action='append', - help="Dataset file name(s), ex.: -d filename.pckl.gzip [-d filename2.pckl.gzip]", type=str, + help="Dataset file name(s), ex.: -d filename.pkl.gz [-d filename2.pkl.gz]", type=str, required=True) parser.add_argument("-f", "--full", help="Compute active set on full (linearized) design matrix", @@ -71,7 +71,7 @@ else: raise RuntimeError("File {} not found".format(dsfn)) log.info("Loading dataset #{}/{} from {}".format(i + 1, len(dataset_filename), dsfn)) - df = pd.read_pickle(dsfn, compression="gzip") + df = pd.read_pickle(dsfn) log.info("Number of structures: {}".format(len(df))) df_list.append(df) df = pd.concat(df_list, axis=0) diff --git a/bin/pace_collect.py b/bin/pace_collect.py index 24b09da..e821daa 100644 --- a/bin/pace_collect.py +++ b/bin/pace_collect.py @@ -173,8 +173,8 @@ def main(args): parser.add_argument("-wd", "--working-dir", help="top directory where keep calculations", type=str, default='.', dest="working_dir") - parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pckl.gzip", - type=str, default="collected.pckl.gzip", dest="output_dataset_filename") + parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pkl.gz", + type=str, default="collected.pkl.gz", dest="output_dataset_filename") parser.add_argument('--free-atom-energy', help="dictionary of reference energies (auto for extraction from dataset), i.e. 
`Al:-0.123 Cu:-0.456 Zn:auto`," @@ -268,7 +268,7 @@ def main(args): ####### df.drop(columns=n_el_cols + ['comp_dict', 'volume', 'volume_per_atom', 'NUMBER_OF_ATOMS'], inplace=True) - df.to_pickle('{}'.format(output_dataset_filename), compression='gzip', protocol=4) + df.to_pickle('{}'.format(output_dataset_filename), protocol=4) logger.info('Store dataset into {}'.format(output_dataset_filename)) ###### df['absolute_energy_collected_per_atom'] = df['energy_corrected_per_atom'].abs() diff --git a/bin/pacemaker.py b/bin/pacemaker.py index 7c772c9..c38258c 100644 --- a/bin/pacemaker.py +++ b/bin/pacemaker.py @@ -29,11 +29,11 @@ from pyace.atomicenvironment import calculate_minimal_nn_atomic_env, calculate_minimal_nn_tp_atoms from pyace.validate import plot_analyse_error_distributions -files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pckl.gzip", "log.txt", "nohup.out", +files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pkl.gz", "log.txt", "nohup.out", "target_potential.yaml", "current_extended_potential.yaml", "output_potential.yaml", "ladder_metrics.txt", "cycle_metrics.txt", "metrics.txt", "test_ladder_metrics.txt", "test_cycle_metrics.txt", "test_metrics.txt", - "train_pred.pckl.gzip", "test_pred.pckl.gzip", + "train_pred.pkl.gz", "test_pred.pkl.gz", "test_ef-distributions.png", "train_ef-distributions.png", "report" ] @@ -297,7 +297,7 @@ def main(args): if general_fit.fitting_data is not None: log.info("For train data") pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.fitting_data, - fname="train_pred.pckl.gzip") + fname="train_pred.pkl.gz") log.info("Ploting validation graphs") plot_analyse_error_distributions(pred_data, fig_prefix="train_", fig_path="report", imagetype=backend_config.get("imagetype", "png")) @@ -305,7 +305,7 @@ def main(args): if general_fit.test_data is not None: log.info("For test data") pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.test_data, - fname="test_pred.pckl.gzip") + fname="test_pred.pkl.gz") log.info("Ploting validation graphs") plot_analyse_error_distributions(pred_data, fig_prefix="test_", fig_path="report", imagetype=backend_config.get("imagetype", "png")) @@ -316,7 +316,7 @@ def generate_template_input(): readline.parse_and_bind("tab: complete") # 1. Training set size - train_filename = input("Enter training dataset filename (ex.: data.pckl.gzip, [TAB] - autocompletion): ") + train_filename = input("Enter training dataset filename (ex.: data.pkl.gz, [TAB] - autocompletion): ") testset_size_inp = float(input("Enter test set fraction or size (ex.: 0.05 or [ENTER] - no test set): ") or 0) # 2. 
Elements @@ -333,7 +333,7 @@ def generate_template_input(): # checking dataset print("Trying to load {}".format(train_filename)) - df = pd.read_pickle(train_filename, compression="gzip") + df = pd.read_pickle(train_filename) if determine_elements_from_dataset: if 'ase_atoms' in df.columns: print("Determining available elements...") @@ -350,7 +350,7 @@ def generate_template_input(): if resp == "yes": df["energy_corrected"] = df["energy"] print("Saving upgraded dataset into {}...".format(train_filename), end="") - df.to_pickle(train_filename, compression="gzip") + df.to_pickle(train_filename) print("done") @@ -429,7 +429,7 @@ def predict_and_save(general_fit, target_bbasisconfig, structures_dataframe, fna columns_to_drop = [column for column in columns_to_drop if column in structures_dataframe] pred_data = pd.merge(structures_dataframe.drop(columns=columns_to_drop), pred_data, left_index=True, right_index=True) - pred_data.to_pickle(fname, compression="gzip", protocol=4) + pred_data.to_pickle(fname, protocol=4) log.info("Predictions are saved into {} ({})".format(fname, sizeof_fmt(fname))) return pred_data diff --git a/data/exmpl_df.pckl.gzip b/data/exmpl_df.pkl.gz similarity index 100% rename from data/exmpl_df.pckl.gzip rename to data/exmpl_df.pkl.gz diff --git a/docs/pacemaker/active_learning.md b/docs/pacemaker/active_learning.md index ca82aa0..19fc989 100644 --- a/docs/pacemaker/active_learning.md +++ b/docs/pacemaker/active_learning.md @@ -1,7 +1,7 @@ # Extrapolation grade and active learning For any fitted ACE potential and corresponding training set -(usually stored by `pacemaker` into `fitting_data_info.pckl.gzip` file in working directory) +(usually stored by `pacemaker` into `fitting_data_info.pkl.gz` file in working directory) one can generate corresponding active set for linear B-projections (default) of full non-linear embedding. Practice shows that linear active set is enough for extrapolation grade estimation. However, if you want more sensitive (and "over-secure") extrapolation grade, then full active set could be used. @@ -23,7 +23,7 @@ potential_file B-basis file name (.yaml) optional arguments: -h, --help show this help message and exit -d DATASET, --dataset DATASET - Dataset file name, ex.: filename.pckl.gzip + Dataset file name, ex.: filename.pkl.gz -f, --full Compute active set on full (linearized) design matrix -b BATCH_SIZE, --batch_size BATCH_SIZE Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered @@ -40,14 +40,14 @@ optional arguments: Example of usage: ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml ``` that will generate **linear** active set and store it into `output_potential.asi` file. or ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f ``` that will generate **full** active set (including linearized part of non-linear embedding function) and store it into `output_potential.asi.nonlinear` file. 
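
The pattern in the hunks above repeats through the rest of this patch: every explicit `compression="gzip"` argument is dropped once a file name ends in `.gz`. This works because pandas defaults to `compression="infer"`, which resolves a `.gz` suffix to gzip for both `to_pickle` and `read_pickle`; the old `.pckl.gzip` suffix is not in pandas' inference table, which is why it had to be spelled out. A minimal round-trip sketch of that behaviour (the file name and columns are made up for illustration):

```python
import pandas as pd

# With a ".gz" suffix, the default compression="infer" resolves to gzip,
# so no explicit compression argument is needed on either side.
df = pd.DataFrame({"energy": [-3.69, -3.71], "n_atoms": [1, 2]})
df.to_pickle("demo_df.pkl.gz", protocol=4)  # gzip inferred from ".gz"
df2 = pd.read_pickle("demo_df.pkl.gz")      # gzip inferred again on read
assert df.equals(df2)
```
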
diff --git a/docs/pacemaker/faq.md b/docs/pacemaker/faq.md index 1bacc11..e85dfa4 100644 --- a/docs/pacemaker/faq.md +++ b/docs/pacemaker/faq.md @@ -134,8 +134,8 @@ Alternatively, you can provide train and test datasets separately: ```yaml data: - filename: /path/to/train_data.pckl.gzip - test_filename: /path/to/test_data.pckl.gzip + filename: /path/to/train_data.pkl.gz + test_filename: /path/to/test_data.pkl.gz ``` ## I want to change the cutoff, what should I do ? @@ -147,7 +147,7 @@ If you change cutoff, i.e. from `rcut: 7` to `rcut: 6.5`, then potential should ## How better to organize my dataset files ? -It is recommended to store all dataset files (i.e. `df*.pckl.gzip`) in one folder and +It is recommended to store all dataset files (i.e. `df*.pkl.gz`) in one folder and specify the environment variable `$PACEMAKERDATAPATH` (exectue it in terminal or add to for example `.bashrc`) ``` diff --git a/docs/pacemaker/inputfile.md b/docs/pacemaker/inputfile.md index 754eaaf..6c31b49 100644 --- a/docs/pacemaker/inputfile.md +++ b/docs/pacemaker/inputfile.md @@ -34,7 +34,7 @@ Dataset could be saved into file as a pickled `pandas` dataframe with special na ```YAML data: - filename: some_stored_dataset.pckl.gzip + filename: some_stored_dataset.pkl.gz # cache_ref_df: False # whether to store the queried or modified dataset into file, default - True # ignore_weights: False # whether to ignore energy and force weighting columns in dataframe # datapath: ../data # path to folder with cache files with pickled dataframes @@ -49,11 +49,11 @@ Example of generating **custom energy/forces weights** is given in `examples/cus ### Test set You could provide test set either as a fraction or certain number of samples from the train set (option `test_size`) or -as a separate pckl.gzip file (option `test_filename`) +as a separate pkl.gz file (option `test_filename`) ```yaml data: - test_filename: my_test_dataset.pckl.gzip + test_filename: my_test_dataset.pkl.gz ``` or @@ -231,8 +231,8 @@ fit: } ## Custom weights: corresponding to main dataset index and `w_energy` and `w_forces` columns should - ## be provided in pckl.gzip file - #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pckl.gzip} + ## be provided in pkl.gz file + #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz} ## OPTIMIZATION OPTIONS ## optimizer: BFGS # BFGS, L-BFGS-B, Nelder-Mead, etc. : scipy minimization algorithm diff --git a/docs/pacemaker/quickstart.md b/docs/pacemaker/quickstart.md index 32d955e..d088a41 100644 --- a/docs/pacemaker/quickstart.md +++ b/docs/pacemaker/quickstart.md @@ -20,7 +20,7 @@ If you have free atom calculations (single atom in large volume) in subfolders, pace_collect -wd path/to/my_dft_calculation --free-atom-energy auto ``` Both commands will scan through all folders and subfolders and collect DFT free energies (that are force-consistent) and forces -and make a single atom corrections. Resulting dataset will be stored into `collected.pckl.gzip` file. +and make a single atom corrections. Resulting dataset will be stored into `collected.pkl.gz` file. If you need more flexibility for DFT dataset manipulation, please check [Manual fitting dataset preparation](#manual_fitting_dataset_preparation). 
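
After `pace_collect` finishes, the collected dataset can be sanity-checked with plain pandas before fitting. A short sketch, assuming the command was run in the current directory (the exact columns depend on the calculations, but `energy_corrected`, `forces` and `ase_atoms` are the ones `pacemaker` consumes):

```python
import pandas as pd

df = pd.read_pickle("collected.pkl.gz")
print("structures:", len(df))
print(df.columns.tolist())  # expect e.g. energy_corrected, forces, ase_atoms
```
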
@@ -45,7 +45,7 @@ An example DataFrame can be red as: ```python import pandas as pd -df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip") +df = pd.read_pickle("../data/exmpl_df.pkl.gz") ``` And it contains the following entries: @@ -120,24 +120,22 @@ data = {'energy': [e1, e2], # create a DataFrame df = pd.DataFrame(data) # and save it -df.to_pickle('my_data.pckl.gzip', compression='gzip', protocol=4) +df.to_pickle('my_data.pkl.gz', protocol=4) ``` or use the utility `pace_collect` from a top-level directory to collect VASP calculations and store them in a -`collected.pckl.gzip` file. +`collected.pkl.gz` file. The resulting dataframe can be used for fitting with `pacemaker`. ### Creating an input file In this example we will use template as it is, however one would need to provide a path to the -example dataset `exmpl_df.pckl.gzip`. This can be done by changing `filename` parameter in the `data` section of the +example dataset `exmpl_df.pkl.gz`. This can be done by changing `filename` parameter in the `data` section of the `input.yaml`: ```yaml - data: - filename: /path/to/the/pyace/data/exmpl_df.pckl.gzip - + filename: /path/to/the/pyace/data/exmpl_df.pkl.gz ``` Please check [examples folder](https://github.com/ICAMS/python-ace/tree/master/examples) for more examples of input file. diff --git a/docs/pacemaker/utilities.md b/docs/pacemaker/utilities.md index 7935620..525f387 100644 --- a/docs/pacemaker/utilities.md +++ b/docs/pacemaker/utilities.md @@ -46,7 +46,7 @@ pace_info [-h] potential_file ## Collect and store VASP data in pickle file -Utility to collect VASP calculations from a top-level directory and store them in a `*.pckl.gzip` file that can be used for fitting with `pacemaker`. +Utility to collect VASP calculations from a top-level directory and store them in a `*.pkl.gz` file that can be used for fitting with `pacemaker`. The reference energies could be provided for each element (default value is zero) or extracted automatically from the calculation with single atom and large enough (>500 Ang^3/atom) volume. Usage: @@ -59,7 +59,7 @@ optional arguments: -wd WORKING_DIR, --working-dir WORKING_DIR top directory where keep calculations --output-dataset-filename OUTPUT_DATASET_FILENAME - pickle filename, default is collected.pckl.gzip + pickle filename, default is collected.pkl.gz --free-atom-energy [FREE_ATOM_ENERGY [FREE_ATOM_ENERGY ...]] dictionary of reference energies (auto for extraction from dataset), i.e. `Al:-0.123 Cu:-0.456 Zn:auto`, default is zero. If option is `auto`, then it will be extracted from dataset --selection SELECTION @@ -81,7 +81,7 @@ potential_file B-basis file name (.yaml) optional arguments: -h, --help show this help message and exit -d DATASET, --dataset DATASET - Dataset file name, ex.: filename.pckl.gzip + Dataset file name, ex.: filename.pkl.gz -f, --full Compute active set on full (linearized) design matrix -b BATCH_SIZE, --batch_size BATCH_SIZE Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered @@ -98,14 +98,14 @@ optional arguments: Example of usage: ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml ``` that will generate **linear** active set and store it into `output_potential.asi` file. 
or ``` -pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f +pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f ``` that will generate **full** active set (including linearized part of non-linear embedding function) -and store it into `output_potential.asi.nonlinear` file. \ No newline at end of file +and store it into `output_potential.asi.nonlinear` file. diff --git a/examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip b/examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz similarity index 100% rename from examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip rename to examples/Cu-I/Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz diff --git a/examples/Cu-I/input.yaml b/examples/Cu-I/input.yaml index 15e525b..5044b26 100644 --- a/examples/Cu-I/input.yaml +++ b/examples/Cu-I/input.yaml @@ -42,7 +42,7 @@ potential: NameOfCutoffFunction: cos, } - + ## possible keywords: ALL, UNARY, BINARY, TERNARY, QUATERNARY, QUINARY, ## element combinations as (Al,Al), (Al, Ni), (Al, Ni, Zn), etc... @@ -58,8 +58,8 @@ potential: ## Dataset specification section ################################################################# data: -### Option 1: pandas dataframe in pckl.gzip - filename: Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip # force to read reference pickled dataframe from given file +### Option 1: pandas dataframe in pkl.gz + filename: Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz # force to read reference pickled dataframe from given file ################################################################# diff --git a/examples/Cu-II/Cu_df2_1k.pkl.gzip b/examples/Cu-II/Cu_df2_1k.pkl.gz similarity index 100% rename from examples/Cu-II/Cu_df2_1k.pkl.gzip rename to examples/Cu-II/Cu_df2_1k.pkl.gz diff --git a/examples/Cu-II/input.yaml b/examples/Cu-II/input.yaml index 3c4fb29..2e155f2 100644 --- a/examples/Cu-II/input.yaml +++ b/examples/Cu-II/input.yaml @@ -55,8 +55,8 @@ potential: ## Dataset specification section ################################################################# data: -### Option 1: pandas dataframe in pckl.gzip - filename: Cu_df2_1k.pkl.gzip # force to read reference pickled dataframe from given file +### Option 1: pandas dataframe in pkl.gz + filename: Cu_df2_1k.pkl.gz # force to read reference pickled dataframe from given file diff --git a/examples/Ethanol/ethanol.pckl.gzip b/examples/Ethanol/ethanol.pkl.gz similarity index 100% rename from examples/Ethanol/ethanol.pckl.gzip rename to examples/Ethanol/ethanol.pkl.gz diff --git a/examples/Ethanol/input.yaml b/examples/Ethanol/input.yaml index b4bebef..3d00085 100644 --- a/examples/Ethanol/input.yaml +++ b/examples/Ethanol/input.yaml @@ -49,7 +49,7 @@ potential: ## Dataset specification section ################################################################# data: - filename: ethanol.pckl.gzip # force to read reference pickled dataframe from given file + filename: ethanol.pkl.gz # force to read reference pickled dataframe from given file ################################################################# @@ -76,6 +76,6 @@ fit: backend: evaluator: tensorpot # tensorpot backend (recommended) batch_size: 1000 - + ## frequency of detailed metric calculation and printing display_step: 50 diff --git a/examples/HEA/HEA_randII_example.pckl.gzip b/examples/HEA/HEA_randII_example.pkl.gz similarity index 100% rename from examples/HEA/HEA_randII_example.pckl.gzip rename to examples/HEA/HEA_randII_example.pkl.gz diff --git a/examples/HEA/input.yaml b/examples/HEA/input.yaml index e590438..fc270c9 100644 --- a/examples/HEA/input.yaml +++ 
b/examples/HEA/input.yaml @@ -58,8 +58,8 @@ potential: ## Dataset specification section ################################################################# data: - ### Option 1: pandas dataframe in pckl.gzip - filename: HEA_randII_example.pckl.gzip + ### Option 1: pandas dataframe in pkl.gz + filename: HEA_randII_example.pkl.gz ################################################################# ## Fit settings section diff --git a/examples/custom-weights/data_custom_weights.ipynb b/examples/custom-weights/data_custom_weights.ipynb index b176db3..9cf0939 100644 --- a/examples/custom-weights/data_custom_weights.ipynb +++ b/examples/custom-weights/data_custom_weights.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "df=pd.read_pickle(\"/some/fit/data/Al-Li/data.pckl.gzip\", compression=\"gzip\")" + "df=pd.read_pickle(\"/some/fit/data/Al-Li/data.pkl.gz\")" ] }, { @@ -312,7 +312,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_pickle(\"df_weights.pckl.gzip\", compression=\"gzip\", protocol=4)" + "df.to_pickle(\"df_weights.pkl.gz\", protocol=4)" ] }, { @@ -554,7 +554,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_pickle(\"df_weights_elastic_x10.pckl.gzip\", compression=\"gzip\", protocol=4)" + "df.to_pickle(\"df_weights_elastic_x10.pkl.gz\", protocol=4)" ] }, { @@ -635,7 +635,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_pickle(\"df_weights_Li_x5.pckl.gzip\", compression=\"gzip\", protocol=4)" + "df.to_pickle(\"df_weights_Li_x5.pkl.gz\", protocol=4)" ] }, { @@ -646,6 +646,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -655,7 +656,7 @@ "...\n", "\n", "data:\n", - " filename: df_weights.pckl.gzip\n", + " filename: df_weights.pkl.gz\n", "\n", "...\n", "``` " @@ -669,6 +670,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -678,7 +680,7 @@ "...\n", "\n", "data:\n", - " filename: df_weights.pckl.gzip\n", + " filename: df_weights.pkl.gz\n", " ignore_weights: True\n", "\n", "...\n", @@ -837,7 +839,7 @@ "metadata": {}, "outputs": [], "source": [ - "weights_only.to_pickle(\"custom_weights_only.pckl.gzip\", compression=\"gzip\", protocol=4)" + "weights_only.to_pickle(\"custom_weights_only.pkl.gz\", protocol=4)" ] }, { @@ -848,6 +850,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -857,11 +860,11 @@ "...\n", "# load data as usual\n", "data:\n", - " filename: /some/fit/data/Al-Li/data.pckl.gzip\n", + " filename: /some/fit/data/Al-Li/data.pkl.gz\n", "\n", "fit:\n", " # use ExternalWeightingPolicy weighting scheme\n", - " weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pckl.gzip}\n", + " weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}\n", "\n", "...\n", "``` " diff --git a/examples/data_selection/data_selection.ipynb b/examples/data_selection/data_selection.ipynb index ec76b6a..7b34244 100644 --- a/examples/data_selection/data_selection.ipynb +++ b/examples/data_selection/data_selection.ipynb @@ -53,7 +53,7 @@ "metadata": {}, "outputs": [], "source": [ - "df=pd.read_pickle(\"../Cu-II/Cu_df2_1k.pkl.gzip\",compression=\"gzip\")" + "df=pd.read_pickle(\"../Cu-II/Cu_df2_1k.pkl.gz\")" ] }, { @@ -444,7 +444,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_selected.to_pickle(\"Cu-fcc-only.pckl.gzip\",compression=\"gzip\")" + "df_selected.to_pickle(\"Cu-fcc-only.pkl.gz\")" ] }, { @@ -521,7 +521,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_selected.to_pickle(\"Cu-0.5eV.pckl.gzip\", 
compression=\"gzip\")" + "df_selected.to_pickle(\"Cu-0.5eV.pkl.gz\")" ] }, { @@ -577,7 +577,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_selected.to_pickle(\"Cu-no-shake.pckl.gzip\", compression=\"gzip\")" + "df_selected.to_pickle(\"Cu-no-shake.pkl.gz\")" ] }, { diff --git a/setup.py b/setup.py index 11d6154..d0d44c3 100644 --- a/setup.py +++ b/setup.py @@ -161,7 +161,7 @@ def build_extension(self, ext): 'Programming Language :: Python :: 3', ], package_data={"pyace.data": [ - "mus_ns_uni_to_rawlsLS_np_rank.pckl", + "mus_ns_uni_to_rawlsLS_np_rank.pkl", "input_template.yaml" ]}, scripts=["bin/pacemaker", "bin/pace_yaml2yace", "bin/pace_update_ace", diff --git a/src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pckl b/src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pkl similarity index 100% rename from src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pckl rename to src/pyace/data/mus_ns_uni_to_rawlsLS_np_rank.pkl diff --git a/src/pyace/data/pyace_selected_bbasis_funcspec.pckl.gzip b/src/pyace/data/pyace_selected_bbasis_funcspec.pkl.gz similarity index 100% rename from src/pyace/data/pyace_selected_bbasis_funcspec.pckl.gzip rename to src/pyace/data/pyace_selected_bbasis_funcspec.pkl.gz diff --git a/src/pyace/generalfit.py b/src/pyace/generalfit.py index 30d8de3..e83104f 100644 --- a/src/pyace/generalfit.py +++ b/src/pyace/generalfit.py @@ -26,7 +26,7 @@ __username = None -FITTING_DATA_INFO_FILENAME = "fitting_data_info.pckl.gzip" +FITTING_DATA_INFO_FILENAME = "fitting_data_info.pkl.gz" def get_username(): @@ -382,7 +382,6 @@ def save_fitting_data_info(self): columns_to_save = [col for col in fitting_data_columns if col not in columns_to_drop] self.fitting_data[columns_to_save].to_pickle(FITTING_DATA_INFO_FILENAME, - compression="gzip", protocol=4) log.info("Fitting dataset info saved into {}".format(FITTING_DATA_INFO_FILENAME)) diff --git a/src/pyace/multispecies_basisextension.py b/src/pyace/multispecies_basisextension.py index 82bb41e..4774590 100644 --- a/src/pyace/multispecies_basisextension.py +++ b/src/pyace/multispecies_basisextension.py @@ -45,7 +45,7 @@ 'Lv', 'Ts', 'Og'] default_mus_ns_uni_to_rawlsLS_np_rank_filename = pkg_resources.resource_filename('pyace.data', - 'mus_ns_uni_to_rawlsLS_np_rank.pckl') + 'mus_ns_uni_to_rawlsLS_np_rank.pkl') def clean_bbasisconfig(initial_bbasisconfig): for block in initial_bbasisconfig.funcspecs_blocks: diff --git a/src/pyace/preparedata.py b/src/pyace/preparedata.py index 124302f..bc8b6e2 100644 --- a/src/pyace/preparedata.py +++ b/src/pyace/preparedata.py @@ -230,25 +230,19 @@ def generate_weights(self, df): def save_dataframe(df: pd.DataFrame, filename: str, protocol: int = 4): filename = os.path.abspath(filename) log.info("Writing fit pickle file: {}".format(filename)) - if filename.endswith("gzip"): - compression = "gzip" - else: - compression = "infer" dirname = os.path.dirname(filename) os.makedirs(dirname, exist_ok=True) if not isinstance(df, DataFrameWithMetadata): log.info("Transforming to DataFrameWithMetadata") df = DataFrameWithMetadata(df) - df.to_pickle(filename, protocol=protocol, compression=compression) + df.to_pickle(filename, protocol=protocol) log.info("Saved to file {} ({})".format(filename, sizeof_fmt(filename))) -def load_dataframe(filename: str, compression: str = "infer") -> pd.DataFrame: +def load_dataframe(filename: str) -> pd.DataFrame: filesize = os.path.getsize(filename) log.info("Loading dataframe from pickle file {} ({})".format(filename, sizeof_fmt(filesize))) - if filename.endswith(".gzip"): - compression = "gzip" - 
df = pd.read_pickle(filename, compression=compression) + df = pd.read_pickle(filename) return df @@ -346,7 +340,7 @@ def add_ase_atoms_transformer(self, result_column_name, transformer_func, **kwar def get_default_ref_filename(self): try: - return "df-{calculator}-{element}-{suffix}.pckl.gzip".format( + return "df-{calculator}-{element}-{suffix}.pkl.gz".format( calculator=self.config["calculator"], element=self.config["element"], suffix="ref").replace("/", "_") @@ -562,7 +556,7 @@ def load_or_query_ref_structures_dataframe(self, force_query=None): self.df = self.raw_df elif file_to_load is not None and os.path.isfile(file_to_load) and not force_query: log.info(file_to_load + " found, try to load") - self.df = load_dataframe(file_to_load, compression="infer") + self.df = load_dataframe(file_to_load) else: # if ref_df is still not loaded, try to query from DB if not force_query: log.info("Cache not found, querying database") @@ -610,7 +604,7 @@ def get_ref_dataframe(self, force_query=None, cache_ref_df=False): if cache_ref_df or self.cache_ref_df: if self.ref_df_changed: # generate filename to save df: if name is provided - try to put it into datapath - filename = self.get_actual_filename() or "df_ref.pckl.gzip" + filename = self.get_actual_filename() or "df_ref.pkl.gz" log.info("Saving processed raw dataframe into " + filename) save_dataframe(self.df, filename=filename) else: @@ -1051,7 +1045,7 @@ class ExternalWeightingPolicy(StructuresDatasetWeightingPolicy): def __init__(self, filename: str): """ - :param filename: .pckl.gzip filename of dataframe with index and `w_energy` and `w_forces` columns + :param filename: .pkl.gz filename of dataframe with index and `w_energy` and `w_forces` columns """ self.filename = filename @@ -1060,7 +1054,7 @@ def __str__(self): def generate_weights(self, df): log.info("Loading external weights dataframe {}".format(self.filename)) - self.weights_df = pd.read_pickle(self.filename, compression="gzip") + self.weights_df = pd.read_pickle(self.filename) log.info("External weights dataframe loaded, it contains {} entries".format(len(self.weights_df))) # check that columns are presented
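
For reference, the external-weights file that `ExternalWeightingPolicy.generate_weights` loads above can be produced with plain pandas. A sketch with uniform placeholder weights, assuming the main dataset is `my_data.pkl.gz` (see `examples/custom-weights/data_custom_weights.ipynb` for a realistic, per-structure weighting scheme):

```python
import pandas as pd

# The index must match the main fitting dataset; w_energy and w_forces are
# the column names ExternalWeightingPolicy expects. Uniform scalars are only
# placeholders; see the custom-weights notebook for a realistic scheme.
df = pd.read_pickle("my_data.pkl.gz")  # main dataset (hypothetical name)
weights_only = pd.DataFrame(index=df.index)
weights_only["w_energy"] = 1.0 / len(df)
weights_only["w_forces"] = 1.0 / len(df)
weights_only.to_pickle("custom_weights_only.pkl.gz", protocol=4)
```

The file is then referenced from `input.yaml` as `weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}`.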