Modify to use the .pkl.gz extension #31

Open · wants to merge 3 commits into master
4 changes: 2 additions & 2 deletions bin/pace_activeset.py
@@ -24,7 +24,7 @@
 parser.add_argument("potential_file", help="B-basis file name (.yaml)", type=str)

 parser.add_argument("-d", "--dataset", action='append',
-                    help="Dataset file name(s), ex.: -d filename.pckl.gzip [-d filename2.pckl.gzip]", type=str,
+                    help="Dataset file name(s), ex.: -d filename.pkl.gz [-d filename2.pkl.gz]", type=str,
                     required=True)

 parser.add_argument("-f", "--full", help="Compute active set on full (linearized) design matrix",
@@ -71,7 +71,7 @@
     else:
         raise RuntimeError("File {} not found".format(dsfn))
     log.info("Loading dataset #{}/{} from {}".format(i + 1, len(dataset_filename), dsfn))
-    df = pd.read_pickle(dsfn, compression="gzip")
+    df = pd.read_pickle(dsfn)
     log.info("Number of structures: {}".format(len(df)))
     df_list.append(df)
 df = pd.concat(df_list, axis=0)
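Note: dropping `compression="gzip"` here is safe because pandas defaults to `compression="infer"`, which selects gzip from a `.gz` file suffix. A minimal sketch of the behavior this change relies on (the file name is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"energy": [-3.71, -3.69]})

# With a .pkl.gz suffix, pandas infers gzip on both the write and read paths,
# so no explicit compression argument is needed.
df.to_pickle("dataset.pkl.gz")
df_back = pd.read_pickle("dataset.pkl.gz")
assert df.equals(df_back)
```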
6 changes: 3 additions & 3 deletions bin/pace_collect.py
@@ -173,8 +173,8 @@ def main(args):
     parser.add_argument("-wd", "--working-dir", help="top directory where keep calculations",
                         type=str, default='.', dest="working_dir")

-    parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pckl.gzip",
-                        type=str, default="collected.pckl.gzip", dest="output_dataset_filename")
+    parser.add_argument("--output-dataset-filename", help="pickle filename, default is collected.pkl.gz",
+                        type=str, default="collected.pkl.gz", dest="output_dataset_filename")

     parser.add_argument('--free-atom-energy',
                         help="dictionary of reference energies (auto for extraction from dataset), i.e. `Al:-0.123 Cu:-0.456 Zn:auto`,"
@@ -268,7 +268,7 @@ def main(args):

     #######
     df.drop(columns=n_el_cols + ['comp_dict', 'volume', 'volume_per_atom', 'NUMBER_OF_ATOMS'], inplace=True)
-    df.to_pickle('{}'.format(output_dataset_filename), compression='gzip', protocol=4)
+    df.to_pickle('{}'.format(output_dataset_filename), protocol=4)
     logger.info('Store dataset into {}'.format(output_dataset_filename))
     ######
     df['absolute_energy_collected_per_atom'] = df['energy_corrected_per_atom'].abs()
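The same inference applies on the write path. An illustrative check that the file produced by the new `to_pickle` call is still gzip-compressed (gzip streams begin with the magic bytes `0x1f 0x8b`):

```python
import pandas as pd

df = pd.DataFrame({"energy_corrected": [-1.0, -2.0]})
df.to_pickle("collected.pkl.gz", protocol=4)  # gzip inferred from the .gz suffix

with open("collected.pkl.gz", "rb") as f:
    assert f.read(2) == b"\x1f\x8b"  # gzip magic bytes
```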
16 changes: 8 additions & 8 deletions bin/pacemaker.py
@@ -29,11 +29,11 @@
 from pyace.atomicenvironment import calculate_minimal_nn_atomic_env, calculate_minimal_nn_tp_atoms
 from pyace.validate import plot_analyse_error_distributions

-files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pckl.gzip", "log.txt", "nohup.out",
+files_to_remove = ["fitting_data_info.csv", "fitting_data_info.pkl.gz", "log.txt", "nohup.out",
                    "target_potential.yaml", "current_extended_potential.yaml", "output_potential.yaml",
                    "ladder_metrics.txt", "cycle_metrics.txt", "metrics.txt",
                    "test_ladder_metrics.txt", "test_cycle_metrics.txt", "test_metrics.txt",
-                   "train_pred.pckl.gzip", "test_pred.pckl.gzip",
+                   "train_pred.pkl.gz", "test_pred.pkl.gz",
                    "test_ef-distributions.png", "train_ef-distributions.png", "report"
                    ]

@@ -297,15 +297,15 @@ def main(args):
     if general_fit.fitting_data is not None:
         log.info("For train data")
         pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.fitting_data,
-                                     fname="train_pred.pckl.gzip")
+                                     fname="train_pred.pkl.gz")
         log.info("Ploting validation graphs")
         plot_analyse_error_distributions(pred_data, fig_prefix="train_", fig_path="report",
                                          imagetype=backend_config.get("imagetype", "png"))

     if general_fit.test_data is not None:
         log.info("For test data")
         pred_data = predict_and_save(general_fit, target_bbasisconfig, general_fit.test_data,
-                                     fname="test_pred.pckl.gzip")
+                                     fname="test_pred.pkl.gz")
         log.info("Ploting validation graphs")
         plot_analyse_error_distributions(pred_data, fig_prefix="test_", fig_path="report",
                                          imagetype=backend_config.get("imagetype", "png"))
@@ -316,7 +316,7 @@ def generate_template_input():
     readline.parse_and_bind("tab: complete")

     # 1. Training set size
-    train_filename = input("Enter training dataset filename (ex.: data.pckl.gzip, [TAB] - autocompletion): ")
+    train_filename = input("Enter training dataset filename (ex.: data.pkl.gz, [TAB] - autocompletion): ")
     testset_size_inp = float(input("Enter test set fraction or size (ex.: 0.05 or [ENTER] - no test set): ") or 0)

     # 2. Elements
@@ -333,7 +333,7 @@

     # checking dataset
     print("Trying to load {}".format(train_filename))
-    df = pd.read_pickle(train_filename, compression="gzip")
+    df = pd.read_pickle(train_filename)
     if determine_elements_from_dataset:
         if 'ase_atoms' in df.columns:
             print("Determining available elements...")
@@ -350,7 +350,7 @@
     if resp == "yes":
         df["energy_corrected"] = df["energy"]
         print("Saving upgraded dataset into {}...".format(train_filename), end="")
-        df.to_pickle(train_filename, compression="gzip")
+        df.to_pickle(train_filename)
         print("done")


@@ -429,7 +429,7 @@ def predict_and_save(general_fit, target_bbasisconfig, structures_dataframe, fna
     columns_to_drop = [column for column in columns_to_drop if column in structures_dataframe]
     pred_data = pd.merge(structures_dataframe.drop(columns=columns_to_drop), pred_data,
                          left_index=True, right_index=True)
-    pred_data.to_pickle(fname, compression="gzip", protocol=4)
+    pred_data.to_pickle(fname, protocol=4)
     log.info("Predictions are saved into {} ({})".format(fname, sizeof_fmt(fname)))
     return pred_data
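One caveat worth noting: pandas does not infer gzip from the legacy `.pckl.gzip` suffix, so existing files must still be read with an explicit `compression="gzip"`. A hypothetical migration helper (not part of this PR) might look like:

```python
import pandas as pd

def migrate(old_path: str) -> str:
    """Re-save a legacy *.pckl.gzip dataframe under the new *.pkl.gz name."""
    # '.gzip' is not in pandas' compression-inference table, so be explicit here.
    df = pd.read_pickle(old_path, compression="gzip")
    new_path = old_path.replace(".pckl.gzip", ".pkl.gz")
    df.to_pickle(new_path, protocol=4)  # gzip inferred from the new suffix
    return new_path

# e.g. migrate("fitting_data_info.pckl.gzip") -> "fitting_data_info.pkl.gz"
```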
File renamed without changes.
8 changes: 4 additions & 4 deletions docs/pacemaker/active_learning.md
@@ -1,7 +1,7 @@
 # Extrapolation grade and active learning

 For any fitted ACE potential and corresponding training set
-(usually stored by `pacemaker` into `fitting_data_info.pckl.gzip` file in working directory)
+(usually stored by `pacemaker` into `fitting_data_info.pkl.gz` file in working directory)
 one can generate corresponding active set for linear B-projections (default) of full non-linear embedding.
 Practice shows that linear active set is enough for extrapolation grade estimation.
 However, if you want more sensitive (and "over-secure") extrapolation grade, then full active set could be used.
@@ -23,7 +23,7 @@ potential_file B-basis file name (.yaml)
 optional arguments:
   -h, --help show this help message and exit
   -d DATASET, --dataset DATASET
-                        Dataset file name, ex.: filename.pckl.gzip
+                        Dataset file name, ex.: filename.pkl.gz
   -f, --full Compute active set on full (linearized) design matrix
   -b BATCH_SIZE, --batch_size BATCH_SIZE
                         Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered
@@ -40,14 +40,14 @@ optional arguments:
 Example of usage:

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml
 ```
 that will generate **linear** active set and store it into `output_potential.asi` file.

 or

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f
 ```
 that will generate **full** active set (including linearized part of non-linear embedding function)
 and store it into `output_potential.asi.nonlinear` file.
6 changes: 3 additions & 3 deletions docs/pacemaker/faq.md
@@ -134,8 +134,8 @@ Alternatively, you can provide train and test datasets separately:

 ```yaml
 data:
-  filename: /path/to/train_data.pckl.gzip
-  test_filename: /path/to/test_data.pckl.gzip
+  filename: /path/to/train_data.pkl.gz
+  test_filename: /path/to/test_data.pkl.gz
 ```

 ## I want to change the cutoff, what should I do ?
@@ -147,7 +147,7 @@ If you change cutoff, i.e. from `rcut: 7` to `rcut: 6.5`, then potential should

 ## How better to organize my dataset files ?

-It is recommended to store all dataset files (i.e. `df*.pckl.gzip`) in one folder and
+It is recommended to store all dataset files (i.e. `df*.pkl.gz`) in one folder and
 specify the environment variable `$PACEMAKERDATAPATH` (exectue it in terminal or add to for example `.bashrc`)

 ```
10 changes: 5 additions & 5 deletions docs/pacemaker/inputfile.md
@@ -34,7 +34,7 @@ Dataset could be saved into file as a pickled `pandas` dataframe with special na

 ```YAML
 data:
-  filename: some_stored_dataset.pckl.gzip
+  filename: some_stored_dataset.pkl.gz
   # cache_ref_df: False # whether to store the queried or modified dataset into file, default - True
   # ignore_weights: False # whether to ignore energy and force weighting columns in dataframe
   # datapath: ../data # path to folder with cache files with pickled dataframes
@@ -49,11 +49,11 @@ Example of generating **custom energy/forces weights** is given in `examples/cus
 ### Test set

 You could provide test set either as a fraction or certain number of samples from the train set (option `test_size`) or
-as a separate pckl.gzip file (option `test_filename`)
+as a separate pkl.gz file (option `test_filename`)

 ```yaml
 data:
-  test_filename: my_test_dataset.pckl.gzip
+  test_filename: my_test_dataset.pkl.gz
 ```

 or
@@ -231,8 +231,8 @@ fit:
     }

     ## Custom weights: corresponding to main dataset index and `w_energy` and `w_forces` columns should
-    ## be provided in pckl.gzip file
-    #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pckl.gzip}
+    ## be provided in pkl.gz file
+    #weighting: {type: ExternalWeightingPolicy, filename: custom_weights_only.pkl.gz}

 ## OPTIMIZATION OPTIONS ##
 optimizer: BFGS # BFGS, L-BFGS-B, Nelder-Mead, etc. : scipy minimization algorithm
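For the `ExternalWeightingPolicy` mentioned above, the weights file is just another pickled dataframe. A sketch using the column names the doc states (`w_energy`, `w_forces`); the index must match the main dataset's index:

```python
import pandas as pd

# Illustrative: full weight for the first structure, reduced for the second.
weights = pd.DataFrame(
    {"w_energy": [1.0, 0.1], "w_forces": [1.0, 0.1]},
    index=[0, 1],  # must correspond to the main dataset index
)
weights.to_pickle("custom_weights_only.pkl.gz", protocol=4)
```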
14 changes: 6 additions & 8 deletions docs/pacemaker/quickstart.md
@@ -20,7 +20,7 @@ If you have free atom calculations (single atom in large volume) in subfolders,
 pace_collect -wd path/to/my_dft_calculation --free-atom-energy auto
 ```
 Both commands will scan through all folders and subfolders and collect DFT free energies (that are force-consistent) and forces
-and make a single atom corrections. Resulting dataset will be stored into `collected.pckl.gzip` file.
+and make a single atom corrections. Resulting dataset will be stored into `collected.pkl.gz` file.

 If you need more flexibility for DFT dataset manipulation,
 please check [Manual fitting dataset preparation](#manual_fitting_dataset_preparation).
@@ -45,7 +45,7 @@ An example DataFrame can be red as:

 ```python
 import pandas as pd
-df = pd.read_pickle("../data/exmpl_df.pckl.gzip", compression="gzip", protocol=4)
+df = pd.read_pickle("../data/exmpl_df.pkl.gz")
 ```
 And it contains the following entries:

@@ -120,24 +120,22 @@ data = {'energy': [e1, e2],
 # create a DataFrame
 df = pd.DataFrame(data)
 # and save it
-df.to_pickle('my_data.pckl.gzip', compression='gzip', protocol=4)
+df.to_pickle('my_data.pkl.gz', protocol=4)
 ```

 or use the utility `pace_collect` from a top-level directory to collect VASP calculations and store them in a
-`collected.pckl.gzip` file.
+`collected.pkl.gz` file.
 The resulting dataframe can be used for fitting with `pacemaker`.

 ### Creating an input file

 In this example we will use template as it is, however one would need to provide a path to the
-example dataset `exmpl_df.pckl.gzip`. This can be done by changing `filename` parameter in the `data` section of the
+example dataset `exmpl_df.pkl.gz`. This can be done by changing `filename` parameter in the `data` section of the
 `input.yaml`:

 ```yaml
-
 data:
-    filename: /path/to/the/pyace/data/exmpl_df.pckl.gzip
-
+    filename: /path/to/the/pyace/data/exmpl_df.pkl.gz
 ```

 Please check [examples folder](https://github.com/ICAMS/python-ace/tree/master/examples) for more examples of input file.
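As a quick sanity check after preparing `my_data.pkl.gz` as in the quickstart, one can reload it and inspect the columns (the column names shown are those used elsewhere in these docs; the exact set depends on how the dataset was built):

```python
import pandas as pd

df = pd.read_pickle("my_data.pkl.gz")
print(df.columns.tolist())  # e.g. ['energy', 'forces', 'ase_atoms', 'energy_corrected']
print(len(df), "structures")
```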
12 changes: 6 additions & 6 deletions docs/pacemaker/utilities.md
@@ -46,7 +46,7 @@ pace_info [-h] potential_file

 ## Collect and store VASP data in pickle file

-Utility to collect VASP calculations from a top-level directory and store them in a `*.pckl.gzip` file that can be used for fitting with `pacemaker`.
+Utility to collect VASP calculations from a top-level directory and store them in a `*.pkl.gz` file that can be used for fitting with `pacemaker`.
 The reference energies could be provided for each element (default value is zero)
 or extracted automatically from the calculation with single atom and large enough (>500 Ang^3/atom) volume. Usage:

@@ -59,7 +59,7 @@ optional arguments:
   -wd WORKING_DIR, --working-dir WORKING_DIR
                         top directory where keep calculations
   --output-dataset-filename OUTPUT_DATASET_FILENAME
-                        pickle filename, default is collected.pckl.gzip
+                        pickle filename, default is collected.pkl.gz
   --free-atom-energy [FREE_ATOM_ENERGY [FREE_ATOM_ENERGY ...]]
                         dictionary of reference energies (auto for extraction from dataset), i.e. `Al:-0.123 Cu:-0.456 Zn:auto`, default is zero. If option is `auto`, then it will be extracted from dataset
   --selection SELECTION
@@ -81,7 +81,7 @@ potential_file B-basis file name (.yaml)
 optional arguments:
   -h, --help show this help message and exit
   -d DATASET, --dataset DATASET
-                        Dataset file name, ex.: filename.pckl.gzip
+                        Dataset file name, ex.: filename.pkl.gz
   -f, --full Compute active set on full (linearized) design matrix
   -b BATCH_SIZE, --batch_size BATCH_SIZE
                         Batch size (number of structures) considered simultaneously.If not provided - all dataset at once is considered
@@ -98,14 +98,14 @@
 Example of usage:

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml
 ```
 that will generate **linear** active set and store it into `output_potential.asi` file.

 or

 ```
-pace_activeset -d fitting_data_info.pckl.gzip output_potential.yaml -f
+pace_activeset -d fitting_data_info.pkl.gz output_potential.yaml -f
 ```
 that will generate **full** active set (including linearized part of non-linear embedding function)
-and store it into `output_potential.asi.nonlinear` file.
+and store it into `output_potential.asi.nonlinear` file.
6 changes: 3 additions & 3 deletions examples/Cu-I/input.yaml
@@ -42,7 +42,7 @@ potential:
     NameOfCutoffFunction: cos,
   }

-
+
 ## possible keywords: ALL, UNARY, BINARY, TERNARY, QUATERNARY, QUINARY,
 ## element combinations as (Al,Al), (Al, Ni), (Al, Ni, Zn), etc...
@@ -58,8 +58,8 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  ### Option 1: pandas dataframe in pckl.gzip
-  filename: Cu_df1_A1_A2_A3_EV_elast_phon.pckl.gzip # force to read reference pickled dataframe from given file
+  ### Option 1: pandas dataframe in pkl.gz
+  filename: Cu_df1_A1_A2_A3_EV_elast_phon.pkl.gz # force to read reference pickled dataframe from given file


 #################################################################
File renamed without changes.
4 changes: 2 additions & 2 deletions examples/Cu-II/input.yaml
@@ -55,8 +55,8 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  ### Option 1: pandas dataframe in pckl.gzip
-  filename: Cu_df2_1k.pkl.gzip # force to read reference pickled dataframe from given file
+  ### Option 1: pandas dataframe in pkl.gz
+  filename: Cu_df2_1k.pkl.gz # force to read reference pickled dataframe from given file



File renamed without changes.
4 changes: 2 additions & 2 deletions examples/Ethanol/input.yaml
@@ -49,7 +49,7 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  filename: ethanol.pckl.gzip # force to read reference pickled dataframe from given file
+  filename: ethanol.pkl.gz # force to read reference pickled dataframe from given file


 #################################################################
@@ -76,6 +76,6 @@ fit:
 backend:
   evaluator: tensorpot # tensorpot backend (recommended)
   batch_size: 1000
-
+  ## frequency of detailed metric calculation and printing
   display_step: 50
4 changes: 2 additions & 2 deletions examples/HEA/input.yaml
@@ -58,8 +58,8 @@ potential:
 ## Dataset specification section
 #################################################################
 data:
-  ### Option 1: pandas dataframe in pckl.gzip
-  filename: HEA_randII_example.pckl.gzip
+  ### Option 1: pandas dataframe in pkl.gz
+  filename: HEA_randII_example.pkl.gz

 #################################################################
 ## Fit settings section