diff --git a/battdat/consistency/time.py b/battdat/consistency/time.py new file mode 100644 index 0000000..63da827 --- /dev/null +++ b/battdat/consistency/time.py @@ -0,0 +1,42 @@ +"""Check for problems across the columns which describe time""" +from dataclasses import dataclass +from datetime import datetime +from typing import List + +import numpy as np + +from .base import ConsistencyChecker +from ..data import BatteryDataset + + +@dataclass +class TestTimeVsTimeChecker(ConsistencyChecker): +    """Ensure that the test time and timestamp columns agree + +    Verify that the difference between the first and current row +    for the ``test_time`` (time elapsed since the beginning of cycling) +    and ``time`` (clock datetime) columns agree. +    """ + +    max_inconsistency: float = 0.1 +    """Maximum inconsistency between timestamp and test time (s)""" + +    def check(self, dataset: BatteryDataset) -> List[str]: +        output = [] +        for name, subset in dataset.tables.items(): +            if 'time' not in subset.columns or 'test_time' not in subset.columns: +                continue + +            # Ensure that the time elapsed per the test_time column agrees with the time elapsed per the clock time column +            test_time_normed = subset['test_time'] - subset['test_time'].min() +            timestamp_normed = subset['time'] - subset['time'].min() +            diffs = np.abs(test_time_normed - timestamp_normed) +            max_diff = diffs.max() +            if max_diff > self.max_inconsistency: +                idx_max = np.argmax(diffs) +                date_max = datetime.fromtimestamp(subset['time'].iloc[idx_max]) +                time_max = subset['test_time'].iloc[idx_max] +                output.append(f'Test times and timestamps in dataset "{name}" differ by {max_diff:.1e} seconds in row {idx_max}.' 
+ f' test_time={int(time_max)} s, time={date_max}') + + return output diff --git a/battdat/io/arbin.py b/battdat/io/arbin.py index 5c30303..38af3e4 100644 --- a/battdat/io/arbin.py +++ b/battdat/io/arbin.py @@ -21,8 +21,7 @@ def group(self, files: Union[str, List[str]], directories: List[str] = None, if file.lower().endswith('.csv'): yield file - def read_file(self, file: str, file_number: int = 0, start_cycle: int = 0, - start_time: float = 0) -> pd.DataFrame: + def read_file(self, file: str) -> pd.DataFrame: # Read the file and rename the file df = pd.read_csv(file) @@ -32,10 +31,9 @@ def read_file(self, file: str, file_number: int = 0, start_cycle: int = 0, df_out = pd.DataFrame() # Convert the column names - df_out['cycle_number'] = df['Cycle_Index'] + start_cycle - df['Cycle_Index'].min() + df_out['cycle_number'] = df['Cycle_Index'] - df['Cycle_Index'].min() df_out['cycle_number'] = df_out['cycle_number'].astype('int64') - df_out['file_number'] = file_number # df_out['cycle_number']*0 - df_out['test_time'] = np.array(df['test_time'] - df['test_time'][0] + start_time, dtype=float) + df_out['test_time'] = np.array(df['test_time'] - df['test_time'][0], dtype=float) df_out['current'] = df['Current'] # TODO (wardlt): Check this!? df_out['temperature'] = df['Temperature'] df_out['internal_resistance'] = df['Internal_Resistance'] diff --git a/battdat/io/base.py b/battdat/io/base.py index c231557..0cf2306 100644 --- a/battdat/io/base.py +++ b/battdat/io/base.py @@ -7,6 +7,7 @@ from battdat.data import BatteryDataset from battdat.schemas import BatteryMetadata +from battdat.schemas.column import ChargingState PathLike = Union[str, Path] @@ -96,20 +97,13 @@ class CycleTestReader(DatasetFileReader): Adds logic for reading cycling time series from a list of files. 
""" - def read_file(self, - file: str, - file_number: int = 0, - start_cycle: int = 0, - start_time: int = 0) -> pd.DataFrame: + def read_file(self, file: str) -> pd.DataFrame: """Generate a DataFrame containing the data in this file The dataframe will be in our standard format Args: file: Path to the file - file_number: Number of file, in case the test is spread across multiple files - start_cycle: Index to use for the first cycle, in case test is spread across multiple files - start_time: Test time to use for the start of the test, in case test is spread across multiple files Returns: Dataframe containing the battery data in a standard format @@ -127,21 +121,53 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter DataFrame containing the information from all files """ - # Initialize counters for the cycle numbers, etc., Used to determine offsets for the files read - start_cycle = 0 - start_time = 0 - # Read the data for each file # Keep track of the ending index and ending time output_dfs = [] for file_number, file in enumerate(group): - # Read the file - df_out = self.read_file(file, file_number, start_cycle, start_time) - output_dfs.append(df_out) + df_out = self.read_file(file) + df_out['file_number'] = file_number + + # Adjust the test time and cycle for subsequent files + if len(output_dfs) > 0: + last_row = output_dfs[-1].iloc[-1] + + # Determine the length of rest between last file and current + rest_between_files = 0 # Assume duplicate points if no data are available + if 'time' in last_row and 'time' in df_out: + rest_between_files = max(df_out['time'].iloc[0] - last_row['time'], 0) + + # Increment the test time such that it continues from the last file + df_out['test_time'] += last_row['test_time'] + rest_between_files + + # Ensure current is zero if the rest between files is nonzero + if rest_between_files != 0 and (last_row['current'] != 0 or df_out['current'].iloc[0] != 0): + # Assume the rest occurs a millisecond later 
+                new_last_row = output_dfs[-1].iloc[-1:].copy() +                new_last_row['test_time'] += 1e-3 +                new_last_row['current'] = 0 +                if 'time' in new_last_row: +                    new_last_row['time'] += 1e-3 +                if 'state' in new_last_row: +                    new_last_row['state'] = ChargingState.hold +                output_dfs[-1] = pd.concat([output_dfs[-1], new_last_row], ignore_index=True) + +                # Assume the rest ends a millisecond before the new cycle starts +                new_first_row = df_out.iloc[:1].copy() +                new_first_row['test_time'] -= 1e-3 +                new_first_row['current'] = 0. +                if 'time' in new_first_row: +                    new_first_row['time'] -= 1e-3 +                if 'state' in new_first_row: +                    new_first_row['state'] = ChargingState.hold +                df_out = pd.concat([new_first_row, df_out], ignore_index=True) + +            # Adjust the cycle number, if included +            # Assume the new file starts a new cycle +            if 'cycle_number' in df_out.columns and 'cycle_number' in last_row: +                df_out['cycle_number'] += 1 + int(last_row['cycle_number']) -            # Increment the start cycle and time to determine starting point of next file -            start_cycle += df_out['cycle_number'].max() - df_out['cycle_number'].min() + 1 -            start_time = df_out['test_time'].max() +            output_dfs.append(df_out) # Combine the data from all files df_out = pd.concat(output_dfs, ignore_index=True) diff --git a/battdat/io/maccor.py b/battdat/io/maccor.py index 0eff245..510080f 100644 --- a/battdat/io/maccor.py +++ b/battdat/io/maccor.py @@ -1,5 +1,6 @@ """Extractor for MACCOR""" import re +import logging import itertools from dataclasses import dataclass from datetime import datetime @@ -17,6 +18,8 @@ _test_date_re = re.compile(r'Date of Test:\s+(\d{2}/\d{2}/\d{4})') + +logger = logging.getLogger(__name__) + @dataclass class MACCORReader(CycleTestReader, DatasetFileReader): @@ -26,10 +29,16 @@ class MACCORReader(CycleTestReader, DatasetFileReader): The :meth:`group` operation will consolidate files such that all with the same prefix (i.e., everything except the numerals in the extension) are treated as part of the same experiment. 
- """ - ignore_time: bool = False - """Ignore the the time column, which can be problematic.""" + MACCOR files include both a test time relative to the start of testing + and a timestamp following the clock time. + This parser only assumes the test time to be correct because the timestamps + are nontrivial to rely upon, as they may be non-monotonic due to + changes to the computer's clock. + Test times are always monotonic. + The timestamps are generated based on the timestamp of the first row and + the change in test time. + """ def group(self, files: Union[str, List[str]], directories: List[str] = None, context: dict = None) -> Iterator[Tuple[str, ...]]: @@ -50,7 +59,7 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter # Verify the cells are ordered by test date start_dates = [] for file in group: - with open(file, 'r') as fp: + with open(file, 'r', encoding='latin1') as fp: header = fp.readline() test_date = _test_date_re.findall(header)[0] start_dates.append(datetime.strptime(test_date, '%m/%d/%Y')) @@ -62,11 +71,10 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter return super().read_dataset(group, metadata) - def read_file(self, file: PathLike, file_number: int = 0, start_cycle: int = 0, - start_time: int = 0) -> pd.DataFrame: + def read_file(self, file: PathLike) -> pd.DataFrame: # Pull the test date from the first line of the file - with open(file, 'r') as fp: + with open(file, 'r', encoding='latin1') as fp: header = fp.readline() test_date = _test_date_re.findall(header)[0] @@ -78,22 +86,23 @@ def read_file(self, file: PathLike, file_number: int = 0, start_cycle: int = 0, df_out = pd.DataFrame() # fill in new dataframe - df_out['cycle_number'] = df['Cyc#'] + start_cycle - df['Cyc#'].min() + df_out['cycle_number'] = df['Cyc#'] - df['Cyc#'].min() df_out['cycle_number'] = df_out['cycle_number'].astype('int64') - df_out['file_number'] = file_number # df_out['cycle_number']*0 - 
df_out['test_time'] = df['Test (Min)'] * 60 - df['Test (Min)'].iloc[0] * 60 + start_time + df_out['test_time'] = (df['Test (Min)'] - df['Test (Min)'].iloc[0]) * 60 df_out['state'] = df['State'] df_out['current'] = df['Amps'] df_out['current'] = np.where(df['State'] == 'D', -1 * df_out['current'], df_out['current']) + df_out['voltage'] = df['Volts'] - if not self.ignore_time: - def _parse_time(time: str) -> float: - if '/' in time: - return datetime.strptime(time, '%m/%d/%Y %H:%M:%S').timestamp() - else: - return datetime.strptime(f'{test_date} {time}', '%m/%d/%Y %H:%M:%S').timestamp() + # Parse the timestamps + def _parse_time(time: str) -> float: + if '/' in time: + return datetime.strptime(time, '%m/%d/%Y %H:%M:%S').timestamp() + else: + return datetime.strptime(f'{test_date} {time}', '%m/%d/%Y %H:%M:%S').timestamp() - df_out['time'] = df['DPt Time'].apply(_parse_time) + start_time = _parse_time(df['DPt Time'].iloc[0]) + df_out['time'] = start_time + df_out['test_time'] # 0 is rest, 1 is charge, -1 is discharge df_out.loc[df_out['state'] == 'R', 'state'] = ChargingState.hold @@ -101,7 +110,6 @@ def _parse_time(time: str) -> float: df_out.loc[df_out['state'] == 'D', 'state'] = ChargingState.discharging df_out.loc[df_out['state'].apply(lambda x: x not in {'R', 'C', 'D'}), 'state'] = ChargingState.unknown - df_out['voltage'] = df['Volts'] df_out = drop_cycles(df_out) AddSteps().enhance(df_out) AddMethod().enhance(df_out) diff --git a/dev/environment.yml b/dev/environment.yml index dfb1fa5..49a5a12 100644 --- a/dev/environment.yml +++ b/dev/environment.yml @@ -1,5 +1,5 @@ # Conda environment file -name: batdata +name: battdat channels: - defaults dependencies: diff --git a/docs/source/consistency.rst b/docs/source/consistency.rst index 8619125..914c0df 100644 --- a/docs/source/consistency.rst +++ b/docs/source/consistency.rst @@ -22,3 +22,11 @@ Current (``b.consistency.current``) :members: :undoc-members: :show-inheritance: + +Current (``b.consistency.time``) 
+------------------------------------ + +.. automodule:: battdat.consistency.time + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/user-guide/io.rst b/docs/user-guide/io.rst index 4225b31..c846928 100644 --- a/docs/user-guide/io.rst +++ b/docs/user-guide/io.rst @@ -61,8 +61,17 @@ find files: group = next(extractor.identify_files('./example-path/')) dataset = extractor.read_dataset(group) -The :ref:`type of output dataset ` is defined by the :attr:`~battdat.io.base.DatasetFileReader.output_class` attribute. -Most uses of readers do not require modifying this attribute. + +Reading Data from Multiple Files +++++++++++++++++++++++++++++++++ + +The MACCOR and Arbin readers can combine test data from multiple files into the same, contiguous dataset. +Combining is built on two key assumptions: + +1. The cells passed to ``read_dataset`` are in chronological order. +2. The battery is at rest in any period between testing files. + The dataset reader will insert rows with zero current + if the current in the first or last measurement of a file is nonzero. Writing Data ------------ diff --git a/tests/consistency/test_times.py b/tests/consistency/test_times.py new file mode 100644 index 0000000..53357ad --- /dev/null +++ b/tests/consistency/test_times.py @@ -0,0 +1,33 @@ +"""Test for inconsistencies in time columns""" +from datetime import datetime + +import numpy as np +import pandas as pd +from pytest import fixture + +from battdat.consistency.time import TestTimeVsTimeChecker +from battdat.data import BatteryDataset + + +@fixture() +def example_dataset(): + df = pd.DataFrame({ + 'voltage': [1.] * 8, + 'current': [0.] 
* 8, + 'test_time': np.arange(8, dtype=float) + }) + df['time'] = datetime.now().timestamp() + df['test_time'] + data = BatteryDataset.make_cell_dataset(raw_data=df, cycle_stats=pd.DataFrame({'cycle_number': [0]})) + data.validate() + return data + + +def test_correct_inter(example_dataset): + checker = TestTimeVsTimeChecker() + assert len(checker.check(example_dataset)) == 0 + + example_dataset.raw_data['time'].iloc[4:] += 0.2 + errors = checker.check(example_dataset) + assert len(errors) == 1 + assert '2.0e-01 seconds' in errors[0] + assert 'row 4. test_time=4 s' in errors[0] diff --git a/tests/files/maccor_example.002 b/tests/files/maccor_example.002 index 2f45b7b..baf1a9b 100644 --- a/tests/files/maccor_example.002 +++ b/tests/files/maccor_example.002 @@ -1,10 +1,10 @@ Today's Date 04/04/2016 Date of Test: 04/01/2016 Filename: C:\Data\MIMS\Backup\ARGONNE #20\SET-LN3024-104-1a.001 Procedure: ABRHV-NCM523-Form-4p1.000NCM 523, Formation Test at 0.1C; from 3.0 to 4.1V Comment/Barcode: SET-LN3024-104, Targray NCM811 [LN2086-32-4] vs. 
Li metal, 3.0 to 4.3V, Formation, C-rate= 2.4 mAh, Data collected for electrode matching (HEHV) Rec# Cyc# Step Test (Min) Step (Min) Amp-hr Watt-hr Amps Volts State ES DPt Time -1 0 1 0.0000 0.0000 0.0000000000 0.0000000000 0.0000000000 3.30678264 R 0 16:05:31 -2 0 1 0.1667 0.1667 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 16:05:41 -3 0 1 0.3333 0.3333 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 16:05:51 -4 0 1 0.5000 0.5000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 16:06:01 -5 0 1 0.6667 0.6667 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 16:06:11 -6 0 1 0.8333 0.8333 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 16:06:21 -7 0 1 1.0000 1.0000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 16:06:31 -8 0 1 1.1667 1.1667 0.0000000000 0.0000000000 0.0000000000 3.30617227 R 1 16:06:41 +1 0 1 0.0000 0.0000 0.0000000000 0.0000000000 0.0000000000 3.30678264 R 0 23:59:31 +2 0 1 0.1667 0.1667 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 23:59:41 +3 0 1 0.3333 0.3333 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 23:59:51 +4 0 1 0.5000 0.5000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 00:00:01 +5 0 1 0.6667 0.6667 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 00:00:11 +6 0 1 0.8333 0.8333 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 00:00:21 +7 0 1 1.0000 1.0000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 00:00:31 +8 0 1 1.1667 1.1667 0.0000000000 0.0000000000 0.0000000000 3.30617227 R 1 00:00:41 diff --git a/tests/files/maccor_example.charge.001 b/tests/files/maccor_example.charge.001 new file mode 100644 index 0000000..31f4e01 --- /dev/null +++ b/tests/files/maccor_example.charge.001 @@ -0,0 +1,10 @@ +Today's Date 04/04/2016 Date of Test: 03/31/2016 Filename: C:\Data\MIMS\Backup\ARGONNE #20\SET-LN3024-104-1a.001 Procedure: ABRHV-NCM523-Form-4p1.000NCM 523, Formation Test at 0.1C; from 3.0 to 4.1V Comment/Barcode: SET-LN3024-104, Targray NCM811 [LN2086-32-4] vs. 
Li metal, 3.0 to 4.3V, Formation, C-rate= 2.4 mAh, Data collected for electrode matching (HEHV) +Rec# Cyc# Step Test (Min) Step (Min) Amp-hr Watt-hr Amps Volts State ES DPt Time +1 0 1 0.0000 0.0000 0.0000000000 0.0000000000 0.1000000000 3.30678264 C 0 03/31/2016 16:05:31 +2 0 1 0.1667 0.1667 0.0000000000 0.0000000000 0.1000000000 3.30571450 C 1 03/31/2016 16:05:41 +3 0 1 0.3333 0.3333 0.0000000000 0.0000000000 0.1000000000 3.30571450 C 1 03/31/2016 16:05:51 +4 0 1 0.5000 0.5000 0.0000000000 0.0000000000 0.1000000000 3.30586709 C 1 03/31/2016 16:06:01 +5 0 1 0.6667 0.6667 0.0000000000 0.0000000000 0.1000000000 3.30601968 C 1 03/31/2016 16:06:11 +6 0 1 0.8333 0.8333 0.0000000000 0.0000000000 0.1000000000 3.30601968 C 1 03/31/2016 16:06:21 +7 0 1 1.0000 1.0000 0.0000000000 0.0000000000 0.1000000000 3.30586709 C 1 03/31/2016 16:06:31 +8 0 1 1.1667 1.1667 0.0000000000 0.0000000000 0.1000000000 3.30617227 C 1 03/31/2016 16:06:41 diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index 5ca412c..3dd1f9f 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -1,7 +1,10 @@ """Tests related to the MACCOR parser""" from datetime import datetime + +import numpy as np from pytest import fixture, raises +from battdat.consistency.time import TestTimeVsTimeChecker from battdat.io.maccor import MACCORReader @@ -33,7 +36,31 @@ def test_grouping(extractor, tmp_path): assert (str(tmp_path / 'testB.001'),) in groups +def test_test_time_multifile(extractor, test_file): + """Ensure we get the time between starting files correctly""" + files = [test_file, test_file.with_suffix('.002')] + data = extractor.read_dataset(files) + data.validate() + + assert len(TestTimeVsTimeChecker().check(data)) == 0 # That the test times and date columns are correct + assert data.raw_data['test_time'].max() > 86400 + assert data.raw_data['cycle_number'].max() == 1 + + +def test_add_zero_current(extractor, test_file): + """Ensure that we add a zero-current row between files""" + 
data = extractor.read_dataset([test_file.with_suffix('.charge.001')]) + orig_len = len(data.raw_data) + assert data.raw_data['current'].iloc[-1] != 0 + + # Append a second test file, ensure nonzero current + data = extractor.read_dataset([test_file.with_suffix('.charge.001'), test_file.with_suffix('.002')]) + assert np.allclose(data.raw_data['current'].iloc[orig_len:orig_len + 2], 0) + assert np.allclose(data.raw_data['file_number'].iloc[orig_len:orig_len + 2], [0, 1]) + + def test_date_check(extractor, test_file): + """Test detecting out-of-order files""" files = [test_file, test_file.with_suffix('.002')] data = extractor.read_dataset(files) data.validate() @@ -51,8 +78,5 @@ def test_time_parser(extractor, test_file): # With only the time in the time column df = extractor.read_file(test_file.with_suffix('.002')) assert datetime.fromtimestamp(df['time'].iloc[0]).month == 4 - - # Ignoring datetime - extractor.ignore_time = True - df = extractor.read_file(test_file) - assert 'time' not in df.columns + assert datetime.fromtimestamp(df['time'].iloc[0]).day == 1 + assert datetime.fromtimestamp(df['time'].iloc[-1]).day == 2