From 7e3bfdf1de39fde4215cdd85155990738ee92ed1 Mon Sep 17 00:00:00 2001 From: lward Date: Tue, 27 May 2025 15:19:04 -0400 Subject: [PATCH 01/10] Add check for time consistency --- battdat/consistency/time.py | 40 +++++++++++++++++++++++++++++++++ docs/source/consistency.rst | 8 +++++++ tests/consistency/test_times.py | 33 +++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100644 battdat/consistency/time.py create mode 100644 tests/consistency/test_times.py diff --git a/battdat/consistency/time.py b/battdat/consistency/time.py new file mode 100644 index 0000000..3bc89b5 --- /dev/null +++ b/battdat/consistency/time.py @@ -0,0 +1,40 @@ +"""Check for problems across the columns which describe time""" +from dataclasses import dataclass +from datetime import datetime +from typing import List + +import numpy as np + +from .base import ConsistencyChecker +from ..data import BatteryDataset + + +@dataclass +class TestTimeVsTimeChecker(ConsistencyChecker): + """Ensure that the test time and timestamp columns agree + + Verify that the difference between the test_time + """ + + max_inconsistency: float = 0.1 + """Maximum inconsistency between timestamp and test time (s)""" + + def check(self, dataset: BatteryDataset) -> List[str]: + output = [] + for name, subset in dataset.tables.items(): + if 'time' not in subset.columns or 'test_time' not in subset.columns: + continue + + # Ensure that + test_time_normed = subset['test_time'] - subset['test_time'].min() + timestamp_normed = subset['time'] - subset['time'].min() + diffs = np.abs(test_time_normed - timestamp_normed) + max_diff = diffs.max() + if max_diff > self.max_inconsistency: + idx_max = np.argmax(diffs) + date_max = datetime.fromtimestamp(subset['time'].iloc[idx_max]) + time_max = subset['test_time'].iloc[idx_max] + output.append(f'Test times and timestep in dataset "{name}" differ by {max_diff:.1e} seconds in row {idx_max}.' 
+ f' test_time={int(time_max)} s, time={date_max}') + + return output diff --git a/docs/source/consistency.rst b/docs/source/consistency.rst index 8619125..914c0df 100644 --- a/docs/source/consistency.rst +++ b/docs/source/consistency.rst @@ -22,3 +22,11 @@ Current (``b.consistency.current``) :members: :undoc-members: :show-inheritance: + +Time (``b.consistency.time``) +------------------------------------ + +.. automodule:: battdat.consistency.time + :members: + :undoc-members: + :show-inheritance: diff --git a/tests/consistency/test_times.py b/tests/consistency/test_times.py new file mode 100644 index 0000000..7ee005c --- /dev/null +++ b/tests/consistency/test_times.py @@ -0,0 +1,33 @@ +"""Test for inconsistencies in time columns""" +from datetime import datetime + +import numpy as np +import pandas as pd +from pytest import fixture + +from battdat.consistency.time import TestTimeVsTimeChecker +from battdat.data import BatteryDataset + + +@fixture() +def example_dataset(): + df = pd.DataFrame({ + 'voltage': [1.] * 8, + 'current': [0.] * 8, + 'test_time': np.arange(8, dtype=float) + }) + df['time'] = datetime.now().timestamp() + df['test_time'] + data = BatteryDataset.make_cell_dataset(raw_data=df) + data.validate() + return data + + +def test_correct_inter(example_dataset): + checker = TestTimeVsTimeChecker() + assert len(checker.check(example_dataset)) == 0 + + example_dataset.raw_data['time'].iloc[4:] += 0.2 + errors = checker.check(example_dataset) + assert len(errors) == 1 + assert '2.0e-01 seconds' in errors[0] + assert 'row 4. 
test_time=4 s' in errors[0] From 3895869d0abae8413af09646d1d8efcda08d7cba Mon Sep 17 00:00:00 2001 From: lward Date: Tue, 27 May 2025 15:24:19 -0400 Subject: [PATCH 02/10] Test skipping irrelevant tables --- tests/consistency/test_times.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/consistency/test_times.py b/tests/consistency/test_times.py index 7ee005c..53357ad 100644 --- a/tests/consistency/test_times.py +++ b/tests/consistency/test_times.py @@ -17,7 +17,7 @@ def example_dataset(): 'test_time': np.arange(8, dtype=float) }) df['time'] = datetime.now().timestamp() + df['test_time'] - data = BatteryDataset.make_cell_dataset(raw_data=df) + data = BatteryDataset.make_cell_dataset(raw_data=df, cycle_stats=pd.DataFrame({'cycle_number': [0]})) data.validate() return data From f503f9ec2bcc9776e1dee31d8f2583810e24d52d Mon Sep 17 00:00:00 2001 From: lward Date: Tue, 27 May 2025 16:26:36 -0400 Subject: [PATCH 03/10] Add tool for correcting the offsets --- battdat/io/maccor.py | 62 ++++++++++++++++++++++++++++------ tests/files/maccor_example.002 | 16 ++++----- tests/io/test_maccor.py | 29 ++++++++++++---- 3 files changed, 83 insertions(+), 24 deletions(-) diff --git a/battdat/io/maccor.py b/battdat/io/maccor.py index 0eff245..356fe0a 100644 --- a/battdat/io/maccor.py +++ b/battdat/io/maccor.py @@ -1,5 +1,6 @@ """Extractor for MACCOR""" import re +import logging import itertools from dataclasses import dataclass from datetime import datetime @@ -17,6 +18,48 @@ _test_date_re = re.compile(r'Date of Test:\s+(\d{2}/\d{2}/\d{4})') +logger = logging.getLogger(__name__) + + +def correct_time_offsets(raw_data: pd.DataFrame, desync_tol: float = 0.01) -> int: + """Correct errors in the timestamp column that result + from the day not being listed with timestamp. + + Day rollovers are detected by desynchronization between the test time + and timestamps, which are corrected by moving the test_time forward + to meet the date time. 
+ + Will warn if the desynchronization is not a multiple of a day, + an hour (daylight savings time), or a second (leap seconds). + + Args: + raw_data: Raw data signal to be corrected + desync_tol: Tolerance of desynchronization between time columns + Returns: + Number of day rollovers that were detected + """ + + test_time = raw_data['test_time'] - raw_data['test_time'].iloc[0] + + def _get_differences(): + timestamp_diff = raw_data['time'] - raw_data['time'].iloc[0] + return timestamp_diff - test_time + + while np.abs(diffs := _get_differences()).max() > desync_tol: + # Get the amount of offset detected + first_bad_ix = np.argmax(np.abs(diffs) > desync_tol) + offset = diffs[first_bad_ix].item() + + # Check if it's consistent with a date rollover, daylight savings time, or leap second + if np.isclose(offset % 86400, 0, atol=1e-1) or \ + np.isclose(np.abs(offset), [3600, 1], atol=1e-1).any(): + pass # Nothing of concern + else: + logger.warning(f'Detected an offset inconsistent with a day: {offset} s') + + # Correct the offset + raw_data['time'].iloc[first_bad_ix:] -= offset + @dataclass class MACCORReader(CycleTestReader, DatasetFileReader): @@ -28,9 +71,6 @@ class MACCORReader(CycleTestReader, DatasetFileReader): are treated as part of the same experiment. 
""" - ignore_time: bool = False - """Ignore the the time column, which can be problematic.""" - def group(self, files: Union[str, List[str]], directories: List[str] = None, context: dict = None) -> Iterator[Tuple[str, ...]]: if isinstance(files, str): @@ -86,14 +126,16 @@ def read_file(self, file: PathLike, file_number: int = 0, start_cycle: int = 0, df_out['current'] = df['Amps'] df_out['current'] = np.where(df['State'] == 'D', -1 * df_out['current'], df_out['current']) - if not self.ignore_time: - def _parse_time(time: str) -> float: - if '/' in time: - return datetime.strptime(time, '%m/%d/%Y %H:%M:%S').timestamp() - else: - return datetime.strptime(f'{test_date} {time}', '%m/%d/%Y %H:%M:%S').timestamp() + # Parse the timestamps + def _parse_time(time: str) -> float: + if '/' in time: + return datetime.strptime(time, '%m/%d/%Y %H:%M:%S').timestamp() + else: + return datetime.strptime(f'{test_date} {time}', '%m/%d/%Y %H:%M:%S').timestamp() + + df_out['time'] = df['DPt Time'].apply(_parse_time) - df_out['time'] = df['DPt Time'].apply(_parse_time) + correct_time_offsets(df_out) # 0 is rest, 1 is charge, -1 is discharge df_out.loc[df_out['state'] == 'R', 'state'] = ChargingState.hold diff --git a/tests/files/maccor_example.002 b/tests/files/maccor_example.002 index 2f45b7b..baf1a9b 100644 --- a/tests/files/maccor_example.002 +++ b/tests/files/maccor_example.002 @@ -1,10 +1,10 @@ Today's Date 04/04/2016 Date of Test: 04/01/2016 Filename: C:\Data\MIMS\Backup\ARGONNE #20\SET-LN3024-104-1a.001 Procedure: ABRHV-NCM523-Form-4p1.000NCM 523, Formation Test at 0.1C; from 3.0 to 4.1V Comment/Barcode: SET-LN3024-104, Targray NCM811 [LN2086-32-4] vs. 
Li metal, 3.0 to 4.3V, Formation, C-rate= 2.4 mAh, Data collected for electrode matching (HEHV) Rec# Cyc# Step Test (Min) Step (Min) Amp-hr Watt-hr Amps Volts State ES DPt Time -1 0 1 0.0000 0.0000 0.0000000000 0.0000000000 0.0000000000 3.30678264 R 0 16:05:31 -2 0 1 0.1667 0.1667 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 16:05:41 -3 0 1 0.3333 0.3333 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 16:05:51 -4 0 1 0.5000 0.5000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 16:06:01 -5 0 1 0.6667 0.6667 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 16:06:11 -6 0 1 0.8333 0.8333 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 16:06:21 -7 0 1 1.0000 1.0000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 16:06:31 -8 0 1 1.1667 1.1667 0.0000000000 0.0000000000 0.0000000000 3.30617227 R 1 16:06:41 +1 0 1 0.0000 0.0000 0.0000000000 0.0000000000 0.0000000000 3.30678264 R 0 23:59:31 +2 0 1 0.1667 0.1667 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 23:59:41 +3 0 1 0.3333 0.3333 0.0000000000 0.0000000000 0.0000000000 3.30571450 R 1 23:59:51 +4 0 1 0.5000 0.5000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 00:00:01 +5 0 1 0.6667 0.6667 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 00:00:11 +6 0 1 0.8333 0.8333 0.0000000000 0.0000000000 0.0000000000 3.30601968 R 1 00:00:21 +7 0 1 1.0000 1.0000 0.0000000000 0.0000000000 0.0000000000 3.30586709 R 1 00:00:31 +8 0 1 1.1667 1.1667 0.0000000000 0.0000000000 0.0000000000 3.30617227 R 1 00:00:41 diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index 5ca412c..0e06ab3 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -1,8 +1,10 @@ """Tests related to the MACCOR parser""" +import numpy as np +import pandas as pd from datetime import datetime from pytest import fixture, raises -from battdat.io.maccor import MACCORReader +from battdat.io.maccor import MACCORReader, correct_time_offsets @fixture() @@ -21,6 +23,26 @@ def 
test_validation(extractor, test_file): data.validate_columns(allow_extra_columns=False) +def test_check_offset_correct(caplog): + df = pd.DataFrame({ + 'test_time': np.arange(3, dtype=float), + }) + + # Test the OK offsets + for off in [86400, 1, -3600]: + df['time'] = df['test_time'] + datetime.now().timestamp() + df['time'].iloc[1:] += off + correct_time_offsets(df) + assert np.allclose(df['time'] - df['time'].iloc[0], np.arange(3.)) + assert len(caplog.messages) == 0 + + # Test an offset which yields a warning + df['time'].iloc[1:] += 25 + correct_time_offsets(df) + assert len(caplog.messages) == 1 + assert '25' in caplog.messages[-1] + + def test_grouping(extractor, tmp_path): # Make a file structure with two sets of experiments and a nonsense file for f in ['README', 'testA.002', 'testA.001', 'testB.001']: @@ -51,8 +73,3 @@ def test_time_parser(extractor, test_file): # With only the time in the time column df = extractor.read_file(test_file.with_suffix('.002')) assert datetime.fromtimestamp(df['time'].iloc[0]).month == 4 - - # Ignoring datetime - extractor.ignore_time = True - df = extractor.read_file(test_file) - assert 'time' not in df.columns From 1c44ab339f2bfe45b6dfb83230fd5bccefa682a3 Mon Sep 17 00:00:00 2001 From: lward Date: Tue, 27 May 2025 17:07:51 -0400 Subject: [PATCH 04/10] Refactor logic for multi files into superclass We were repeating ourselves, and that became a problem with complex strategies for combining files --- battdat/io/arbin.py | 8 +++----- battdat/io/base.py | 39 +++++++++++++++++++++------------------ battdat/io/maccor.py | 8 +++----- dev/environment.yml | 2 +- docs/user-guide/io.rst | 3 --- tests/io/test_maccor.py | 13 +++++++++++++ 6 files changed, 41 insertions(+), 32 deletions(-) diff --git a/battdat/io/arbin.py b/battdat/io/arbin.py index 5c30303..38af3e4 100644 --- a/battdat/io/arbin.py +++ b/battdat/io/arbin.py @@ -21,8 +21,7 @@ def group(self, files: Union[str, List[str]], directories: List[str] = None, if 
file.lower().endswith('.csv'): yield file - def read_file(self, file: str, file_number: int = 0, start_cycle: int = 0, - start_time: float = 0) -> pd.DataFrame: + def read_file(self, file: str) -> pd.DataFrame: # Read the file and rename the file df = pd.read_csv(file) @@ -32,10 +31,9 @@ def read_file(self, file: str, file_number: int = 0, start_cycle: int = 0, df_out = pd.DataFrame() # Convert the column names - df_out['cycle_number'] = df['Cycle_Index'] + start_cycle - df['Cycle_Index'].min() + df_out['cycle_number'] = df['Cycle_Index'] - df['Cycle_Index'].min() df_out['cycle_number'] = df_out['cycle_number'].astype('int64') - df_out['file_number'] = file_number # df_out['cycle_number']*0 - df_out['test_time'] = np.array(df['test_time'] - df['test_time'][0] + start_time, dtype=float) + df_out['test_time'] = np.array(df['test_time'] - df['test_time'][0], dtype=float) df_out['current'] = df['Current'] # TODO (wardlt): Check this!? df_out['temperature'] = df['Temperature'] df_out['internal_resistance'] = df['Internal_Resistance'] diff --git a/battdat/io/base.py b/battdat/io/base.py index c231557..b3d0809 100644 --- a/battdat/io/base.py +++ b/battdat/io/base.py @@ -96,20 +96,13 @@ class CycleTestReader(DatasetFileReader): Adds logic for reading cycling time series from a list of files. 
""" - def read_file(self, - file: str, - file_number: int = 0, - start_cycle: int = 0, - start_time: int = 0) -> pd.DataFrame: + def read_file(self, file: str) -> pd.DataFrame: """Generate a DataFrame containing the data in this file The dataframe will be in our standard format Args: file: Path to the file - file_number: Number of file, in case the test is spread across multiple files - start_cycle: Index to use for the first cycle, in case test is spread across multiple files - start_time: Test time to use for the start of the test, in case test is spread across multiple files Returns: Dataframe containing the battery data in a standard format @@ -127,21 +120,31 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter DataFrame containing the information from all files """ - # Initialize counters for the cycle numbers, etc., Used to determine offsets for the files read - start_cycle = 0 - start_time = 0 - # Read the data for each file # Keep track of the ending index and ending time output_dfs = [] for file_number, file in enumerate(group): - # Read the file - df_out = self.read_file(file, file_number, start_cycle, start_time) - output_dfs.append(df_out) + df_out = self.read_file(file) + df_out['file_number'] = file_number + + # Adjust the test time and cycle for subsequent files + if len(output_dfs) > 0: + last_row = output_dfs[-1] + + # Determine the length of rest between last file and current + rest_between_files = 0 # Assume duplicate points if no data are available + if 'time' in last_row and 'time' in df_out: + rest_between_files = df_out['time'].iloc[0] - last_row['time'] - # Increment the start cycle and time to determine starting point of next file - start_cycle += df_out['cycle_number'].max() - df_out['cycle_number'].min() + 1 - start_time = df_out['test_time'].max() + # Increment the test time such that it continues from the last file + df_out['test_time'] += last_row['test_time'] + rest_between_files + + # Adjust the cycle 
number, if included + # Assume the new file starts a new cycle + if 'cycle_number' in df_out.columns and 'cycle_number' in last_row: + df_out['cycle_number'] += 1 + last_row['cycle_number'] + + output_dfs.append(df_out) # Combine the data from all files df_out = pd.concat(output_dfs, ignore_index=True) diff --git a/battdat/io/maccor.py b/battdat/io/maccor.py index 356fe0a..6a6c3ef 100644 --- a/battdat/io/maccor.py +++ b/battdat/io/maccor.py @@ -102,8 +102,7 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter return super().read_dataset(group, metadata) - def read_file(self, file: PathLike, file_number: int = 0, start_cycle: int = 0, - start_time: int = 0) -> pd.DataFrame: + def read_file(self, file: PathLike) -> pd.DataFrame: # Pull the test date from the first line of the file with open(file, 'r') as fp: @@ -118,10 +117,9 @@ def read_file(self, file: PathLike, file_number: int = 0, start_cycle: int = 0, df_out = pd.DataFrame() # fill in new dataframe - df_out['cycle_number'] = df['Cyc#'] + start_cycle - df['Cyc#'].min() + df_out['cycle_number'] = df['Cyc#'] - df['Cyc#'].min() df_out['cycle_number'] = df_out['cycle_number'].astype('int64') - df_out['file_number'] = file_number # df_out['cycle_number']*0 - df_out['test_time'] = df['Test (Min)'] * 60 - df['Test (Min)'].iloc[0] * 60 + start_time + df_out['test_time'] = df['Test (Min)'] * 60 - df['Test (Min)'].iloc[0] * 60 df_out['state'] = df['State'] df_out['current'] = df['Amps'] df_out['current'] = np.where(df['State'] == 'D', -1 * df_out['current'], df_out['current']) diff --git a/dev/environment.yml b/dev/environment.yml index dfb1fa5..49a5a12 100644 --- a/dev/environment.yml +++ b/dev/environment.yml @@ -1,5 +1,5 @@ # Conda environment file -name: batdata +name: battdat channels: - defaults dependencies: diff --git a/docs/user-guide/io.rst b/docs/user-guide/io.rst index 4225b31..448a6e5 100644 --- a/docs/user-guide/io.rst +++ b/docs/user-guide/io.rst @@ -61,9 +61,6 @@ find files: 
group = next(extractor.identify_files('./example-path/')) dataset = extractor.read_dataset(group) -The :ref:`type of output dataset ` is defined by the :attr:`~battdat.io.base.DatasetFileReader.output_class` attribute. -Most uses of readers do not require modifying this attribute. - Writing Data ------------ diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index 0e06ab3..6abc26d 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -4,6 +4,7 @@ from datetime import datetime from pytest import fixture, raises +from battdat.consistency.time import TestTimeVsTimeChecker from battdat.io.maccor import MACCORReader, correct_time_offsets @@ -55,7 +56,17 @@ def test_grouping(extractor, tmp_path): assert (str(tmp_path / 'testB.001'),) in groups +def test_test_time_multifile(extractor, test_file): + """Ensure we get the time between starting files correctly""" + files = [test_file, test_file.with_suffix('.002')] + data = extractor.read_dataset(files) + data.validate() + + assert len(TestTimeVsTimeChecker().check(data)) == 0 # That the test times and date columns are correct + + def test_date_check(extractor, test_file): + """Test detecting out-of-order files""" files = [test_file, test_file.with_suffix('.002')] data = extractor.read_dataset(files) data.validate() @@ -73,3 +84,5 @@ def test_time_parser(extractor, test_file): # With only the time in the time column df = extractor.read_file(test_file.with_suffix('.002')) assert datetime.fromtimestamp(df['time'].iloc[0]).month == 4 + assert datetime.fromtimestamp(df['time'].iloc[0]).day == 1 + assert datetime.fromtimestamp(df['time'].iloc[-1]).day == 2 From 971c6f1c9e1679faa23d038e3dd32f578052e785 Mon Sep 17 00:00:00 2001 From: lward Date: Wed, 28 May 2025 10:41:56 -0400 Subject: [PATCH 05/10] Stop relying on the timestamps in a MACCOR file Assume that the first one is correct, infer the remainder --- battdat/io/base.py | 6 ++--- battdat/io/maccor.py | 58 +++++++++-------------------------------- 
tests/io/test_maccor.py | 26 +++--------------- 3 files changed, 19 insertions(+), 71 deletions(-) diff --git a/battdat/io/base.py b/battdat/io/base.py index b3d0809..be38e1a 100644 --- a/battdat/io/base.py +++ b/battdat/io/base.py @@ -129,12 +129,12 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter # Adjust the test time and cycle for subsequent files if len(output_dfs) > 0: - last_row = output_dfs[-1] + last_row = output_dfs[-1].iloc[-1] # Determine the length of rest between last file and current rest_between_files = 0 # Assume duplicate points if no data are available if 'time' in last_row and 'time' in df_out: - rest_between_files = df_out['time'].iloc[0] - last_row['time'] + rest_between_files = max(df_out['time'].iloc[0] - last_row['time'], 0) # Increment the test time such that it continues from the last file df_out['test_time'] += last_row['test_time'] + rest_between_files @@ -142,7 +142,7 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter # Adjust the cycle number, if included # Assume the new file starts a new cycle if 'cycle_number' in df_out.columns and 'cycle_number' in last_row: - df_out['cycle_number'] += 1 + last_row['cycle_number'] + df_out['cycle_number'] += 1 + int(last_row['cycle_number']) output_dfs.append(df_out) diff --git a/battdat/io/maccor.py b/battdat/io/maccor.py index 6a6c3ef..4aab905 100644 --- a/battdat/io/maccor.py +++ b/battdat/io/maccor.py @@ -21,46 +21,6 @@ logger = logging.getLogger(__name__) -def correct_time_offsets(raw_data: pd.DataFrame, desync_tol: float = 0.01) -> int: - """Correct errors in the timestamp column that result - from the day not being listed with timestamp. - - Day rollovers are detected by desynchronization between the test time - and timestamps, which are corrected by moving the test_time forward - to meet the date time. 
- - Will warn if the desynchronization is not a multiple of a day, - an hour (daylight savings time), or a second (leap seconds). - - Args: - raw_data: Raw data signal to be corrected - desync_tol: Tolerance of desynchronization between time columns - Returns: - Number of day rollovers that were detected - """ - - test_time = raw_data['test_time'] - raw_data['test_time'].iloc[0] - - def _get_differences(): - timestamp_diff = raw_data['time'] - raw_data['time'].iloc[0] - return timestamp_diff - test_time - - while np.abs(diffs := _get_differences()).max() > desync_tol: - # Get the amount of offset detected - first_bad_ix = np.argmax(np.abs(diffs) > desync_tol) - offset = diffs[first_bad_ix].item() - - # Check if it's consistent with a date rollover, daylight savings time, or leap second - if np.isclose(offset % 86400, 0, atol=1e-1) or \ - np.isclose(np.abs(offset), [3600, 1], atol=1e-1).any(): - pass # Nothing of concern - else: - logger.warning(f'Detected an offset inconsistent with a day: {offset} s') - - # Correct the offset - raw_data['time'].iloc[first_bad_ix:] -= offset - - @dataclass class MACCORReader(CycleTestReader, DatasetFileReader): """Parser for reading from MACCOR-format files @@ -69,6 +29,15 @@ class MACCORReader(CycleTestReader, DatasetFileReader): The :meth:`group` operation will consolidate files such that all with the same prefix (i.e., everything except the numerals in the extension) are treated as part of the same experiment. + + MACCOR files include both a test time relative to the start of testing + and a timestamp following the clock time. + This parser only assumes the test time to be correct because the timestamps + are nontrivial to rely upon, as they may be non-monotonic due to + changes to the computer's clock. + Test times are always monotonic. + The timestamps are generated based on the timestamp of the first row and + the change in test time. 
""" def group(self, files: Union[str, List[str]], directories: List[str] = None, @@ -119,10 +88,11 @@ def read_file(self, file: PathLike) -> pd.DataFrame: # fill in new dataframe df_out['cycle_number'] = df['Cyc#'] - df['Cyc#'].min() df_out['cycle_number'] = df_out['cycle_number'].astype('int64') - df_out['test_time'] = df['Test (Min)'] * 60 - df['Test (Min)'].iloc[0] * 60 + df_out['test_time'] = (df['Test (Min)'] - df['Test (Min)'].iloc[0]) * 60 df_out['state'] = df['State'] df_out['current'] = df['Amps'] df_out['current'] = np.where(df['State'] == 'D', -1 * df_out['current'], df_out['current']) + df_out['voltage'] = df['Volts'] # Parse the timestamps def _parse_time(time: str) -> float: @@ -131,9 +101,8 @@ def _parse_time(time: str) -> float: else: return datetime.strptime(f'{test_date} {time}', '%m/%d/%Y %H:%M:%S').timestamp() - df_out['time'] = df['DPt Time'].apply(_parse_time) - - correct_time_offsets(df_out) + start_time = _parse_time(df['DPt Time'].iloc[0]) + df_out['time'] = start_time + df_out['test_time'] # 0 is rest, 1 is charge, -1 is discharge df_out.loc[df_out['state'] == 'R', 'state'] = ChargingState.hold @@ -141,7 +110,6 @@ def _parse_time(time: str) -> float: df_out.loc[df_out['state'] == 'D', 'state'] = ChargingState.discharging df_out.loc[df_out['state'].apply(lambda x: x not in {'R', 'C', 'D'}), 'state'] = ChargingState.unknown - df_out['voltage'] = df['Volts'] df_out = drop_cycles(df_out) AddSteps().enhance(df_out) AddMethod().enhance(df_out) diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index 6abc26d..5aa41c5 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -1,11 +1,9 @@ """Tests related to the MACCOR parser""" -import numpy as np -import pandas as pd from datetime import datetime from pytest import fixture, raises from battdat.consistency.time import TestTimeVsTimeChecker -from battdat.io.maccor import MACCORReader, correct_time_offsets +from battdat.io.maccor import MACCORReader @fixture() @@ -24,26 
+22,6 @@ def test_validation(extractor, test_file): data.validate_columns(allow_extra_columns=False) -def test_check_offset_correct(caplog): - df = pd.DataFrame({ - 'test_time': np.arange(3, dtype=float), - }) - - # Test the OK offsets - for off in [86400, 1, -3600]: - df['time'] = df['test_time'] + datetime.now().timestamp() - df['time'].iloc[1:] += off - correct_time_offsets(df) - assert np.allclose(df['time'] - df['time'].iloc[0], np.arange(3.)) - assert len(caplog.messages) == 0 - - # Test an offset which yields a warning - df['time'].iloc[1:] += 25 - correct_time_offsets(df) - assert len(caplog.messages) == 1 - assert '25' in caplog.messages[-1] - - def test_grouping(extractor, tmp_path): # Make a file structure with two sets of experiments and a nonsense file for f in ['README', 'testA.002', 'testA.001', 'testB.001']: @@ -63,6 +41,8 @@ def test_test_time_multifile(extractor, test_file): data.validate() assert len(TestTimeVsTimeChecker().check(data)) == 0 # That the test times and date columns are correct + assert data.raw_data['test_time'].max() > 86400 + assert data.raw_data['cycle_number'].max() == 1 def test_date_check(extractor, test_file): From 762dbae0ccaa926006a6d82f5f146fb4ab2e22a5 Mon Sep 17 00:00:00 2001 From: lward Date: Wed, 28 May 2025 13:53:55 -0400 Subject: [PATCH 06/10] Clarify the documentation --- battdat/consistency/time.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/battdat/consistency/time.py b/battdat/consistency/time.py index 3bc89b5..63da827 100644 --- a/battdat/consistency/time.py +++ b/battdat/consistency/time.py @@ -13,7 +13,9 @@ class TestTimeVsTimeChecker(ConsistencyChecker): """Ensure that the test time and timestamp columns agree - Verify that the difference between the test_time + Verify that the difference between the first and current row + for the ``test_time`` (time elapsed since the beginning of cycling) + and ``time`` (clock datetime) columns agree. 
""" max_inconsistency: float = 0.1 From c5bd1ea5cd008149c3262f74408936bd37e9711a Mon Sep 17 00:00:00 2001 From: Logan Ward Date: Mon, 2 Jun 2025 08:48:10 -0400 Subject: [PATCH 07/10] Add a rest at the end of a file --- battdat/io/base.py | 9 +++++++++ tests/files/maccor_example.charge.001 | 10 ++++++++++ tests/io/test_maccor.py | 12 ++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 tests/files/maccor_example.charge.001 diff --git a/battdat/io/base.py b/battdat/io/base.py index be38e1a..c05dd9f 100644 --- a/battdat/io/base.py +++ b/battdat/io/base.py @@ -139,6 +139,15 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter # Increment the test time such that it continues from the last file df_out['test_time'] += last_row['test_time'] + rest_between_files + # Ensure current is zero if the rest between files is nonzero + if rest_between_files != 0 and last_row['current'] != 0: + new_last_row = output_dfs[-1].iloc[-1:].copy() + new_last_row['test_time'] += 1e-3 # Assume the rest occurs a millisecond later + new_last_row['current'] = 0 + if 'time' in new_last_row: + new_last_row['time'] += 1e-3 + output_dfs[-1] = pd.concat([output_dfs[-1], new_last_row], ignore_index=True) + # Adjust the cycle number, if included # Assume the new file starts a new cycle if 'cycle_number' in df_out.columns and 'cycle_number' in last_row: diff --git a/tests/files/maccor_example.charge.001 b/tests/files/maccor_example.charge.001 new file mode 100644 index 0000000..31f4e01 --- /dev/null +++ b/tests/files/maccor_example.charge.001 @@ -0,0 +1,10 @@ +Today's Date 04/04/2016 Date of Test: 03/31/2016 Filename: C:\Data\MIMS\Backup\ARGONNE #20\SET-LN3024-104-1a.001 Procedure: ABRHV-NCM523-Form-4p1.000NCM 523, Formation Test at 0.1C; from 3.0 to 4.1V Comment/Barcode: SET-LN3024-104, Targray NCM811 [LN2086-32-4] vs. 
Li metal, 3.0 to 4.3V, Formation, C-rate= 2.4 mAh, Data collected for electrode matching (HEHV) +Rec# Cyc# Step Test (Min) Step (Min) Amp-hr Watt-hr Amps Volts State ES DPt Time +1 0 1 0.0000 0.0000 0.0000000000 0.0000000000 0.1000000000 3.30678264 C 0 03/31/2016 16:05:31 +2 0 1 0.1667 0.1667 0.0000000000 0.0000000000 0.1000000000 3.30571450 C 1 03/31/2016 16:05:41 +3 0 1 0.3333 0.3333 0.0000000000 0.0000000000 0.1000000000 3.30571450 C 1 03/31/2016 16:05:51 +4 0 1 0.5000 0.5000 0.0000000000 0.0000000000 0.1000000000 3.30586709 C 1 03/31/2016 16:06:01 +5 0 1 0.6667 0.6667 0.0000000000 0.0000000000 0.1000000000 3.30601968 C 1 03/31/2016 16:06:11 +6 0 1 0.8333 0.8333 0.0000000000 0.0000000000 0.1000000000 3.30601968 C 1 03/31/2016 16:06:21 +7 0 1 1.0000 1.0000 0.0000000000 0.0000000000 0.1000000000 3.30586709 C 1 03/31/2016 16:06:31 +8 0 1 1.1667 1.1667 0.0000000000 0.0000000000 0.1000000000 3.30617227 C 1 03/31/2016 16:06:41 diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index 5aa41c5..976ca6f 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -45,6 +45,18 @@ def test_test_time_multifile(extractor, test_file): assert data.raw_data['cycle_number'].max() == 1 +def test_add_zero_current(extractor, test_file): + """Ensure that we add a zero-current row between files""" + data = extractor.read_dataset([test_file.with_suffix('.charge.001')]) + orig_len = len(data.raw_data) + assert data.raw_data['current'].iloc[-1] != 0 + + # Append a second test file, ensure nonzero current + data = extractor.read_dataset([test_file.with_suffix('.charge.001'), test_file.with_suffix('.002')]) + assert data.raw_data['current'].iloc[orig_len] == 0 + + + def test_date_check(extractor, test_file): """Test detecting out-of-order files""" files = [test_file, test_file.with_suffix('.002')] From 7f76f66e95245bd9c47dea105d2f3456813d4c7a Mon Sep 17 00:00:00 2001 From: Logan Ward Date: Mon, 2 Jun 2025 09:10:02 -0400 Subject: [PATCH 08/10] Set the character encoding 
to latin-1 It's probably https://en.wikipedia.org/wiki/Windows-1252, but latin-1 should work --- battdat/io/maccor.py | 4 ++-- tests/io/test_maccor.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/battdat/io/maccor.py b/battdat/io/maccor.py index 4aab905..510080f 100644 --- a/battdat/io/maccor.py +++ b/battdat/io/maccor.py @@ -59,7 +59,7 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter # Verify the cells are ordered by test date start_dates = [] for file in group: - with open(file, 'r') as fp: + with open(file, 'r', encoding='latin1') as fp: header = fp.readline() test_date = _test_date_re.findall(header)[0] start_dates.append(datetime.strptime(test_date, '%m/%d/%Y')) @@ -74,7 +74,7 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter def read_file(self, file: PathLike) -> pd.DataFrame: # Pull the test date from the first line of the file - with open(file, 'r') as fp: + with open(file, 'r', encoding='latin1') as fp: header = fp.readline() test_date = _test_date_re.findall(header)[0] diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index 976ca6f..cc095dc 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -56,7 +56,6 @@ def test_add_zero_current(extractor, test_file): assert data.raw_data['current'].iloc[orig_len] == 0 - def test_date_check(extractor, test_file): """Test detecting out-of-order files""" files = [test_file, test_file.with_suffix('.002')] From a1b7a22063d2cd29e07427285f90f8ca863468ee Mon Sep 17 00:00:00 2001 From: Logan Ward Date: Mon, 2 Jun 2025 11:14:54 -0400 Subject: [PATCH 09/10] Add a rest to the beginning of next cycle too --- battdat/io/base.py | 16 +++++++++++++++- tests/io/test_maccor.py | 5 ++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/battdat/io/base.py b/battdat/io/base.py index c05dd9f..ea7d949 100644 --- a/battdat/io/base.py +++ b/battdat/io/base.py @@ -7,6 +7,7 @@ from battdat.data import 
BatteryDataset from battdat.schemas import BatteryMetadata +from battdat.schemas.column import ChargingState PathLike = Union[str, Path] @@ -141,13 +142,26 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter # Ensure current is zero if the rest between files is nonzero if rest_between_files != 0 and last_row['current'] != 0: + # Assume the rest occurs a millisecond later new_last_row = output_dfs[-1].iloc[-1:].copy() - new_last_row['test_time'] += 1e-3 # Assume the rest occurs a millisecond later + new_last_row['test_time'] += 1e-3 new_last_row['current'] = 0 if 'time' in new_last_row: new_last_row['time'] += 1e-3 + if 'state' in new_last_row: + new_last_row['state'] = ChargingState.hold output_dfs[-1] = pd.concat([output_dfs[-1], new_last_row], ignore_index=True) + # Assume the rest ends a millisecond before the new cycle starts + new_first_row = df_out.iloc[:1].copy() + new_first_row['test_time'] -= 1e-3 + new_first_row['current'] = 0. + if 'time' in new_first_row: + new_first_row['time'] -= 1e-3 + if 'state' in new_first_row: + new_first_row['state'] = ChargingState.hold + df_out = pd.concat([new_first_row, df_out], ignore_index=True) + # Adjust the cycle number, if included # Assume the new file starts a new cycle if 'cycle_number' in df_out.columns and 'cycle_number' in last_row: diff --git a/tests/io/test_maccor.py b/tests/io/test_maccor.py index cc095dc..3dd1f9f 100644 --- a/tests/io/test_maccor.py +++ b/tests/io/test_maccor.py @@ -1,5 +1,7 @@ """Tests related to the MACCOR parser""" from datetime import datetime + +import numpy as np from pytest import fixture, raises from battdat.consistency.time import TestTimeVsTimeChecker @@ -53,7 +55,8 @@ def test_add_zero_current(extractor, test_file): # Append a second test file, ensure nonzero current data = extractor.read_dataset([test_file.with_suffix('.charge.001'), test_file.with_suffix('.002')]) - assert data.raw_data['current'].iloc[orig_len] == 0 + assert
np.allclose(data.raw_data['current'].iloc[orig_len:orig_len + 2], 0) + assert np.allclose(data.raw_data['file_number'].iloc[orig_len:orig_len + 2], [0, 1]) def test_date_check(extractor, test_file): From 6cb1bd287334236c35ebcce2e649d6ed14d3a682 Mon Sep 17 00:00:00 2001 From: Logan Ward Date: Mon, 2 Jun 2025 11:23:17 -0400 Subject: [PATCH 10/10] Document the inserted rows --- battdat/io/base.py | 2 +- docs/user-guide/io.rst | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/battdat/io/base.py b/battdat/io/base.py index ea7d949..0cf2306 100644 --- a/battdat/io/base.py +++ b/battdat/io/base.py @@ -141,7 +141,7 @@ def read_dataset(self, group: Sequence[PathLike] = (), metadata: Optional[Batter df_out['test_time'] += last_row['test_time'] + rest_between_files # Ensure current is zero if the rest between files is nonzero - if rest_between_files != 0 and last_row['current'] != 0: + if rest_between_files != 0 and (last_row['current'] != 0 or df_out['current'].iloc[0] != 0): # Assume the rest occurs a millisecond later new_last_row = output_dfs[-1].iloc[-1:].copy() new_last_row['test_time'] += 1e-3 diff --git a/docs/user-guide/io.rst b/docs/user-guide/io.rst index 448a6e5..c846928 100644 --- a/docs/user-guide/io.rst +++ b/docs/user-guide/io.rst @@ -61,6 +61,18 @@ find files: group = next(extractor.identify_files('./example-path/')) dataset = extractor.read_dataset(group) + +Reading Data from Multiple Files +++++++++++++++++++++++++++++++++ + +The MACCOR and Arbin readers can combine test data from multiple files into the same, contiguous dataset. +Combining is built on two key assumptions: + +1. The files passed to ``read_dataset`` are in chronological order. +2. The battery is at rest in any period between testing files. + The dataset reader will insert rows with zero current + if the current in the first or last measurement of a file is nonzero. + Writing Data ------------