From fe7fbe459648a57d9f44bbed3437d90246411885 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Fri, 15 Jul 2022 11:57:41 +0100 Subject: [PATCH 1/9] add the concept of a rerun key to stages --- ceci/stage.py | 17 ++++++++++++----- tests/test_stage.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/ceci/stage.py b/ceci/stage.py index 76215b3..3a1ccde 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -83,6 +83,9 @@ def __init__(self, args, comm=None): comm: MPI communicator (default is None) An MPI comm object to use in preference to COMM_WORLD """ + if not isinstance(args, dict): + args = vars(args) + self._configs = StageConfig(**self.config_options) self._inputs = None self._outputs = None @@ -92,7 +95,7 @@ def __init__(self, args, comm=None): self._rank = 0 self._io_checked = False self.dask_client = None - + self._rerun_key = args.get('rerun_key', 0) self.load_configs(args) if comm is not None: self.setup_mpi(comm) @@ -148,12 +151,9 @@ def load_configs(self, args): Parameters ---------- - args: dict or namespace + args: dict Specification of input and output paths and any missing config options """ - if not isinstance(args, dict): - args = vars(args) - # We alwys assume the config arg exists, whether it is in input_tags or not if "config" not in args: # pragma: no cover raise ValueError("The argument --config was missing on the command line.") @@ -534,6 +534,13 @@ def parse_command_line(cls, cmd=None): help="Report memory use. Argument gives interval in seconds between reports", ) + parser.add_argument( + "--rerun-key", + type=int, + default=0, + help="A key to use when re-running an interrupted run. 
Subclasses can use this as they wish.", + ) + if cmd is None: ret_args = parser.parse_args() else: diff --git a/tests/test_stage.py b/tests/test_stage.py index 2201e84..195ee83 100644 --- a/tests/test_stage.py +++ b/tests/test_stage.py @@ -489,6 +489,36 @@ def test_map(): mockmpi.mock_mpiexec(3, core_test_map) +def test_key(): + class Lima(PipelineStage): + name = f"Lima" + inputs = [] + outputs = [] + config_options = {} + + def run(self): + pass + + # First check interactive construction + + # default rerun key should be zero + ll1 = Lima.make_stage() + assert ll1._rerun_key == 0 + + # if we set it explicitly it should work like this + ll2 = Lima.make_stage(rerun_key=78910) + assert ll2._rerun_key == 78910 + + # Now test command-line construction + + # again test both default value and set value + cmd = ["Lima", "--config", "tests/config.yml"] + ll3 = Lima(Lima.parse_command_line(cmd)) + assert ll3._rerun_key == 0 + + ll4 = Lima(Lima.parse_command_line(cmd + ["--rerun-key", "54321"])) + assert ll4._rerun_key == 54321 + def test_unknown_stage(): with pytest.raises(StageNotFound): From 0682a3805cbfb0d87f2cfa7bf0652ecfd68ae4ee Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 13:14:59 +0100 Subject: [PATCH 2/9] add use of desc_provenance library --- ceci/stage.py | 43 +++++++++++++++++++++++++++++++++++++++++-- tests/test_stage.py | 20 ++++++++++---------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/ceci/stage.py b/ceci/stage.py index 3a1ccde..2a05481 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -8,6 +8,7 @@ import cProfile import pdb import datetime +import desc_provenance from abc import abstractmethod from . 
    @property
    def provenance(self):
        """Return the provenance object for this stage, generating it on first use.

        On first access this builds a ``desc_provenance.Provenance`` object from
        the stage's configuration, its input file paths (entries whose path is
        None are skipped), and the directory containing the module that defines
        this stage class.  The result is cached on the instance so repeated
        accesses — e.g. one per ``open_output`` call — reuse the same object.

        Returns
        -------
        desc_provenance.Provenance
            The cached or newly-generated provenance object.
        """
        # Return the cached object if it was already generated
        if self._provenance is not None:
            return self._provenance

        p = desc_provenance.Provenance()

        # Ignore any missing files
        input_files = {tag: path for tag, path in self._inputs.items() if path is not None}

        # Get the place this stage is defined
        directory = os.path.split(self.get_module_file())[0]

        # Make and write provenance information
        p.generate(user_config=self.config.to_dict(), input_files=input_files, directory=directory)

        # Cache for later accesses
        self._provenance = p
        return p
################################################## diff --git a/tests/test_stage.py b/tests/test_stage.py index 195ee83..0e6530e 100644 --- a/tests/test_stage.py +++ b/tests/test_stage.py @@ -428,24 +428,24 @@ class Juliett(PipelineStage): inputs = [] outputs = [("my_output", HDFFile)] config_options = {} + def run(self): + pass cmd = "Juliett", "--config", "tests/config.yml", "--my_output", "tests/test_out.hdf5" # Testing without an alias - jj = Juliett(Juliett.parse_command_line(cmd)) - #assert os.path.exists(jj.get_output("my_output")) - f = jj.open_output("my_output") - print(f.keys()) - f.close() + jj1 = Juliett(Juliett.parse_command_line(cmd)) + + with jj1.open_output("my_output") as f: + print(f.keys()) # Testing with an alias - config.yml defines an alias for my_input, my_alias - jj = Juliett.make_stage(name="JuliettCopy", aliases=dict(my_output='my_alias')) + jj2 = Juliett.make_stage(name="JuliettCopy", aliases=dict(my_output='my_alias')) - print(jj.get_aliases()) + print(jj2.get_aliases()) # This works now - f = jj.open_output("my_output") - print(f.keys()) - f.close() + with jj2.open_output("my_output") as f: + print(f.keys()) def core_test_map(comm): From 3c80ab459d38f5ee814a6ded946894fb10694f76 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 13:24:10 +0100 Subject: [PATCH 3/9] add code from old provenance library --- ceci/__init__.py | 2 +- ceci/provenance/__init__.py | 2 + ceci/provenance/errors.py | 22 + ceci/provenance/git.py | 81 ++++ ceci/provenance/provenance.py | 870 ++++++++++++++++++++++++++++++++++ ceci/provenance/utils.py | 138 ++++++ ceci/stage.py | 4 +- 7 files changed, 1116 insertions(+), 3 deletions(-) create mode 100644 ceci/provenance/__init__.py create mode 100644 ceci/provenance/errors.py create mode 100644 ceci/provenance/git.py create mode 100755 ceci/provenance/provenance.py create mode 100755 ceci/provenance/utils.py diff --git a/ceci/__init__.py b/ceci/__init__.py index 4e5857d..31ce580 100644 --- 
class ProvenanceError(Exception):
    """Base class for all errors raised by the provenance package."""
    pass


class ProvenanceFileTypeUnknown(ProvenanceError):
    """Raised when a file suffix does not match any supported provenance format."""
    pass


class ProvenanceFileSchemeUnsupported(ProvenanceError):
    """Raised when a file's internal layout cannot hold provenance (e.g. a YAML file whose top level is not a mapping)."""
    pass


class ProvenanceMissingFile(ProvenanceError):
    """Raised when the file provenance should be read from does not exist."""
    pass


class ProvenanceMissingSection(ProvenanceError):
    """Raised when a file exists but contains no provenance section at all."""
    pass


class ProvenanceMissingItem(ProvenanceError):
    """Raised when a requested (section, key) provenance item is not present."""
    pass
def current_revision(dirname=None, parent_frames=1):
    """Return the git revision ID in the caller's directory (default) or another
    specified directory.
    """
    if dirname is None:
        dirname = get_caller_directory(parent_frames + 1)

    if dirname is None:
        return "ERROR_GIT_NO_DIRECTORY"

    # Each of the many ways this subprocess call can fail maps to its own
    # sentinel string, so callers can record what went wrong without crashing.
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            cwd=dirname,
            universal_newlines=True,
            timeout=5,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except subprocess.TimeoutExpired:
        return "ERROR_GIT_TIMEOUT"
    except UnicodeDecodeError:
        return "ERROR_GIT_DECODING"
    except subprocess.SubprocessError:
        return "ERROR_GIT_OTHER"
    except FileNotFoundError:
        return "ERROR_GIT_NOT_RUNNABLE"
    except OSError:
        return "ERROR_GIT_OTHER_OSERROR"

    # If for some reason we are running outside the main repo git itself
    # fails, which we report via the return code.
    if result.returncode:
        return "ERROR_GIT_FAIL"
    return result.stdout
def writable_value(x):
    """Coerce a value to a form that can be stored as file metadata.

    Native and numpy integers and floats are passed through unchanged;
    anything else is converted to its string representation.
    """
    numeric_types = (int, np.integer, float, np.floating)
    return x if isinstance(x, numeric_types) else str(x)
+ + This object can be used like a dictionary with two keys: + provenance[category, key] = value + """ + + def __init__(self, code_dir=None, parent_frames=0): + """Create an empty provenance object""" + self.code_dir = code_dir or utils.get_caller_directory(parent_frames + 1) + self.provenance = {} + self.comments = [] + + def copy(self): + cls = self.__class__ + cp = cls(code_dir=self.code_dir) + cp.provenance = copy.deepcopy(self.provenance) + cp.comments = copy.deepcopy(self.comments) + return cp + + # Generation methods + # ------------------ + def generate( + self, user_config=None, input_files=None, comments=None, directory=None + ): + """ + Generate a new set of provenance. + + After calling this the provenance object will contain: + - the date, time, and place of creation + - the user and domain name + - all python modules already imported anywhere that have a version number + - git info about the directory where this instance was created + - sys.argv + - a config dict passed by the caller + - a dict of input files passed by the caller + - any comments we want to add. + + Parameters + ---------- + user_config: dict or None + Optional input configuration options + input_files: dict or None + Optional name_for_file: file_path dict + comments: list or None + Optional comments to include. 
Not intended to be machine-readable + directory: str or None + Optional directory in which to run git information + """ + # Record various core pieces of information + self._add_core_info() + self._add_git_info(directory) + self._add_module_versions() + self._add_argv_info() + + # Add user inputs + if input_files is not None: + for name, path in input_files.items(): + self.add_input_file(name, path) + + # Add any specific items given by the user + if user_config is not None: + for key, value in user_config.items(): + self[config_section, key] = writable_value(value) + + if comments is not None: + for comment in comments: + self.add_comment(comment) + + # Core methods called in generate above + # ------------------------------------- + def _add_core_info(self): + self[base_section, "process_id"] = uuid.uuid4().hex + self[base_section, "domain"] = socket.getfqdn() + self[base_section, "creation"] = datetime.datetime.now().isoformat() + self[base_section, "user"] = getpass.getuser() + + def _add_argv_info(self): + for i, arg in enumerate(sys.argv): + self[base_section, f"argv_{i}"] = arg + + def _add_git_info(self, directory): + # Add some git information + self[git_section, "diff"] = git.diff(directory) + self[git_section, "head"] = git.current_revision() + + def _add_module_versions(self): + for module, version in utils.find_module_versions().items(): + self[versions_section, module] = version + + def add_input_file(self, name, path): + """ + Tell the provenance the name and path to one of your input files + so it can be recorded correctly in the output. + + Parameters + ---------- + name: str + A tag or name representing the file + + path: str or pathlib.Path + The path to the file + """ + # get the absolute form path to the file + path = str(pathlib.Path(path).absolute().resolve()) + + # Save it in ourselves + self[input_path_section, name] = path + # If the file was saved with its own provenance then it will have its own + # unique file_id. Try to record that ID. 
The file may be some other type, + # or not have provenance, so ignore any errors here. + try: + self[input_id_section, name] = self.get(path, base_section, "file_id") + except: + self[input_id_section, name] = unknown_value + + def add_comment(self, comment): + """ + Add a text comment. + + Comments with line breaks will be split into separate comments. + + Parameters + ---------- + comment: str + Comment to include + """ + for c in comment.split("\n"): + self.comments.append(c) + + # Dictionary methods + # ------------------ + def __getitem__(self, section_name): + section, name = section_name + return self.provenance[section, name] + + def __setitem__(self, section_name, value): + section, name = section_name + self.provenance[section, name] = value + + def __delitem__(self, section_name): + section, name = section_name + del self.provenance[section, name] + + def update(self, d): + """ + Update the provenance from a dictionary. + + The dictionary keys should be tuples of (category, key). + The values can be any basic type. + + Parameters + ---------- + d: dict or mapping + The dict to update from. + """ + for (section, name), value in d.items(): + self.provenance[section, name] = value + + # Generic I/O Methods + # ----------- + def write(self, f, suffix=None): + """ + Write provenance to a named file, guessing the file type from its suffix. + + Use the various write_* methods intead to write to a file you have already + opened, or if the file suffix does not match the type. 
+ + Parameters + ---------- + f: str or writeable object + suffix: str + Must be supplied if f is a file-like object + + Returns + ------- + str + The newly-assigned file ID + """ + + # String or path + if utils.is_path(f): + f = pathlib.Path(f) + suffix = f.suffix + elif suffix is None: + raise ValueError("Must supply suffix if open file is supplied") + + # If passed a directory, make a provenance file in that directory + if suffix == "" and isinstance(f, pathlib.Path) and f.is_dir(): + return self.write_yaml(f + "provenance.yaml") + + if suffix and not suffix.startswith("."): + suffix = "." + suffix + + writers = { + ".hdf": self.write_hdf, + ".hdf5": self.write_hdf, + ".fits": self.write_fits, + ".fit": self.write_fits, + ".yml": self.write_yaml, + ".yaml": self.write_yaml, + ".pkl": self.write_pickle, + ".pickle": self.write_pickle, + } + method = writers.get(suffix) + + if method is None: + return self.write_yaml(f + ".provenance.yaml") + + return method(f) + + def read(self, filename): + """ + Read all provenance from any supported file type, guessing + the file type from its suffix. + + If the suffix does not match the type you can use one of the specific read_ + methods instead. You can also pass open file objects directly to those methods. + + Parameters + ---------- + filename: str + + Returns + ------- + None + """ + p = pathlib.Path(filename) + + readers = { + ".hdf": self.read_hdf, + ".hdf5": self.read_hdf, + ".fits": self.read_fits, + ".fit": self.read_fits, + ".yml": self.read_yaml, + ".yaml": self.read_yaml, + ".pkl": self.read_yaml, + ".pickle": self.read_pickle, + } + + method = readers.get(p.suffix) + if method is None: + raise errors.ProvenanceFileTypeUnknown(filename) + + return method(filename) + + @classmethod + def get(cls, filename, section, key): + """ + Get a single item of provenance from any supported file type, guessing + the file type from its suffix. 
+ + If the suffix does not match the type you can use one of the specific get_ + methods instead. You can also pass open file objects directly to those methods. + + Parameters + ---------- + filename: str + + section: str + + key: str + + Returns + ------- + value: any + The native value of the key in this value + """ + + p = pathlib.Path(filename) + if not p.exists(): + raise errors.ProvenanceMissingFile(filename) + + getters = { + ".hdf": cls.get_hdf, + ".hdf5": cls.get_hdf, + ".fits": cls.get_fits, + ".fit": cls.get_fits, + ".yml": cls.get_yaml, + ".yaml": cls.get_yaml, + ".pkl": cls.get_pickle, + ".pickle": cls.get_pickle, + } + + method = getters.get(p.suffix) + if method is None: + raise errors.ProvenanceFileTypeUnknown(filename) + + return method(filename, section, key) + + # HDF Methods + # ----------- + @classmethod + def _read_get_hdf(cls, hdf_file, item=None): + with utils.open_hdf(hdf_file, "r") as f: + # If the whole provenance section is missing, e.g. + # because the file was not generated with provenance at all, + # then raise the appropriate error + if provenance_group not in f.keys(): + raise errors.ProvenanceMissingSection( + "HDF File is missing provenance section" + ) + + # Othewise get the provenance group. 
    def read_hdf(self, hdf_file):
        """Read provenance from an HDF5 file.

        Loads all provenance items and comments found in the file into this
        object.

        Parameters
        ----------
        hdf_file: str or h5py.File
            The file name or an open file object

        Returns
        -------
        None
        """
        d, com = self._read_get_hdf(hdf_file)
        self.update(d)
        self.comments.extend(com)
    def read_fits(self, fits_file):
        """Read provenance from a FITS file.

        Loads all the provenance found in a FITS file into this object.

        Parameters
        ----------
        fits_file: str or fitsio.FITS
            The file name or an open file object

        Returns
        -------
        None
        """
        d, com = self._read_get_fits(fits_file)
        self.update(d)
        self.comments.extend(com)
+ + Parameters + ---------- + fits_file: str or fitsio.FITS + The file name or an open file object + + Returns + ------- + str + The newly-assigned file ID + """ + with utils.open_fits(fits_file, "rw") as f: + + # Create the group if it doesn't exist + if provenance_group in f: + ext = f[provenance_group] + else: + f.create_image_hdu(extname=provenance_group) + f.update_hdu_list() + ext = f[provenance_group] + + # Helper local function to write a key. + # To maintain case we store items as a trio + # of keys specifying category, key, and value + def write_key(s, k, v, i): + ext.write_key(f"SEC{i}", s) + ext.write_key(f"KEY{i}", k) + ext.write_key(f"VAL{i}", v) + + # Write the keys we have one by one + for i, ((section, key), value) in enumerate(self.provenance.items()): + # FITS header items can't contain newlines, so we break up + # any text with newlines into separate entries which we patch + # together again when loading + if isinstance(value, str) and "\n" in value: + values = value.split("\n") + # There's some kind of bug in CFITSIO that lets you write + # but not read certain text that includes new lines when the + # key is longer than 8 characters. This avoids that because + # our keys are always shorter than this in this case + if len(values) > 999: + warnings.warn( + f"Cannot write all very long item {section}/{key} to FITS provenance (>999 lines). Truncating." + ) + values = values[:999] + for j, v in enumerate(values): + write_key(section, key, v, f"{i}_{j}") + # or if it's any other item we just put it in directly + else: + write_key(section, key, value, i) + + for comment in self.comments: + ext.write_comment(comment) + + # Internal method implementing the read and get methods + @classmethod + def _read_get_fits(cls, fits_file, item=None): + with utils.open_fits(fits_file, "r") as f: + ext = f[0] + # We may be called from the get or read methods. 
+ # In the former case we will be given a specific item + # to get, which we split here + if item is not None: + target_sec, target_key = item + + # Read the entire header. A bit wasteful if we only want a single + # item from it, but this shouldn't be a performance bottleneck. + hdr = ext.read_header() + + comments = [ + k["value"].strip() for k in hdr.records() if k["name"] == "COMMENT" + ] + + # Remove sone of the standard FITS comments put in everything + # by CFITSIO. + try: + comments.remove( + "FITS (Flexible Image Transport System) format is defined in 'Astronomy" + ) + comments.remove( + "and Astrophysics', volume 376, page 359; bibcode: 2001A&A...376..359H" + ) + except ValueError: + pass + # We have recorded items in trios of KEY0, SEC0, VAL0, 1, 2 etc. + # so count how many keys we have + indices = [k[3:] for k in hdr if k.upper().startswith("KEY")] + indices = [k for k in indices if k and k[0] in "0123456789"] + + # We will collect the number of lines for each multi-line + # item, so we can patch together later. + multiline_indices = collections.defaultdict(int) + d = {} + + # split these keys into multi-line and normal keys + for index in indices: + if "_" in index: + orig_index, _ = index.split("_", 1) + multiline_indices[orig_index] += 1 + else: + # Handle the normal keys just by reading them + sec = hdr[f"SEC{index}"] + val = hdr[f"VAL{index}"] + key = hdr[f"KEY{index}"] + # If this is called from get_ then return + # if we have found the desired object + if item is not None: + if (sec == target_sec) and (key == target_key): + return val + # Otherwise just build up all the items + else: + d[sec, key] = val + + # Now deal with all the multiline ones we found. 
+ # we recorded the number of entries for each of them + for index, n in multiline_indices.items(): + vals = [] + # sec and key should be the same for them all + sec = hdr[f"SEC{index}_0"] + key = hdr[f"KEY{index}_0"] + + # reassemble into a multi-line text + for i in range(n): + vals.append(hdr[f"VAL{index}_{i}"]) + val = "\n".join(vals) + + # Check if the target is this multiline item + if item is not None: + if (sec == target_sec) and (key == target_key): + return val + else: + d[sec, key] = val + + # If we were not asked for a specific item then return + # the entire thing + if item is None: + return d, comments + else: + # If we were asked for an item then if we've got this far + # then we've failed. + raise errors.ProvenanceMissingItem( + f"Missing item {target_sec} {target_key}" + ) + + # Other I/O Methods + # ----------------- + + @classmethod + def _read_get_yaml(cls, yml_file, item): + import ruamel.yaml as yaml + + y = yaml.YAML() + + with utils.open_file(yml_file, "r") as f: + # Read the whole file + data = y.load(f) + d = data["provenance"] + if item is not None: + sd = d[item[0]] + return sd[item[1]] + + # Pull out the different sections + # into a dict for provenance and a list + # for comments + out = {} + com = [] + for section, sub in d.items(): + if section == comments_section: + com = sub[:] + else: + for key, value in sub.items(): + out[section, key] = value + return out, com + + @classmethod + def get_yaml(self, yml_file, section, key): + return self._read_get_yaml(yml_file, (section, key)) + + def read_yaml(self, yml_file): + """Read provenance from a YAML file. + + Updates the provenance object. 
+ + Parameters + ---------- + yml_file: str or file + The file name or an open file object + + Returns + ------- + None + """ + d, com = self._read_get_yaml(yml_file, None) + self.update(d) + self.comments.extend(com) + + def _make_yml(self): + # internal method to make a dictionary from + # this instance suitable to dump to yml + d = {} + for (sec, key), val in self.provenance.items(): + if sec not in d: + d[sec] = {} + d[sec][key] = val + d["comments"] = self.comments[:] + return d + + @writer_method + def write_yaml(self, yml_file): + """Write provenance to a YAML file. + + Parameters + ---------- + yml_file: str or file + The file name or an open file object + + Returns + ------- + str + The newly-assigned file ID + """ + import ruamel.yaml as yaml + + # Create the YAML loader. The default instance + # of this preserves comments in the YAML if present, + # which means we can run this code on existing + # commented yaml without destroying it + y = yaml.YAML() + p = self._make_yml() + + if utils.is_path(yml_file) or "r" in yml_file.mode: + with utils.open_file(yml_file, "r+") as f: + + # record curent position (in case this is a pre-opened file) + # and load the yaml from the start + s = f.tell() + f.seek(0) + d = y.load(f) + + # if file was empty before: + if d is None: + d = {} + elif not (isinstance(d, yaml.comments.CommentedMap)): + # go back to where we started but complain that this is + # not a dict-type yaml file + f.seek(s) + raise errors.ProvenanceFileSchemeUnsupported( + "Provenance only supports yaml files containing a dictionary as the top level object" + ) + + # replace existing prov completely if present. We re-write + # the whole file contents after the prov. Could avoid but not really needed + # as ruamel should maintain comments. 
+ d["provenance"] = p + f.seek(0) + y.dump(d, f) + f.truncate() + else: + # filed opened in write-only mpde + y.dump(x, f) + + @writer_method + def write_pickle(self, pickle_file): + """Write provenance to a Pickle file. + + Parameters + ---------- + pickle_file: str or file + The file name or an open file object + + Returns + ------- + str + The newly-assigned file ID + """ + + if utils.is_path(pickle_file) or "r" in pickle_file.mode: + with utils.open_file(pickle_file, "r+") as f: + # jump to the end of the file + f.seek(0, 2) + # save the pickle info + pickle.dump(["provenance_dump", self.provenance, self.comments], f) + + else: + # filed opened in write-only mode already + pickle.dump( + ["provenance_dump", self.provenance, self.comments], pickle_file + ) + + @classmethod + def _read_get_pickle(cls, pickle_file): + f = utils.open_file(pickle_file, "r") + s = f.tell() + + try: + n = 0 + while True: + try: + item = pickle.load(f) + except: + break + if n == 0: + raise ProvenanceError(f"Nothing readable found in file {pickle_file}") + if ( + (not isinstance(item, list)) + or (len(item) != 3) + or (item[0] != "provenance_dump") + ): + raise ProvenanceError(f"No provenance found in file {pickle_file}") + _, d, c = item + finally: + if utils.is_path(pickle_file): + f.close() + else: + f.seek(s) + + def read_pickle(self, pickle_file): + """Read provenance from a YAML file. + + Updates the provenance object. 
+ + Parameters + ---------- + pickle_file: str or file + The file name or an open file object + + Returns + ------- + None + """ + d, com = self._read_get_pickle(pickle_file) + self.update(d) + self.comments.extend(com) + + @classmethod + def get_pickle(self, pickle_file, section, key): + d, com = self._read_get_pickle(pickle_file) + return d[section, key] + + def to_string_dict(self): + d = {f"{s}/{k}": str(v) for (s, k), v in self.provenance.items()} + for i, c in self.comments: + d[f"comment_{i}"] = c + return d + + def generate_file_id(self): + self[base_section, "file_id"] = uuid.uuid4().hex diff --git a/ceci/provenance/utils.py b/ceci/provenance/utils.py new file mode 100755 index 0000000..56edc27 --- /dev/null +++ b/ceci/provenance/utils.py @@ -0,0 +1,138 @@ +import distutils.version +import sys +import os +import inspect +import pathlib +import contextlib +import shutil + + +def is_path(p): + """ + Decide whether the input is likely to be a path instead of a file object. + + Returns True if the parameter is a pathlib.Path object or a string, False + otherwise + + Parameters + ---------- + p: Any + Object which may be a path or not + + Returns + ------- + bool + + """ + return isinstance(p, str) or isinstance(p, pathlib.Path) + + +def get_caller_directory(parent_frames=0): + """ + Find the directory where the code calling this + function lives, or any number of jumps back up the stack + + Parameters + ---------- + parent_frames: int + Number of additional frames to go up in the call stack + + Returns + ------- + directory: str + """ + previous_frame = inspect.currentframe().f_back + # go back more frames if desired + for i in range(parent_frames): + previous_frame = previous_frame.f_back + + filename = inspect.getframeinfo(previous_frame).filename + p = pathlib.Path(filename) + if not p.exists(): + # dynamically generated or interactive mode + return None + return str(p.parent) + + +def find_module_versions(): + """ + Generate a dictionary of versions of all 
imported modules
+    by looking for __version__ or version attributes on them.
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    dict:
+        A dictionary of the versions of all loaded modules
+    """
+    versions = {}
+    for name, module in sys.modules.items():
+        if hasattr(module, "version"):
+            v = module.version
+        elif hasattr(module, "__version__"):
+            v = module.__version__
+        else:
+            continue
+        if isinstance(v, str) or isinstance(v, distutils.version.Version):
+            versions[name] = str(v)
+    return versions
+
+
+@contextlib.contextmanager
+def open_hdf(hdf_file, mode):
+    """Open an HDF file, or if a file is provided, simply return it"""
+    import h5py
+
+    if is_path(hdf_file):
+        f = h5py.File(hdf_file, mode)
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        yield hdf_file
+
+
+@contextlib.contextmanager
+def open_fits(fits_file, mode):
+    """Open a FITS file, or if a file is already provided simply return it"""
+    import fitsio
+
+    if is_path(fits_file):
+        exists = os.path.exists(fits_file)
+
+        # By default the "w" mode in FITSIO is r/w. We have to explicitly remove
+        # first if we want to do a proper write and the file already exists.
+        if mode == "w":
+            mode = "rw"
+            if exists:
+                os.remove(fits_file)
+        f = fitsio.FITS(fits_file, mode=mode)
+
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        yield fits_file
+
+
+@contextlib.contextmanager
+def open_file(file, mode):
+    """Open a regular file, or if a file is already provided simply return it"""
+
+    if is_path(file):
+        if mode == "r+" and not os.path.exists(file):
+            f = open(file, "w+")
+        else:
+            f = open(file, mode=mode)
+
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        yield file
diff --git a/ceci/stage.py b/ceci/stage.py
index 2a05481..8d51d17 100644
--- a/ceci/stage.py
+++ b/ceci/stage.py
@@ -8,7 +8,7 @@
 import cProfile
 import pdb
 import datetime
-import desc_provenance
+from .provenance import Provenance
 from abc import abstractmethod
 from . 
import errors @@ -1063,7 +1063,7 @@ def provenance(self): if self._provenance is not None: return self._provenance - p = desc_provenance.Provenance() + p = Provenance() # Ignore any missing files input_files = {tag: path for tag, path in self._inputs.items() if path is not None} From 807e57ed8d00b980cc1a106343496c862965495a Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 14:38:39 +0100 Subject: [PATCH 4/9] working tests --- ceci/__init__.py | 1 + ceci/provenance/provenance.py | 16 ++++++++++++---- ceci/stage.py | 10 +++++++++- ceci_example/types.py | 6 ++++-- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/ceci/__init__.py b/ceci/__init__.py index 31ce580..3780600 100644 --- a/ceci/__init__.py +++ b/ceci/__init__.py @@ -2,6 +2,7 @@ from .stage import PipelineStage from .pipeline import Pipeline, MiniPipeline, ParslPipeline, DryRunPipeline +from .handle import BaseIOHandle from pkg_resources import DistributionNotFound from pkg_resources import get_distribution diff --git a/ceci/provenance/provenance.py b/ceci/provenance/provenance.py index a320ebf..cf55a8b 100755 --- a/ceci/provenance/provenance.py +++ b/ceci/provenance/provenance.py @@ -253,7 +253,8 @@ def write(self, f, suffix=None): # If passed a directory, make a provenance file in that directory if suffix == "" and isinstance(f, pathlib.Path) and f.is_dir(): - return self.write_yaml(f + "provenance.yaml") + p = f.parent / (f.name + ".provenance.yaml") + return self.write_yaml(p) if suffix and not suffix.startswith("."): suffix = "." + suffix @@ -270,8 +271,11 @@ def write(self, f, suffix=None): } method = writers.get(suffix) + # If we do not know how to write to this file type + # then just put a file next to it. 
if method is None: - return self.write_yaml(f + ".provenance.yaml") + p = f.parent / (f.name + ".provenance.yaml") + return self.write_yaml(p) return method(f) @@ -305,8 +309,10 @@ def read(self, filename): } method = readers.get(p.suffix) + if method is None: - raise errors.ProvenanceFileTypeUnknown(filename) + method = self.read_yaml + filename = p.parent / (p.name + ".provenance.yaml") return method(filename) @@ -867,4 +873,6 @@ def to_string_dict(self): return d def generate_file_id(self): - self[base_section, "file_id"] = uuid.uuid4().hex + file_id = uuid.uuid4().hex + self[base_section, "file_id"] = file_id + return file_id diff --git a/ceci/stage.py b/ceci/stage.py index 8d51d17..06b8c5e 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -935,6 +935,13 @@ def open_input(self, tag, wrapper=False, **kwargs): path = self.get_input(aliased_tag) input_class = self.get_input_type(tag) obj = input_class(path, "r", **kwargs) + prov = Provenance() + try: + prov.read(path) + obj.provenance = prov + except: + pass + if wrapper: # pragma: no cover return obj @@ -1009,7 +1016,8 @@ def open_output( # Return an opened object representing the file - obj = output_class(path, "w", provenance=self.provenance, **kwargs) + obj = output_class(path, "w", **kwargs) + obj.provenance = self.provenance if wrapper: return obj diff --git a/ceci_example/types.py b/ceci_example/types.py index c5e4e1a..9e46568 100644 --- a/ceci_example/types.py +++ b/ceci_example/types.py @@ -1,4 +1,6 @@ -class DataFile: +from ceci import BaseIOHandle + +class DataFile(BaseIOHandle): """ A class representing a DataFile to be made by pipeline stages and passed on to subsequent ones. 
@@ -16,7 +18,7 @@ class DataFile: """ - def __init__(self, path, mode, extra_provenance=None, validate=True, **kwargs): + def __init__(self, path, mode, validate=True, **kwargs): self.path = path self.mode = mode From 8e895add0dd2b10d8cb9477b21d920001560a7a3 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 14:42:43 +0100 Subject: [PATCH 5/9] add tests and remove change --- ceci/stage.py | 2 +- tests/test_provenance.py | 281 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 tests/test_provenance.py diff --git a/ceci/stage.py b/ceci/stage.py index 06b8c5e..e3b1652 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -341,7 +341,7 @@ def __init_subclass__(cls, **kwargs): ############################################# @classmethod - def get_stage(cls, name, incomplete=False): + def get_stage(cls, name): """ Return the PipelineStage subclass with the given name. diff --git a/tests/test_provenance.py b/tests/test_provenance.py new file mode 100644 index 0000000..c7582f5 --- /dev/null +++ b/tests/test_provenance.py @@ -0,0 +1,281 @@ +import tempfile +from ceci.provenance import Provenance, errors, utils +from ceci import __version__ as lib_version +from pprint import pprint +import pytest +import datetime +import os +import random +import string + + +def test_generate(): + p = Provenance() + + # Check that the basic provenance information is generated correctly. 
+ # temporarily change the user name so that we check it's okay + logname = os.environ.get("LOGNAME") + os.environ["LOGNAME"] = "mr_potato" + user_config = { + "xxx": "abc", + "car": "47", + } + + try: + p.generate( + user_config, + ) + finally: + if logname is None: + del os.environ["LOGNAME"] + else: + os.environ["LOGNAME"] = logname + + assert p["base", "user"] == "mr_potato" + + # check our own version is correct, and that of pytest + assert p["versions", "ceci"] == lib_version + assert p["versions", "pytest"] == pytest.__version__ + + # check some basic information + # date and time should be in the last few moments + dt = datetime.datetime.now() - datetime.datetime.fromisoformat( + p["base", "creation"] + ) + assert dt.total_seconds() < 5 + + +def test_inputs(): + p = Provenance() + p.generate() + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, "test1.hdf") + file_id = p.write(fname) + + q = Provenance() + input_files = {"tag1": fname, "tag2": "./non_existent_file.hdf"} + q.generate(input_files=input_files) + + assert file_id == q["input_id", "tag1"] + assert "UNKNOWN" == q["input_id", "tag2"] + + +@pytest.mark.parametrize( + "file_type, opener", [("hdf", utils.open_hdf), ("yml", utils.open_file)] +) +def test_new(file_type, opener): + p = Provenance() + p["sec", "aaa"] = "xxx" + p["sec", "bbb"] = 123 + p["sec", "ccc"] = 3.14 + p["sec", "ddd"] = "cat" + p["sec", "eee"] = 4.14 + + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, f"test.{file_type}") + file_id = p.write(fname) + + assert os.path.exists(fname) + + q = Provenance() + q.read(fname) + + print(q.provenance) + assert q["sec", "aaa"] == "xxx" + assert q["sec", "bbb"] == 123 + assert q["sec", "ccc"] == 3.14 + + # check the direct getter class methods work + assert Provenance.get(fname, "sec", "ddd") == "cat" + assert Provenance.get(fname, "sec", "eee") == 4.14 + + # the file ID will not be in the original as it changes + # but we saved it + 
print(q.provenance) + assert q["base", "file_id"] == file_id + del q["base", "file_id"] + assert p.provenance == q.provenance + + +def test_new_fits(): + file_type = "fits" + p = Provenance() + p["sec", "aaa"] = "xxx" + p["sec", "bbb"] = 123 + p["sec", "ccc"] = 3.14 + p["sec", "DDD"] = "cat" + p["sec", "eee"] = 4.14 + + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, f"test.{file_type}") + + # Write to now-closed + file_id = p.write(fname) + + assert os.path.exists(fname) + + q = Provenance() + q.read(fname) + + assert q["sec", "aaa"] == "xxx" + assert q["sec", "bbb"] == 123 + assert q["sec", "ccc"] == 3.14 + + # check the direct getters work + assert q.get(fname, "sec", "DDD") == "cat" + assert q.get(fname, "sec", "eee") == 4.14 + + assert file_id == q["base", "file_id"] + del q["base", "file_id"] + assert p.provenance == q.provenance + + +def test_fits_multiline(): + file_type = "fits" + p = Provenance() + p["sec", "aaa"] = "Two households;\nboth alike in dignity!" 
+
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, f"test.{file_type}")
+
+        # Write to now-closed
+        p.write(fname)
+
+        assert os.path.exists(fname)
+
+        q = Provenance()
+        q.read(fname)
+
+        assert p["sec", "aaa"] == q["sec", "aaa"]
+
+
+def test_existing_hdf():
+    import h5py
+
+    p = Provenance()
+    p["sec", "aaa"] = "xxx"
+    p["sec", "bbb"] = 123
+    p["sec", "ccc"] = 3.14
+    p["sec", "DDD"] = "cat"
+    p["sec", "eee"] = 4.14
+    p.add_comment("hopefully nothing else will break if I add this")
+
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.hdf")
+        fname2 = os.path.join(dirname, "test2.hdf")
+
+        with utils.open_hdf(fname, "w") as f:
+            f.create_group("cake")
+
+        id1 = p.write(fname)
+        id2 = p.write(fname2)
+
+        assert id1 != id2
+
+        q1 = Provenance()
+        q1.read(fname)
+        q2 = Provenance()
+        q2.read(fname2)
+
+        assert q1["base", "file_id"] == id1
+        assert q2["base", "file_id"] == id2
+
+        # Check full round trip
+        q1.provenance.pop(("base", "file_id"))
+        q2.provenance.pop(("base", "file_id"))
+        assert q1.provenance == q2.provenance
+
+
+def test_yml():
+    import ruamel.yaml as yaml
+
+    y = yaml.YAML()
+    # check nothing is overridden
+    d = {
+        "cat": "good",
+        "dog": "bad",
+        "spoon": 45.6,
+        "cow": -222,
+    }
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.yml")
+        y.dump(d, open(fname, "w"))
+
+        p = Provenance()
+        p.generate()
+        p.add_comment("this is a test to check nothing else breaks")
+        file_id = p.write(fname)
+
+        # check prov reads
+        q = Provenance()
+        q.read(fname)
+
+        assert q["base", "file_id"] == file_id
+        assert q["base", "process_id"] == p["base", "process_id"]
+
+        d2 = y.load(open(fname))
+        d2.pop("provenance")
+        assert d == d2
+
+        q = Provenance()
+        q.read_yaml(open(fname))
+        assert q["base", "file_id"] == file_id
+        assert q["base", "process_id"] == p["base", "process_id"]
+
+
+def test_unknown_file_type():
+    p = Provenance()
+    p.generate()
+    with 
tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.xyz")
+        pname = os.path.join(dirname, "test.xyz.provenance.yaml")
+        file_id = p.write(fname)
+        print(fname, pname)
+        p2 = Provenance()
+        p2.read(fname)
+        assert p2['base', 'file_id'] == file_id
+        assert p2['base', 'process_id'] == p['base', 'process_id']
+
+
+def test_long():
+    p = Provenance()
+    lines = []
+    for i in range(1002):
+        line = "".join(random.choice(string.printable) for i in range(100))
+        lines.append(line)
+    text = "\n".join(lines)
+    p["section", "key"] = text
+
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.fits")
+        with pytest.warns(UserWarning):
+            p.write(fname)
+        q = Provenance()
+        q.read(fname)
+
+
+def test_comments():
+    p = Provenance()
+    p.generate()
+    comments = [
+        "Hello,",
+        "My name is Inigo Montoya",
+        "You killed my father",
+        "Prepare to die",
+    ]
+    for comment in comments:
+        p.add_comment(comment)
+
+    with tempfile.TemporaryDirectory() as dirname:
+        for suffix in ["hdf", "fits", "yml"]:
+            fname = os.path.join(dirname, f"test.{suffix}")
+            p.write(fname)
+            q = Provenance()
+            q.read(fname)
+            for c in comments:
+                assert c in q.comments
+
+
+if __name__ == "__main__":
+    # test_comments()
+    test_long()

From 43af8bf93f95a8e4846182ea2ee90b8bdb65b3f4 Mon Sep 17 00:00:00 2001
From: Joe Zuntz
Date: Mon, 18 Jul 2022 15:04:16 +0100
Subject: [PATCH 6/9] fix output

---
 ceci/stage.py         | 3 +--
 ceci_example/types.py | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ceci/stage.py b/ceci/stage.py
index e3b1652..b00f67e 100644
--- a/ceci/stage.py
+++ b/ceci/stage.py
@@ -1016,8 +1016,7 @@
 
         # Return an opened object representing the file
 
-        obj = output_class(path, "w", **kwargs)
-        obj.provenance = self.provenance
+        obj = output_class(path, "w", provenance=self.provenance, **kwargs)
 
         if wrapper:
             return obj
diff --git a/ceci_example/types.py b/ceci_example/types.py
index 9e46568..9db6ec9 100644
--- 
a/ceci_example/types.py +++ b/ceci_example/types.py @@ -19,6 +19,7 @@ class DataFile(BaseIOHandle): """ def __init__(self, path, mode, validate=True, **kwargs): + super().__init__(kwargs.pop("provenance")) self.path = path self.mode = mode From 70603fa4786a7fda262cc1b56b249ba5c262cac0 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 15:19:26 +0100 Subject: [PATCH 7/9] Fix import overwrite --- ceci/__init__.py | 1 + ceci/handle.py | 20 ++++++++++++++++++++ ceci_example/types.py | 2 +- 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 ceci/handle.py diff --git a/ceci/__init__.py b/ceci/__init__.py index 3780600..2d81efc 100644 --- a/ceci/__init__.py +++ b/ceci/__init__.py @@ -1,5 +1,6 @@ """Ceci n'est pas une pipeline""" +from .provenance import Provenance from .stage import PipelineStage from .pipeline import Pipeline, MiniPipeline, ParslPipeline, DryRunPipeline from .handle import BaseIOHandle diff --git a/ceci/handle.py b/ceci/handle.py new file mode 100644 index 0000000..8f2f7d5 --- /dev/null +++ b/ceci/handle.py @@ -0,0 +1,20 @@ +import copy + +class BaseIOHandle: + """ + Base class for input and output objects. For now this is pretty empty. 
+ """ + suffix = "" + + def __init__(self, provenance): + # If provenance is provided then pull that out + self.provenance = provenance + + @property + def provenance(self): + return self._provenance + + @provenance.setter + def provenance(self, provenance): + # always copy the provenance + self._provenance = copy.deepcopy(provenance) diff --git a/ceci_example/types.py b/ceci_example/types.py index 9db6ec9..3cefb8e 100644 --- a/ceci_example/types.py +++ b/ceci_example/types.py @@ -19,7 +19,7 @@ class DataFile(BaseIOHandle): """ def __init__(self, path, mode, validate=True, **kwargs): - super().__init__(kwargs.pop("provenance")) + super().__init__(kwargs.pop("provenance", None)) self.path = path self.mode = mode From f2a6dde3aff759c5f4ade7eb8ca93078b24ae3bd Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 15:25:16 +0100 Subject: [PATCH 8/9] fix setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ad40e43..39418ff 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ 'Programming Language :: Python :: 3.8', 'Development Status :: 3 - Alpha', ], - packages=['ceci', 'ceci.sites'], + packages=['ceci', 'ceci.sites', 'ceci.provenance'], entry_points={ 'console_scripts':['ceci=ceci.main:main'] }, From 2806b1d0e0f24ea429bf8fc335f5a174bf7feb2d Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 15:34:28 +0100 Subject: [PATCH 9/9] add more test reqs --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 39418ff..94be81a 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,6 @@ extras_require={ 'parsl': ['flask', 'parsl>=1.0.0'], 'cwl': ['cwlgen>=0.4', 'cwltool>=2.0.20200126090152'], - 'test': ['pytest', 'codecov', 'pytest-cov'], + 'test': ['pytest', 'codecov', 'pytest-cov', 'fitsio', 'h5py', "ruamel.yaml"], } )