From fe7fbe459648a57d9f44bbed3437d90246411885 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Fri, 15 Jul 2022 11:57:41 +0100 Subject: [PATCH 1/9] add the concept of a rerun key to stages --- ceci/stage.py | 17 ++++++++++++----- tests/test_stage.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/ceci/stage.py b/ceci/stage.py index 76215b3..3a1ccde 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -83,6 +83,9 @@ def __init__(self, args, comm=None): comm: MPI communicator (default is None) An MPI comm object to use in preference to COMM_WORLD """ + if not isinstance(args, dict): + args = vars(args) + self._configs = StageConfig(**self.config_options) self._inputs = None self._outputs = None @@ -92,7 +95,7 @@ def __init__(self, args, comm=None): self._rank = 0 self._io_checked = False self.dask_client = None - + self._rerun_key = args.get('rerun_key', 0) self.load_configs(args) if comm is not None: self.setup_mpi(comm) @@ -148,12 +151,9 @@ def load_configs(self, args): Parameters ---------- - args: dict or namespace + args: dict Specification of input and output paths and any missing config options """ - if not isinstance(args, dict): - args = vars(args) - # We alwys assume the config arg exists, whether it is in input_tags or not if "config" not in args: # pragma: no cover raise ValueError("The argument --config was missing on the command line.") @@ -534,6 +534,13 @@ def parse_command_line(cls, cmd=None): help="Report memory use. Argument gives interval in seconds between reports", ) + parser.add_argument( + "--rerun-key", + type=int, + default=0, + help="A key to use when re-running an interrupted run. 
Subclasses can use this as they wish.", + ) + if cmd is None: ret_args = parser.parse_args() else: diff --git a/tests/test_stage.py b/tests/test_stage.py index 2201e84..195ee83 100644 --- a/tests/test_stage.py +++ b/tests/test_stage.py @@ -489,6 +489,36 @@ def test_map(): mockmpi.mock_mpiexec(3, core_test_map) +def test_key(): + class Lima(PipelineStage): + name = f"Lima" + inputs = [] + outputs = [] + config_options = {} + + def run(self): + pass + + # First check interactive construction + + # default rerun key should be zero + ll1 = Lima.make_stage() + assert ll1._rerun_key == 0 + + # if we set it explicitly it should work like this + ll2 = Lima.make_stage(rerun_key=78910) + assert ll2._rerun_key == 78910 + + # Now test command-line construction + + # again test both default value and set value + cmd = ["Lima", "--config", "tests/config.yml"] + ll3 = Lima(Lima.parse_command_line(cmd)) + assert ll3._rerun_key == 0 + + ll4 = Lima(Lima.parse_command_line(cmd + ["--rerun-key", "54321"])) + assert ll4._rerun_key == 54321 + def test_unknown_stage(): with pytest.raises(StageNotFound): From 0682a3805cbfb0d87f2cfa7bf0652ecfd68ae4ee Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 13:14:59 +0100 Subject: [PATCH 2/9] add use of desc_provenance library --- ceci/stage.py | 43 +++++++++++++++++++++++++++++++++++++++++-- tests/test_stage.py | 20 ++++++++++---------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/ceci/stage.py b/ceci/stage.py index 3a1ccde..2a05481 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -8,6 +8,7 @@ import cProfile import pdb import datetime +import desc_provenance from abc import abstractmethod from . 
    @property
    def provenance(self):
        """Return the provenance object for this stage, generating it on first use.

        On first access this builds a ``desc_provenance.Provenance`` object from
        the stage's configuration, its input file paths (entries whose path is
        None are skipped), and the directory containing the module that defines
        this stage class.  The result is cached on the instance so repeated
        accesses — e.g. one per ``open_output`` call — reuse the same object.

        Returns
        -------
        desc_provenance.Provenance
            The cached or newly-generated provenance object.
        """
        # Return the cached object if it was already generated
        if self._provenance is not None:
            return self._provenance

        p = desc_provenance.Provenance()

        # Ignore any missing files
        input_files = {tag: path for tag, path in self._inputs.items() if path is not None}

        # Get the place this stage is defined
        directory = os.path.split(self.get_module_file())[0]

        # Make and write provenance information
        p.generate(user_config=self.config.to_dict(), input_files=input_files, directory=directory)

        # Cache for later accesses
        self._provenance = p
        return p
################################################## diff --git a/tests/test_stage.py b/tests/test_stage.py index 195ee83..0e6530e 100644 --- a/tests/test_stage.py +++ b/tests/test_stage.py @@ -428,24 +428,24 @@ class Juliett(PipelineStage): inputs = [] outputs = [("my_output", HDFFile)] config_options = {} + def run(self): + pass cmd = "Juliett", "--config", "tests/config.yml", "--my_output", "tests/test_out.hdf5" # Testing without an alias - jj = Juliett(Juliett.parse_command_line(cmd)) - #assert os.path.exists(jj.get_output("my_output")) - f = jj.open_output("my_output") - print(f.keys()) - f.close() + jj1 = Juliett(Juliett.parse_command_line(cmd)) + + with jj1.open_output("my_output") as f: + print(f.keys()) # Testing with an alias - config.yml defines an alias for my_input, my_alias - jj = Juliett.make_stage(name="JuliettCopy", aliases=dict(my_output='my_alias')) + jj2 = Juliett.make_stage(name="JuliettCopy", aliases=dict(my_output='my_alias')) - print(jj.get_aliases()) + print(jj2.get_aliases()) # This works now - f = jj.open_output("my_output") - print(f.keys()) - f.close() + with jj2.open_output("my_output") as f: + print(f.keys()) def core_test_map(comm): From 3c80ab459d38f5ee814a6ded946894fb10694f76 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 13:24:10 +0100 Subject: [PATCH 3/9] add code from old provenance library --- ceci/__init__.py | 2 +- ceci/provenance/__init__.py | 2 + ceci/provenance/errors.py | 22 + ceci/provenance/git.py | 81 ++++ ceci/provenance/provenance.py | 870 ++++++++++++++++++++++++++++++++++ ceci/provenance/utils.py | 138 ++++++ ceci/stage.py | 4 +- 7 files changed, 1116 insertions(+), 3 deletions(-) create mode 100644 ceci/provenance/__init__.py create mode 100644 ceci/provenance/errors.py create mode 100644 ceci/provenance/git.py create mode 100755 ceci/provenance/provenance.py create mode 100755 ceci/provenance/utils.py diff --git a/ceci/__init__.py b/ceci/__init__.py index 4e5857d..31ce580 100644 --- 
class ProvenanceError(Exception):
    """Base class for all errors raised by the provenance package."""
    pass


class ProvenanceFileTypeUnknown(ProvenanceError):
    """Raised when a file suffix does not match any supported provenance format."""
    pass


class ProvenanceFileSchemeUnsupported(ProvenanceError):
    """Raised when a file's internal layout cannot hold provenance (e.g. a YAML file whose top level is not a mapping)."""
    pass


class ProvenanceMissingFile(ProvenanceError):
    """Raised when the file provenance should be read from does not exist."""
    pass


class ProvenanceMissingSection(ProvenanceError):
    """Raised when a file exists but contains no provenance section at all."""
    pass


class ProvenanceMissingItem(ProvenanceError):
    """Raised when a requested (section, key) provenance item is not present."""
    pass
def current_revision(dirname=None, parent_frames=1):
    """Return the git revision ID in the caller's directory (default) or another
    specified directory.
    """
    if dirname is None:
        dirname = get_caller_directory(parent_frames + 1)

    if dirname is None:
        return "ERROR_GIT_NO_DIRECTORY"

    # Each of the many ways this subprocess call can fail maps to its own
    # sentinel string, so callers can record what went wrong without crashing.
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            cwd=dirname,
            universal_newlines=True,
            timeout=5,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except subprocess.TimeoutExpired:
        return "ERROR_GIT_TIMEOUT"
    except UnicodeDecodeError:
        return "ERROR_GIT_DECODING"
    except subprocess.SubprocessError:
        return "ERROR_GIT_OTHER"
    except FileNotFoundError:
        return "ERROR_GIT_NOT_RUNNABLE"
    except OSError:
        return "ERROR_GIT_OTHER_OSERROR"

    # If for some reason we are running outside the main repo git itself
    # fails, which we report via the return code.
    if result.returncode:
        return "ERROR_GIT_FAIL"
    return result.stdout
def writable_value(x):
    """Coerce a value to a form that can be stored as file metadata.

    Native and numpy integers and floats are passed through unchanged;
    anything else is converted to its string representation.
    """
    numeric_types = (int, np.integer, float, np.floating)
    return x if isinstance(x, numeric_types) else str(x)
+ + This object can be used like a dictionary with two keys: + provenance[category, key] = value + """ + + def __init__(self, code_dir=None, parent_frames=0): + """Create an empty provenance object""" + self.code_dir = code_dir or utils.get_caller_directory(parent_frames + 1) + self.provenance = {} + self.comments = [] + + def copy(self): + cls = self.__class__ + cp = cls(code_dir=self.code_dir) + cp.provenance = copy.deepcopy(self.provenance) + cp.comments = copy.deepcopy(self.comments) + return cp + + # Generation methods + # ------------------ + def generate( + self, user_config=None, input_files=None, comments=None, directory=None + ): + """ + Generate a new set of provenance. + + After calling this the provenance object will contain: + - the date, time, and place of creation + - the user and domain name + - all python modules already imported anywhere that have a version number + - git info about the directory where this instance was created + - sys.argv + - a config dict passed by the caller + - a dict of input files passed by the caller + - any comments we want to add. + + Parameters + ---------- + user_config: dict or None + Optional input configuration options + input_files: dict or None + Optional name_for_file: file_path dict + comments: list or None + Optional comments to include. 
Not intended to be machine-readable + directory: str or None + Optional directory in which to run git information + """ + # Record various core pieces of information + self._add_core_info() + self._add_git_info(directory) + self._add_module_versions() + self._add_argv_info() + + # Add user inputs + if input_files is not None: + for name, path in input_files.items(): + self.add_input_file(name, path) + + # Add any specific items given by the user + if user_config is not None: + for key, value in user_config.items(): + self[config_section, key] = writable_value(value) + + if comments is not None: + for comment in comments: + self.add_comment(comment) + + # Core methods called in generate above + # ------------------------------------- + def _add_core_info(self): + self[base_section, "process_id"] = uuid.uuid4().hex + self[base_section, "domain"] = socket.getfqdn() + self[base_section, "creation"] = datetime.datetime.now().isoformat() + self[base_section, "user"] = getpass.getuser() + + def _add_argv_info(self): + for i, arg in enumerate(sys.argv): + self[base_section, f"argv_{i}"] = arg + + def _add_git_info(self, directory): + # Add some git information + self[git_section, "diff"] = git.diff(directory) + self[git_section, "head"] = git.current_revision() + + def _add_module_versions(self): + for module, version in utils.find_module_versions().items(): + self[versions_section, module] = version + + def add_input_file(self, name, path): + """ + Tell the provenance the name and path to one of your input files + so it can be recorded correctly in the output. + + Parameters + ---------- + name: str + A tag or name representing the file + + path: str or pathlib.Path + The path to the file + """ + # get the absolute form path to the file + path = str(pathlib.Path(path).absolute().resolve()) + + # Save it in ourselves + self[input_path_section, name] = path + # If the file was saved with its own provenance then it will have its own + # unique file_id. Try to record that ID. 
The file may be some other type, + # or not have provenance, so ignore any errors here. + try: + self[input_id_section, name] = self.get(path, base_section, "file_id") + except: + self[input_id_section, name] = unknown_value + + def add_comment(self, comment): + """ + Add a text comment. + + Comments with line breaks will be split into separate comments. + + Parameters + ---------- + comment: str + Comment to include + """ + for c in comment.split("\n"): + self.comments.append(c) + + # Dictionary methods + # ------------------ + def __getitem__(self, section_name): + section, name = section_name + return self.provenance[section, name] + + def __setitem__(self, section_name, value): + section, name = section_name + self.provenance[section, name] = value + + def __delitem__(self, section_name): + section, name = section_name + del self.provenance[section, name] + + def update(self, d): + """ + Update the provenance from a dictionary. + + The dictionary keys should be tuples of (category, key). + The values can be any basic type. + + Parameters + ---------- + d: dict or mapping + The dict to update from. + """ + for (section, name), value in d.items(): + self.provenance[section, name] = value + + # Generic I/O Methods + # ----------- + def write(self, f, suffix=None): + """ + Write provenance to a named file, guessing the file type from its suffix. + + Use the various write_* methods intead to write to a file you have already + opened, or if the file suffix does not match the type. 
+ + Parameters + ---------- + f: str or writeable object + suffix: str + Must be supplied if f is a file-like object + + Returns + ------- + str + The newly-assigned file ID + """ + + # String or path + if utils.is_path(f): + f = pathlib.Path(f) + suffix = f.suffix + elif suffix is None: + raise ValueError("Must supply suffix if open file is supplied") + + # If passed a directory, make a provenance file in that directory + if suffix == "" and isinstance(f, pathlib.Path) and f.is_dir(): + return self.write_yaml(f + "provenance.yaml") + + if suffix and not suffix.startswith("."): + suffix = "." + suffix + + writers = { + ".hdf": self.write_hdf, + ".hdf5": self.write_hdf, + ".fits": self.write_fits, + ".fit": self.write_fits, + ".yml": self.write_yaml, + ".yaml": self.write_yaml, + ".pkl": self.write_pickle, + ".pickle": self.write_pickle, + } + method = writers.get(suffix) + + if method is None: + return self.write_yaml(f + ".provenance.yaml") + + return method(f) + + def read(self, filename): + """ + Read all provenance from any supported file type, guessing + the file type from its suffix. + + If the suffix does not match the type you can use one of the specific read_ + methods instead. You can also pass open file objects directly to those methods. + + Parameters + ---------- + filename: str + + Returns + ------- + None + """ + p = pathlib.Path(filename) + + readers = { + ".hdf": self.read_hdf, + ".hdf5": self.read_hdf, + ".fits": self.read_fits, + ".fit": self.read_fits, + ".yml": self.read_yaml, + ".yaml": self.read_yaml, + ".pkl": self.read_yaml, + ".pickle": self.read_pickle, + } + + method = readers.get(p.suffix) + if method is None: + raise errors.ProvenanceFileTypeUnknown(filename) + + return method(filename) + + @classmethod + def get(cls, filename, section, key): + """ + Get a single item of provenance from any supported file type, guessing + the file type from its suffix. 
+ + If the suffix does not match the type you can use one of the specific get_ + methods instead. You can also pass open file objects directly to those methods. + + Parameters + ---------- + filename: str + + section: str + + key: str + + Returns + ------- + value: any + The native value of the key in this value + """ + + p = pathlib.Path(filename) + if not p.exists(): + raise errors.ProvenanceMissingFile(filename) + + getters = { + ".hdf": cls.get_hdf, + ".hdf5": cls.get_hdf, + ".fits": cls.get_fits, + ".fit": cls.get_fits, + ".yml": cls.get_yaml, + ".yaml": cls.get_yaml, + ".pkl": cls.get_pickle, + ".pickle": cls.get_pickle, + } + + method = getters.get(p.suffix) + if method is None: + raise errors.ProvenanceFileTypeUnknown(filename) + + return method(filename, section, key) + + # HDF Methods + # ----------- + @classmethod + def _read_get_hdf(cls, hdf_file, item=None): + with utils.open_hdf(hdf_file, "r") as f: + # If the whole provenance section is missing, e.g. + # because the file was not generated with provenance at all, + # then raise the appropriate error + if provenance_group not in f.keys(): + raise errors.ProvenanceMissingSection( + "HDF File is missing provenance section" + ) + + # Othewise get the provenance group. 
    def read_hdf(self, hdf_file):
        """Read provenance from an HDF5 file.

        Loads all provenance items and comments found in the file into this
        object.

        Parameters
        ----------
        hdf_file: str or h5py.File
            The file name or an open file object

        Returns
        -------
        None
        """
        d, com = self._read_get_hdf(hdf_file)
        self.update(d)
        self.comments.extend(com)
    def read_fits(self, fits_file):
        """Read provenance from a FITS file.

        Loads all the provenance found in a FITS file into this object.

        Parameters
        ----------
        fits_file: str or fitsio.FITS
            The file name or an open file object

        Returns
        -------
        None
        """
        d, com = self._read_get_fits(fits_file)
        self.update(d)
        self.comments.extend(com)
+ + Parameters + ---------- + fits_file: str or fitsio.FITS + The file name or an open file object + + Returns + ------- + str + The newly-assigned file ID + """ + with utils.open_fits(fits_file, "rw") as f: + + # Create the group if it doesn't exist + if provenance_group in f: + ext = f[provenance_group] + else: + f.create_image_hdu(extname=provenance_group) + f.update_hdu_list() + ext = f[provenance_group] + + # Helper local function to write a key. + # To maintain case we store items as a trio + # of keys specifying category, key, and value + def write_key(s, k, v, i): + ext.write_key(f"SEC{i}", s) + ext.write_key(f"KEY{i}", k) + ext.write_key(f"VAL{i}", v) + + # Write the keys we have one by one + for i, ((section, key), value) in enumerate(self.provenance.items()): + # FITS header items can't contain newlines, so we break up + # any text with newlines into separate entries which we patch + # together again when loading + if isinstance(value, str) and "\n" in value: + values = value.split("\n") + # There's some kind of bug in CFITSIO that lets you write + # but not read certain text that includes new lines when the + # key is longer than 8 characters. This avoids that because + # our keys are always shorter than this in this case + if len(values) > 999: + warnings.warn( + f"Cannot write all very long item {section}/{key} to FITS provenance (>999 lines). Truncating." + ) + values = values[:999] + for j, v in enumerate(values): + write_key(section, key, v, f"{i}_{j}") + # or if it's any other item we just put it in directly + else: + write_key(section, key, value, i) + + for comment in self.comments: + ext.write_comment(comment) + + # Internal method implementing the read and get methods + @classmethod + def _read_get_fits(cls, fits_file, item=None): + with utils.open_fits(fits_file, "r") as f: + ext = f[0] + # We may be called from the get or read methods. 
+ # In the former case we will be given a specific item + # to get, which we split here + if item is not None: + target_sec, target_key = item + + # Read the entire header. A bit wasteful if we only want a single + # item from it, but this shouldn't be a performance bottleneck. + hdr = ext.read_header() + + comments = [ + k["value"].strip() for k in hdr.records() if k["name"] == "COMMENT" + ] + + # Remove sone of the standard FITS comments put in everything + # by CFITSIO. + try: + comments.remove( + "FITS (Flexible Image Transport System) format is defined in 'Astronomy" + ) + comments.remove( + "and Astrophysics', volume 376, page 359; bibcode: 2001A&A...376..359H" + ) + except ValueError: + pass + # We have recorded items in trios of KEY0, SEC0, VAL0, 1, 2 etc. + # so count how many keys we have + indices = [k[3:] for k in hdr if k.upper().startswith("KEY")] + indices = [k for k in indices if k and k[0] in "0123456789"] + + # We will collect the number of lines for each multi-line + # item, so we can patch together later. + multiline_indices = collections.defaultdict(int) + d = {} + + # split these keys into multi-line and normal keys + for index in indices: + if "_" in index: + orig_index, _ = index.split("_", 1) + multiline_indices[orig_index] += 1 + else: + # Handle the normal keys just by reading them + sec = hdr[f"SEC{index}"] + val = hdr[f"VAL{index}"] + key = hdr[f"KEY{index}"] + # If this is called from get_ then return + # if we have found the desired object + if item is not None: + if (sec == target_sec) and (key == target_key): + return val + # Otherwise just build up all the items + else: + d[sec, key] = val + + # Now deal with all the multiline ones we found. 
+ # we recorded the number of entries for each of them + for index, n in multiline_indices.items(): + vals = [] + # sec and key should be the same for them all + sec = hdr[f"SEC{index}_0"] + key = hdr[f"KEY{index}_0"] + + # reassemble into a multi-line text + for i in range(n): + vals.append(hdr[f"VAL{index}_{i}"]) + val = "\n".join(vals) + + # Check if the target is this multiline item + if item is not None: + if (sec == target_sec) and (key == target_key): + return val + else: + d[sec, key] = val + + # If we were not asked for a specific item then return + # the entire thing + if item is None: + return d, comments + else: + # If we were asked for an item then if we've got this far + # then we've failed. + raise errors.ProvenanceMissingItem( + f"Missing item {target_sec} {target_key}" + ) + + # Other I/O Methods + # ----------------- + + @classmethod + def _read_get_yaml(cls, yml_file, item): + import ruamel.yaml as yaml + + y = yaml.YAML() + + with utils.open_file(yml_file, "r") as f: + # Read the whole file + data = y.load(f) + d = data["provenance"] + if item is not None: + sd = d[item[0]] + return sd[item[1]] + + # Pull out the different sections + # into a dict for provenance and a list + # for comments + out = {} + com = [] + for section, sub in d.items(): + if section == comments_section: + com = sub[:] + else: + for key, value in sub.items(): + out[section, key] = value + return out, com + + @classmethod + def get_yaml(self, yml_file, section, key): + return self._read_get_yaml(yml_file, (section, key)) + + def read_yaml(self, yml_file): + """Read provenance from a YAML file. + + Updates the provenance object. 
+ + Parameters + ---------- + yml_file: str or file + The file name or an open file object + + Returns + ------- + None + """ + d, com = self._read_get_yaml(yml_file, None) + self.update(d) + self.comments.extend(com) + + def _make_yml(self): + # internal method to make a dictionary from + # this instance suitable to dump to yml + d = {} + for (sec, key), val in self.provenance.items(): + if sec not in d: + d[sec] = {} + d[sec][key] = val + d["comments"] = self.comments[:] + return d + + @writer_method + def write_yaml(self, yml_file): + """Write provenance to a YAML file. + + Parameters + ---------- + yml_file: str or file + The file name or an open file object + + Returns + ------- + str + The newly-assigned file ID + """ + import ruamel.yaml as yaml + + # Create the YAML loader. The default instance + # of this preserves comments in the YAML if present, + # which means we can run this code on existing + # commented yaml without destroying it + y = yaml.YAML() + p = self._make_yml() + + if utils.is_path(yml_file) or "r" in yml_file.mode: + with utils.open_file(yml_file, "r+") as f: + + # record curent position (in case this is a pre-opened file) + # and load the yaml from the start + s = f.tell() + f.seek(0) + d = y.load(f) + + # if file was empty before: + if d is None: + d = {} + elif not (isinstance(d, yaml.comments.CommentedMap)): + # go back to where we started but complain that this is + # not a dict-type yaml file + f.seek(s) + raise errors.ProvenanceFileSchemeUnsupported( + "Provenance only supports yaml files containing a dictionary as the top level object" + ) + + # replace existing prov completely if present. We re-write + # the whole file contents after the prov. Could avoid but not really needed + # as ruamel should maintain comments. 
+ d["provenance"] = p + f.seek(0) + y.dump(d, f) + f.truncate() + else: + # filed opened in write-only mpde + y.dump(x, f) + + @writer_method + def write_pickle(self, pickle_file): + """Write provenance to a Pickle file. + + Parameters + ---------- + pickle_file: str or file + The file name or an open file object + + Returns + ------- + str + The newly-assigned file ID + """ + + if utils.is_path(pickle_file) or "r" in pickle_file.mode: + with utils.open_file(pickle_file, "r+") as f: + # jump to the end of the file + f.seek(0, 2) + # save the pickle info + pickle.dump(["provenance_dump", self.provenance, self.comments], f) + + else: + # filed opened in write-only mode already + pickle.dump( + ["provenance_dump", self.provenance, self.comments], pickle_file + ) + + @classmethod + def _read_get_pickle(cls, pickle_file): + f = utils.open_file(pickle_file, "r") + s = f.tell() + + try: + n = 0 + while True: + try: + item = pickle.load(f) + except: + break + if n == 0: + raise ProvenanceError(f"Nothing readable found in file {pickle_file}") + if ( + (not isinstance(item, list)) + or (len(item) != 3) + or (item[0] != "provenance_dump") + ): + raise ProvenanceError(f"No provenance found in file {pickle_file}") + _, d, c = item + finally: + if utils.is_path(pickle_file): + f.close() + else: + f.seek(s) + + def read_pickle(self, pickle_file): + """Read provenance from a YAML file. + + Updates the provenance object. 
+ + Parameters + ---------- + pickle_file: str or file + The file name or an open file object + + Returns + ------- + None + """ + d, com = self._read_get_pickle(pickle_file) + self.update(d) + self.comments.extend(com) + + @classmethod + def get_pickle(self, pickle_file, section, key): + d, com = self._read_get_pickle(pickle_file) + return d[section, key] + + def to_string_dict(self): + d = {f"{s}/{k}": str(v) for (s, k), v in self.provenance.items()} + for i, c in self.comments: + d[f"comment_{i}"] = c + return d + + def generate_file_id(self): + self[base_section, "file_id"] = uuid.uuid4().hex diff --git a/ceci/provenance/utils.py b/ceci/provenance/utils.py new file mode 100755 index 0000000..56edc27 --- /dev/null +++ b/ceci/provenance/utils.py @@ -0,0 +1,138 @@ +import distutils.version +import sys +import os +import inspect +import pathlib +import contextlib +import shutil + + +def is_path(p): + """ + Decide whether the input is likely to be a path instead of a file object. + + Returns True if the parameter is a pathlib.Path object or a string, False + otherwise + + Parameters + ---------- + p: Any + Object which may be a path or not + + Returns + ------- + bool + + """ + return isinstance(p, str) or isinstance(p, pathlib.Path) + + +def get_caller_directory(parent_frames=0): + """ + Find the directory where the code calling this + function lives, or any number of jumps back up the stack + + Parameters + ---------- + parent_frames: int + Number of additional frames to go up in the call stack + + Returns + ------- + directory: str + """ + previous_frame = inspect.currentframe().f_back + # go back more frames if desired + for i in range(parent_frames): + previous_frame = previous_frame.f_back + + filename = inspect.getframeinfo(previous_frame).filename + p = pathlib.Path(filename) + if not p.exists(): + # dynamically generated or interactive mode + return None + return str(p.parent) + + +def find_module_versions(): + """ + Generate a dictionary of versions of all 
imported modules
+    by looking for __version__ or version attributes on them.
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    dict:
+        A dictionary of the versions of all loaded modules
+    """
+    versions = {}
+    for name, module in sys.modules.items():
+        if hasattr(module, "version"):
+            v = module.version
+        elif hasattr(module, "__version__"):
+            v = module.__version__
+        else:
+            continue
+        if isinstance(v, str) or isinstance(v, distutils.version.Version):
+            versions[name] = str(v)
+    return versions
+
+
+@contextlib.contextmanager
+def open_hdf(hdf_file, mode):
+    """Open an HDF file, or if a file is provided, simply return it"""
+    import h5py
+
+    if is_path(hdf_file):
+        f = h5py.File(hdf_file, mode)
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        yield hdf_file
+
+
+@contextlib.contextmanager
+def open_fits(fits_file, mode):
+    """Open a FITS file, or if a file is already provided simply return it"""
+    import fitsio
+
+    if is_path(fits_file):
+        exists = os.path.exists(fits_file)
+
+        # By default the "w" mode in FITSIO is r/w. We have to explicitly remove
+        # first if we want to do a proper write and the file already exists.
+        if mode == "w":
+            mode = "rw"
+            if exists:
+                os.remove(fits_file)
+        f = fitsio.FITS(fits_file, mode=mode)
+
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        yield fits_file
+
+
+@contextlib.contextmanager
+def open_file(file, mode):
+    """Open a regular file, or if a file is already provided simply return it"""
+
+    if is_path(file):
+        if mode == "r+" and not os.path.exists(file):
+            f = open(file, "w+")
+        else:
+            f = open(file, mode=mode)
+
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        yield file
diff --git a/ceci/stage.py b/ceci/stage.py
index 2a05481..8d51d17 100644
--- a/ceci/stage.py
+++ b/ceci/stage.py
@@ -8,7 +8,7 @@
 import cProfile
 import pdb
 import datetime
-import desc_provenance
+from .provenance import Provenance
 from abc import abstractmethod
 from . 
import errors @@ -1063,7 +1063,7 @@ def provenance(self): if self._provenance is not None: return self._provenance - p = desc_provenance.Provenance() + p = Provenance() # Ignore any missing files input_files = {tag: path for tag, path in self._inputs.items() if path is not None} From 807e57ed8d00b980cc1a106343496c862965495a Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 14:38:39 +0100 Subject: [PATCH 4/9] working tests --- ceci/__init__.py | 1 + ceci/provenance/provenance.py | 16 ++++++++++++---- ceci/stage.py | 10 +++++++++- ceci_example/types.py | 6 ++++-- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/ceci/__init__.py b/ceci/__init__.py index 31ce580..3780600 100644 --- a/ceci/__init__.py +++ b/ceci/__init__.py @@ -2,6 +2,7 @@ from .stage import PipelineStage from .pipeline import Pipeline, MiniPipeline, ParslPipeline, DryRunPipeline +from .handle import BaseIOHandle from pkg_resources import DistributionNotFound from pkg_resources import get_distribution diff --git a/ceci/provenance/provenance.py b/ceci/provenance/provenance.py index a320ebf..cf55a8b 100755 --- a/ceci/provenance/provenance.py +++ b/ceci/provenance/provenance.py @@ -253,7 +253,8 @@ def write(self, f, suffix=None): # If passed a directory, make a provenance file in that directory if suffix == "" and isinstance(f, pathlib.Path) and f.is_dir(): - return self.write_yaml(f + "provenance.yaml") + p = f.parent / (f.name + ".provenance.yaml") + return self.write_yaml(p) if suffix and not suffix.startswith("."): suffix = "." + suffix @@ -270,8 +271,11 @@ def write(self, f, suffix=None): } method = writers.get(suffix) + # If we do not know how to write to this file type + # then just put a file next to it. 
if method is None: - return self.write_yaml(f + ".provenance.yaml") + p = f.parent / (f.name + ".provenance.yaml") + return self.write_yaml(p) return method(f) @@ -305,8 +309,10 @@ def read(self, filename): } method = readers.get(p.suffix) + if method is None: - raise errors.ProvenanceFileTypeUnknown(filename) + method = self.read_yaml + filename = p.parent / (p.name + ".provenance.yaml") return method(filename) @@ -867,4 +873,6 @@ def to_string_dict(self): return d def generate_file_id(self): - self[base_section, "file_id"] = uuid.uuid4().hex + file_id = uuid.uuid4().hex + self[base_section, "file_id"] = file_id + return file_id diff --git a/ceci/stage.py b/ceci/stage.py index 8d51d17..06b8c5e 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -935,6 +935,13 @@ def open_input(self, tag, wrapper=False, **kwargs): path = self.get_input(aliased_tag) input_class = self.get_input_type(tag) obj = input_class(path, "r", **kwargs) + prov = Provenance() + try: + prov.read(path) + obj.provenance = prov + except: + pass + if wrapper: # pragma: no cover return obj @@ -1009,7 +1016,8 @@ def open_output( # Return an opened object representing the file - obj = output_class(path, "w", provenance=self.provenance, **kwargs) + obj = output_class(path, "w", **kwargs) + obj.provenance = self.provenance if wrapper: return obj diff --git a/ceci_example/types.py b/ceci_example/types.py index c5e4e1a..9e46568 100644 --- a/ceci_example/types.py +++ b/ceci_example/types.py @@ -1,4 +1,6 @@ -class DataFile: +from ceci import BaseIOHandle + +class DataFile(BaseIOHandle): """ A class representing a DataFile to be made by pipeline stages and passed on to subsequent ones. 
@@ -16,7 +18,7 @@ class DataFile: """ - def __init__(self, path, mode, extra_provenance=None, validate=True, **kwargs): + def __init__(self, path, mode, validate=True, **kwargs): self.path = path self.mode = mode From 8e895add0dd2b10d8cb9477b21d920001560a7a3 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 14:42:43 +0100 Subject: [PATCH 5/9] add tests and remove change --- ceci/stage.py | 2 +- tests/test_provenance.py | 281 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 tests/test_provenance.py diff --git a/ceci/stage.py b/ceci/stage.py index 06b8c5e..e3b1652 100644 --- a/ceci/stage.py +++ b/ceci/stage.py @@ -341,7 +341,7 @@ def __init_subclass__(cls, **kwargs): ############################################# @classmethod - def get_stage(cls, name, incomplete=False): + def get_stage(cls, name): """ Return the PipelineStage subclass with the given name. diff --git a/tests/test_provenance.py b/tests/test_provenance.py new file mode 100644 index 0000000..c7582f5 --- /dev/null +++ b/tests/test_provenance.py @@ -0,0 +1,281 @@ +import tempfile +from ceci.provenance import Provenance, errors, utils +from ceci import __version__ as lib_version +from pprint import pprint +import pytest +import datetime +import os +import random +import string + + +def test_generate(): + p = Provenance() + + # Check that the basic provenance information is generated correctly. 
+ # temporarily change the user name so that we check it's okay + logname = os.environ.get("LOGNAME") + os.environ["LOGNAME"] = "mr_potato" + user_config = { + "xxx": "abc", + "car": "47", + } + + try: + p.generate( + user_config, + ) + finally: + if logname is None: + del os.environ["LOGNAME"] + else: + os.environ["LOGNAME"] = logname + + assert p["base", "user"] == "mr_potato" + + # check our own version is correct, and that of pytest + assert p["versions", "ceci"] == lib_version + assert p["versions", "pytest"] == pytest.__version__ + + # check some basic information + # date and time should be in the last few moments + dt = datetime.datetime.now() - datetime.datetime.fromisoformat( + p["base", "creation"] + ) + assert dt.total_seconds() < 5 + + +def test_inputs(): + p = Provenance() + p.generate() + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, "test1.hdf") + file_id = p.write(fname) + + q = Provenance() + input_files = {"tag1": fname, "tag2": "./non_existent_file.hdf"} + q.generate(input_files=input_files) + + assert file_id == q["input_id", "tag1"] + assert "UNKNOWN" == q["input_id", "tag2"] + + +@pytest.mark.parametrize( + "file_type, opener", [("hdf", utils.open_hdf), ("yml", utils.open_file)] +) +def test_new(file_type, opener): + p = Provenance() + p["sec", "aaa"] = "xxx" + p["sec", "bbb"] = 123 + p["sec", "ccc"] = 3.14 + p["sec", "ddd"] = "cat" + p["sec", "eee"] = 4.14 + + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, f"test.{file_type}") + file_id = p.write(fname) + + assert os.path.exists(fname) + + q = Provenance() + q.read(fname) + + print(q.provenance) + assert q["sec", "aaa"] == "xxx" + assert q["sec", "bbb"] == 123 + assert q["sec", "ccc"] == 3.14 + + # check the direct getter class methods work + assert Provenance.get(fname, "sec", "ddd") == "cat" + assert Provenance.get(fname, "sec", "eee") == 4.14 + + # the file ID will not be in the original as it changes + # but we saved it + 
print(q.provenance) + assert q["base", "file_id"] == file_id + del q["base", "file_id"] + assert p.provenance == q.provenance + + +def test_new_fits(): + file_type = "fits" + p = Provenance() + p["sec", "aaa"] = "xxx" + p["sec", "bbb"] = 123 + p["sec", "ccc"] = 3.14 + p["sec", "DDD"] = "cat" + p["sec", "eee"] = 4.14 + + with tempfile.TemporaryDirectory() as dirname: + fname = os.path.join(dirname, f"test.{file_type}") + + # Write to now-closed + file_id = p.write(fname) + + assert os.path.exists(fname) + + q = Provenance() + q.read(fname) + + assert q["sec", "aaa"] == "xxx" + assert q["sec", "bbb"] == 123 + assert q["sec", "ccc"] == 3.14 + + # check the direct getters work + assert q.get(fname, "sec", "DDD") == "cat" + assert q.get(fname, "sec", "eee") == 4.14 + + assert file_id == q["base", "file_id"] + del q["base", "file_id"] + assert p.provenance == q.provenance + + +def test_fits_multiline(): + file_type = "fits" + p = Provenance() + p["sec", "aaa"] = "Two households;\nboth alike in dignity!" 
+
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, f"test.{file_type}")
+
+        # Write to now-closed
+        p.write(fname)
+
+        assert os.path.exists(fname)
+
+        q = Provenance()
+        q.read(fname)
+
+        assert p["sec", "aaa"] == q["sec", "aaa"]
+
+
+def test_existing_hdf():
+    import h5py
+
+    p = Provenance()
+    p["sec", "aaa"] = "xxx"
+    p["sec", "bbb"] = 123
+    p["sec", "ccc"] = 3.14
+    p["sec", "DDD"] = "cat"
+    p["sec", "eee"] = 4.14
+    p.add_comment("hopefully nothing else will break if I add this")
+
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.hdf")
+        fname2 = os.path.join(dirname, "test2.hdf")
+
+        with utils.open_hdf(fname, "w") as f:
+            f.create_group("cake")
+
+        id1 = p.write(fname)
+        id2 = p.write(fname2)
+
+        assert id1 != id2
+
+        q1 = Provenance()
+        q1.read(fname)
+        q2 = Provenance()
+        q2.read(fname2)
+
+        assert q1["base", "file_id"] == id1
+        assert q2["base", "file_id"] == id2
+
+        # Check full round trip
+        q1.provenance.pop(("base", "file_id"))
+        q2.provenance.pop(("base", "file_id"))
+        assert q1.provenance == q2.provenance
+
+
+def test_yml():
+    import ruamel.yaml as yaml
+
+    y = yaml.YAML()
+    # check nothing is overridden
+    d = {
+        "cat": "good",
+        "dog": "bad",
+        "spoon": 45.6,
+        "cow": -222,
+    }
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.yml")
+        y.dump(d, open(fname, "w"))
+
+        p = Provenance()
+        p.generate()
+        p.add_comment("this is a test to check nothing else breaks")
+        file_id = p.write(fname)
+
+        # check prov reads
+        q = Provenance()
+        q.read(fname)
+
+        assert q["base", "file_id"] == file_id
+        assert q["base", "process_id"] == p["base", "process_id"]
+
+        d2 = y.load(open(fname))
+        d2.pop("provenance")
+        assert d == d2
+
+        q = Provenance()
+        q.read_yaml(open(fname))
+        assert q["base", "file_id"] == file_id
+        assert q["base", "process_id"] == p["base", "process_id"]
+
+
+def test_unknown_file_type():
+    p = Provenance()
+    p.generate()
+    with 
tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.xyz")
+        pname = os.path.join(dirname, "test.xyz.provenance.yaml")
+        file_id = p.write(fname)
+        print(fname, pname)
+        p2 = Provenance()
+        p2.read(fname)
+        assert p2['base', 'file_id'] == file_id
+        assert p2['base', 'process_id'] == p['base', 'process_id']
+
+
+def test_long():
+    p = Provenance()
+    lines = []
+    for i in range(1002):
+        line = "".join(random.choice(string.printable) for i in range(100))
+        lines.append(line)
+    text = "\n".join(lines)
+    p["section", "key"] = text
+
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, "test.fits")
+        with pytest.warns(UserWarning):
+            p.write(fname)
+        q = Provenance()
+        q.read(fname)
+
+
+def test_comments():
+    p = Provenance()
+    p.generate()
+    comments = [
+        "Hello,",
+        "My name is Inigo Montoya",
+        "You killed my father",
+        "Prepare to die",
+    ]
+    for comment in comments:
+        p.add_comment(comment)
+
+    with tempfile.TemporaryDirectory() as dirname:
+        for suffix in ["hdf", "fits", "yml"]:
+            fname = os.path.join(dirname, f"test.{suffix}")
+            p.write(fname)
+            q = Provenance()
+            q.read(fname)
+            for c in comments:
+                assert c in q.comments
+
+
+if __name__ == "__main__":
+    # test_comments()
+    test_long()

From 43af8bf93f95a8e4846182ea2ee90b8bdb65b3f4 Mon Sep 17 00:00:00 2001
From: Joe Zuntz
Date: Mon, 18 Jul 2022 15:04:16 +0100
Subject: [PATCH 6/9] fix output

---
 ceci/stage.py         | 3 +--
 ceci_example/types.py | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ceci/stage.py b/ceci/stage.py
index e3b1652..b00f67e 100644
--- a/ceci/stage.py
+++ b/ceci/stage.py
@@ -1016,8 +1016,7 @@
 
         # Return an opened object representing the file
 
-        obj = output_class(path, "w", **kwargs)
-        obj.provenance = self.provenance
+        obj = output_class(path, "w", provenance=self.provenance, **kwargs)
 
         if wrapper:
             return obj
diff --git a/ceci_example/types.py b/ceci_example/types.py
index 9e46568..9db6ec9 100644
--- 
a/ceci_example/types.py +++ b/ceci_example/types.py @@ -19,6 +19,7 @@ class DataFile(BaseIOHandle): """ def __init__(self, path, mode, validate=True, **kwargs): + super().__init__(kwargs.pop("provenance")) self.path = path self.mode = mode From 70603fa4786a7fda262cc1b56b249ba5c262cac0 Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 15:19:26 +0100 Subject: [PATCH 7/9] Fix import overwrite --- ceci/__init__.py | 1 + ceci/handle.py | 20 ++++++++++++++++++++ ceci_example/types.py | 2 +- 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 ceci/handle.py diff --git a/ceci/__init__.py b/ceci/__init__.py index 3780600..2d81efc 100644 --- a/ceci/__init__.py +++ b/ceci/__init__.py @@ -1,5 +1,6 @@ """Ceci n'est pas une pipeline""" +from .provenance import Provenance from .stage import PipelineStage from .pipeline import Pipeline, MiniPipeline, ParslPipeline, DryRunPipeline from .handle import BaseIOHandle diff --git a/ceci/handle.py b/ceci/handle.py new file mode 100644 index 0000000..8f2f7d5 --- /dev/null +++ b/ceci/handle.py @@ -0,0 +1,20 @@ +import copy + +class BaseIOHandle: + """ + Base class for input and output objects. For now this is pretty empty. 
+ """ + suffix = "" + + def __init__(self, provenance): + # If provenance is provided then pull that out + self.provenance = provenance + + @property + def provenance(self): + return self._provenance + + @provenance.setter + def provenance(self, provenance): + # always copy the provenance + self._provenance = copy.deepcopy(provenance) diff --git a/ceci_example/types.py b/ceci_example/types.py index 9db6ec9..3cefb8e 100644 --- a/ceci_example/types.py +++ b/ceci_example/types.py @@ -19,7 +19,7 @@ class DataFile(BaseIOHandle): """ def __init__(self, path, mode, validate=True, **kwargs): - super().__init__(kwargs.pop("provenance")) + super().__init__(kwargs.pop("provenance", None)) self.path = path self.mode = mode From f2a6dde3aff759c5f4ade7eb8ca93078b24ae3bd Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 15:25:16 +0100 Subject: [PATCH 8/9] fix setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ad40e43..39418ff 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ 'Programming Language :: Python :: 3.8', 'Development Status :: 3 - Alpha', ], - packages=['ceci', 'ceci.sites'], + packages=['ceci', 'ceci.sites', 'ceci.provenance'], entry_points={ 'console_scripts':['ceci=ceci.main:main'] }, From 2806b1d0e0f24ea429bf8fc335f5a174bf7feb2d Mon Sep 17 00:00:00 2001 From: Joe Zuntz Date: Mon, 18 Jul 2022 15:34:28 +0100 Subject: [PATCH 9/9] add more test reqs --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 39418ff..94be81a 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,6 @@ extras_require={ 'parsl': ['flask', 'parsl>=1.0.0'], 'cwl': ['cwlgen>=0.4', 'cwltool>=2.0.20200126090152'], - 'test': ['pytest', 'codecov', 'pytest-cov'], + 'test': ['pytest', 'codecov', 'pytest-cov', 'fitsio', 'h5py', "ruamel.yaml"], } )