diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile
new file mode 100644
index 0000000..bd074af
--- /dev/null
+++ b/.clusterfuzzlite/Dockerfile
@@ -0,0 +1,8 @@
+# ClusterFuzzLite build image for aemo-mdff-reader.
+# Uses the OSS-Fuzz Python base image, which provides atheris and
+# the compile_python_fuzzer helper.
+FROM gcr.io/oss-fuzz-base/base-builder-python
+
+COPY . $SRC/aemo-mdff-reader
+WORKDIR $SRC/aemo-mdff-reader
+COPY .clusterfuzzlite/build.sh $SRC/build.sh
diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh
new file mode 100755
index 0000000..4795447
--- /dev/null
+++ b/.clusterfuzzlite/build.sh
@@ -0,0 +1,10 @@
+#!/bin/bash -eu
+# ClusterFuzzLite build script — installs the package and compiles each
+# atheris harness in fuzz/ via OSS-Fuzz's compile_python_fuzzer helper.
+
+cd "$SRC/aemo-mdff-reader"
+pip3 install --no-cache-dir .
+
+for fuzzer in fuzz/fuzz_*.py; do
+  compile_python_fuzzer "$fuzzer"
+done
diff --git a/.clusterfuzzlite/project.yaml b/.clusterfuzzlite/project.yaml
new file mode 100644
index 0000000..d1ad0ae
--- /dev/null
+++ b/.clusterfuzzlite/project.yaml
@@ -0,0 +1 @@
+language: python
diff --git a/.github/workflows/cflite_batch.yml b/.github/workflows/cflite_batch.yml
new file mode 100644
index 0000000..dd01d0d
--- /dev/null
+++ b/.github/workflows/cflite_batch.yml
@@ -0,0 +1,39 @@
+name: ClusterFuzzLite scheduled batch fuzz
+
+# Longer scheduled fuzz session that grows the persistent corpus and
+# crash storage in the gh-pages branch. Runs one matrix job per
+# sanitizer, each for ``fuzz-seconds``. Storage requires a ``gh-pages``
+# branch; the action creates it on first run.
+
+on:
+  schedule:
+    # Sundays at 02:00 UTC — quiet window, off-cycle from CodeQL/Scorecard.
+    - cron: "0 2 * * 0"
+  workflow_dispatch:
+
+permissions: read-all
+
+jobs:
+  batch-fuzz:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions:
+      # cflite needs write access to gh-pages for corpus + crash storage.
+      contents: write
+    strategy:
+      fail-fast: false
+      matrix:
+        sanitizer: [address, undefined]
+    steps:
+      - name: Build fuzzers (${{ matrix.sanitizer }})
+        uses: google/clusterfuzzlite/actions/build_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          language: python
+          sanitizer: ${{ matrix.sanitizer }}
+      - name: Run fuzzers (${{ matrix.sanitizer }})
+        uses: google/clusterfuzzlite/actions/run_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          language: python
+          fuzz-seconds: 1800
+          mode: batch
+          sanitizer: ${{ matrix.sanitizer }}
diff --git a/.github/workflows/cflite_pr.yml b/.github/workflows/cflite_pr.yml
new file mode 100644
index 0000000..a72fed8
--- /dev/null
+++ b/.github/workflows/cflite_pr.yml
@@ -0,0 +1,48 @@
+name: ClusterFuzzLite PR fuzz
+
+# Per-PR fuzz: smoke-test the build and run a quick (30s) crash search
+# only on PRs whose changes actually reach the parser. The longer
+# corpus-extending pass + the second sanitizer live in cflite_batch.yml
+# so PRs aren't held up by fuzzing.
+#
+# `paths` is enumerated explicitly (not `aemo_mdff_reader/**`) to skip
+# fuzzing on cli / aggregate / reader / sql changes — none of which
+# the harnesses exercise.
+
+on:
+  pull_request:
+    # The job below skips drafts, so `ready_for_review` must be listed:
+    # the default types would not fuzz a PR again until its next push.
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - "aemo_mdff_reader/__init__.py"
+      - "aemo_mdff_reader/parser.py"
+      - "aemo_mdff_reader/types.py"
+      - "aemo_mdff_reader/spec.py"
+      - "fuzz/**"
+      - ".clusterfuzzlite/**"
+      - ".github/workflows/cflite_pr.yml"
+
+permissions: read-all
+
+jobs:
+  fuzz:
+    name: fuzz (address, 30s)
+    # Skip draft PRs — fuzz on the final form, not the in-progress one.
+    if: github.event.pull_request.draft == false
+    runs-on: ubuntu-latest
+    timeout-minutes: 6
+    steps:
+      - name: Build fuzzers
+        uses: google/clusterfuzzlite/actions/build_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          language: python
+          sanitizer: address
+      - name: Run fuzzers (30s smoke)
+        uses: google/clusterfuzzlite/actions/run_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          language: python
+          fuzz-seconds: 30
+          mode: code-change
+          sanitizer: address
+          output-sarif: true
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 7ef4602..42aa884 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,11 +53,21 @@ jobs:
           /tmp/smoke/bin/python -c "import aemo_mdff_reader as m; print(m.__version__)"
           /tmp/smoke/bin/aemo-mdff-reader --version
       - name: Generate build provenance attestation
+        id: provenance
         uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0
         with:
           subject-path: |
             dist/*.whl
             dist/*.tar.gz
+      # Stage the provenance bundle as a file alongside the release so
+      # OpenSSF Scorecard's signed-releases check (which scans release
+      # assets, not GitHub's attestations API) sees an in-toto provenance
+      # artefact and awards full marks.
+      - name: Stage provenance bundle for the release
+        run: |
+          mkdir -p provenance
+          cp "${{ steps.provenance.outputs.bundle-path }}" "provenance/aemo_mdff_reader.intoto.jsonl"
+          ls -la provenance/
       # SBOM is written outside dist/ so the publish job's PyPI upload
       # (which only accepts .whl/.tar.gz) is not contaminated. anchore's
       # sbom-action does not auto-create the parent directory of
@@ -86,6 +96,10 @@ jobs:
         with:
           name: sbom
           path: sbom/
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        with:
+          name: provenance
+          path: provenance/
 
   publish:
     name: Publish to PyPI
@@ -150,6 +164,10 @@ jobs:
         with:
           name: sbom
           path: sbom/
+      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          name: provenance
+          path: provenance/
       - name: Create GitHub Release with notes from CHANGELOG
         uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3
         with:
@@ -159,3 +177,4 @@ jobs:
             dist/*.whl
             signatures/*
             sbom/*
+            provenance/*
diff --git a/fuzz/fuzz_parse.py b/fuzz/fuzz_parse.py
new file mode 100644
index 0000000..fc04d39
--- /dev/null
+++ b/fuzz/fuzz_parse.py
@@ -0,0 +1,47 @@
+"""Fuzz the NEM12 streaming parser entry point.
+
+Run locally:
+    pip install atheris
+    python fuzz/fuzz_parse.py -atheris_runs=10000
+
+Run in OSS-Fuzz / ClusterFuzzLite: this file is built by
+.clusterfuzzlite/build.sh.
+"""
+
+from __future__ import annotations
+
+import io
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from aemo_mdff_reader import parse
+
+
+def TestOneInput(data: bytes) -> None:
+    # Python is memory-safe, so coverage-guided fuzzing of a pure-Python
+    # parser is hunting for hangs, infinite loops, and pathological
+    # memory growth — not crashes. Any ordinary exception raised by the
+    # parser on malformed input is by definition an expected rejection,
+    # so we swallow them broadly. SystemExit / KeyboardInterrupt
+    # deliberately propagate.
+    try:
+        text = data.decode("utf-8", errors="replace")
+        for _ in parse(io.StringIO(text)):
+            pass
+    except (MemoryError, RecursionError):
+        # Both subclass Exception, yet they are the resource-exhaustion
+        # findings this harness exists to surface — let atheris see them.
+        raise
+    except Exception:  # see comment above.
+        return
+
+
+def main() -> None:
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzz/fuzz_parse_accumulations.py b/fuzz/fuzz_parse_accumulations.py
new file mode 100644
index 0000000..9c276f0
--- /dev/null
+++ b/fuzz/fuzz_parse_accumulations.py
@@ -0,0 +1,34 @@
+"""Fuzz the NEM13 (accumulation) parser."""
+
+from __future__ import annotations
+
+import io
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from aemo_mdff_reader import parse_accumulations
+
+
+def TestOneInput(data: bytes) -> None:
+    # See fuzz_parse.py — broad except is intentional for a pure-Python
+    # memory-safe target. We're hunting for hangs / pathological growth,
+    # so MemoryError / RecursionError are re-raised rather than swallowed.
+    try:
+        text = data.decode("utf-8", errors="replace")
+        for _ in parse_accumulations(io.StringIO(text)):
+            pass
+    except (MemoryError, RecursionError):
+        raise
+    except Exception:
+        return
+
+
+def main() -> None:
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzz/fuzz_parse_to_columns.py b/fuzz/fuzz_parse_to_columns.py
new file mode 100644
index 0000000..1fabce7
--- /dev/null
+++ b/fuzz/fuzz_parse_to_columns.py
@@ -0,0 +1,33 @@
+"""Fuzz the columnar fast-path build."""
+
+from __future__ import annotations
+
+import io
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from aemo_mdff_reader import parse_to_columns
+
+
+def TestOneInput(data: bytes) -> None:
+    # See fuzz_parse.py — broad except is intentional for a pure-Python
+    # memory-safe target. We're hunting for hangs / pathological growth,
+    # so MemoryError / RecursionError are re-raised rather than swallowed.
+    try:
+        text = data.decode("utf-8", errors="replace")
+        parse_to_columns(io.StringIO(text))
+    except (MemoryError, RecursionError):
+        raise
+    except Exception:
+        return
+
+
+def main() -> None:
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()