diff --git a/scripts/config.json b/scripts/config.json
new file mode 100644
index 0000000..d4e37ef
--- /dev/null
+++ b/scripts/config.json
@@ -0,0 +1,21 @@
+{
+  "version": "0.1",
+  "scripts": [
+    {
+      "documentation": {
+        "command": "search-diff",
+        "description": "Search the data diff in the delta messages. Parameters:\n diff: default /project/data/diff_file\n delta: default /project/data/delta_file.json\n write: default yes\n insert_out_file: default /project/data/missing_inserts.nq\n delete_out_file: default /project/data/missing_deletes.nq",
+        "arguments": ["diff", "delta", "write", "insert_out_file", "delete_out_file"]
+      },
+      "environment": {
+        "image": "python:3.10",
+        "interactive": false,
+        "script": "delta-notifier/search-diff.sh",
+        "join_networks": true
+      },
+      "mounts": {
+        "app": "/project/"
+      }
+    }
+  ]
+}
diff --git a/scripts/delta-notifier/search-diff.py b/scripts/delta-notifier/search-diff.py
new file mode 100644
index 0000000..fc0bbcd
--- /dev/null
+++ b/scripts/delta-notifier/search-diff.py
@@ -0,0 +1,116 @@
+"""Check that a data diff is fully covered by a list of delta messages.
+
+Every statement of LEFT_DIFF is expected among the delta inserts and every
+statement of RIGHT_DIFF among the delta deletes.  Statements without a
+matching delta entry are counted, optionally written to N-Quads files, and
+make the script exit with status 1.
+"""
+import argparse
+import json
+import sys
+
+from rdflib import Graph, Dataset
+
+# Values of --write (compared case-insensitively) that enable the output files.
+TRUTHY = ('true', '1', 't', 'y', 'yes', 'yeah', 'yup')
+DEFAULT_INSERTS_FILE = 'missing_inserts.nq'
+DEFAULT_DELETES_FILE = 'missing_deletes.nq'
+
+
+def parse_args(argv=None):
+    """Parse the command line; *argv* defaults to ``sys.argv[1:]``."""
+    parser = argparse.ArgumentParser()
+    # No ``nargs=1``: it wraps every value in a list, and indexing an unset
+    # option's default (``True`` / ``None``) with ``[0]`` raised a TypeError.
+    parser.add_argument('left_diff', type=str)
+    parser.add_argument('right_diff', type=str)
+    parser.add_argument('delta', type=str)
+    parser.add_argument('--write', dest='write', type=str, default='yes')
+    parser.add_argument('--missing-inserts-file', type=str, default=None)
+    parser.add_argument('--missing-deletes-file', type=str, default=None)
+    return parser.parse_args(argv)
+
+
+def as_triple(statement):
+    """Return the first three terms of *statement* as a tuple of strings."""
+    return tuple(map(str, statement[:3]))
+
+
+def load_dataset(path):
+    """Parse the RDF file at *path* into a new rdflib ``Dataset``."""
+    dataset = Dataset()
+    dataset.parse(path)
+    return dataset
+
+
+def delta_triple(entry):
+    """Return the subject, predicate and object values of a delta entry."""
+    return (entry['subject']['value'], entry['predicate']['value'],
+            entry['object']['value'])
+
+
+def load_delta(path):
+    """Return the insert and delete triple sets of the JSON file at *path*."""
+    with open(path, encoding='utf-8') as delta_file:
+        messages = json.load(delta_file)
+    inserts = set()
+    deletes = set()
+    for message in messages:
+        inserts.update(map(delta_triple, message['inserts']))
+        deletes.update(map(delta_triple, message['deletes']))
+    return inserts, deletes
+
+
+def write_missing(dataset, missing, path):
+    """Write the statements of *dataset* listed in *missing* as N-Quads."""
+    output = Dataset()
+    # NOTE(review): statements are handed to addN exactly as the dataset
+    # yields them; confirm this also holds for default-graph statements
+    # with the installed rdflib version.
+    output.addN(statement for statement in dataset
+                if as_triple(statement) in missing)
+    output.serialize(path, format="nquads")
+
+
+def main(argv=None):
+    """Run the comparison and return the process exit status (0 or 1)."""
+    args = parse_args(argv)
+    write_output = args.write.lower() in TRUTHY
+    inserts_output_file = args.missing_inserts_file
+    deletes_output_file = args.missing_deletes_file
+    if write_output and (inserts_output_file is None
+                         or deletes_output_file is None):
+        # Fill in only what is missing; keep a file that was given.
+        inserts_output_file = inserts_output_file or DEFAULT_INSERTS_FILE
+        deletes_output_file = deletes_output_file or DEFAULT_DELETES_FILE
+        print(
+            f"Warning: output file(s) not specified, using {inserts_output_file} and {deletes_output_file}"
+        )
+
+    left = load_dataset(args.left_diff)
+    right = load_dataset(args.right_diff)
+    delta_inserts, delta_deletes = load_delta(args.delta)
+
+    expected_inserts = set(map(as_triple, left))
+    found_inserts = expected_inserts & delta_inserts
+    missing_inserts = expected_inserts - found_inserts
+    print(f"Number of diffs found in the inserts: {len(found_inserts)}")
+    print(f"Total number of missing inserts: {len(missing_inserts)}")
+
+    expected_deletes = set(map(as_triple, right))
+    found_deletes = expected_deletes & delta_deletes
+    missing_deletes = expected_deletes - found_deletes
+    print(f"Number of diffs found in the deletes: {len(found_deletes)}")
+    print(f"Total number of missing deletes: {len(missing_deletes)}")
+
+    if write_output:
+        write_missing(left, missing_inserts, inserts_output_file)
+        print(f"Writing missing inserts to {inserts_output_file}")
+        write_missing(right, missing_deletes, deletes_output_file)
+        print(f"Writing missing deletes to {deletes_output_file}")
+
+    return 1 if missing_inserts or missing_deletes else 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/scripts/delta-notifier/search-diff.sh b/scripts/delta-notifier/search-diff.sh
new file mode 100755
index 0000000..163ea96
--- /dev/null
+++ b/scripts/delta-notifier/search-diff.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Split a diff file into its two sides and check them against the delta
+# messages with search-diff.py.
+#
+# Positional arguments (all optional):
+#   1: diff file                 (default /project/data/diff_file)
+#   2: delta messages file       (default /project/data/delta_file.json)
+#   3: write the missing triples (default yes)
+#   4: missing inserts file      (default /project/data/missing_inserts.nq)
+#   5: missing deletes file      (default /project/data/missing_deletes.nq)
+DIFF_FILE=${1:-/project/data/diff_file}
+DELTA_FILE=${2:-/project/data/delta_file.json}
+WRITE_OUTPUT=${3:-yes}
+INSERT_OUTPUT=${4:-/project/data/missing_inserts.nq}
+# Fifth argument: reading $4 here made both outputs share one file.
+DELETE_OUTPUT=${5:-/project/data/missing_deletes.nq}
+# Locate search-diff.py next to this script, whatever the current directory.
+SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
+left_file=$(mktemp /tmp/diff.XXXXXXXXXX.nq)
+right_file=$(mktemp /tmp/diff.XXXXXXXXXX.nq)
+# Remove the temporary files when the script exits.
+trap 'rm -f "$left_file" "$right_file"' EXIT
+# "> " lines feed the insert check and "< " lines the delete check; cut
+# strips the two-character marker.
+grep -E "^(>) " "$DIFF_FILE" | cut -c 3- > "$left_file"
+grep -E "^(<) " "$DIFF_FILE" | cut -c 3- > "$right_file"
+
+# Output is discarded, so an installation failure only shows up when the
+# Python script fails to import rdflib.
+pip install rdflib &> /dev/null
+
+python3 "$SCRIPT_DIR/search-diff.py" "$left_file" "$right_file" "$DELTA_FILE" \
+    --write "$WRITE_OUTPUT" \
+    --missing-inserts-file "$INSERT_OUTPUT" \
+    --missing-deletes-file "$DELETE_OUTPUT"