-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathvalidate_data.py
More file actions
177 lines (142 loc) Β· 4.85 KB
/
validate_data.py
File metadata and controls
177 lines (142 loc) Β· 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python3
"""
Validate data against schemas (JSON Schema, Pydantic models).
Usage:
python validate_data.py <data_file> [--schema SCHEMA] [--infer] [--strict]
"""
import sys
from pathlib import Path
try:
import codomyrmex
except ImportError:
project_root = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(project_root / "src"))
import argparse
import json
def validate_json_structure(data: dict) -> list:
    """Walk a parsed JSON value and report structurally suspicious nodes.

    Although annotated as ``dict``, any parsed JSON value is accepted (object,
    array, scalar or ``None``).

    Args:
        data: The parsed JSON document to inspect.

    Returns:
        A list of ``"<location>: <problem>"`` strings, in depth-first document
        order, one for every empty object, empty array and null value found.
        Locations use dotted keys and ``[i]`` indices; the top-level value is
        always reported as ``root``.
    """
    issues = []

    def check_node(node, path=""):
        # An empty path means we are at the document root; label it the same
        # way for every kind of problem so no message starts with a bare ":".
        label = path or "root"
        if isinstance(node, dict):
            if not node:
                issues.append(f"{label}: Empty object")
            for key, value in node.items():
                check_node(value, f"{path}.{key}" if path else key)
        elif isinstance(node, list):
            if not node:
                issues.append(f"{label}: Empty array")
            for index, item in enumerate(node):
                check_node(item, f"{path}[{index}]")
        elif node is None:
            issues.append(f"{label}: Null value")

    check_node(data)
    return issues
def validate_against_schema(data: dict, schema: dict) -> list:
    """Validate ``data`` against a JSON Schema using the Draft 7 validator.

    Args:
        data: The parsed JSON document to validate.
        schema: The parsed JSON Schema to validate against.

    Returns:
        A list of ``"<location>: <message>"`` strings, one per validation
        error (empty when the document conforms). Locations are the dotted
        path to the offending value, or ``root`` for errors on the document
        itself. If the schema is itself invalid, or the optional ``jsonschema``
        package is missing, a single explanatory message is returned instead.
    """
    # Only the import is guarded: an ImportError raised later, during
    # validation, must not be misreported as "jsonschema not installed".
    try:
        import jsonschema
    except ImportError:
        return ["jsonschema not installed - run: pip install jsonschema"]

    # Report a malformed schema as an issue instead of letting it surface as
    # an obscure failure in the middle of validation.
    try:
        jsonschema.Draft7Validator.check_schema(schema)
    except jsonschema.SchemaError as err:
        return [f"Invalid schema: {err.message}"]

    validator = jsonschema.Draft7Validator(schema)
    messages = []
    for error in validator.iter_errors(data):
        # error.path is empty for errors on the top-level value.
        location = ".".join(str(part) for part in error.path) or "root"
        messages.append(f"{location}: {error.message}")
    return messages
def infer_types(data: dict, path: str = "") -> dict:
    """Describe the type layout of a parsed JSON value.

    Objects become dicts mapping each key to the layout of its value. Arrays
    become a one-element list holding the layout of their first item (or
    ``["unknown"]`` when empty). Scalars become one of the names ``"string"``,
    ``"boolean"``, ``"integer"``, ``"number"`` or ``"null"``; anything else is
    ``"unknown"``.

    Args:
        data: The value to describe.
        path: Location of ``data`` within the document; passed down the
            recursion but not included in the result.

    Returns:
        The nested type description.
    """
    if isinstance(data, dict):
        layout = {}
        for key, value in data.items():
            layout[key] = infer_types(value, f"{path}.{key}")
        return layout

    if isinstance(data, list):
        # Only the first element is sampled to describe the array.
        return [infer_types(data[0], f"{path}[]")] if data else ["unknown"]

    if data is None:
        return "null"

    # Order matters: bool is a subclass of int, so it must be tested first.
    scalar_names = (
        (str, "string"),
        (bool, "boolean"),
        (int, "integer"),
        (float, "number"),
    )
    for scalar_type, type_name in scalar_names:
        if isinstance(data, scalar_type):
            return type_name
    return "unknown"
def _load_validation_config():
    """Best-effort load of ``config/validation/config.yaml``.

    Returns the parsed mapping, or ``{}`` when the file does not exist or the
    optional PyYAML package is not installed. A YAML parse error propagates.
    """
    config_path = (
        Path(__file__).resolve().parent.parent.parent
        / "config"
        / "validation"
        / "config.yaml"
    )
    if not config_path.exists():
        return {}
    try:
        import yaml
    except ImportError:
        # PyYAML is only needed for this optional file; validating JSON must
        # keep working without it.
        return {}
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    print("Loaded config from config/validation/config.yaml")
    return config


def _read_json(path):
    """Parse the file at ``path`` as UTF-8 encoded JSON and return the value."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """Command-line entry point for the data validator.

    Parses ``sys.argv``, loads the JSON data file, then either prints the
    inferred type structure (``--infer``) or runs the structural check plus
    optional JSON Schema validation (``--schema``) and prints the findings.

    Returns:
        Process exit status: ``0`` on success (including when called without
        a data file, which prints usage), ``1`` when a file is missing or not
        valid JSON, when schema validation reports errors, or when
        ``--strict`` is set and structural warnings were found.
    """
    # The loaded settings are not consumed by anything below; the call only
    # reports that the config file is present and parseable.
    _load_validation_config()

    parser = argparse.ArgumentParser(description="Validate data files")
    parser.add_argument("data_file", nargs="?", help="JSON data file to validate")
    parser.add_argument("--schema", "-s", default=None, help="JSON Schema file")
    parser.add_argument(
        "--infer", "-i", action="store_true", help="Infer and show type structure"
    )
    parser.add_argument("--strict", action="store_true", help="Fail on warnings")
    args = parser.parse_args()

    if not args.data_file:
        # NOTE(review): the leading "π" here and on the "Validating" and
        # "Inferred type structure" lines below looks like the remnant of a
        # mis-decoded 4-byte emoji. The original glyph cannot be recovered
        # from this file, so it is left untouched - confirm against upstream.
        print("π Data Validator\n")
        print("Usage:")
        print(" python validate_data.py data.json")
        print(" python validate_data.py data.json --schema schema.json")
        print(" python validate_data.py data.json --infer")
        return 0

    data_path = Path(args.data_file)
    # is_file() rather than exists(): a directory must not reach open().
    if not data_path.is_file():
        print(f"❌ File not found: {args.data_file}")
        return 1
    try:
        data = _read_json(data_path)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"❌ Invalid JSON: {e}")
        return 1

    print(f"π Validating: {data_path.name}\n")

    if args.infer:
        types = infer_types(data)
        print("π Inferred type structure:")
        print(json.dumps(types, indent=2))
        return 0

    # Load the schema before running any checks so that a missing or broken
    # schema file is reported cleanly instead of ending in a traceback.
    schema = None
    if args.schema:
        schema_path = Path(args.schema)
        if not schema_path.is_file():
            print(f"❌ Schema not found: {args.schema}")
            return 1
        try:
            schema = _read_json(schema_path)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"❌ Invalid schema JSON: {e}")
            return 1

    # Structural findings are warnings; schema violations are hard errors.
    warnings = list(validate_json_structure(data))
    issues = []
    if args.schema:
        issues.extend(validate_against_schema(data, schema))

    if issues:
        print(f"❌ Validation errors ({len(issues)}):")
        for issue in issues[:20]:  # cap the listing at 20 entries
            print(f" • {issue}")
        return 1

    if warnings:
        print(f"⚠️ Warnings ({len(warnings)}):")
        for w in warnings[:10]:  # cap the listing at 10 entries
            print(f" • {w}")
        if args.strict:
            return 1

    print("✅ Validation passed")
    # Summary statistics for container documents.
    if isinstance(data, dict):
        print(f" Keys: {len(data)}")
    elif isinstance(data, list):
        print(f" Items: {len(data)}")
    return 0
# Script entry point: run the validator and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)