diff --git a/.gitignore b/.gitignore
index bee8a64..5542df6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
 __pycache__
+# Ignore the extracted ISO 639-3 code tables directory and its zip archive.
+# These files are likely large data resources not needed in version control.
+iso-639-3_Code_Tables_20240415/
+iso-639-3_Code_Tables_20240415.zip
+
diff --git a/Makefile b/Makefile
index d296424..b6bfe9e 100644
--- a/Makefile
+++ b/Makefile
@@ -4,9 +4,13 @@ init:
 get-table:
 	# https://iso639-3.sil.org/code_tables/download_tables
 	wget https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_20240415.zip
+	unzip -o iso-639-3_Code_Tables_20240415.zip
 
 combine-wikipedia:
 	cat wikipedia_languages.csv wikipedia_languages_extra.csv > wikipedia_languages_all.csv
 
 generate:
-	python generate.py
+	python3 generate.py
+
+validate:
+	python3 validate_output.py
diff --git a/requirements.txt b/requirements.txt
index dffbfe9..c7f9a31 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ beautifulsoup4
 jinja2
 lxml
 pyarrow
+pytest
\ No newline at end of file
diff --git a/test_validate_output.py b/test_validate_output.py
new file mode 100644
index 0000000..25b169e
--- /dev/null
+++ b/test_validate_output.py
@@ -0,0 +1,20 @@
+from validate_output import validate_markdown_file
+import os
+
+def test_missing_title(tmp_path):
+    test_file = tmp_path / "test1.md"
+    test_file.write_text("Summary: This is a test.")
+    errors = validate_markdown_file(str(test_file))
+    assert "Missing title" in errors
+
+def test_missing_summary(tmp_path):
+    test_file = tmp_path / "test2.md"
+    test_file.write_text("# Test Title")
+    errors = validate_markdown_file(str(test_file))
+    assert "Missing summary section" in errors
+
+def test_valid_markdown(tmp_path):
+    test_file = tmp_path / "test3.md"
+    test_file.write_text("# Test Title\nSummary: All good.")
+    errors = validate_markdown_file(str(test_file))
+    assert not errors
\ No newline at end of file
diff --git 
"""Check generated Markdown files for a title line and a summary marker.

Run as a script with an optional base directory argument; every immediate
subdirectory of that base directory is scanned for ``.md`` files.
"""
import os
import re
import sys
from typing import List

# A title is any line that begins with "# " (hash followed by a space).
_TITLE_PATTERN = re.compile(r'^# ', re.MULTILINE)

# Base directory used when no command-line argument is given.
_DEFAULT_BASE_DIR = "../web-languages"


def validate_markdown_file(filepath: str) -> List[str]:
    """Return the list of structural problems found in one Markdown file.

    Args:
        filepath: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list of messages, empty when the file passes. ``"Missing title"``
        is reported when no line starts with ``"# "``; ``"Missing summary
        section"`` when the text ``"Summary:"`` occurs nowhere in the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    errors = []
    if not _TITLE_PATTERN.search(content):
        errors.append("Missing title")
    if "Summary:" not in content:
        errors.append("Missing summary section")
    return errors


def validate_directory(output_dir: str) -> bool:
    """Validate every ``.md`` file directly inside *output_dir*.

    Problems are printed to stdout, grouped under the path of the file they
    belong to. Entries are visited in sorted order so the report is
    reproducible from run to run.

    Args:
        output_dir: Directory whose immediate entries are checked.

    Returns:
        ``True`` if at least one file had a problem, ``False`` otherwise.
    """
    failed = False
    for fname in sorted(os.listdir(output_dir)):
        if not fname.endswith('.md'):
            continue
        path = os.path.join(output_dir, fname)
        # A directory can also be named "*.md"; opening it would raise and
        # abort the whole run, so only regular files are validated.
        if not os.path.isfile(path):
            continue
        try:
            errors = validate_markdown_file(path)
        except (OSError, UnicodeDecodeError) as exc:
            # Report an unreadable file as a failure instead of crashing, so
            # the remaining files are still checked.
            errors = [f"Unreadable file: {exc}"]
        if errors:
            failed = True
            print(f"{path}:")
            for err in errors:
                print(f"  - {err}")
    return failed


def main() -> None:
    """Validate all subdirectories of the base directory and set the exit code.

    The base directory is ``sys.argv[1]`` when given, otherwise
    ``"../web-languages"``. The process exits with status 1 when the base
    directory is not a directory or when any file fails validation.
    """
    base_dir = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_BASE_DIR
    if not os.path.isdir(base_dir):
        # Diagnostics belong on stderr so they are not mixed into the report.
        print(f"Error: Directory '{base_dir}' does not exist.", file=sys.stderr)
        sys.exit(1)

    failed = False
    # Only subdirectories are scanned; files directly in base_dir are ignored.
    for subdir in sorted(os.listdir(base_dir)):
        full_path = os.path.join(base_dir, subdir)
        if os.path.isdir(full_path) and validate_directory(full_path):
            failed = True

    if failed:
        sys.exit(1)


if __name__ == "__main__":
    main()