Changes from all commits (26 commits)
030b089 Consolidating field structures of objs (jreakin, Jan 14, 2025)
20f8666 changed dependency manager from Poetry to UV (jreakin, Jan 15, 2025)
d5fb1a9 Removed files now in .gitignore (jreakin, Jan 15, 2025)
3a4a249 Cleaning up TX Filer Relationships TEXAS-927 (jreakin, Jan 15, 2025)
56d08e2 Added TODO notes under texas_filers.py TEXAS-927 (jreakin, Jan 15, 2025)
1abce3b Attempt to fix LFS issues (jreakin, Jan 17, 2025)
26dcf83 Initial commit on jan25-polars (jreakin, Jan 18, 2025)
6738336 Recreating jan25-update-branch from 1abce3b (jreakin, Jan 18, 2025)
7062227 Made search query changes to texas_search.py (jreakin, Jan 18, 2025)
960a7ed Modified file formatting (jreakin, Jan 18, 2025)
e1384aa Moved webdriver tools to their own package (jreakin, Jan 21, 2025)
9f69a1c Track parquet files with Git LFS (jreakin, Jan 23, 2025)
4912a06 updated gitignore (jreakin, Feb 12, 2025)
a048755 Normalize state and file origin references (jreakin, Nov 9, 2025)
2b78c27 Update funcs, states modules and remove tracked parquet files (jreakin, Dec 4, 2025)
1fb1a7c Fix import paths across codebase to use app. prefix (jreakin, Dec 4, 2025)
8037b27 Fix consolidate_files schema mismatch and flaky test (jreakin, Dec 4, 2025)
ff9a994 Fix campaign name corruption bug in _get_field_value (jreakin, Dec 4, 2025)
fcb46ba Add CI workflow with CodeCov integration (jreakin, Dec 4, 2025)
26ea85c Fix CodeCov slug to campaignfinance-2023 (jreakin, Dec 4, 2025)
0fbdcc6 Add JUnit XML output and test results upload to Codecov (jreakin, Dec 4, 2025)
606875a Fix CI workflow: use uv sync and uv run for proper environment setup (jreakin, Dec 4, 2025)
e6fc28a Add pytest-xdist for parallel test execution (jreakin, Dec 4, 2025)
fd80b6d Run only app/tests/ and remove parallel overhead for small test suite (jreakin, Dec 4, 2025)
95b4304 Fix CI: install uv via pip for better compatibility (jreakin, Dec 4, 2025)
eaedb1d Fix CI: add PYTHONPATH for app module imports (jreakin, Dec 4, 2025)
Binary file removed .DS_Store
Binary file not shown.
3 changes: 1 addition & 2 deletions .gitattributes
@@ -1,2 +1 @@
*.zip filter=lfs diff=lfs merge=lfs -text
*.csv filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
41 changes: 41 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,41 @@
name: CI

on:
push:
pull_request:
Comment on lines +3 to +5
🧹 Nitpick | 🔵 Trivial

Consider explicit trigger configuration for YAML linting.

The push: and pull_request: events without explicit configuration are valid GitHub Actions syntax and will trigger on all push/PR events. However, YAMLlint prefers explicit values. This is a minor style note—the current syntax works fine, but if you want to satisfy stricter linting, consider using explicit empty dicts or null:

 on:
-  push:
-  pull_request:
+  push: {}
+  pull_request: {}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
on:
push:
pull_request:
on:
push: {}
pull_request: {}
🧰 Tools
🪛 YAMLlint (1.37.1)

[warning] 3-3: truthy value should be one of [false, true]

(truthy)

🤖 Prompt for AI Agents
.github/workflows/ci.yml lines 3-5: the workflow triggers use bare "push:" and
"pull_request:" which are valid but fail stricter YAML lint rules; change each
to an explicit empty mapping or null (e.g., "push: {}" and "pull_request: {}" or
"push: null" and "pull_request: null") so the YAML linter accepts the syntax
while preserving the same trigger behavior.


jobs:
test:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"

- name: Install uv and dependencies
run: |
pip install uv
uv sync
uv pip install pytest-cov pytest-xdist

- name: Run tests with coverage
run: uv run pytest app/tests/ --cov --cov-branch --cov-report=xml --junitxml=junit.xml -o junit_family=legacy
env:
PYTHONPATH: .

- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: Abstract-Data/campaignfinance-2023

- name: Upload test results to Codecov
if: ${{ !cancelled() }}
uses: codecov/test-results-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}

🧹 Nitpick | 🔵 Trivial

Remove trailing blank line.

YAMLlint flags the trailing blank line after line 34. Apply this cleanup:

          slug: Abstract-Data/campaignfinance-2023
-
📝 Committable suggestion


Suggested change
slug: Abstract-Data/campaignfinance-2023
🧰 Tools
🪛 YAMLlint (1.37.1)

[error] 35-35: too many blank lines (1 > 0)

(empty-lines)

🤖 Prompt for AI Agents
In .github/workflows/ci.yml around line 35, there is a trailing blank line
flagged by YAMLlint; remove the empty line at the end of the file so the file
ends on line 34 without an extra newline-only line, then save the file (ensure
the file still ends with a single newline character if your editor enforces it).

4 changes: 4 additions & 0 deletions .gitignore
@@ -4,8 +4,10 @@
*.zip
*.DS_Store
*.env
# *.parquet
🛠️ Refactor suggestion | 🟠 Major

Consolidate redundant commented and uncommented ignore patterns.

Lines 7 and 10 contain commented versions of patterns, while lines 178–179 contain their uncommented equivalents. This duplication creates maintenance burden. Consider removing the commented versions or consolidating at the end of the file with a single definitive block.

Also applies to: 10-10, 178-179

🤖 Prompt for AI Agents
.gitignore lines 7, 10 and lines 178–179: there are duplicate commented ignore
patterns (lines 7 and 10) that match the uncommented equivalents at 178–179;
remove the redundant commented entries (or move any intentionally kept comments
into a single definitive explanatory block at the end of the file) so each
ignore pattern appears only once (keep the active/uncommented patterns where
they currently are and delete the commented duplicates).

*.__pycache__/
/logs/
# /tmp/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -173,3 +175,5 @@ cython_debug/

*.csv
*.txt
Comment on lines 176 to 177
⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Scripts executed against Abstract-Data/campaignfinance-2023 to verify: `wc -l .gitignore`, `head -10 .gitignore`, `sed -n '170,180p' .gitignore`, `rg "^\*\.csv$" .gitignore`, `rg "^\*\.txt$" .gitignore`

Remove duplicate CSV rule and clarify intent of broad *.txt pattern.

Line 176 duplicates the *.csv rule already present at line 2. Additionally, line 177's *.txt pattern is overly broad and could inadvertently ignore important text files (README, documentation, config templates, etc.).

Clarify the intent: Is *.txt meant to ignore temporary text outputs, or is this too broad for the repository?

🤖 Prompt for AI Agents
In .gitignore around lines 176-177, remove the duplicate "*.csv" entry (it's
already present at line 2) and replace or clarify the overly broad "*.txt" rule:
either delete it if not needed, scope it to a specific directory/pattern for
temporary outputs (e.g., temp/*.txt or *.out.txt) or add explicit negations for
important files (e.g., !README.txt, !docs/*.txt) and include a short comment
above the rule explaining the intent (temporary outputs vs generated artifacts).

*.parquet
/tmp
Binary file modified .idea/.DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion .idea/campaignfinance.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 0 additions & 28 deletions .idea/csv-editor.xml


2 changes: 1 addition & 1 deletion .idea/misc.xml


6 changes: 6 additions & 0 deletions .idea/ruff.xml


18 changes: 18 additions & 0 deletions README.md
@@ -0,0 +1,18 @@
# Texas Campaign Finance Package

## Overview
This package is designed to provide a simple interface for accessing campaign finance data from the Texas Ethics Commission.
It also reduces duplication of fields and joins data from multiple files into a single table to reduce the size of the data.

## Examples
Across all files, there are over 317 columns. This package reduces the number of columns to [number of columns] by joining data from multiple files.
⚠️ Potential issue | 🟡 Minor

Placeholder text needs to be replaced.

The text [number of columns] appears to be a placeholder that should be replaced with the actual column count.

🤖 Prompt for AI Agents
In README.md around line 8, the placeholder "[number of columns]" must be
replaced with the actual column count; determine the correct number (e.g., 317
or the current accurate count) and replace the placeholder with that numeric
value, keeping the sentence grammatical and updating any surrounding punctuation
or wording as needed.


## Ability to Download TEC File Data Built-In
Using [Selenium](https://www.selenium.dev/), this package can download the latest campaign finance data from the Texas Ethics Commission website. The data is then processed and saved as CSV files.


## Dependencies
![Python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
![Pydantic](https://img.shields.io/badge/Pydantic-E92063?style=for-the-badge&logo=Pydantic&logoColor=white)
![Pandas](https://img.shields.io/badge/Pandas-2C2D72?style=for-the-badge&logo=pandas&logoColor=white)
![Selenium](https://img.shields.io/badge/Selenium-43B02A?style=for-the-badge&logo=Selenium&logoColor=white)
Comment on lines +1 to +18
🧹 Nitpick | 🔵 Trivial

Minor markdown formatting improvements.

Add blank lines after headings and ensure file ends with a newline for proper markdown rendering and linting compliance.

 # Texas Campaign Finance Package
 
 ## Overview
+
 This package is designed to provide a simple interface for accessing campaign finance data from the Texas Ethics Commission.
 It also reduces duplication of fields and joins data from multiple files into a single table to reduce the size of the data.
 
 ## Examples
+
 Across all files, there are over 317 columns. This package reduces the number of columns to [number of columns] by joining data from multiple files.
 
 ## Ability to Download TEC File Data Built-In
+
 Using [Selenium](https://www.selenium.dev/), this package can download the latest campaign finance data from the Texas Ethics Commission website. The data is then processed and saved as CSV files.
 
 
 ## Dependencies
+
 ![Python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
 ![Pydantic](https://img.shields.io/badge/Pydantic-E92063?style=for-the-badge&logo=Pydantic&logoColor=white)
 ![Pandas](https://img.shields.io/badge/Pandas-2C2D72?style=for-the-badge&logo=pandas&logoColor=white)
 ![Selenium](https://img.shields.io/badge/Selenium-43B02A?style=for-the-badge&logo=Selenium&logoColor=white)
+
🧰 Tools
🪛 LanguageTool

[style] ~15-~15: Using many exclamation marks might seem excessive (in this case: 3 exclamation marks for a text that’s 1026 characters long)
Context: ...r-the-badge&logo=python&logoColor=blue) Pydantic Pandas ![Selenium](https://img.shields.io/badge/...

(EN_EXCESSIVE_EXCLAMATION)

🪛 markdownlint-cli2 (0.18.1)

3-3: Headings should be surrounded by blank lines
Expected: 1; Actual: 0; Below

(MD022, blanks-around-headings)


7-7: Headings should be surrounded by blank lines
Expected: 1; Actual: 0; Below

(MD022, blanks-around-headings)


10-10: Headings should be surrounded by blank lines
Expected: 1; Actual: 0; Below

(MD022, blanks-around-headings)


14-14: Headings should be surrounded by blank lines
Expected: 1; Actual: 0; Below

(MD022, blanks-around-headings)


18-18: Files should end with a single newline character

(MD047, single-trailing-newline)

🤖 Prompt for AI Agents
In README.md around lines 1 to 18, headings are missing blank lines after them
and the file likely does not end with a newline; add a single blank line
immediately after each Markdown heading (e.g., after "# Texas Campaign Finance
Package", "## Overview", "## Examples", "## Ability to Download TEC File Data
Built-In", and "## Dependencies") and ensure the file ends with a trailing
newline character.

Binary file modified app/.DS_Store
Binary file not shown.
12 changes: 6 additions & 6 deletions app/abcs/__init__.py
@@ -1,7 +1,7 @@
# from abcs.abc_download import FileDownloader
from abcs.abc_state_config import CSVReaderConfig, StateConfig, CategoryConfig, CategoryTypes
from abcs.abc_category import StateCategoryClass
from abcs.abc_validation import StateFileValidation
from abcs.abc_validation_errors import ValidationErrorList
from abcs.abc_db_loader import DBLoaderClass
from abcs.abc_download import FileDownloaderABC, RecordGen
from app.abcs.abc_state_config import CSVReaderConfig, StateConfig, CategoryConfig, CategoryTypes
from app.abcs.abc_category import StateCategoryClass
from app.abcs.abc_validation import StateFileValidation
from app.abcs.abc_validation_errors import ValidationErrorList
from app.abcs.abc_db_loader import DBLoaderClass
from app.abcs.abc_download import FileDownloaderABC, RecordGen, progress
10 changes: 5 additions & 5 deletions app/abcs/abc_category.py
@@ -3,7 +3,7 @@

import sqlalchemy
import sqlmodel
import abcs.abc_validation as validation
import app.abcs.abc_validation as validation
from pathlib import Path
from typing import ClassVar, Dict, List
from dataclasses import dataclass, field
@@ -13,10 +13,10 @@
from collections import defaultdict
import datetime
# from abcs.abc_download import FileDownloader
from abcs.abc_db_loader import DBLoaderClass
from abcs.abc_state_config import StateConfig
from logger import Logger
import funcs
from app.abcs.abc_db_loader import DBLoaderClass
from app.abcs.abc_state_config import StateConfig
from app.logger import Logger
import app.funcs as funcs
import inject
from sqlmodel import SQLModel
from pydantic import BaseModel, Field, ConfigDict, model_validator, computed_field
2 changes: 1 addition & 1 deletion app/abcs/abc_db_loader.py
@@ -1,7 +1,7 @@
from __future__ import annotations
import abc
from sqlmodel import SQLModel, Session, create_engine, select
from logger import Logger
from app.logger import Logger
from typing import Iterator, List, Type
from dataclasses import dataclass
import itertools
77 changes: 71 additions & 6 deletions app/abcs/abc_download.py
@@ -2,29 +2,41 @@
from dataclasses import dataclass, field
from pathlib import Path
import abc
from abcs.abc_state_config import StateConfig, CategoryTypes
import sys
from typing import Optional, Generator, Dict, Annotated
from typing import Optional, Generator, Dict, Annotated, ClassVar
from icecream import ic
from pydantic import Field as PydanticField
import itertools
from datetime import datetime
import polars as pl

from app.abcs.abc_state_config import StateConfig, CategoryTypes
from web_scrape_utils import CreateWebDriver
from app.live_display import ProgressTracker

RecordGen = Annotated[Optional[Generator[Dict, None, None]], PydanticField(default=None)]
FilteredRecordGen = RecordGen


progress = ProgressTracker()
progress.start()
Comment on lines +20 to +21
⚠️ Potential issue | 🟠 Major

Side effect at module import time.

Starting ProgressTracker at module load (progress.start()) means the live display begins whenever this module is imported, even if no download is happening. This can cause unexpected console output and resource usage.

Consider lazy initialization or explicit start:

 progress = ProgressTracker()
-progress.start()

Then call progress.start() explicitly in the download() method when needed.

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In app/abcs/abc_download.py around lines 20-21, creating and starting
ProgressTracker at module import time (progress = ProgressTracker();
progress.start()) causes an unwanted live display and resource use whenever the
module is imported; change to lazy initialization by removing progress.start()
from module scope, instantiate ProgressTracker (or call progress.start()) inside
the download() method when an actual download begins (or use a helper that
creates/starts the tracker on first use), and ensure the tracker is properly
stopped/cleaned up after the download completes or on errors.
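
The lazy pattern the bot suggests could look like the sketch below. `ProgressTracker` from `app.live_display` is project-specific, so it is stubbed out here; `LazyProgress` is a hypothetical helper name, not the project's API:

```python
class _StubTracker:
    """Stand-in for app.live_display.ProgressTracker (illustration only)."""
    def __init__(self) -> None:
        self.started = False

    def start(self) -> None:
        self.started = True


class LazyProgress:
    """Defer tracker creation and start() until first real use."""
    def __init__(self, factory) -> None:
        self._factory = factory
        self._tracker = None

    def get(self):
        # Created and started on first access, not at module import time.
        if self._tracker is None:
            self._tracker = self._factory()
            self._tracker.start()
        return self._tracker


progress = LazyProgress(_StubTracker)
# Importing this module no longer starts a live display;
# download() would call progress.get() only when work actually begins.
```

With this shape, the module-level `progress.start()` call disappears and the tracker's lifetime is tied to an actual download run.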


@dataclass
class FileDownloaderABC(abc.ABC):
config: StateConfig
driver: ClassVar[Optional[CreateWebDriver]] = None
folder: Path = field(init=False)
data: RecordGen | CategoryTypes = None

def __post_init__(self):

self.check_if_folder_exists()
self.folder = self.config.TEMP_FOLDER
FileDownloaderABC.driver = CreateWebDriver(download_folder=self.folder)

@classmethod
def not_headless(cls):
if cls.driver:
cls.driver.headless = False
return cls

def check_if_folder_exists(self) -> Path:
_temp_folder_name = self.config.TEMP_FOLDER.stem.title()
@@ -46,12 +58,65 @@ def check_if_folder_exists(self) -> Path:
ic("User selected 'n'. Exiting...")
sys.exit()

@classmethod
def extract_zipfile(cls, zip_ref, tmp):
zip_file_info = zip_ref.infolist()
_extract_task = progress.add_task("T4", "Extract Zip", "In Progress")
for file in zip_file_info:
try:
cls._process_csv(zip_ref, file, tmp)
except Exception as e:
ic(f"Zip File Extraction Error on {file.filename.upper()}: {e}")
Comment on lines +66 to +69
⚠️ Potential issue | 🟡 Minor

Broad exception catch may hide critical errors.

Catching bare Exception and only logging with ic() can hide serious issues like disk full, permission errors, or corrupted archives. Consider catching specific exceptions or at least logging the full traceback.

             try:
                 cls._process_csv(zip_ref, file, tmp)
-            except Exception as e:
-                ic(f"Zip File Extraction Error on {file.filename.upper()}: {e}")
+            except (OSError, IOError, pl.exceptions.ComputeError) as e:
+                ic(f"Zip File Extraction Error on {file.filename.upper()}: {e}")
+                raise
📝 Committable suggestion


Suggested change
try:
cls._process_csv(zip_ref, file, tmp)
except Exception as e:
ic(f"Zip File Extraction Error on {file.filename.upper()}: {e}")
try:
cls._process_csv(zip_ref, file, tmp)
except (OSError, IOError, pl.exceptions.ComputeError) as e:
ic(f"Zip File Extraction Error on {file.filename.upper()}: {e}")
raise
🧰 Tools
🪛 Ruff (0.14.7)

68-68: Do not catch blind exception: Exception

(BLE001)

progress.update_task(_extract_task, "Complete")

@classmethod
def _process_csv(cls, zip_ref, file, tmp):
file_name = Path(file.filename)
if file_name.suffix not in ('.csv', '.txt'):
ic(f"File {file_name.stem} is not a CSV/TXT file. Skipping...")
return

_csv_task = progress.add_task("T5", f"Extract CSV {file_name.stem}", "Started")
zip_ref.extract(file, tmp)

if file_name.suffix == '.txt':
return
Comment on lines +79 to +83
⚠️ Potential issue | 🟡 Minor

Progress task not completed for .txt files.

When the file is a .txt, the function returns early on line 83 without calling progress.update_task(_csv_task, "Complete"). This leaves the task in "Started" status indefinitely.

         if file_name.suffix == '.txt':
+            progress.update_task(_csv_task, "Complete")
             return
📝 Committable suggestion


Suggested change
_csv_task = progress.add_task("T5", f"Extract CSV {file_name.stem}", "Started")
zip_ref.extract(file, tmp)
if file_name.suffix == '.txt':
return
_csv_task = progress.add_task("T5", f"Extract CSV {file_name.stem}", "Started")
zip_ref.extract(file, tmp)
if file_name.suffix == '.txt':
progress.update_task(_csv_task, "Complete")
return
🤖 Prompt for AI Agents
In app/abcs/abc_download.py around lines 79 to 83, the progress task _csv_task
is left in "Started" when a .txt file causes an early return; before returning
for file_name.suffix == '.txt' call progress.update_task(_csv_task, "Complete")
(or an equivalent completion/cleanup call) so the task status is set to
"Complete" before the function returns.


rename = f"{file_name.stem}_{datetime.now():%Y%m%d}dl"
🧹 Nitpick | 🔵 Trivial

datetime.now() called without timezone.

Using naive datetime for file naming can cause inconsistencies across different environments or if the system timezone changes.

+from datetime import datetime, timezone
...
-        rename = f"{file_name.stem}_{datetime.now():%Y%m%d}dl"
+        rename = f"{file_name.stem}_{datetime.now(timezone.utc):%Y%m%d}dl"
📝 Committable suggestion


Suggested change
rename = f"{file_name.stem}_{datetime.now():%Y%m%d}dl"
rename = f"{file_name.stem}_{datetime.now(timezone.utc):%Y%m%d}dl"
🧰 Tools
🪛 Ruff (0.14.7)

85-85: datetime.datetime.now() called without a tz argument

(DTZ005)

🤖 Prompt for AI Agents
In app/abcs/abc_download.py around line 85, the filename uses a naive datetime
(datetime.now()) which lacks timezone information; replace it with a
timezone-aware timestamp (for example datetime.now(tz=timezone.utc) or
datetime.now().astimezone()) when formatting the rename string and ensure the
appropriate timezone import (from datetime import datetime, timezone) is added
at the top of the file so the generated filename is consistently based on an
explicit timezone (e.g., UTC).
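
A quick illustration of the naive vs. timezone-aware formatting (the `contribs` stem is hypothetical, chosen only for the example):

```python
from datetime import datetime, timezone

stem = "contribs"  # hypothetical file stem for illustration
naive = f"{stem}_{datetime.now():%Y%m%d}dl"              # depends on host timezone
aware = f"{stem}_{datetime.now(timezone.utc):%Y%m%d}dl"  # stable across hosts
```

Both produce the same shape of name, but only the aware form yields the same date on every machine near a day boundary.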

pl_file = pl.scan_csv(tmp / file_name, low_memory=False, infer_schema=False)
pl_file = (
pl_file
.with_columns(
pl.lit(file_name.stem)
.alias('file_origin')
))

pl_file = (
pl_file
.with_columns([
pl.col(col)
.cast(pl.String)
for col in pl_file.collect_schema().names()
]))

pl_file.collect().write_parquet(tmp / f"{rename}.parquet", compression='lz4')
progress.update_task(_csv_task, "Complete")
# Clean up original CSV file
(tmp / file_name).unlink()

@classmethod
@abc.abstractmethod
def download(cls, overwrite: bool, read_from_temp: bool) -> FileDownloaderABC:
...

@classmethod
@abc.abstractmethod
def download(self, overwrite: bool, read_from_temp: bool) -> FileDownloaderABC:
def consolidate_files(cls):
...

@classmethod
@abc.abstractmethod
def read(self):
def read(cls):
...
Comment on lines +107 to 120
🧹 Nitpick | 🔵 Trivial

Consider keyword-only arguments for boolean parameters.

The download method has two boolean positional parameters which can be confusing at call sites (e.g., download(True, False)).

     @classmethod
     @abc.abstractmethod
-    def download(cls, overwrite: bool, read_from_temp: bool) -> FileDownloaderABC:
+    def download(cls, *, overwrite: bool, read_from_temp: bool) -> FileDownloaderABC:
         ...
🧰 Tools
🪛 Ruff (0.14.7)

109-109: Boolean-typed positional argument in function definition

(FBT001)


109-109: Boolean-typed positional argument in function definition

(FBT001)


114-114: Missing return type annotation for classmethod consolidate_files

(ANN206)


119-119: Missing return type annotation for classmethod read

(ANN206)

🤖 Prompt for AI Agents
In app/abcs/abc_download.py around lines 107 to 120, the download method
currently accepts two boolean positional arguments which are ambiguous at call
sites; change the abstractmethod signature to require keyword-only booleans
(e.g., introduce a positional-only marker by adding a lone * so overwrite and
read_from_temp must be passed by name), update all concrete implementations to
match the new signature, and update all call sites to use explicit keywords
(overwrite=..., read_from_temp=...) to avoid confusion; ensure type hints and
return type remain unchanged and run tests to catch any mismatches.
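
A minimal sketch of the keyword-only pattern; the function body here is a placeholder, not the project's `FileDownloaderABC.download` implementation:

```python
def download(*, overwrite: bool, read_from_temp: bool) -> dict:
    """Placeholder: the bare * forces callers to name both flags."""
    return {"overwrite": overwrite, "read_from_temp": read_from_temp}


result = download(overwrite=True, read_from_temp=False)  # unambiguous call site

try:
    download(True, False)  # positional booleans are now rejected
except TypeError:
    positional_rejected = True
```

The TypeError at the second call site is the point: `download(True, False)` can no longer silently swap its flags.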


def sort_categories(self) -> CategoryTypes:
8 changes: 6 additions & 2 deletions app/abcs/abc_state_config.py
@@ -12,7 +12,7 @@
from enum import StrEnum
from rich.progress import track

import funcs
import app.funcs as funcs


def check_for_empty_gen(func):
@@ -151,7 +151,11 @@ def TEMP_FOLDER(self) -> Path:

@property
def FIELD_DATA(self) -> dict:
return funcs.read_toml(Path(__file__).parents[1] / 'states'/ (_state := self.STATE_NAME.lower()) / f"{_state}_fields.toml")
return (
funcs
.read_toml(
Path(__file__)
.parents[1] / 'states'/ (_state := self.STATE_NAME.lower()) / f"{_state}_fields.toml"))

@staticmethod
@lru_cache
6 changes: 3 additions & 3 deletions app/abcs/abc_validation.py
@@ -6,9 +6,9 @@
import csv
from tqdm import tqdm
from pydantic import ValidationError
from abcs.abc_validation_errors import ValidationErrorList
from funcs.validator_functions import create_record_id
from logger import Logger
from app.abcs.abc_validation_errors import ValidationErrorList
from app.funcs.validator_functions import create_record_id
from app.logger import Logger
from icecream import ic

ValidatorType = Type[SQLModel]
2 changes: 1 addition & 1 deletion app/abcs/abc_validation_errors.py
@@ -4,7 +4,7 @@
import pandas as pd
from icecream import ic

ic.configureOutput(prefix='abc_validation_errors|')
ic.configureOutput(prefix='campaignfinance|')


class RecordValidationError(BaseModel):
8 changes: 4 additions & 4 deletions app/funcs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from funcs.csv_reader import FileReader
from funcs.toml_reader import read_toml
from funcs.file_exporters import write_records_to_csv_validation
from funcs.depreciated import deprecated
from .csv_reader import FileReader
from .toml_reader import read_toml
from .file_exporters import write_records_to_csv_validation
from .depreciated import deprecated
Comment on lines +1 to +4
🧹 Nitpick | 🔵 Trivial

Relative imports look good, but check the module naming.

The conversion to relative imports is correct. However, note that the module is named depreciated (Line 4), which appears to be a misspelling of deprecated. Consider renaming the module to deprecated.py to match standard terminology.

🤖 Prompt for AI Agents
In app/funcs/__init__.py lines 1-4, the imported module name "depreciated" is
misspelled; rename the file app/funcs/depreciated.py to app/funcs/deprecated.py
and update this import to from .deprecated import deprecated; also search and
update any other references/imports in the repo to use the new filename and
import path, run tests and linters to ensure no broken imports remain.
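
The repo's `depreciated` module body isn't shown in this diff; a typical `deprecated` decorator of the kind it likely exports looks roughly like this sketch (`old_reader` is a made-up function for demonstration):

```python
import functools
import warnings


def deprecated(func):
    """Warn callers that the wrapped function is deprecated."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        warnings.warn(
            f"{func.__name__} is deprecated",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller, not the wrapper
        )
        return func(*args, **kwargs)
    return wrapper


@deprecated
def old_reader(path: str) -> str:
    return f"read:{path}"
```

Renaming the module to `deprecated.py` would then make the import read `from .deprecated import deprecated`, matching standard terminology.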
