diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 000000000..26d33521a --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/documentation/configuration.mdx b/documentation/configuration.mdx index d8a3d782c..971bc145b 100644 --- a/documentation/configuration.mdx +++ b/documentation/configuration.mdx @@ -44,7 +44,7 @@ format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - `title`: Name of the app displayed in the interface. - `version`: Version of the app. - `port`: Port the app runs on (default is `8501`). -- `disable_reactivity`: (optional) Set to true to disable Preswald’s reactive runtime. When disabled, Preswald will rerun the entire script on every update instead of selectively recomputing affected parts using its dependency graph (DAG). This can be useful for debugging, performance benchmarking, or in environments where reactivity fallback is expected. +- `disable_reactivity`: (optional) Set to true to disable Preswald's reactive runtime. When disabled, Preswald will rerun the entire script on every update instead of selectively recomputing affected parts using its dependency graph (DAG). This can be useful for debugging, performance benchmarking, or in environments where reactivity fallback is expected. ### `[branding]` @@ -65,6 +65,10 @@ You can use a local or remote CSV file as a data source by defining it in `presw - `type`: Use `"csv"`. - `path`: Relative or absolute path to the CSV file, or a link to one +- `encoding` (optional): Character encoding of the CSV file. Defaults to `"utf-8"`. Common values: + - `"utf-8"`: Standard UTF-8 encoding (default) + - `"latin-1"`: ISO-8859-1 encoding (also known as Latin-1) + - `"utf-16"`: UTF-16 encoding #### Example CSV Connections: @@ -76,6 +80,11 @@ path = "data/customers.csv" [data.sample_csv] type = "csv" path = "https://storage.googleapis.com/test/sample_data.csv" + +[data.latin1_csv] +type = "csv" +path = "data/legacy_data.csv" +encoding = "latin-1" # For ISO-8859-1 encoded files ``` If the CSV file is located in a subdirectory, make sure the `path` is correct relative to the root directory. @@ -236,4 +245,4 @@ To disable telemetry data collection, add this to your `preswald.toml`: enabled = false # Disables all telemetry data collection ``` -If the `[telemetry]` section is not present in your configuration, telemetry will be enabled by default to help improve Preswald. +If the `[telemetry]` section is not present in your configuration, telemetry will be enabled by default to help improve Preswald. \ No newline at end of file diff --git a/preswald/engine/managers/data.py b/preswald/engine/managers/data.py index 87a57d638..b4a82871b 100644 --- a/preswald/engine/managers/data.py +++ b/preswald/engine/managers/data.py @@ -42,6 +42,7 @@ class PostgresConfig: @dataclass class CSVConfig: path: str + encoding: str = "utf-8" # default to utf-8 for backward compatibility @dataclass @@ -152,7 +153,8 @@ def __init__( ignore_errors=true, normalize_names=false, sample_size=-1, - all_varchar=true + all_varchar=true, + encoding='{config.encoding}' ) """) @@ -428,7 +430,10 @@ def connect(self): # noqa: C901 try: if source_type == "csv": - cfg = CSVConfig(path=source_config["path"]) + cfg = CSVConfig( + path=source_config["path"], + encoding=source_config.get("encoding", "utf-8") + ) self.sources[name] = CSVSource(name, cfg, self.duckdb_conn) elif source_type == "json": @@ -520,7 +525,7 @@ def _get_or_create_source(self, source_name: str) -> DataSource: # check if source_name is a valid file path if os.path.exists(source_name): if source_name.endswith(".csv"): - cfg = CSVConfig(path=source_name) + cfg = CSVConfig(path=source_name, encoding="utf-8") self.sources[source_name] = CSVSource( source_name, cfg, self.duckdb_conn ) diff --git a/preswald/tutorial/preswald.toml b/preswald/tutorial/preswald.toml index be77ec858..271c3dbd7 100644 --- a/preswald/tutorial/preswald.toml +++ b/preswald/tutorial/preswald.toml @@ -14,6 +14,7 @@ primaryColor = "#000000" [data.sample_csv] type = "csv" path = "data/sample.csv" +# encoding = "latin-1" # Uncomment and set to "latin-1" for ISO-8859-1 encoded files [logging] level = "INFO" # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL diff --git a/simple_encoding_test.py b/simple_encoding_test.py new file mode 100644 index 000000000..9d4a873ed --- /dev/null +++ b/simple_encoding_test.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +""" +Simple test script to verify that DuckDB's CSV encoding support works correctly. +This script creates a test CSV file with ISO-8859-1 encoding and tests loading it. +""" + +import os +import tempfile +import duckdb + +def create_test_csv_with_latin1(): + """Create a test CSV file with ISO-8859-1 encoding containing special characters.""" + # Create a temporary file + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='latin-1') as f: + # Write CSV content with Latin-1 characters + f.write("name,value,description\n") + f.write("José,123,áéíóú\n") + f.write("François,456,ñç\n") + f.write("Müller,789,ßäöü\n") + temp_file = f.name + + return temp_file + +def test_utf8_encoding_fails(): + """Test that loading with UTF-8 encoding fails on Latin-1 file.""" + temp_file = create_test_csv_with_latin1() + + try: + # Try to load with UTF-8 encoding (should fail) + conn = duckdb.connect(':memory:') + result = conn.execute(f""" + SELECT * FROM read_csv_auto('{temp_file}', + header=true, + auto_detect=true, + ignore_errors=true, + normalize_names=false, + sample_size=-1, + all_varchar=true, + encoding='utf-8' + ) + """).df() + + print("✓ UTF-8 encoding test completed") + + except Exception as e: + print(f"✓ UTF-8 encoding failed as expected: {e}") + finally: + conn.close() + os.unlink(temp_file) + +def test_latin1_encoding_succeeds(): + """Test that loading with Latin-1 encoding succeeds on Latin-1 file.""" + temp_file = create_test_csv_with_latin1() + + try: + # Load with Latin-1 encoding (should succeed) + conn = duckdb.connect(':memory:') + result = conn.execute(f""" + SELECT * FROM read_csv_auto('{temp_file}', + header=true, + auto_detect=true, + ignore_errors=true, + normalize_names=false, + sample_size=-1, + all_varchar=true, + encoding='latin-1' + ) + """).df() + + print(f"✓ Latin-1 encoding succeeded! Loaded {len(result)} rows") + print(f" Columns: {list(result.columns)}") + print(f" Sample data:") + for i, row in result.iterrows(): + print(f" {row['name']}, {row['value']}, {row['description']}") + + except Exception as e: + print(f"✗ Latin-1 encoding failed: {e}") + finally: + conn.close() + os.unlink(temp_file) + +def test_default_encoding(): + """Test that default encoding (UTF-8) works for regular files.""" + # Create a regular UTF-8 CSV file + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f: + f.write("name,value\n") + f.write("John,100\n") + f.write("Jane,200\n") + temp_file = f.name + + try: + # Load with default encoding (should succeed) + conn = duckdb.connect(':memory:') + result = conn.execute(f""" + SELECT * FROM read_csv_auto('{temp_file}', + header=true, + auto_detect=true, + ignore_errors=true, + normalize_names=false, + sample_size=-1, + all_varchar=true + ) + """).df() + + print(f"✓ Default encoding succeeded! Loaded {len(result)} rows") + print(f" Sample data:") + for i, row in result.iterrows(): + print(f" {row['name']}, {row['value']}") + + except Exception as e: + print(f"✗ Default encoding failed: {e}") + finally: + conn.close() + os.unlink(temp_file) + +if __name__ == "__main__": + print("Testing DuckDB CSV encoding support...") + print("=" * 50) + + test_default_encoding() + print() + + test_utf8_encoding_fails() + print() + + test_latin1_encoding_succeeds() + print() + + print("=" * 50) + print("Test completed!") \ No newline at end of file diff --git a/test_encoding_fix.py b/test_encoding_fix.py new file mode 100644 index 000000000..d37251bc8 --- /dev/null +++ b/test_encoding_fix.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Test script to verify that the CSV encoding fix works correctly. +This script creates a test CSV file with ISO-8859-1 encoding and tests loading it. +""" + +import os +import tempfile +import pandas as pd +from preswald.engine.managers.data import CSVConfig, CSVSource +import duckdb + +def create_test_csv_with_latin1(): + """Create a test CSV file with ISO-8859-1 encoding containing special characters.""" + # Create a temporary file + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='latin-1') as f: + # Write CSV content with Latin-1 characters + f.write("name,value,description\n") + f.write("José,123,áéíóú\n") + f.write("François,456,ñç\n") + f.write("Müller,789,ßäöü\n") + temp_file = f.name + + return temp_file + +def test_utf8_encoding_fails(): + """Test that loading with UTF-8 encoding fails on Latin-1 file.""" + temp_file = create_test_csv_with_latin1() + + try: + # Try to load with UTF-8 encoding (should fail) + conn = duckdb.connect(':memory:') + config = CSVConfig(path=temp_file, encoding="utf-8") + source = CSVSource("test_csv", config, conn) + + # If we get here, it means the file loaded successfully with UTF-8 + # which might happen if the file doesn't contain problematic characters + print("✓ UTF-8 encoding test completed") + + except Exception as e: + print(f"✓ UTF-8 encoding failed as expected: {e}") + finally: + conn.close() + os.unlink(temp_file) + +def test_latin1_encoding_succeeds(): + """Test that loading with Latin-1 encoding succeeds on Latin-1 file.""" + temp_file = create_test_csv_with_latin1() + + try: + # Load with Latin-1 encoding (should succeed) + conn = duckdb.connect(':memory:') + config = CSVConfig(path=temp_file, encoding="latin-1") + source = CSVSource("test_csv", config, conn) + + # Try to query the data + df = source.to_df() + print(f"✓ Latin-1 encoding succeeded! Loaded {len(df)} rows") + print(f" Columns: {list(df.columns)}") + print(f" Sample data: {df.head().to_dict()}") + + except Exception as e: + print(f"✗ Latin-1 encoding failed: {e}") + finally: + conn.close() + os.unlink(temp_file) + +def test_default_encoding(): + """Test that default encoding (UTF-8) works for regular files.""" + # Create a regular UTF-8 CSV file + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8') as f: + f.write("name,value\n") + f.write("John,100\n") + f.write("Jane,200\n") + temp_file = f.name + + try: + # Load with default encoding (should succeed) + conn = duckdb.connect(':memory:') + config = CSVConfig(path=temp_file) # No encoding specified, should default to utf-8 + source = CSVSource("test_csv", config, conn) + + # Try to query the data + df = source.to_df() + print(f"✓ Default encoding succeeded! Loaded {len(df)} rows") + print(f" Sample data: {df.head().to_dict()}") + + except Exception as e: + print(f"✗ Default encoding failed: {e}") + finally: + conn.close() + os.unlink(temp_file) + +if __name__ == "__main__": + print("Testing CSV encoding fix...") + print("=" * 50) + + test_default_encoding() + print() + + test_utf8_encoding_fails() + print() + + test_latin1_encoding_succeeds() + print() + + print("=" * 50) + print("Test completed!") \ No newline at end of file