diff --git a/legendql/__init__.py b/legendql/__init__.py
index c09658f6b..c9afd6760 100644
--- a/legendql/__init__.py
+++ b/legendql/__init__.py
@@ -16,5 +16,6 @@
 from legendql.ql import query, LegendQL
 from legendql.store import store
 from legendql.functions import aggregate, over, unbounded, rows, range, left, avg, count, sum, rank, lead, lag, row_number
+from legendql.sql_parser import sql_to_pure_relation, SQLToLegendQLConverter
 
-__all__ = ['db', 'table', 'using', 'using_db', 'using_db_def', 'query', 'LegendQL', 'store', 'aggregate', 'over', 'unbounded', 'rows', 'range', 'left', 'avg', 'count', 'sum', 'rank', 'lead', 'lag', 'row_number']
+__all__ = ['db', 'table', 'using', 'using_db', 'using_db_def', 'query', 'LegendQL', 'store', 'aggregate', 'over', 'unbounded', 'rows', 'range', 'left', 'avg', 'count', 'sum', 'rank', 'lead', 'lag', 'row_number', 'sql_to_pure_relation', 'SQLToLegendQLConverter']
diff --git a/legendql/sql_examples.py b/legendql/sql_examples.py
new file mode 100644
index 000000000..ecaa3705a
--- /dev/null
+++ b/legendql/sql_examples.py
@@ -0,0 +1,32 @@
+"""Examples demonstrating SQL to Pure Relation conversion using SQLGlot."""
+
+import pyarrow as pa
+from legendql.sql_parser import sql_to_pure_relation
+from legendql.model.schema import Database, Table
+from legendql.runtime.pure.db.h2 import H2DatabaseDefinition
+
+employees_table = Table(
+    table="employees",
+    columns=[
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.utf8()),
+        pa.field("department", pa.utf8()),
+        pa.field("salary", pa.float32())
+    ]
+)
+
+database = Database(name="company", children=[employees_table])
+db_def = H2DatabaseDefinition(sqls=[])
+
+sql1 = "SELECT id, name FROM employees"
+pure_code1 = sql_to_pure_relation(sql1, db_def, database)
+print("Example 1:")
+print(f"SQL: {sql1}")
+print(f"Pure Relation: {pure_code1}")
+print()
+
+sql2 = "SELECT name, department, salary FROM employees"
+pure_code2 = sql_to_pure_relation(sql2, db_def, database)
+print("Example 2:")
+print(f"SQL: {sql2}")
+print(f"Pure Relation: {pure_code2}")
diff --git a/legendql/sql_parser.py b/legendql/sql_parser.py
new file mode 100644
--- /dev/null
+++ b/legendql/sql_parser.py
@@ -0,0 +1,154 @@
+# Copyright 2025 Goldman Sachs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+from typing import List, Optional
+import sqlglot
+from sqlglot import exp, parse_one
+
+from legendql.model.metamodel import (
+    SelectionClause, DatabaseFromClause, ColumnReferenceExpression,
+    Expression, Clause
+)
+from legendql.model.schema import Database, Table, Schema
+from legendql.query import Query
+from legendql.runtime.pure.db.type import DatabaseDefinition
+
+
+class SQLToLegendQLConverter:
+    """Converts SQL statements to LegendQL metamodel using SQLGlot parsing.
+
+    Only ``SELECT <columns> FROM <table>`` is translated. Clauses and select
+    items that have no translation yet are rejected rather than silently
+    dropped or mistaken for column names.
+    """
+
+    # Keys of ``exp.Select.args`` for clauses that are not translated yet.
+    _UNSUPPORTED_SELECT_ARGS = (
+        "distinct", "joins", "where", "group", "having", "order", "limit", "offset",
+    )
+
+    def __init__(self, database_definition: DatabaseDefinition, database: Database):
+        self.database_definition = database_definition
+        self.database = database
+
+    def parse_sql_to_query(self, sql: str) -> Query:
+        """Parse SQL string and convert to LegendQL Query object.
+
+        Args:
+            sql: SQL statement to convert; must be a SELECT.
+
+        Returns:
+            The LegendQL Query built from the statement.
+
+        Raises:
+            ValueError: If the SQL cannot be parsed, is not a SELECT, or uses
+                an unsupported construct. The original error is chained.
+        """
+        try:
+            parsed = parse_one(sql)
+            if not isinstance(parsed, exp.Select):
+                raise ValueError(f"Only SELECT statements are currently supported, got: {type(parsed)}")
+
+            return self._convert_select_to_query(parsed)
+        except Exception as e:
+            # Callers only ever see ValueError; chaining keeps the root cause.
+            raise ValueError(f"Failed to parse SQL: {sql}. Error: {str(e)}") from e
+
+    def _convert_select_to_query(self, select_ast: exp.Select) -> Query:
+        """Convert SQLGlot Select AST to LegendQL Query.
+
+        Raises:
+            NotImplementedError: If the statement has a clause listed in
+                ``_UNSUPPORTED_SELECT_ARGS`` or an unsupported select item.
+            ValueError: If the statement references no table.
+        """
+        unsupported = [key for key in self._UNSUPPORTED_SELECT_ARGS if select_ast.args.get(key)]
+        if unsupported:
+            raise NotImplementedError(f"Unsupported SELECT clause(s): {', '.join(unsupported)}")
+
+        table_info = self._extract_table_info(select_ast)
+
+        query = Query.from_table(
+            self.database_definition,
+            self.database,
+            table_info['table_name']
+        )
+
+        if select_ast.expressions:
+            selection_expressions = [self._convert_expression(expr) for expr in select_ast.expressions]
+
+            query._clauses = [query._clauses[0]]  # Keep the FROM clause
+            query._add_clause(SelectionClause(selection_expressions))
+
+        return query
+
+    def _extract_table_info(self, select_ast: exp.Select) -> dict:
+        """Extract table name and schema information from SELECT AST.
+
+        Returns:
+            Dict with ``table_name``, ``schema_name`` and ``database_name``;
+            the last two are None when the table name is unqualified.
+
+        Raises:
+            ValueError: If the statement references no table.
+        """
+        table_node = select_ast.find(exp.Table)
+        if table_node is None:
+            raise ValueError("No table found in SELECT statement")
+
+        # exp.Table always has ``db``/``catalog`` ("" when unqualified), so a
+        # hasattr() check never yields None; normalise the empty string instead.
+        return {
+            'table_name': table_node.name,
+            'schema_name': table_node.db or None,
+            'database_name': table_node.catalog or None
+        }
+
+    def _convert_expression(self, expr: exp.Expression) -> Expression:
+        """Convert SQLGlot expression to LegendQL Expression.
+
+        Raises:
+            NotImplementedError: For ``*`` / ``t.*`` and for any select item
+                that is not a plain column reference.
+        """
+        is_star = isinstance(expr, exp.Star) or (
+            isinstance(expr, exp.Column) and isinstance(expr.this, exp.Star)
+        )
+        if is_star:
+            raise NotImplementedError("SELECT * is not yet implemented")
+        if isinstance(expr, exp.Column):
+            return ColumnReferenceExpression(expr.name)
+        # Aliases, literals, function calls etc. are not column names.
+        raise NotImplementedError(f"Unsupported select expression: {str(expr)}")
+
+
+def sql_to_pure_relation(sql: str, database_definition: DatabaseDefinition, database: Database) -> str:
+    """
+    Convert SQL statement to Pure Relation code.
+
+    Args:
+        sql: SQL statement to convert
+        database_definition: Database definition for execution
+        database: Database schema information
+
+    Returns:
+        Pure Relation code string
+
+    Raises:
+        ValueError: If the SQL cannot be parsed or is not supported
+    """
+    converter = SQLToLegendQLConverter(database_definition, database)
+    query = converter.parse_sql_to_query(sql)
+    return query.to_string()
diff --git a/pyproject.toml b/pyproject.toml
index 981d3a354..13034d959 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ numpy = [
     { version = ">=1.20.0", python = "<3.12" },
     { version = ">=1.26.0", python = ">=3.12" }
 ]
+sqlglot = ">=26.0.0"
 
 [tool.poetry.group.dev.dependencies]
 
diff --git a/test_sql_integration.py b/test_sql_integration.py
new file mode 100644
--- /dev/null
+++ b/test_sql_integration.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""Test script for SQLGlot integration with LegendQL."""
+
+import pyarrow as pa
+from legendql.sql_parser import sql_to_pure_relation
+from legendql.model.schema import Database, Table
+from legendql.runtime.pure.db.h2 import H2DatabaseDefinition
+
+def test_basic_sql_conversion():
+    """Test basic SQL SELECT conversion to Pure Relation code.
+
+    Failures propagate as exceptions so that a test runner can report them;
+    swallowing them and returning a flag would make the test always pass.
+    """
+
+    table = Table(
+        table="tableC",
+        columns=[
+            pa.field("colA", pa.utf8()),
+            pa.field("colB", pa.int32())
+        ]
+    )
+    database = Database(name="testdb", children=[table])
+
+    db_def = H2DatabaseDefinition(sqls=[])
+
+    sql = "SELECT colA, colB FROM tableC"
+
+    pure_relation_code = sql_to_pure_relation(sql, db_def, database)
+    print("SQL:", sql)
+    print("Pure Relation Code:", pure_relation_code)
+    assert isinstance(pure_relation_code, str)
+    assert pure_relation_code
+    print("✅ Basic SQL conversion successful!")
+
+if __name__ == "__main__":
+    try:
+        test_basic_sql_conversion()
+    except Exception as e:
+        print(f"❌ Error converting SQL: {e}")
+        print("\n💥 SQLGlot integration test failed!")
+        raise SystemExit(1) from e
+    print("\n🎉 SQLGlot integration test passed!")