diff --git a/python/cudf/cudf/core/tools/text/__init__.py b/python/cudf/cudf/core/tools/text/__init__.py
new file mode 100644
index 00000000000..213b0bce4e0
--- /dev/null
+++ b/python/cudf/cudf/core/tools/text/__init__.py
@@ -0,0 +1 @@
+# Init for text tools
diff --git a/python/cudf/cudf/core/tools/text/__main__.py b/python/cudf/cudf/core/tools/text/__main__.py
new file mode 100644
index 00000000000..ff8bdb24ad0
--- /dev/null
+++ b/python/cudf/cudf/core/tools/text/__main__.py
@@ -0,0 +1,15 @@
+"""Allow ``python -m cudf.core.tools.text`` to run the cugrep CLI."""
+import importlib.resources
+import os
+import sys
+
+
+def main():
+    """Invoke the cugrep click command (imported on first use)."""
+    from .cugrep import cugrep
+
+    cugrep()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/cudf/cudf/core/tools/text/cugrep.py b/python/cudf/cudf/core/tools/text/cugrep.py
new file mode 100644
index 00000000000..b4eab43a0b9
--- /dev/null
+++ b/python/cudf/cudf/core/tools/text/cugrep.py
@@ -0,0 +1,82 @@
+"""
+cugrep: GPU-accelerated grep utility using cuDF
+"""
+import os
+import re
+
+import click
+import cudf
+
+# Maps a normalized file type to the cuDF reader used to load it.
+_READERS = {
+    'csv': cudf.read_csv,
+    'txt': cudf.read_csv,
+    'parquet': cudf.read_parquet,
+    'json': cudf.read_json,
+    'orc': cudf.read_orc,
+}
+
+# Matches either one backslash escape or a run of non-backslash text.
+_ESCAPE_OR_TEXT = re.compile(r'\\.|[^\\]+', re.DOTALL)
+
+
+def _lowercase_pattern(pattern):
+    """Lower-case PATTERN except for the character after each backslash.
+
+    Escapes such as ``\\D`` or ``\\S`` change meaning when lower-cased, so
+    they are copied through untouched.
+    """
+    def keep_escapes(match):
+        token = match.group()
+        return token if token.startswith('\\') else token.lower()
+
+    return _ESCAPE_OR_TEXT.sub(keep_escapes, pattern)
+
+
+@click.command()
+@click.argument('pattern')
+@click.argument('filenames', nargs=-1)
+@click.option('--column', '-c', default=None, help='Column to search (default: all string columns)')
+@click.option('--filetype', '-t', default=None, help='File type: csv, parquet, json, orc, txt (default: auto)')
+@click.option('--ignore-case', '-i', is_flag=True, help='Ignore case distinctions')
+@click.option('--show-row', is_flag=True, help='Show row number in output')
+def cugrep(pattern, filenames, column, filetype, ignore_case, show_row):
+    """Search for PATTERN in each FILE using cuDF on the GPU."""
+    # NOTE(review): cuDF documents the `case` and `na` arguments of
+    # `str.contains` as unsupported, so case-insensitive search lower-cases
+    # both the data and the pattern instead -- confirm against the targeted
+    # cuDF release.
+    if ignore_case:
+        pattern = _lowercase_pattern(pattern)
+    for fname in filenames:
+        # Fall back to the file extension when --filetype is not given.
+        ftype = (filetype or os.path.splitext(fname)[1].lstrip('.')).lower()
+        reader = _READERS.get(ftype)
+        if reader is None:
+            click.echo(f"Unsupported file type: {ftype or '(no extension)'}")
+            continue
+        df = reader(fname)
+        # Select columns to search
+        if column and column not in df.columns:
+            click.echo(f"Column '{column}' not found in {fname}")
+            continue
+        candidates = [column] if column else list(df.columns)
+        # Only string columns expose the `.str` accessor used below.
+        cols = [col for col in candidates if df[col].dtype == 'object']
+        if not cols:
+            click.echo(f"No string columns to search in {fname}")
+            continue
+        for col in cols:
+            values = df[col]
+            haystack = values.str.lower() if ignore_case else values
+            # Null entries produce null results; count them as non-matches.
+            matches = haystack.str.contains(pattern, regex=True).fillna(False)
+            # cuDF refuses row-wise iteration (DataFrame.iterrows raises
+            # TypeError), so copy only the matching values to the host.
+            for idx, value in values[matches].to_pandas().items():
+                prefix = f"{fname}:{idx}:" if show_row else f"{fname}:"
+                click.echo(f"{prefix}{value}")
+
+
+if __name__ == '__main__':
+    cugrep()
diff --git a/python/cudf/cudf/tests/text/test_cugrep.py b/python/cudf/cudf/tests/text/test_cugrep.py
new file mode 100644
index 00000000000..5fae1f09b79
--- /dev/null
+++ b/python/cudf/cudf/tests/text/test_cugrep.py
@@ -0,0 +1,82 @@
+import os
+import sys
+import pytest
+from click.testing import CliRunner
+from cudf.core.tools.text import cugrep
+
+
+def _run(args):
+    """Invoke the cugrep command with ARGS and return the click result."""
+    return CliRunner().invoke(cugrep.cugrep, args)
+
+
+def test_cugrep_basic(tmp_path):
+    # Create a sample CSV file
+    file_path = tmp_path / "test.csv"
+    file_path.write_text("col1,col2\nhello world,foo\nbar baz,qux\nhello cudf,bar\n")
+    # Run cugrep on the file
+    result = _run(["hello", str(file_path), "--column", "col1"])
+    assert result.exit_code == 0
+    assert "hello world" in result.output
+    assert "hello cudf" in result.output
+    assert "bar baz" not in result.output
+
+
+def test_cugrep_all_columns(tmp_path):
+    file_path = tmp_path / "test2.csv"
+    file_path.write_text("col1,col2\nhello world,foo\nbar baz,hello again\nhello cudf,bar\n")
+    # No --column: should match both columns
+    result = _run(["hello", str(file_path)])
+    assert result.exit_code == 0
+    assert "hello world" in result.output
+    assert "hello cudf" in result.output
+    assert "hello again" in result.output
+
+
+def test_cugrep_ignore_case(tmp_path):
+    file_path = tmp_path / "test3.csv"
+    file_path.write_text("col1\nHello World\nHELLO cudf\nno match\n")
+    result = _run(["hello", str(file_path), "--ignore-case", "--column", "col1"])
+    assert result.exit_code == 0
+    assert "Hello World" in result.output
+    assert "HELLO cudf" in result.output
+    assert "no match" not in result.output
+
+
+def test_cugrep_missing_column(tmp_path):
+    file_path = tmp_path / "test4.csv"
+    file_path.write_text("col1\nfoo\nbar\n")
+    # Column does not exist
+    result = _run(["foo", str(file_path), "--column", "not_a_col"])
+    assert result.exit_code == 0
+    assert "Column 'not_a_col' not found" in result.output
+
+
+def test_cugrep_unsupported_filetype(tmp_path):
+    file_path = tmp_path / "test.unsupported"
+    file_path.write_text("irrelevant content")
+    result = _run(["foo", str(file_path)])
+    assert result.exit_code == 0
+    assert "Unsupported file type" in result.output
+
+
+def test_cugrep_regex_match(tmp_path):
+    file_path = tmp_path / "test5.csv"
+    file_path.write_text("col1\nfoo123\nbar456\nfoo789\n")
+    # Regex: match foo followed by digits. The raw string must carry a single
+    # backslash; r"foo\\d+" would search for a literal backslash instead.
+    result = _run([r"foo\d+", str(file_path), "--column", "col1"])
+    assert result.exit_code == 0
+    assert "foo123" in result.output
+    assert "foo789" in result.output
+    assert "bar456" not in result.output
+
+
+def test_cugrep_show_row(tmp_path):
+    file_path = tmp_path / "test6.csv"
+    file_path.write_text("col1\nfoo\nbar\nfoo again\n")
+    # --show-row inserts the row index between the file name and the value
+    result = _run(["foo", str(file_path), "--column", "col1", "--show-row"])
+    assert result.exit_code == 0
+    assert f"{file_path}:0:foo" in result.output
+    assert f"{file_path}:2:foo again" in result.output