rapidsai · Pritiks23 · Mar 28, 2026
@@ -0,0 +1 @@
+# Init for text tools
@@ -0,0 +1,12 @@
+import importlib.resources
+import os
+import sys
+
+# Run the cugrep CLI when this module is executed directly (e.g. via `python -m`).
+
+def main():
+    """Invoke the cugrep click command.
+
+    The import is done inside the function so that the command (and its
+    dependencies) are only loaded when the entry point actually runs.
+    """
+    from .cugrep import cugrep as run_cli
+
+    run_cli()
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,52 @@
+"""
+cugrep: GPU-accelerated grep utility using cuDF
+"""
+import cudf
+import click
+import re
+
+def _read_table(fname, ftype):
+    """Load *fname* with the cuDF reader for *ftype*; return None if unsupported."""
+    readers = {
+        'csv': cudf.read_csv,
+        # Plain-text files are parsed with the CSV reader, as before.
+        'txt': cudf.read_csv,
+        'parquet': cudf.read_parquet,
+        'json': cudf.read_json,
+        'orc': cudf.read_orc,
+    }
+    reader = readers.get(ftype)
+    if reader is None:
+        return None
+    return reader(fname)
+
+
+# NOTE: the docstring of a click command doubles as its --help text, so the
+# parameter documentation lives in this comment instead.
+#
+#   pattern     -- regular expression to search for.
+#   filenames   -- zero or more paths; each file is loaded and searched in turn.
+#   column      -- name of the single column to search; when omitted, every
+#                  column whose dtype compares equal to 'object' is searched.
+#   filetype    -- reader to use (csv, txt, parquet, json, orc); when omitted,
+#                  the text after the last '.' in the file name is used.
+#   ignore_case -- match case-insensitively.
+#   show_row    -- include the row index label in each output line.
+#
+# One line is printed per match: "<file>:<value>" or "<file>:<index>:<value>".
+@click.command()
+@click.argument('pattern')
+@click.argument('filenames', nargs=-1)
+@click.option('--column', '-c', default=None, help='Column to search (default: all string columns)')
+@click.option('--filetype', '-t', default=None, help='File type: csv, parquet, json, orc, txt (default: auto)')
+@click.option('--ignore-case', '-i', is_flag=True, help='Ignore case distinctions')
+@click.option('--show-row', is_flag=True, help='Show row number in output')
+def cugrep(pattern, filenames, column, filetype, ignore_case, show_row):
+    """Search for PATTERN in each FILE using cuDF on the GPU."""
+    for fname in filenames:
+        # Auto-detect the file type from the extension unless one was given.
+        # Lower-casing also lets "-t CSV" select the csv reader.
+        ftype = (filetype or fname.split('.')[-1]).lower()
+        df = _read_table(fname, ftype)
+        if df is None:
+            click.echo(f"Unsupported file type: {ftype}")
+            continue
+        # Select columns to search
+        if column:
+            cols = [column] if column in df.columns else []
+        else:
+            cols = [col for col in df.columns if df[col].dtype == 'object']
+        if not cols:
+            click.echo(f"No string columns to search in {fname}")
+            continue
+        for col in cols:
+            series = df[col]
+            # cuDF documents the `na` argument of str.contains as unsupported,
+            # so null rows are excluded by filling the mask afterwards.
+            # NOTE(review): confirm against the cuDF version being targeted.
+            mask = series.str.contains(
+                pattern, regex=True, case=not ignore_case
+            ).fillna(False)
+            # cuDF DataFrames cannot be iterated with iterrows(); copy only the
+            # matched values of this column to the host and iterate there.
+            hits = series[mask].to_pandas()
+            for idx, value in hits.items():
+                prefix = f"{fname}:{idx}:" if show_row else f"{fname}:"
+                click.echo(f"{prefix}{value}")
+
+if __name__ == '__main__':
+    cugrep()
@@ -0,0 +1,76 @@
+import os
+import sys
+import pytest
+from cudf.core.tools.text import cugrep
+
+def test_cugrep_basic(tmp_path):
+    """Searching a single named column prints only the matching values."""
+    from click.testing import CliRunner
+
+    # Three-row CSV; two rows of col1 contain "hello".
+    sample = tmp_path / "test.csv"
+    sample.write_text("""col1,col2\nhello world,foo\nbar baz,qux\nhello cudf,bar\n""")
+
+    outcome = CliRunner().invoke(
+        cugrep.cugrep, ["hello", str(sample), "--column", "col1"]
+    )
+
+    assert outcome.exit_code == 0
+    for expected in ("hello world", "hello cudf"):
+        assert expected in outcome.output
+    assert "bar baz" not in outcome.output
+
+def test_cugrep_all_columns(tmp_path):
+    """Without --column, matches from every string column are reported."""
+    from click.testing import CliRunner
+
+    sample = tmp_path / "test2.csv"
+    sample.write_text("""col1,col2\nhello world,foo\nbar baz,hello again\nhello cudf,bar\n""")
+
+    # No --column option: col1 and col2 should both be searched.
+    outcome = CliRunner().invoke(cugrep.cugrep, ["hello", str(sample)])
+
+    assert outcome.exit_code == 0
+    for expected in ("hello world", "hello cudf", "hello again"):
+        assert expected in outcome.output
+
+def test_cugrep_ignore_case(tmp_path):
+    """--ignore-case lets a lower-case pattern match mixed-case values."""
+    from click.testing import CliRunner
+
+    sample = tmp_path / "test3.csv"
+    sample.write_text("col1\nHello World\nHELLO cudf\nno match\n")
+
+    cli_args = ["hello", str(sample), "--ignore-case", "--column", "col1"]
+    outcome = CliRunner().invoke(cugrep.cugrep, cli_args)
+
+    assert outcome.exit_code == 0
+    for expected in ("Hello World", "HELLO cudf"):
+        assert expected in outcome.output
+    assert "no match" not in outcome.output
+
+def test_cugrep_missing_column(tmp_path):
+    """Naming a column that is absent from the file exits cleanly."""
+    from click.testing import CliRunner
+
+    sample = tmp_path / "test4.csv"
+    sample.write_text("col1\nfoo\nbar\n")
+
+    # "not_a_col" is not a column of the CSV written above.
+    outcome = CliRunner().invoke(
+        cugrep.cugrep, ["foo", str(sample), "--column", "not_a_col"]
+    )
+
+    assert outcome.exit_code == 0
+    printed = outcome.output
+    assert printed == "" or "No string columns to search" in printed
+
+def test_cugrep_unsupported_filetype(tmp_path):
+    """A file with an unrecognised extension is reported and skipped."""
+    from click.testing import CliRunner
+
+    odd_file = tmp_path / "test.unsupported"
+    odd_file.write_text("irrelevant content")
+
+    outcome = CliRunner().invoke(cugrep.cugrep, ["foo", str(odd_file)])
+
+    assert outcome.exit_code == 0
+    assert "Unsupported file type" in outcome.output
+
+def test_cugrep_regex_match(tmp_path):
+    """A regular-expression pattern selects only the values it matches."""
+    csv_content = "col1\nfoo123\nbar456\nfoo789\n"
+    file_path = tmp_path / "test5.csv"
+    file_path.write_text(csv_content)
+    from click.testing import CliRunner
+    runner = CliRunner()
+    # Regex: match foo followed by digits.
+    # In a raw string a single backslash is already literal; r"foo\\d+" would
+    # look for a literal backslash followed by "d"s and match none of the rows.
+    result = runner.invoke(cugrep.cugrep, [r"foo\d+", str(file_path), "--column", "col1"])
+    assert result.exit_code == 0
+    assert "foo123" in result.output
+    assert "foo789" in result.output
+    assert "bar456" not in result.output