Commit d3149e8

feat: Optimize load_policy_line to avoid quadratic individual-character loop (#355)

1 parent b9dba51 commit d3149e8

4 files changed: +122 -16 lines changed
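Why this is faster: the old loop grew each token one character at a time with `tokens[-1] += c`. Because that string is also referenced from the list, CPython cannot resize it in place, so every `+=` copies the whole token built so far, and an n-character token costs O(n^2) overall. The new code finds the delimiters with a regex and takes one slice per token, which is linear. A standalone sketch of the two strategies (the names per_char and sliced are illustrative, not from this commit; per_char omits bracket handling for brevity) makes the scaling visible:

import re
import timeit

_TOKEN_RE = re.compile(r"[,\[\]\(\)]")


def per_char(line):
    # old strategy, simplified: build each token one character at a
    # time; each += copies the entire token accumulated so far
    tokens = [""]
    for c in line:
        if c == ",":
            tokens.append("")
        else:
            tokens[-1] += c
    return [t.strip() for t in tokens]


def sliced(line):
    # new strategy: locate delimiters with a regex, then take exactly
    # one slice per token
    tokens, start = [], 0
    for m in _TOKEN_RE.finditer(line):
        if m.group() == ",":
            tokens.append(line[start:m.start()].strip())
            start = m.end()
    tokens.append(line[start:].strip())
    return tokens


for n in (5_000, 10_000, 20_000):
    line = "x" * n  # a single long token, the worst case for per_char
    t_old = timeit.timeit(lambda: per_char(line), number=5)
    t_new = timeit.timeit(lambda: sliced(line), number=5)
    print(f"n={n}: per_char {t_old:.3f}s, sliced {t_new:.4f}s")

Doubling n roughly doubles the sliced time but roughly quadruples the per_char time.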

.github/workflows/build.yml

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ jobs:
         tests/benchmarks/benchmark_model.py
         tests/benchmarks/benchmark_management_api.py
         tests/benchmarks/benchmark_role_manager.py
+        tests/benchmarks/benchmark_adapter.py
 
     - name: Upload coverage data to coveralls.io
       run: coveralls --service=github

casbin/persist/adapter.py

Lines changed: 38 additions & 16 deletions

@@ -12,34 +12,56 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
 
-def load_policy_line(line, model):
-    """loads a text line as a policy rule to model."""
+_INTERESTING_TOKENS_RE = re.compile(r"[,\[\]\(\)]")
+
+
+def _extract_tokens(line):
+    """Return the list of 'tokens' from the line, or None if this line has none"""
 
     if line == "":
-        return
+        return None
 
     if line[:1] == "#":
-        return
+        return None
 
     stack = []
     tokens = []
-    for c in line:
+
+    # The tokens are separated by commas, but we support nesting so a naive `line.split(",")` is
+    # wrong. E.g. `abc(def, ghi), jkl` is two tokens: `abc(def, ghi)` and `jkl`. We do this by
+    # iterating over the locations of any tokens of interest, and either:
+    #
+    # - [](): adjust the nesting depth
+    # - ,: slice the line to save the token, if the , is at the top-level, outside all []()
+    #
+    # `start_idx` represents the start of the current token, that we haven't seen a `,` for yet.
+    start_idx = 0
+    for match in _INTERESTING_TOKENS_RE.finditer(line):
+        c = match.group()
         if c == "[" or c == "(":
             stack.append(c)
-            tokens[-1] += c
         elif c == "]" or c == ")":
             stack.pop()
-            tokens[-1] += c
-        elif c == "," and len(stack) == 0:
-            tokens.append("")
-        else:
-            if len(tokens) == 0:
-                tokens.append(c)
-            else:
-                tokens[-1] += c
-
-    tokens = [x.strip() for x in tokens]
+        elif not stack:
+            # must be a comma outside of any nesting: we've found the end of a top level token so
+            # save that and start a new one
+            tokens.append(line[start_idx : match.start()].strip())
+            start_idx = match.end()
+
+    # trailing token after the last ,
+    tokens.append(line[start_idx:].strip())
+
+    return tokens
+
+
+def load_policy_line(line, model):
+    """loads a text line as a policy rule to model."""
+
+    tokens = _extract_tokens(line)
+    if tokens is None:
+        return
 
     key = tokens[0]
     sec = key[0]
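To make the new helper concrete, here is a small usage example (hypothetical snippet, not part of the commit; the expected output shown in the comments matches the diff's own comment and the tests below):

from casbin.persist.adapter import _extract_tokens

# plain policy line: split on top-level commas, outer whitespace stripped
print(_extract_tokens("p, alice, data1, read"))
# ['p', 'alice', 'data1', 'read']

# commas inside () or [] do not end a token
print(_extract_tokens("abc(def, ghi), jkl"))
# ['abc(def, ghi)', 'jkl']

# empty lines and comments yield None, so load_policy_line skips them
print(_extract_tokens("# a comment"))
# None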

tests/benchmarks/benchmark_adapter.py

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+from casbin.persist.adapter import _extract_tokens
+
+
+def _benchmark_extract_tokens(benchmark, line):
+    @benchmark
+    def run_benchmark():
+        _extract_tokens(line)
+
+
+def test_benchmark_extract_tokens_short_simple(benchmark):
+    _benchmark_extract_tokens(benchmark, "abc,def,ghi")
+
+
+def test_benchmark_extract_tokens_long_simple(benchmark):
+    # fixed UUIDs for length and to be similar to "real world" usage of UUIDs
+    _benchmark_extract_tokens(
+        benchmark,
+        "00000000-0000-0000-0000-000000000000,00000000-0000-0000-0000-000000000001,00000000-0000-0000-0000-000000000002",
+    )
+
+
+def test_benchmark_extract_tokens_short_nested(benchmark):
+    _benchmark_extract_tokens(benchmark, "abc(def,ghi),jkl(mno,pqr)")
+
+
+def test_benchmark_extract_tokens_long_nested(benchmark):
+    _benchmark_extract_tokens(
+        benchmark,
+        "00000000-0000-0000-0000-000000000000(00000000-0000-0000-0000-000000000001,00000000-0000-0000-0000-000000000002),00000000-0000-0000-0000-000000000003(00000000-0000-0000-0000-000000000004,00000000-0000-0000-0000-000000000005)",
+    )
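These benchmarks use the `benchmark` fixture from the pytest-benchmark plugin: applying the fixture as a decorator runs and times the decorated function. Assuming pytest-benchmark is installed, they can be run in isolation with something like `pytest tests/benchmarks/benchmark_adapter.py`. The short/long and simple/nested variants separate the cost of scanning long tokens from the cost of tracking bracket nesting.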

tests/persist/test_adapter.py

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+from casbin.persist.adapter import _extract_tokens
+from tests import TestCaseBase
+
+
+class TestExtractTokens(TestCaseBase):
+    def test_ignore_lines(self):
+        self.assertIsNone(_extract_tokens(""))  # empty
+        self.assertIsNone(_extract_tokens("# comment"))
+
+    def test_simple_lines(self):
+        # split on top-level commas, strip whitespace from start and end
+        self.assertEqual(_extract_tokens("one"), ["one"])
+        self.assertEqual(_extract_tokens("one,two"), ["one", "two"])
+        self.assertEqual(_extract_tokens(" ignore \t,\t external, spaces "), ["ignore", "external", "spaces"])
+
+        self.assertEqual(_extract_tokens("internal spaces preserved"), ["internal spaces preserved"])
+
+    def test_nested_lines(self):
+        # basic nesting within a single token
+        self.assertEqual(
+            _extract_tokens("outside1()"),
+            ["outside1()"],
+        )
+        self.assertEqual(
+            _extract_tokens("outside1(inside1())"),
+            ["outside1(inside1())"],
+        )
+
+        # split on top-level commas, but not on internal ones
+        self.assertEqual(
+            _extract_tokens("outside1(inside1(), inside2())"),
+            ["outside1(inside1(), inside2())"],
+        )
+        self.assertEqual(
+            _extract_tokens("outside1(inside1(), inside2(inside3(), inside4()))"),
+            ["outside1(inside1(), inside2(inside3(), inside4()))"],
+        )
+        self.assertEqual(
+            _extract_tokens("outside1(inside1(), inside2()), outside2(inside3(), inside4())"),
+            ["outside1(inside1(), inside2())", "outside2(inside3(), inside4())"],
+        )
+
+        # different delimiters
+        self.assertEqual(
+            _extract_tokens(
+                "all_square[inside1[], inside2[]],square_and_parens[inside1(), inside2()],parens_and_square(inside1[], inside2[])"
+            ),
+            [
+                "all_square[inside1[], inside2[]]",
+                "square_and_parens[inside1(), inside2()]",
+                "parens_and_square(inside1[], inside2[])",
+            ],
+        )
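Beyond these unit tests, a throwaway cross-check (hypothetical, not part of the commit) can confirm that the regex-based tokenizer agrees with the old character-by-character loop on well-formed lines:

from casbin.persist.adapter import _extract_tokens


def reference(line):
    # the pre-optimization loop, lightly simplified (tokens starts as
    # [""] so the len(tokens) == 0 branch is unnecessary)
    if line == "" or line[:1] == "#":
        return None
    stack = []
    tokens = [""]
    for c in line:
        if c == "[" or c == "(":
            stack.append(c)
            tokens[-1] += c
        elif c == "]" or c == ")":
            stack.pop()
            tokens[-1] += c
        elif c == "," and len(stack) == 0:
            tokens.append("")
        else:
            tokens[-1] += c
    return [x.strip() for x in tokens]


for line in ["p, alice, data1, read", "abc(def, ghi), jkl", "outside1(inside1(), inside2())"]:
    assert _extract_tokens(line) == reference(line), line
print("ok")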
