From 921dd825deb4ed9fc770986349496b5e1ed522ed Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 02:30:34 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function=20`w?= =?UTF-8?q?ord=5Ffrequency`=20by=2022%?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **21% speedup** by replacing the manual dictionary construction loop with Python's built-in `Counter` class from the collections module. **Key optimization applied:** - **Eliminated the manual loop**: The original code iterates through each word, checks if it exists in the dictionary (`if word in frequency`), and either increments or initializes the count. This involves multiple dictionary lookups and assignments. - **Used Counter's C-accelerated counting**: `Counter` is a `dict` subclass whose counting loop runs in a C helper in CPython, avoiding the overhead of Python's interpreted loop execution. **Why this leads to speedup:** The original code performs a membership test followed by either an increment or an insert for every word, so each word costs several dictionary operations, all dispatched through the bytecode interpreter. The line profiler shows that 64.4% of the total time (33.1% + 31.3%) is spent on the loop iteration and dictionary membership checks. Counter does not remove the per-word dictionary update, but it performs the entire counting loop in C, so the per-iteration interpreter overhead disappears. 
**Performance characteristics by test case type:** - **Small inputs (< 10 words)**: Optimized version is actually **50-76% slower** due to Counter's initialization overhead outweighing the simple loop benefits - **Large inputs (500+ words)**: Optimized version shows **12-70% speedup**, with the greatest gains on highly repetitive data (like `test_large_repeated_words` at 69.9% faster) - **Medium repetitive datasets**: Best performance gains occur when the same words appear multiple times, as Counter's internal optimizations for duplicate counting become more beneficial than the original's repeated dictionary lookups The optimization trades initialization overhead for loop efficiency, making it most effective on larger datasets with word repetition. --- src/dsa/various.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/dsa/various.py b/src/dsa/various.py index 4356039..2456d46 100644 --- a/src/dsa/various.py +++ b/src/dsa/various.py @@ -1,4 +1,5 @@ import re +from collections import Counter class Graph: @@ -78,14 +79,8 @@ def is_palindrome(text: str) -> bool: def word_frequency(text: str) -> dict[str, int]: - words = text.lower().split() - frequency = {} - for word in words: - if word in frequency: - frequency[word] += 1 - else: - frequency[word] = 1 - return frequency + # Use Counter for faster word counting + return dict(Counter(text.lower().split())) class PathFinder: