Merge pull request #87 from nol13/autojunk2

nol13 · web-flow · commit 16a493574bcf · 2025-08-14T20:45:25.000-04:00
Autojunk2
diff --git a/README.md b/README.md
@@ -454,7 +454,7 @@ Pass options to fuzz.unique_tokens as the second argument if you're using wildca
 ### Alternate Ratio Calculations
 
 
-If you want to use difflib's ratio function for all ratio calculations, which differs slightly from the default python-Levenshtein style behavior, you can specify options.ratio_alg = "difflib". The difflib calculation is a bit different in that it's based on matching characters rather than true minimum edit distance, but the results are usually pretty similar. Difflib uses the formula 2.0*M / T  where M is the number of matches, and T is the total number of elements in both sequences. This mirrors the behavior of fuzzywuzzy when not using python-Levenshtein. Not all features (wildcards, collation) supported when using difflib ratio.
+If you want to use difflib's ratio function for all ratio calculations, which differs slightly from the default python-Levenshtein style behavior, you can specify options.ratio_alg = "difflib". The difflib calculation is a bit different in that it's based on matching characters rather than true minimum edit distance, but the results are usually pretty similar. Difflib uses the formula 2.0*M / T, where M is the number of matches and T is the total number of elements in both sequences. This mirrors the behavior of fuzzywuzzy when not using python-Levenshtein. When using difflib, you can also set `options.autojunk` to `false` to disable the automatic junk heuristic that treats popular elements as junk. Not all features (wildcards, collation) are supported when using the difflib ratio.
 
 Except when using difflib, the ratios are calculated as ((str1.length + str2.length) - distance) / (str1.length + str2.length), where distance is calculated with a substitution cost of 2. This follows the behavior of python-Levenshtein, however the fuzz.distance function still uses a cost of 1 by default for all operations if just calculating distance and not a ratio.
 
diff --git a/dist/esm/fuzzball.esm.min.js b/dist/esm/fuzzball.esm.min.js
diff --git a/dist/fuzzball.umd.min.js b/dist/fuzzball.umd.min.js
diff --git a/fuzzball.d.ts b/fuzzball.d.ts
@@ -29,6 +29,24 @@ export interface FuzzballBaseOptions {
     normalize?: boolean;
 }
 
+export interface FuzzballRatioOptions extends FuzzballBaseOptions {
+    /**
+     * Ratio algorithm to use: either "levenshtein" or "difflib". Defaults to "levenshtein".
+     */
+    ratio_alg?: 'levenshtein' | 'difflib';
+    /**
+     * Autojunk argument passed to difflib; only used when ratio_alg is "difflib". Defaults to true.
+     */
+    autojunk?: boolean;
+}
+
+export interface FuzzballPartialRatioOptions extends FuzzballBaseOptions {
+    /**
+     * Autojunk argument passed to difflib's SequenceMatcher. Defaults to true.
+     */
+    autojunk?: boolean;
+}
+
 export interface FuzzballTokenSetOptions extends FuzzballBaseOptions {
     /**
      * Include ratio as part of token set test suite
@@ -64,7 +82,15 @@ interface FuzzballExtractBaseOptions extends FuzzballBaseOptions {
     /**
      * Sort tokens by similarity before combining with token set scorers
      */
-    sortBySimilarity?: boolean
+    sortBySimilarity?: boolean;
+    /**
+     * A string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+     */
+    ratio_alg?: 'levenshtein' | 'difflib';
+    /**
+     * Autojunk argument passed to difflib if you're using the ratio_alg option, default true
+     */
+    autojunk?: boolean;
 }
 
 interface AbortController {
@@ -170,14 +196,14 @@ export interface FuzzballDedupeObjOptionsWithMap extends FuzzballExtractObjectOp
 }
 
 export function distance(str1: string, str2: string, opts?: FuzzballBaseOptions): number;
-export function ratio(str1: string, str2: string, opts?: FuzzballBaseOptions): number;
-export function partial_ratio(str1: string, str2: string, opts?: FuzzballBaseOptions): number;
-export function token_set_ratio(str1: string, str2: string, opts?: FuzzballTokenSetOptions): number;
-export function token_sort_ratio(str1: string, str2: string, opts?: FuzzballBaseOptions): number;
-export function token_similarity_sort_ratio(str1: string, str2: string, opts?: FuzzballTokenSetOptions): number;
-export function partial_token_set_ratio(str1: string, str2: string, opts?: FuzzballTokenSetOptions): number;
-export function partial_token_sort_ratio(str1: string, str2: string, opts?: FuzzballBaseOptions): number;
-export function partial_token_similarity_sort_ratio(str1: string, str2: string, opts?: FuzzballTokenSetOptions): number;
+export function ratio(str1: string, str2: string, opts?: FuzzballRatioOptions): number;
+export function partial_ratio(str1: string, str2: string, opts?: FuzzballPartialRatioOptions): number;
+export function token_set_ratio(str1: string, str2: string, opts?: FuzzballRatioOptions & FuzzballTokenSetOptions): number;
+export function token_sort_ratio(str1: string, str2: string, opts?: FuzzballRatioOptions): number;
+export function token_similarity_sort_ratio(str1: string, str2: string, opts?: FuzzballRatioOptions & FuzzballTokenSetOptions): number;
+export function partial_token_set_ratio(str1: string, str2: string, opts?: FuzzballPartialRatioOptions & FuzzballTokenSetOptions): number;
+export function partial_token_sort_ratio(str1: string, str2: string, opts?: FuzzballPartialRatioOptions): number;
+export function partial_token_similarity_sort_ratio(str1: string, str2: string, opts?: FuzzballPartialRatioOptions & FuzzballTokenSetOptions): number;
 export function WRatio(str1: string, str2: string, opts?: FuzzballTokenSetOptions): number;
 export function full_process(str: string, options?: FuzzballExtractOptions | boolean): string;
 export function process_and_sort(str: string): string;
diff --git a/fuzzball.js b/fuzzball.js
@@ -81,6 +81,8 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {string} [options_p.ratio_alg] - a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib if you're using the ratio_alg option, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -108,6 +110,7 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -136,6 +139,8 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {string} [options_p.ratio_alg] - a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib if you're using the ratio_alg option, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -164,6 +169,7 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -191,6 +197,8 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {string} [options_p.ratio_alg] - a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib if you're using the ratio_alg option, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -221,6 +229,7 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -252,6 +261,8 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {string} [options_p.ratio_alg] - a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib if you're using the ratio_alg option, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -278,6 +289,7 @@
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {number} [options_p.astral] - Use astral aware calculation
          * @param {string} [options_p.normalize] - Normalize unicode representations
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib, default true
          * @returns {number} - the levenshtein ratio (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -364,6 +376,8 @@
          * @param {boolean} [options_p.sortBySimilarity] - sort tokens by similarity to each other before combining instead of alphabetically
          * @param {string} [options_p.wildcards] - characters that will be used as wildcards if provided
          * @param {boolean} [options_p.returnObjects] - return array of object instead of array of tuples; default false
+         * @param {string} [options_p.ratio_alg] - a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib if you're using the ratio_alg option, default true
          * @returns {Array[] | Object[]} - array of choice results with their computed ratios (0-100).
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -510,6 +524,8 @@
          * @param {Object} [options_p.abortController] - track abortion
          * @param {Object} [options_p.cancelToken] - track cancellation
          * @param {number} [options_p.asyncLoopOffset] - number of rows to run in between every async loop iteration, default 256
+         * @param {string} [options_p.ratio_alg] - a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein"
+         * @param {boolean} [options_p.autojunk] - autojunk argument passed to difflib if you're using the ratio_alg option, default true
          * @param {function} callback - node style callback (err, arrayOfResults)
          */
         var options = clone_and_set_option_defaults(options_p);
@@ -894,7 +910,7 @@
         if (!validate(str1)) return 0;
         if (!validate(str2)) return 0;
         if (options.ratio_alg && options.ratio_alg === "difflib") {
-            var m = new SequenceMatcher(null, str1, str2);
+            var m = new SequenceMatcher(null, str1, str2, options.autojunk);
             var r = m.ratio();
             return Math.round(100 * r);
         }
@@ -929,7 +945,7 @@
             var shorter = str2
             var longer = str1
         }
-        var m = new SequenceMatcher(null, shorter, longer);
+        var m = new SequenceMatcher(null, shorter, longer, options.autojunk);
         var blocks = m.getMatchingBlocks();
         var scores = [];
         for (var b = 0; b < blocks.length; b++) {
diff --git a/jsdocs/fuzzball.md b/jsdocs/fuzzball.md
@@ -59,6 +59,8 @@ Calculate levenshtein ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.ratio_alg] | <code>string</code> | a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein" |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib if you're using the ratio_alg option, default true |
 
 <a name="module_fuzzball..partial_ratio"></a>
 
@@ -80,6 +82,7 @@ Calculate partial levenshtein ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib, default true |
 
 <a name="module_fuzzball..token_set_ratio"></a>
 
@@ -102,6 +105,8 @@ Calculate token set ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.ratio_alg] | <code>string</code> | a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein" |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib if you're using the ratio_alg option, default true |
 
 <a name="module_fuzzball..partial_token_set_ratio"></a>
 
@@ -124,6 +129,7 @@ Calculate partial token ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib, default true |
 
 <a name="module_fuzzball..token_sort_ratio"></a>
 
@@ -144,6 +150,8 @@ Calculate token sort ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.ratio_alg] | <code>string</code> | a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein" |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib if you're using the ratio_alg option, default true |
 
 <a name="module_fuzzball..partial_token_sort_ratio"></a>
 
@@ -164,6 +172,7 @@ Calculate partial token sort ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib, default true |
 
 <a name="module_fuzzball..token_similarity_sort_ratio"></a>
 
@@ -184,6 +193,8 @@ Calculate token sort ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.ratio_alg] | <code>string</code> | a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein" |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib if you're using the ratio_alg option, default true |
 
 <a name="module_fuzzball..partial_token_similarity_sort_ratio"></a>
 
@@ -204,6 +215,7 @@ Calculate token sort ratio of the two strings.
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.astral] | <code>number</code> | Use astral aware calculation |
 | [options_p.normalize] | <code>string</code> | Normalize unicode representations |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib, default true |
 
 <a name="module_fuzzball..WRatio"></a>
 
@@ -253,6 +265,8 @@ Return the top scoring items from an array (or assoc array) of choices
 | [options_p.sortBySimilarity] | <code>boolean</code> | sort tokens by similarity to each other before combining instead of alphabetically |
 | [options_p.wildcards] | <code>string</code> | characters that will be used as wildcards if provided |
 | [options_p.returnObjects] | <code>boolean</code> | return array of object instead of array of tuples; default false |
+| [options_p.ratio_alg] | <code>string</code> | a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein" |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib if you're using the ratio_alg option, default true |
 
 <a name="module_fuzzball..extractAsync"></a>
 
@@ -283,5 +297,7 @@ Return the top scoring items from an array (or assoc array) of choices
 | [options_p.abortController] | <code>Object</code> | track abortion |
 | [options_p.cancelToken] | <code>Object</code> | track cancellation |
 | [options_p.asyncLoopOffset] | <code>number</code> | number of rows to run in between every async loop iteration, default 256 |
+| [options_p.ratio_alg] | <code>string</code> | a string representing the ratio algorithm to use, either "levenshtein" or "difflib", default "levenshtein" |
+| [options_p.autojunk] | <code>boolean</code> | autojunk argument passed to difflib if you're using the ratio_alg option, default true |
 | callback | <code>function</code> | node style callback (err, arrayOfResults) |