Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
strategy:
fail-fast: false
matrix:
php: ['8.1', '8.2']
php: ['8.1', '8.2', '8.3', '8.4']
steps:
- name: Setup PHP
uses: shivammathur/setup-php@v2
Expand Down
81 changes: 81 additions & 0 deletions src/Levenshtein/Automaton.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?php

declare(strict_types=1);

namespace Toflar\StateSetIndex\Levenshtein;

/**
 * Edit-distance automaton for one fixed target string.
 *
 * A state is an array that maps a position inside the target string (0 = nothing
 * of the target consumed yet, target length = target fully consumed) to the lowest
 * accumulated cost with which that position has been reached so far.
 */
class Automaton
{
    /**
     * @var array<int, string> The target string split into multibyte characters
     */
    private array $chars;

    /**
     * Number of multibyte characters in the target string.
     */
    private int $length;

    public function __construct(
        private string $string,
        private int $maxDistance,
        private int $insertionCost = 1,
        private int $deletionCost = 1,
        private int $replacementCost = 1,
        private int $transpositionCost = 1
    ) {
        $this->chars = mb_str_split($this->string);
        $this->length = mb_strlen($this->string);
    }

    /**
     * Tells whether at least one position of the state is still within the maximum distance.
     *
     * An empty state yields false.
     */
    public function canMatch(array $state): bool
    {
        $withinBudget = false;

        foreach ($state as $cost) {
            if ($cost <= $this->maxDistance) {
                $withinBudget = true;
                break;
            }
        }

        return $withinBudget;
    }

    /**
     * Tells whether the end of the target string has been reached within the maximum distance.
     */
    public function isMatch(array $state): bool
    {
        $finalCost = $state[$this->length] ?? null;

        return null !== $finalCost && $finalCost <= $this->maxDistance;
    }

    /**
     * Returns the initial state: position 0 of the target at a cost of 0.
     */
    public function start(): array
    {
        return [0 => 0];
    }

    /**
     * Consumes one input character and returns the resulting state.
     *
     * Positions of the incoming state whose cost already exceeds the maximum distance
     * are not expanded. The returned state itself may still contain costs above the
     * maximum distance; those are only skipped on the following step.
     */
    public function step(array $state, string $inputChar): array
    {
        $next = [];

        foreach ($state as $pos => $cost) {
            if ($cost > $this->maxDistance) {
                continue;
            }

            // Insertion: the input character is extra, the target position does not move.
            $next[$pos] = min($next[$pos] ?? PHP_INT_MAX, $cost + $this->insertionCost);

            // Every remaining transition advances in the target, which is impossible at its end.
            if ($pos >= $this->length) {
                continue;
            }

            // Deletion and replacement/match both land on the next target position,
            // so the cheapest of the two (and of anything recorded earlier) is kept.
            $advanced = $pos + 1;
            $substitution = $this->chars[$pos] === $inputChar ? 0 : $this->replacementCost;
            $next[$advanced] = min(
                $next[$advanced] ?? PHP_INT_MAX,
                $cost + $this->deletionCost,
                $cost + $substitution
            );

            // Transposition: the input equals the target character one position ahead,
            // which moves two positions forward in the target.
            if ($pos < $this->length - 1 && ($this->chars[$advanced] ?? null) === $inputChar) {
                $skipped = $pos + 2;
                $next[$skipped] = min($next[$skipped] ?? PHP_INT_MAX, $cost + $this->transpositionCost);
            }
        }

        return $next;
    }
}
62 changes: 62 additions & 0 deletions src/Levenshtein/TrieFilter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php

namespace Toflar\StateSetIndex\Levenshtein;

/**
 * Filters strings through an Automaton while remembering, per character prefix,
 * whether the automaton could still match after that prefix.
 *
 * The prefixes are kept in a trie that lives as long as this instance, so the
 * "can still match" verdict of a prefix is computed only once.
 */
class TrieFilter
{
    /**
     * Root node of the prefix trie. Every node has the shape
     * ['children' => array<string, node>, 'canMatch' => bool]; only the root carries null.
     */
    private array $trie = [
        'children' => [],
        'canMatch' => null,
    ];

    public function __construct(
        private Automaton $automaton
    ) {
    }

    /**
     * Returns the strings accepted by the automaton, keeping their original array keys.
     */
    public function filterStrings(array $strings): array
    {
        $accepted = [];

        foreach ($strings as $key => $candidate) {
            $state = $this->automaton->start();
            // Walk the trie by reference so that newly created nodes end up in $this->trie.
            $node = &$this->trie;

            foreach (mb_str_split($candidate) as $char) {
                $state = $this->automaton->step($state, $char);

                // First time this prefix is seen: record whether it can still lead to a match.
                if (!isset($node['children'][$char])) {
                    $node['children'][$char] = [
                        'children' => [],
                        'canMatch' => $this->automaton->canMatch($state),
                    ];
                }

                $node = &$node['children'][$char];

                // Dead prefix: no extension of it can match, move on to the next string.
                if (false === $node['canMatch']) {
                    continue 2;
                }
            }

            if ($this->automaton->isMatch($state)) {
                $accepted[$key] = $candidate;
            }
        }

        return $accepted;
    }

    /**
     * Tells whether a single string is accepted by the automaton.
     */
    public function matches(string $string): bool
    {
        return $this->filterStrings([$string]) !== [];
    }
}
30 changes: 23 additions & 7 deletions src/StateSetIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

use Toflar\StateSetIndex\Alphabet\AlphabetInterface;
use Toflar\StateSetIndex\DataStore\DataStoreInterface;
use Toflar\StateSetIndex\Levenshtein\Automaton;
use Toflar\StateSetIndex\Levenshtein\TrieFilter;
use Toflar\StateSetIndex\StateSet\CostAnnotatedStateSet;
use Toflar\StateSetIndex\StateSet\StateSetInterface;

Expand All @@ -19,6 +21,11 @@ class StateSetIndex
*/
private array $matchingStatesCache = [];

/**
* @var array<string, TrieFilter>
*/
private array $trieFilters = [];

public function __construct(
private Config $config,
private AlphabetInterface $alphabet,
Expand All @@ -34,18 +41,17 @@ public function __construct(
*/
public function find(string $string, int $editDistance): array
{
$cacheKey = $string . "\0" . $editDistance;
$acceptedStringsPerState = $this->findAcceptedStrings($string, $editDistance);
$stringLength = mb_strlen($string);
$filtered = [];

if (!isset($this->trieFilters[$cacheKey])) {
$this->trieFilters[$cacheKey] = new TrieFilter(new Automaton($string, $editDistance));
}

foreach ($acceptedStringsPerState as $acceptedStrings) {
foreach ($acceptedStrings as $acceptedString) {
// Early aborts (cheaper) for cases we know are absolutely never going to match
if (abs($stringLength - mb_strlen($acceptedString)) > $editDistance) {
continue;
}

if (Levenshtein::distance($string, $acceptedString) <= $editDistance) {
if ($this->trieFilters[$cacheKey]->matches($acceptedString)) {
$filtered[] = $acceptedString;
}
}
Expand Down Expand Up @@ -170,6 +176,16 @@ public function index(array $strings): array
return $assigned;
}

/**
 * Clears all internal caches.
 *
 * Call this in long-running processes so that the caches do not leak memory.
 */
public function reset(): void
{
    $this->indexCache = $this->matchingStatesCache = $this->trieFilters = [];
}

private function getReachableStates(int $startState, int $editDistance, int $currentDistance = 0): CostAnnotatedStateSet
{
$reachable = new CostAnnotatedStateSet();
Expand Down
79 changes: 79 additions & 0 deletions tests/Levenshtein/AutomatonTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<?php

namespace Toflar\StateSetIndex\Test\Levenshtein;

use PHPUnit\Framework\TestCase;
use Toflar\StateSetIndex\Levenshtein\Automaton;
use Toflar\StateSetIndex\Levenshtein\TrieFilter;

/**
 * Exercises the Automaton transitions against the target string "foobar".
 */
class AutomatonTest extends TestCase
{
    public function testAutomaton(): void
    {
        $automaton = new Automaton('foobar', 2, 1, 1, 2, 1);

        $state = $automaton->start();
        self::assertSame([0 => 0], $state);

        // 'f' equals the first target character
        $state = $automaton->step($state, 'f');
        self::assertSame([0 => 1, 1 => 0], $state);
        self::assertTrue($automaton->canMatch($state));
        self::assertFalse($automaton->isMatch($state));

        // 'o' equals the second target character
        $state = $automaton->step($state, 'o');
        self::assertSame([0 => 2, 1 => 1, 2 => 0, 3 => 1], $state);
        self::assertTrue($automaton->canMatch($state));
        self::assertFalse($automaton->isMatch($state));

        // Second 'o' and 'b': still on track, but the end of the target is not reached yet
        foreach (['o', 'b'] as $char) {
            $state = $automaton->step($state, $char);
            self::assertTrue($automaton->canMatch($state));
            self::assertFalse($automaton->isMatch($state));
        }

        // 'a' and 'r' complete the target string
        $state = $automaton->step($state, 'a');
        $state = $automaton->step($state, 'r');
        self::assertTrue($automaton->canMatch($state));
        self::assertTrue($automaton->isMatch($state));
    }

    public function testTransposition(): void
    {
        $automaton = new Automaton('foobar', 2, 1, 1, 2, 1);

        $state = $automaton->start();
        self::assertSame([0 => 0], $state);

        // Feed the first two target characters in swapped order
        $state = $automaton->step($state, 'o');
        $state = $automaton->step($state, 'f');
        self::assertTrue($automaton->canMatch($state));
        self::assertFalse($automaton->isMatch($state));
    }

    public function testCosts(): void
    {
        // Maximum distance 3, insertion 2, deletion 1, replacement 3, transposition 1
        $automaton = new Automaton('foobar', 3, 2, 1, 3, 1);

        $state = $automaton->start();
        self::assertSame([0 => 0], $state);

        // 'f' matches at no cost; staying at position 0 costs one insertion (2)
        $state = $automaton->step($state, 'f');
        self::assertSame([0 => 2, 1 => 0], $state);
        self::assertTrue($automaton->canMatch($state));
        self::assertFalse($automaton->isMatch($state));

        // 'x' does not occur in the target string
        $state = $automaton->step($state, 'x');
        self::assertFalse($automaton->isMatch($state));
    }
}
Loading