|
| 1 | +package org.fastfilter.xor; |
| 2 | + |
| 3 | +import org.fastfilter.Filter; |
| 4 | +import org.fastfilter.utils.Hash; |
| 5 | + |
| 6 | +/** |
| 7 | + * The Xor Fuse Filter, a new algorithm that can replace a bloom filter. |
| 8 | + * |
| 9 | + * It is related to the BDZ algorithm [1] (a minimal perfect hash function |
| 10 | + * algorithm). |
| 11 | + * |
| 12 | + * [1] paper: Simple and Space-Efficient Minimal Perfect Hash Functions - |
| 13 | + * http://cmph.sourceforge.net/papers/wads07.pdf |
| 14 | + */ |
| 15 | +public class XorFuse8 implements Filter { |
| 16 | + |
| 17 | + private static final int BITS_PER_FINGERPRINT = 8; |
| 18 | + private static final int HASHES = 3; |
| 19 | + |
| 20 | + private static final int FUSE_ARITY = 3; |
| 21 | + private static final int FUSE_SEGMENT_COUNT = 100; |
| 22 | + private static final int FUSE_SLOTS = FUSE_SEGMENT_COUNT + FUSE_ARITY - 1; |
| 23 | + |
| 24 | + private final int size; |
| 25 | + private final int segmentLength; |
| 26 | + private final int arrayLength; |
| 27 | + private long seed; |
| 28 | + private byte[] fingerprints; |
| 29 | + private final int bitCount; |
| 30 | + |
| 31 | + public long getBitCount() { |
| 32 | + return bitCount; |
| 33 | + } |
| 34 | + |
| 35 | + private static int getArrayLength(int size, double factor) { |
| 36 | + int capacity = (int) (1.0 / factor * size); |
| 37 | + capacity = (capacity + FUSE_SLOTS - 1) / FUSE_SLOTS * FUSE_SLOTS; |
| 38 | + return capacity; |
| 39 | + } |
| 40 | + |
| 41 | + public static XorFuse8 construct(long[] keys) { |
| 42 | + int size = keys.length; |
| 43 | + double factor = 0.879; |
| 44 | + if (size < 1_000) { |
| 45 | + factor = 0.5; |
| 46 | + } else if (size < 10_000) { |
| 47 | + factor = 0.7; |
| 48 | + } else if (size < 100_000) { |
| 49 | + factor = 0.8; |
| 50 | + } |
| 51 | + while (true) { |
| 52 | + try { |
| 53 | + return new XorFuse8(keys, factor); |
| 54 | + } catch (UnsupportedOperationException e) { |
| 55 | + // try again with a lower load |
| 56 | + factor -= 0.1; |
| 57 | + } |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | + public XorFuse8(long[] keys, double factor) { |
| 62 | + this.size = keys.length; |
| 63 | + arrayLength = getArrayLength(size, factor); |
| 64 | + segmentLength = arrayLength / FUSE_SLOTS; |
| 65 | + bitCount = arrayLength * BITS_PER_FINGERPRINT; |
| 66 | + int m = arrayLength; |
| 67 | + long[] reverseOrder = new long[size]; |
| 68 | + byte[] reverseH = new byte[size]; |
| 69 | + int reverseOrderPos; |
| 70 | + long seed; |
| 71 | + int x = 0; |
| 72 | + do { |
| 73 | + x++; |
| 74 | + if (x > 10) { |
| 75 | + throw new UnsupportedOperationException(); |
| 76 | + } |
| 77 | + seed = Hash.randomSeed(); |
| 78 | + byte[] t2count = new byte[m]; |
| 79 | + long[] t2 = new long[m]; |
| 80 | + for (long k : keys) { |
| 81 | + for (int hi = 0; hi < HASHES; hi++) { |
| 82 | + int h = getHash(k, seed, hi); |
| 83 | + t2[h] ^= k; |
| 84 | + if (t2count[h] > 120) { |
| 85 | + // probably something wrong with the hash function |
| 86 | + throw new IllegalArgumentException(); |
| 87 | + } |
| 88 | + t2count[h]++; |
| 89 | + } |
| 90 | + } |
| 91 | + reverseOrderPos = 0; |
| 92 | + int[] alone = new int[arrayLength]; |
| 93 | + int alonePos = 0; |
| 94 | + for (int i = 0; i < arrayLength; i++) { |
| 95 | + if (t2count[ i] == 1) { |
| 96 | + alone[alonePos++] = i; |
| 97 | + } |
| 98 | + } |
| 99 | + int found = -1; |
| 100 | + while (alonePos > 0) { |
| 101 | + int i = alone[--alonePos]; |
| 102 | + if (t2count[i] <= 0) { |
| 103 | + continue; |
| 104 | + } |
| 105 | + if (t2count[i] != 1) { |
| 106 | + throw new AssertionError(); |
| 107 | + } |
| 108 | + --t2count[i]; |
| 109 | + long k = t2[i]; |
| 110 | + for (int hi = 0; hi < HASHES; hi++) { |
| 111 | + int h = getHash(k, seed, hi); |
| 112 | + int newCount = --t2count[h]; |
| 113 | + if (h == i) { |
| 114 | + found = hi; |
| 115 | + } else { |
| 116 | + if (newCount == 1) { |
| 117 | + alone[alonePos++] = h; |
| 118 | + } |
| 119 | + t2[h] ^= k; |
| 120 | + } |
| 121 | + } |
| 122 | + reverseOrder[reverseOrderPos] = k; |
| 123 | + reverseH[reverseOrderPos] = (byte) found; |
| 124 | + reverseOrderPos++; |
| 125 | + } |
| 126 | + } while (reverseOrderPos != size); |
| 127 | + this.seed = seed; |
| 128 | + byte[] fp = new byte[m]; |
| 129 | + for (int i = reverseOrderPos - 1; i >= 0; i--) { |
| 130 | + long k = reverseOrder[i]; |
| 131 | + int found = reverseH[i]; |
| 132 | + int change = -1; |
| 133 | + long hash = Hash.hash64(k, seed); |
| 134 | + int xor = fingerprint(hash); |
| 135 | + for (int hi = 0; hi < HASHES; hi++) { |
| 136 | + int h = getHash(k, seed, hi); |
| 137 | + if (found == hi) { |
| 138 | + change = h; |
| 139 | + } else { |
| 140 | + xor ^= fp[h]; |
| 141 | + } |
| 142 | + } |
| 143 | + fp[change] = (byte) xor; |
| 144 | + } |
| 145 | + fingerprints = new byte[m]; |
| 146 | + System.arraycopy(fp, 0, fingerprints, 0, fp.length); |
| 147 | + } |
| 148 | + |
| 149 | + @Override |
| 150 | + public boolean mayContain(long key) { |
| 151 | + long hash = Hash.hash64(key, seed); |
| 152 | + int f = fingerprint(hash); |
| 153 | + int r0 = (int) ((0xBF58476D1CE4E5B9L * hash) >> 32); |
| 154 | + int r1 = (int) hash; |
| 155 | + int r2 = (int) Long.rotateLeft(hash, 21); |
| 156 | + int r3 = (int) Long.rotateLeft(hash, 42); |
| 157 | + int seg = Hash.reduce(r0, FUSE_SEGMENT_COUNT); |
| 158 | + int h0 = (seg + 0) * segmentLength + Hash.reduce(r1, segmentLength); |
| 159 | + int h1 = (seg + 1) * segmentLength + Hash.reduce(r2, segmentLength); |
| 160 | + int h2 = (seg + 2) * segmentLength + Hash.reduce(r3, segmentLength); |
| 161 | + f ^= fingerprints[h0] ^ fingerprints[h1] ^ fingerprints[h2]; |
| 162 | + return (f & 0xff) == 0; |
| 163 | + } |
| 164 | + |
| 165 | + private int getHash(long key, long seed, int index) { |
| 166 | + long hash = Hash.hash64(key, seed); |
| 167 | + int r0 = (int) ((0xBF58476D1CE4E5B9L * hash) >> 32); |
| 168 | + int seg = Hash.reduce(r0, FUSE_SEGMENT_COUNT); |
| 169 | + int r = (int) Long.rotateLeft(hash, 21 * index); |
| 170 | + return (seg + index) * segmentLength + Hash.reduce(r, segmentLength); |
| 171 | + } |
| 172 | + |
| 173 | + private int fingerprint(long hash) { |
| 174 | + return (int) (hash & ((1 << BITS_PER_FINGERPRINT) - 1)); |
| 175 | + } |
| 176 | + |
| 177 | +} |
0 commit comments