Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@
*/
public class TsidBuilder {

private static final int MAX_TSID_VALUE_FIELDS = 16;
/**
* The maximum number of fields to use for the value similarity part of the TSID.
* This is a trade-off between clustering similar time series together and the size of the TSID.
* More fields improve clustering but also increase the size of the TSID.
*/
private static final int MAX_TSID_VALUE_SIMILARITY_FIELDS = 4;
private final BufferedMurmur3Hasher murmur3Hasher = new BufferedMurmur3Hasher(0L);

private final List<Dimension> dimensions = new ArrayList<>();
Expand Down Expand Up @@ -209,11 +214,11 @@ public MurmurHash3.Hash128 hash() {
* The TSID is a hash that includes:
* <ul>
* <li>
* A hash of the dimension field names (4 bytes).
* A hash of the dimension field names (1 byte).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if we skip this? is it worth experimenting with? It'd be nice to have a tsid that's always the same size e.g. 20b.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Storage is a little worse, but not much. We may be overfitting to a specific dataset, though. We could also have a fixed 4 bytes instead of a variable 1-4 bytes for the values similarity hash.

* This is to cluster time series that are using the same dimensions together, which makes the encodings more effective.
* </li>
* <li>
* A hash of the dimension field values (1 byte each, up to a maximum of 16 fields).
* A hash of the dimension field values (1 byte each, up to a maximum of 4 fields).
* This is to cluster time series with similar values together, also helping with making encodings more effective.
* </li>
* <li>
Expand All @@ -227,24 +232,24 @@ public MurmurHash3.Hash128 hash() {
*/
public BytesRef buildTsid() {
throwIfEmpty();
int numberOfValues = Math.min(MAX_TSID_VALUE_FIELDS, dimensions.size());
byte[] hash = new byte[4 + numberOfValues + 16];
int numberOfValues = Math.min(MAX_TSID_VALUE_SIMILARITY_FIELDS, dimensions.size());
byte[] hash = new byte[1 + numberOfValues + 16];
int index = 0;

Collections.sort(dimensions);

MurmurHash3.Hash128 hashBuffer = new MurmurHash3.Hash128();
murmur3Hasher.reset();
// similarity hash for dimension names
for (int i = 0; i < dimensions.size(); i++) {
Dimension dim = dimensions.get(i);
murmur3Hasher.addLong(dim.pathHash.h1 ^ dim.pathHash.h2);
}
ByteUtils.writeIntLE((int) murmur3Hasher.digestHash(hashBuffer).h1, hash, index);
index += 4;
hash[index++] = (byte) murmur3Hasher.digestHash(hashBuffer).h1;

// similarity hash for values
// similarity hash for dimension values
String previousPath = null;
for (int i = 0; i < numberOfValues; i++) {
for (int i = 0; index < numberOfValues + 1 && i < dimensions.size(); i++) {
Dimension dim = dimensions.get(i);
String path = dim.path();
if (path.equals(previousPath)) {
Expand All @@ -259,6 +264,7 @@ public BytesRef buildTsid() {
}

murmur3Hasher.reset();
// full hash for all dimension names and values for uniqueness
for (int i = 0; i < dimensions.size(); i++) {
Dimension dim = dimensions.get(i);
murmur3Hasher.addLongs(dim.pathHash.h1, dim.pathHash.h2, dim.valueHash.h1, dim.valueHash.h2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,31 @@ public void testAddDimensions() {

// if these change, we'll need a new index version
// because it means existing time series will get a new _tsid and will be routed to a different shard
assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256")); // used to make shard routing decisions
assertThat(builder.hash().toString(), equalTo("0xd4de1356065d297a2be489781e15d256"));
BytesRef bytesRef = builder.buildTsid();
assertThat(bytesRef, notNullValue());
// 4 bytes for path hash + 1 byte per value (up to 16, only first value for arrays) + 16 bytes for hash
assertThat(bytesRef.length, equalTo(26));
// 1 byte for path hash + 1 byte per value (up to 4, only first value for arrays) + 16 bytes for hash
assertThat(bytesRef.length, equalTo(21));
assertThat(
HexFormat.of().formatHex(bytesRef.bytes, bytesRef.offset, bytesRef.length),
equalTo("bf438ddaa0a8d663fdbb56d2151e7889e42b7a295d065613ded4") // _tsid in hex format
equalTo("bfa0a8d66356d2151e7889e42b7a295d065613ded4") // _tsid in hex format
);
}

public void testArray() {
    // One single-valued dimension plus one multi-valued (array) dimension with a random, large number of entries.
    TsidBuilder tsidBuilder = TsidBuilder.newBuilder().addStringDimension("test_non_array", "value");

    final int valueCount = randomIntBetween(32, 64);
    int added = 0;
    while (added < valueCount) {
        tsidBuilder.addStringDimension("_test_large_array", "value_" + added);
        added++;
    }

    BytesRef tsid = tsidBuilder.buildTsid();
    assertThat(tsid, notNullValue());
    // Expected length: 1 byte for the path hash
    // + 2 bytes for the value hash (1 for the first array value and 1 for the non-array value)
    // + 16 bytes for the full hash.
    assertThat(tsid.length, equalTo(19));
}

public void testOrderingOfDifferentFieldsDoesNotMatter() {
assertEqualBuilders(
TsidBuilder.newBuilder().addStringDimension("foo", "bar").addStringDimension("baz", "qux"),
Expand Down Expand Up @@ -114,19 +128,20 @@ public void testExceptionWhenNoDimensions() {
public void testTsidMinSize() {
BytesRef tsid = TsidBuilder.newBuilder().addIntDimension("test_int", 42).buildTsid();

// The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash
// Since we only added one dimension, we expect: 4 + 1 + 16 = 21 bytes
assertEquals(21, tsid.length);
// The TSID format should be: 1 byte for path hash + 1 byte per value (up to 4) + 16 bytes for hash
// Since we only added one dimension, we expect: 1 + 1 + 16 = 18 bytes
assertEquals(18, tsid.length);
}

public void testTsidMaxSize() {
TsidBuilder tsidBuilder = TsidBuilder.newBuilder();
for (int i = 0; i < 32; i++) {
int dimensions = randomIntBetween(4, 64);
for (int i = 0; i < dimensions; i++) {
tsidBuilder.addStringDimension("dimension_" + i, "value_" + i);
}

// The TSID format should be: 4 bytes for path hash + 1 byte per value (up to 16) + 16 bytes for hash
// Since we added 32 dimensions, we expect: 4 + 16 + 16 = 36 bytes
assertEquals(36, tsidBuilder.buildTsid().length);
// The TSID format should be: 1 byte for path hash + 1 byte per value (up to 4) + 16 bytes for hash
// Since we added at least 4 dimensions, we expect: 1 + 4 + 16 = 21 bytes
assertEquals(21, tsidBuilder.buildTsid().length);
}
}