Skip to content

Commit a189e09

Browse files
add int filter
1 parent e8312dc commit a189e09

File tree

5 files changed

+125
-4
lines changed

5 files changed

+125
-4
lines changed

vectordb_bench/backend/cases.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from vectordb_bench import config
66
from vectordb_bench.backend.clients.api import MetricType
7-
from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NonFilter, non_filter
7+
from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NewIntFilter, NonFilter, non_filter
88
from vectordb_bench.base import BaseModel
99
from vectordb_bench.frontend.components.custom.getCustomConfig import CustomDatasetConfig
1010

@@ -52,6 +52,8 @@ class CaseType(Enum):
5252

5353
StreamingPerformanceCase = 200
5454

55+
NewIntFilterPerformanceCase = 250
56+
5557
LabelFilterPerformanceCase = 300
5658

5759
def case_cls(self, custom_configs: dict | None = None) -> type["Case"]:
@@ -130,6 +132,7 @@ class PerformanceCase(Case):
130132
filter_rate: float | None = None
131133
load_timeout: float | int = config.LOAD_TIMEOUT_DEFAULT
132134
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_DEFAULT
135+
int_value: float | None = None
133136

134137

135138
class CapacityDim960(CapacityCase):
@@ -471,6 +474,46 @@ def __init__(
471474
)
472475

473476

477+
class NewIntFilterPerformanceCase(PerformanceCase):
    """Performance case that searches a dataset under an integer-field filter.

    The case is parameterised by a dataset/size selector and a filter rate.
    The concrete filter is built lazily by the ``filters`` property from the
    dataset's id field and size.
    """

    case_id: CaseType = CaseType.NewIntFilterPerformanceCase
    dataset_with_size_type: DatasetWithSizeType
    filter_rate: float

    def __init__(
        self,
        dataset_with_size_type: DatasetWithSizeType | str,
        filter_rate: float,
        int_value: float | None = 0,
        **kwargs,
    ):
        """Build the case from a dataset selector and a filter rate.

        Args:
            dataset_with_size_type: A ``DatasetWithSizeType`` member, or a raw
                value that ``DatasetWithSizeType(...)`` can convert to one.
            filter_rate: Filter rate; it is multiplied by 100 to render the
                percentage shown in the case name and description.
            int_value: Forwarded to ``IntFilter`` and to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        size_type = (
            dataset_with_size_type
            if isinstance(dataset_with_size_type, DatasetWithSizeType)
            else DatasetWithSizeType(dataset_with_size_type)
        )
        # Shared prefix of the display name and the description.
        label = f"Int-Filter-{filter_rate*100:.1f}%"

        manager = size_type.get_manager()
        load_limit = size_type.get_load_timeout()
        optimize_limit = size_type.get_optimize_timeout()

        # The rate is read back from an IntFilter instance rather than used
        # directly. NOTE(review): presumably this is for the model's own
        # validation/coercion of filter_rate - confirm.
        effective_rate = IntFilter(filter_rate=filter_rate, int_value=int_value).filter_rate

        super().__init__(
            name=f"{label} - {size_type.value}",
            description=f"{label} Performance Test ({size_type.value})",
            dataset=manager,
            load_timeout=load_limit,
            optimize_timeout=optimize_limit,
            filter_rate=effective_rate,
            int_value=int_value,
            dataset_with_size_type=size_type,
            **kwargs,
        )

    @property
    def filters(self) -> Filter:
        """Return a ``NewIntFilter`` on the dataset's id field.

        The threshold is ``int(dataset size * filter_rate)``.
        """
        data = self.dataset.data
        threshold = int(data.size * self.filter_rate)
        return NewIntFilter(
            filter_rate=self.filter_rate,
            int_field=data.train_id_field,
            int_value=threshold,
        )
515+
516+
474517
class LabelFilterPerformanceCase(PerformanceCase):
475518
case_id: CaseType = CaseType.LabelFilterPerformanceCase
476519
dataset_with_size_type: DatasetWithSizeType
@@ -529,5 +572,6 @@ def filters(self) -> Filter:
529572
CaseType.Performance1536D50K: Performance1536D50K,
530573
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
531574
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
575+
CaseType.NewIntFilterPerformanceCase: NewIntFilterPerformanceCase,
532576
CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
533577
}

vectordb_bench/backend/dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class BaseDataset(BaseModel):
4848
scalar_labels_file_separated: bool = True
4949
scalar_labels_file: str = "scalar_labels.parquet"
5050
scalar_label_percentages: list[float] = []
51+
scalar_int_rates: list[float] = []
5152
train_id_field: str = "id"
5253
train_vector_field: str = "emb"
5354
test_file: str = "test.parquet"
@@ -164,6 +165,7 @@ class Cohere(BaseDataset):
164165
}
165166
with_scalar_labels: bool = True
166167
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
168+
scalar_int_rates: list[float] = [0.01, 0.99]
167169

168170

169171
class Bioasq(BaseDataset):
@@ -178,6 +180,7 @@ class Bioasq(BaseDataset):
178180
}
179181
with_scalar_labels: bool = True
180182
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
183+
scalar_int_rates: list[float] = [0.01, 0.99]
181184

182185

183186
class Glove(BaseDataset):
@@ -217,6 +220,7 @@ class OpenAI(BaseDataset):
217220
}
218221
with_scalar_labels: bool = True
219222
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
223+
scalar_int_rates: list[float] = [0.01, 0.99]
220224

221225

222226
class DatasetManager(BaseModel):

vectordb_bench/backend/filter.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,23 @@ def groundtruth_file(self) -> str:
5151
raise RuntimeError(msg)
5252

5353

54+
class NewIntFilter(Filter):
    """Integer filter whose ground-truth file name is derived from the filter rate.

    The filter type defaults to ``FilterOp.NumGE`` and the filtered field
    defaults to ``"id"``.
    """

    type: FilterOp = FilterOp.NumGE
    int_field: str = "id"
    int_value: int

    @property
    def int_rate(self) -> str:
        """Return the rate tag used in file names, e.g. ``int_1p`` or ``int_0.5p``.

        Rates of at least 1% are rendered as a whole number; smaller rates
        keep one decimal place.
        """
        percent = self.filter_rate * 100
        # NOTE(review): int() truncates, so a product that lands just below a
        # whole number because of float rounding is rendered as the lower
        # number - confirm this matches the ground-truth file names.
        shown = f"{int(percent)}" if percent >= 1 else f"{percent:.1f}"
        return f"int_{shown}p"

    @property
    def groundtruth_file(self) -> str:
        """Return the ground-truth parquet file name for this filter rate."""
        return f"neighbors_{self.int_rate}.parquet"
69+
70+
5471
class LabelFilter(Filter):
5572
"""
5673
filter expr: label_field == label_value, like `color == "red"`

vectordb_bench/frontend/config/dbCaseConfigs.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,17 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
219219
]
220220

221221

222+
def generate_int_filter_cases(dataset_with_size_type: DatasetWithSizeType) -> list[CaseConfig]:
    """Build one int-filter case config per rate listed for the dataset.

    The rates are read from ``scalar_int_rates`` on the data object of the
    dataset manager returned by ``dataset_with_size_type.get_manager()``.
    """
    configs = []
    for rate in dataset_with_size_type.get_manager().data.scalar_int_rates:
        case_params = {
            "dataset_with_size_type": dataset_with_size_type,
            "filter_rate": rate,
        }
        configs.append(
            CaseConfig(
                case_id=CaseType.NewIntFilterPerformanceCase,
                custom_case=case_params,
            )
        )
    return configs
231+
232+
222233
UI_CASE_CLUSTERS: list[UICaseItemCluster] = [
223234
UICaseItemCluster(
224235
label="Search Performance Test",
@@ -249,6 +260,27 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
249260
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D500K99P)),
250261
],
251262
),
263+
UICaseItemCluster(
264+
label="New-Int-Filter Search Performance Test",
265+
uiCaseItems=[
266+
UICaseItem(
267+
label=f"Int-Filter Search Performance Test - {dataset_with_size_type.value}",
268+
description=(
269+
f"[Batch Cases]These cases test the search performance of a vector database "
270+
f"with dataset {dataset_with_size_type.value}"
271+
f"under filtering rates of {dataset_with_size_type.get_manager().data.scalar_int_rates}, at varying parallel levels."
272+
f"Results will show index building time, recall, and maximum QPS."
273+
),
274+
cases=generate_int_filter_cases(dataset_with_size_type),
275+
)
276+
for dataset_with_size_type in [
277+
DatasetWithSizeType.CohereMedium,
278+
DatasetWithSizeType.CohereLarge,
279+
DatasetWithSizeType.OpenAIMedium,
280+
DatasetWithSizeType.OpenAILarge,
281+
]
282+
],
283+
),
252284
UICaseItemCluster(
253285
label="Label-Filter Search Performance Test",
254286
uiCaseItems=[

vectordb_bench/models.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
import ujson
88

9+
from vectordb_bench.backend.cases import type2case
10+
from vectordb_bench.backend.dataset import DatasetWithSizeMap
11+
912
from . import config
1013
from .backend.cases import Case, CaseType
1114
from .backend.clients import (
@@ -270,6 +273,26 @@ def write_db_file(self, result_dir: pathlib.Path, partial: Self, db: str):
270273
b = partial.json(exclude={"db_config": {"password", "api_key"}})
271274
f.write(b)
272275

276+
@staticmethod
def get_case_config(case_config: dict) -> dict:
    """Rewrite a stored case config that uses a legacy case id.

    When ``case_config["case_id"]`` is one of the legacy ids listed below,
    the matching case class is instantiated to recover its ``filter_rate``
    and dataset, and the config is rewritten in place to a
    ``CaseType.NewIntFilterPerformanceCase`` config carrying those values in
    ``custom_case``. Any other config is returned untouched.

    Declared as a static method because it takes no ``self``/``cls`` and is
    invoked through the class (``cls.get_case_config(...)``).

    Args:
        case_config: Raw case-config mapping with a ``case_id`` entry and an
            optional ``custom_case`` entry.

    Returns:
        The same mapping object, possibly modified in place.

    Raises:
        ValueError: If a legacy id has no corresponding ``CaseType`` member.
    """
    # NOTE(review): presumably the ids of the older fixed int-filter
    # performance cases - confirm against CaseType.
    legacy_case_ids = {6, 7, 8, 9, 12, 13, 14, 15}

    # Normalize once so the membership test and the enum lookup agree even
    # when the stored id is not already an int.
    case_id = int(case_config["case_id"])
    if case_id not in legacy_case_ids:
        return case_config

    # Enum lookup by value replaces the manual scan of CaseType.__members__.
    case_instance = type2case[CaseType(case_id)]()

    custom_case = case_config.get("custom_case")
    if custom_case is None:
        custom_case = {}
    custom_case["filter_rate"] = case_instance.filter_rate

    # Find the DatasetWithSizeMap key whose value equals the case's dataset.
    for dataset, size_type in DatasetWithSizeMap.items():
        if case_instance.dataset == size_type:
            custom_case["dataset_with_size_type"] = dataset
            break

    case_config["case_id"] = CaseType.NewIntFilterPerformanceCase
    case_config["custom_case"] = custom_case
    return case_config
295+
273296
@classmethod
274297
def read_file(cls, full_path: pathlib.Path, trans_unit: bool = False) -> Self:
275298
if not full_path.exists():
@@ -280,10 +303,10 @@ def read_file(cls, full_path: pathlib.Path, trans_unit: bool = False) -> Self:
280303
test_result = ujson.loads(f.read())
281304
if "task_label" not in test_result:
282305
test_result["task_label"] = test_result["run_id"]
283-
284306
for case_result in test_result["results"]:
285-
task_config = case_result["task_config"]
286-
db = DB(task_config["db"])
307+
task_config = case_result.get("task_config")
308+
case_config = task_config.get("case_config")
309+
db = DB(task_config.get("db"))
287310

288311
task_config["db_config"] = db.config_cls(**task_config["db_config"])
289312

@@ -296,6 +319,7 @@ def read_file(cls, full_path: pathlib.Path, trans_unit: bool = False) -> Self:
296319
log.exception(f"Couldn't get class for index '{index_value}' ({full_path})")
297320
task_config["db_case_config"] = EmptyDBCaseConfig(**raw_case_cfg)
298321

322+
task_config["case_config"] = cls.get_case_config(case_config=case_config)
299323
case_result["task_config"] = task_config
300324

301325
if trans_unit:

0 commit comments

Comments
 (0)