Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions vectordb_bench/backend/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class CaseType(Enum):
PerformanceCustomDataset = 101

StreamingPerformanceCase = 200
StreamingCustomDataset = 201

LabelFilterPerformanceCase = 300

Expand Down Expand Up @@ -474,6 +475,85 @@ def __init__(
)


class StreamingCustomDataset(Case):
    """Streaming-performance case driven by a user-supplied (custom) dataset.

    Mirrors StreamingPerformanceCase, but builds its DatasetManager from a
    CustomDatasetConfig dict (as stored in custom_case.json) instead of a
    built-in dataset.
    """

    case_id: CaseType = CaseType.StreamingCustomDataset
    label: CaseLabel = CaseLabel.Streaming
    name: str = "Streaming Performance With Custom Dataset"
    description: str = ""
    dataset: DatasetManager
    # Target insertion speed, rows per second; must be a multiple of NUM_PER_BATCH.
    insert_rate: int
    # Fractions of the dataset (0..1) at which search phases are triggered.
    search_stages: list[float]
    # Concurrency levels used during each search stage.
    concurrencies: list[int]
    optimize_after_write: bool = True
    # Seconds to keep searching after all writes have finished.
    read_dur_after_write: int = 30

    def __init__(
        self,
        description: str,
        dataset_config: dict,
        insert_rate: int = 500,
        search_stages: list[float] | str = (0.5, 0.8),
        concurrencies: list[int] | str = (5, 10),
        optimize_after_write: bool = True,
        read_dur_after_write: int = 30,
        **kwargs,
    ):
        """Build the case from a raw dataset-config dict.

        Args:
            description: optional human-readable description; a default is
                generated from the dataset name when empty.
            dataset_config: dict matching CustomDatasetConfig's fields.
            insert_rate: rows/s; rounded down to a multiple of NUM_PER_BATCH
                (floored at one batch) with a warning when not already aligned.
            search_stages: list of dataset fractions, or a JSON string thereof
                (string form comes from CLI/UI inputs).
            concurrencies: list of ints, or a JSON string thereof.
        """
        num_per_batch = config.NUM_PER_BATCH
        # Writes happen in fixed-size batches, so the rate must be batch-aligned.
        if insert_rate % num_per_batch != 0:
            _insert_rate = max(
                num_per_batch,
                insert_rate // num_per_batch * num_per_batch,
            )
            log.warning(
                f"[streaming_case init] insert_rate(={insert_rate}) should be "
                f"divisible by NUM_PER_BATCH(={num_per_batch}), reset to {_insert_rate}",
            )
            insert_rate = _insert_rate

        dataset_config = CustomDatasetConfig(**dataset_config)
        dataset = CustomDataset(
            name=dataset_config.name,
            size=dataset_config.size,
            dim=dataset_config.dim,
            metric_type=metric_type_map(dataset_config.metric_type),
            use_shuffled=dataset_config.use_shuffled,
            with_gt=dataset_config.with_gt,
            dir=dataset_config.dir,
            file_num=dataset_config.file_count,
            train_file=dataset_config.train_name,
            test_file=f"{dataset_config.test_name}.parquet",
            train_id_field=dataset_config.train_id_name,
            train_vector_field=dataset_config.train_col_name,
            test_vector_field=dataset_config.test_col_name,
            gt_neighbors_field=dataset_config.gt_col_name,
            scalar_labels_file=f"{dataset_config.scalar_labels_name}.parquet",
        )
        name = f"Streaming-Perf - Custom - {dataset_config.name}, {insert_rate} rows/s"
        description = (
            description
            if description
            else f"This case tests the search performance of vector database while maintaining "
            f"a fixed insertion speed. (dataset: Custom - {dataset_config.name})"
        )

        # UI/CLI may deliver these as JSON-encoded strings; normalize to lists.
        if isinstance(search_stages, str):
            search_stages = json.loads(search_stages)
        if isinstance(concurrencies, str):
            concurrencies = json.loads(concurrencies)

        super().__init__(
            name=name,
            description=description,
            dataset=DatasetManager(data=dataset),
            insert_rate=insert_rate,
            search_stages=search_stages,
            concurrencies=concurrencies,
            optimize_after_write=optimize_after_write,
            read_dur_after_write=read_dur_after_write,
            **kwargs,
        )


class NewIntFilterPerformanceCase(PerformanceCase):
case_id: CaseType = CaseType.NewIntFilterPerformanceCase
dataset_with_size_type: DatasetWithSizeType
Expand Down Expand Up @@ -572,6 +652,7 @@ def filters(self) -> Filter:
CaseType.Performance1536D50K: Performance1536D50K,
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
CaseType.StreamingCustomDataset: StreamingCustomDataset,
CaseType.NewIntFilterPerformanceCase: NewIntFilterPerformanceCase,
CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
}
28 changes: 28 additions & 0 deletions vectordb_bench/custom/custom_case.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,33 @@
"use_shuffled": false,
"with_gt": true
}
},
{
"case_type": "streaming",
"description": "",
"dataset_config": {
"name": "My Streaming Dataset",
"dir": "/my_dataset_path",
"size": 100000,
"dim": 768,
"metric_type": "L2",
"file_count": 3,
"use_shuffled": false,
"with_gt": true,
"train_name": "shuffle_train",
"test_name": "test",
"gt_name": "neighbors",
"train_id_name": "id",
"train_col_name": "emb",
"test_col_name": "emb",
"gt_col_name": "neighbors_id",
"scalar_labels_name": "scalar_labels",
"label_percentages": [],
"with_label_percentages": [
0.001,
0.02,
0.5
]
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from vectordb_bench.frontend.components.custom.getCustomConfig import CustomStreamingCaseConfig


def displayCustomStreamingCase(streamingCase: CustomStreamingCaseConfig, st, key):

columns = st.columns([1, 2])
streamingCase.dataset_config.name = columns[0].text_input(
"Name", key=f"{key}_name", value=streamingCase.dataset_config.name
)
streamingCase.dataset_config.dir = columns[1].text_input(
"Folder Path", key=f"{key}_dir", value=streamingCase.dataset_config.dir
)

columns = st.columns(2)
streamingCase.dataset_config.dim = columns[0].number_input(
"dim", key=f"{key}_dim", value=streamingCase.dataset_config.dim
)
streamingCase.dataset_config.size = columns[1].number_input(
"size", key=f"{key}_size", value=streamingCase.dataset_config.size
)

columns = st.columns(3)
streamingCase.dataset_config.train_name = columns[0].text_input(
"train file name",
key=f"{key}_train_name",
value=streamingCase.dataset_config.train_name,
)
streamingCase.dataset_config.test_name = columns[1].text_input(
"test file name", key=f"{key}_test_name", value=streamingCase.dataset_config.test_name
)
streamingCase.dataset_config.gt_name = columns[2].text_input(
"ground truth file name", key=f"{key}_gt_name", value=streamingCase.dataset_config.gt_name
)

columns = st.columns([1, 1, 2, 2])
streamingCase.dataset_config.train_id_name = columns[0].text_input(
"train id name", key=f"{key}_train_id_name", value=streamingCase.dataset_config.train_id_name
)
streamingCase.dataset_config.train_col_name = columns[1].text_input(
"train emb name", key=f"{key}_train_col_name", value=streamingCase.dataset_config.train_col_name
)
streamingCase.dataset_config.test_col_name = columns[2].text_input(
"test emb name", key=f"{key}_test_col_name", value=streamingCase.dataset_config.test_col_name
)
streamingCase.dataset_config.gt_col_name = columns[3].text_input(
"ground truth emb name", key=f"{key}_gt_col_name", value=streamingCase.dataset_config.gt_col_name
)

streamingCase.description = st.text_area("description", key=f"{key}_description", value=streamingCase.description)
33 changes: 32 additions & 1 deletion vectordb_bench/frontend/components/custom/getCustomConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,47 @@ class CustomCaseConfig(BaseModel):
dataset_config: CustomDatasetConfig = CustomDatasetConfig()


class CustomStreamingCaseConfig(BaseModel):
    """One streaming entry of custom_case.json (UI-editable custom case)."""

    # Discriminator separating streaming entries from search-performance
    # entries stored in the same JSON file.
    case_type: str = "streaming"
    description: str = ""
    dataset_config: CustomDatasetConfig = CustomDatasetConfig()


def get_custom_configs():
    """Load all non-streaming (search-performance) custom case configs."""
    with open(config.CUSTOM_CONFIG_DIR, "r") as f:
        raw_entries = json.load(f)
    configs = []
    for entry in raw_entries:
        # Streaming entries live in the same file; skip them here.
        if entry.get("case_type") == "streaming":
            continue
        configs.append(CustomCaseConfig(**entry))
    return configs


def get_custom_streaming_configs():
    """Load only the streaming custom case configs from the shared JSON file."""
    with open(config.CUSTOM_CONFIG_DIR, "r") as f:
        raw_entries = json.load(f)
    streaming_entries = (e for e in raw_entries if e.get("case_type") == "streaming")
    return [CustomStreamingCaseConfig(**entry) for entry in streaming_entries]


def save_custom_configs(custom_configs: list[CustomCaseConfig]):
    """Overwrite the custom-case JSON file with the given case configs.

    NOTE(review): the annotation previously said ``list[CustomDatasetConfig]``,
    but the file written here is read back as whole case configs (see
    get_custom_configs), so full case-config objects are expected.
    """
    with open(config.CUSTOM_CONFIG_DIR, "w") as f:
        json.dump([custom_config.dict() for custom_config in custom_configs], f, indent=4)


def save_all_custom_configs(
    performance_configs: list[CustomCaseConfig],
    streaming_configs: list[CustomStreamingCaseConfig],
):
    """Save both performance and streaming configs to the same JSON file.

    Entries of both kinds share one file; their ``case_type`` field lets the
    loaders tell them apart.
    """
    # Loop variable renamed from ``config``: it shadowed the module-level
    # ``config`` object read just below (harmless only because comprehension
    # scopes don't leak, but needlessly confusing).
    all_configs = [c.dict() for c in performance_configs] + [c.dict() for c in streaming_configs]
    with open(config.CUSTOM_CONFIG_DIR, "w") as f:
        json.dump(all_configs, f, indent=4)


def generate_custom_case():
    """Return a fresh, all-default CustomCaseConfig for a new UI form entry."""
    return CustomCaseConfig()


def generate_custom_streaming_case():
    """Return a fresh, all-default CustomStreamingCaseConfig for a new UI form entry."""
    return CustomStreamingCaseConfig()
3 changes: 2 additions & 1 deletion vectordb_bench/frontend/components/run_test/caseSelector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
UICaseItemCluster,
get_case_config_inputs,
get_custom_case_cluter,
get_custom_streaming_case_cluster,
)
from vectordb_bench.frontend.config.styles import (
CASE_CONFIG_SETTING_COLUMNS,
Expand All @@ -32,7 +33,7 @@ def caseSelector(st, activedDbList: list[DB]):
activedCaseList: list[CaseConfig] = []
dbToCaseClusterConfigs = defaultdict(lambda: defaultdict(dict))
dbToCaseConfigs = defaultdict(lambda: defaultdict(dict))
caseClusters = UI_CASE_CLUSTERS + [get_custom_case_cluter()]
caseClusters = UI_CASE_CLUSTERS + [get_custom_case_cluter(), get_custom_streaming_case_cluster()]
for caseCluster in caseClusters:
activedCaseList += caseClusterExpander(st, caseCluster, dbToCaseClusterConfigs, activedDbList)
for db in dbToCaseClusterConfigs:
Expand Down
35 changes: 35 additions & 0 deletions vectordb_bench/frontend/config/dbCaseConfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,34 @@ def get_custom_case_cluter() -> UICaseItemCluster:
return UICaseItemCluster(label="Custom Search Performance Test", uiCaseItems=get_custom_case_items())


def get_custom_streaming_case_items() -> list[UICaseItem]:
    """Build one UI case item per streaming entry stored in custom_case.json."""
    # Function-scope import — presumably to avoid a circular import between
    # the frontend config and component modules.
    from vectordb_bench.frontend.components.custom.getCustomConfig import get_custom_streaming_configs

    items: list[UICaseItem] = []
    for cfg in get_custom_streaming_configs():
        dataset_name = cfg.dataset_config.name
        case = CaseConfig(
            case_id=CaseType.StreamingCustomDataset,
            custom_case={
                "description": cfg.description,
                "dataset_config": cfg.dataset_config.dict(),
            },
        )
        items.append(
            UICaseItem(
                label=f"{dataset_name} - Streaming",
                description=f"Streaming test with custom dataset: {dataset_name}",
                cases=[case],
                caseLabel=CaseLabel.Streaming,
                extra_custom_case_config_inputs=custom_streaming_config_with_custom_dataset,
            )
        )
    return items


def get_custom_streaming_case_cluster() -> UICaseItemCluster:
    """Group all custom streaming case items under one expandable UI cluster."""
    return UICaseItemCluster(label="Custom Streaming Test", uiCaseItems=get_custom_streaming_case_items())


def generate_custom_streaming_case() -> CaseConfig:
return CaseConfig(
case_id=CaseType.StreamingPerformanceCase,
Expand Down Expand Up @@ -207,6 +235,13 @@ def generate_custom_streaming_case() -> CaseConfig:
),
]

# Config inputs for custom streaming tests (dataset comes from custom_case.json).
# Derived from the generic streaming config with the dataset selector dropped:
# the dataset is fixed by the JSON entry, so dataset_with_size_type is irrelevant.
# Loop variable renamed from ``config`` to avoid shadowing the conventional
# module-level ``config`` import used throughout this codebase.
custom_streaming_config_with_custom_dataset: list[ConfigInput] = [
    cfg_input
    for cfg_input in custom_streaming_config
    if cfg_input.label != CaseConfigParamType.dataset_with_size_type
]


def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) -> list[CaseConfig]:
label_percentages = dataset_with_size_type.get_manager().data.scalar_label_percentages
Expand Down
Loading