diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d1a8192b1 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,13 @@ +[submodule "big-ann-benchmarks"] + path = big-ann-benchmarks + url = https://github.com/intellistream/big-ann-benchmarks.git +[submodule "GTI"] + path = GTI + url = https://github.com/MingqiWang-coder/GTI-Graph-based-Tree-Index.git +[submodule "DiskANN"] + path = DiskANN + url = https://github.com/MingqiWang-coder/DiskANN.git + branch = diskv2 +[submodule "IP-DiskANN"] + path = IP-DiskANN + url = https://github.com/intellistream/IP-DiskANN.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 98840c33d..f05d6a874 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,24 @@ #set(CMAKE_C_COMPILER "/usr/bin/gcc-11") #set(CMAKE_CXX_COMPILER "/usr/bin/g++-11") #set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") -cmake_minimum_required(VERSION 3.27) +#cmake_minimum_required(VERSION 3.27) +cmake_minimum_required(VERSION 3.14) project(CANDYBENCH CXX) -option(ENABLE_CUDA "Enable cuda" ON) -message(STATUS "1.0 CUDA enabled: ${ENABLE_CUDA}") -include (cmake/FindCuda.cmake) +#option(ENABLE_CUDA "Enable cuda" ON) +#message(STATUS "1.0 CUDA enabled: ${ENABLE_CUDA}") +# include (cmake/FindCuda.cmake) include (cmake/FindTorch.cmake) -#set(CMAKE_CUDA_ARCHITECTURES "70;75;80") -set(CMAKE_CUDA_ARCHITECTURES ALL) find_package(Torch REQUIRED) +include_directories(${Torch_INCLUDE_DIRS}) +include_directories("/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include") +include_directories("/usr/local/lib/python3.10/dist-packages/torch/include") + find_package(Python3 REQUIRED COMPONENTS Development) include_directories(${Python3_INCLUDE_DIRS}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) - - include_directories (${gflags_INCLUDE_DIR}) set(CMAKE_VERBOSE_MAKEFILE OFF) set(CMAKE_RULE_MESSAGES OFF) @@ -35,16 +36,16 @@ include(cmake/default.cmake) #test avx2 # Option to enable/disable CUDA -message(STATUS "2.0 CUDA enabled: 
${ENABLE_CUDA}") -if (ENABLE_CUDA) - enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 20) - set(CMAKE_CUDA_ARCHITECTURES OFF) - add_definitions(-DENABLE_CUDA=1) - message(STATUS "CUDA is enabled") -else() - message(STATUS "CUDA is not enabled") -endif () +#message(STATUS "2.0 CUDA enabled: ${ENABLE_CUDA}") +#if (ENABLE_CUDA) +# enable_language(CUDA) +# set(CMAKE_CUDA_STANDARD 20) +# set(CMAKE_CUDA_ARCHITECTURES OFF) +# add_definitions(-DENABLE_CUDA=1) +# message(STATUS "CUDA is enabled") +#else() +# message(STATUS "CUDA is not enabled") +#endif () add_subdirectory(thirdparty/faiss) @@ -52,15 +53,12 @@ add_subdirectory(thirdparty/faiss) #target_compile_options(faiss PRIVATE "-fno-openmp") set(LIBRARIES ${LIBRARIES} faiss) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") -set(LIBRARIES ${LIBRARIES} ${TORCH_LIBRARIES}) - -# Set Optimization Flags -set(CMAKE_CXX_FLAGS "-std=c++20 -Wall -Werror=return-type -Wno-interference-size") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DNO_RACE_CHECK -DCANDY_DEBUG_MODE=1") set(CMAKE_CXX_FLAGS_RELEASE "-Wno-ignored-qualifiers -Wno-sign-compare -O3") -set(PROJECT_BINARY_DIR_RAW ${PROJECT_BINARY_DIR}) +set(PROJECT_BINARY_DIR_RAW ${PROJECT_BINARY_DIR}) # Valid values are "generic", "avx2", "avx512". 
detect_avx512_support(AVX512_AVAILABLE) @@ -311,12 +309,6 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) message(STATUS "CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message(STATUS "CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}") message(STATUS "CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}") -#pytorch -#set(Torch_DIR "/home/tony/.local/lib/python3.10/site-packages/torch/share/cmake" ) -# Log4cc -#find_package(Log4cxx REQUIRED) -#include_directories(${Log4cxx_INCLUDE_DIR}) -#set(LIBRARIES ${LIBRARIES} ${Log4cxx_LIBRARY}) option(ENABLE_UNIT_TESTS "Enable unit tests" OFF) @@ -331,10 +323,6 @@ foreach (dir ${dirs}) endforeach () -#add_subdirectory(pytorchNN) -# Add Source Code - - add_subdirectory(src) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/) @@ -342,43 +330,42 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/) # Add Library get_sources(CANDY_SOURCE_FILES) get_headers(CANDY_HEADER_FILES) - -if (ENABLE_CUDA) - set_source_files_properties(src/CANDY/IndexTable.cpp - PROPERTIES - LANGUAGE CUDA - ) -endif () - -add_library(CANDYBENCH SHARED ${CANDY_SOURCE_FILES} ${CANDY_HEADER_FILES} ${CMAKE_CURRENT_BINARY_DIR} - src/CANDY/IndexTable.cpp) +add_library(CANDYBENCH SHARED ${CANDY_SOURCE_FILES} ${CANDY_HEADER_FILES} ${CMAKE_CURRENT_BINARY_DIR}) +#if (ENABLE_CUDA) +# set_source_files_properties(src/CANDY/IndexTable.cpp +# PROPERTIES +# LANGUAGE CUDA +# ) +#endif () +# +#add_library(CANDYBENCH SHARED ${CANDY_SOURCE_FILES} ${CANDY_HEADER_FILES} ${CMAKE_CURRENT_BINARY_DIR} +# src/CANDY/IndexTable.cpp) set_property(TARGET CANDYBENCH PROPERTY CXX_STANDARD 20) target_include_directories(CANDYBENCH PUBLIC "include") -if (ENABLE_CUDA) - set(LIBRARIES ${LIBRARIES} cublas cudart) - set_target_properties(CANDYBENCH PROPERTIES - CUDA_STANDARD 20 - CXX_STANDARD 20 - ) -else () - set_target_properties(CANDYBENCH PROPERTIES - CXX_STANDARD 20 - ) -endif () +#if (ENABLE_CUDA) +# set(LIBRARIES ${LIBRARIES} cublas cudart) +# set_target_properties(CANDYBENCH 
PROPERTIES +# CUDA_STANDARD 20 +# CXX_STANDARD 20 +# ) +#else () +# set_target_properties(CANDYBENCH PROPERTIES +# CXX_STANDARD 20 +# ) +#endif () # 设置 MKL 库的路径 -set(MKL_INCLUDE_DIR "/usr/include/mkl") -set(MKL_LIB_DIR "/usr/lib/x86_64-linux-gnu") -#set(MPI_INCLUDE_PATH "/usr/include/openmpi-x86_64") -#set(MPI_LIBRARIES "/usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so") - +set(MKL_ROOT /opt/intel/oneapi/mkl/latest) +set(MKL_INCLUDE_DIR /opt/intel/oneapi/mkl/latest/include) +set(MKL_LIB_DIR /opt/intel/oneapi/mkl/latest/lib/intel64) set(MKL_LIBRARIES - "${MKL_LIB_DIR}/libmkl_intel_lp64.so" - "${MKL_LIB_DIR}/libmkl_sequential.so" - "${MKL_LIB_DIR}/libmkl_core.so" + "${MKL_LIB_DIR}/libmkl_intel_lp64.so" + "${MKL_LIB_DIR}/libmkl_sequential.so" + "${MKL_LIB_DIR}/libmkl_core.so" ) + target_include_directories(CANDYBENCH PUBLIC ${MKL_INCLUDE_DIR}) # MKL 和其他库的链接 target_link_libraries(CANDYBENCH PUBLIC diff --git a/DiskANN b/DiskANN new file mode 160000 index 000000000..b7a3b768f --- /dev/null +++ b/DiskANN @@ -0,0 +1 @@ +Subproject commit b7a3b768f7f690f48765420fcfe6d76bfb661966 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..b6056298e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,86 @@ +FROM ubuntu:22.04 + +WORKDIR /app + +COPY . 
/app + +RUN DEBIAN_FRONTEND=noninteractive apt-get update && \ + apt-get install -y --no-install-recommends \ + python3 python3-pip git build-essential \ + liblapack-dev libblas-dev libopenblas-dev \ + libboost-all-dev \ + libnuma-dev \ + libgflags-dev libgoogle-glog-dev \ + swig \ + libhdf5-dev \ + libaio-dev \ + libgoogle-perftools-dev \ + libomp-dev \ + libtbb-dev \ + libarchive-dev \ + libcurl4-openssl-dev \ + wget \ + curl \ + gnupg \ + libfmt-dev \ + python3-dev \ + libeigen3-dev \ + libspdlog-dev \ + pybind11-dev \ + pkg-config \ + zlib1g-dev \ + libssl-dev \ + gfortran \ + && rm -rf /var/lib/apt/lists/* && \ + ldconfig + +RUN wget https://github.com/Kitware/CMake/releases/download/v3.30.2/cmake-3.30.2-linux-x86_64.sh -O cmake.sh && \ + chmod +x cmake.sh && \ + ./cmake.sh --skip-license --prefix=/usr/local && \ + rm cmake.sh && \ + ln -sf /usr/local/bin/cmake /usr/bin/cmake + +RUN wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor -o /usr/share/keyrings/oneapi-archive-keyring.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + intel-oneapi-mkl-devel \ + && rm -rf /var/lib/apt/lists/* + +ENV MKLROOT="/opt/intel/oneapi/mkl/latest" +ENV LD_LIBRARY_PATH="${MKLROOT}/lib/intel64:${LD_LIBRARY_PATH}" + +RUN pip install --no-cache-dir \ + torch==2.3.0+cpu \ + torchvision==0.18.0+cpu \ + torchaudio==2.3.0+cpu \ + --index-url https://download.pytorch.org/whl/cpu + +ENV Torch_DIR="/usr/local/lib/python3.10/dist-packages/torch/share/cmake/Torch" + +WORKDIR /app +RUN pip install . + +WORKDIR /app/GTI/GTI/extern_libraries/n2 +RUN mkdir -p build && make shared_lib + +WORKDIR /app/GTI/GTI +RUN mkdir -p bin build && cd build && cmake -DCMAKE_BUILD_TYPE=Release .. 
&& make -j && make install + +WORKDIR /app/DiskANN +RUN mkdir -p build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DMKL_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64 \ + -DMKL_INCLUDE_PATH=/opt/intel/oneapi/mkl/latest/include \ + .. && \ + make -j && make install + +WORKDIR /app/IP-DiskANN +RUN mkdir -p build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DMKL_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64 \ + -DMKL_INCLUDE_PATH=/opt/intel/oneapi/mkl/latest/include \ + .. && \ + make -j && make install + +CMD ["bash"] \ No newline at end of file diff --git a/GTI b/GTI new file mode 160000 index 000000000..a21789816 --- /dev/null +++ b/GTI @@ -0,0 +1 @@ +Subproject commit a2178981626dce884c462e27477fc9ef9ad6ab1c diff --git a/IP-DiskANN b/IP-DiskANN new file mode 160000 index 000000000..ab06fe2c3 --- /dev/null +++ b/IP-DiskANN @@ -0,0 +1 @@ +Subproject commit ab06fe2c355a51d74cc3c4c06d0c50368edbf84d diff --git a/README.md b/README.md index 6d3eefafb..798ede746 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,372 @@ -# CANDY +# CANDOR-Bench: Benchmarking In-Memory Continuous ANNS under Dynamic Open-World Streams -A library and benchmark suite for Approximate Nearest Neighbor Search (ANNS). This project is compatible with LibTorch. +CANDOR-Bench (Continuous Approximate Nearest neighbor search under Dynamic Open-woRld Streams) is a benchmarking framework designed to evaluate in-memory ANNS algorithms under realistic, dynamic data stream conditions. 
## Table of Contents +- [Project Structure](#Project-Structure) +- [Datasets and Algorithms](#Datasets-and-Algorithms) + - [Summary of Datasets](#Summary-of-Datasets) + - [Summary of Algorithms](#Summary-of-Algorithms) - [Quick Start Guide](#quick-start-guide) - - [Docker Support](#docker-support) + - [Build With Docker](#Build-With-Docker) + - [Usage](#Usage) + +- [Additional Information](#additional-information) --- +## Project Structure + +``` +CANDY-Benchmark/ +├── benchmark/ +├── big-ann-benchmarks/ # Core benchmarking framework (Dynamic Open-World conditions) +│ ├── benchmark/ +│ │ ├── algorithms/ # Concurrent Track +│ │ ├── concurrent/ # Congestion Track +│ │ ├── congestion/ +│ │ ├── main.py +│ │ ├── runner.py +│ │ └── …… +│ ├── create_dataset.py +│ ├── requirements_py3.10.txt +│ ├── logging.conf +│ ├── neurips21/ +│ ├── neurips23/ # NeurIPS'23 benchmark configurations and scripts +│ │ ├── concurrent/ # Concurrent Track +│ │ ├── congestion/ # Congestion Track +│ │ ├── filter/ +│ │ ├── ood/ +│ │ ├── runbooks/ # Dynamic benchmark scenario definitions (e.g., T1, T3, etc.) +│ │ ├── sparse/ +│ │ ├── streaming/ +│ │ └── …… +│ └──…… +├── DiskANN/ # Integrated DiskANN-based algorithms +├── GTI/ # Integrated GTI algorithm source +├── IP-DiskANN/ # Integrated IP-DiskANN algorithm source +├── src/ # Main algorithm implementations +├── include/ # C++ header files +├── thirdparty/ # External dependencies +├── Dockerfile # Docker build recipe +├── requirements.txt +├── setup.py # Python package setup +└── …… +``` +## Datasets and Algorithms + +Our evaluation involves the following datasets and algorithms. + +### Summary of Datasets + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CategoryNameDescriptionDimensionData SizeQuery Size
Real-worldSIFTImage1281M10K
OpenImagesStreamingImage5121M10K
SunImage51279K200
SIFT100MImage128100M10K
TreviImage4096100K200
MsongAudio420990K200
COCOMulti-Modal768100K500
GloveText1001.192M200
MSTuringText10030M10K
SyntheticGaussiani.i.d valuesAdjustable500K1000
BlobGaussian Blobs768500K1000
WTEText768100K100
FreewayMLConstructed128100K1K
+ +### Summary of Algorithms + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CategoryAlgorithm NameDescription
+ Tree-based + SPTAGSpace-partitioning tree structure for efficient data segmentation.
+ LSH-based + LSHData-independent hashing to reduce dimensionality and approximate nearest neighbors.
LSHAPGLSH-driven optimization using LSB-Tree to differentiate graph regions.
+ Clustering-based + PQProduct quantization for efficient clustering into compact subspaces.
IVFPQInverted index with product quantization for hierarchical clustering.
OnlinePQIncremental updates of centroids in product quantization for streaming data.
PuckNon-orthogonal inverted indexes with multiple quantization optimized for large-scale datasets.
SCANNSmall-bit quantization to improve register utilization.
+ Graph-based + NSWNavigable Small World graph for fast nearest neighbor search.
HNSWHierarchical Navigable Small World for scalable search.
FreshDiskANNStreaming graph construction for large-scale proximity-based search with refined robust edge pruning.
MNRUEnhances HNSW with efficient updates to prevent unreachable points in dynamic environments.
CufeEnhances FreshDiskANN with batched neighbor expansion.
PyannsEnhances FreshDiskANN with fix-sized huge pages for optimized memory access.
IPDiskANNEnables efficient in-place deletions for FreshDiskANN, improving update performance without reconstructions.
GTIHybrid tree-graph indexing for efficient, dynamic high-dimensional search, with optimized updates and construction.
ParlayHNSWParallel, deterministic HNSW for improved scalability and performance.
ParlayVamanaParallel, deterministic FreshDiskANN implementation using Vamana for graph construction, with performance improvement.
+ ## Quick Start Guide +--- +# 🚨🚨 Strong Recommendation: Use Docker! 🚨🚨 + +> **We strongly recommend using Docker to build and run this project.** +> +> There are many algorithm libraries with complex dependencies. Setting up the environment locally can be difficult and error-prone. +> **Docker provides a consistent and reproducible environment, saving you time and avoiding compatibility issues.** +> +> **Note:** Building the Docker image may take **10–20 minutes** depending on your network and hardware. + +--- + +### Build With Docker +To build the project using Docker, simply use the provided Dockerfile located in the root directory. This ensures a consistent and reproducible environment for all dependencies and build steps. + +1. To initialize and update all submodules in the project, you can run: +``` +git submodule update --init --recursive +``` +2. You can build the Docker image with: +``` +docker build -t <image_name> . +``` +3. Once the image is built, you can run a container from it using the following command: +``` +docker run -it <image_name> +``` +4. After entering the container, navigate to the project directory: +``` +cd /app/big-ann-benchmarks +``` + + +### Usage + +All the following operations are performed in the root directory of big-ann-benchmarks. + +#### 2.1 Preparing dataset +Create a small, sample dataset. For example, to create a dataset with 10000 20-dimensional random floating point vectors, run: +``` +python create_dataset.py --dataset random-xs +``` +To see a complete list of datasets, run the following: +``` +python create_dataset.py --help +``` + +#### 2.2 Running Algorithms on the **congestion** Track + +To evaluate an algorithm under the `congestion` track, use the following command: +```bash +python3 run.py \ + --neurips23track congestion \ + --algorithm "$ALGO" \ + --nodocker \ + --rebuild \ + --runbook_path "$PATH" \ + --dataset "$DS" +``` +- algorithm "$ALGO": Name of the algorithm to evaluate. +- dataset "$DS": Name of the dataset to use.
+- runbook_path "$PATH": Path to the runbook file describing the test scenario. +- rebuild: Rebuild the target before running. + +#### 2.3 Computing Ground Truth for Runbooks + +To compute ground truth for a runbook: +1. **Clone and build the [DiskANN repository](https://github.com/Microsoft/DiskANN)** +2. Use the provided script to compute ground truth at various checkpoints: +``` +python3 benchmark/congestion/compute_gt.py \ + --runbook "$PATH_TO_RUNBOOK" \ + --dataset "$DATASET_NAME" \ + --gt_cmdline_tool ~/DiskANN/build/apps/utils/compute_groundtruth +``` + +#### 2.4 Exporting Results +1. To make the results available for post-processing, change permissions of the results folder +``` +sudo chmod 777 -R results/ +``` +2. The following command will summarize all results files into a single csv file +``` +python data_export.py --out "$OUT" --track congestion +``` +The `--out` path "$OUT" should be adjusted according to the testing scenario. Common values include: +- `gen` +- `batch` +- `event` +- `conceptDrift` +- `randomContamination` +- `randomDrop` +- `wordContamination` +- `bulkDeletion` +- `batchDeletion` +- `multiModal` +- …… ## Additional Information @@ -161,253 +576,8 @@ Figures will be generated in the `figures` directory. - [Generate Documentation](#generate-documentation) - [Accessing Documentation](#accessing-documentation) - [Known Issues](#known-issues) + --- -### Extra CMake Options - -You can set additional CMake options using `cmake -D