diff --git a/.github/workflows/benchmark-issue79.yml b/.github/workflows/benchmark-issue79.yml
new file mode 100644
index 0000000..703c336
--- /dev/null
+++ b/.github/workflows/benchmark-issue79.yml
@@ -0,0 +1,95 @@
+# Custom Docker build + benchmark for fix/issue-79 branch.
+# Builds image with the fix, runs benchmark, compares nested vs literal.
+# Does NOT run on main.
+name: Issue #79 Benchmark
+
+on:
+  push:
+    branches: [fix/issue-79]
+  pull_request:
+    branches: [main]
+
+jobs:
+  build-and-benchmark:
+    if: github.ref == 'refs/heads/fix/issue-79' || github.head_ref == 'fix/issue-79'
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y \
+            build-essential cmake gcc-10 g++-10 git \
+            libssl-dev libncurses5-dev pkg-config bison \
+            libtirpc-dev libldap2-dev libsasl2-dev libudev-dev \
+            libre2-dev libcurl4-openssl-dev libprotobuf-dev protobuf-compiler
+
+      - name: Cache Boost
+        id: cache-boost
+        uses: actions/cache@v4
+        with:
+          path: boost_cache
+          key: boost-mysql-8.4.8-v1
+
+      - name: Build MyVector plugin (amd64)
+        run: |
+          set -euo pipefail
+          build_dir="mysql-server"
+          rm -rf ${build_dir}
+          git clone --depth 1 --branch mysql-8.4.8 \
+            https://github.com/mysql/mysql-server.git ${build_dir}
+          mkdir -p ${build_dir}/plugin/myvector
+          cp src/*.cc ${build_dir}/plugin/myvector/
+          cp include/*.h ${build_dir}/plugin/myvector/
+          cp include/*.i ${build_dir}/plugin/myvector/ 2>/dev/null || true
+          cp CMakeLists.txt ${build_dir}/plugin/myvector/
+          cd ${build_dir}
+          mkdir -p bld && cd bld
+          cmake .. \
+            -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 \
+            -DDOWNLOAD_BOOST=1 -DWITH_BOOST=../../boost_cache \
+            -DWITH_UNIT_TESTS=OFF -DWITH_ROUTER=OFF -DWITH_RAPID=OFF \
+            -DWITH_NDB=OFF -DWITH_NDBCLUSTER=OFF \
+            -DWITH_EXAMPLE_STORAGE_ENGINE=OFF -DCMAKE_BUILD_TYPE=Release
+          make myvector -j$(nproc)
+          cp plugin_output_directory/myvector.so ../../myvector-amd64.so
+
+      - name: Prepare Docker build context
+        run: |
+          # myvector-amd64.so is already in the workspace root (written by the build step).
+          cp sql/myvectorplugin.sql .
+          ls -la myvector-amd64.so myvectorplugin.sql
+
+      - name: Build Docker image
+        run: |
+          docker build \
+            --build-arg MYSQL_VERSION=8.4 \
+            --build-arg TARGETARCH=amd64 \
+            -t ghcr.io/${{ github.repository }}:mysql8.4-fix-issue79 \
+            -f Dockerfile.oraclelinux9 \
+            .
+
+      - name: Run benchmark
+        run: |
+          # Without pipefail, `tee` would mask a failing benchmark script.
+          set -o pipefail
+          BENCH_ROWS=500 BENCH_DIM=768 BENCH_RUNS=3 \
+          MYVECTOR_IMAGE=ghcr.io/${{ github.repository }}:mysql8.4-fix-issue79 \
+          ./scripts/benchmark-issue79.sh 2>&1 | tee benchmark-results.txt
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-issue79-results
+          path: benchmark-results.txt
+
+      - name: Push image (branch only)
+        if: github.ref == 'refs/heads/fix/issue-79' && github.event_name == 'push'
+        run: |
+          echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
+          docker push ghcr.io/${{ github.repository }}:mysql8.4-fix-issue79
diff --git a/.gitignore b/.gitignore
index 13b9e24..4b08788 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
 /scripts/build-fast.sh
 /src/myvector.cc.tmp
 # Build artifacts
+.cache/
+mysql-server/
+mysql-server-test/
 *.o
 *.so
 *.dylib
diff --git a/docs/ISSUE_79_BENCHMARK_BASELINE.md b/docs/ISSUE_79_BENCHMARK_BASELINE.md
new file mode 100644
index 0000000..d264c9a
--- /dev/null
+++ b/docs/ISSUE_79_BENCHMARK_BASELINE.md
@@ -0,0 +1,36 @@
+# Issue #79 Benchmark Baseline (Before Fix)
+
+**Date:** 2026-03-13
+**Config:** rows=500, dim=768, runs=3
+**Image:** ghcr.io/askdba/myvector:mysql8.4
(prebuilt, no fix)
+
+## Results
+
+| Query | Run 1 | Run 2 | Run 3 | Avg (µs) |
+|-------|-------|-------|-------|----------|
+| Nested myvector_construct | 99694 | 180259 | 105390 | 128447 |
+| Literal 0x... | 101686 | 103537 | 94969 | 100064 |
+
+**Ratio (nested/literal): 1.28x**
+
+Nested is ~28% slower on average, but that average is dominated by the Run 2 outlier (180259 µs); runs 1 and 3 are within ~11% of the literal timings. With more rows (e.g. 54k as in the original issue), the gap is expected to widen significantly.
+
+## Raw Output
+
+```
+=== Issue #79 Benchmark (rows=500, dim=768, runs=3) ===
+--- Nested myvector_construct (issue #79 pattern) ---
+  Run 1: 99694 µs
+  Run 2: 180259 µs
+  Run 3: 105390 µs
+
+--- Precomputed 0x literal (baseline) ---
+  Run 1: 101686 µs
+  Run 2: 103537 µs
+  Run 3: 94969 µs
+
+=== Results ===
+Nested myvector_construct: 128447 µs (avg)
+Literal 0x...: 100064 µs (avg)
+Ratio (nested/literal): 1.28x
+```
diff --git a/scripts/benchmark-issue79.sh b/scripts/benchmark-issue79.sh
new file mode 100755
index 0000000..d4b643e
--- /dev/null
+++ b/scripts/benchmark-issue79.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+# Benchmark for issue #79: myvector_distance with nested myvector_construct.
+# Compares: nested myvector_construct (slow) vs precomputed 0x literal (fast).
+#
+# Usage:
+#   ./scripts/benchmark-issue79.sh                       # Use prebuilt image
+#   ./scripts/benchmark-issue79.sh /path/to/myvector.so  # Use custom plugin
+#
+# Env: BENCH_ROWS (default 1000), BENCH_DIM (default 768), BENCH_RUNS (default 3)
+#      768-dim matches issue #79 (CLIP). Use BENCH_DIM=3 for quick runs.
+#
+# Output: Timing for nested query and literal query, ratio.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+
+IMAGE="${MYVECTOR_IMAGE:-ghcr.io/askdba/myvector:mysql8.4}"
+CONTAINER="myvector-bench-$$"
+PLUGIN_SO="${1:-}"
+ROWS="${BENCH_ROWS:-1000}"
+RUNS="${BENCH_RUNS:-3}"
+DIM="${BENCH_DIM:-768}"
+
+# Fail fast on a bad plugin path instead of silently benchmarking the stock image.
+if [ -n "$PLUGIN_SO" ] && [ ! -f "$PLUGIN_SO" ]; then
+  echo "ERROR: plugin file not found: $PLUGIN_SO" >&2
+  exit 1
+fi
+
+# 768-dim matches issue #79 (CLIP vectors). 3-dim is too cheap to show the problem.
+echo "=== Issue #79 Benchmark (rows=$ROWS, dim=$DIM, runs=$RUNS) ==="
+
+# Register cleanup first so a failed or interrupted start still removes the container.
+trap 'docker rm -f "$CONTAINER" >/dev/null 2>&1 || true' EXIT
+docker run -d --name "$CONTAINER" \
+  -e MYSQL_ROOT_PASSWORD=bench \
+  -e MYSQL_DATABASE=bench \
+  -e MYSQL_ROOT_HOST=% \
+  "$IMAGE"
+
+echo "Waiting for MySQL..."
+for i in $(seq 1 60); do
+  docker exec "$CONTAINER" mysql -uroot -pbench -e "SELECT 1" 2>/dev/null && break
+  sleep 2
+done
+docker exec "$CONTAINER" mysql -uroot -pbench -e "SELECT 1" || { echo "MySQL not ready"; exit 1; }
+
+if [ -n "$PLUGIN_SO" ]; then
+  echo "Installing custom plugin from $PLUGIN_SO"
+  PLUGIN_DIR=$(docker exec "$CONTAINER" mysql -uroot -pbench -N -e "SELECT @@plugin_dir;")
+  docker cp "$PLUGIN_SO" "$CONTAINER:$PLUGIN_DIR/myvector.so"
+  docker exec "$CONTAINER" mysql -uroot -pbench -e "UNINSTALL PLUGIN myvector; INSTALL PLUGIN myvector SONAME 'myvector.so';"
+  docker exec "$CONTAINER" mysql -uroot -pbench -e "SOURCE /docker-entrypoint-initdb.d/myvectorplugin.sql;" 2>/dev/null || true
+fi
+
+# Generate N-dim vector string via Python (fast)
+gen_vec() {
+  python3 -c "
+import sys
+n, dim = int(sys.argv[1]), int(sys.argv[2])
+vals = [((n + i) * 0.001 - 0.5) for i in range(dim)]
+print('[' + ','.join(str(v) for v in vals) + ']')
+" "$1" "$DIM"
+}
+
+echo "Creating table with $ROWS rows (${DIM}-dim vectors)..."
+VARSIZE=$((DIM * 4 + 16))
+docker exec "$CONTAINER" mysql -uroot -pbench bench -e "
+  DROP TABLE IF EXISTS t79;
+  CREATE TABLE t79 (id INT PRIMARY KEY, v VARBINARY($VARSIZE));
+"
+
+# Query vector for nested/literal (constant across all rows)
+QUERY_VEC=$(gen_vec 1)
+
+# Insert in batches of 5 for large vectors
+for ((i=1; i<=ROWS; i+=5)); do
+  end=$((i+4))
+  [ $end -gt $ROWS ] && end=$ROWS
+  vals=""
+  for ((n=i; n<=end; n++)); do
+    vec=$(gen_vec $n)
+    vec_escaped="${vec//\'/\\\'}"
+    [ -n "$vals" ] && vals="$vals,"
+    vals="${vals}($n, myvector_construct('$vec_escaped'))"
+  done
+  docker exec "$CONTAINER" mysql -uroot -pbench bench -e "INSERT INTO t79 VALUES $vals" 2>/dev/null \
+    || { echo "ERROR: INSERT failed for rows $i-$end" >&2; exit 1; }
+done
+
+run_timed() {
+  local q="$1"
+  local start end
+  start=$(python3 -c "import time; print(int(time.time()*1e6))" 2>/dev/null || echo $(($(date +%s) * 1000000)))
+  # Abort on a failed query; otherwise its error latency would be reported as a timing.
+  if ! docker exec "$CONTAINER" mysql -uroot -pbench bench -N -e "$q" >/dev/null 2>&1; then
+    echo "ERROR: benchmark query failed" >&2
+    return 1
+  fi
+  end=$(python3 -c "import time; print(int(time.time()*1e6))" 2>/dev/null || echo $(($(date +%s) * 1000000)))
+  echo $((end - start))
+}
+
+# Escape single quotes for SQL
+QUERY_VEC_SQL="${QUERY_VEC//\'/\'\'}"
+echo ""
+echo "--- Nested myvector_construct (issue #79 pattern) ---"
+nested_times=()
+for r in $(seq 1 $RUNS); do
+  t=$(run_timed "SELECT COUNT(*) FROM (SELECT myvector_distance(v, myvector_construct('$QUERY_VEC_SQL'), 'L2') AS d FROM t79) sub")
+  nested_times+=("$t")
+  echo "  Run $r: ${t} µs"
+done
+
+echo ""
+echo "--- Precomputed 0x literal (baseline) ---"
+LIT_HEX=$(docker exec "$CONTAINER" mysql -uroot -pbench -N -e "SELECT HEX(myvector_construct('$QUERY_VEC_SQL')) FROM DUAL;" 2>/dev/null)
+if [ -z "$LIT_HEX" ] || [ "$LIT_HEX" = "NULL" ]; then
+  echo "ERROR: could not precompute the literal query vector" >&2
+  exit 1
+fi
+literal_times=()
+for r in $(seq 1 $RUNS); do
+  t=$(run_timed "SELECT COUNT(*) FROM (SELECT myvector_distance(v, 0x$LIT_HEX, 'L2') AS d FROM t79) sub")
+  literal_times+=("$t")
+  echo "  Run $r: ${t} µs"
+done
+
+# Compute average
+nested_sum=0
+literal_sum=0
+for t in "${nested_times[@]}"; do nested_sum=$((nested_sum + t)); done
+for t in "${literal_times[@]}"; do literal_sum=$((literal_sum + t)); done
+nested_avg=$((nested_sum / RUNS))
+literal_avg=$((literal_sum / RUNS))
+if [ "$literal_avg" -gt 0 ]; then
+  ratio=$(echo "scale=2; $nested_avg / $literal_avg" | bc 2>/dev/null || echo "N/A")
+  [[ "$ratio" = .* ]] && ratio="0$ratio"
+else
+  ratio="N/A"
+fi
+
+echo ""
+echo "=== Results ==="
+echo "Nested myvector_construct: ${nested_avg} µs (avg)"
+echo "Literal 0x...: ${literal_avg} µs (avg)"
+echo "Ratio (nested/literal): ${ratio}x"
+echo ""
+if [ "$ratio" != "N/A" ]; then
+  ratio_int="${ratio%%.*}"
+  [ -z "$ratio_int" ] && ratio_int="0"
+  if [ "$ratio_int" -gt 2 ] 2>/dev/null; then
+    echo "NOTE: Nested is ${ratio}x slower. Fix should bring ratio close to 1.0x"
+  fi
+fi
diff --git a/src/myvector.cc b/src/myvector.cc
index 4aef068..6f23c10 100644
--- a/src/myvector.cc
+++ b/src/myvector.cc
@@ -1652,6 +1652,87 @@ int SQFloatVectorToBinaryVector(FP32* fvec, unsigned long* ivec, int dim) {
   return (idx * sizeof(unsigned long));  // number of bytes
 }
 
+/* Helper: perform myvector_construct conversion. Used for both constant-arg
+ * caching (in init) and per-row conversion. Returns true on success.
+ */
+static bool myvector_construct_do_convert(char* ptr,
+                                          unsigned long ptrlen,
+                                          const char* opt,
+                                          unsigned long optlen,
+                                          char* retvec,
+                                          unsigned long* retlen,
+                                          unsigned char* is_null,
+                                          unsigned char* error) {
+  bool skipConvert = false;
+  if (opt && optlen) {
+    MyVectorOptions vo(std::string(opt, optlen));
+    if (vo.getOption("i") == "float" && vo.getOption("o") == "float")
+      skipConvert = true;
+    if (vo.getOption("o") == "bv") {
+      myvector_construct_bv(
+          vo.getOption("i"), ptr, retvec, ptrlen, retlen, is_null, error);
+      return (*error == 0);
+    }
+  }
+
+  int len = 0;
+  if (skipConvert) {
+    if ((ptrlen % sizeof(FP32)) != 0) {
+      *error = 1;
+      return false;
+    }
+    memcpy(retvec, ptr, ptrlen);
+    len = ptrlen;
+  } else {
+    char* start = nullptr;
+    char endch;
+    if ((start = strchr(ptr, '[')))
+      endch = ']';
+    else if ((start = strchr(ptr, '{')))
+      endch = '}';
+    else if ((start = strchr(ptr, '(')))
+      endch = ')';
+    else {
+      start = ptr;
+      endch = '\0';
+    }
+    if (endch)
+      start++;
+    char* p = start;
+    while (*p && *p != endch) {
+      while (*p && (*p == ' ' || *p == ','))
+        p++;
+      char* p1 = p;
+      /* Stop at NUL too: a missing closing bracket must not run off the
+       * end of the input. */
+      while (*p && *p != ' ' && *p != ',' && *p != endch)
+        p++;
+      char buff[64];
+      /* Reject oversized tokens instead of overflowing buff. */
+      if ((size_t)(p - p1) >= sizeof(buff)) {
+        *error = 1;
+        return false;
+      }
+      strncpy(buff, p1, (p - p1));
+      buff[(int)(p - p1)] = 0;
+      FP32 fval = atof(buff);
+      memcpy(&retvec[len], &fval, sizeof(FP32));
+      len += sizeof(FP32);
+    }
+  }
+
+#if MYSQL_VERSION_ID < 90000
+  unsigned int metadata = MYVECTOR_V1_FP32_METADATA;
+  memcpy(&retvec[len], &metadata, sizeof(metadata));
+  len += sizeof(metadata);
+  ha_checksum cksum = my_checksum(0, (const unsigned char*)retvec, len);
+  memcpy(&retvec[len], &cksum, sizeof(cksum));
+  len += sizeof(cksum);
+#endif
+  *retlen = len;
+  return true;
+}
+
 char* myvector_construct_bv(const std::string& srctype,
                             char* src,
                             char* dst,
@@ -1726,7 +1807,41 @@ PLUGIN_EXPORT bool myvector_construct_init(UDF_INIT* initid,
     return true;  // error
   }
   initid->max_length = MYVECTOR_CONSTRUCT_MAX_LEN;
-  initid->ptr = (char*)malloc(MYVECTOR_CONSTRUCT_MAX_LEN);
+  size_t alloc_size = sizeof(size_t) + MYVECTOR_CONSTRUCT_MAX_LEN;
+  char* buf = (char*)malloc(alloc_size);
+  if (!buf) {
+    strcpy(message, "myvector_construct: malloc failed");
+    return true;
+  }
+  *(size_t*)buf = 0;  // 0 = no cache
+
+  /* Issue #79: when every argument is a constant string, convert once
+   * here and cache the result. MySQL passes args->args[i] == NULL for
+   * non-constant arguments, so a non-constant options argument must
+   * disable the cache: its value is only known per row. */
+  bool input_const = args->arg_type[0] == STRING_RESULT &&
+                     args->args[0] != nullptr && args->lengths[0] > 0;
+  bool opt_const =
+      args->arg_count < 2 ||
+      (args->arg_type[1] == STRING_RESULT && args->args[1] != nullptr);
+  if (input_const && opt_const) {
+    const char* opt = (args->arg_count >= 2) ? args->args[1] : nullptr;
+    unsigned long optlen = (args->arg_count >= 2) ? args->lengths[1] : 0;
+    unsigned char is_null = 0, err = 0;
+    unsigned long result_len = 0;
+    if (myvector_construct_do_convert(args->args[0],
+                                      args->lengths[0],
+                                      opt,
+                                      optlen,
+                                      buf + sizeof(size_t),
+                                      &result_len,
+                                      &is_null,
+                                      &err) &&
+        !err && !is_null) {
+      *(size_t*)buf = result_len;
+    }
+  }
+  initid->ptr = buf;
   return false;
 }
 
@@ -1749,106 +1864,58 @@ PLUGIN_EXPORT char* myvector_construct(UDF_INIT* initid,
                                        unsigned long* length,
                                        unsigned char* is_null,
                                        unsigned char* error) {
-  char* ptr = args->args[0];
-  const char* opt = nullptr;
-  if (args->arg_count == 2)
-    opt = args->args[1];
+  /* Fast path (issue #79): init already converted the constant arguments. */
+  size_t* cache_len = (size_t*)initid->ptr;
+  if (*cache_len > 0) {
+    *length = *cache_len;
+    return initid->ptr + sizeof(size_t);
+  }
 
-  char* start = nullptr;
-  char endch;
-  char* retvec = initid->ptr;
-  int retlen = 0;
-  bool skipConvert = false;
+  char* ptr = args->args[0];
+  if (!ptr) {
+    *is_null = 1;
+    return initid->ptr + sizeof(size_t);
+  }
 
-  if (!opt || !args->lengths[1])
-    opt = "i=string,o=float";  // i=string,o=float
-  else {
-    MyVectorOptions vo(opt);
-
-    /*
-     * i = float, o = float : App already has the vector in floats, just
-     need to add MyVector metadata and checksum
-     * i = bv, o = bv : App is sending bytes of a Binary Vector
-     (e.g Cohere model). Add metadata + checksum
-     * i = string, o = bv : Convert series of 1-byte int's to Binary
-     Vector
-     * i = column, o = bv : App wants to implement SQ compression. Convert
-     MyVector float column to BV
-     */
-    if (vo.getOption("i") == "float" && vo.getOption("o") == "float")
-      skipConvert = true;
+  const char* opt =
+      (args->arg_count >= 2 && args->args[1]) ? args->args[1] : nullptr;
+  unsigned long optlen =
+      (args->arg_count >= 2 && args->args[1]) ? args->lengths[1] : 0;
 
-    /* For Binary Vectors, we will branch out to a separate routine */
+  if (opt && optlen) {
+    MyVectorOptions vo(std::string(opt, optlen));
+    /* Binary vectors are produced by myvector_construct_bv directly. */
     if (vo.getOption("o") == "bv")
       return myvector_construct_bv(vo.getOption("i"),
                                    ptr,
-                                   initid->ptr,
+                                   initid->ptr + sizeof(size_t),
                                    args->lengths[0],
                                    length,
                                    is_null,
                                    error);
-  }  // else opt
-
-  if (skipConvert) {
-    // User is passing floats directly in bind variable or using "0x"
-    // literal
-    if ((args->lengths[0] % sizeof(FP32)) != 0)
-      SET_UDF_ERROR_AND_RETURN(
-          "Input vector is malformed, length not a "
-          "multiple of sizeof(float) %lu.",
-          args->lengths[0]);
-    memcpy(retvec, ptr, args->lengths[0]);
-    retlen = args->lengths[0];
-    goto addChecksum;
-  }
-
-  /* Below code implements conversion from string "[0.134511 -0.082219 ...]"
-   * to floats followed by metadata & checksum.
-   */
-  if ((start = strchr(ptr, '[')))
-    endch = ']';
-  else if ((start = strchr(ptr, '{')))
-    endch = '}';
-  else if ((start = strchr(ptr, '(')))
-    endch = ')';
-  else {
-    start = ptr;
-    endch = '\0';
+    /* Keep the pre-refactor diagnostic for malformed raw float input; the
+     * shared helper only sets *error. */
+    if (vo.getOption("i") == "float" && vo.getOption("o") == "float" &&
+        (args->lengths[0] % sizeof(FP32)) != 0)
+      SET_UDF_ERROR_AND_RETURN(
+          "Input vector is malformed, length not a "
+          "multiple of sizeof(float) %lu.",
+          args->lengths[0]);
   }
 
-  if (endch)
-    start++;
-  ptr = start;
-
-  while (*ptr && *ptr != endch) {
-    while (*ptr && (*ptr == ' ' || *ptr == ','))
-      ptr++;
-    char* p1 = ptr;
-    while (*ptr != ' ' && *ptr != ',' && *ptr != endch)
-      ptr++;
-    char buff[64];
-    strncpy(buff, p1, (ptr - p1));
-
-    // TODO - atof() returns 0 if not a valid float
-    FP32 fval = atof(buff);  // change these 2 lines for FP16, INT8 etc
-    memcpy(&retvec[retlen], &fval, sizeof(FP32));
-
-    retlen += sizeof(FP32);
-  }  // while
+  unsigned long retlen = 0;
+  if (!myvector_construct_do_convert(ptr,
+                                     args->lengths[0],
+                                     opt,
+                                     optlen,
+                                     initid->ptr + sizeof(size_t),
+                                     &retlen,
+                                     is_null,
+                                     error))
+    return initid->ptr + sizeof(size_t);
 
-addChecksum:
-#if MYSQL_VERSION_ID < 90000
-  unsigned int metadata = MYVECTOR_V1_FP32_METADATA;
-  memcpy(&retvec[retlen], &metadata, sizeof(metadata));
-  retlen += sizeof(metadata);
-
-  ha_checksum cksum = my_checksum(0, (const unsigned char*)retvec, retlen);
-  memcpy(&retvec[retlen], &cksum, sizeof(cksum));
-  retlen += sizeof(cksum);
-#endif
   *length = retlen;
-
-  return retvec;
+  return initid->ptr + sizeof(size_t);
 }
 
 PLUGIN_EXPORT void myvector_construct_deinit(UDF_INIT* initid) {