Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions .github/workflows/benchmark-issue79.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Custom Docker build + benchmark for fix/issue-79 branch.
# Builds image with the fix, runs benchmark, compares nested vs literal.
# Does NOT run on main.
# The name is quoted on purpose: in a plain (unquoted) YAML scalar, " #" starts
# a comment, which would truncate the workflow name to just "Issue".
name: "Issue #79 Benchmark"

# Triggers: pushes to fix/issue-79, and pull requests whose base branch is main.
on:
push:
branches: [fix/issue-79]
pull_request:
branches: [main]

jobs:
build-and-benchmark:
# Gate: run only when the triggering ref is fix/issue-79 (push) or the pull
# request's head branch is fix/issue-79; any other pull request into main
# skips this job.
if: github.ref == 'refs/heads/fix/issue-79' || github.head_ref == 'fix/issue-79'
runs-on: ubuntu-22.04
permissions:
contents: read
# Needed by the final step, which pushes the built image to ghcr.io.
packages: write

steps:
# Check out the repository into the workspace.
- name: Checkout
uses: actions/checkout@v4

# Toolchain and development libraries for the plugin build. gcc-10/g++-10 are
# the compilers the build step passes to cmake.
- name: Install build dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
build-essential cmake gcc-10 g++-10 git \
libssl-dev libncurses5-dev pkg-config bison \
libtirpc-dev libldap2-dev libsasl2-dev libudev-dev \
libre2-dev libcurl4-openssl-dev libprotobuf-dev protobuf-compiler

# Cache the boost_cache directory under a fixed key. The build step hands this
# directory to cmake via -DWITH_BOOST. Bump the "-v1" suffix to invalidate it.
- name: Cache Boost
id: cache-boost
uses: actions/cache@v4
with:
path: boost_cache
key: boost-mysql-8.4.8-v1

# Builds myvector.so in-tree: the plugin sources are copied into a fresh MySQL
# 8.4.8 checkout and only the "myvector" target is compiled.
- name: Build MyVector plugin (amd64)
run: |
set -euo pipefail
build_dir="mysql-server"
# Start from a clean tree, then shallow-clone the mysql-8.4.8 tag.
rm -rf ${build_dir}
git clone --depth 1 --branch mysql-8.4.8 \
https://github.com/mysql/mysql-server.git ${build_dir}
# Drop the plugin sources, headers and CMakeLists.txt into plugin/myvector.
mkdir -p ${build_dir}/plugin/myvector
cp src/*.cc ${build_dir}/plugin/myvector/
cp include/*.h ${build_dir}/plugin/myvector/
# Best effort: *.i files may not exist, so a failed copy is ignored on purpose.
cp include/*.i ${build_dir}/plugin/myvector/ 2>/dev/null || true
cp CMakeLists.txt ${build_dir}/plugin/myvector/
cd ${build_dir}
mkdir -p bld && cd bld
# Out-of-source configure from mysql-server/bld; ../../boost_cache resolves to
# the cached boost_cache directory in the workspace root. Optional components
# are switched off.
cmake .. \
-DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 \
-DDOWNLOAD_BOOST=1 -DWITH_BOOST=../../boost_cache \
-DWITH_UNIT_TESTS=OFF -DWITH_ROUTER=OFF -DWITH_RAPID=OFF \
-DWITH_NDB=OFF -DWITH_NDBCLUSTER=OFF \
-DWITH_EXAMPLE_STORAGE_ENGINE=OFF -DCMAKE_BUILD_TYPE=Release
# Build only the plugin target, one job per available CPU.
make myvector -j$(nproc)
# Two levels up from mysql-server/bld is the workspace root, so the artifact
# lands there as myvector-amd64.so.
cp plugin_output_directory/myvector.so ../../myvector-amd64.so

- name: Prepare Docker build context
  run: |
    # myvector-amd64.so is already in the repository root (the build step
    # writes it there). Copying it onto itself makes GNU cp exit non-zero
    # ("are the same file"), which would abort the job, so no copy is done.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Remove self-copy of plugin .so in benchmark workflow

The previous step already writes myvector-amd64.so into the repository root, so cp myvector-amd64.so . copies the file onto itself; GNU cp reports this as an error ('file' and './file' are the same file) and exits non-zero. Because GitHub Actions runs run steps with bash -e, this aborts the job before Docker build and benchmark execution.

Useful? React with 👍 / 👎.

cp sql/myvectorplugin.sql .
ls -la myvector-amd64.so myvectorplugin.sql

# Build the benchmark image from Dockerfile.oraclelinux9 with the repository
# root as build context, tagged mysql8.4-fix-issue79 under this repository's
# ghcr.io namespace.
# NOTE(review): presumably the Dockerfile picks up myvector-${TARGETARCH}.so and
# myvectorplugin.sql from the context root - confirm against the Dockerfile.
- name: Build Docker image
run: |
docker build \
--build-arg MYSQL_VERSION=8.4 \
--build-arg TARGETARCH=amd64 \
-t ghcr.io/${{ github.repository }}:mysql8.4-fix-issue79 \
-f Dockerfile.oraclelinux9 \
.

- name: Run benchmark
  run: |
    # With no explicit `shell:`, GitHub runs this script with `bash -e`, i.e.
    # without pipefail. The pipeline's status would then be tee's, and a
    # failing benchmark would leave the step green. Enable pipefail so the
    # benchmark script's exit status decides the step result.
    set -o pipefail
    BENCH_ROWS=500 BENCH_DIM=768 BENCH_RUNS=3 \
      MYVECTOR_IMAGE=ghcr.io/${{ github.repository }}:mysql8.4-fix-issue79 \
      ./scripts/benchmark-issue79.sh 2>&1 | tee benchmark-results.txt
- name: Upload benchmark results
  # Steps are skipped by default once an earlier step has failed, but a partial
  # benchmark log is most useful exactly then. Upload unless the run was
  # cancelled.
  if: ${{ !cancelled() }}
  uses: actions/upload-artifact@v4
  with:
    name: benchmark-issue79-results
    path: benchmark-results.txt

# Publish the image only for direct pushes to fix/issue-79; pull_request runs
# build and benchmark but never push.
- name: Push image (branch only)
if: github.ref == 'refs/heads/fix/issue-79' && github.event_name == 'push'
run: |
# The token is piped via stdin so it does not appear in the process arguments.
echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
docker push ghcr.io/${{ github.repository }}:mysql8.4-fix-issue79
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
/scripts/build-fast.sh
/src/myvector.cc.tmp
# Build artifacts
.cache/
mysql-server/
mysql-server-test/
*.o
*.so
*.dylib
Expand Down
36 changes: 36 additions & 0 deletions docs/ISSUE_79_BENCHMARK_BASELINE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Issue #79 Benchmark Baseline (Before Fix)

**Date:** 2026-03-13
**Config:** rows=500, dim=768, runs=3
**Image:** ghcr.io/askdba/myvector:mysql8.4 (prebuilt, no fix)

## Results

| Query | Run 1 | Run 2 | Run 3 | Avg (µs) |
|-------|-------|-------|-------|----------|
| Nested myvector_construct | 99694 | 180259 | 105390 | 128447 |
| Literal 0x... | 101686 | 103537 | 94969 | 100064 |

**Ratio (nested/literal): 1.28x**

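On average, nested is ~28% slower, but that average is dominated by a single outlier (Run 2, 180259 µs); Runs 1 and 3 of the nested query fall within the spread of the literal timings, so three runs at 500 rows are barely enough to separate the two. With more rows (e.g. 54k as in the original issue), the gap is expected to widen significantly.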

## Raw Output

```
=== Issue #79 Benchmark (rows=500, dim=768, runs=3) ===
--- Nested myvector_construct (issue #79 pattern) ---
Run 1: 99694 µs
Run 2: 180259 µs
Run 3: 105390 µs

--- Precomputed 0x literal (baseline) ---
Run 1: 101686 µs
Run 2: 103537 µs
Run 3: 94969 µs

=== Results ===
Nested myvector_construct: 128447 µs (avg)
Literal 0x...: 100064 µs (avg)
Ratio (nested/literal): 1.28x
```
140 changes: 140 additions & 0 deletions scripts/benchmark-issue79.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env bash
# Benchmark for issue #79: myvector_distance with nested myvector_construct.
# Compares: nested myvector_construct (slow) vs precomputed 0x literal (fast).
#
# Usage:
# ./scripts/benchmark-issue79.sh # Use prebuilt image
# ./scripts/benchmark-issue79.sh /path/to/myvector.so # Use custom plugin
#
# Env: BENCH_ROWS (default 1000), BENCH_DIM (default 768), BENCH_RUNS (default 3)
# 768-dim matches issue #79 (CLIP). Use BENCH_DIM=3 for quick runs.
#
# Output: Timing for nested query and literal query, ratio.
set -euo pipefail

#######################################
# Exit with status 2 unless a setting is a positive decimal integer.
# ROWS/RUNS/DIM are used in shell arithmetic and interpolated into SQL, so a
# zero or non-numeric value would otherwise only surface much later (for
# example as a division by zero when averaging over RUNS=0).
# Arguments: $1 - setting name (used in the error message only)
#            $2 - value to check (leading zeros are rejected: they would be
#                 read as octal in arithmetic)
# Outputs:   error message on stderr when the value is rejected
#######################################
require_positive_int() {
  local name="$1" value="$2"
  if [[ ! "$value" =~ ^[1-9][0-9]*$ ]]; then
    printf 'error: %s must be a positive integer, got "%s"\n' "$name" "$value" >&2
    exit 2
  fi
}

# Always run from the repository root, whatever the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$REPO_ROOT"

IMAGE="${MYVECTOR_IMAGE:-ghcr.io/askdba/myvector:mysql8.4}"
# The PID suffix keeps concurrent runs from fighting over one container name.
CONTAINER="myvector-bench-$$"
# Optional first argument: path to a custom myvector.so to install.
PLUGIN_SO="${1:-}"
ROWS="${BENCH_ROWS:-1000}"
RUNS="${BENCH_RUNS:-3}"
DIM="${BENCH_DIM:-768}"

require_positive_int BENCH_ROWS "$ROWS"
require_positive_int BENCH_RUNS "$RUNS"
require_positive_int BENCH_DIM "$DIM"

# 768-dim matches issue #79 (CLIP vectors). 3-dim is too cheap to show the problem.
echo "=== Issue #79 Benchmark (rows=$ROWS, dim=$DIM, runs=$RUNS) ==="

# Install the cleanup handler BEFORE starting the container. `docker run` can
# create the container and then fail to start it; under `set -e` the script
# would exit right there, and a trap registered afterwards would never run,
# leaving the container behind.
cleanup_container() {
  docker rm -f "$CONTAINER" 2>/dev/null || true
}
trap cleanup_container EXIT

docker run -d --name "$CONTAINER" \
  -e MYSQL_ROOT_PASSWORD=bench \
  -e MYSQL_DATABASE=bench \
  -e MYSQL_ROOT_HOST=% \
  "$IMAGE"

# Poll for up to 60 attempts, 2 s apart, until the server answers a trivial
# query. stderr is silenced because connection errors are expected during boot.
echo "Waiting for MySQL..."
for ((attempt = 1; attempt <= 60; attempt++)); do
  docker exec "$CONTAINER" mysql -uroot -pbench -e "SELECT 1" 2>/dev/null && break
  sleep 2
done
# Final check with stderr visible, so the real error is shown if MySQL never
# came up.
if ! docker exec "$CONTAINER" mysql -uroot -pbench -e "SELECT 1"; then
  echo "MySQL not ready" >&2
  exit 1
fi

# Optionally replace the image's plugin with a locally built myvector.so.
if [ -n "$PLUGIN_SO" ]; then
  # A path was given explicitly. If it does not exist, stop instead of silently
  # benchmarking the prebuilt plugin and reporting its numbers as the custom
  # build's.
  if [ ! -f "$PLUGIN_SO" ]; then
    echo "error: custom plugin not found: $PLUGIN_SO" >&2
    exit 1
  fi
  echo "Installing custom plugin from $PLUGIN_SO"
  PLUGIN_DIR=$(docker exec "$CONTAINER" mysql -uroot -pbench -N -e "SELECT @@plugin_dir;")
  docker cp "$PLUGIN_SO" "$CONTAINER:$PLUGIN_DIR/myvector.so"
  # Reload so the server picks up the copied binary.
  docker exec "$CONTAINER" mysql -uroot -pbench -e "UNINSTALL PLUGIN myvector; INSTALL PLUGIN myvector SONAME 'myvector.so';"
  # Best effort: re-run the plugin's SQL setup script. Failures are
  # deliberately ignored.
  docker exec "$CONTAINER" mysql -uroot -pbench -e "SOURCE /docker-entrypoint-initdb.d/myvectorplugin.sql;" 2>/dev/null || true
fi

#######################################
# Print a deterministic vector literal "[v0,v1,...]" for a given seed.
# Element k is (seed + k) * 0.001 - 0.5, formatted by Python's str().
# Globals:   DIM (read) - number of elements to emit
# Arguments: $1 - integer seed
# Outputs:   the bracketed, comma-separated vector on stdout
#######################################
gen_vec() {
  local seed="$1"
  python3 -c '
import sys
seed, width = int(sys.argv[1]), int(sys.argv[2])
print("[%s]" % ",".join(str((seed + k) * 0.001 - 0.5) for k in range(width)))
' "$seed" "$DIM"
}

echo "Creating table with $ROWS rows (${DIM}-dim vectors)..."
# Column width for one stored vector.
# NOTE(review): presumably 4 bytes per component plus 16 bytes of
# header/slack - confirm against the plugin's binary vector layout.
VARSIZE=$((DIM * 4 + 16))
docker exec "$CONTAINER" mysql -uroot -pbench bench -e "
DROP TABLE IF EXISTS t79;
CREATE TABLE t79 (id INT PRIMARY KEY, v VARBINARY($VARSIZE));
"

# Query vector for nested/literal (constant across all rows)
QUERY_VEC=$(gen_vec 1)

# Insert in small batches: each 768-dim vector literal is several KB of text,
# and the whole statement is passed on the mysql command line.
batch_size=5
for ((i = 1; i <= ROWS; i += batch_size)); do
  end=$((i + batch_size - 1))
  if ((end > ROWS)); then
    end=$ROWS
  fi
  vals=""
  for ((n = i; n <= end; n++)); do
    vec=$(gen_vec "$n")
    # Double any single quote (standard SQL escaping, the same scheme used for
    # the query vector further down).
    vec_sql="${vec//\'/\'\'}"
    vals="${vals:+$vals,}($n, myvector_construct('$vec_sql'))"
  done
  # Keep stderr quiet on success, but capture it so a failed INSERT is
  # reported. Discarding it would make `set -e` abort the script with no hint
  # of what went wrong.
  if ! insert_err=$(docker exec "$CONTAINER" mysql -uroot -pbench bench \
    -e "INSERT INTO t79 VALUES $vals" 2>&1 >/dev/null); then
    printf 'error: INSERT of rows %d-%d failed: %s\n' "$i" "$end" "$insert_err" >&2
    exit 1
  fi
done

#######################################
# Run one SQL statement in the benchmark container and print the elapsed
# wall-clock time in microseconds.
# The measured interval brackets the whole `docker exec ... mysql` invocation
# and the hand-over between the two timestamp helpers, so it includes client
# and process start-up overhead, not just server-side execution time.
# Globals:   CONTAINER (read)
# Arguments: $1 - SQL text to execute against the "bench" database
# Outputs:   elapsed microseconds on stdout; the query's error text on stderr
#            if it fails
# Returns:   0 on success, 1 if the query failed (nothing is printed on
#            stdout, so a failed query can never be recorded as a timing
#            sample)
#######################################
run_timed() {
  local q="$1"
  local start end err
  # Microsecond clock via python3; falls back to whole seconds from date(1).
  start=$(python3 -c "import time; print(int(time.time()*1e6))" 2>/dev/null || echo $(($(date +%s) * 1000000)))
  # Query output is discarded. stderr is captured instead of dropped so a
  # failure can be reported. Checking the status explicitly matters because
  # callers use $(run_timed ...), and bash clears errexit inside command
  # substitutions.
  if ! err=$(docker exec "$CONTAINER" mysql -uroot -pbench bench -N -e "$q" 2>&1 >/dev/null); then
    printf 'run_timed: query failed: %s\n' "$err" >&2
    return 1
  fi
  end=$(python3 -c "import time; print(int(time.time()*1e6))" 2>/dev/null || echo $(($(date +%s) * 1000000)))
  echo $((end - start))
}

# Escape single quotes for SQL
QUERY_VEC_SQL="${QUERY_VEC//\'/\'\'}"

# Variant 1: the query vector is built by myvector_construct() inside the
# distance call (the pattern reported as slow in issue #79).
echo ""
echo "--- Nested myvector_construct (issue #79 pattern) ---"
nested_times=()
for ((r = 1; r <= RUNS; r++)); do
  t=$(run_timed "SELECT COUNT(*) FROM (SELECT myvector_distance(v, myvector_construct('$QUERY_VEC_SQL'), 'L2') AS d FROM t79) sub")
  nested_times+=("$t")
  echo " Run $r: ${t} µs"
done

# Variant 2: the same vector, precomputed once and passed as a 0x... literal.
echo ""
echo "--- Precomputed 0x literal (baseline) ---"
LIT_HEX=$(docker exec "$CONTAINER" mysql -uroot -pbench -N -e "SELECT HEX(myvector_construct('$QUERY_VEC_SQL')) FROM DUAL;" 2>/dev/null)
# Refuse to continue with an empty or non-hex value (e.g. "NULL"). It would
# turn the baseline query into invalid SQL, and the baseline timings would
# then be meaningless.
if [[ ! "$LIT_HEX" =~ ^[0-9A-Fa-f]+$ ]]; then
  echo "error: could not precompute the hex literal for the query vector (got '${LIT_HEX:0:40}')" >&2
  exit 1
fi
literal_times=()
for ((r = 1; r <= RUNS; r++)); do
  t=$(run_timed "SELECT COUNT(*) FROM (SELECT myvector_distance(v, 0x$LIT_HEX, 'L2') AS d FROM t79) sub")
  literal_times+=("$t")
  echo " Run $r: ${t} µs"
done

#######################################
# Print the truncating integer mean of the given samples.
# Arguments: zero or more integer samples (microseconds)
# Outputs:   the mean on stdout; 0 when no samples are given (avoids a
#            division by zero)
#######################################
average_us() {
  local sum=0 sample
  if (($# == 0)); then
    echo 0
    return 0
  fi
  for sample in "$@"; do
    sum=$((sum + sample))
  done
  echo $((sum / $#))
}

#######################################
# Print numerator/denominator truncated to two decimals, e.g. "1.28".
# Uses integer arithmetic only, so it needs no external tool such as bc.
# Arguments: $1 - numerator, $2 - denominator (non-negative integers)
# Outputs:   the ratio on stdout, or "N/A" when the denominator is not positive
#######################################
format_ratio() {
  local num="$1" den="$2" scaled
  if ((den <= 0)); then
    echo "N/A"
    return 0
  fi
  scaled=$((num * 100 / den))
  printf '%d.%02d\n' $((scaled / 100)) $((scaled % 100))
}

# Compute average
# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on older
# bash versions.
nested_avg=$(average_us ${nested_times[@]+"${nested_times[@]}"})
literal_avg=$(average_us ${literal_times[@]+"${literal_times[@]}"})
ratio=$(format_ratio "$nested_avg" "$literal_avg")

echo ""
echo "=== Results ==="
echo "Nested myvector_construct: ${nested_avg} µs (avg)"
echo "Literal 0x...: ${literal_avg} µs (avg)"
echo "Ratio (nested/literal): ${ratio}x"
echo ""
# Flag a slowdown of more than 2x. The averages are compared directly:
# comparing the ratio's truncated integer part against 2 would only trigger
# from 3.0x upwards.
if [ "$ratio" != "N/A" ] && ((nested_avg > 2 * literal_avg)); then
  echo "NOTE: Nested is ${ratio}x slower. Fix should bring ratio close to 1.0x"
fi
Loading
Loading