def editConfig(src, dest, key, value):
    """Copy the headerless key/value CSV *src* to *dest* with *key* set to *value*.

    Config files are CSVs whose column 0 holds keys and column 1 holds values
    (extra columns, e.g. a type tag, are preserved).  If *key* is absent,
    row 0 is overwritten — the legacy fallback of the original linear scan,
    kept so existing callers behave the same.
    """
    df = pd.read_csv(src, header=None)
    matches = df.index[df[0] == key]
    # Fall back to row 0 on a miss, mirroring the old scan whose rowIdx stayed 0.
    row_idx = matches[0] if len(matches) > 0 else 0
    # .loc instead of chained df[1][rowIdx] assignment: chained indexing is
    # deprecated and silently ineffective under pandas copy-on-write.
    df.loc[row_idx, 1] = str(value)
    df.to_csv(dest, index=False, header=False)


def readConfig(src, key):
    """Return the value (column 1) stored for *key* in the headerless CSV *src*.

    Falls back to row 0 when the key is absent, matching editConfig.
    """
    df = pd.read_csv(src, header=None)
    matches = df.index[df[0] == key]
    row_idx = matches[0] if len(matches) > 0 else 0
    return df.loc[row_idx, 1]
def draw2yLine(NAME, Com, R1, R2, l1, l2, m1, m2, fname):
    """Plot R1 and R2 against Com on twin y-axes and save the figure to fname.pdf.

    NAME    x-axis label
    Com     shared x values
    R1, R2  series drawn on the left / right y-axis
    l1, l2  legend labels for the two series
    m1, m2  y-axis labels for the left / right axis
    fname   output path without the ".pdf" suffix
    """
    fig, ax1 = plt.subplots(figsize=(10, 6.4))
    lines = [None] * 2
    print(Com)
    print(R1)
    lines[0], = ax1.plot(Com, R1, color=LINE_COLORS[0],
                         linewidth=LINE_WIDTH, marker=MARKERS[0],
                         markersize=MARKER_SIZE)
    ax1.set_ylabel(m1, fontproperties=LABEL_FP)
    ax1.set_xlabel(NAME, fontproperties=LABEL_FP)
    plt.xticks(rotation=0, size=TICK_FONT_SIZE)
    plt.yticks(rotation=0, size=TICK_FONT_SIZE)
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    ax2 = ax1.twinx()
    lines[1], = ax2.plot(Com, R2, color=LINE_COLORS[1],
                         linewidth=LINE_WIDTH, marker=MARKERS[1],
                         markersize=MARKER_SIZE)
    ax2.set_ylabel(m2, fontproperties=LABEL_FP)
    ax1.yaxis.set_major_locator(LinearLocator(5))
    ax2.yaxis.set_major_locator(LinearLocator(5))
    # The original set these formatters twice in a row; once is enough.
    ax1.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1f'))
    ax2.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1f'))
    plt.legend(lines,
               [l1, l2],
               prop=LEGEND_FP,
               loc='upper center',
               ncol=1,
               bbox_to_anchor=(0.55, 1.3), shadow=False,
               columnspacing=0.1,
               frameon=True, borderaxespad=-1.5, handlelength=1.2,
               handletextpad=0.1,
               labelspacing=0.1)
    plt.yticks(rotation=0, size=TICK_FONT_SIZE)
    plt.tight_layout()
    plt.savefig(fname + ".pdf")
plt.tight_layout() + + plt.savefig(fname + ".pdf") diff --git a/benchmark/scripts/breakdownHNSW/accuBar.py b/benchmark/scripts/breakdownHNSW/accuBar.py new file mode 100755 index 000000000..a18cb34c3 --- /dev/null +++ b/benchmark/scripts/breakdownHNSW/accuBar.py @@ -0,0 +1,335 @@ +import getopt +import os +import sys + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pylab +from matplotlib.font_manager import FontProperties +from matplotlib.ticker import LinearLocator, LogLocator, MaxNLocator, ScalarFormatter +from numpy import double + +OPT_FONT_NAME = 'Helvetica' +TICK_FONT_SIZE = 24 +LABEL_FONT_SIZE = 24 +LEGEND_FONT_SIZE = 24 +TITLE_FRONT_SIZE = 24 +LABEL_FP = FontProperties(style='normal', size=LABEL_FONT_SIZE) +LEGEND_FP = FontProperties(style='normal', size=LEGEND_FONT_SIZE) +TICK_FP = FontProperties(style='normal', size=TICK_FONT_SIZE) +TITLE_FP = FontProperties(style='normal', size=TITLE_FRONT_SIZE) +MARKERS = (['o', 's', 'v', "^", "h", "v", ">", "x", "d", "<", "|", "", "|", "_"]) +# you may want to change the color map for different figures +COLOR_MAP = ('#B03A2E', '#2874A6', '#239B56', '#7D3C98', '#F1C40F', '#F5CBA7', '#82E0AA', '#AEB6BF', '#AA4499') +# you may want to change the patterns for different figures +PATTERNS = (["\\", "///", "o", "||", "\\\\", "\\\\", "//////", "//////", ".", "\\\\\\", "\\\\\\"]) +LABEL_WEIGHT = 'bold' +LINE_COLORS = COLOR_MAP +LINE_WIDTH = 3.0 +MARKER_SIZE = 0.0 +MARKER_FREQUENCY = 1000 + +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['xtick.labelsize'] = TICK_FONT_SIZE +matplotlib.rcParams['ytick.labelsize'] = TICK_FONT_SIZE +matplotlib.rcParams['font.family'] = OPT_FONT_NAME +matplotlib.rcParams['pdf.fonttype'] = 42 + +exp_dir = "/data1/xtra" + +FIGURE_FOLDER = exp_dir + '/results/figure' + + +# there are some embedding problems if directly exporting the pdf figure using matplotlib. 
# there are some embedding problems if directly exporting the pdf figure using matplotlib.
# so we generate the eps format first and convert it to pdf.
def ConvertEpsToPdf(dir_filename):
    """Convert ``dir_filename``.eps to .pdf with epstopdf, then delete the .eps.

    Uses subprocess with an argument list so paths containing spaces or
    shell metacharacters are passed safely (the old os.system string was
    split by the shell).
    """
    import subprocess  # local import: keeps the module's import block untouched
    subprocess.run(["epstopdf", "--outfile", dir_filename + ".pdf",
                    dir_filename + ".eps"], check=False)
    # Remove the intermediate .eps; ignore it if already gone.
    try:
        os.remove(dir_filename + ".eps")
    except OSError:
        pass


class ScalarFormatterForceFormat(ScalarFormatter):
    """ScalarFormatter pinned to one decimal place on tick labels."""

    def _set_format(self):  # Override function that finds format to use.
        self.format = "%1.1f"  # Give format here
def DrawFigure(x_values, y_values, legend_labels, x_label, y_label, filename, allow_legend, title):
    """Draw a stacked bar chart and save it to ``filename``.pdf.

    x_values      category labels on the x axis
    y_values      list of series; series i is stacked on top of series 0..i-1
    legend_labels one label per series
    allow_legend  when True, draw a reversed legend above the axes
    title         figure title
    """
    # you may change the figure size on your own.
    fig = plt.figure(figsize=(20, 6))
    figure = fig.add_subplot(111)

    FIGURE_LABEL = legend_labels
    LINE_COLORS = [
        '#FF8C00', '#FFE4C4', '#00FFFF', '#E0FFFF',
        '#FF6347', '#98FB98', '#800080', '#FFD700',
        '#7CFC00', '#8A2BE2', '#FF4500', '#20B2AA',
        '#B0E0E6', '#DC143C', '#00FF7F'
    ]
    HATCH_PATTERNS = ['/', '-', 'o', '///', '\\', '|', 'x', '\\\\', '+', '.', '*', 'oo', '++++', '....', 'xxx']

    # values on the x axis
    index = np.arange(len(x_values))
    # the bar width; you may need to tune it to get the best figure.
    width = 0.5
    # draw the stacked bars
    bottom_base = np.zeros(len(y_values[0]))
    bars = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        bars[i] = plt.bar(index * 1.5 + width / 2, y_values[i], width,
                          hatch=HATCH_PATTERNS[i], color=LINE_COLORS[i],
                          label=FIGURE_LABEL[i], bottom=bottom_base,
                          edgecolor='black', linewidth=3)
        bottom_base = np.array(y_values[i]) + bottom_base

    if allow_legend:
        # Single legend call: the original built a throwaway legend first and
        # then immediately replaced it with this reversed-handles one.
        handles, labels = figure.get_legend_handles_labels()
        leg = plt.legend(handles[::-1], labels[::-1],
                         loc='center',
                         prop=LEGEND_FP,
                         ncol=6,
                         bbox_to_anchor=(0.5, 1.15),
                         shadow=True, frameon=True, edgecolor='black',
                         handletextpad=0.1,
                         labelspacing=-1.0,
                         columnspacing=0.5,
                         )
        leg.get_frame().set_linewidth(2)
        leg.get_frame().set_edgecolor("black")

    # you may need to tune the xticks position to get the best figure.
    plt.xticks(index * 1.5 + 0.6 * width, x_values)
    plt.xticks(rotation=30, fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)
    plt.ylim(0, 100)
    plt.grid(axis='y', color='gray')
    figure.yaxis.set_major_locator(LinearLocator(10))
    figure.get_xaxis().set_tick_params(direction='in', pad=10)
    figure.get_yaxis().set_tick_params(direction='in', pad=10)
    plt.grid(axis='y', color='gray', alpha=0.5, linewidth=0.5)
    plt.xlabel(x_label, fontproperties=LABEL_FP)
    plt.ylabel(y_label, fontproperties=LABEL_FP)
    plt.title(title, fontproperties=TITLE_FP)
    plt.savefig(filename + ".pdf", bbox_inches='tight', format='pdf')
def normalize(y_values):
    """Scale each series so that the stacked columns sum to 100 (percentages)."""
    totals = np.zeros(len(y_values[0]))
    for series in y_values:
        totals = totals + np.array(series)
    # Each entry becomes its share of the column total, in percent.
    return [np.array(series) / totals * 100 for series in y_values]
if __name__ == "__main__":
    id = 119  # default experiment id
    try:
        opts, args = getopt.getopt(sys.argv[1:], '-i:h', ['test id', 'help'])
    except getopt.GetoptError:
        print('breakdown.py -id testid')
        sys.exit(2)
    for opt, opt_value in opts:
        if opt in ('-h', '--help'):
            print("[*] Help info")
            exit()
        elif opt == '-i':
            print('Test ID:', opt_value)
            id = int(opt_value)

    x_values = ['10%', '20%', '30%', '40%', '50%']  # sorting step size

    y_values, max_value = ReadFile(id)  # 55

    # break into 4 parts
    legend_labels = ['sort', 'merge', 'join']  # , 'others'

    # BUG FIX: DrawFigure takes a mandatory ``title`` argument; the original
    # call omitted it and raised TypeError at runtime. Pass an empty title.
    DrawFigure(x_values, y_values, legend_labels,
               'sorting step size', 'cycles per input tuple',
               'breakdown_sort_figure', True, '')

    # DrawLegend(legend_labels, 'breakdown_radix_legend')
def paraseValidStageNames(a):
    """Return the 'name' of every row in CSV *a* whose 'cpu' field is not 'NA'.

    The first row of the file is the header; the 'cpu' and 'name' columns
    are located by header text, defaulting to column 0 when absent (the
    original scan's behaviour on a miss).
    """
    with open(a, 'r') as f:
        rows = list(csv.reader(f))
    header = rows[0]
    idxCpu = header.index('cpu') if 'cpu' in header else 0
    idxName = header.index('name') if 'name' in header else 0
    # Keep only stages that were actually scheduled on a cpu.
    return [row[idxName] for row in rows[1:] if row[idxCpu] != 'NA']
def maxInList(a):
    """Column-wise maximum over a list of equal-length rows.

    Returns (maxima, index): for each column i, the largest a[k][i] and the
    row index k it came from.

    BUG FIX: the original seeded the running maximum with 0, which returned
    0 (and row 0) for any all-negative column; seed with row 0 instead.
    """
    maxima = []
    index = []
    for col in range(len(a[0])):
        best_val = a[0][col]
        best_row = 0
        for row in range(1, len(a)):
            if a[row][col] > best_val:
                best_val = a[row][col]
                best_row = row
        maxima.append(best_val)
        index.append(best_row)
    return maxima, index
+candidateTimes,1,I64 +disableADC,0,I64 +isOnlinePQ,0,I64 +fineGrainedBuiltPath,OnlinePQIndex_fine.rbt,String +dataPath,datasets/DPR/DPR100KC4.fvecs,String +queryPath,datasets/DPR/DPR10KC4Q.fvecs,String +waitPendingWrite,1,I64 +is_NSW,0,I64 +isOnlinePQ,0,I64 +flannIndexTag,1,I64 diff --git a/benchmark/scripts/breakdownHNSW/deps.txt b/benchmark/scripts/breakdownHNSW/deps.txt new file mode 100644 index 000000000..6fe0c21b9 --- /dev/null +++ b/benchmark/scripts/breakdownHNSW/deps.txt @@ -0,0 +1,6 @@ +transformers>=4.30.0 +setuptools==65.5.1 +torch +datasets==2.14.3 +numpy>=1.25.0 +nltk==3.8.1 diff --git a/benchmark/scripts/breakdownHNSW/drawSVI.py b/benchmark/scripts/breakdownHNSW/drawSVI.py new file mode 100755 index 000000000..6e2d8c00a --- /dev/null +++ b/benchmark/scripts/breakdownHNSW/drawSVI.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +import csv +import numpy as np +import matplotlib.pyplot as plt +import accuBar as accuBar +import groupBar as groupBar +import groupBar2 as groupBar2 +import groupLine as groupLine +from autoParase import * +import itertools as it +import os + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pylab +from matplotlib.font_manager import FontProperties +from matplotlib.ticker import LogLocator, LinearLocator +import os +import pandas as pd +import sys +from OoOCommon import * + +OPT_FONT_NAME = 'Helvetica' +TICK_FONT_SIZE = 22 +LABEL_FONT_SIZE = 28 +LEGEND_FONT_SIZE = 30 +LABEL_FP = FontProperties(style='normal', size=LABEL_FONT_SIZE) +LEGEND_FP = FontProperties(style='normal', size=LEGEND_FONT_SIZE) +TICK_FP = FontProperties(style='normal', size=TICK_FONT_SIZE) + +MARKERS = (['*', '|', 'v', "^", "", "h", "<", ">", "+", "d", "<", "|", "", "+", "_"]) +# you may want to change the color map for different figures +COLOR_MAP = ( + '#B03A2E', '#2874A6', '#239B56', '#7D3C98', '#FFFFFF', '#F1C40F', '#F5CBA7', '#82E0AA', '#AEB6BF', '#AA4499') +# you may want to change the patterns for different figures 
def readResultPeriod(period, resultPath):
    """Read the summary metrics of one watermark-period run.

    Returns (avgLat, lat95, thr, err) as read from the run's
    default_general.csv.
    """
    resultFname = resultPath + "/" + str(period) + "/default_general.csv"
    metrics = tuple(readConfig(resultFname, tag)
                    for tag in ("AvgLatency", "95%Latency", "Throughput", "AQPError"))
    return metrics
def compareMethod(exeSpace, commonPathBase, resultPaths, csvTemplates, periodVec, reRun=1):
    """Optionally (re)run every method and collect its 95% latency / error curves.

    Returns (lat95All, errAll, periodAll): one vector per method, all
    sampled over *periodVec*.
    """
    lat95All = []
    errAll = []
    periodAll = []
    for i, template in enumerate(csvTemplates):
        resultPath = commonPathBase + resultPaths[i]
        if reRun == 1:
            # Wipe stale results and rerun the whole period sweep.
            os.system("rm -rf " + resultPath)
            os.system("mkdir " + resultPath)
            runPeriodVector(exeSpace, periodVec, resultPath, template)
        _, lat95Vec, _, errVec, _ = readResultVectorPeriod(periodVec, resultPath)
        lat95All.append(lat95Vec)
        errAll.append(errVec)
        periodAll.append(periodVec)
    return lat95All, errAll, periodAll
+ "svie2ESmall", True) + + +if __name__ == "__main__": + main() diff --git a/benchmark/scripts/breakdownHNSW/drawTogether.py b/benchmark/scripts/breakdownHNSW/drawTogether.py new file mode 100644 index 000000000..4abcd8ba7 --- /dev/null +++ b/benchmark/scripts/breakdownHNSW/drawTogether.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +# Note: the concept drift is not learnt by indexing in this group +import csv +import numpy as np +import matplotlib.pyplot as plt +import accuBar as accuBar +import groupBar2 as groupBar2 +import groupLine as groupLine +from autoParase import * +import itertools as it +import os + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pylab +from matplotlib.font_manager import FontProperties +from matplotlib import ticker +from matplotlib.ticker import LogLocator, LinearLocator + +import os +import pandas as pd +import sys +from OoOCommon import * + +OPT_FONT_NAME = 'Helvetica' +TICK_FONT_SIZE = 22 +LABEL_FONT_SIZE = 22 +LEGEND_FONT_SIZE = 22 +LABEL_FP = FontProperties(style='normal', size=LABEL_FONT_SIZE) +LEGEND_FP = FontProperties(style='normal', size=LEGEND_FONT_SIZE) +TICK_FP = FontProperties(style='normal', size=TICK_FONT_SIZE) + +MARKERS = (['*', '|', 'v', "^", "", "h", "<", ">", "+", "d", "<", "|", "", "+", "_"]) +# you may want to change the color map for different figures +COLOR_MAP = ( + '#B03A2E', '#2874A6', '#239B56', '#7D3C98', '#FFFFFF', '#F1C40F', '#F5CBA7', '#82E0AA', '#AEB6BF', '#AA4499') +# you may want to change the patterns for different figures +PATTERNS = (["////", "o", "", "||", "-", "//", "\\", "o", "O", "////", ".", "|||", "o", "---", "+", "\\\\", "*"]) +LABEL_WEIGHT = 'bold' +LINE_COLORS = COLOR_MAP +LINE_WIDTH = 3.0 +MARKER_SIZE = 15.0 +MARKER_FREQUENCY = 1000 + +matplotlib.rcParams['ps.useafm'] = True +matplotlib.rcParams['pdf.use14corefonts'] = True +matplotlib.rcParams['xtick.labelsize'] = TICK_FONT_SIZE +matplotlib.rcParams['ytick.labelsize'] = TICK_FONT_SIZE 
def runPeriod(exePath, algoTag, resultPath, configTemplate="config.csv", prefixTagRaw="null"):
    """Configure and run one onlineInsert experiment for *algoTag*.

    Builds exePath/temp1.csv from the static lazy template, applies the
    per-algorithm overrides, runs the benchmark (or copies a canned result
    for 'nnDescent2'), and archives all produced CSVs under
    resultPath/<prefixTag>.

    Note: *configTemplate* is accepted for signature compatibility but the
    static lazy template is always used, as in the original.
    """
    prefixTag = str(prefixTagRaw)
    configTemplate = "config_e2e_static_lazy.csv"
    # clear old files and stage the perf counter list
    os.system("cd " + exePath + "&& sudo rm *.csv")
    os.system("cp perfListEvaluation.csv " + exePath)
    dataPathCommon = exePath + "/results/scanIPConceptDriftHotSpot/"
    desiredDataFname = dataPathCommon + "driftData/" + "data_" + str(prefixTag) + '.fvecs'
    desiredQueryFname = dataPathCommon + "driftData/" + "query_" + str(prefixTag) + '.fvecs'
    # Base edits: point the template at this run's data/query and index tag.
    editConfig(configTemplate, "temp3.csv", "dataPath", desiredDataFname)
    editConfig("temp3.csv", "temp2.csv", "queryPath", desiredQueryFname)
    editConfig("temp2.csv", exePath + "temp1.csv", "faissIndexTag", algoTag)

    # Per-algorithm config overrides, applied in order on top of temp1.csv.
    # (Replaces the original's error-prone temp2/temp3/temp4 file juggling
    # with one scratch file; the final temp1.csv content is identical.)
    extraEdits = {
        'LSH': [("numberOfBuckets", 1), ("useCRS", 0),
                ("congestionDropWorker_algoTag", "onlineIVFLSH"), ("encodeLen", 3)],
        'LSH-H': [("congestionDropWorker_algoTag", "onlineIVFLSH"),
                  ("useCRS", 0), ("encodeLen", 3)],
        'flatAMMIP': [("congestionDropWorker_algoTag", "flatAMMIP"), ("sketchSize", 256)],
        'flatAMMIPSMPPCA': [("congestionDropWorker_algoTag", "flatAMMIP"),
                            ("sketchSize", 128), ("ammAlgo", 'smp-pca')],
        'flat': [("congestionDropWorker_algoTag", "flat"), ("sketchSize", 256)],
        'NSW': [("congestionDropWorker_algoTag", "NSW"), ("is_NSW", 1)],
        'nnDescent': [("congestionDropWorker_algoTag", "nnDescent"), ("frozenLevel", 1)],
        'onlinePQ': [("faissIndexTag", "PQ"), ("isOnlinePQ", 1), ("sketchSize", 256)],
        'Flann': [("congestionDropWorker_algoTag", "Flann"), ("sketchSize", 256)],
        'DPG': [("congestionDropWorker_algoTag", "DPG"), ("frozenLevel", 1)],
    }
    for key, value in extraEdits.get(algoTag, []):
        editConfig(exePath + "temp1.csv", exePath + "tempEdit.csv", key, value)
        os.replace(exePath + "tempEdit.csv", exePath + "temp1.csv")

    exeTag = "onlineInsert"
    # stage prebuilt index files
    os.system("rm -rf " + exePath + "*.rbt")
    os.system("cp *.rbt " + exePath)
    if algoTag == 'nnDescent2':
        # no runnable binary for this tag; use the canned result instead
        os.system("cp dummy.csv " + exePath + "onlineInsert_result.csv")
    else:
        os.system("cd " + exePath + "&& export OMP_NUM_THREADS=1 &&" + "sudo ./" + exeTag + " " + 'temp1.csv')
    # copy result
    os.system("sudo rm -rf " + resultPath + "/" + str(prefixTag))
    os.system("sudo mkdir " + resultPath + "/" + str(prefixTag))

    os.system("cd " + exePath + "&& sudo cp *.csv " + resultPath + "/" + str(prefixTag))
def readResultSingle(singleValue, resultPath):
    """Read one run's metrics from its onlineInsert_result.csv.

    Returns (elapsedTime, incrementalBuild, incrementalSearch, recall,
    pendingWaitTime, l2Stall, l3Stall, totalStall, froErr); the stall and
    Frobenius-error slots are fixed at 0 — not measured in this experiment.
    """
    resultFname = resultPath + "/" + str(singleValue) + "/onlineInsert_result.csv"
    # 'latencyOfQuery' feeds both elapsedTime and incrementalSearch; the
    # original read the same key twice — read it once.
    queryLatency = readConfig(resultFname, "latencyOfQuery")
    elapsedTime = queryLatency
    incrementalBuild = readConfig(resultFname, "95%latency(Insert)")
    incrementalSearch = queryLatency
    recall = readConfig(resultFname, "recall")
    pendingWaitTime = readConfig(resultFname, "pendingWrite")
    l2Stall = 0
    l3Stall = 0
    totalStall = 0
    froErr = 0
    return elapsedTime, incrementalBuild, incrementalSearch, recall, pendingWaitTime, l2Stall, l3Stall, totalStall, froErr
def checkResultSingle(singleValue, resultPath):
    """Return 1 when resultPath/<singleValue>/onlineInsert_result.csv exists, else 0."""
    resultFname = resultPath + "/" + str(singleValue) + "/onlineInsert_result.csv"
    if not os.path.exists(resultFname):
        print("File does not exist:" + resultFname)
        return 0
    return 1


def checkResultVector(singleValueVec, resultPath):
    """Return 1 only when every value in *singleValueVec* has a result file.

    An empty vector counts as complete (returns 1), as before.
    """
    for value in singleValueVec:
        if checkResultSingle(value, resultPath) == 0:
            return 0
    return 1
elapsedTimeAll.append(elapsedTime) + incrementalBuildAll.append(incrementalBuild) + incrementalSearchAll.append(incrementalSearch) + periodAll.append(dataSetName) + recallAll.append(recall) + pendingWaitTimeAll.append(pendingWaitTime) + l2StallAll.append(l2Stall) + l3StallAll.append(l3Stall) + totalStallAll.append(totalStall) + froAll.append(froVec) + algoCnt = algoCnt + 1 + print(algoCnt) + # periodAll.append(periodVec) + return np.array(elapsedTimeAll), np.array(incrementalBuildAll), np.array(periodAll), np.array(recallAll), np.array( + incrementalSearchAll), np.array(pendingWaitTimeAll), np.array(l2StallAll), np.array(l3StallAll), np.array( + totalStallAll), np.array(froAll) + + +def getCyclesPerMethod(cyclesAll, valueChose): + recallPerMethod = [] + for i in range(len(cyclesAll)): + recallPerMethod.append(cyclesAll[int(i)][int(valueChose)]) + return np.array(recallPerMethod) + + +def main(): + exeSpace = os.path.abspath(os.path.join(os.getcwd(), "../..")) + "/" + commonBasePath = os.path.abspath(os.path.join(os.getcwd(), "../..")) + "/results/breakdownHNSW/" + + figPath = os.path.abspath(os.path.join(os.getcwd(), "../..")) + "/figures/breakdownHNSW" + + # add the datasets here + # srcAVec=["datasets/AST/mcfe.mtx"] # 765*756 + # srcBVec=["datasets/AST/mcfe.mtx"] # 765*756 + # dataSetNames=['AST'] + # srcAVec=['datasets/UTM/utm1700a.mtx'] # 1700*1700 + # srcBVec=['datasets/UTM/utm1700b.mtx'] # 1700*1700 + # dataSetNames=['UTM'] + # srcAVec=['datasets/ECO/wm2.mtx',"datasets/DWAVE/dwa512.mtx","datasets/AST/mcfe.mtx",'datasets/UTM/utm1700a.mtx','datasets/RDB/rdb2048.mtx','datasets/ZENIOS/zenios.mtx','datasets/QCD/qcda_small.mtx',"datasets/BUS/gemat1.mtx",] + # srcBVec=['datasets/ECO/wm3.mtx',"datasets/DWAVE/dwb512.mtx","datasets/AST/mcfe.mtx",'datasets/UTM/utm1700b.mtx','datasets/RDB/rdb2048l.mtx','datasets/ZENIOS/zenios.mtx','datasets/QCD/qcdb_small.mtx',"datasets/BUS/gemat1.mtx",] + # dataSetNames=['ECO','DWAVE','AST','UTM','RDB','ZENIOS','QCD','BUS'] + # 
srcAVec=['datasets/ECO/wm2.mtx',"datasets/DWAVE/dwa512.mtx","datasets/AST/mcfe.mtx",'datasets/UTM/utm1700a.mtx','datasets/RDB/rdb2048.mtx','datasets/ZENIOS/zenios.mtx','datasets/QCD/qcda_small.mtx',"datasets/BUS/gemat1.mtx",] + # srcBVec=['datasets/ECO/wm3.mtx',"datasets/DWAVE/dwb512.mtx","datasets/AST/mcfe.mtx",'datasets/UTM/utm1700b.mtx','datasets/RDB/rdb2048l.mtx','datasets/ZENIOS/zenios.mtx','datasets/QCD/qcdb_small.mtx',"datasets/BUS/gemat1.mtx",] + # aRowVec= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0] + aRowVec = ["DPR",0.05, 0.1, 0.2, 0.4, 0.6, 0.8] + # exit() + # aRowVec=[100, 200] + # add the algo tag here + # algosVec = ['flat', 'LSH-H','flatAMMIP','flatAMMIPSMPPCA','PQ','IVFPQ','HNSW'] + algosVec = ['HNSWbd'] + # algosVec = ['flat', 'LSH-H'] + # algosVec = ['flat', 'onlinePQ'] + # algosVec=['incrementalRaw'] + # algosVec=[ 'pq'] + # algoDisp = ['BrutalForce', 'PQ'] + algoDisp = ['HNSWbd'] + # algoDisp=['BrutalForce','LSH-H'] + # algoDisp=['PQ'] + # add the algo tag here + + # this template configs all algos as lazy mode, all datasets are static and normalized + csvTemplate = 'config_e2e_static_lazy.csv' + # do not change the following + resultPaths = algosVec + os.system("mkdir ../../results") + os.system("mkdir ../../figures") + os.system("mkdir " + figPath) + + # run + reRun = 0 + if (len(sys.argv) < 2): + + os.system("sudo rm -rf " + commonBasePath) + + reRun = 1 + else: + reRun = int(sys.argv[1]) + os.system("sudo mkdir " + commonBasePath) + #prepareEmbeddings(commonBasePath, aRowVec) + # exit(0) + print(reRun) + methodTags = algoDisp + elapsedTimeAll, incrementalBuildAll, periodAll, recall, incrementalSearchAll, pendingWaitTimeAll, l2StallAll, l3StallAll, totalStallAll, froAll = compareMethod( + exeSpace, commonBasePath, resultPaths, csvTemplate, algosVec, aRowVec, reRun) + # Add some pre-process logic for int8 here if it is used + + # groupBar2.DrawFigureYLog(aRowVec, recall/recall[-1], methodTags, "Datasets", "Ins (times of LTMM)", 5, 15, figPath + 
"/" + "recall", True) + # groupBar2.DrawFigureYLog(aRowVec, fpInsAll/fpInsAll[-1], methodTags, "Datasets", "FP Ins (times of LTMM)", 5, 15, figPath + "/" + "FP_recall", True) + # groupBar2.DrawFigureYLog(aRowVec, memInsAll/memInsAll[-1], methodTags, "Datasets", "Mem Ins (times of LTMM)", 5, 15, figPath + "/" + "mem_recall", True) + # groupBar2.DrawFigure(aRowVec, ratioFpIns, methodTags, "Datasets", "SIMD Utilization (%)", 5, 15, figPath + "/" + "SIMD utilization", True) + # groupBar2.DrawFigure(aRowVec, recall/(memLoadAll+memStoreAll), methodTags, "Datasets", "IPM", 5, 15, figPath + "/" + "IPM", True) + # groupBar2.DrawFigure(aRowVec, fpInsAll/(memLoadAll+memStoreAll), methodTags, "Datasets", "FP Ins per Unit Mem Access", 5, 15, figPath + "/" + "FPIPM", True) + # groupBar2.DrawFigure(aRowVec, (memLoadAll+memStoreAll)/(recall)*100.0, methodTags, "Datasets", "Ratio of Mem Ins (%)", 5, 15, figPath + "/" + "mem", True) + + # groupBar2.DrawFigure(aRowVec, branchAll/recall*100.0, methodTags, "Datasets", "Ratio of Branch Ins (%)", 5, 15, figPath + "/" + "branches", True) + # groupBar2.DrawFigure(aRowVec, otherIns/recall*100.0, methodTags, "Datasets", "Ratio of Other Ins (%)", 5, 15, figPath + "/" + "others", True) + # print(recall[-1],recall[2]) + + # groupBar2.DrawFigure(dataSetNames, np.log(thrAll), methodTags, "Datasets", "elements/ms", 5, 15, figPath + "sec4_1_e2e_static_lazy_throughput_log", True) + #groupLine.DrawFigureYLog(periodAll, incrementalBuildAll / 1000, + # methodTags, + # "Drifted Pos", r'95% Latency of insert (ms)', 0, 1, + # figPath + "/" + "scanIPConceptDriftHotSpot_lat_INSERT", + # True) + #groupLine.DrawFigureYLog(periodAll, pendingWaitTimeAll / 1000, + # methodTags, + # "Drifted Pos", r'Pending wait for insert (ms)', 0, 1, + # figPath + "/" + "scanIPConceptDriftHotSpot_lat_pending", + # True) + #groupLine.DrawFigureYLog(periodAll, incrementalSearchAll / 1000, + # methodTags, + # "Drifted Pos", r'Latency of search (ms)', 0, 1, + # figPath + "/" + 
"scanIPConceptDriftHotSpot_lat_search", + # True) + #groupLine.DrawFigureYLog(periodAll, (incrementalSearchAll + pendingWaitTimeAll) / 1000, + # methodTags, + # "Drifted Pos", r'Latency of query (ms)', 0, 1, + # figPath + "/" + "scanIPConceptDriftHotSpot_lat_instant", + # True) + #groupLine.DrawFigureYnormal(periodAll, recall, + # methodTags, + # "Prob. of contamination", r'recall@10', 0, 1, + # figPath + "/" + "scanIPConceptDriftHotSpot_recall", + # False) + breakdownVec = [] + greedy=[] + candidate=[] + link=[] + sumstep=[] + for i in range(len(algosVec)): + resultPath = commonBasePath + algosVec[i] + for j in range(len(aRowVec)): + bddirPath = resultPath + "/"+str(aRowVec[j]) + bdPath = bddirPath+"/"+"hnswbd.csv" + #df = pd.read_csv(bdPath, sep=',', header=None).iloc[9,[3,4,5]] + df = pd.read_csv(bdPath, sep=',', header=None).iloc[:,[3,4,5]] + sum = df.sum(axis=1) + result_df = pd.concat([df, sum], axis=1) + greedy.append(result_df.iloc[9,0]/result_df.iloc[9,3]*100.0) + candidate.append(result_df.iloc[9,1]/result_df.iloc[9,3]*100.0) + link.append(result_df.iloc[9,2]/result_df.iloc[9,3]*100.0) + sumstep.append(result_df.iloc[9,3]) + + + #accuBar.DrawFigure(methodTags, + # [memStallPerMethod, l1dStallPerMethod, l2StallPerMethod, l3StallPerMethod, + # otherPerMethod,nonStallPerMethod]/cpuCyclePerMethod*100.0, ['Mem Stall', 'L1D Stall', 'L2 Stall', 'L3 Stall', 'Other Stall', 'Not Stall'], '', + # 'Propotion (%)', figPath + "/" + "cyclesbreakDown" + # + "_cycles_accubar" + str(valueVec[valueChose]), allowLegend, + # '') + #groupBar2.DrawFigure(dataSetNames,l1dStallAll/cpuCycleAll*100.0,methodTags, "Datasets", "Ratio of l1dStalls (%)", 5, 15, figPath + "l1dstall_ratio", True) + accuBar.DrawFigure(aRowVec, [greedy, candidate, link], ["Greedy","Candidate", "Link"], "",'Propotion (%)',figPath + "/" + "cyclesbreakDown"+ "_cycles_accubar", True, '') + + + + + +if __name__ == "__main__": + main() + diff --git a/benchmark/scripts/breakdownHNSW/dummy.csv 
"""Grouped-bar plotting helpers (groupBar2.py)."""
import itertools as it
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pylab
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import LogLocator, LinearLocator
import matplotlib.ticker as mtick

OPT_FONT_NAME = 'Helvetica'
TICK_FONT_SIZE = 32
LABEL_FONT_SIZE = 30
LEGEND_FONT_SIZE = 32
LABEL_FP = FontProperties(style='normal', size=LABEL_FONT_SIZE)
LEGEND_FP = FontProperties(style='normal', size=LEGEND_FONT_SIZE)
TICK_FP = FontProperties(style='normal', size=TICK_FONT_SIZE)

MARKERS = (["+", 'o', 's', 'v', "^", "", "h", "<", ">", "+", "d", "<", "|", "", "+", "_"])
# you may want to change the color map for different figures
COLOR_MAP = (
    '#AA4499', '#B03A2E', '#2874A6', '#239B56', '#7D3C98', '#00FFFF', '#F1C40F', '#F5CBA7', '#82E0AA', '#AEB6BF',
    '#AA4499')
# you may want to change the patterns for different figures
PATTERNS = (
    ["\\\\", "////", "\\\\", "//", "o", "", "||", "-", "//", "\\", "o", "O", "////", ".", "|||", "o", "---", "+",
     "\\\\",
     "*"])
LABEL_WEIGHT = 'bold'
LINE_COLORS = COLOR_MAP
LINE_WIDTH = 3.0
MARKER_SIZE = 15.0
MARKER_FREQUENCY = 1000

matplotlib.rcParams['ps.useafm'] = True
matplotlib.rcParams['pdf.use14corefonts'] = True
matplotlib.rcParams['xtick.labelsize'] = TICK_FONT_SIZE
matplotlib.rcParams['ytick.labelsize'] = TICK_FONT_SIZE
matplotlib.rcParams['font.family'] = OPT_FONT_NAME
matplotlib.rcParams['pdf.fonttype'] = 42

exp_dir = "/data1/xtra"

FIGURE_FOLDER = exp_dir + '/results/figure'


def DrawLegend(legend_labels, filename):
    """Render a stand-alone horizontal legend strip to FIGURE_FOLDER/<filename>.pdf."""
    fig = pylab.figure()
    ax1 = fig.add_subplot(111)
    FIGURE_LABEL = legend_labels
    LEGEND_FP = FontProperties(style='normal', size=26)
    figlegend = pylab.figure(figsize=(16, 0.5))
    bars = [None] * len(FIGURE_LABEL)
    data = [1]
    x_values = [1]

    width = 0.3
    # Draw one dummy bar per label so the legend picks up hatch/color.
    for i in range(len(FIGURE_LABEL)):
        bars[i] = ax1.bar(x_values, data, width,
                          hatch=PATTERNS[i],
                          color=LINE_COLORS[i],
                          label=FIGURE_LABEL[i],
                          edgecolor='black', linewidth=3)

    figlegend.legend(bars, FIGURE_LABEL, prop=LEGEND_FP,
                     loc=1, ncol=len(FIGURE_LABEL), mode="expand", shadow=True,
                     frameon=True, handlelength=2, handletextpad=0.3, columnspacing=0.5,
                     borderaxespad=-0.2, fancybox=True)
    figlegend.savefig(FIGURE_FOLDER + '/' + filename + '.pdf')


def DrawFigure(x_values, y_values, legend_labels, x_label, y_label, y_min, y_max, filename, allow_legend):
    """Grouped bar chart with a linear y axis, saved to <filename>.pdf.

    y_min/y_max are accepted for interface compatibility but not applied.
    """
    fig = plt.figure(figsize=(20, 6))
    figure = fig.add_subplot(111)

    LINE_COLORS = [
        '#FF8C00', '#FFE4C4', '#00FFFF', '#E0FFFF',
        '#FF6347', '#98FB98', '#800080', '#FFD700',
        '#7CFC00', '#8A2BE2', '#FF4500', '#20B2AA',
        '#B0E0E6', '#DC143C', '#00FF7F'
    ]
    HATCH_PATTERNS = ['/', '-', 'o', '///', '\\', '|', 'x', '\\\\', '+', '.', '*', 'oo', '++++', '....', 'xxx']

    FIGURE_LABEL = legend_labels
    index = np.arange(len(x_values))
    width = 0.5 / len(x_values)
    bars = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        bars[i] = plt.bar(index + i * width + width / 2,
                          y_values[i], width,
                          hatch=HATCH_PATTERNS[i % len(HATCH_PATTERNS)],
                          color=LINE_COLORS[i % len(LINE_COLORS)],
                          label=FIGURE_LABEL[i], edgecolor='black', linewidth=3)

    if allow_legend:
        plt.legend(bars, FIGURE_LABEL,
                   prop={'size': 28},
                   # BUG FIX: ncol must be an int; the original passed
                   # len(bars) / 2 (a float), which matplotlib rejects.
                   ncol=max(1, len(bars) // 2),
                   loc='upper center',
                   bbox_to_anchor=(0.5, 1.35),
                   shadow=True, frameon=True, edgecolor='black', borderaxespad=0, columnspacing=0.2,
                   handletextpad=0.2)

    plt.xticks(index + len(x_values) / 2 * width, x_values, rotation=0)
    figure.yaxis.set_major_locator(LinearLocator(5))
    figure.get_xaxis().set_tick_params(direction='in', pad=10)
    figure.get_yaxis().set_tick_params(direction='in', pad=10)
    figure.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1f'))
    plt.xlabel(x_label, fontsize=TICK_FONT_SIZE)
    plt.ylabel(y_label, fontsize=TICK_FONT_SIZE)
    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)

    fig.savefig(filename + ".pdf", bbox_inches='tight')


def DrawFigureYLog(x_values, y_values, legend_labels, x_label, y_label, y_min, y_max, filename, allow_legend):
    """Grouped bar chart with a logarithmic y axis, saved to <filename>.pdf."""
    fig = plt.figure(figsize=(20, 6))
    figure = fig.add_subplot(111)

    LINE_COLORS = [
        '#FF8C00', '#FFE4C4', '#00FFFF', '#E0FFFF',
        '#FF6347', '#98FB98', '#800080', '#FFD700',
        '#7CFC00', '#8A2BE2', '#FF4500', '#20B2AA',
        '#B0E0E6', '#DC143C', '#00FF7F'
    ]
    HATCH_PATTERNS = ['/', '-', 'o', '///', '\\', '|', 'x', '\\\\', '+', '.', '*', 'oo', '++++', '....', 'xxx']

    FIGURE_LABEL = legend_labels
    index = np.arange(len(x_values))
    width = 0.5 / len(x_values)
    bars = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        bars[i] = plt.bar(index + i * width + width / 2,
                          y_values[i], width,
                          hatch=HATCH_PATTERNS[i % len(HATCH_PATTERNS)],
                          color=LINE_COLORS[i % len(LINE_COLORS)],
                          label=FIGURE_LABEL[i], edgecolor='black', linewidth=3)

    if allow_legend:
        plt.legend(bars, FIGURE_LABEL,
                   prop={'size': 28},
                   ncol=2,
                   loc='upper center',
                   bbox_to_anchor=(-0.4, 0.75),
                   shadow=True, frameon=True, edgecolor='black', borderaxespad=0, columnspacing=0.2,
                   handletextpad=0.2)

    plt.xticks(index + len(x_values) / 2 * width, x_values, rotation=0)

    plt.yscale('log')
    figure.yaxis.set_major_locator(LogLocator(10))
    figure.get_xaxis().set_tick_params(direction='in', pad=10)
    figure.get_yaxis().set_tick_params(direction='in', pad=10)

    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)
    plt.grid(axis='y', color='gray', alpha=0.5, linewidth=0.5)
    plt.xlabel(x_label, fontsize=LABEL_FONT_SIZE)
    plt.ylabel(y_label, fontsize=LABEL_FONT_SIZE)

    fig.savefig(filename + ".pdf", bbox_inches='tight')


def DrawFigureYLog2(x_values, y_values, legend_labels, x_label, y_label, y_min, y_max, filename, allow_legend):
    """Log-y grouped bar chart with a reference line at y=1.0, saved to <filename>.pdf."""
    fig = plt.figure(figsize=(20, 6))
    figure = fig.add_subplot(111)

    LINE_COLORS = [
        '#FF8C00', '#FFE4C4', '#00FFFF', '#E0FFFF',
        '#FF6347', '#98FB98', '#800080', '#FFD700',
        '#7CFC00', '#8A2BE2', '#FF4500', '#20B2AA',
        '#B0E0E6', '#DC143C', '#00FF7F'
    ]
    HATCH_PATTERNS = ['/', '-', 'o', '///', '\\', '|', 'x', '\\\\', '+', '.', '*', 'oo', '++++', '....', 'xxx']

    FIGURE_LABEL = legend_labels
    index = np.arange(len(x_values))
    width = 0.5 / 3
    bars = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        bars[i] = plt.bar(index + i * width + width / 2,
                          y_values[i], width,
                          hatch=HATCH_PATTERNS[i % len(HATCH_PATTERNS)],
                          color=LINE_COLORS[i % len(LINE_COLORS)],
                          label=FIGURE_LABEL[i], edgecolor='black', linewidth=3)

    if allow_legend:
        plt.legend(bars, FIGURE_LABEL,
                   prop={'size': 28},
                   # BUG FIX: ncol must be an int (was len(bars) / 2, a float).
                   ncol=max(1, len(bars) // 2),
                   loc='upper center',
                   bbox_to_anchor=(0.5, 1.3),
                   shadow=True, frameon=True, edgecolor='black', borderaxespad=0, columnspacing=0.5,
                   handletextpad=0.1,
                   labelspacing=0.)

    plt.xticks(index + 0.75 * width, x_values, rotation=30)
    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)
    plt.xlabel(x_label, fontsize=24)
    plt.ylabel(y_label, fontsize=24)
    plt.axhline(y=1.0, color='red', linestyle='--')
    figure.text(1.8, 5.0, "Instructions=1.0", fontsize=TICK_FONT_SIZE, ha='center')
    plt.yscale('log')
    figure.yaxis.set_major_locator(LogLocator(10))
    figure.get_xaxis().set_tick_params(direction='in', pad=10)
    figure.get_yaxis().set_tick_params(direction='in', pad=10)

    plt.grid(axis='y', color='gray', alpha=0.5, linewidth=0.5)

    fig.savefig(filename + ".pdf", bbox_inches='tight')


def _latency_column(tag):
    """95th-percentile latency for run ids 38..41 of one algorithm *tag*.

    Closes each file deterministically (the original leaked the handles).
    """
    col = []
    for run_id in it.chain(range(38, 42)):
        path = exp_dir + '/results/latency/{}_{}.txt'.format(tag, run_id)
        with open(path, "r") as f:
            read = f.readlines()
        col.append(float(read.pop(int(len(read) * 0.95)).strip("\n")))
    return col


# example for reading csv file
def ReadFile():
    """Load 95th-percentile latency columns for the eight join algorithms.

    A zero column is inserted before each group as a fake empty bar that
    separates the lazy and eager algorithm families in the plot.
    """
    separator = [0] * 4  # one zero per run id (38..41)
    y = [separator]
    for tag in ('NPJ', 'PRJ', 'MWAY', 'MPASS'):
        y.append(_latency_column(tag))
    y.append(separator)
    for tag in ('SHJ_JM_NP', 'SHJ_JBCR_NP', 'PMJ_JM_NP', 'PMJ_JBCR_NP'):
        y.append(_latency_column(tag))
    return y


if __name__ == "__main__":
    x_values = ["Stock", "Rovio", "YSB", "DEBS"]

    y_values = ReadFile()

    legend_labels = ['Lazy:', 'NPJ', 'PRJ', 'MWAY', 'MPASS',
                     'Eager:', 'SHJ$^{JM}$', 'SHJ$^{JB}$', 'PMJ$^{JM}$', 'PMJ$^{JB}$']
    print(y_values)
    DrawFigure(x_values, y_values, legend_labels,
               '', 'Latency (ms)', 0,
               400, 'latency_figure_app', False)

    # DrawLegend(legend_labels, 'latency_legend')
"""Line-chart plotting helpers (groupLine.py)."""
import itertools as it
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pylab
import matplotlib.patches as patches
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import LinearLocator, LogLocator, MaxNLocator, ScalarFormatter
from numpy import double

OPT_FONT_NAME = 'Helvetica'
TICK_FONT_SIZE = 32
LABEL_FONT_SIZE = 30
LEGEND_FONT_SIZE = 32
LABEL_FP = FontProperties(style='normal', size=LABEL_FONT_SIZE)
LEGEND_FP = FontProperties(style='normal', size=LEGEND_FONT_SIZE)
TICK_FP = FontProperties(style='normal', size=TICK_FONT_SIZE)
MARKERS = ['s', 'o', '^', 'v', '+', '*', 'h', 'x', 'p', '1', '2', 'o', '+', '|']
COLOR_MAP = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22',
             '#17becf', '#1f77b4']
# you may want to change the patterns for different figures
PATTERNS = (["|", "\\", "/", "+", "-", ".", "*", "x", "o", "O", "////", ".", "|||", "o", "---", "+", "\\\\", "*"])
LABEL_WEIGHT = 'bold'
LINE_COLORS = COLOR_MAP
LINE_WIDTH = 3.0
MARKER_SIZE = 13.0
MARKER_FREQUENCY = 1000

matplotlib.rcParams['ps.useafm'] = True
matplotlib.rcParams['pdf.use14corefonts'] = True
matplotlib.rcParams['xtick.labelsize'] = TICK_FONT_SIZE
matplotlib.rcParams['ytick.labelsize'] = TICK_FONT_SIZE
matplotlib.rcParams['font.family'] = OPT_FONT_NAME

FIGURE_FOLDER = '/data1/xtra/results/figure'


# there are some embedding problems if directly exporting the pdf figure using matplotlib.
# so we generate the eps format first and convert it to pdf.
def ConvertEpsToPdf(dir_filename):
    """Convert <dir_filename>.eps to .pdf via epstopdf, then delete the .eps."""
    os.system("epstopdf --outfile " + dir_filename + ".pdf " + dir_filename + ".eps")
    os.system("rm -rf " + dir_filename + ".eps")


def DrawLegend(legend_labels, filename):
    """Render a stand-alone horizontal line-style legend strip to <filename>.pdf."""
    fig = pylab.figure()
    ax1 = fig.add_subplot(111)
    FIGURE_LABEL = legend_labels
    LINE_WIDTH = 8.0
    MARKER_SIZE = 12.0
    LEGEND_FP = FontProperties(style='normal', size=26)

    figlegend = pylab.figure(figsize=(12, 0.5))
    lines = [None] * len(FIGURE_LABEL)
    data = [1]
    x_values = [1]

    # One dummy line per label so the legend picks up color/marker.
    for idx in range(len(FIGURE_LABEL)):
        lines[idx], = ax1.plot(x_values, data,
                               color=LINE_COLORS[idx], linewidth=LINE_WIDTH,
                               marker=MARKERS[idx], markersize=MARKER_SIZE, label=str(idx))

    figlegend.legend(lines, FIGURE_LABEL, prop=LEGEND_FP,
                     loc=1, ncol=len(FIGURE_LABEL), mode="expand", shadow=False,
                     frameon=False, borderaxespad=0.0, handlelength=2)

    if not os.path.exists(FIGURE_FOLDER):
        os.makedirs(FIGURE_FOLDER)
    # no need to export eps in this case.
    figlegend.savefig(filename + '.pdf')


def DrawFigure2(xvalues, yvalues, legend_labels, x_label, y_label, y_min, y_max, filename, allow_legend):
    """Multi-series line chart saved to <filename>.pdf.

    xvalues/yvalues are parallel lists of per-series point lists.
    """
    fig = plt.figure(figsize=(10, 4))

    markers = ['s', 'o', '^', 'v', '+', '*', ',', 'x', 'p', '1', '2', 'o']
    linestyles = ['-.', '-.', 'dotted', 'dotted', 'dotted', 'dotted', 'dotted', ':', 'dashed', 'dotted', 'dotted', '-']
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22',
              '#17becf', '#1f77b4']
    linewidth = 2

    FIGURE_LABEL = legend_labels
    x_values = xvalues
    y_values = yvalues

    lines = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        lines[i], = plt.plot(x_values[i], y_values[i], color=colors[i],
                             linewidth=linewidth, marker=markers[i],
                             markersize=9, linestyle=linestyles[i],
                             label=FIGURE_LABEL[i])

    if allow_legend:
        plt.legend(lines,
                   FIGURE_LABEL,
                   fontsize=12,
                   loc='upper center',
                   ncol=3,
                   bbox_to_anchor=(0.5, 1.15),
                   borderaxespad=0.,
                   frameon=True)
    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)

    plt.ylim(y_min, y_max)
    plt.grid(axis='y', color='gray', alpha=0.5, linewidth=0.5)

    fig.savefig(filename + ".pdf", bbox_inches='tight')


def DrawFigureYnormal(xvalues, yvalues, legend_labels, x_label, y_label, y_min, y_max, filename, allow_legend):
    """Multi-series line chart with linear axes saved to <filename>.pdf.

    y_min/y_max are accepted for interface compatibility but not applied.
    """
    fig = plt.figure(figsize=(10, 6))
    figure = fig.add_subplot(111)
    LINE_COLORS = [
        '#FF8C00', '#FFE4C4', '#00FFFF', '#E0FFFF',
        '#FF6347', '#98FB98', '#800080', '#FFD700',
        '#7CFC00', '#8A2BE2', '#FF4500', '#20B2AA',
        '#B0E0E6', '#00000F', '#00FF7F'
    ]
    FIGURE_LABEL = legend_labels

    x_values = xvalues
    y_values = yvalues
    print(len(FIGURE_LABEL), len(x_values))
    lines = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        lines[i], = figure.plot(x_values[i], y_values[i], color=LINE_COLORS[i],
                                linewidth=LINE_WIDTH, marker=MARKERS[i],
                                markersize=MARKER_SIZE, label=FIGURE_LABEL[i], markeredgecolor='k')

    if allow_legend == True:
        plt.legend(lines,
                   FIGURE_LABEL,
                   prop=LEGEND_FP,
                   loc='upper center',
                   ncol=1,
                   bbox_to_anchor=(-0.3, 1.0), shadow=False,
                   columnspacing=0.1,
                   frameon=True, borderaxespad=0, handlelength=1.2,
                   handletextpad=0.1,
                   labelspacing=0.1)

    plt.grid(axis='y', color='gray')
    plt.xticks(fontsize=TICK_FONT_SIZE)
    figure.get_xaxis().set_tick_params(direction='in', pad=10)
    figure.get_yaxis().set_tick_params(direction='in', pad=10)
    plt.xlabel(x_label, fontproperties=LABEL_FP)
    plt.ylabel(y_label, fontproperties=LABEL_FP)
    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)
    plt.savefig(filename + ".pdf", bbox_inches='tight')


def DrawFigureYLog(xvalues, yvalues, legend_labels, x_label, y_label, y_min, y_max, filename, allow_legend):
    """Multi-series line chart with log-log axes saved to <filename>.pdf.

    y_min/y_max are accepted for interface compatibility but not applied.
    """
    fig = plt.figure(figsize=(20, 6))
    figure = fig.add_subplot(111)
    LINE_COLORS = [
        '#FF8C00', '#FFE4C4', '#00FFFF', '#E0FFFF',
        '#FF6347', '#98FB98', '#800080', '#FFD700',
        '#7CFC00', '#8A2BE2', '#FF4500', '#20B2AA',
        '#B0E0E6', '#00000F', '#00FF7F'
    ]
    FIGURE_LABEL = legend_labels

    x_values = xvalues
    y_values = yvalues
    print(len(FIGURE_LABEL), len(x_values))
    lines = [None] * len(FIGURE_LABEL)
    for i in range(len(y_values)):
        lines[i], = figure.plot(x_values[i], y_values[i], color=LINE_COLORS[i],
                                linewidth=LINE_WIDTH, marker=MARKERS[i],
                                markersize=MARKER_SIZE, label=FIGURE_LABEL[i], markeredgecolor='k')

    if allow_legend == True:
        plt.legend(lines,
                   FIGURE_LABEL,
                   prop=LEGEND_FP,
                   loc='upper center',
                   ncol=2,
                   bbox_to_anchor=(-0.285, 0.7), shadow=False,
                   columnspacing=0.1,
                   frameon=True, borderaxespad=0, handlelength=1.2,
                   handletextpad=0.1,
                   labelspacing=0.1)
    plt.xscale('log')

    plt.yscale('log')

    plt.grid(axis='y', color='gray')
    figure.yaxis.set_major_locator(LogLocator(base=10))
    figure.xaxis.set_major_locator(LogLocator(base=10))
    plt.xticks(fontsize=TICK_FONT_SIZE)
    figure.get_xaxis().set_tick_params(direction='in', pad=10)
    figure.get_yaxis().set_tick_params(direction='in', pad=10)
    plt.xlabel(x_label, fontproperties=LABEL_FP)
    plt.ylabel(y_label, fontproperties=LABEL_FP)
    plt.xticks(fontsize=TICK_FONT_SIZE)
    plt.yticks(fontsize=TICK_FONT_SIZE)

    plt.savefig(filename + ".pdf", bbox_inches='tight')


def _throughput_column(tag):
    """Throughput (#items/ms) for run ids 28..31 of one algorithm *tag*.

    Closes each file deterministically (the original leaked the handles).
    """
    col = []
    for run_id in it.chain(range(28, 32)):
        path = '/data1/xtra/results/timestamps/{}_{}.txt'.format(tag, run_id)
        with open(path, "r") as f:
            read = f.readlines()
        last = float(read.pop(len(read) - 1).strip("\n"))  # last timestamp
        col.append(len(read) / last)  # get throughput (#items/ms)
    return col


# example for reading csv file
def ReadFile():
    """Load throughput columns for the eight join algorithms."""
    tags = ('PRJ', 'NPJ', 'MPASS', 'MWAY',
            'SHJ_JM_NP', 'SHJ_JBCR_NP', 'PMJ_JM_NP', 'PMJ_JBCR_NP')
    return [_throughput_column(t) for t in tags]


if __name__ == "__main__":
    x_values = [1600, 3200, 6400, 12800, 25600]

    y_values = ReadFile()

    legend_labels = ['NPJ', 'PRJ', 'MWAY', 'MPASS', 'SHJ$^{JM}$', 'SHJ$^{JB}$', 'PMJ$^{JM}$',
                     'PMJ$^{JB}$']

    # BUG FIX: this module defines DrawFigure2, not DrawFigure — the original
    # call raised NameError.  DrawFigure2 has the matching 9-argument signature.
    # NOTE(review): DrawFigure2 expects per-series x lists; verify that the
    # flat x_values list matches what the caller intends to plot.
    DrawFigure2(x_values, y_values, legend_labels,
                'Input arrival rate of R (e/ms)', 'Tpt. (#matches/ms)', x_values[0],
                x_values[4], 'throughput_figure1_1', False)

# DrawLegend(legend_labels, 'factor_legend')
/dev/null +++ b/benchmark/scripts/breakdownHNSW/temp3.csv @@ -0,0 +1,44 @@ +key,value,type +vecDim,768,I64 +vecVolume,100000,I64 +batchSize,4000,I64 +metricType,IP,String +DCOBatchSize,5000,I64 +ammAlgo,crs,String +sketchSize,128,I64 +initialRows,50000,I64 +driftPosition,50000,I64 +driftOffset,0.1,Double +indexTag,congestionDrop,String +congestionDropWorker_algoTag,faiss,String +eventRateTps,4000,I64 +querySize,100,I64 +zipfAlpha,0,Double +coarseGrainedClusters,96,I64 +maskReference,0,Double +encodeLen,1,I64 +numberOfBuckets,8192,I64 +cutOffTimeSeconds,14400,I64 +useSeparateQuery,1,I64 +sampleRows,2048,I64 +faissIndexTag,flat,String +useCRS,1,I64 +crsDim,10,I64 +dataLoaderTag,fvecs,String +initialRows,50000,I64 +staticDataSet,0,I64 +maxBuildIteration,200,I64 +lshMatrixType,random,String +ANNK,10,I64 +frozenLevel,0,I64 +cudaBuild,1,I64 +candidateTimes,1,I64 +disableADC,0,I64 +isOnlinePQ,0,I64 +fineGrainedBuiltPath,OnlinePQIndex_fine.rbt,String +dataPath,/home/rag/projects/CANDY/build/benchmark//results/scanIPConceptDriftHotSpot/driftData/data_0.8.fvecs,String +queryPath,datasets/DPR/DPR10KC4Q.fvecs,String +waitPendingWrite,1,I64 +is_NSW,0,I64 +isOnlinePQ,0,I64 +flannIndexTag,1,I64 diff --git a/src/CANDY/FaissIndex.cpp b/src/CANDY/FaissIndex.cpp index 12546a87d..7436b7477 100644 --- a/src/CANDY/FaissIndex.cpp +++ b/src/CANDY/FaissIndex.cpp @@ -8,7 +8,7 @@ #include #include #include - +#include bool CANDY::FaissIndex::setConfig(INTELLI::ConfigMapPtr cfg) { AbstractIndex::setConfig(cfg); INTELLI_INFO("SETTING CONFIG FOR FaissIndex"); @@ -24,7 +24,11 @@ bool CANDY::FaissIndex::setConfig(INTELLI::ConfigMapPtr cfg) { INTELLI_INFO("ENCAPSULATED FAISS INDEX: USE HNSWFlat"); auto M = cfg->tryI64("maxConnection", 32, true); index = new faiss::IndexHNSWFlat(vecDim, M, faissMetric); - } else if (index_type == "PQ") { + } else if (index_type == "HNSWbd") { + INTELLI_INFO("ENCAPSULATED FAISS INDEX: USE HNSWFlat with breakdown enabled!"); + auto M = cfg->tryI64("maxConnection", 
32, true); + index = new faiss::IndexHNSWbdFlat(vecDim, M, faissMetric); +} else if (index_type == "PQ") { INTELLI_INFO("ENCAPSULATED FAISS INDEX: USE PQ"); // number of bits in PQ auto nbits = cfg->tryI64("encodeLenBits", bytes * 8, true); @@ -251,4 +255,4 @@ std::vector CANDY::FaissIndex::getTensorByIndex(std::vector + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { + +/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */ + +int sgemm_( + const char* transa, + const char* transb, + FINTEGER* m, + FINTEGER* n, + FINTEGER* k, + const float* alpha, + const float* a, + FINTEGER* lda, + const float* b, + FINTEGER* ldb, + float* beta, + float* c, + FINTEGER* ldc); +} + +namespace faiss { + +using MinimaxHeap = HNSWbd::MinimaxHeap; +using storage_idx_t = HNSWbd::storage_idx_t; +using NodeDistFarther = HNSWbd::NodeDistFarther; + + + +/************************************************************** + * add / search blocks of descriptors + **************************************************************/ + +namespace { + +/* Wrap the distance computer into one that negates the + distances. 
This makes supporting INNER_PRODUCE search easier */ + +struct NegativeDistanceComputer : DistanceComputer { + /// owned by this + DistanceComputer* basedis; + + explicit NegativeDistanceComputer(DistanceComputer* basedis) + : basedis(basedis) {} + + void set_query(const float* x) override { + basedis->set_query(x); + } + + /// compute distance of vector i to current query + float operator()(idx_t i) override { + return -(*basedis)(i); + } + + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) override { + basedis->distances_batch_4( + idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); + dis0 = -dis0; + dis1 = -dis1; + dis2 = -dis2; + dis3 = -dis3; + } + + /// compute distance between two stored vectors + float symmetric_dis(idx_t i, idx_t j) override { + return -basedis->symmetric_dis(i, j); + } + + virtual ~NegativeDistanceComputer() { + delete basedis; + } +}; + +DistanceComputer* storage_distance_computer(const Index* storage) { + if (is_similarity_metric(storage->metric_type)) { + return new NegativeDistanceComputer(storage->get_distance_computer()); + } else { + return storage->get_distance_computer(); + } +} + +void hnsw_add_vertices( + IndexHNSWbd& index_hnsw, + size_t n0, + size_t n, + const float* x, + bool verbose, + bool preset_levels = false) { + size_t d = index_hnsw.d; + HNSWbd& hnsw = index_hnsw.hnsw; + size_t ntotal = n0 + n; + double t0 = getmillisecs(); + if (verbose) { + //printf("hnsw_add_vertices: adding %zd elements on top of %zd " + // "(preset_levels=%d)\n", + // n, + // n0, + //int(preset_levels)); + } + + if (n == 0) { + return; + } + + int max_level = hnsw.prepare_level_tab(n, preset_levels); + + if (verbose) { + //printf(" max_level = %d\n", max_level); + } + + std::vector locks(ntotal); + for (int i = 0; i < ntotal; i++) + omp_init_lock(&locks[i]); + + // add vectors from highest to lowest level + std::vector hist; + 
std::vector order(n); + + { // make buckets with vectors of the same level + + // build histogram + for (int i = 0; i < n; i++) { + storage_idx_t pt_id = i + n0; + int pt_level = hnsw.levels[pt_id] - 1; + while (pt_level >= hist.size()) + hist.push_back(0); + hist[pt_level]++; + } + + // accumulate + std::vector offsets(hist.size() + 1, 0); + for (int i = 0; i < hist.size() - 1; i++) { + offsets[i + 1] = offsets[i] + hist[i]; + } + + // bucket sort + for (int i = 0; i < n; i++) { + storage_idx_t pt_id = i + n0; + int pt_level = hnsw.levels[pt_id] - 1; + order[offsets[pt_level]++] = pt_id; + } + } + + idx_t check_period = InterruptCallback::get_period_hint( + max_level * index_hnsw.d * hnsw.efConstruction); + + { // perform add + RandomGenerator rng2(789); + + int i1 = n; + + for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) { + int i0 = i1 - hist[pt_level]; + + if (verbose) { + // printf("Adding %d elements at level %d\n", i1 - i0, pt_level); + } + + // random permutation to get rid of dataset order bias + for (int j = i0; j < i1; j++) + std::swap(order[j], order[j + rng2.rand_int(i1 - j)]); + + bool interrupt = false; + +//#pragma omp parallel if (i1 > i0 + 100) + { + VisitedTable vt(ntotal); + + std::unique_ptr dis( + storage_distance_computer(index_hnsw.storage)); + int prev_display = + verbose && omp_get_thread_num() == 0 ? 0 : -1; + size_t counter = 0; + + // here we should do schedule(dynamic) but this segfaults for + // some versions of LLVM. 
The performance impact should not be + // too large when (i1 - i0) / num_threads >> 1 +//#pragma omp for schedule(static) + for (int i = i0; i < i1; i++) { + storage_idx_t pt_id = order[i]; + dis->set_query(x + (pt_id - n0) * d); + + // cannot break + if (interrupt) { + continue; + } + + hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt); + + // if (prev_display >= 0 && i - i0 > prev_display + 10000) { + // prev_display = i - i0; + // printf(" %d / %d\r", i - i0, i1 - i0); + // fflush(stdout); + //} + if (counter % check_period == 0) { + if (InterruptCallback::is_interrupted()) { + interrupt = true; + } + } + counter++; + } + } + if (interrupt) { + FAISS_THROW_MSG("computation interrupted"); + } + i1 = i0; + } + FAISS_ASSERT(i1 == 0); + } + if (verbose) { + // printf("Done in %.3f ms\n", getmillisecs() - t0); + } + + for (int i = 0; i < ntotal; i++) { + omp_destroy_lock(&locks[i]); + } +} + +} // namespace + +/************************************************************** + * IndexHNSW implementation + **************************************************************/ + +IndexHNSWbd::IndexHNSWbd(int d, int M, MetricType metric) + : Index(d, metric), hnsw(M) {} + +IndexHNSWbd::IndexHNSWbd(Index* storage, int M) + : Index(storage->d, storage->metric_type), hnsw(M), storage(storage) {} + +IndexHNSWbd::~IndexHNSWbd() { + if (own_fields) { + delete storage; + } +} + +void IndexHNSWbd::train(idx_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG( + storage, + "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + // hnsw structure does not require training + storage->train(n, x); + is_trained = true; +} + +void IndexHNSWbd::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params_in) const { + FAISS_THROW_IF_NOT(k > 0); + FAISS_THROW_IF_NOT_MSG( + storage, + "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + const SearchParametersHNSW* params = nullptr; + + int efSearch 
= hnsw.efSearch; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); + efSearch = params->efSearch; + } + size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; + + idx_t check_period = + InterruptCallback::get_period_hint(hnsw.max_level * d * efSearch); + hnsw.bd_stat.reset(); + for (idx_t i0 = 0; i0 < n; i0 += check_period) { + idx_t i1 = std::min(i0 + check_period, n); + +//#pragma omp parallel + { + VisitedTable vt(ntotal); + + std::unique_ptr dis( + storage_distance_computer(storage)); + +//#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) schedule(guided) + for (idx_t i = i0; i < i1; i++) { + idx_t* idxi = labels + i * k; + float* simi = distances + i * k; + dis->set_query(x + i * d); + + maxheap_heapify(k, simi, idxi); + HNSWStats stats = hnsw.search(*dis, k, idxi, simi, vt, params); + n1 += stats.n1; + n2 += stats.n2; + n3 += stats.n3; + ndis += stats.ndis; + nreorder += stats.nreorder; + maxheap_reorder(k, simi, idxi); + + if (reconstruct_from_neighbors && + reconstruct_from_neighbors->k_reorder != 0) { + int k_reorder = reconstruct_from_neighbors->k_reorder; + if (k_reorder == -1 || k_reorder > k) + k_reorder = k; + + nreorder += reconstruct_from_neighbors->compute_distances( + k_reorder, idxi, x + i * d, simi); + + // sort top k_reorder + maxheap_heapify( + k_reorder, simi, idxi, simi, idxi, k_reorder); + maxheap_reorder(k_reorder, simi, idxi); + } + } + } + InterruptCallback::check(); + } + + if (is_similarity_metric(metric_type)) { + // we need to revert the negated distances + for (size_t i = 0; i < k * n; i++) { + distances[i] = -distances[i]; + } + } + + hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); + //hnsw.bd_stat.print(); +} + +void IndexHNSWbd::add(idx_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG( + storage, + "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + FAISS_THROW_IF_NOT(is_trained); + int n0 = ntotal; + storage->add(n, x); + ntotal = 
storage->ntotal; + hnsw.bd_stat.reset(); + hnsw_add_vertices(*this, n0, n, x, verbose, hnsw.levels.size() == ntotal); + hnsw.bd_stat.print(); +} + +void IndexHNSWbd::reset() { + hnsw.reset(); + storage->reset(); + ntotal = 0; +} + +void IndexHNSWbd::reconstruct(idx_t key, float* recons) const { + storage->reconstruct(key, recons); +} + +void IndexHNSWbd::shrink_level_0_neighbors(int new_size) { +//#pragma omp parallel + { + std::unique_ptr dis( + storage_distance_computer(storage)); + +//#pragma omp for + for (idx_t i = 0; i < ntotal; i++) { + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + std::priority_queue initial_list; + + for (size_t j = begin; j < end; j++) { + int v1 = hnsw.neighbors[j]; + if (v1 < 0) + break; + initial_list.emplace(dis->symmetric_dis(i, v1), v1); + + // initial_list.emplace(qdis(v1), v1); + } + + std::vector shrunk_list; + HNSWbd::shrink_neighbor_list( + *dis, initial_list, shrunk_list, new_size,hnsw.bd_stat); + + for (size_t j = begin; j < end; j++) { + if (j - begin < shrunk_list.size()) + hnsw.neighbors[j] = shrunk_list[j - begin].id; + else + hnsw.neighbors[j] = -1; + } + } + } +} + +void IndexHNSWbd::search_level_0( + idx_t n, + const float* x, + idx_t k, + const storage_idx_t* nearest, + const float* nearest_d, + float* distances, + idx_t* labels, + int nprobe, + int search_type) const { + FAISS_THROW_IF_NOT(k > 0); + FAISS_THROW_IF_NOT(nprobe > 0); + + storage_idx_t ntotal = hnsw.levels.size(); + +//#pragma omp parallel + { + std::unique_ptr qdis( + storage_distance_computer(storage)); + HNSWStats search_stats; + VisitedTable vt(ntotal); + +//#pragma omp for + for (idx_t i = 0; i < n; i++) { + idx_t* idxi = labels + i * k; + float* simi = distances + i * k; + + qdis->set_query(x + i * d); + maxheap_heapify(k, simi, idxi); + + hnsw.search_level_0( + *qdis.get(), + k, + idxi, + simi, + nprobe, + nearest + i * nprobe, + nearest_d + i * nprobe, + search_type, + search_stats, + vt); + + vt.advance(); + 
maxheap_reorder(k, simi, idxi); + } +//#pragma omp critical + { hnsw_stats.combine(search_stats); } + } +} + +void IndexHNSWbd::init_level_0_from_knngraph( + int k, + const float* D, + const idx_t* I) { + int dest_size = hnsw.nb_neighbors(0); + +//#pragma omp parallel for + for (idx_t i = 0; i < ntotal; i++) { + DistanceComputer* qdis = storage_distance_computer(storage); + std::vector vec(d); + storage->reconstruct(i, vec.data()); + qdis->set_query(vec.data()); + + std::priority_queue initial_list; + + for (size_t j = 0; j < k; j++) { + int v1 = I[i * k + j]; + if (v1 == i) + continue; + if (v1 < 0) + break; + initial_list.emplace(D[i * k + j], v1); + } + + std::vector shrunk_list; + HNSWbd::shrink_neighbor_list(*qdis, initial_list, shrunk_list, dest_size, hnsw.bd_stat); + + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + for (size_t j = begin; j < end; j++) { + if (j - begin < shrunk_list.size()) + hnsw.neighbors[j] = shrunk_list[j - begin].id; + else + hnsw.neighbors[j] = -1; + } + } +} + +void IndexHNSWbd::init_level_0_from_entry_points( + int n, + const storage_idx_t* points, + const storage_idx_t* nearests) { + std::vector locks(ntotal); + for (int i = 0; i < ntotal; i++) + omp_init_lock(&locks[i]); + +//#pragma omp parallel + { + VisitedTable vt(ntotal); + + std::unique_ptr dis( + storage_distance_computer(storage)); + std::vector vec(storage->d); + +//#pragma omp for schedule(dynamic) + for (int i = 0; i < n; i++) { + storage_idx_t pt_id = points[i]; + storage_idx_t nearest = nearests[i]; + storage->reconstruct(pt_id, vec.data()); + dis->set_query(vec.data()); + + hnsw.add_links_starting_from( + *dis, pt_id, nearest, (*dis)(nearest), 0, locks.data(), vt); + + if (verbose && i % 10000 == 0) { + printf(" %d / %d\r", i, n); + fflush(stdout); + } + } + } + if (verbose) { + printf("\n"); + } + + for (int i = 0; i < ntotal; i++) + omp_destroy_lock(&locks[i]); +} + +void IndexHNSWbd::reorder_links() { + int M = hnsw.nb_neighbors(0); + +//#pragma 
omp parallel + { + std::vector distances(M); + std::vector order(M); + std::vector tmp(M); + std::unique_ptr dis( + storage_distance_computer(storage)); + +//#pragma omp for + for (storage_idx_t i = 0; i < ntotal; i++) { + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + for (size_t j = begin; j < end; j++) { + storage_idx_t nj = hnsw.neighbors[j]; + if (nj < 0) { + end = j; + break; + } + distances[j - begin] = dis->symmetric_dis(i, nj); + tmp[j - begin] = nj; + } + + fvec_argsort(end - begin, distances.data(), order.data()); + for (size_t j = begin; j < end; j++) { + hnsw.neighbors[j] = tmp[order[j - begin]]; + } + } + } +} + +void IndexHNSWbd::link_singletons() { + printf("search for singletons\n"); + + std::vector seen(ntotal); + + for (size_t i = 0; i < ntotal; i++) { + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + for (size_t j = begin; j < end; j++) { + storage_idx_t ni = hnsw.neighbors[j]; + if (ni >= 0) + seen[ni] = true; + } + } + + int n_sing = 0, n_sing_l1 = 0; + std::vector singletons; + for (storage_idx_t i = 0; i < ntotal; i++) { + if (!seen[i]) { + singletons.push_back(i); + n_sing++; + if (hnsw.levels[i] > 1) + n_sing_l1++; + } + } + + printf(" Found %d / %" PRId64 " singletons (%d appear in a level above)\n", + n_sing, + ntotal, + n_sing_l1); + + std::vector recons(singletons.size() * d); + for (int i = 0; i < singletons.size(); i++) { + FAISS_ASSERT(!"not implemented"); + } +} + +void IndexHNSWbd::permute_entries(const idx_t* perm) { + auto flat_storage = dynamic_cast(storage); + FAISS_THROW_IF_NOT_MSG( + flat_storage, "don't know how to permute this index"); + flat_storage->permute_entries(perm); + hnsw.permute_entries(perm); +} + +/************************************************************** + * ReconstructFromNeighbors implementation + **************************************************************/ + +ReconstructFromNeighborsbd::ReconstructFromNeighborsbd( + const IndexHNSWbd& index, + size_t k, + size_t 
nsq) + : index(index), k(k), nsq(nsq) { + M = index.hnsw.nb_neighbors(0); + FAISS_ASSERT(k <= 256); + code_size = k == 1 ? 0 : nsq; + ntotal = 0; + d = index.d; + FAISS_ASSERT(d % nsq == 0); + dsub = d / nsq; + k_reorder = -1; +} + +void ReconstructFromNeighborsbd::reconstruct( + storage_idx_t i, + float* x, + float* tmp) const { + const HNSWbd& hnsw = index.hnsw; + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + + if (k == 1 || nsq == 1) { + const float* beta; + if (k == 1) { + beta = codebook.data(); + } else { + int idx = codes[i]; + beta = codebook.data() + idx * (M + 1); + } + + float w0 = beta[0]; // weight of image itself + index.storage->reconstruct(i, tmp); + + for (int l = 0; l < d; l++) + x[l] = w0 * tmp[l]; + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) + ji = i; + float w = beta[j - begin + 1]; + index.storage->reconstruct(ji, tmp); + for (int l = 0; l < d; l++) + x[l] += w * tmp[l]; + } + } else if (nsq == 2) { + int idx0 = codes[2 * i]; + int idx1 = codes[2 * i + 1]; + + const float* beta0 = codebook.data() + idx0 * (M + 1); + const float* beta1 = codebook.data() + (idx1 + k) * (M + 1); + + index.storage->reconstruct(i, tmp); + + float w0; + + w0 = beta0[0]; + for (int l = 0; l < dsub; l++) + x[l] = w0 * tmp[l]; + + w0 = beta1[0]; + for (int l = dsub; l < d; l++) + x[l] = w0 * tmp[l]; + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) + ji = i; + index.storage->reconstruct(ji, tmp); + float w; + w = beta0[j - begin + 1]; + for (int l = 0; l < dsub; l++) + x[l] += w * tmp[l]; + + w = beta1[j - begin + 1]; + for (int l = dsub; l < d; l++) + x[l] += w * tmp[l]; + } + } else { + std::vector betas(nsq); + { + const float* b = codebook.data(); + const uint8_t* c = &codes[i * code_size]; + for (int sq = 0; sq < nsq; sq++) { + betas[sq] = b + (*c++) * (M + 1); + b += (M + 1) * k; + } + } + + index.storage->reconstruct(i, tmp); + { + int d0 = 0; + 
for (int sq = 0; sq < nsq; sq++) { + float w = *(betas[sq]++); + int d1 = d0 + dsub; + for (int l = d0; l < d1; l++) { + x[l] = w * tmp[l]; + } + d0 = d1; + } + } + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) + ji = i; + + index.storage->reconstruct(ji, tmp); + int d0 = 0; + for (int sq = 0; sq < nsq; sq++) { + float w = *(betas[sq]++); + int d1 = d0 + dsub; + for (int l = d0; l < d1; l++) { + x[l] += w * tmp[l]; + } + d0 = d1; + } + } + } +} + +void ReconstructFromNeighborsbd::reconstruct_n( + storage_idx_t n0, + storage_idx_t ni, + float* x) const { +//#pragma omp parallel + { + std::vector tmp(index.d); +//#pragma omp for + for (storage_idx_t i = 0; i < ni; i++) { + reconstruct(n0 + i, x + i * index.d, tmp.data()); + } + } +} + +size_t ReconstructFromNeighborsbd::compute_distances( + size_t n, + const idx_t* shortlist, + const float* query, + float* distances) const { + std::vector tmp(2 * index.d); + size_t ncomp = 0; + for (int i = 0; i < n; i++) { + if (shortlist[i] < 0) + break; + reconstruct(shortlist[i], tmp.data(), tmp.data() + index.d); + distances[i] = fvec_L2sqr(query, tmp.data(), index.d); + ncomp++; + } + return ncomp; +} + +void ReconstructFromNeighborsbd::get_neighbor_table(storage_idx_t i, float* tmp1) + const { + const HNSWbd& hnsw = index.hnsw; + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + size_t d = index.d; + + index.storage->reconstruct(i, tmp1); + + for (size_t j = begin; j < end; j++) { + storage_idx_t ji = hnsw.neighbors[j]; + if (ji < 0) + ji = i; + index.storage->reconstruct(ji, tmp1 + (j - begin + 1) * d); + } +} + +/// called by add_codes +void ReconstructFromNeighborsbd::estimate_code( + const float* x, + storage_idx_t i, + uint8_t* code) const { + // fill in tmp table with the neighbor values + std::unique_ptr tmp1(new float[d * (M + 1) + (d * k)]); + float* tmp2 = tmp1.get() + d * (M + 1); + + // collect coordinates of base + get_neighbor_table(i, tmp1.get()); + 
+ for (size_t sq = 0; sq < nsq; sq++) { + int d0 = sq * dsub; + + { + FINTEGER ki = k, di = d, m1 = M + 1; + FINTEGER dsubi = dsub; + float zero = 0, one = 1; + + sgemm_("N", + "N", + &dsubi, + &ki, + &m1, + &one, + tmp1.get() + d0, + &di, + codebook.data() + sq * (m1 * k), + &m1, + &zero, + tmp2, + &dsubi); + } + + float min = HUGE_VAL; + int argmin = -1; + for (size_t j = 0; j < k; j++) { + float dis = fvec_L2sqr(x + d0, tmp2 + j * dsub, dsub); + if (dis < min) { + min = dis; + argmin = j; + } + } + code[sq] = argmin; + } +} + +void ReconstructFromNeighborsbd::add_codes(size_t n, const float* x) { + if (k == 1) { // nothing to encode + ntotal += n; + return; + } + codes.resize(codes.size() + code_size * n); +//#pragma omp parallel for + for (int i = 0; i < n; i++) { + estimate_code( + x + i * index.d, + ntotal + i, + codes.data() + (ntotal + i) * code_size); + } + ntotal += n; + FAISS_ASSERT(codes.size() == ntotal * code_size); +} + +/************************************************************** + * IndexHNSWFlat implementation + **************************************************************/ + + + +/************************************************************** + * IndexHNSWPQ implementation + **************************************************************/ + +IndexHNSWbdPQ::IndexHNSWbdPQ() = default; + +IndexHNSWbdPQ::IndexHNSWbdPQ(int d, int pq_m, int M, int pq_nbits) + : IndexHNSWbd(new IndexPQ(d, pq_m, pq_nbits), M) { + own_fields = true; + is_trained = false; +} + +void IndexHNSWbdPQ::train(idx_t n, const float* x) { + IndexHNSWbd::train(n, x); + (dynamic_cast(storage))->pq.compute_sdc_table(); +} + +/************************************************************** + * IndexHNSWSQ implementation + **************************************************************/ + +IndexHNSWbdSQ::IndexHNSWbdSQ( + int d, + ScalarQuantizer::QuantizerType qtype, + int M, + MetricType metric) + : IndexHNSWbd(new IndexScalarQuantizer(d, qtype, metric), M) { + is_trained = 
this->storage->is_trained; + own_fields = true; +} + +IndexHNSWbdSQ::IndexHNSWbdSQ() = default; + +/************************************************************** + * IndexHNSW2Level implementation + **************************************************************/ + +IndexHNSWbd2Level::IndexHNSWbd2Level( + Index* quantizer, + size_t nlist, + int m_pq, + int M) + : IndexHNSWbd(new Index2Layer(quantizer, nlist, m_pq), M) { + own_fields = true; + is_trained = false; +} + +IndexHNSWbd2Level::IndexHNSWbd2Level() = default; + +namespace { + +// same as search_from_candidates but uses v +// visno -> is in result list +// visno + 1 -> in result list + in candidates +int search_from_candidates_2( + const HNSWbd& hnsw, + DistanceComputer& qdis, + int k, + idx_t* I, + float* D, + MinimaxHeap& candidates, + VisitedTable& vt, + HNSWStats& stats, + int level, + int nres_in = 0) { + int nres = nres_in; + int ndis = 0; + for (int i = 0; i < candidates.size(); i++) { + idx_t v1 = candidates.ids[i]; + FAISS_ASSERT(v1 >= 0); + vt.visited[v1] = vt.visno + 1; + } + + int nstep = 0; + + while (candidates.size() > 0) { + float d0 = 0; + int v0 = candidates.pop_min(&d0); + + size_t begin, end; + hnsw.neighbor_range(v0, level, &begin, &end); + + for (size_t j = begin; j < end; j++) { + int v1 = hnsw.neighbors[j]; + if (v1 < 0) + break; + if (vt.visited[v1] == vt.visno + 1) { + // nothing to do + } else { + ndis++; + float d = qdis(v1); + candidates.push(v1, d); + + // never seen before --> add to heap + if (vt.visited[v1] < vt.visno) { + if (nres < k) { + faiss::maxheap_push(++nres, D, I, d, v1); + } else if (d < D[0]) { + faiss::maxheap_replace_top(nres, D, I, d, v1); + } + } + vt.visited[v1] = vt.visno + 1; + } + } + + nstep++; + if (nstep > hnsw.efSearch) { + break; + } + } + + stats.n1++; + if (candidates.size() == 0) + stats.n2++; + + return nres; +} + +} // namespace + +void IndexHNSWbd2Level::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const 
SearchParameters* params) const { + FAISS_THROW_IF_NOT(k > 0); + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + + if (dynamic_cast(storage)) { + IndexHNSWbd::search(n, x, k, distances, labels); + + } else { // "mixed" search + size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; + + const IndexIVFPQ* index_ivfpq = + dynamic_cast(storage); + + int nprobe = index_ivfpq->nprobe; + + std::unique_ptr coarse_assign(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + index_ivfpq->quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_assign.get()); + + index_ivfpq->search_preassigned( + n, + x, + k, + coarse_assign.get(), + coarse_dis.get(), + distances, + labels, + false); + +//#pragma omp parallel + { + VisitedTable vt(ntotal); + std::unique_ptr dis( + storage_distance_computer(storage)); + + int candidates_size = hnsw.upper_beam; + MinimaxHeap candidates(candidates_size); + +//#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) + for (idx_t i = 0; i < n; i++) { + idx_t* idxi = labels + i * k; + float* simi = distances + i * k; + dis->set_query(x + i * d); + + // mark all inverted list elements as visited + + for (int j = 0; j < nprobe; j++) { + idx_t key = coarse_assign[j + i * nprobe]; + if (key < 0) + break; + size_t list_length = index_ivfpq->get_list_size(key); + const idx_t* ids = index_ivfpq->invlists->get_ids(key); + + for (int jj = 0; jj < list_length; jj++) { + vt.set(ids[jj]); + } + } + + candidates.clear(); + + for (int j = 0; j < hnsw.upper_beam && j < k; j++) { + if (idxi[j] < 0) + break; + candidates.push(idxi[j], simi[j]); + } + + // reorder from sorted to heap + maxheap_heapify(k, simi, idxi, simi, idxi, k); + + HNSWStats search_stats; + search_from_candidates_2( + hnsw, + *dis, + k, + idxi, + simi, + candidates, + vt, + search_stats, + 0, + k); + n1 += search_stats.n1; + n2 += search_stats.n2; + n3 += search_stats.n3; + ndis += search_stats.ndis; + nreorder += 
search_stats.nreorder; + + vt.advance(); + vt.advance(); + + maxheap_reorder(k, simi, idxi); + } + } + + hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); + } +} + +void IndexHNSWbd2Level::flip_to_ivf() { + Index2Layer* storage2l = dynamic_cast(storage); + + FAISS_THROW_IF_NOT(storage2l); + + IndexIVFPQ* index_ivfpq = new IndexIVFPQ( + storage2l->q1.quantizer, + d, + storage2l->q1.nlist, + storage2l->pq.M, + 8); + index_ivfpq->pq = storage2l->pq; + index_ivfpq->is_trained = storage2l->is_trained; + index_ivfpq->precompute_table(); + index_ivfpq->own_fields = storage2l->q1.own_fields; + storage2l->transfer_to_IVFPQ(*index_ivfpq); + index_ivfpq->make_direct_map(true); + + storage = index_ivfpq; + delete storage2l; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexHNSWbd.h b/thirdparty/faiss/faiss/IndexHNSWbd.h new file mode 100644 index 000000000..19d3ad087 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexHNSWbd.h @@ -0,0 +1,199 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +struct IndexHNSWbd; + +struct ReconstructFromNeighborsbd { + typedef HNSWbd::storage_idx_t storage_idx_t; + + const IndexHNSWbd& index; + size_t M; // number of neighbors + size_t k; // number of codebook entries + size_t nsq; // number of subvectors + size_t code_size; + int k_reorder; // nb to reorder. 
-1 = all + + std::vector codebook; // size nsq * k * (M + 1) + + std::vector codes; // size ntotal * code_size + size_t ntotal; + size_t d, dsub; // derived values + + explicit ReconstructFromNeighborsbd( + const IndexHNSWbd& index, + size_t k = 256, + size_t nsq = 1); + + /// codes must be added in the correct order and the IndexHNSW + /// must be populated and sorted + void add_codes(size_t n, const float* x); + + size_t compute_distances( + size_t n, + const idx_t* shortlist, + const float* query, + float* distances) const; + + /// called by add_codes + void estimate_code(const float* x, storage_idx_t i, uint8_t* code) const; + + /// called by compute_distances + void reconstruct(storage_idx_t i, float* x, float* tmp) const; + + void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float* x) const; + + /// get the M+1 -by-d table for neighbor coordinates for vector i + void get_neighbor_table(storage_idx_t i, float* out) const; +}; + +/** The HNSW index is a normal random-access index with a HNSW + * link structure built on top */ + +struct IndexHNSWbd : Index { + typedef HNSWbd::storage_idx_t storage_idx_t; + + // the link strcuture + HNSWbd hnsw; + + // the sequential storage + bool own_fields = false; + Index* storage = nullptr; + + ReconstructFromNeighborsbd* reconstruct_from_neighbors = nullptr; + + explicit IndexHNSWbd(int d = 0, int M = 32, MetricType metric = METRIC_L2); + explicit IndexHNSWbd(Index* storage, int M = 32); + + ~IndexHNSWbd() override; + + void add(idx_t n, const float* x) override; + + /// Trains the storage if needed + void train(idx_t n, const float* x) override; + + /// entry point for search + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void reconstruct(idx_t key, float* recons) const override; + + void reset() override; + + void shrink_level_0_neighbors(int size); + + /** Perform search only on level 0, given the starting 
points for + * each vertex. + * + * @param search_type 1:perform one search per nprobe, 2: enqueue + * all entry points + */ + void search_level_0( + idx_t n, + const float* x, + idx_t k, + const storage_idx_t* nearest, + const float* nearest_d, + float* distances, + idx_t* labels, + int nprobe = 1, + int search_type = 1) const; + + /// alternative graph building + void init_level_0_from_knngraph(int k, const float* D, const idx_t* I); + + /// alternative graph building + void init_level_0_from_entry_points( + int npt, + const storage_idx_t* points, + const storage_idx_t* nearests); + + // reorder links from nearest to farthest + void reorder_links(); + + void link_singletons(); + + void permute_entries(const idx_t* perm); +}; + +/** Flat index topped with with a HNSW structure to access elements + * more efficiently. + */ + +struct IndexHNSWbdFlat : IndexHNSWbd { + IndexHNSWbdFlat() { + is_trained = true; + } + + IndexHNSWbdFlat(int d, int M, MetricType metric) + : IndexHNSWbd( + (metric == METRIC_L2) ? new IndexFlatL2(d) + : new IndexFlat(d, metric), + M) { + own_fields = true; + is_trained = true; + } +}; + +/** PQ index topped with with a HNSW structure to access elements + * more efficiently. + */ +struct IndexHNSWbdPQ : IndexHNSWbd { + IndexHNSWbdPQ(); + IndexHNSWbdPQ(int d, int pq_m, int M, int pq_nbits = 8); + void train(idx_t n, const float* x) override; +}; + +/** SQ index topped with with a HNSW structure to access elements + * more efficiently. 
+ */ +struct IndexHNSWbdSQ : IndexHNSWbd { + IndexHNSWbdSQ(); + IndexHNSWbdSQ( + int d, + ScalarQuantizer::QuantizerType qtype, + int M, + MetricType metric = METRIC_L2); +}; + +/** 2-level code structure with fast random access + */ +struct IndexHNSWbd2Level : IndexHNSWbd { + IndexHNSWbd2Level(); + IndexHNSWbd2Level(Index* quantizer, size_t nlist, int m_pq, int M); + + void flip_to_ivf(); + + /// entry point for search + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/HNSWbd.cpp b/thirdparty/faiss/faiss/impl/HNSWbd.cpp new file mode 100644 index 000000000..24856de9a --- /dev/null +++ b/thirdparty/faiss/faiss/impl/HNSWbd.cpp @@ -0,0 +1,1189 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#ifdef __AVX2__ +#include + +#include +#include +#endif + +#define chronoElapsedTime(start) \ + std::chrono::duration_cast( \ + std::chrono::high_resolution_clock::now() - start) \ + .count() + +namespace faiss { + +/************************************************************** + * HNSWbd structure implementation + **************************************************************/ + +int HNSWbd::nb_neighbors(int layer_no) const { + return cum_nneighbor_per_level[layer_no + 1] - + cum_nneighbor_per_level[layer_no]; +} + +void HNSWbd::set_nb_neighbors(int level_no, int n) { + FAISS_THROW_IF_NOT(levels.size() == 0); + int cur_n = nb_neighbors(level_no); + for (int i = level_no + 1; i < cum_nneighbor_per_level.size(); i++) { + cum_nneighbor_per_level[i] += n - cur_n; + } +} + +int HNSWbd::cum_nb_neighbors(int layer_no) const { + return cum_nneighbor_per_level[layer_no]; +} + +void HNSWbd::neighbor_range(idx_t no, int layer_no, size_t* begin, size_t* end) + const { + size_t o = offsets[no]; + *begin = o + cum_nb_neighbors(layer_no); + *end = o + cum_nb_neighbors(layer_no + 1); +} + +HNSWbd::HNSWbd(int M) : rng(12345) { + set_default_probas(M, 1.0 / log(M)); + offsets.push_back(0); + bd_stat.reset(); + M_ = M; +} + +int HNSWbd::random_level() { + double f = rng.rand_float(); + // could be a bit faster with bissection + for (int level = 0; level < assign_probas.size(); level++) { + if (f < assign_probas[level]) { + return level; + } + f -= assign_probas[level]; + } + // happens with exponentially low probability + return assign_probas.size() - 1; +} + +void HNSWbd::set_default_probas(int M, float levelMult) { + int nn = 0; + cum_nneighbor_per_level.push_back(0); + for (int level = 0;; level++) { + float proba = exp(-level / levelMult) * (1 - exp(-1 / levelMult)); + if (proba < 1e-9) + break; + assign_probas.push_back(proba); + nn += level == 0 ? 
M * 2 : M; + cum_nneighbor_per_level.push_back(nn); + } +} + +void HNSWbd::clear_neighbor_tables(int level) { + for (int i = 0; i < levels.size(); i++) { + size_t begin, end; + neighbor_range(i, level, &begin, &end); + for (size_t j = begin; j < end; j++) { + neighbors[j] = -1; + } + } +} + +void HNSWbd::reset() { + max_level = -1; + entry_point = -1; + offsets.clear(); + offsets.push_back(0); + levels.clear(); + neighbors.clear(); +} + +void HNSWbd::print_neighbor_stats(int level) const { + FAISS_THROW_IF_NOT(level < cum_nneighbor_per_level.size()); + printf("stats on level %d, max %d neighbors per vertex:\n", + level, + nb_neighbors(level)); + size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0; + //#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \ + reduction(+: tot_reciprocal) reduction(+: n_node) + for (int i = 0; i < levels.size(); i++) { + if (levels[i] > level) { + n_node++; + size_t begin, end; + neighbor_range(i, level, &begin, &end); + std::unordered_set neighset; + for (size_t j = begin; j < end; j++) { + if (neighbors[j] < 0) + break; + neighset.insert(neighbors[j]); + } + int n_neigh = neighset.size(); + int n_common = 0; + int n_reciprocal = 0; + for (size_t j = begin; j < end; j++) { + storage_idx_t i2 = neighbors[j]; + if (i2 < 0) + break; + FAISS_ASSERT(i2 != i); + size_t begin2, end2; + neighbor_range(i2, level, &begin2, &end2); + for (size_t j2 = begin2; j2 < end2; j2++) { + storage_idx_t i3 = neighbors[j2]; + if (i3 < 0) + break; + if (i3 == i) { + n_reciprocal++; + continue; + } + if (neighset.count(i3)) { + neighset.erase(i3); + n_common++; + } + } + } + tot_neigh += n_neigh; + tot_common += n_common; + tot_reciprocal += n_reciprocal; + } + } + float normalizer = n_node; + printf(" nb of nodes at that level %zd\n", n_node); + printf(" neighbors per node: %.2f (%zd)\n", + tot_neigh / normalizer, + tot_neigh); + printf(" nb of reciprocal neighbors: %.2f\n", + tot_reciprocal / normalizer); + printf(" nb 
of neighbors that are also neighbor-of-neighbors: %.2f (%zd)\n", + tot_common / normalizer, + tot_common); +} + +void HNSWbd::fill_with_random_links(size_t n) { + int max_level = prepare_level_tab(n); + RandomGenerator rng2(456); + + for (int level = max_level - 1; level >= 0; --level) { + std::vector elts; + for (int i = 0; i < n; i++) { + if (levels[i] > level) { + elts.push_back(i); + } + } + printf("linking %zd elements in level %d\n", elts.size(), level); + + if (elts.size() == 1) + continue; + + for (int ii = 0; ii < elts.size(); ii++) { + int i = elts[ii]; + size_t begin, end; + neighbor_range(i, 0, &begin, &end); + for (size_t j = begin; j < end; j++) { + int other = 0; + do { + other = elts[rng2.rand_int(elts.size())]; + } while (other == i); + + neighbors[j] = other; + } + } + } +} + +int HNSWbd::prepare_level_tab(size_t n, bool preset_levels) { + size_t n0 = offsets.size() - 1; + + if (preset_levels) { + FAISS_ASSERT(n0 + n == levels.size()); + } else { + FAISS_ASSERT(n0 == levels.size()); + for (int i = 0; i < n; i++) { + int pt_level = random_level(); + levels.push_back(pt_level + 1); + } + } + + int max_level = 0; + for (int i = 0; i < n; i++) { + int pt_level = levels[i + n0] - 1; + if (pt_level > max_level) + max_level = pt_level; + offsets.push_back(offsets.back() + cum_nb_neighbors(pt_level + 1)); + neighbors.resize(offsets.back(), -1); + } + + return max_level; +} + +/** Enumerate vertices from nearest to farthest from query, keep a + * neighbor only if there is no previous neighbor that is closer to + * that vertex than the query. 
+ */ +void HNSWbd::shrink_neighbor_list( + DistanceComputer& qdis, + std::priority_queue& input, + std::vector& output, + int max_size, + struct HNSW_breakdown_stats& bd_stats) { + while (input.size() > 0) { + NodeDistFarther v1 = input.top(); + input.pop(); + float dist_v1_q = v1.d; + + bool good = true; + for (NodeDistFarther v2 : output) { + auto start = std::chrono::high_resolution_clock::now(); + float dist_v1_v2 = qdis.symmetric_dis(v2.id, v1.id); + bd_stats.time_dc_linking += chronoElapsedTime(start); + bd_stats.step_linking +=1; + + if (dist_v1_v2 < dist_v1_q) { + good = false; + break; + } + } + + if (good) { + output.push_back(v1); + if (output.size() >= max_size) { + return; + } + } + } +} + +namespace { + +using storage_idx_t = HNSWbd::storage_idx_t; +using NodeDistCloser = HNSWbd::NodeDistCloser; +using NodeDistFarther = HNSWbd::NodeDistFarther; + +/************************************************************** + * Addition subroutines + **************************************************************/ + +/// remove neighbors from the list to make it smaller than max_size +void shrink_neighbor_list( + DistanceComputer& qdis, + std::priority_queue& resultSet1, + int max_size, + struct HNSW_breakdown_stats& bd_stats) { + if (resultSet1.size() < max_size) { + return; + } + std::priority_queue resultSet; + std::vector returnlist; + while (resultSet1.size() > 0) { + resultSet.emplace(resultSet1.top().d, resultSet1.top().id); + resultSet1.pop(); + } + + HNSWbd::shrink_neighbor_list(qdis, resultSet, returnlist, max_size, bd_stats); + + for (NodeDistFarther curen2 : returnlist) { + resultSet1.emplace(curen2.d, curen2.id); + } +} + +/// add a link between two elements, possibly shrinking the list +/// of links to make room for it. 
+void add_link( + HNSWbd& hnsw, + DistanceComputer& qdis, + storage_idx_t src, + storage_idx_t dest, + int level) { + size_t begin, end; + hnsw.neighbor_range(src, level, &begin, &end); + if (hnsw.neighbors[end - 1] == -1) { + // there is enough room, find a slot to add it + size_t i = end; + while (i > begin) { + if (hnsw.neighbors[i - 1] != -1) + break; + i--; + } + hnsw.neighbors[i] = dest; + return; + } + + // otherwise we let them fight out which to keep + + // copy to resultSet... + std::priority_queue resultSet; + auto start = std::chrono::high_resolution_clock::now(); + auto dist = qdis.symmetric_dis(src,dest); + hnsw.bd_stat.time_dc_linking += chronoElapsedTime(start); + resultSet.emplace(dist, dest); + for (size_t i = begin; i < end; i++) { // HERE WAS THE BUG + storage_idx_t neigh = hnsw.neighbors[i]; + auto start = std::chrono::high_resolution_clock::now(); + dist = qdis.symmetric_dis(src,neigh); + hnsw.bd_stat.time_dc_linking += chronoElapsedTime(start); + hnsw.bd_stat.step_before_shrinking +=1; + resultSet.emplace(dist, neigh); + } + + shrink_neighbor_list(qdis, resultSet, end - begin,hnsw.bd_stat); + + // ...and back + size_t i = begin; + while (resultSet.size()) { + hnsw.neighbors[i++] = resultSet.top().id; + resultSet.pop(); + } + // they may have shrunk more than just by 1 element + while (i < end) { + hnsw.neighbors[i++] = -1; + } +} + void search_neighbors_to_add( + HNSWbd& hnsw, + DistanceComputer& qdis, + std::priority_queue& results, + int entry_point, + float d_entry_point, + int level, + VisitedTable& vt); +/// search neighbors on a single level, starting from an entry point +void search_neighbors_to_add( + HNSWbd& hnsw, + DistanceComputer& qdis, + std::priority_queue& results, + int entry_point, + float d_entry_point, + int level, + VisitedTable& vt) { + // top is nearest candidate + std::priority_queue candidates; + + NodeDistFarther ev(d_entry_point, entry_point); + candidates.push(ev); + results.emplace(d_entry_point, entry_point); + 
vt.set(entry_point); + + while (!candidates.empty()) { + // get nearest + const NodeDistFarther& currEv = candidates.top(); + + if (currEv.d > results.top().d) { + break; + } + int currNode = currEv.id; + candidates.pop(); + + // loop over neighbors + size_t begin, end; + hnsw.neighbor_range(currNode, level, &begin, &end); + for (size_t i = begin; i < end; i++) { + storage_idx_t nodeId = hnsw.neighbors[i]; + if (nodeId < 0) + break; + if (vt.get(nodeId)) + continue; + vt.set(nodeId); + hnsw.bd_stat.steps_iterating_add = + hnsw.bd_stat.steps_iterating_add + 1; + auto start = std::chrono::high_resolution_clock::now(); + float dis = qdis(nodeId); + hnsw.bd_stat.time_dc+=chronoElapsedTime(start); + NodeDistFarther evE1(dis, nodeId); + + if (results.size() < hnsw.efConstruction || results.top().d > dis) { + results.emplace(dis, nodeId); + candidates.emplace(dis, nodeId); + if (results.size() > hnsw.efConstruction) { + results.pop(); + } + } + } + } + vt.advance(); +} + +/************************************************************** + * Searching subroutines + **************************************************************/ + +/// greedily update a nearest vector at a given level +void greedy_update_nearest( + const HNSWbd& hnsw, + DistanceComputer& qdis, + int level, + storage_idx_t& nearest, + float& d_nearest) { + for (;;) { + storage_idx_t prev_nearest = nearest; + + size_t begin, end; + hnsw.neighbor_range(nearest, level, &begin, &end); + for (size_t i = begin; i < end; i++) { + hnsw.bd_stat.steps_greedy = hnsw.bd_stat.steps_greedy + 1; + storage_idx_t v = hnsw.neighbors[i]; + if (v < 0) + break; + float dis = qdis(v); + if (dis < d_nearest) { + nearest = v; + d_nearest = dis; + } + } + if (nearest == prev_nearest) { + return; + } + } +} + +} // namespace + +/// Finds neighbors and builds links with them, starting from an entry +/// point. The own neighbor list is assumed to be locked. 
+void HNSWbd::add_links_starting_from( + DistanceComputer& ptdis, + storage_idx_t pt_id, + storage_idx_t nearest, + float d_nearest, + int level, + omp_lock_t* locks, + VisitedTable& vt) { + std::priority_queue link_targets; + auto start = std::chrono::high_resolution_clock::now(); + search_neighbors_to_add( + *this, ptdis, link_targets, nearest, d_nearest, level, vt); + bd_stat.time_searching_neighbors_to_add += chronoElapsedTime(start); + + // but we can afford only this many neighbors + int M = nb_neighbors(level); + + start = std::chrono::high_resolution_clock::now(); + ::faiss::shrink_neighbor_list(ptdis, link_targets, M,bd_stat); + + std::vector neighbors; + neighbors.reserve(link_targets.size()); + while (!link_targets.empty()) { + storage_idx_t other_id = link_targets.top().id; + add_link(*this, ptdis, pt_id, other_id, level); + neighbors.push_back(other_id); + link_targets.pop(); + } + + omp_unset_lock(&locks[pt_id]); + for (storage_idx_t other_id : neighbors) { + omp_set_lock(&locks[other_id]); + add_link(*this, ptdis, other_id, pt_id, level); + omp_unset_lock(&locks[other_id]); + } + omp_set_lock(&locks[pt_id]); + bd_stat.time_add_links += chronoElapsedTime(start); +} + +/************************************************************** + * Building, parallel + **************************************************************/ + +void HNSWbd::add_with_locks( + DistanceComputer& ptdis, + int pt_level, + int pt_id, + std::vector& locks, + VisitedTable& vt) { + // greedy search on upper levels + + storage_idx_t nearest; + //#pragma omp critical + { + nearest = entry_point; + + if (nearest == -1) { + max_level = pt_level; + entry_point = pt_id; + } + } + + if (nearest < 0) { + return; + } + + omp_set_lock(&locks[pt_id]); + + int level = max_level; // level at which we start adding neighbors + float d_nearest = ptdis(nearest); + auto greedy_start = std::chrono::high_resolution_clock::now(); + for (; level > pt_level; level--) { + greedy_update_nearest(*this, ptdis, 
level, nearest, d_nearest); + } + bd_stat.time_greedy_insert += chronoElapsedTime(greedy_start); + + for (; level >= 0; level--) { + add_links_starting_from( + ptdis, pt_id, nearest, d_nearest, level, locks.data(), vt); + } + + omp_unset_lock(&locks[pt_id]); + + if (pt_level > max_level) { + max_level = pt_level; + entry_point = pt_id; + } +} + +/************************************************************** + * Searching + **************************************************************/ + +namespace { + +using MinimaxHeap = HNSWbd::MinimaxHeap; +using Node = HNSWbd::Node; +/** Do a BFS on the candidates list */ + +int search_from_candidates( + const HNSWbd& hnsw, + DistanceComputer& qdis, + int k, + idx_t* I, + float* D, + MinimaxHeap& candidates, + VisitedTable& vt, + HNSWStats& stats, + int level, + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { + int nres = nres_in; + int ndis = 0; + + // can be overridden by search params + bool do_dis_check = params ? params->check_relative_distance + : hnsw.check_relative_distance; + int efSearch = params ? params->efSearch : hnsw.efSearch; + const IDSelector* sel = params ? 
params->sel : nullptr; + + for (int i = 0; i < candidates.size(); i++) { + idx_t v1 = candidates.ids[i]; + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (!sel || sel->is_member(v1)) { + if (nres < k) { + faiss::maxheap_push(++nres, D, I, d, v1); + } else if (d < D[0]) { + faiss::maxheap_replace_top(nres, D, I, d, v1); + } + } + vt.set(v1); + } + + int nstep = 0; + + while (candidates.size() > 0) { + float d0 = 0; + int v0 = candidates.pop_min(&d0); + + if (do_dis_check) { + // tricky stopping condition: there are more that ef + // distances that are processed already that are smaller + // than d0 + + int n_dis_below = candidates.count_below(d0); + if (n_dis_below >= efSearch) { + break; + } + } + + size_t begin, end; + hnsw.neighbor_range(v0, level, &begin, &end); + + // // baseline version + // for (size_t j = begin; j < end; j++) { + // int v1 = hnsw.neighbors[j]; + // if (v1 < 0) + // break; + // if (vt.get(v1)) { + // continue; + // } + // vt.set(v1); + // ndis++; + // float d = qdis(v1); + // if (!sel || sel->is_member(v1)) { + // if (nres < k) { + // faiss::maxheap_push(++nres, D, I, d, v1); + // } else if (d < D[0]) { + // faiss::maxheap_replace_top(nres, D, I, d, v1); + // } + // } + // candidates.push(v1, d); + // } + + // the following version processes 4 neighbors at a time + size_t jmax = begin; + for (size_t j = begin; j < end; j++) { + int v1 = hnsw.neighbors[j]; + if (v1 < 0) + break; + + prefetch_L2(vt.visited.data() + v1); + jmax += 1; + } + + int counter = 0; + size_t saved_j[4]; + + ndis += jmax - begin; + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { + if (nres < k) { + faiss::maxheap_push(++nres, D, I, dis, idx); + } else if (dis < D[0]) { + faiss::maxheap_replace_top(nres, D, I, dis, idx); + } + } + candidates.push(idx, dis); + }; + + for (size_t j = begin; j < jmax; j++) { + int v1 = hnsw.neighbors[j]; + hnsw.bd_stat.steps_iterating_search = + 
hnsw.bd_stat.steps_iterating_search + 1; + bool vget = vt.get(v1); + vt.set(v1); + saved_j[counter] = v1; + counter += vget ? 0 : 1; + + if (counter == 4) { + float dis[4]; + qdis.distances_batch_4( + saved_j[0], + saved_j[1], + saved_j[2], + saved_j[3], + dis[0], + dis[1], + dis[2], + dis[3]); + + for (size_t id4 = 0; id4 < 4; id4++) { + add_to_heap(saved_j[id4], dis[id4]); + } + + counter = 0; + } + } + + for (size_t icnt = 0; icnt < counter; icnt++) { + float dis = qdis(saved_j[icnt]); + add_to_heap(saved_j[icnt], dis); + } + + nstep++; + if (!do_dis_check && nstep > efSearch) { + break; + } + } + + if (level == 0) { + stats.n1++; + if (candidates.size() == 0) { + stats.n2++; + } + stats.n3 += ndis; + } + + return nres; +} + +std::priority_queue search_from_candidate_unbounded( + const HNSWbd& hnsw, + const Node& node, + DistanceComputer& qdis, + int ef, + VisitedTable* vt, + HNSWStats& stats) { + int ndis = 0; + std::priority_queue top_candidates; + std::priority_queue, std::greater> candidates; + + top_candidates.push(node); + candidates.push(node); + + vt->set(node.second); + + while (!candidates.empty()) { + float d0; + storage_idx_t v0; + std::tie(d0, v0) = candidates.top(); + + if (d0 > top_candidates.top().first) { + break; + } + + candidates.pop(); + + size_t begin, end; + hnsw.neighbor_range(v0, 0, &begin, &end); + + // // baseline version + // for (size_t j = begin; j < end; ++j) { + // int v1 = hnsw.neighbors[j]; + // + // if (v1 < 0) { + // break; + // } + // if (vt->get(v1)) { + // continue; + // } + // + // vt->set(v1); + // + // float d1 = qdis(v1); + // ++ndis; + // + // if (top_candidates.top().first > d1 || + // top_candidates.size() < ef) { + // candidates.emplace(d1, v1); + // top_candidates.emplace(d1, v1); + // + // if (top_candidates.size() > ef) { + // top_candidates.pop(); + // } + // } + // } + + // the following version processes 4 neighbors at a time + size_t jmax = begin; + for (size_t j = begin; j < end; j++) { + int v1 = 
hnsw.neighbors[j]; + if (v1 < 0) + break; + + prefetch_L2(vt->visited.data() + v1); + jmax += 1; + } + + int counter = 0; + size_t saved_j[4]; + + ndis += jmax - begin; + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (top_candidates.top().first > dis || + top_candidates.size() < ef) { + candidates.emplace(dis, idx); + top_candidates.emplace(dis, idx); + + if (top_candidates.size() > ef) { + top_candidates.pop(); + } + } + }; + + for (size_t j = begin; j < jmax; j++) { + int v1 = hnsw.neighbors[j]; + + bool vget = vt->get(v1); + vt->set(v1); + saved_j[counter] = v1; + counter += vget ? 0 : 1; + + if (counter == 4) { + float dis[4]; + qdis.distances_batch_4( + saved_j[0], + saved_j[1], + saved_j[2], + saved_j[3], + dis[0], + dis[1], + dis[2], + dis[3]); + + for (size_t id4 = 0; id4 < 4; id4++) { + add_to_heap(saved_j[id4], dis[id4]); + } + + counter = 0; + } + } + + for (size_t icnt = 0; icnt < counter; icnt++) { + float dis = qdis(saved_j[icnt]); + add_to_heap(saved_j[icnt], dis); + } + } + + ++stats.n1; + if (candidates.size() == 0) { + ++stats.n2; + } + stats.n3 += ndis; + + return top_candidates; +} + +} // anonymous namespace + +HNSWStats HNSWbd::search( + DistanceComputer& qdis, + int k, + idx_t* I, + float* D, + VisitedTable& vt, + const SearchParametersHNSW* params) const { + HNSWStats stats; + if (entry_point == -1) { + return stats; + } + if (upper_beam == 1) { + // greedy search on upper levels + storage_idx_t nearest = entry_point; + float d_nearest = qdis(nearest); + auto start = std::chrono::high_resolution_clock::now(); + for (int level = max_level; level >= 1; level--) { + greedy_update_nearest(*this, qdis, level, nearest, d_nearest); + } + bd_stat.time_greedy_search += chronoElapsedTime(start); + + int ef = std::max(efSearch, k); + if (search_bounded_queue) { // this is the most common branch + MinimaxHeap candidates(ef); + + candidates.push(nearest, d_nearest); + start = std::chrono::high_resolution_clock::now(); + 
search_from_candidates( + *this, qdis, k, I, D, candidates, vt, stats, 0, 0, params); + bd_stat.time_search_from_candidates += chronoElapsedTime(start); + } else { + std::priority_queue top_candidates = + search_from_candidate_unbounded( + *this, + Node(d_nearest, nearest), + qdis, + ef, + &vt, + stats); + + while (top_candidates.size() > k) { + top_candidates.pop(); + } + + int nres = 0; + while (!top_candidates.empty()) { + float d; + storage_idx_t label; + std::tie(d, label) = top_candidates.top(); + faiss::maxheap_push(++nres, D, I, d, label); + top_candidates.pop(); + } + } + + vt.advance(); + + } else { + int candidates_size = upper_beam; + MinimaxHeap candidates(candidates_size); + + std::vector I_to_next(candidates_size); + std::vector D_to_next(candidates_size); + + int nres = 1; + I_to_next[0] = entry_point; + D_to_next[0] = qdis(entry_point); + + for (int level = max_level; level >= 0; level--) { + // copy I, D -> candidates + + candidates.clear(); + + for (int i = 0; i < nres; i++) { + candidates.push(I_to_next[i], D_to_next[i]); + } + + if (level == 0) { + nres = search_from_candidates( + *this, qdis, k, I, D, candidates, vt, stats, 0); + } else { + nres = search_from_candidates( + *this, + qdis, + candidates_size, + I_to_next.data(), + D_to_next.data(), + candidates, + vt, + stats, + level); + } + vt.advance(); + } + } + + return stats; +} + +void HNSWbd::search_level_0( + DistanceComputer& qdis, + int k, + idx_t* idxi, + float* simi, + idx_t nprobe, + const storage_idx_t* nearest_i, + const float* nearest_d, + int search_type, + HNSWStats& search_stats, + VisitedTable& vt) const { + const HNSWbd& hnsw = *this; + + if (search_type == 1) { + int nres = 0; + + for (int j = 0; j < nprobe; j++) { + storage_idx_t cj = nearest_i[j]; + + if (cj < 0) + break; + + if (vt.get(cj)) + continue; + + int candidates_size = std::max(hnsw.efSearch, int(k)); + MinimaxHeap candidates(candidates_size); + + candidates.push(cj, nearest_d[j]); + + nres = 
search_from_candidates( + hnsw, + qdis, + k, + idxi, + simi, + candidates, + vt, + search_stats, + 0, + nres); + } + } else if (search_type == 2) { + int candidates_size = std::max(hnsw.efSearch, int(k)); + candidates_size = std::max(candidates_size, int(nprobe)); + + MinimaxHeap candidates(candidates_size); + for (int j = 0; j < nprobe; j++) { + storage_idx_t cj = nearest_i[j]; + + if (cj < 0) + break; + candidates.push(cj, nearest_d[j]); + } + + search_from_candidates( + hnsw, qdis, k, idxi, simi, candidates, vt, search_stats, 0); + } +} + +void HNSWbd::permute_entries(const idx_t* map) { + // remap levels + storage_idx_t ntotal = levels.size(); + std::vector imap(ntotal); // inverse mapping + // map: new index -> old index + // imap: old index -> new index + for (int i = 0; i < ntotal; i++) { + assert(map[i] >= 0 && map[i] < ntotal); + imap[map[i]] = i; + } + if (entry_point != -1) { + entry_point = imap[entry_point]; + } + std::vector new_levels(ntotal); + std::vector new_offsets(ntotal + 1); + std::vector new_neighbors(neighbors.size()); + size_t no = 0; + for (int i = 0; i < ntotal; i++) { + storage_idx_t o = map[i]; // corresponding "old" index + new_levels[i] = levels[o]; + for (size_t j = offsets[o]; j < offsets[o + 1]; j++) { + storage_idx_t neigh = neighbors[j]; + new_neighbors[no++] = neigh >= 0 ? 
imap[neigh] : neigh; + } + new_offsets[i + 1] = no; + } + assert(new_offsets[ntotal] == offsets[ntotal]); + // swap everyone + std::swap(levels, new_levels); + std::swap(offsets, new_offsets); + std::swap(neighbors, new_neighbors); +} + +/************************************************************** + * MinimaxHeap + **************************************************************/ + +void HNSWbd::MinimaxHeap::push(storage_idx_t i, float v) { + if (k == n) { + if (v >= dis[0]) + return; + if (ids[0] != -1) { + --nvalid; + } + faiss::heap_pop(k--, dis.data(), ids.data()); + } + faiss::heap_push(++k, dis.data(), ids.data(), v, i); + ++nvalid; +} + +float HNSWbd::MinimaxHeap::max() const { + return dis[0]; +} + +int HNSWbd::MinimaxHeap::size() const { + return nvalid; +} + +void HNSWbd::MinimaxHeap::clear() { + nvalid = k = 0; +} + +#ifdef __AVX2__ +int HNSWbd::MinimaxHeap::pop_min(float* vmin_out) { + assert(k > 0); + static_assert( + std::is_same::value, + "This code expects storage_idx_t to be int32_t"); + + int32_t min_idx = -1; + float min_dis = std::numeric_limits::infinity(); + + size_t iii = 0; + + __m256i min_indices = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1); + __m256 min_distances = + _mm256_set1_ps(std::numeric_limits::infinity()); + __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i offset = _mm256_set1_epi32(8); + + // The baseline version is available in non-AVX2 branch. + + // The following loop tracks the rightmost index with the min distance. + // -1 index values are ignored. + const int k8 = (k / 8) * 8; + for (; iii < k8; iii += 8) { + __m256i indices = + _mm256_loadu_si256((const __m256i*)(ids.data() + iii)); + __m256 distances = _mm256_loadu_ps(dis.data() + iii); + + // This mask filters out -1 values among indices. 
+ __m256i m1mask = _mm256_cmpgt_epi32(_mm256_setzero_si256(), indices); + + __m256i dmask = _mm256_castps_si256( + _mm256_cmp_ps(min_distances, distances, _CMP_LT_OS)); + __m256 finalmask = _mm256_castsi256_ps(_mm256_or_si256(m1mask, dmask)); + + const __m256i min_indices_new = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(current_indices), + _mm256_castsi256_ps(min_indices), + finalmask)); + + const __m256 min_distances_new = + _mm256_blendv_ps(distances, min_distances, finalmask); + + min_indices = min_indices_new; + min_distances = min_distances_new; + + current_indices = _mm256_add_epi32(current_indices, offset); + } + + // Vectorizing is doable, but is not practical + int32_t vidx8[8]; + float vdis8[8]; + _mm256_storeu_ps(vdis8, min_distances); + _mm256_storeu_si256((__m256i*)vidx8, min_indices); + + for (size_t j = 0; j < 8; j++) { + if (min_dis > vdis8[j] || (min_dis == vdis8[j] && min_idx < vidx8[j])) { + min_idx = vidx8[j]; + min_dis = vdis8[j]; + } + } + + // process last values. Vectorizing is doable, but is not practical + for (; iii < k; iii++) { + if (ids[iii] != -1 && dis[iii] <= min_dis) { + min_dis = dis[iii]; + min_idx = iii; + } + } + + if (min_idx == -1) { + return -1; + } + + if (vmin_out) { + *vmin_out = min_dis; + } + int ret = ids[min_idx]; + ids[min_idx] = -1; + --nvalid; + return ret; +} + +#else + +// baseline non-vectorized version +int HNSWbd::MinimaxHeap::pop_min(float* vmin_out) { + assert(k > 0); + // returns min. 
This is an O(n) operation + int i = k - 1; + while (i >= 0) { + if (ids[i] != -1) { + break; + } + i--; + } + if (i == -1) { + return -1; + } + int imin = i; + float vmin = dis[i]; + i--; + while (i >= 0) { + if (ids[i] != -1 && dis[i] < vmin) { + vmin = dis[i]; + imin = i; + } + i--; + } + if (vmin_out) { + *vmin_out = vmin; + } + int ret = ids[imin]; + ids[imin] = -1; + --nvalid; + + return ret; +} +#endif + +int HNSWbd::MinimaxHeap::count_below(float thresh) { + int n_below = 0; + for (int i = 0; i < k; i++) { + if (dis[i] < thresh) { + n_below++; + } + } + + return n_below; +} + +} // namespace faiss \ No newline at end of file diff --git a/thirdparty/faiss/faiss/impl/HNSWbd.h b/thirdparty/faiss/faiss/impl/HNSWbd.h new file mode 100644 index 000000000..daaaa401d --- /dev/null +++ b/thirdparty/faiss/faiss/impl/HNSWbd.h @@ -0,0 +1,310 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { + +/** Implementation of the Hierarchical Navigable Small World + * datastructure. + * + * Efficient and robust approximate nearest neighbor search using + * Hierarchical Navigable Small World graphs + * + * Yu. A. Malkov, D. A. Yashunin, arXiv 2017 + * + * This implementation is heavily influenced by the NMSlib + * implementation by Yury Malkov and Leonid Boystov + * (https://github.com/searchivarius/nmslib) + * + * The HNSW object stores only the neighbor link structure, see + * IndexHNSW.h for the full index object. 
+ */ + + +struct HNSW_breakdown_stats { + size_t steps_greedy = + 0; // number of vertices traversing in greedy search in add + size_t steps_iterating_add = + 0; // number of vertices visited in add_neighbors + size_t steps_iterating_search = + 0; // number of vertices visited in searching from candidates + + size_t time_greedy_insert = 0; + size_t time_searching_neighbors_to_add = 0; + size_t time_add_links = 0; + + size_t time_greedy_search = 0; + size_t time_search_from_candidates = 0; + size_t time_dc = 0; + size_t time_dc_linking = 0; + size_t step_linking =0; + size_t step_before_shrinking=0; + HNSW_breakdown_stats() = default; + //std::string filename="hnswbd.csv"; + void reset() { + steps_greedy = 0; + steps_iterating_add = 0; + steps_iterating_search = 0; + time_greedy_insert = 0; + time_searching_neighbors_to_add = 0; + time_add_links = 0; + time_greedy_search = 0; + time_search_from_candidates = 0; + time_dc = 0; + time_dc_linking = 0; + step_before_shrinking = 0; + step_linking = 0; + } + + + void print() { + std::cout << steps_greedy << ","; + std::cout << steps_iterating_add << ","; + std::cout << steps_iterating_search << ","; + + std::cout << time_greedy_insert << ","; + std::cout << time_searching_neighbors_to_add << ","; + std::cout << time_add_links << ","; + + std::cout << time_greedy_search << ","; + std::cout << time_search_from_candidates << ","; + std::cout< Node; + + mutable struct HNSW_breakdown_stats bd_stat; + + /** Heap structure that allows fast + */ + struct MinimaxHeap { + int n; + int k; + int nvalid; + + std::vector ids; + std::vector dis; + typedef faiss::CMax HC; + + explicit MinimaxHeap(int n) : n(n), k(0), nvalid(0), ids(n), dis(n) {} + + void push(storage_idx_t i, float v); + + float max() const; + + int size() const; + + void clear(); + + int pop_min(float* vmin_out = nullptr); + + int count_below(float thresh); + }; + + /// to sort pairs of (id, distance) from nearest to fathest or the reverse + struct NodeDistCloser { + 
float d; + int id; + NodeDistCloser(float d, int id) : d(d), id(id) {} + bool operator<(const NodeDistCloser& obj1) const { + return d < obj1.d; + } + }; + + struct NodeDistFarther { + float d; + int id; + NodeDistFarther(float d, int id) : d(d), id(id) {} + bool operator<(const NodeDistFarther& obj1) const { + return d > obj1.d; + } + }; + + /// assignment probability to each layer (sum=1) + std::vector assign_probas; + + /// number of neighbors stored per layer (cumulative), should not + /// be changed after first add + std::vector cum_nneighbor_per_level; + + /// level of each vector (base level = 1), size = ntotal + std::vector levels; + + /// offsets[i] is the offset in the neighbors array where vector i is stored + /// size ntotal + 1 + std::vector offsets; + + /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i + /// for all levels. this is where all storage goes. + std::vector neighbors; + + /// entry point in the search structure (one of the points with maximum + /// level + storage_idx_t entry_point = -1; + + faiss::RandomGenerator rng; + + /// maximum level + int max_level = -1; + + /// expansion factor at construction time + int efConstruction = 40; + + /// expansion factor at search time + int efSearch = 16; + + int M_ = 32; + + /// during search: do we check whether the next best distance is good + /// enough? + bool check_relative_distance = true; + + /// number of entry points in levels > 0. 
+ int upper_beam = 1; + + /// use bounded queue during exploration + bool search_bounded_queue = true; + + // methods that initialize the tree sizes + + /// initialize the assign_probas and cum_nneighbor_per_level to + /// have 2*M links on level 0 and M links on levels > 0 + void set_default_probas(int M, float levelMult); + + /// set nb of neighbors for this level (before adding anything) + void set_nb_neighbors(int level_no, int n); + + // methods that access the tree sizes + + /// nb of neighbors for this level + int nb_neighbors(int layer_no) const; + + /// cumumlative nb up to (and excluding) this level + int cum_nb_neighbors(int layer_no) const; + + /// range of entries in the neighbors table of vertex no at layer_no + void neighbor_range(idx_t no, int layer_no, size_t* begin, size_t* end) + const; + + /// only mandatory parameter: nb of neighbors + explicit HNSWbd(int M = 32); + + /// pick a random level for a new point + int random_level(); + + /// add n random levels to table (for debugging...) + void fill_with_random_links(size_t n); + + void add_links_starting_from( + DistanceComputer& ptdis, + storage_idx_t pt_id, + storage_idx_t nearest, + float d_nearest, + int level, + omp_lock_t* locks, + VisitedTable& vt); + + /** add point pt_id on all levels <= pt_level and build the link + * structure for them. 
*/ + void add_with_locks( + DistanceComputer& ptdis, + int pt_level, + int pt_id, + std::vector& locks, + VisitedTable& vt); + + /// search interface for 1 point, single thread + HNSWStats search( + DistanceComputer& qdis, + int k, + idx_t* I, + float* D, + VisitedTable& vt, + const SearchParametersHNSW* params = nullptr) const; + + /// search only in level 0 from a given vertex + void search_level_0( + DistanceComputer& qdis, + int k, + idx_t* idxi, + float* simi, + idx_t nprobe, + const storage_idx_t* nearest_i, + const float* nearest_d, + int search_type, + HNSWStats& search_stats, + VisitedTable& vt) const; + + void reset(); + + void clear_neighbor_tables(int level); + void print_neighbor_stats(int level) const; + + int prepare_level_tab(size_t n, bool preset_levels = false); + + static void shrink_neighbor_list( + DistanceComputer& qdis, + std::priority_queue& input, + std::vector& output, + int max_size, + struct HNSW_breakdown_stats& bd_stats); + + void permute_entries(const idx_t* map); +}; + + + +} // namespace faiss \ No newline at end of file