Skip to content

Commit a1a4a06

Browse files
Setup Java Code Coverage and add more tests (#42)
Fixes #40 Authors: - Vivek Narang (https://github.com/narangvivek10) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Bradley Dice (https://github.com/bdice) URL: #42
1 parent 313dd7e commit a1a4a06

12 files changed

+1252
-6
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
target
66
**/.DS_Store
77
cuvs-workdir
8+
bin

build.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ if ! hasArg --run-java-tests; then
3838
MAVEN_VERIFY_ARGS=("-DskipTests")
3939
fi
4040

41-
mvn verify "${MAVEN_VERIFY_ARGS[@]}" \
41+
mvn clean verify "${MAVEN_VERIFY_ARGS[@]}" \
4242
&& mvn install:install-file -Dfile=./target/cuvs-lucene-$VERSION.jar -DgroupId=$GROUP_ID -DartifactId=cuvs-lucene -Dversion=$VERSION -Dpackaging=jar \
4343
&& cp pom.xml ./target/
44+
45+
# Generate the JaCoCo code coverage report (written to target/site/jacoco/index.html)
46+
if hasArg --run-java-tests; then
47+
mvn jacoco:report
48+
fi

pom.xml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,25 @@
165165
</execution>
166166
</executions>
167167
</plugin>
168+
<plugin>
169+
<groupId>org.jacoco</groupId>
170+
<artifactId>jacoco-maven-plugin</artifactId>
171+
<version>0.8.14</version>
172+
<executions>
173+
<execution>
174+
<goals>
175+
<goal>prepare-agent</goal>
176+
</goals>
177+
</execution>
178+
<execution>
179+
<id>report</id>
180+
<phase>prepare-package</phase>
181+
<goals>
182+
<goal>report</goal>
183+
</goals>
184+
</execution>
185+
</executions>
186+
</plugin>
168187
</plugins>
169188
</build>
170189
</project>
Lines changed: 332 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package com.nvidia.cuvs.lucene;
6+
7+
import static com.nvidia.cuvs.lucene.TestUtils.generateDataset;
8+
import static com.nvidia.cuvs.lucene.TestUtils.generateRandomVector;
9+
10+
import java.io.IOException;
11+
import java.util.ArrayList;
12+
import java.util.HashSet;
13+
import java.util.List;
14+
import java.util.Random;
15+
import java.util.Set;
16+
import java.util.logging.Logger;
17+
import org.apache.lucene.codecs.Codec;
18+
import org.apache.lucene.document.Document;
19+
import org.apache.lucene.document.Field;
20+
import org.apache.lucene.document.KnnFloatVectorField;
21+
import org.apache.lucene.document.StringField;
22+
import org.apache.lucene.index.DirectoryReader;
23+
import org.apache.lucene.index.IndexWriter;
24+
import org.apache.lucene.index.IndexWriterConfig;
25+
import org.apache.lucene.index.Term;
26+
import org.apache.lucene.index.VectorSimilarityFunction;
27+
import org.apache.lucene.search.IndexSearcher;
28+
import org.apache.lucene.search.KnnFloatVectorQuery;
29+
import org.apache.lucene.search.Query;
30+
import org.apache.lucene.search.ScoreDoc;
31+
import org.apache.lucene.search.TermQuery;
32+
import org.apache.lucene.search.TopDocs;
33+
import org.apache.lucene.store.Directory;
34+
import org.apache.lucene.tests.analysis.MockAnalyzer;
35+
import org.apache.lucene.tests.analysis.MockTokenizer;
36+
import org.apache.lucene.tests.index.RandomIndexWriter;
37+
import org.apache.lucene.tests.util.LuceneTestCase;
38+
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
39+
import org.apache.lucene.tests.util.TestUtil;
40+
import org.junit.BeforeClass;
41+
import org.junit.Test;
42+
43+
@SuppressSysoutChecks(bugUrl = "")
44+
public class TestAcceleratedHNSWDeletedDocuments extends LuceneTestCase {
45+
46+
protected static Logger log =
47+
Logger.getLogger(TestAcceleratedHNSWDeletedDocuments.class.getName());
48+
49+
static final Codec codec =
50+
TestUtil.alwaysKnnVectorsFormat(new Lucene99AcceleratedHNSWVectorsFormat());
51+
private static Random random;
52+
53+
@BeforeClass
54+
public static void beforeClass() throws Exception {
55+
assumeTrue("cuVS not supported", Lucene99AcceleratedHNSWVectorsFormat.supported());
56+
random = random();
57+
}
58+
59+
@Test
60+
public void testVectorSearchWithDeletedDocuments() throws IOException {
61+
62+
try (Directory directory = newDirectory()) {
63+
int datasetSize = random.nextInt(200, 1000); // 200-1200 documents
64+
int dimensions = random.nextInt(64, 256); // 64-320 dimensions
65+
int topK = Math.min(random.nextInt(20) + 5, datasetSize / 2); // 5-25 results
66+
float deletionProbability = random.nextFloat() * 0.4f + 0.1f; // 10-50% deletion rate
67+
68+
float[][] dataset = generateDataset(random, datasetSize, dimensions);
69+
Set<Integer> deletedDocs = new HashSet<>();
70+
71+
// Create index with all documents having vectors
72+
try (RandomIndexWriter writer = createWriter(directory)) {
73+
for (int i = 0; i < datasetSize; i++) {
74+
Document doc = new Document();
75+
doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
76+
doc.add(
77+
new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN));
78+
writer.addDocument(doc);
79+
}
80+
81+
// Delete documents randomly based on probability
82+
for (int i = 0; i < datasetSize; i++) {
83+
if (random.nextFloat() < deletionProbability) {
84+
writer.deleteDocuments(new Term("id", String.valueOf(i)));
85+
deletedDocs.add(i);
86+
}
87+
}
88+
writer.commit();
89+
}
90+
91+
// Search and verify deleted documents are not returned
92+
try (DirectoryReader reader = DirectoryReader.open(directory)) {
93+
IndexSearcher searcher = newSearcher(reader);
94+
// Use a random vector for query
95+
float[] queryVector = generateRandomVector(dimensions, random);
96+
97+
Query query = new KnnFloatVectorQuery("vector", queryVector, topK);
98+
ScoreDoc[] hits = searcher.search(query, topK).scoreDocs;
99+
100+
// Verify we got results
101+
assertTrue("Should have search results", hits.length > 0);
102+
103+
// Verify no deleted documents in results
104+
for (ScoreDoc hit : hits) {
105+
String docId = reader.storedFields().document(hit.doc).get("id");
106+
int id = Integer.parseInt(docId);
107+
assertFalse(
108+
"Deleted document " + id + " should not appear in results", deletedDocs.contains(id));
109+
log.info("Found non-deleted document: " + id + ", Score: " + hit.score);
110+
}
111+
112+
// Verify deleted documents are truly deleted
113+
for (int deletedId : deletedDocs) {
114+
TopDocs result =
115+
searcher.search(new TermQuery(new Term("id", String.valueOf(deletedId))), 1);
116+
assertEquals(
117+
"Deleted document " + deletedId + " should not be found",
118+
0,
119+
result.totalHits.value());
120+
}
121+
}
122+
}
123+
}
124+
125+
@Test
126+
public void testVectorSearchWithMixedDeletedAndMissingVectors() throws IOException {
127+
128+
try (Directory directory = newDirectory()) {
129+
int datasetSize = random.nextInt(200) + 50; // 50-250 documents
130+
int dimensions = random.nextInt(256) + 64; // 64-320 dimensions
131+
int topK = Math.min(random.nextInt(20) + 5, datasetSize / 2); // 5-25 results
132+
float vectorProbability = random.nextFloat() * 0.5f + 0.3f; // 30-80% have vectors
133+
float deletionProbability = random.nextFloat() * 0.3f + 0.1f; // 10-40% deletion rate
134+
135+
float[][] dataset = generateDataset(random, datasetSize, dimensions);
136+
Set<Integer> docsWithoutVectors = new HashSet<>();
137+
Set<Integer> deletedDocs = new HashSet<>();
138+
139+
// Create index with mixed documents
140+
try (RandomIndexWriter writer = createWriter(directory)) {
141+
for (int i = 0; i < datasetSize; i++) {
142+
Document doc = new Document();
143+
doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
144+
// Randomly assign categories
145+
String category = random.nextBoolean() ? "A" : "B";
146+
doc.add(new StringField("category", category, Field.Store.YES));
147+
148+
// Randomly decide whether to add vectors
149+
if (random.nextFloat() < vectorProbability) {
150+
doc.add(
151+
new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN));
152+
} else {
153+
docsWithoutVectors.add(i);
154+
}
155+
writer.addDocument(doc);
156+
}
157+
158+
// Delete documents randomly
159+
for (int i = 0; i < datasetSize; i++) {
160+
if (random.nextFloat() < deletionProbability) {
161+
writer.deleteDocuments(new Term("id", String.valueOf(i)));
162+
deletedDocs.add(i);
163+
}
164+
}
165+
writer.commit();
166+
}
167+
168+
// Test vector search behavior
169+
try (DirectoryReader reader = DirectoryReader.open(directory)) {
170+
IndexSearcher searcher = newSearcher(reader);
171+
float[] queryVector = generateRandomVector(dimensions, random);
172+
173+
Query query = new KnnFloatVectorQuery("vector", queryVector, topK);
174+
ScoreDoc[] hits = searcher.search(query, topK).scoreDocs;
175+
176+
// Verify results
177+
for (ScoreDoc hit : hits) {
178+
String docId = reader.storedFields().document(hit.doc).get("id");
179+
int id = Integer.parseInt(docId);
180+
assertFalse("Deleted document should not appear", deletedDocs.contains(id));
181+
assertFalse("Document without vector should not appear", docsWithoutVectors.contains(id));
182+
log.info("Found document with vector: " + id + ", Score: " + hit.score);
183+
}
184+
185+
// Test filtered search with deletions
186+
Query filter = new TermQuery(new Term("category", "A"));
187+
Query filteredQuery = new KnnFloatVectorQuery("vector", queryVector, topK, filter);
188+
ScoreDoc[] filteredHits = searcher.search(filteredQuery, topK).scoreDocs;
189+
190+
for (ScoreDoc hit : filteredHits) {
191+
Document doc = reader.storedFields().document(hit.doc);
192+
String category = doc.get("category");
193+
assertEquals("Should only match category A", "A", category);
194+
int id = Integer.parseInt(doc.get("id"));
195+
assertFalse(
196+
"Deleted document should not appear in filtered results", deletedDocs.contains(id));
197+
}
198+
}
199+
}
200+
}
201+
202+
@Test
203+
public void testVectorSearchAfterAllDocumentsDeleted() throws IOException {
204+
205+
try (Directory directory = newDirectory()) {
206+
int datasetSize = random.nextInt(20) + 5; // 5-25 documents for this test
207+
int dimensions = random.nextInt(128) + 32; // 32-160 dimensions
208+
int topK = Math.min(random.nextInt(10) + 5, datasetSize); // 5-15 results
209+
210+
float[][] dataset = generateDataset(random, datasetSize, dimensions);
211+
212+
// Create and delete all documents
213+
try (IndexWriter writer = new IndexWriter(directory, createWriterConfig())) {
214+
for (int i = 0; i < datasetSize; i++) {
215+
Document doc = new Document();
216+
doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
217+
doc.add(
218+
new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN));
219+
writer.addDocument(doc);
220+
}
221+
writer.commit();
222+
223+
// Delete all documents
224+
for (int i = 0; i < datasetSize; i++) {
225+
writer.deleteDocuments(new Term("id", String.valueOf(i)));
226+
}
227+
writer.commit();
228+
writer.forceMerge(1); // Force merge to apply deletions
229+
}
230+
231+
// Verify search returns no results
232+
try (DirectoryReader reader = DirectoryReader.open(directory)) {
233+
IndexSearcher searcher = newSearcher(reader);
234+
float[] queryVector = generateRandomVector(dimensions, random);
235+
236+
Query query = new KnnFloatVectorQuery("vector", queryVector, topK);
237+
TopDocs results = searcher.search(query, topK);
238+
239+
assertEquals(
240+
"Should return no results when all documents are deleted",
241+
0,
242+
results.totalHits.value());
243+
}
244+
}
245+
}
246+
247+
@Test
248+
public void testVectorSearchWithPartialDeletionAndReindexing() throws IOException {
249+
250+
try (Directory directory = newDirectory()) {
251+
int datasetSize = random.nextInt(200) + 50; // 50-250 documents
252+
int dimensions = random.nextInt(256) + 64; // 64-320 dimensions
253+
int topK = Math.min(random.nextInt(20) + 5, datasetSize / 2); // 5-25 results
254+
float deletionProbability = random.nextFloat() * 0.3f + 0.1f; // 10-40% deletion rate
255+
256+
float[][] dataset = generateDataset(random, datasetSize, dimensions);
257+
List<Integer> activeDocIds = new ArrayList<>();
258+
259+
// Initial indexing
260+
try (IndexWriter writer = new IndexWriter(directory, createWriterConfig())) {
261+
int initialDocs = datasetSize / 2 + random.nextInt(datasetSize / 4); // 50-75% of dataset
262+
for (int i = 0; i < initialDocs; i++) {
263+
Document doc = new Document();
264+
doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
265+
doc.add(
266+
new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN));
267+
writer.addDocument(doc);
268+
activeDocIds.add(i);
269+
}
270+
271+
// Delete some documents randomly
272+
List<Integer> candidatesForDeletion = new ArrayList<>(activeDocIds);
273+
for (int docId : candidatesForDeletion) {
274+
if (random.nextFloat() < deletionProbability) {
275+
writer.deleteDocuments(new Term("id", String.valueOf(docId)));
276+
activeDocIds.remove(Integer.valueOf(docId));
277+
}
278+
}
279+
280+
// Add new documents with higher IDs
281+
for (int i = initialDocs; i < datasetSize; i++) {
282+
Document doc = new Document();
283+
doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
284+
doc.add(
285+
new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN));
286+
writer.addDocument(doc);
287+
activeDocIds.add(i);
288+
}
289+
writer.commit();
290+
}
291+
292+
// Verify search behavior after deletions and additions
293+
try (DirectoryReader reader = DirectoryReader.open(directory)) {
294+
IndexSearcher searcher = newSearcher(reader);
295+
float[] queryVector = generateRandomVector(dimensions, random);
296+
297+
Query query = new KnnFloatVectorQuery("vector", queryVector, topK);
298+
ScoreDoc[] hits = searcher.search(query, topK).scoreDocs;
299+
300+
Set<Integer> resultIds = new HashSet<>();
301+
for (ScoreDoc hit : hits) {
302+
String docId = reader.storedFields().document(hit.doc).get("id");
303+
int id = Integer.parseInt(docId);
304+
resultIds.add(id);
305+
assertTrue("Result should be from active documents", activeDocIds.contains(id));
306+
}
307+
308+
log.info(
309+
"Search returned "
310+
+ hits.length
311+
+ " results from "
312+
+ activeDocIds.size()
313+
+ " active documents");
314+
}
315+
}
316+
}
317+
318+
private RandomIndexWriter createWriter(Directory directory) throws IOException {
319+
return new RandomIndexWriter(
320+
random(),
321+
directory,
322+
newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
323+
.setCodec(codec)
324+
.setMergePolicy(newTieredMergePolicy()));
325+
}
326+
327+
private IndexWriterConfig createWriterConfig() {
328+
return newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
329+
.setCodec(codec)
330+
.setMergePolicy(newTieredMergePolicy());
331+
}
332+
}

0 commit comments

Comments
 (0)