Skip to content

removing check in TextToVectorUpdateProcessorFactory which brakes update for new dynamic fields #3426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions solr/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,8 @@ Bug Fixes

* SOLR-17792: Fix deadlocks in ParallelHttpShardHandler, re-implement synchronization in HttpShardHandler (Houston Putman)

* GITHUB#3426: Improving check in TextToVectorUpdateProcessorFactory, which breaks update for new dynamic fields. (Renato Haeberli via Alessandro Benedetti)

Dependency Upgrades
---------------------
* SOLR-17795: Upgrade Lucene to 9.12.2. (Pierre Salagnac, Christine Poerschke, Houston Putman)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,28 @@

import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.model.embedding.EmbeddingModel;
import java.lang.invoke.MethodHandles;
import java.lang.reflect.Method;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.llm.textvectorisation.store.TextToVectorModelException;
import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This object wraps a {@link dev.langchain4j.model.embedding.EmbeddingModel} to encode text to
* vector. It's meant to be used as a managed resource with the {@link
* ManagedTextToVectorModelStore}
*/
public class SolrTextToVectorModel implements Accountable {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final long BASE_RAM_BYTES =
RamUsageEstimator.shallowSizeOfInstance(SolrTextToVectorModel.class);
private static final String TIMEOUT_PARAM = "timeout";
Expand Down Expand Up @@ -80,41 +85,40 @@ public static SolrTextToVectorModel getInstance(
* support, some of them may require to be handled in here as separate switch cases
*/
switch (paramName) {
case TIMEOUT_PARAM:
case TIMEOUT_PARAM -> {
Duration timeOut = Duration.ofSeconds((Long) params.get(paramName));
builder.getClass().getMethod(paramName, Duration.class).invoke(builder, timeOut);
break;
case MAX_SEGMENTS_PER_BATCH_PARAM:
builder
.getClass()
.getMethod(paramName, Integer.class)
.invoke(builder, ((Long) params.get(paramName)).intValue());
break;
case MAX_RETRIES_PARAM:
builder
.getClass()
.getMethod(paramName, Integer.class)
.invoke(builder, ((Long) params.get(paramName)).intValue());
break;
}
case MAX_SEGMENTS_PER_BATCH_PARAM, MAX_RETRIES_PARAM -> builder
.getClass()
.getMethod(paramName, Integer.class)
.invoke(builder, ((Long) params.get(paramName)).intValue());

/*
* For primitive params if there's only one setter available, we call it.
* If there's choice we default to the string one
*/
default:
default -> {
ArrayList<Method> paramNameMatches = new ArrayList<>();
for (var method : builder.getClass().getMethods()) {
if (paramName.equals(method.getName()) && method.getParameterCount() == 1) {
paramNameMatches.add(method);
}
}
if (paramNameMatches.size() == 1) {
paramNameMatches.get(0).invoke(builder, params.get(paramName));
paramNameMatches.getFirst().invoke(builder, params.get(paramName));
} else {
builder
.getClass()
.getMethod(paramName, String.class)
.invoke(builder, params.get(paramName).toString());
try {
builder
.getClass()
.getMethod(paramName, String.class)
.invoke(builder, params.get(paramName).toString());
} catch (NoSuchMethodException e) {
log.error("Parameter {} not supported by model {}", paramName, className);
Copy link
Contributor Author

@renatoh renatoh Jul 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Implementing my own DimensionAwareEmbeddingModel, I notice that if the attribute is not supported by the model, the error was not logged at all and I had to debug to see what the issue was.
I also took the liberty tot clean up the duplicate code snippets.

throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
}
}
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,16 @@ public void init(final NamedList<?> args) {
public UpdateRequestProcessor getInstance(
SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
IndexSchema latestSchema = req.getCore().getLatestSchema();
if (!latestSchema.hasExplicitField(inputField)) {

if (!latestSchema.isDynamicField(inputField) && !latestSchema.hasExplicitField(inputField)) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "undefined field: \"" + inputField + "\"");
}
if (!latestSchema.hasExplicitField(outputField)) {

if (!latestSchema.isDynamicField(outputField) && !latestSchema.hasExplicitField(outputField)) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "undefined field: \"" + outputField + "\"");
}

final SchemaField outputFieldSchema = latestSchema.getField(outputField);
assertIsDenseVectorField(outputFieldSchema);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
<fieldType name="high_dimensional_float_knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" vectorEncoding="FLOAT32"/>
<fieldType name="high_dimensional_byte_knn_vector" class="solr.DenseVectorField" vectorDimension="2048" similarityFunction="cosine" vectorEncoding="BYTE"/>
<fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<dynamicField name="*_vector1024" type="knn_vector" indexed="true" stored="true" />

<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="vector" type="knn_vector" indexed="true" stored="true"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.llm.TestLlmBase;
import org.apache.solr.llm.textvectorisation.model.SolrTextToVectorModel;
import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

Expand All @@ -38,6 +43,18 @@ public static void cleanup() throws Exception {
afterTest();
}

SolrCore collection1;

@Before
public void setup() {
collection1 = solrClientTestRule.getCoreContainer().getCore("collection1");
}

@After
public void after() {
collection1.close();
}

@Test
public void init_fullArgs_shouldInitAllParams() {
NamedList<String> args = new NamedList<>();
Expand Down Expand Up @@ -100,13 +117,11 @@ public void init_notExistentOutputField_shouldThrowExceptionWithDetailedMessage(
TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();

ModifiableSolrParams params = new ModifiableSolrParams();
SolrCore collection1 = solrClientTestRule.getCoreContainer().getCore("collection1");
SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {};
factoryToTest.init(args);
SolrException e =
assertThrows(SolrException.class, () -> factoryToTest.getInstance(req, null, null));
assertEquals("undefined field: \"notExistentOutput\"", e.getMessage());
collection1.close();
}

/* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
Expand All @@ -120,14 +135,12 @@ public void init_notDenseVectorOutputField_shouldThrowExceptionWithDetailedMessa
TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();

ModifiableSolrParams params = new ModifiableSolrParams();
SolrCore collection1 = solrClientTestRule.getCoreContainer().getCore("collection1");
SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {};
factoryToTest.init(args);
SolrException e =
assertThrows(SolrException.class, () -> factoryToTest.getInstance(req, null, null));
assertEquals(
"only DenseVectorField is compatible with Vector Query Parsers: _text_", e.getMessage());
collection1.close();
}

/* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
Expand All @@ -141,12 +154,51 @@ public void init_notExistentInputField_shouldThrowExceptionWithDetailedMessage()
TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();

ModifiableSolrParams params = new ModifiableSolrParams();
SolrCore collection1 = solrClientTestRule.getCoreContainer().getCore("collection1");
SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {};
factoryToTest.init(args);
SolrException e =
assertThrows(SolrException.class, () -> factoryToTest.getInstance(req, null, null));
assertEquals("undefined field: \"notExistentInput\"", e.getMessage());
collection1.close();
}

/* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
@Test
public void init_notExistentDynamicInputField_shouldNotThrowException() {
String inputFieldName = "text_s";
String outputFieldName = "vector";

UpdateRequestProcessor instance =
createUpdateProcessor(inputFieldName, outputFieldName, collection1, "model1");
assertNotNull(instance);
}

/* Following test depends on a real solr schema and depends on BeforeClass-AfterClass methods */
@Test
public void init_notExistingDynamicOutputField_shouldNotThrowException() {
String inputFieldName = "_text_";
String outputFieldName = "vec_vector1024";

UpdateRequestProcessor instance =
createUpdateProcessor(inputFieldName, outputFieldName, collection1, "model2");
assertNotNull(instance);
}

private UpdateRequestProcessor createUpdateProcessor(
String inputFieldName, String outputFieldName, SolrCore collection1, String modelName) {
NamedList<String> args = new NamedList<>();

ManagedTextToVectorModelStore.getManagedModelStore(collection1)
.addModel(new SolrTextToVectorModel(modelName, null, null));
args.add("inputField", inputFieldName);
args.add("outputField", outputFieldName);
args.add("model", modelName);

TextToVectorUpdateProcessorFactory factoryToTest = new TextToVectorUpdateProcessorFactory();
ModifiableSolrParams params = new ModifiableSolrParams();
factoryToTest.init(args);

SolrQueryRequestBase req = new SolrQueryRequestBase(collection1, params) {};

return factoryToTest.getInstance(req, null, null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.llm.TestLlmBase;
import org.apache.solr.llm.textvectorisation.store.rest.ManagedTextToVectorModelStore;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
Expand All @@ -40,6 +41,13 @@ public static void cleanup() throws Exception {
afterTest();
}

@After
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the deletion within the test method will not happen if a test fails, such a failing test was then impacting all following test since the deletion was not done and the setup is done only once for the test-class

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes sense, have you checked that all is fine? I remember I struggled a bit with the after/afterClass

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, now with the @after, even if I make one test fail on purpose, the result is consistent across 5 test runs. Without the @after one failing test has unwanted side effect and other test of the class are randomly failing due to the data which got not cleaned up.

public void afterEachTest() throws Exception {
restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT + "/dummy-1");
restTestHarness.delete(
ManagedTextToVectorModelStore.REST_END_POINT + "/exception-throwing-model"); // clean
}

@Test
public void processAdd_inputField_shouldVectoriseInputField() throws Exception {
loadModel("dummy-model.json"); // preparation
Expand All @@ -50,10 +58,7 @@ public void processAdd_inputField_shouldVectoriseInputField() throws Exception {
"textToVector");
assertU(commit());

final String solrQuery = "*:*";
final SolrQuery query = new SolrQuery();
query.setQuery(solrQuery);
query.add("fl", "id,vector");
final SolrQuery query = getSolrQuery();

assertJQ(
"/query" + query.toQueryString(),
Expand All @@ -66,6 +71,14 @@ public void processAdd_inputField_shouldVectoriseInputField() throws Exception {
restTestHarness.delete(ManagedTextToVectorModelStore.REST_END_POINT + "/dummy-1"); // clean up
}

private SolrQuery getSolrQuery() {
final String solrQuery = "*:*";
final SolrQuery query = new SolrQuery();
query.setQuery(solrQuery);
query.add("fl", "id,vector");
return query;
}

/*
This test looks for the 'dummy-1' model, but such model is not loaded, the model store is empty, so the update fails
*/
Expand Down Expand Up @@ -93,10 +106,7 @@ public void processAdd_emptyInputField_shouldLogAndIndexWithNoVector() throws Ex
addWithChain(sdoc("id", "98", "_text_", "Vegeta is the saiyan prince."), "textToVector");
assertU(commit());

final String solrQuery = "*:*";
final SolrQuery query = new SolrQuery();
query.setQuery(solrQuery);
query.add("fl", "id,vector");
final SolrQuery query = getSolrQuery();

assertJQ(
"/query" + query.toQueryString(),
Expand All @@ -116,10 +126,7 @@ public void processAdd_nullInputField_shouldLogAndIndexWithNoVector() throws Exc
assertU(adoc("id", "98"));
assertU(commit());

final String solrQuery = "*:*";
final SolrQuery query = new SolrQuery();
query.setQuery(solrQuery);
query.add("fl", "id,vector");
final SolrQuery query = getSolrQuery();

assertJQ(
"/query" + query.toQueryString(),
Expand All @@ -141,10 +148,7 @@ public void processAdd_failingVectorisation_shouldLogAndIndexWithNoVector() thro
"failingTextToVector");
assertU(commit());

final String solrQuery = "*:*";
final SolrQuery query = new SolrQuery();
query.setQuery(solrQuery);
query.add("fl", "id,vector");
final SolrQuery query = getSolrQuery();

assertJQ(
"/query" + query.toQueryString(),
Expand Down
Loading