
Commit 7a47088

honor expected outputs column being optional
1 parent 91ee6af

File tree: 2 files changed (+34, −27)


ddtrace/llmobs/experimentation/_dataset.py

Lines changed: 29 additions & 20 deletions
```diff
@@ -222,10 +222,10 @@ def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> N
                 f"got {sorted(new_keys)}"
             )
 
-        required_keys = {'input', 'expected_output'}
+        required_keys = {'input'}
         if not required_keys.issubset(first_row_keys):
             missing = required_keys - first_row_keys
-            raise ValueError(f"Records must contain 'input' and 'expected_output' fields. Missing: {missing}")
+            raise ValueError(f"Records must contain the 'input' field")
 
         # Validate consistency within new data
         for row in data:
```
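With this hunk, only `input` is mandatory; records that omit `expected_output` now pass validation. A minimal standalone sketch of the relaxed check (the rows here are hypothetical, not taken from the codebase):

```python
# Sketch of the relaxed requirement: only 'input' is mandatory.
required_keys = {"input"}

row_with_output = {"input": "What is 2 + 2?", "expected_output": "4"}
row_without_output = {"input": "Summarize this paragraph."}

for row in (row_with_output, row_without_output):
    if not required_keys.issubset(row.keys()):
        raise ValueError("Records must contain the 'input' field")
# Both rows validate; previously the second would have raised.
```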
```diff
@@ -363,7 +363,10 @@ def _prepare_batch_payload(self, overwrite: bool) -> Dict[str, Any]:
         if self._changes['added']:
             insert_records = []
             for record in self._changes['added']:
-                new_record = {"input": record["input"], "expected_output": record["expected_output"]}
+                new_record = {"input": record["input"]}
+                if record.get("expected_output"):
+                    new_record["expected_output"] = record["expected_output"]
+
                 metadata = {k: v for k, v in record.items() if k not in ["input", "expected_output", "record_id"]}
                 if metadata:
                     new_record["metadata"] = metadata
```
```diff
@@ -539,8 +542,10 @@ def _build_insert_record(record: Dict[str, Any]) -> Dict[str, Any]:
     """Convert an internal record representation into the *insert_records* payload format."""
     new_rec = {
         "input": record["input"],
-        "expected_output": record["expected_output"],
+
     }
+    if record.get("expected_output"):
+        new_rec["expected_output"] = record["expected_output"]
     metadata = {k: v for k, v in record.items() if k not in ["input", "expected_output", "record_id"]}
     if metadata:
         new_rec["metadata"] = metadata
```
```diff
@@ -556,7 +561,7 @@ def _build_update_record(old: Dict[str, Any], new: Dict[str, Any]) -> Dict[str,
     if old.get("input") != new.get("input"):
         upd["input"] = new["input"]
     if old.get("expected_output") != new.get("expected_output"):
-        upd["expected_output"] = new["expected_output"]
+        upd["expected_output"] = new.get("expected_output")
     # Diff metadata.
     old_meta = {k: v for k, v in old.items() if k not in ["input", "expected_output", "record_id"]}
     new_meta = {k: v for k, v in new.items() if k not in ["input", "expected_output", "record_id"]}
```
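The switch to `new.get("expected_output")` matters when an update removes the field entirely: indexing would raise `KeyError`, while `.get()` yields `None`. A two-record illustration (hypothetical data):

```python
old = {"input": "q", "expected_output": "a"}
new = {"input": "q"}  # expected_output was removed locally

upd = {}
if old.get("expected_output") != new.get("expected_output"):
    upd["expected_output"] = new.get("expected_output")  # None, not a KeyError
print(upd)  # {'expected_output': None}
```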
```diff
@@ -605,7 +610,7 @@ def _send_batch_updates(
             attrs["update_records"] = update_records
         if delete_records:
             attrs["delete_records"] = delete_records
-
+
         # Use create_new_version for first chunk, then overwrite=True for subsequent chunks
         # to append to the version established by the first chunk
         if idx == 0:
```
```diff
@@ -657,12 +662,11 @@ def from_csv(
             Dataset: A new Dataset instance containing the CSV data, structured for LLM experiments.
 
         Raises:
-            ValueError: If input_columns or expected_output_columns are not provided,
-                or if the CSV is missing those columns, or if the file is empty.
+            ValueError: If input_columns is not provided, or if the CSV is missing those columns, or if the file is empty.
             DatasetFileError: If there are issues reading the CSV file (e.g., file not found, permission error, malformed).
         """
-        if input_columns is None or expected_output_columns is None:
-            raise ValueError("`input_columns` and `expected_output_columns` must be provided.")
+        if input_columns is None:
+            raise ValueError("`input_columns` must be provided.")
 
         data = []
         try:
```
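With the relaxed precondition, supplying only `input_columns` is now a valid call. A hedged usage sketch, assuming the `dne` module alias used in the test file below and a hypothetical questions.csv with a `question` column:

```python
# `dne` is the experimentation module alias from the tests; the CSV file is hypothetical.
ds = dne.Dataset.from_csv("questions.csv", name="questions-only", input_columns=["question"])

# Omitting input_columns still raises, even when expected_output_columns is given:
# ValueError: `input_columns` must be provided.
```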
```diff
@@ -683,7 +687,9 @@ def from_csv(
 
                 header_columns = reader.fieldnames
                 missing_input_columns = [col for col in input_columns if col not in header_columns]
-                missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
+                missing_output_columns = False
+                if expected_output_columns is not None:
+                    missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
 
                 if missing_input_columns:
                     raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}")
```
```diff
@@ -698,7 +704,7 @@ def from_csv(
 
             # Determine metadata columns (all columns not used for input or expected output)
             metadata_columns = [
-                col for col in header_columns if col not in input_columns and col not in expected_output_columns
+                col for col in header_columns if col not in input_columns and (expected_output_columns is not None and col not in expected_output_columns)
             ]
 
             for row in rows:
```
```diff
@@ -713,7 +719,9 @@ def from_csv(
 
                 try:
                     input_data = row[input_columns[0]] if len(input_columns) == 1 else {col: row[col] for col in input_columns}
-                    expected_output_data = row[expected_output_columns[0]] if len(expected_output_columns) == 1 else {col: row[col] for col in expected_output_columns}
+                    expected_output_data = None
+                    if expected_output_columns is not None and len(expected_output_columns) > 0:
+                        expected_output_data = row[expected_output_columns[0]] if len(expected_output_columns) == 1 else {col: row[col] for col in expected_output_columns}
 
                     metadata = {}
                     for col in metadata_columns:
```
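The row-parsing rule keeps its shape: one column yields a bare value, several yield a dict, and a missing specification now yields `None`. A standalone sketch of that dispatch (hypothetical row and helper):

```python
from typing import Dict, List, Optional, Union

def select(row: Dict[str, str], columns: Optional[List[str]]) -> Union[None, str, Dict[str, str]]:
    # None/empty -> no expected output; one column -> scalar; many -> dict.
    if not columns:
        return None
    return row[columns[0]] if len(columns) == 1 else {col: row[col] for col in columns}

row = {"question": "2+2?", "answer": "4", "difficulty": "easy"}
print(select(row, ["question"]))            # '2+2?'
print(select(row, ["question", "answer"]))  # {'question': '2+2?', 'answer': '4'}
print(select(row, None))                    # None
```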
```diff
@@ -726,13 +734,14 @@
                     # Other errors during row processing also indicate CSV issues
                     raise DatasetFileError(f"Error parsing CSV file (row processing): {e}")
 
-                data.append(
-                    {
-                        "input": input_data,
-                        "expected_output": expected_output_data,
-                        **metadata,
-                    }
-                )
+                to_append = {
+                    "input": input_data,
+                    **metadata,
+                }
+
+                if expected_output_data:
+                    to_append["expected_output"] = expected_output_data
+                data.append(to_append)
         except csv.Error as e:
             # Catch CSV-specific parsing errors
             raise DatasetFileError(f"Error parsing CSV file: {e}")
```
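Taken together, the `from_csv` changes let a CSV with no answer column load into records that simply lack the `expected_output` key. An end-to-end sketch under the same assumptions as above (hypothetical file contents, `dne` alias):

```python
import csv
import tempfile

# Hypothetical single-column CSV with no expected-output column.
with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False, newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["question"])
    writer.writeheader()
    writer.writerow({"question": "What is 2 + 2?"})
    path = f.name

ds = dne.Dataset.from_csv(path, name="no-answers", input_columns=["question"])
# Each record carries {"input": ...}; no "expected_output" key is written.
```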

tests/llmobs/test_experimentation_dataset.py

Lines changed: 5 additions & 7 deletions
```diff
@@ -710,11 +710,11 @@ def test_push_synced_no_changes(self, experiments_vcr, synced_dataset, capsys):
         """Test pushing a synced dataset with no local changes."""
         initial_len = len(synced_dataset)
         initial_version = synced_dataset._datadog_dataset_version
-
+
         # This cassette should ideally show no POST requests or minimal GETs
         with experiments_vcr.use_cassette("test_dataset_push_synced_no_change.yaml"):
             synced_dataset.push()
-
+
         captured = capsys.readouterr()
         assert f"Dataset '{synced_dataset.name}' (v{initial_version}) is already synced and has no pending changes" in captured.out
 
```
```diff
@@ -1093,11 +1093,9 @@ def test_from_csv_missing_output_column(self, csv_file_simple):
 
     def test_from_csv_missing_column_specifications(self, csv_file_simple):
         """Test calling from_csv without input/output columns raises ValueError."""
-        with pytest.raises(ValueError, match="`input_columns` and `expected_output_columns` must be provided"):
+        with pytest.raises(ValueError, match="`input_columns` must be provided"):
             dne.Dataset.from_csv(csv_file_simple, name="bad")
-        with pytest.raises(ValueError, match="`input_columns` and `expected_output_columns` must be provided"):
-            dne.Dataset.from_csv(csv_file_simple, name="bad", input_columns=["question"])
-        with pytest.raises(ValueError, match="`input_columns` and `expected_output_columns` must be provided"):
+        with pytest.raises(ValueError, match="`input_columns` must be provided"):
             dne.Dataset.from_csv(csv_file_simple, name="bad", expected_output_columns=["answer"])
 
     def test_from_csv_malformed_file(self, csv_file_malformed):
```
```diff
@@ -1216,4 +1214,4 @@ def test_repr_empty_dataset(self):
         assert "Structure:" not in rep_clean  # No structure derivable when empty
         assert "Datadog: Local only" in rep_clean
         # Should show the deletion as a change until pushed/synced
-        assert "Changes: -1 deleted" in rep_clean
+        assert "Changes: -1 deleted" in rep_clean
```
