Skip to content

Commit d7c5cde

Browse files
committed
lower hallucination threshold
1 parent 30e1af0 commit d7c5cde

File tree

1 file changed

+12
-21
lines changed

1 file changed

+12
-21
lines changed

tests/pytest/test_hallucination.py

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
2222
return [
2323
EvaluationRow(
2424
messages=[Message(role="user", content=f"Knowledge: {item['knowledge']}\n\nQuestion: {item['question']}")],
25-
ground_truth=item["right_answer"]
25+
ground_truth=item["right_answer"],
2626
)
2727
for item in data
2828
]
@@ -34,7 +34,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3434
model=["accounts/fireworks/models/kimi-k2-instruct"],
3535
rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
3636
rollout_processor=default_single_turn_rollout_processor,
37-
threshold_of_success=1.0,
37+
threshold_of_success=0.5,
3838
num_runs=1,
3939
mode="pointwise",
4040
)
@@ -49,7 +49,7 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
4949
return EvaluateResult(score=0.0, reason="❌ No assistant response found")
5050

5151
correct_answer = row.ground_truth
52-
52+
5353
system_prompt = """
5454
TASK
5555
- You will be given an assistant's response and the correct answer.
@@ -78,42 +78,33 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
7878

7979
try:
8080
response = judge_llm.chat.completions.create(
81-
messages=[
82-
{"role": "system", "content": system_prompt},
83-
{"role": "user", "content": user_prompt}
84-
],
81+
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
8582
temperature=0.1,
8683
max_tokens=500,
8784
)
88-
85+
8986
result_data = json.loads(response.choices[0].message.content)
9087
is_correct = result_data.get("is_correct", False)
9188
reasoning = result_data.get("reasoning", "Could not parse reasoning")
92-
89+
9390
except Exception as e:
9491
# Fallback if parsing fails
9592
is_correct = False
9693
reasoning = f"Evaluation failed: {str(e)}"
97-
94+
9895
score = 1.0 if is_correct else 0.0
99-
96+
10097
if is_correct:
10198
assessment = "✅ Response is correct"
10299
else:
103100
assessment = "❌ Response is incorrect"
104-
101+
105102
reason = f"{assessment}\nReasoning: {reasoning}"
106103

107104
row.evaluation_result = EvaluateResult(
108105
score=score,
109106
reason=reason,
110-
metrics={
111-
"llm_judge": MetricResult(
112-
score=score,
113-
reason=reasoning,
114-
is_score_valid=True
115-
)
116-
}
107+
metrics={"llm_judge": MetricResult(score=score, reason=reasoning, is_score_valid=True)},
117108
)
118-
119-
return row
109+
110+
return row

0 commit comments

Comments
 (0)