Skip to content

Commit d7c5cde

Browse files
committed
lower hallucination threshold
1 parent 30e1af0 commit d7c5cde

File tree

1 file changed

+12
-21
lines changed

1 file changed

+12
-21
lines changed

tests/pytest/test_hallucination.py

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
2222
return [
2323
EvaluationRow(
2424
messages=[Message(role="user", content=f"Knowledge: {item['knowledge']}\n\nQuestion: {item['question']}")],
25-
ground_truth=item["right_answer"]
25+
ground_truth=item["right_answer"],
2626
)
2727
for item in data
2828
]
@@ -34,7 +34,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3434
model=["accounts/fireworks/models/kimi-k2-instruct"],
3535
rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
3636
rollout_processor=default_single_turn_rollout_processor,
37-
threshold_of_success=1.0,
37+
threshold_of_success=0.5,
3838
num_runs=1,
3939
mode="pointwise",
4040
)
@@ -49,7 +49,7 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
4949
return EvaluateResult(score=0.0, reason="❌ No assistant response found")
5050

5151
correct_answer = row.ground_truth
52-
52+
5353
system_prompt = """
5454
TASK
5555
- You will be given an assistant's response and the correct answer.
@@ -78,42 +78,33 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
7878

7979
try:
8080
response = judge_llm.chat.completions.create(
81-
messages=[
82-
{"role": "system", "content": system_prompt},
83-
{"role": "user", "content": user_prompt}
84-
],
81+
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
8582
temperature=0.1,
8683
max_tokens=500,
8784
)
88-
85+
8986
result_data = json.loads(response.choices[0].message.content)
9087
is_correct = result_data.get("is_correct", False)
9188
reasoning = result_data.get("reasoning", "Could not parse reasoning")
92-
89+
9390
except Exception as e:
9491
# Fallback if parsing fails
9592
is_correct = False
9693
reasoning = f"Evaluation failed: {str(e)}"
97-
94+
9895
score = 1.0 if is_correct else 0.0
99-
96+
10097
if is_correct:
10198
assessment = "✅ Response is correct"
10299
else:
103100
assessment = "❌ Response is incorrect"
104-
101+
105102
reason = f"{assessment}\nReasoning: {reasoning}"
106103

107104
row.evaluation_result = EvaluateResult(
108105
score=score,
109106
reason=reason,
110-
metrics={
111-
"llm_judge": MetricResult(
112-
score=score,
113-
reason=reasoning,
114-
is_score_valid=True
115-
)
116-
}
107+
metrics={"llm_judge": MetricResult(score=score, reason=reasoning, is_score_valid=True)},
117108
)
118-
119-
return row
109+
110+
return row

0 commit comments

Comments
 (0)