@@ -22,7 +22,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
2222 return [
2323 EvaluationRow (
2424 messages = [Message (role = "user" , content = f"Knowledge: { item ['knowledge' ]} \n \n Question: { item ['question' ]} " )],
25- ground_truth = item ["right_answer" ]
25+ ground_truth = item ["right_answer" ],
2626 )
2727 for item in data
2828 ]
@@ -34,7 +34,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3434 model = ["accounts/fireworks/models/kimi-k2-instruct" ],
3535 rollout_input_params = [{"temperature" : 0.0 , "max_tokens" : 512 }],
3636 rollout_processor = default_single_turn_rollout_processor ,
37- threshold_of_success = 1.0 ,
37+ threshold_of_success = 0.5 ,
3838 num_runs = 1 ,
3939 mode = "pointwise" ,
4040)
@@ -49,7 +49,7 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
4949 return EvaluateResult (score = 0.0 , reason = "❌ No assistant response found" )
5050
5151 correct_answer = row .ground_truth
52-
52+
5353 system_prompt = """
5454 TASK
5555 - You will be given an assistant's response and the correct answer.
@@ -78,42 +78,33 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
7878
7979 try :
8080 response = judge_llm .chat .completions .create (
81- messages = [
82- {"role" : "system" , "content" : system_prompt },
83- {"role" : "user" , "content" : user_prompt }
84- ],
81+ messages = [{"role" : "system" , "content" : system_prompt }, {"role" : "user" , "content" : user_prompt }],
8582 temperature = 0.1 ,
8683 max_tokens = 500 ,
8784 )
88-
85+
8986 result_data = json .loads (response .choices [0 ].message .content )
9087 is_correct = result_data .get ("is_correct" , False )
9188 reasoning = result_data .get ("reasoning" , "Could not parse reasoning" )
92-
89+
9390 except Exception as e :
9491 # Fallback if parsing fails
9592 is_correct = False
9693 reasoning = f"Evaluation failed: { str (e )} "
97-
94+
9895 score = 1.0 if is_correct else 0.0
99-
96+
10097 if is_correct :
10198 assessment = "✅ Response is correct"
10299 else :
103100 assessment = "❌ Response is incorrect"
104-
101+
105102 reason = f"{ assessment } \n Reasoning: { reasoning } "
106103
107104 row .evaluation_result = EvaluateResult (
108105 score = score ,
109106 reason = reason ,
110- metrics = {
111- "llm_judge" : MetricResult (
112- score = score ,
113- reason = reasoning ,
114- is_score_valid = True
115- )
116- }
107+ metrics = {"llm_judge" : MetricResult (score = score , reason = reasoning , is_score_valid = True )},
117108 )
118-
119- return row
109+
110+ return row
0 commit comments