194 changes: 194 additions & 0 deletions code/chapter12/10_Universal_llm_judge.py
@@ -0,0 +1,194 @@
"""
第十二章示例10:Universal LLM Judge 评估器 - 自定义维度案例

对应文档:12.4.2 LLM Judge评估

这个示例展示如何使用底层 UniversalLLMJudgeEvaluator,
结合自定义评估配置来评估代码质量。

关键演示:
- 使用底层接口 UniversalLLMJudgeEvaluator
- 自定义评估维度(dimension)
- 字段映射处理非标准数据格式
"""

import sys
import os
import json
from pathlib import Path

project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from hello_agents import HelloAgentsLLM, UniversalLLMJudgeEvaluator
from hello_agents.evaluation.benchmarks.data_generation_Universal.evaluation_config import (
EvaluationConfig,
)

# ============================================================================
# Test data preparation
# ============================================================================

def prepare_code_data():
"""准备代码评估的测试数据"""
code_problems = [
{
"id": "code_001",
"code": """
def fibonacci(n):
if n <= 1:
return n
a, b = 0, 1
for _ in range(n):
a, b = b, a + b
return a
""",
"expected_output": "Returns correct fibonacci numbers efficiently",
"context": "Optimized fibonacci implementation",
},
{
"id": "code_002",
"code": """
def merge_sorted_arrays(arr1, arr2):
result = []
i = j = 0
while i < len(arr1) and j < len(arr2):
if arr1[i] <= arr2[j]:
result.append(arr1[i])
i += 1
else:
result.append(arr2[j])
j += 1
result.extend(arr1[i:])
result.extend(arr2[j:])
return result
""",
"expected_output": "Merge two sorted arrays in O(n+m) time",
"context": "Two-pointer merge algorithm",
}
]
return code_problems
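
# Conceptually, the field mapping defined in main() lets the evaluator read the
# non-standard records above as if they used the standard schema, e.g.
#   problem <- record["code"], answer <- record["expected_output"]
# (illustrative only; the exact lookup logic is internal to UniversalLLMJudgeEvaluator).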


# ============================================================================
# Main program: code evaluation (with custom dimensions)
# ============================================================================

def main():
"""
使用 UniversalLLMJudgeEvaluator 评估代码质量

演示特性:
- 底层接口使用
- 自定义评估维度(code 模板的维度)
- 非标准字段名(code, expected_output)
- 需要字段映射:code → problem, expected_output → answer
"""
print("\n" + "="*70)
print("📌 Universal LLM Judge 评估器 - 代码质量评估(自定义维度)")
print("="*70)

code_data = prepare_code_data()

    # Create the LLM and evaluator (using the "code" template)
    print("\n[Init] Creating LLM and evaluator...")
llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
eval_config = EvaluationConfig.load_template("code")
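    # The "code" template ships a predefined set of dimensions; judging from the
    # sample report at the bottom of this file, these include correctness,
    # robustness, efficiency, readability, and style_compliance.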

print(f"✓ 评估模板: code")
print(f"✓ 评估维度: {', '.join(eval_config.get_dimension_names())}")

    # Define a field mapping (to accommodate non-standard field names)
    field_mapping = {
        "problem": "code",            # the "code" field in the source data maps to "problem"
        "answer": "expected_output",  # the "expected_output" field in the source data maps to "answer"
    }
    print(f"✓ Field mapping: {field_mapping}")

evaluator = UniversalLLMJudgeEvaluator(
llm=llm,
eval_config=eval_config,
field_mapping=field_mapping
)

    # Run the evaluation
    print("\n[Evaluate] Evaluating code...")
print("="*70)

all_scores = []
for i, problem in enumerate(code_data, 1):
print(f"\n评估代码 {i}/{len(code_data)}")
print(f"ID: {problem['id']}")
print(f"描述: {problem['context']}")

result = evaluator.evaluate_single(problem)
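        # As used below, the returned dict is expected to expose per-dimension
        # scores under result['scores'] and an aggregate under result['total_score'].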

print(f"\n评估结果:")
for dim, score in result['scores'].items():
print(f" {dim}: {score:.1f}/5")
print(f" 平均分: {result['total_score']:.2f}/5")

all_scores.append(result)

    # Aggregate statistics
    print("\n" + "="*70)
    print("Overall statistics")
    print("="*70)

avg_total = sum(s['total_score'] for s in all_scores) / len(all_scores)
print(f"\n平均总分: {avg_total:.2f}/5")

    # Per-dimension average scores
if all_scores:
dimension_names = list(all_scores[0]['scores'].keys())
print("\n各维度平均分:")
for dim in dimension_names:
avg_dim = sum(s['scores'][dim] for s in all_scores) / len(all_scores)
print(f" {dim}: {avg_dim:.2f}/5")

    # Save results
    print("\n[Save] Saving evaluation results...")
os.makedirs("./evaluation_results", exist_ok=True)
with open("./evaluation_results/code_judge_results.json", 'w', encoding='utf-8') as f:
json.dump({
'scenario': 'Code_Custom_Dimensions',
'template': 'code',
'field_mapping': field_mapping,
'dimensions': dimension_names,
'data': code_data,
'results': all_scores,
'avg_total_score': avg_total
}, f, indent=2, ensure_ascii=False)

print("✓ 结果已保存到 ./evaluation_results/code_judge_results.json")
print("\n✅ 评估完成!")


if __name__ == "__main__":
main()

"""
# LLM Judge 评估报告

**生成时间**: 2025-10-28 16:36:15
**评估模板**: code
**评估样本数**: 2

## 总体评分

- <strong>平均总分</strong>: 3.30/5.0
- <strong>通过率</strong>: 50.0% (≥3.5分)
- <strong>优秀率</strong>: 0.0% (≥4.5分)

## 各维度评分

| 维度 | 平均分 | 评级 |
|------|--------|------|
| correctness | 4.00/5.0 | 良好 ⭐⭐⭐⭐ |
| robustness | 3.00/5.0 | 及格 ⭐⭐ |
| efficiency | 2.50/5.0 | 待改进 ⭐ |
| readability | 4.50/5.0 | 优秀 ⭐⭐⭐⭐⭐ |
| style_compliance | 2.50/5.0 | 待改进 ⭐ |


"""
191 changes: 191 additions & 0 deletions code/chapter12/11_Universal_win_rate.py
@@ -0,0 +1,191 @@
"""
第十二章示例11:Universal Win Rate 评估 - 自定义维度案例

对应文档:12.4.4 Win Rate评估

这个示例展示如何使用底层 UniversalWinRateEvaluator,
结合自定义评估配置来对比数学题质量。

关键演示:
- 使用底层接口 UniversalWinRateEvaluator
- 自定义评估维度(dimension)
- 标准字段名(problem, answer)无需映射
- Win Rate 胜率对比评估
"""

import sys
import os
import json
from pathlib import Path

project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from hello_agents import HelloAgentsLLM, UniversalWinRateEvaluator
from hello_agents.evaluation.benchmarks.data_generation_Universal.evaluation_config import (
EvaluationConfig,
)

# ============================================================================
# Test data preparation
# ============================================================================

def prepare_math_data():
"""准备数学题对比的测试数据(生成数据和参考数据)"""

math_generated = [
{
"id": "math_gen_001",
"problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
"answer": "Using completing the square: $(2n+19)^2 - 4m^2 = -7$. Solve the Pell-like equation.",
},
{
"id": "math_gen_002",
"problem": "In triangle ABC with sides AB=13, BC=14, CA=15, find the area.",
"answer": "Using Heron's formula: $s=21$, Area $= \\sqrt{21 \\cdot 8 \\cdot 7 \\cdot 6} = 84$.",
},
]

math_reference = [
{
"id": "math_ref_001",
"problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
"answer": "Let $n^2 + 19n + 92 = m^2$. Complete the square and solve systematically.",
},
{
"id": "math_ref_002",
"problem": "In triangle ABC with sides AB=13, BC=14, CA=15, find the area.",
"answer": "Use Heron's formula with semi-perimeter s=21.",
},
]

return math_generated, math_reference
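
# The Win Rate evaluation puts generated and reference items head to head and asks
# the judge LLM to pick a winner for each pairing; the sample report at the bottom
# of this file records 2 such comparisons. (How the pairs are drawn is internal to
# UniversalWinRateEvaluator.)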


# ============================================================================
# Main program: Win Rate comparison of math-problem quality (with custom dimensions)
# ============================================================================

def main():
"""
使用 UniversalWinRateEvaluator 对比数学题质量

演示特性:
- 底层接口使用
- 自定义评估维度(math 模板的维度)
- 标准字段名(problem, answer)无需映射
- Win Rate 胜率评估
"""
print("\n" + "="*70)
print("📌 Universal Win Rate 评估器 - 数学题质量对比(自定义维度)")
print("="*70)

math_gen, math_ref = prepare_math_data()

    # Create the LLM and evaluator (using the "math" template)
    print("\n[Init] Creating LLM and evaluator...")
llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
eval_config = EvaluationConfig.load_template("math")
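    # The "math" template ships its own dimensions; judging from the judge's
    # reasoning in the sample report at the bottom of this file, these include
    # correctness, clarity, completeness, difficulty_match, and originality.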

print(f"✓ 评估模板: math")
print(f"✓ 评估维度: {', '.join(eval_config.get_dimension_names())}")

    # Define a field mapping (the math template already uses the standard field names)
    field_mapping = {
        "problem": "problem",   # identity mapping: the source data already uses "problem"
        "answer": "answer",     # identity mapping: the source data already uses "answer"
    }
    print(f"✓ Field mapping: {field_mapping}")

evaluator = UniversalWinRateEvaluator(
llm=llm,
eval_config=eval_config,
field_mapping=field_mapping
)

    # Run the Win Rate comparison
    print("\n[Evaluate] Running Win Rate comparison...")
print("="*70)

result = evaluator.evaluate_win_rate(math_gen, math_ref, num_comparisons=2)
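    # As read out below, the result dict is expected to expose 'win_rate', 'wins',
    # 'ties', and 'losses'; .get() with a default keeps the printout robust if any
    # key is missing.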

    # Display the evaluation results
    print("\nEvaluation results:")
    print(f"  Win rate: {result.get('win_rate', 'N/A')}")
    print(f"  Wins: {result.get('wins', 'N/A')}")
    print(f"  Ties: {result.get('ties', 'N/A')}")
    print(f"  Losses: {result.get('losses', 'N/A')}")

    # Interpret the Win Rate
    print("\n[Interpretation]")
win_rate = result.get('win_rate', 0)
if isinstance(win_rate, str):
win_rate_float = float(win_rate.rstrip('%')) / 100
else:
win_rate_float = win_rate

    if 0.45 <= win_rate_float <= 0.55:
        print("✅ Generated quality is close to the reference data (the ideal case)")
    elif win_rate_float > 0.55:
        print("⭐ Generated quality is better than the reference data (possible evaluation bias)")
    else:
        print("⚠️ Generated quality is below the reference data (needs improvement)")

    # Save results
    print("\n[Save] Saving evaluation results...")
os.makedirs("./evaluation_results", exist_ok=True)
with open("./evaluation_results/math_winrate_results.json", "w", encoding="utf-8") as f:
json.dump(
{
"scenario": "Math_Custom_Dimensions",
"template": "math",
"field_mapping": field_mapping,
"dimensions": eval_config.get_dimension_names(),
"generated_data": math_gen,
"reference_data": math_ref,
"results": result,
},
f,
indent=2,
ensure_ascii=False,
)

print("✓ 结果已保存到 ./evaluation_results/math_winrate_results.json")
print("\n✅ 评估完成!")


if __name__ == "__main__":
main()


"""
**评估模板**: math
**对比次数**: 2

## 胜率统计

| 指标 | 数值 | 百分比 |
|------|------|--------|
| 生成数据胜出 | 1次 | 50.0% |
| 参考数据胜出 | 1次 | 50.0% |
| 平局 | 0次 | 0.0% |

<strong>Win Rate</strong>: 50.0%

✅ <strong>优秀</strong>: 生成数据质量优于参考数据(胜率≥50%)。

## 对比详情

### 对比 1

- **赢家**: Reference
- **原因**: 从各维度详细比较:1. correctness:两者数学逻辑都正确,但A的答案不完整,只给出思路没有具体数值结果;B使用海伦公式准确计算出面积=84。2. clarity:A的问题表述清晰,但解答过于简略,仅提到配方法和佩尔方程,缺乏具体推导;B的问题表述清晰,解答步骤明确,直接应用标准公式。3. completeness:A严重不完整,缺少具体求解过程和最终答案;B完整展示了从已知条件到最终结果的完整过程。4. difficulty_match:A作为数论问题难度较高,与中学生数学竞赛匹配;B作为几何问题难度适中,与中学数学课程匹配,两者都符合预期难度。5. originality:A展示了非标准的佩尔方程解法,具有一定启发性;B使用标准解法,缺乏创新性。综合来看,虽然A在原创性上稍优,但B在完整性、清晰度和答案准确性上显著优于A,特别是在解答的完整性方面差距明显,因此B整体质量更好。

### 对比 2

- **赢家**: Generated
- **原因**: 从各维度对比分析:1. correctness:两者数学逻辑都正确,最终答案准确无误,此维度相当;2. clarity:A的表述更清晰,直接给出了完整的计算过程和最终结果,而B只给出了半周长,没有完成面积计算,逻辑不完整;3. completeness:A提供了完整的解题步骤(s=21,面积公式代入计算,最终结果84),B只给出半周长,解答不完整;4. difficulty_match:两者问题难度相同,都匹配三角形面积计算的预期标准;5. originality:两者都使用标准的海伦公式,没有展示多种解法或创新思路,此维度相当。综合来看,虽然两个问题在正确性和难度匹配上相当,但A在清晰度和完整性上明显优于B,提供了完整的解答过程,因此A更好。



"""