diff --git a/code/chapter12/10_Universal_llm_judge.py b/code/chapter12/10_Universal_llm_judge.py
new file mode 100644
index 0000000..83309c3
--- /dev/null
+++ b/code/chapter12/10_Universal_llm_judge.py
@@ -0,0 +1,194 @@
+"""
+第十二章示例10：Universal LLM Judge 评估器 - 自定义维度案例
+
+对应文档：12.4.2 LLM Judge评估
+
+这个示例展示如何使用底层 UniversalLLMJudgeEvaluator，
+结合自定义评估配置来评估代码质量。
+
+关键演示：
+- 使用底层接口 UniversalLLMJudgeEvaluator
+- 自定义评估维度（dimension）
+- 字段映射处理非标准数据格式
+"""
+
+import sys
+import os
+import json
+from pathlib import Path
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from hello_agents import HelloAgentsLLM, UniversalLLMJudgeEvaluator
+from hello_agents.evaluation.benchmarks.data_generation_Universal.evaluation_config import (
+    EvaluationConfig,
+)
+
+# ============================================================================
+# 测试数据准备
+# ============================================================================
+
+def prepare_code_data():
+    """Return two hard-coded code samples (dicts with id, code, expected_output, context) used as evaluation input."""
+    code_problems = [
+        {
+            "id": "code_001",
+            "code": """
+def fibonacci(n):
+    if n <= 1:
+        return n
+    a, b = 0, 1
+    for _ in range(n):
+        a, b = b, a + b
+    return a
+            """,
+            "expected_output": "Returns correct fibonacci numbers efficiently",
+            "context": "Optimized fibonacci implementation",
+        },
+        {
+            "id": "code_002",
+            "code": """
+def merge_sorted_arrays(arr1, arr2):
+    result = []
+    i = j = 0
+    while i < len(arr1) and j < len(arr2):
+        if arr1[i] <= arr2[j]:
+            result.append(arr1[i])
+            i += 1
+        else:
+            result.append(arr2[j])
+            j += 1
+    result.extend(arr1[i:])
+    result.extend(arr2[j:])
+    return result
+            """,
+            "expected_output": "Merge two sorted arrays in O(n+m) time",
+            "context": "Two-pointer merge algorithm",
+        }
+    ]
+    return code_problems
+
+
+# ============================================================================
+# 主程序：代码评估（使用自定义维度）
+# ============================================================================
+
+def main():
+    """
+    使用 UniversalLLMJudgeEvaluator 评估代码质量
+
+    演示特性：
+    - 底层接口使用
+    - 自定义评估维度（code 模板的维度）
+    - 非标准字段名（code, expected_output）
+    - 需要字段映射：code → problem, expected_output → answer
+    """
+    print("\n" + "="*70)
+    print("📌 Universal LLM Judge 评估器 - 代码质量评估（自定义维度）")
+    print("="*70)
+
+    code_data = prepare_code_data()
+
+    # 创建 LLM 和评估器（使用 code 模板）
+    print("\n[初始化] 创建 LLM 和评估器...")
+    llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
+    eval_config = EvaluationConfig.load_template("code")
+
+    print(f"✓ 评估模板: code")
+    print(f"✓ 评估维度: {', '.join(eval_config.get_dimension_names())}")
+
+    # 定义字段映射（适应非标准字段名）
+    field_mapping = {
+        "problem": "code",                  # 源数据中的 "code" 字段映射到 "problem"
+        "answer": "expected_output",        # 源数据中的 "expected_output" 字段映射到 "answer"
+    }
+    print(f"✓ 字段映射: {field_mapping}")
+
+    evaluator = UniversalLLMJudgeEvaluator(
+        llm=llm,
+        eval_config=eval_config,
+        field_mapping=field_mapping
+    )
+
+    # 进行评估
+    print("\n[评估] 开始评估代码...")
+    print("="*70)
+
+    all_scores = []
+    for i, problem in enumerate(code_data, 1):
+        print(f"\n评估代码 {i}/{len(code_data)}")
+        print(f"ID: {problem['id']}")
+        print(f"描述: {problem['context']}")
+
+        result = evaluator.evaluate_single(problem)
+
+        print(f"\n评估结果:")
+        for dim, score in result['scores'].items():
+            print(f"  {dim}: {score:.1f}/5")
+        print(f"  平均分: {result['total_score']:.2f}/5")
+
+        all_scores.append(result)
+
+    # 统计汇总
+    print("\n" + "="*70)
+    print("总体统计")
+    print("="*70)
+
+    avg_total = sum(s['total_score'] for s in all_scores) / len(all_scores)
+    print(f"\n平均总分: {avg_total:.2f}/5")
+
+    # 按维度统计平均分
+    if all_scores:
+        dimension_names = list(all_scores[0]['scores'].keys())
+        print("\n各维度平均分:")
+        for dim in dimension_names:
+            avg_dim = sum(s['scores'][dim] for s in all_scores) / len(all_scores)
+            print(f"  {dim}: {avg_dim:.2f}/5")
+
+    # 保存结果
+    print("\n[保存] 保存评估结果...")
+    os.makedirs("./evaluation_results", exist_ok=True)
+    with open("./evaluation_results/code_judge_results.json", 'w', encoding='utf-8') as f:
+        json.dump({
+            'scenario': 'Code_Custom_Dimensions',
+            'template': 'code',
+            'field_mapping': field_mapping,
+            'dimensions': dimension_names,
+            'data': code_data,
+            'results': all_scores,
+            'avg_total_score': avg_total
+        }, f, indent=2, ensure_ascii=False)
+
+    print("✓ 结果已保存到 ./evaluation_results/code_judge_results.json")
+    print("\n✅ 评估完成！")
+
+
+if __name__ == "__main__":
+    main()
+
+"""
+# LLM Judge 评估报告
+
+**生成时间**: 2025-10-28 16:36:15
+**评估模板**: code
+**评估样本数**: 2
+
+## 总体评分
+
+- <strong>平均总分</strong>: 3.30/5.0
+- <strong>通过率</strong>: 50.0% (≥3.5分)
+- <strong>优秀率</strong>: 0.0% (≥4.5分)
+
+## 各维度评分
+
+| 维度 | 平均分 | 评级 |
+|------|--------|------|
+| correctness | 4.00/5.0 | 良好 ⭐⭐⭐⭐ |
+| robustness | 3.00/5.0 | 及格 ⭐⭐ |
+| efficiency | 2.50/5.0 | 待改进 ⭐ |
+| readability | 4.50/5.0 | 优秀 ⭐⭐⭐⭐⭐ |
+| style_compliance | 2.50/5.0 | 待改进 ⭐ |
+
+
+"""
\ No newline at end of file
diff --git a/code/chapter12/11_Universal_win_rate.py b/code/chapter12/11_Universal_win_rate.py
new file mode 100644
index 0000000..c955e94
--- /dev/null
+++ b/code/chapter12/11_Universal_win_rate.py
@@ -0,0 +1,191 @@
+"""
+第十二章示例11：Universal Win Rate 评估 - 自定义维度案例
+
+对应文档：12.4.4 Win Rate评估
+
+这个示例展示如何使用底层 UniversalWinRateEvaluator，
+结合自定义评估配置来对比数学题质量。
+
+关键演示：
+- 使用底层接口 UniversalWinRateEvaluator
+- 自定义评估维度（dimension）
+- 标准字段名（problem, answer）无需映射
+- Win Rate 胜率对比评估
+"""
+
+import sys
+import os
+import json
+from pathlib import Path
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from hello_agents import HelloAgentsLLM, UniversalWinRateEvaluator
+from hello_agents.evaluation.benchmarks.data_generation_Universal.evaluation_config import (
+    EvaluationConfig,
+)
+
+# ============================================================================
+# 测试数据准备
+# ============================================================================
+
+def prepare_math_data():
+    """Return (generated, reference): two lists of hard-coded math problems, each item a dict with id, problem and answer."""
+
+    math_generated = [
+        {
+            "id": "math_gen_001",
+            "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
+            "answer": "Using completing the square: $(2n+19)^2 - 4m^2 = -7$. Solve the Pell-like equation.",
+        },
+        {
+            "id": "math_gen_002",
+            "problem": "In triangle ABC with sides AB=13, BC=14, CA=15, find the area.",
+            "answer": "Using Heron's formula: $s=21$, Area $= \\sqrt{21 \\cdot 8 \\cdot 7 \\cdot 6} = 84$.",
+        },
+    ]
+
+    math_reference = [
+        {
+            "id": "math_ref_001",
+            "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
+            "answer": "Let $n^2 + 19n + 92 = m^2$. Complete the square and solve systematically.",
+        },
+        {
+            "id": "math_ref_002",
+            "problem": "In triangle ABC with sides AB=13, BC=14, CA=15, find the area.",
+            "answer": "Use Heron's formula with semi-perimeter s=21.",
+        },
+    ]
+
+    return math_generated, math_reference
+
+
+# ============================================================================
+# 主程序：数学题质量 Win Rate 对比（使用自定义维度）
+# ============================================================================
+
+def main():
+    """
+    Compare generated vs. reference math problems with UniversalWinRateEvaluator, print the win rate, and save it as JSON.
+
+    Features demonstrated:
+    - use of the low-level evaluator interface
+    - evaluation dimensions taken from the "math" template
+    - standard field names (problem, answer), so the mapping is the identity
+    - win-rate evaluation
+    """
+    print("\n" + "="*70)
+    print("📌 Universal Win Rate 评估器 - 数学题质量对比（自定义维度）")
+    print("="*70)
+
+    math_gen, math_ref = prepare_math_data()
+
+    # Create the LLM and load the "math" evaluation template
+    print("\n[初始化] 创建 LLM 和评估器...")
+    llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
+    eval_config = EvaluationConfig.load_template("math")
+
+    print(f"✓ 评估模板: math")
+    print(f"✓ 评估维度: {', '.join(eval_config.get_dimension_names())}")
+
+    # Field mapping (the sample data already uses the standard field names)
+    field_mapping = {
+        "problem": "problem",       # source field "problem" is used as "problem"
+        "answer": "answer",         # source field "answer" is used as "answer"
+    }
+    print(f"✓ 字段映射: {field_mapping}")
+
+    evaluator = UniversalWinRateEvaluator(
+        llm=llm,
+        eval_config=eval_config,
+        field_mapping=field_mapping
+    )
+
+    # Run the win-rate comparison
+    print("\n[评估] 开始进行 Win Rate 对比...")
+    print("="*70)
+
+    result = evaluator.evaluate_win_rate(math_gen, math_ref, num_comparisons=2)
+
+    # Show the evaluation result; missing keys are printed as 'N/A'
+    print("\n评估结果:")
+    print(f"  胜率 (Win Rate): {result.get('win_rate', 'N/A')}")
+    print(f"  胜场数: {result.get('wins', 'N/A')}")
+    print(f"  平局数: {result.get('ties', 'N/A')}")
+    print(f"  负场数: {result.get('losses', 'N/A')}")
+
+    # Interpret the win rate
+    print("\n[解读]")
+    win_rate = result.get('win_rate', 0)  # a missing key falls back to 0 and ends up in the final "below reference" branch
+    if isinstance(win_rate, str):
+        win_rate_float = float(win_rate.rstrip('%')) / 100  # e.g. "50.0%" -> 0.5
+    else:
+        win_rate_float = win_rate  # NOTE(review): assumes a numeric win_rate is a 0-1 fraction — confirm against evaluate_win_rate
+
+    if 0.45 <= win_rate_float <= 0.55:
+        print("✅ 生成质量接近参考数据水平（理想情况）")
+    elif win_rate_float > 0.55:
+        print("⭐ 生成质量优于参考数据（可能有评估偏差）")
+    else:
+        print("⚠️  生成质量低于参考数据（需要改进）")
+
+    # Save the results
+    print("\n[保存] 保存评估结果...")
+    os.makedirs("./evaluation_results", exist_ok=True)
+    with open("./evaluation_results/math_winrate_results.json", "w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "scenario": "Math_Custom_Dimensions",
+                "template": "math",
+                "field_mapping": field_mapping,
+                "dimensions": eval_config.get_dimension_names(),
+                "generated_data": math_gen,
+                "reference_data": math_ref,
+                "results": result,
+            },
+            f,
+            indent=2,
+            ensure_ascii=False,
+        )
+
+    print("✓ 结果已保存到 ./evaluation_results/math_winrate_results.json")
+    print("\n✅ 评估完成！")
+
+
+if __name__ == "__main__":
+    main()
+
+
+"""
+**评估模板**: math
+**对比次数**: 2
+
+## 胜率统计
+
+| 指标 | 数值 | 百分比 |
+|------|------|--------|
+| 生成数据胜出 | 1次 | 50.0% |
+| 参考数据胜出 | 1次 | 50.0% |
+| 平局 | 0次 | 0.0% |
+
+<strong>Win Rate</strong>: 50.0%
+
+✅ <strong>优秀</strong>: 生成数据质量优于参考数据（胜率≥50%）。
+
+## 对比详情
+
+### 对比 1
+
+- **赢家**: Reference
+- **原因**: 从各维度详细比较：1. correctness：两者数学逻辑都正确，但A的答案不完整，只给出思路没有具体数值结果；B使用海伦公式准确计算出面积=84。2. clarity：A的问题表述清晰，但解答过于简略，仅提到配方法和佩尔方程，缺乏具体推导；B的问题表述清晰，解答步骤明确，直接应用标准公式。3. completeness：A严重不完整，缺少具体求解过程和最终答案；B完整展示了从已知条件到最终结果的完整过程。4. difficulty_match：A作为数论问题难度较高，与中学生数学竞赛匹配；B作为几何问题难度适中，与中学数学课程匹配，两者都符合预期难度。5. originality：A展示了非标准的佩尔方程解法，具有一定启发性；B使用标准解法，缺乏创新性。综合来看，虽然A在原创性上稍优，但B在完整性、清晰度和答案准确性上显著优于A，特别是在解答的完整性方面差距明显，因此B整体质量更好。
+
+### 对比 2
+
+- **赢家**: Generated
+- **原因**: 从各维度对比分析：1. correctness：两者数学逻辑都正确，最终答案准确无误，此维度相当；2. clarity：A的表述更清晰，直接给出了完整的计算过程和最终结果，而B只给出了半周长，没有完成面积计算，逻辑不完整；3. completeness：A提供了完整的解题步骤（s=21，面积公式代入计算，最终结果84），B只给出半周长，解答不完整；4. difficulty_match：两者问题难度相同，都匹配三角形面积计算的预期标准；5. originality：两者都使用标准的海伦公式，没有展示多种解法或创新思路，此维度相当。综合来看，虽然两个问题在正确性和难度匹配上相当，但A在清晰度和完整性上明显优于B，提供了完整的解答过程，因此A更好。
+
+
+
+"""
\ No newline at end of file
diff --git "a/docs/chapter12/\347\254\254\345\215\201\344\272\214\347\253\240 \346\231\272\350\203\275\344\275\223\346\200\247\350\203\275\350\257\204\344\274\260.md" "b/docs/chapter12/\347\254\254\345\215\201\344\272\214\347\253\240 \346\231\272\350\203\275\344\275\223\346\200\247\350\203\275\350\257\204\344\274\260.md"
index 61b6d6d..b453f91 100644
--- "a/docs/chapter12/\347\254\254\345\215\201\344\272\214\347\253\240 \346\231\272\350\203\275\344\275\223\346\200\247\350\203\275\350\257\204\344\274\260.md"	
+++ "b/docs/chapter12/\347\254\254\345\215\201\344\272\214\347\253\240 \346\231\272\350\203\275\344\275\223\346\200\247\350\203\275\350\257\204\344\274\260.md"	
@@ -2652,7 +2652,285 @@ python data_generation/run_complete_evaluation.py 30 3.0
 
 对于LLM Judge和Win Rate评估，HelloAgents也进行了工具集成，并提供了完整的示例代码。如果你对这两种评估方法的具体实现细节感兴趣，同样可以参考示例代码。
 
+## 12.4.9 Universal LLMJudge 和 Universal Win Rate：通用评估模块
+
+  在前面的内容中，我们深入学习了如何使用 AIME 数据集进行具体的数据生成质量评估。现在，我们将介绍 HelloAgents
+  框架提供的**通用评估模块**——**UniversalLLMJudge** 和
+  **UniversalWinRate**。这些模块可以应用于**任何类型的内容评估**，不仅限于数学题目，还支持代码、写作、问答等多种场景。
+
+  ### 12.4.9.1 两层级 API 设计
+
+  HelloAgents 评估系统采用**两层级 API 设计**，为不同需求的用户提供灵活的使用方式：
+
+  **低层级 API（Evaluator 类）**：提供完全的定制能力和控制权
+  - `UniversalLLMJudgeEvaluator`：单项评估，支持自定义评估维度
+  - `UniversalWinRateEvaluator`：成对对比评估
+  - 支持自定义模板、字段映射、评估维度
+  - 适合需要精细控制评估流程的高级用户
+
+  **高层级 API（Tool 类）**：提供一键式的快速评估体验
+  - `UniversalLLMJudgeTool`：高级封装
+  - `UniversalWinRateTool`：高级封装
+  - 支持批量处理、进度跟踪、错误处理
+  - 适合快速原型验证和生产环境部署
+
+  ### 12.4.9.2 核心特性概览
+
+  | 特性 | 说明 | 使用场景 |
+  |------|------|--------|
+  | **多模板支持** | 内置 math、code、writing、qa 四种评估模板 | 快速评估标准场景 |
+  | **自定义维度** | 通过 `EvaluationConfig.custom()` 创建自定义评估维度 | 领域专用评估 |
+  | **智能字段映射** | 自动适配不同数据格式，无需手动转换 | 处理非标准数据源 |
+  | **多维度评分** | 每个样本从多个维度获得评分 | 全面理解质量水平 |
+  | **成对对比** | Win Rate 评估，计算相对质量 | 对标竞品/参考数据 |
+  | **详细报告** | 自动生成 Markdown 报告和可视化 | 结果展示和分析 |
+
+  ### 12.4.9.3 评估模板详解
+
+  HelloAgents 提供了四种内置评估模板，每种模板针对特定的内容类型设计：
+
+  #### Math 模板 - 数学问题评估
+  **评估维度**：
+  - `correctness`：解答的数学正确性
+  - `clarity`：解答过程的清晰度和逻辑性
+  - `completeness`：解答的完整性和步骤的详尽程度
+  - `difficulty_match`：解答难度与题目难度的匹配度
+  - `originality`：解法的新颖性和创造性
+
+  #### Code 模板 - 代码质量评估
+  **评估维度**：
+  - `correctness`：代码的功能正确性
+  - `robustness`：代码的健壮性和异常处理能力
+  - `efficiency`：算法效率和资源消耗
+  - `readability`：代码的可读性和命名规范
+  - `style_compliance`：编码规范的遵循程度
+
+  #### Writing 模板 - 写作质量评估
+  **评估维度**：
+  - `accuracy`：内容的事实准确性
+  - `coherence`：文章的逻辑连贯性和结构清晰度
+  - `richness`：词汇丰富性和表达多样性
+  - `creativity_style`：创意性和个人风格特色
+  - `engagement`：内容的吸引力和读者参与度
+
+  #### QA 模板 - 问答质量评估
+  **评估维度**：
+  - `correctness`：回答的事实准确性
+  - `clarity`：回答的清晰度和易理解性
+  - `completeness`：回答的完整性和覆盖面
+  - `helpfulness`：回答的实用价值和帮助程度
+
+  ### 12.4.9.4 使用参考
+
+  #### 使用 UniversalLLMJudgeEvaluator 评估代码质量
+
+  ```python
+
+  # 初始化 LLM
+  llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
+
+  # 加载代码评估模板
+  eval_config = EvaluationConfig.load_template("code")
+
+  # 准备要评估的代码
+  code_samples = [
+      {
+          "id": "code_001",
+          "code": "def fibonacci(n):\n    if n <= 1:\n        return n\n    return fibonacci(n-1) + fibonacci(n-2)",
+          "expected_output": "Exponential time complexity, not optimized"
+      },
+      {
+          "id": "code_002",
+          "code": "def fibonacci(n):\n    if n <= 1:\n        return n\n    a, b = 0, 1\n    for _ in range(n-1):\n        a, b = b, a + b\n"
+                  "    return b",
+          "expected_output": "Linear time complexity, memory efficient"
+      }
+  ]
+
+  # 定义字段映射
+  field_mapping = {
+      "problem": "code",
+      "answer": "expected_output",
+  }
+
+  # 创建评估器
+  evaluator = UniversalLLMJudgeEvaluator(
+      llm=llm,
+      eval_config=eval_config,
+      field_mapping=field_mapping
+  )
+
+  # 评估单个样本
+  for sample in code_samples:
+      result = evaluator.evaluate_single(sample)
+      print(f"样本 {sample['id']}:")
+      print(f"  总分: {result['total_score']:.1f}/5.0")
+      print(f"  各维度得分: {result['scores']}")
+      print(f"  评估理由: {result['reasoning']}\n")
+```
+
+#### 使用 UniversalWinRateEvaluator 对比数学题质量
+
+```python
+
+  # 初始化 LLM
+  llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
+
+  # 加载数学评估模板
+  eval_config = EvaluationConfig.load_template("math")
+
+  # 生成的数学题
+  generated_problems = [
+      {
+          "id": "gen_001",
+          "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
+          "answer": "Using completing the square: $(2n+19)^2 - 4m^2 = -7$. Solve the Pell-like equation."
+      },
+      {
+          "id": "gen_002",
+          "problem": "In triangle ABC with sides AB=13, BC=14, CA=15, find the area.",
+          "answer": "Using Heron's formula: $s=21$, Area $= \\sqrt{21 \\cdot 8 \\cdot 7 \\cdot 6} = 84$."
+      }
+  ]
+
+  # 参考数学题（真题）
+  reference_problems = [
+      {
+          "id": "ref_001",
+          "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
+          "answer": "Let $n^2 + 19n + 92 = m^2$. Complete the square and solve systematically."
+      },
+      {
+          "id": "ref_002",
+          "problem": "In triangle ABC with sides AB=13, BC=14, CA=15, find the area.",
+          "answer": "Use Heron's formula with semi-perimeter s=21."
+      }
+  ]
+
+  # 定义字段映射（标准字段名不需要映射）
+  field_mapping = {
+      "problem": "problem",
+      "answer": "answer",
+  }
+
+  # 创建 Win Rate 评估器
+  evaluator = UniversalWinRateEvaluator(
+      llm=llm,
+      eval_config=eval_config,
+      field_mapping=field_mapping
+  )
+
+  # 进行 Win Rate 对比
+  result = evaluator.evaluate_win_rate(
+      generated_data=generated_problems,
+      reference_data=reference_problems,
+      num_comparisons=2
+  )
+```
+
+### 12.4.9.5 自定义评估维度
+
+  当内置模板无法满足特定需求时，可以创建完全自定义的评估维度：
+```python
+  from hello_agents.evaluation.benchmarks.data_generation_Universal.evaluation_config import EvaluationConfig
+
+  # 举例：创建医疗领域专用评估配置
+  medical_eval_config = EvaluationConfig.custom(
+      diagnostic_accuracy="诊断准确性和医学依据",
+      treatment_appropriateness="治疗方案合理性和安全性",
+      patient_safety="患者安全考虑和风险评估",
+      clinical_evidence="临床证据支持和科学性"
+  )
+
+  # 创建软件设计评估配置
+  software_eval_config = EvaluationConfig.custom(
+      architecture_quality="系统架构设计的合理性和扩展性",
+      security_consideration="安全考虑和风险防控措施",
+      scalability="系统的可扩展性和性能规划",
+      maintainability="代码的可维护性和文档完整性"
+  )
+
+  # 使用自定义配置
+  llm = HelloAgentsLLM(provider="deepseek", model="deepseek-chat")
+  evaluator = UniversalLLMJudgeEvaluator(
+      llm=llm,
+      eval_config=medical_eval_config
+  )
+
+  # 评估医学内容
+  medical_data = {
+      "id": "case_001",
+      "problem": "患者，男，65岁，胸痛3小时，心电图示ST段抬高",
+      "answer": "初步诊断为急性心肌梗死，建议立即进行冠脉造影并准备PCI治疗"
+  }
+
+  result = evaluator.evaluate_single(medical_data)
+  print(f"诊断准确性: {result['scores']['diagnostic_accuracy']}/5")
+  print(f"治疗方案合理性: {result['scores']['treatment_appropriateness']}/5")
+```
+
+###  12.4.9.6 字段映射最佳实践
+
+  当数据源中的字段名与标准字段不一致时，使用字段映射自动转换：
+
+```python
+  # 数据格式不符合标准时的字段映射
+  original_data = {
+      "item_id": "001",
+      "code_snippet": "def add(a, b): return a + b",
+      "expected_result": "sum of two numbers"
+  }
+
+  # 字段映射配置
+  field_mapping = {
+      "id": "item_id",
+      "problem": "code_snippet",
+      "answer": "expected_result"
+  }
+
+  # 使用字段映射创建评估器
+  evaluator = UniversalLLMJudgeEvaluator(
+      llm=llm,
+      eval_config=EvaluationConfig.load_template("code"),
+      field_mapping=field_mapping
+  )
+
+  # 直接使用原始数据格式，评估器会自动进行转换
+  result = evaluator.evaluate_single(original_data)
+```
+
+ ### 12.4.9.7 API 选择建议
+
+  | 场景         | 推荐 API                     | 原因         |
+  |------------|----------------------------|------------|
+  | 需要自定义评估维度  | UniversalLLMJudgeEvaluator | 支持灵活的自定义配置 |
+  | 快速原型验证     | UniversalLLMJudgeTool      | 一键式快速评估    |
+  | 对标参考数据质量   | UniversalWinRateEvaluator  | 成对对比评估     |
+
+
+###  12.4.9.8 常见问题
+
+```python
+  # Q: 如何获取模板的所有维度？
+  config = EvaluationConfig.load_template("math")
+  print(config.get_dimension_names())
+  # 输出: ['correctness', 'clarity', 'completeness', 'difficulty_match', 'originality']
+
+  # Q: 可以修改评估维度的权重吗？
+
+  # A: 当前所有维度权重相等，但可以通过后处理实现自定义权重（weights 为 {维度名: 权重} 的字典）：
+  result = evaluator.evaluate_single(data)
+  weighted_score = sum(
+      result['scores'][dim] * weights[dim]
+      for dim in weights
+  ) / sum(weights.values())
+  result['weighted_score'] = weighted_score
+```
+
+  通过学习 UniversalLLMJudge 和 UniversalWinRate 这两个通用模块，你可以轻松扩展评估系统到任何垂直应用场景。
+
 
+  ---
 
 
 ## 12.5 本章小结