diff --git a/benchmark/benchmark_serving.py b/benchmark/benchmark_serving.py
index 2527507d33..b66a253ed9 100644
--- a/benchmark/benchmark_serving.py
+++ b/benchmark/benchmark_serving.py
@@ -42,8 +42,6 @@ def get_output_file(model_path, backend, server_config):
     params = [
         ('bs', server_config['max_batch_size']),
         ('tp', server_config.get('tp', 1)),
-        ('dp', server_config.get('dp', '')),
-        ('ep', server_config.get('ep', '')),
         ('cache', server_config.get('cache_max_entry_count', 0.8)),
         ('mptk', server_config.get('max_prefill_token_num', '')),
     ]
@@ -57,15 +55,8 @@ def get_output_file(model_path, backend, server_config):
 
 def get_server_ip_port(backend: str, server_config: Dict) -> Tuple[str, int]:
     if backend in ['turbomind', 'pytorch']:
-        if server_config.get('proxy_url'):
-            # If proxy_url is set, we use the proxy server's IP and port
-            parts = server_config['proxy_url'].split(':')
-            server_ip = parts[1].lstrip('//')
-            server_port = int(parts[2])
-        else:
-            # Default to the server IP and port specified in the config
-            server_ip = server_config.get('server_ip', '0.0.0.0')
-            server_port = server_config.get('server_port', 23333)
+        server_ip = server_config.get('server_ip', '0.0.0.0')
+        server_port = server_config.get('server_port', 23333)
     elif backend == 'sglang':
         return (server_config.get('server_ip', '0.0.0.0'), server_config.get('port', 30000))
     elif backend == 'vllm':
@@ -75,7 +66,7 @@ def get_server_ip_port(backend: str, server_config: Dict) -> Tuple[str, int]:
     return server_ip, server_port
 
 
-def wait_server_ready(server_ip: str, server_port: int) -> bool:
+def get_served_model_name(server_ip: str, server_port: int) -> str:
     """Wait for the API server to become ready."""
     from openai import OpenAI
     while True:
@@ -84,7 +75,7 @@ def wait_server_ready(server_ip: str, server_port: int) -> bool:
             model_name = client.models.list().data[0].id
             if model_name:
                 print('Server is ready.')
-                return True
+                return model_name
         except Exception as e:
             print(f'connect to server http://{server_ip}:{server_port} failed {e}')
             time.sleep(5)
@@ -135,7 +126,7 @@ def benchmark(model_path: str, backend: str, server_config: Dict, data_config: D
     print(f"Starting api_server: {' '.join(server_cmd)}", flush=True)
     proc = subprocess.Popen(server_cmd)
     # Wait for the server to be ready
-    wait_server_ready(server_ip, server_port)
+    get_served_model_name(server_ip, server_port)
     # Run benchmarks
     output_file = get_output_file(model_path, backend, server_config)
     for data in data_config:
@@ -166,25 +157,30 @@ def benchmark(model_path: str, backend: str, server_config: Dict, data_config: D
         proc.kill()
 
 
-def validate_config(config: Dict) -> None:
-    """Validate the configuration structure.
+def benchmark_proxy(backend: str, server_config: Dict, data_config: Dict | List[Dict]):
+    server_ip = server_config.get('server_ip', '0.0.0.0')
+    server_port = server_config.get('server_port', 8000)
 
-    Args:
-        config: Loaded configuration dictionary
-
-    Raises:
-        BenchmarkConfigError: If configuration is invalid
-    """
-    required_sections = ['api_server', 'engine', 'data']
-    for section in required_sections:
-        if section not in config:
-            raise ValueError(f'Missing required config section: {section}')
-
-    if not isinstance(config['engine'], (Dict, List)):
-        raise ValueError('engine config must be a dict or list of dicts')
+    if isinstance(data_config, Dict):
+        data_config = [data_config]
+    if not (isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config)):
+        raise ValueError('data_config must be a dict or list of dicts')
 
-    if not isinstance(config['data'], (Dict, List)):
-        raise ValueError('data config must be a dict or list of dicts')
+    try:
+        # Wait for the proxy_server to be ready
+        model_name = get_served_model_name(server_ip, server_port)
+        model_name = model_name.replace('/', '_')
+        # Run benchmarks
+        output_file = f'benchmark_proxy_{model_name}_{backend}.csv'
+        for data in data_config:
+            data = data.copy()
+            data['output_file'] = output_file
+            client_cmd = get_client_cmd(backend, server_ip, server_port, data)
+            print(f"Running benchmark: {' '.join(client_cmd)}")
+            subprocess.run(client_cmd, check=True)
+    except Exception as e:
+        print(f'Unexpected error: {e}')
+        raise
 
 
 def main(backend: str, config_path: str, model_path: Optional[str] = None):
@@ -197,11 +193,16 @@ def main(backend: str, config_path: str, model_path: Optional[str] = None):
     Raises:
         BenchmarkConfigError: If required parameters are missing or config is invalid
     """
-    with open(config_path, 'r') as f:
+    with open(config_path, 'r', encoding='utf-8') as f:
         config = yaml.safe_load(f)
     server_config = config['server']
     engine_configs = config['engine']
     data_config = config['data']
+
+    server_type = server_config.get('type', 'api_server')
+    if server_type == 'proxy':
+        benchmark_proxy(backend, server_config, data_config)
+        return
     if isinstance(engine_configs, Dict):
         engine_configs = [engine_configs]
     assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs)
diff --git a/benchmark/lmdeploy.yml b/benchmark/lmdeploy.yml
index 539da0a68e..1705ae23b7 100644
--- a/benchmark/lmdeploy.yml
+++ b/benchmark/lmdeploy.yml
@@ -4,6 +4,8 @@ dataset_name: &dataset_name "sharegpt"
 model_path: &model_path "Qwen/Qwen3-30B-A3B-FP8"
 server:
   server_port: 23333
+  # The type of the server. It is either "api_server" or "proxy".
+  type: "api_server"
 # Inference engine configuration
 engine:
   - model_path: *model_path
@@ -15,13 +17,6 @@ engine:
     cache_max_entry_count: 0.9
     max_prefill_token_num: 4096
     tp: 1
-  - model_path: "Qwen/Qwen3-235B-A22B-FP8"
-    max_batch_size: 64
-    cache_max_entry_count: 0.7
-    max_prefill_token_num: 4096
-    dp: 8
-    ep: 8
-    proxy_url: "http://localhost:8000"
 # Benchmark test configuration for profile_restful_api.py
 # Defines multiple test cases with different output lengths to evaluate API performance
 data:
diff --git a/docs/en/benchmark/evaluate_with_opencompass.md b/docs/en/benchmark/evaluate_with_opencompass.md
index 0cf0439221..6847d2a9bd 100644
--- a/docs/en/benchmark/evaluate_with_opencompass.md
+++ b/docs/en/benchmark/evaluate_with_opencompass.md
@@ -10,33 +10,44 @@ If sufficient computational resources are available, please refer to the [End-to
 
 ## Environment Setup
 
+Install LMDeploy and OpenCompass in separate Python virtual environments to avoid potential dependency conflicts.
+
+- install lmdeploy
+
 ```shell
+conda create -n lmdeploy python=3.10 -y
 pip install lmdeploy
-pip install "opencompass[full]"
-
 # Download the lmdeploy source code, which will be used in subsequent steps to access eval script and configuration
 git clone --depth=1 https://github.com/InternLM/lmdeploy.git
 ```
 
-It is recommended to install LMDeploy and OpenCompass in separate Python virtual environments to avoid potential dependency conflicts.
+- install opencompass
+
+```shell
+conda create -n opencompass python=3.10 -y
+pip install "opencompass[full]"
+```
 
 ## End-to-End Evaluation
 
 1. **Deploy Target Model**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server --server-port 10000 <--other-options>
 ```
 
 2. **Deploy Evaluation Model (Judger)**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server opencompass/CompassVerifier-32B --server-port 20000 --tp 2
 ```
 
 3. **Generate Evaluation Configuration and Execute**
 
 ```shell
+conda activate opencompass
 cd {the/root/path/of/lmdeploy/repo}
 
 ## Specify the dataset path. OC will download the datasets automatically if they are
@@ -64,12 +75,14 @@ This stage generates model responses for the dataset.
 1. **Deploy Target Model**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server --server-port 10000 <--other-options>
 ```
 
 2. **Generate Inference Configuration and Execute**
 
 ```shell
+conda activate opencompass
 cd {the/root/path/of/lmdeploy/repo}
 
 ## Specify the dataset path. OC will download the datasets automatically if they are
@@ -92,12 +105,15 @@ This stage uses the evaluation model (Judger) to assess the quality of inference
 1. **Deploy Evaluation Model (Judger)**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server opencompass/CompassVerifier-32B --server-port 20000 --tp 2 --session-len 65536
 ```
 
 2. **Generate Evaluation Configuration and Execute**
 
 ```shell
+conda activate opencompass
+
 cd {the/root/path/of/lmdeploy/repo}
 
 ## Specify the dataset path. OC will download the datasets automatically if they are
@@ -105,7 +121,7 @@ cd {the/root/path/of/lmdeploy/repo}
 export COMPASS_DATA_CACHE=/nvme1/shared/opencompass/.cache
 export HF_DATASETS_CACHE=/nvme4/huggingface_hub/datasets
 # Run evaluation task
-opencompass /path/to/judger_config.py -m eval -w {oc_output_dir} -r {yyyymmdd_hhmmss}
+python eval/eval.py {task_name} --mode eval --judger-server http://{judger-server-ip}:20000 -w {oc_output_dir} -r {yyyymmdd_hhmmss}
 ```
 
 Important Notes:
diff --git a/docs/zh_cn/benchmark/evaluate_with_opencompass.md b/docs/zh_cn/benchmark/evaluate_with_opencompass.md
index 3f46c3f6de..dc96ebb0ee 100644
--- a/docs/zh_cn/benchmark/evaluate_with_opencompass.md
+++ b/docs/zh_cn/benchmark/evaluate_with_opencompass.md
@@ -10,33 +10,44 @@
 
 ## 环境准备
 
+在不同的 Python 虚拟环境中分别安装 LMDeploy 和 OpenCompass,以避免可能的依赖冲突。
+
+- 安装 lmdeploy
+
 ```shell
+conda create -n lmdeploy python=3.10 -y
 pip install lmdeploy
-pip install "opencompass[full]"
-
 # 下载 lmdeploy 源码,在后续步骤中会使用到 eval/* 中的评测脚本和配置文件
 git clone --depth=1 https://github.com/InternLM/lmdeploy.git
 ```
 
-建议将 LMDeploy 和 OpenCompass 安装在不同的 Python 虚拟环境中,以避免可能的依赖冲突。
+- 安装 opencompass
+
+```shell
+pip install "opencompass[full]"
+```
 
 ## 端到端评测
 
 1. **部署待评测模型**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server --server-port 10000 <--other-options>
 ```
 
 2. **部署评测模型(Judger)**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server opencompass/CompassVerifier-32B --server-port 20000 --tp 2 --session-len 65536
 ```
 
 3. **生成评测配置并执行评测**
 
 ```shell
+conda activate opencompass
+
 cd {the/root/path/of/lmdeploy/repo}
 
 ## 指定数据集路径。如果在路径下没有找到评测数据集,OC会自动下载
@@ -62,12 +73,15 @@ python eval/eval.py {task_name} \
 
 1. **部署待评测模型**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server --server-port 10000 <--other-options>
 ```
 
 2. **生成推理配置并执行推理**
 
 ```shell
+conda activate opencompass
+
 cd {the/root/path/of/lmdeploy/repo}
 
 ## 指定数据集路径。如果在路径下没有找到评测数据集,OC会自动下载
@@ -91,12 +105,15 @@ python eval/eval.py {task_name} \
 
 1. **部署评测模型(Judger)**
 
 ```shell
+conda activate lmdeploy
 lmdeploy serve api_server opencompass/CompassVerifier-32B --server-port 20000 --tp 2
 ```
 
 2. **生成评判配置并执行评判**
 
 ```shell
+conda activate opencompass
+
 cd {the/root/path/of/lmdeploy/repo}
 
 ## 指定数据集路径。如果在路径下没有找到评测数据集,OC会自动下载
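For readers reproducing the proxy mode added above, a minimal sketch of what a proxy-type config for `benchmark_serving.py` could look like follows. It uses only the fields this patch actually reads (`server.type`, plus `server_ip`/`server_port`, which `benchmark_proxy` defaults to `0.0.0.0:8000`); the `data` entry and its field name are illustrative assumptions, not part of the change, and should mirror whatever `get_client_cmd`/`profile_restful_api.py` already expect.

```yaml
# Hypothetical proxy-mode config; values in the data section are placeholders.
server:
  # "proxy" makes main() call benchmark_proxy() against an already-running
  # proxy server instead of launching an api_server per engine entry.
  type: "proxy"
  server_ip: "0.0.0.0"   # defaults used by benchmark_proxy
  server_port: 8000

# main() still indexes config['engine'] before the proxy branch, so the key
# must exist even though benchmark_proxy never reads it.
engine: []

# Reuse the same data section as the api_server config; the exact fields are
# whatever profile_restful_api.py expects (output_len here is a placeholder).
data:
  - output_len: 1024
```

In this mode the proxy and the api_server instances registered to it are expected to be started out of band, which is consistent with dropping the per-engine `proxy_url` handling from `get_server_ip_port` in this change.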