This toolkit is used for benchmark evaluation in the PyVision-RL project.

Create the environment and install dependencies:

conda create -n pyvision-eval python=3.10
pip install -r requirements.txt

# api_config.json
{
  "api_key": [
    "sk-**",
    "sk-**"
  ],
  "base_url": "xx"
}

First, modify the 'evaluation/run_eval_pl.sh' script.
#!/usr/bin/env bash
# evaluation/run_eval_pl.sh
# Single-image benchmark evaluation (V* bench, 32 repeats) of the PyVision-RL
# model served behind an OpenAI-compatible API endpoint.
set -euo pipefail

API_NAME_MODEL=pyvision-rl        # model name sent to the API client
API_NAME=pyvision-image-rl-7b     # label embedded in output file names
CLIENT_TYPE=openai
PROMPT_TYPE=vistool_with_img_info_v2

# Sampling parameters (also embedded in the output file names).
temp=0.6
top_p=1.0
top_k=20
min_p=0.0
repetition_penalty=1.0

BENCH_TYPE=vstar_32
DATA_FILE_PATH=./dataset/vstar/vstar_bench_repeat32.json
DATA_DIR_PATH=./dataset/vstar/results

# Make sure the results directory exists before the evaluator writes into it.
mkdir -p "${DATA_DIR_PATH}"

# Shared suffix so the results file and the messages dir always stay in sync.
RUN_TAG="${API_NAME}_${PROMPT_TYPE}_t${temp}_top_p${top_p}_top_k${top_k}_min_p${min_p}_rp${repetition_penalty}"

python ./evaluation/eval_gpt_pl.py \
  --bench_type "${BENCH_TYPE}" \
  --data_file_path "${DATA_FILE_PATH}" \
  --output_file_path "${DATA_DIR_PATH}/results_${RUN_TAG}.json" \
  --prompt_template ./prompt_template/prompt_template_vis.json \
  --prompt "${PROMPT_TYPE}" \
  --max_tokens 16000 \
  --temperature "${temp}" \
  --top_p "${top_p}" \
  --top_k "${top_k}" \
  --min_p "${min_p}" \
  --repetition_penalty "${repetition_penalty}" \
  --messages_save_dir "${DATA_DIR_PATH}/test_generated_messages_${RUN_TAG}" \
  --api_config_path ./api_config_intern.json \
  --client_type "${CLIENT_TYPE}" \
  --api_name "${API_NAME_MODEL}" \
  --exe_code \
  --max_images 50
Then run it:

bash evaluation/run_eval_pl.sh

First, modify the 'evaluation/run_eval_video_pl.sh' script.
#!/usr/bin/env bash
# evaluation/run_eval_video_pl.sh
# Video benchmark evaluation (VSI-Bench) of the PyVision-RL model served
# behind an OpenAI-compatible API endpoint.
set -euo pipefail

API_NAME_MODEL=pyvision-rl        # model name sent to the API client
API_NAME=pyvision-rl-wohint-vsi-std-sort-enable-bs16-multitask-verifier-1113-000001-wo-stdnorm-steps750
CLIENT_TYPE=openai
PROMPT_TYPE=vis_tool_with_img_info_video_v4
BENCH_TYPE=vsi_bench
DATA_FILE_PATH=./dataset/VSI-Bench/vsi_bench.json
DATA_DIR_PATH=./dataset/VSI-Bench/results
NUM_FRAMES=0                      # 0 = let the evaluator pick its default frame sampling

# Sampling parameters (also embedded in the output file names).
temp=0.01
top_p=1.0
top_k=0
min_p=0.0

# Make sure the results directory exists before the evaluator writes into it.
mkdir -p "${DATA_DIR_PATH}"

# Shared suffix so the results file and the messages dir always stay in sync.
# (Fixes a bug where the messages dir hardcoded "t0.6" while temp=0.01.)
RUN_TAG="${API_NAME}_${PROMPT_TYPE}_t${temp}_${NUM_FRAMES}frames_top_p${top_p}_top_k${top_k}_min_p${min_p}"

python ./evaluation/eval_gpt_video_pl.py \
  --bench_type "${BENCH_TYPE}" \
  --data_file_path "${DATA_FILE_PATH}" \
  --output_file_path "${DATA_DIR_PATH}/results_${RUN_TAG}.json" \
  --prompt_template ./prompt_template/prompt_template_vis.json \
  --prompt "${PROMPT_TYPE}" \
  --max_tokens 10000 \
  --temperature "${temp}" \
  --top_p "${top_p}" \
  --top_k "${top_k}" \
  --min_p "${min_p}" \
  --messages_save_dir "${DATA_DIR_PATH}/generated_messages_${RUN_TAG}" \
  --api_config_path ./api_config_intern.json \
  --client_type "${CLIENT_TYPE}" \
  --api_name "${API_NAME_MODEL}" \
  --exe_code \
  --sampled_frames_num "${NUM_FRAMES}"
Then run it:

bash evaluation/run_eval_video_pl.sh

Upload the results here: https://huggingface.co/spaces/Agents-X/data-view
@article{zhao2026pyvisionrl,
  title={PyVision-RL: Forging Open Agentic Vision Models via RL},
  author={Zhao, Shitian and Lin, Shaoheng and Li, Ming and Zhang, Haoquan and Peng, Wenshuo and Zhang, Kaipeng and Wei, Chen},
  journal={arXiv preprint arXiv:2602.20739},
  year={2026}
}