@@ -157,6 +157,7 @@ async def send_stream_request(
     prompt: str,
     prompt_len: int,
     output_len: int,
+    ignore_eos: bool,
     best_of: int,
     use_beam_search: bool,
     top_k: int,
@@ -180,7 +181,7 @@ async def send_stream_request(
         "temperature": 0.0 if use_beam_search else 1.0,
         "top_p": 1.0,
         "max_tokens": output_len,
-        "ignore_eos": True,
+        "ignore_eos": ignore_eos,
         "stream": True,
     }
   elif backend == "jetstream":
@@ -264,6 +265,7 @@ async def send_request(
     prompt: str,
     prompt_len: int,
     output_len: int,
+    ignore_eos: bool,
     best_of: int,
     use_beam_search: bool,
     top_k: int,
@@ -287,7 +289,7 @@ async def send_request(
         "temperature": 0.0 if use_beam_search else 1.0,
         "top_p": 1.0,
         "max_tokens": output_len,
-        "ignore_eos": False,
+        "ignore_eos": ignore_eos,
         "stream": False,
     }
   elif backend == "tgi":
@@ -418,11 +420,11 @@ async def run_single_request(args: argparse.Namespace, api_url: str, tokenizer:
     prompt: str, prompt_len: int, output_len: int, chosen_model: str) -> Tuple[str, Tuple]:
   if args.stream_request:
     result = await send_stream_request(
-        args.backend, api_url, prompt, prompt_len, output_len,
+        args.backend, api_url, prompt, prompt_len, output_len, args.ignore_eos,
         args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model, args.request_timeout,)
   else:
     result = await send_request(
-        args.backend, api_url, prompt, prompt_len, output_len,
+        args.backend, api_url, prompt, prompt_len, output_len, args.ignore_eos,
         args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model, args.request_timeout,)
   return chosen_model, result
@@ -973,6 +975,14 @@ def parse_traffic_split(arg):
           "Maximum number of input tokens for filtering the benchmark dataset."
       ),
   )
+  parser.add_argument(
+      "--ignore-eos",
+      action="store_true",
+      help=(
+          "If set and the model server is vllm, the generation process will ignore the end-of-sequence (EOS) token, "
+          "allowing output to continue until reaching --max-output-length or another stopping condition."
+      ),
+  )
   parser.add_argument(
       "--top-k",
       type=int,
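For context, a minimal sketch of how the new flag reaches the vLLM request body after this change. The argparse wiring and payload fields mirror the hunks above, while the parsed argument list, output length, and other values below are illustrative placeholders rather than values taken from the benchmark itself:

import argparse

# Hypothetical, stripped-down reproduction of the flag plumbing shown above.
parser = argparse.ArgumentParser()
parser.add_argument("--ignore-eos", action="store_true")
args = parser.parse_args(["--ignore-eos"])  # simulate passing the flag on the CLI

output_len = 128  # placeholder for the benchmark's sampled output length

# With this change, the vLLM payload takes ignore_eos from the CLI flag
# instead of hard-coding True (streaming) or False (non-streaming).
pload = {
    "temperature": 1.0,
    "top_p": 1.0,
    "max_tokens": output_len,
    "ignore_eos": args.ignore_eos,
    "stream": False,
}
print(pload)  # ignore_eos is True here because --ignore-eos was supplied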