Skip to content

Commit 6f18d6c

Browse files
committed
prompt updates, tracing
1 parent 261d0c7 commit 6f18d6c

File tree

3 files changed

+19
-10
lines changed

3 files changed

+19
-10
lines changed

fmbench/configs/deepseek/config-deepseek-r1-quant1.58-longbench-byoe.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ pricing: pricing.yml
152152
inference_parameters:
153153
ec2_llama_server:
154154
model: {model_id}
155-
temperature: 0.6
155+
temperature: 0.1
156156
# top_p: 0.92
157157
# top_k: 120
158158
# max_tokens: 512
@@ -199,10 +199,10 @@ experiments:
199199
serving.properties:
200200
# runs are done for each combination of payload file and concurrency level
201201
payload_files:
202-
# - payload_en_1-500.jsonl
203-
# - payload_en_500-1000.jsonl
204-
# - payload_en_1000-2000.jsonl
205-
# - payload_en_2000-3000.jsonl
202+
- payload_en_1-500.jsonl
203+
- payload_en_500-1000.jsonl
204+
- payload_en_1000-2000.jsonl
205+
- payload_en_2000-3000.jsonl
206206
- payload_en_3000-3840.jsonl
207207

208208
# concurrency level refers to number of requests sent in parallel to an endpoint
Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
1+
<think>
12
There can be multiple question answer pairs in the context.
23
As soon as you find the first question in the text below, immediately stop reading any further and just answer that question.
3-
Always start your response with "<think>\\n" at the beginning of every output.
4+
Always start your response with "<think>" at the beginning of every output and think step by step.
45
Keep your thinking process short and your answers concise, do not overthink.
5-
Put your final answer in one line starting with Answer:
6+
Make sure to always provide an answer; if you do not know the answer, then say "I do not know", but never leave the answer field empty in your response.
7+
</think>
68

9+
<answer>
10+
Put your final answer in one line starting with the word Answer:
11+
</answer>
712

13+
Here is the text for you to work on:
14+
15+
<text>
816
{input}
917

10-
{context}
18+
{context}
19+
</text>

fmbench/scripts/ec2_predictor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def get_prediction(self, payload: Dict) -> FMBenchPredictionResponse:
152152
elif container_type == constants.CONTAINER_TYPE_LLAMA_SERVER:
153153
payload2['prompt'] = payload2.pop('inputs')
154154
payload2 = payload2 | self._inference_spec["parameters"]
155-
logger.debug(f"payload={payload2}")
155+
logger.info(f"payload={payload2}")
156156
st = time.perf_counter()
157157
response = requests.post(self._endpoint_name, json=payload2)
158158
# record the latency for the response generated
@@ -195,7 +195,7 @@ def get_prediction(self, payload: Dict) -> FMBenchPredictionResponse:
195195
if full_output is None:
196196
logger.error(f"failed to extract output from response text, response text = \"{response.text}\"")
197197
else:
198-
logger.debug(f"full_output={full_output}")
198+
logger.info(f"full_output={full_output}")
199199
elif container_type == constants.CONTAINER_TYPE_OLLAMA:
200200
payload2['prompt'] = payload2.pop('inputs')
201201
payload2 = payload2 | self._inference_spec["parameters"]

0 commit comments

Comments
 (0)