Skip to content

Commit 6f18d6c

Browse files
committed
prompt updates, tracing
1 parent 261d0c7 commit 6f18d6c

File tree

3 files changed

+19
-10
lines changed

3 files changed

+19
-10
lines changed

fmbench/configs/deepseek/config-deepseek-r1-quant1.58-longbench-byoe.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ pricing: pricing.yml
152152
inference_parameters:
153153
ec2_llama_server:
154154
model: {model_id}
155-
temperature: 0.6
155+
temperature: 0.1
156156
# top_p: 0.92
157157
# top_k: 120
158158
# max_tokens: 512
@@ -199,10 +199,10 @@ experiments:
199199
serving.properties:
200200
# runs are done for each combination of payload file and concurrency level
201201
payload_files:
202-
# - payload_en_1-500.jsonl
203-
# - payload_en_500-1000.jsonl
204-
# - payload_en_1000-2000.jsonl
205-
# - payload_en_2000-3000.jsonl
202+
- payload_en_1-500.jsonl
203+
- payload_en_500-1000.jsonl
204+
- payload_en_1000-2000.jsonl
205+
- payload_en_2000-3000.jsonl
206206
- payload_en_3000-3840.jsonl
207207

208208
# concurrency level refers to number of requests sent in parallel to an endpoint
Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
1+
<think>
12
There can be multiple question answer pairs in the context.
23
As soon as you find the first question in the text below, immediately stop reading any further and just answer that question.
3-
Always start your response with "<think>\\n" at the beginning of every output.
4+
Always start your response with "<think>" at the beginning of every output and think step by step.
45
Keep your thinking process short and your answers concise, do not overthink.
5-
Put your final answer in one line starting with Answer:
6+
Make sure to always provide an answer; if you do not know the answer, then say "I do not know", but never leave the answer field empty in your response.
7+
</think>
68

9+
<answer>
10+
Put your final answer in one line starting with the word Answer:
11+
</answer>
712

13+
Here is the text for you to work on:
14+
15+
<text>
816
{input}
917

10-
{context}
18+
{context}
19+
</text>

fmbench/scripts/ec2_predictor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def get_prediction(self, payload: Dict) -> FMBenchPredictionResponse:
152152
elif container_type == constants.CONTAINER_TYPE_LLAMA_SERVER:
153153
payload2['prompt'] = payload2.pop('inputs')
154154
payload2 = payload2 | self._inference_spec["parameters"]
155-
logger.debug(f"payload={payload2}")
155+
logger.info(f"payload={payload2}")
156156
st = time.perf_counter()
157157
response = requests.post(self._endpoint_name, json=payload2)
158158
# record the latency for the response generated
@@ -195,7 +195,7 @@ def get_prediction(self, payload: Dict) -> FMBenchPredictionResponse:
195195
if full_output is None:
196196
logger.error(f"failed to extract output from response text, response text = \"{response.text}\"")
197197
else:
198-
logger.debug(f"full_output={full_output}")
198+
logger.info(f"full_output={full_output}")
199199
elif container_type == constants.CONTAINER_TYPE_OLLAMA:
200200
payload2['prompt'] = payload2.pop('inputs')
201201
payload2 = payload2 | self._inference_spec["parameters"]

0 commit comments

Comments
 (0)