diff --git a/src/config.yaml b/src/config.yaml index 70bf4da..099af3d 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -39,5 +39,4 @@ llmargs: temperature: 0.5 max_tokens: 1024 output: ../output/AspectAdded/semeval-agg/aspectAdded.pkl - - + top_k_aspects: 1 diff --git a/src/llm/aspect_extraction_pipeline.py b/src/llm/aspect_extraction_pipeline.py index 5f7a1cb..fedce16 100644 --- a/src/llm/aspect_extraction_pipeline.py +++ b/src/llm/aspect_extraction_pipeline.py @@ -17,7 +17,7 @@ class LLMReviewProcessor: def __init__(self, cfg: DictConfig): self.cfg = cfg self.llm_handler = self._init_llm_handler() - self.prompt_builder = PromptBuilder() + self.prompt_builder = PromptBuilder(top_k=cfg.llmargs.top_k_aspects) def _init_llm_handler(self): config = LLMconfig( @@ -39,7 +39,6 @@ def find_aspect_indices(aspect: str, sentence_tokens) : for i in range(len(tokens) - len(aspect_tokens) + 1): if tokens[i:i + len(aspect_tokens)] == aspect_tokens: return list(range(i, i + len(aspect_tokens))) - return -1 def process_reviews(self, reviews: list): @@ -49,13 +48,32 @@ def process_reviews(self, reviews: list): if sample_review.get('implicit', [False])[0] is not True: continue prompt = self.prompt_builder.build_prompt(sample_review) - response = self.llm_handler.get_response(prompt) - matches = re.findall(r'\{.*?\}', response, re.DOTALL) + + max_retries = 5 + valid_json_found = False + matches = [] + + for attempt in range(max_retries): + response = self.llm_handler.get_response(prompt) + matches = re.findall(r'\{.*?\}', response, re.DOTALL) + + for json_str in matches: + try: + aspect_data = json.loads(json_str) + if "aspect" in aspect_data and aspect_data["aspect"]: + valid_json_found = True + break + except json.JSONDecodeError: + continue + + if valid_json_found: + break + else: + print(f"Invalid or no valid JSON with 'aspect' found. Attempt {attempt + 1} of {max_retries}") if not matches: print("No JSON object found in response") continue - all_aspects = [] seen_aspects = set() tokens = [word.strip().lower() for sentences in sample_review["sentences"] for word in sentences] diff --git a/src/llm/prompt_builder.py b/src/llm/prompt_builder.py index 2ccd27a..d47d7f2 100644 --- a/src/llm/prompt_builder.py +++ b/src/llm/prompt_builder.py @@ -1,16 +1,18 @@ class PromptBuilder: - def __init__(self, task_description=None): + def __init__(self, task_description=None, top_k=1): self.task_description = task_description or ( "Identify the latent aspect targeted by the sentiment in the review. " "If the aspect is explicitly mentioned, return its index; if it's implicit, return the inferred aspect and use index -1." ) + self.top_k = top_k def build_prompt(self, review_entry: dict) -> str: review_text = ' '.join(review_entry['sentences'][0]) prompt = ( f"Review: \"{review_text}\"\n" f"Task: {self.task_description}\n" + f"Return exactly the top {self.top_k} aspect(s) that best represent the sentiment in this review.\n" f"Output Format: {{\"aspect\": \"\", \"index\": }}" )