From 9638be8c71b97db92066e187154cce61ea891ff9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arthur=20Ca=CC=82mara?=
Date: Fri, 29 Sep 2023 20:41:07 +0200
Subject: [PATCH 1/3] added rank_vicuna quantized

---
 rank_llm/rank_vicuna_q.py | 105 ++++++++++++++++++++++++++++++++++++++
 rank_llm/run_rank_llm.py  |  11 ++++
 requirements.txt          |   5 +-
 3 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 rank_llm/rank_vicuna_q.py

diff --git a/rank_llm/rank_vicuna_q.py b/rank_llm/rank_vicuna_q.py
new file mode 100644
index 00000000..20db34e4
--- /dev/null
+++ b/rank_llm/rank_vicuna_q.py
@@ -0,0 +1,105 @@
+import re
+from typing import Tuple, List, Union, Dict, Any
+
+from fastchat.model import load_model, get_conversation_template, add_model_args
+from ftfy import fix_text
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers.generation import GenerationConfig
+
+from rank_llm import RankLLM, PromptMode
+from llama_cpp import Llama, LlamaCache
+
+
+def replace_number(s):
+    return re.sub(r"\[(\d+)\]", r"(\1)", s)
+
+
+class RankVicunaQ(RankLLM):
+    def __init__(
+        self,
+        model: str,
+        context_size: int,
+        top_k_candidates: int,
+        dataset: str,
+        prompt_mode: PromptMode,
+        device: str,
+        num_gpus: int = 1,  # AFAIK, support for multiple GPUs is not very good in Llama.cpp.
+    ) -> None:
+        super().__init__(model, context_size, top_k_candidates, dataset, prompt_mode)
+        self._device = device
+        if self._device == "cuda":
+            assert torch.cuda.is_available()
+        if prompt_mode != PromptMode.RANK_GPT:
+            raise ValueError(
+                f"Unsupported prompt mode: {prompt_mode}. The only prompt mode currently supported by vicuna is a slight variation of Rank_GPT prompt."
+            )
+        # ToDo: Make repetition_penalty configurable
+        self._llm = Llama(
+            model_path=model,
+            n_ctx=context_size,
+            n_gpu_layers=-1,
+            verbose=True,
+        )
+        self._llm.set_cache(LlamaCache())
+
+    def run_llm(self, prompt: str) -> Tuple[str, int]:
+        output: Dict[str, Any] = self._llm(
+            prompt, max_tokens=self.max_tokens(), temperature=0.9, top_p=0.6
+        )  # type: ignore
+        text = output["choices"][0]["text"]
+        n_tokens = output["usage"]["completion_tokens"]
+        return text, n_tokens
+
+    def num_output_tokens(self) -> int:
+        return 200
+
+    def _add_prefix_prompt(self, query: str, num: int) -> str:
+        return f"I will provide you with {num} passages, each indicated by a numerical identifier []. Rank the passages based on their relevance to the search query: {query}.\n"
+
+    def _add_post_prompt(self, query: str, num: int) -> str:
+        return f"Search Query: {query}.\nRank the {num} passages above based on their relevance to the search query. All the passages should be included and listed using identifiers, in descending order of relevance. The output format should be [] > [], e.g., [4] > [2], Only respond with the ranking results, do not say any word or explain."
+
+    def create_prompt(
+        self, retrieved_result: Dict[str, Any], rank_start: int, rank_end: int
+    ) -> Tuple[str, int]:
+        query = retrieved_result["query"]
+        num = len(retrieved_result["hits"][rank_start:rank_end])
+        max_length = 300
+        while True:
+            conv = get_conversation_template(self._model)
+            # conv.set_system_message(
+            #     "You are RankVicuna, an intelligent assistant that can rank passages based on their relevancy to the query."
+            # )
+            prefix = self._add_prefix_prompt(query, num)
+            rank = 0
+            input_context = f"{prefix}\n"
+            for hit in retrieved_result["hits"][rank_start:rank_end]:
+                rank += 1
+                content = hit["content"]
+                content = content.replace("Title: Content: ", "")
+                content = content.strip()
+                # For Japanese should cut by character: content = content[:int(max_length)]
+                content = " ".join(content.split()[: int(max_length)])
+                input_context += f"[{rank}] {replace_number(content)}\n"
+
+            input_context += self._add_post_prompt(query, num)
+            conv.append_message(conv.roles[0], input_context)
+            prompt = conv.get_prompt() + " ASSISTANT:"
+            prompt = fix_text(prompt)
+            num_tokens = self.get_num_tokens(prompt)
+            if num_tokens <= self.max_tokens() - self.num_output_tokens():
+                break
+            else:
+                max_length -= max(
+                    1,
+                    (num_tokens - self.max_tokens() + self.num_output_tokens())
+                    // (rank_end - rank_start),
+                )
+        return prompt, self.get_num_tokens(prompt)
+
+    def get_num_tokens(self, prompt: str) -> int:
+        return len(self._tokenizer.encode(prompt))
+
+    def cost_per_1k_token(self, input_token: bool) -> float:
+        return 0
diff --git a/rank_llm/run_rank_llm.py b/rank_llm/run_rank_llm.py
index 2d272279..e292fc83 100644
--- a/rank_llm/run_rank_llm.py
+++ b/rank_llm/run_rank_llm.py
@@ -9,6 +9,7 @@
 from rank_gpt import SafeOpenai
 from rank_llm import PromptMode
 from rank_vicuna import RankVicuna
+from rank_vicuna_q import RankVicunaQ
 from topics_dict import TOPICS
 from trec_eval import EvalFunction

@@ -42,6 +43,16 @@ def main(args):
             prompt_mode=prompt_mode,
             keys=openai_keys,
         )
+    elif "q4" in model_path or "q8" in model_path:
+        agent = RankVicunaQ(
+            model=model_path,
+            context_size=context_size,
+            top_k_candidates=top_k_candidates,
+            dataset=dataset,
+            prompt_mode=prompt_mode,
+            device=device,
+            num_gpus=num_gpus,
+        )
     else:
         agent = RankVicuna(
             model=model_path,
diff --git a/requirements.txt b/requirements.txt
index 72e8f7aa..e59ee035 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,8 @@ tiktoken==0.4.0
 transformers==4.31.0
 pyserini==0.22.0
 python-dotenv==1.0.0
-faiss-gpu==1.7.2
+# faiss-gpu==1.7.2
 ftfy==6.1.1
 fschat==0.2.28
-accelerate==0.23.0
\ No newline at end of file
+accelerate==0.23.0
+llama-cpp-python==0.2.7

From 809717da04cefef01230b6eaa1156d6abd6ad6f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arthur=20Ca=CC=82mara?=
Date: Fri, 29 Sep 2023 21:04:42 +0200
Subject: [PATCH 2/3] fixed tokenizer

---
 rank_llm/rank_vicuna_q.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rank_llm/rank_vicuna_q.py b/rank_llm/rank_vicuna_q.py
index 20db34e4..d409f161 100644
--- a/rank_llm/rank_vicuna_q.py
+++ b/rank_llm/rank_vicuna_q.py
@@ -34,12 +34,11 @@ def __init__(
             raise ValueError(
                 f"Unsupported prompt mode: {prompt_mode}. The only prompt mode currently supported by vicuna is a slight variation of Rank_GPT prompt."
             )
-        # ToDo: Make repetition_penalty configurable
         self._llm = Llama(
             model_path=model,
             n_ctx=context_size,
             n_gpu_layers=-1,
-            verbose=True,
+            verbose=False,
         )
         self._llm.set_cache(LlamaCache())

     def run_llm(self, prompt: str) -> Tuple[str, int]:
@@ -99,7 +98,7 @@ def create_prompt(
         return prompt, self.get_num_tokens(prompt)

     def get_num_tokens(self, prompt: str) -> int:
-        return len(self._tokenizer.encode(prompt))
+        return len(self._llm.tokenize(prompt.encode()))

     def cost_per_1k_token(self, input_token: bool) -> float:
         return 0

From 507c086f19b873520c1b5401fc076f84455ce9a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arthur=20C=C3=A2mara?=
Date: Fri, 29 Sep 2023 21:14:16 +0200
Subject: [PATCH 3/3] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e59ee035..56def181 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ tiktoken==0.4.0
 transformers==4.31.0
 pyserini==0.22.0
 python-dotenv==1.0.0
-# faiss-gpu==1.7.2
+faiss-gpu==1.7.2
 ftfy==6.1.1
 fschat==0.2.28
 accelerate==0.23.0
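
Usage sketch (not part of the patches above): a minimal example of how the new quantized path would be exercised once all three patches are applied (patch 2 is needed for get_num_tokens to work). It assumes a local GGUF checkpoint at a hypothetical path whose filename contains "q8", the substring run_rank_llm.py keys on to route to RankVicunaQ, and it only uses the constructor, create_prompt, and run_llm signatures introduced in patch 1. The dataset name, query, and hits are illustrative placeholders.

    # Sketch only: checkpoint path, dataset name, and hits below are hypothetical.
    from rank_llm import PromptMode
    from rank_vicuna_q import RankVicunaQ

    # The filename contains "q8", so run_rank_llm.py's dispatch would pick RankVicunaQ.
    model_path = "checkpoints/rank_vicuna_7b_v1.q8_0.gguf"  # hypothetical local GGUF file

    agent = RankVicunaQ(
        model=model_path,
        context_size=4096,
        top_k_candidates=100,
        dataset="dl19",                    # illustrative dataset name
        prompt_mode=PromptMode.RANK_GPT,   # the only mode RankVicunaQ accepts
        device="cpu",                      # "cuda" requires torch.cuda.is_available()
    )

    # Illustrative retrieved result in the shape create_prompt() expects.
    retrieved_result = {
        "query": "effect of quantization on passage reranking",
        "hits": [
            {"content": "Title: Content: Quantized LLMs trade a little accuracy for much lower memory use."},
            {"content": "Title: Content: Listwise rerankers order retrieved passages by relevance to the query."},
        ],
    }

    prompt, prompt_tokens = agent.create_prompt(retrieved_result, rank_start=0, rank_end=2)
    ranking, completion_tokens = agent.run_llm(prompt)  # e.g. "[2] > [1]"
    print(ranking, prompt_tokens, completion_tokens)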