diff --git a/examples/agent/evaluation/agent_eval.py b/examples/agent/evaluation/agent_eval.py
new file mode 100644
index 0000000..3ae5fa8
--- /dev/null
+++ b/examples/agent/evaluation/agent_eval.py
@@ -0,0 +1,131 @@
+import os
+import logging
+
+import deepeval
+import datetime as dt
+from typing import Any
+
+from deepeval import evaluate
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.test_case import LLMTestCase, ToolCall
+from deepeval.metrics import TaskCompletionMetric
+from langfuse import Langfuse
+from langfuse.api import TraceWithDetails
+from langchain_openai import ChatOpenAI
+
+
+class DeepEvalOpenAI(DeepEvalBaseLLM):
+    def __init__(self, model):
+        self.model = model
+
+    def load_model(self):
+        return self.model
+
+    def generate(self, prompt: str) -> str:
+        chat_model = self.load_model()
+        return chat_model.invoke(prompt).content
+
+    async def a_generate(self, prompt: str) -> str:
+        chat_model = self.load_model()
+        res = await chat_model.ainvoke(prompt)
+        return res.content
+
+    def get_model_name(self):
+        return "Custom OpenAI-Compatible Model"
+
+
+# Fetch traces from Langfuse within the lookback window
+def fetch_traces(langfuse_cli: Any, lookback_minutes: int) -> list[TraceWithDetails]:
+    # dt.timezone.utc (not dt.UTC) keeps this compatible with Python 3.10, which the CI workflow pins
+    now_timestamp = dt.datetime.now(dt.timezone.utc)
+    from_timestamp = now_timestamp - dt.timedelta(minutes=lookback_minutes)
+    try:
+        response = langfuse_cli.fetch_traces(from_timestamp=from_timestamp, to_timestamp=now_timestamp)
+        return response.data
+    except Exception as e:
+        logging.error(f"Failed to get traces: {e}")
+        return []
+
+
+# Wrap a custom LLM via the LangChain SDK
+def get_model(model_name: str) -> DeepEvalBaseLLM:
+    model = ChatOpenAI(
+        model=model_name,
+        temperature=0,
+        max_tokens=None,
+        timeout=None,
+        max_retries=2,
+        api_key=os.getenv("OPENAI_API_KEY"),
+        base_url=os.getenv("OPENAI_API_BASE"),
+    )
+    return DeepEvalOpenAI(model=model)
+
+
+# Convert Langfuse traces into DeepEval test cases
+def handle_traces(traces: list[TraceWithDetails]) -> list[LLMTestCase]:
+    test_cases = []
+
+    for t in traces:
+        tools_called_map = {}
+        tools_called_list = []
+        actual_output = ""
+        user_input = t.input["messages"]
+
+        if isinstance(t.output, str):
+            logging.error(t)
+        elif isinstance(t.output, dict) and "messages" in t.output:
+            for message in t.output["messages"]:
+                tool_calls = message.get("tool_calls", [])
+                if isinstance(tool_calls, list) and len(tool_calls) > 0:
+                    for tool_call in tool_calls:
+                        tools_called_map[tool_call["id"]] = ToolCall(
+                            name=tool_call["name"],
+                            input_parameters=tool_call["args"],
+                            output=None,
+                        )
+                if message["type"] == "tool":
+                    tool_call_id = message.get("tool_call_id")
+                    if tool_call_id in tools_called_map:
+                        tools_called_map[tool_call_id].output = message["content"]
+                if message["type"] == "ai" and message.get("response_metadata", {}).get("finish_reason") == "stop":
+                    actual_output = message["content"]
+
+        for _, v in tools_called_map.items():
+            tools_called_list.append(v)
+
+        test_case = LLMTestCase(
+            input=user_input,
+            actual_output=actual_output,
+            tools_called=tools_called_list,
+        )
+        test_cases.append(test_case)
+
+    return test_cases
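+
+
+# A minimal sketch of the LangGraph-style trace shape that handle_traces()
+# expects; the exact field names are assumptions inferred from the parsing
+# logic above, not a guaranteed Langfuse/LangGraph schema:
+#
+# t.input  = {"messages": "<user query>"}
+# t.output = {
+#     "messages": [
+#         {"type": "ai", "content": "",
+#          "tool_calls": [{"id": "call_1", "name": "add", "args": {"a": 1, "b": 2}}],
+#          "response_metadata": {"finish_reason": "tool_calls"}},
+#         {"type": "tool", "tool_call_id": "call_1", "content": "3"},
+#         {"type": "ai", "content": "1 + 2 = 3", "tool_calls": [],
+#          "response_metadata": {"finish_reason": "stop"}},
+#     ]
+# }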
+
+
+if __name__ == "__main__":
+    # Get keys for your project from the project settings page
+    os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxx"  # your langfuse public key
+    os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxx"  # your langfuse secret key
+    os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx"  # your langfuse host
+    os.environ["DEEPEVAL_RESULTS_FOLDER"] = "/Users/deepeval_result"  # local path for saving evaluation results (recommended)
+    CONFIDENT_API_KEY = "xxxxxxxx"  # Confident AI API key (optional)
+
+    llm = get_model(model_name=os.getenv("LLM_ID", ""))  # your evaluation model name (read from LLM_ID, matching test_llm_app.py)
+
+    metric = TaskCompletionMetric(
+        threshold=0.7,
+        model=llm,
+        include_reason=True
+    )
+
+    langfuse = Langfuse()
+    lookback_minutes = 30
+    traces = fetch_traces(langfuse_cli=langfuse, lookback_minutes=lookback_minutes)
+    logging.info(f"Fetched {len(traces)} traces for last {lookback_minutes} minutes.")
+
+    deepeval.login_with_confident_api_key(CONFIDENT_API_KEY)
+
+    test_cases = handle_traces(traces=traces)
+    logging.info(f"Got {len(test_cases)} test cases.")
+
+    # Evaluate end-to-end
+    evaluate(test_cases=test_cases, metrics=[metric])
diff --git a/examples/agent/evaluation/eval-actions-demo.yaml b/examples/agent/evaluation/eval-actions-demo.yaml
new file mode 100644
index 0000000..04aa32c
--- /dev/null
+++ b/examples/agent/evaluation/eval-actions-demo.yaml
@@ -0,0 +1,48 @@
+name: LLM App Unit Testing
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python3 -
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install Dependencies
+        run: poetry install --no-root
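+
+      # The steps below assume the following repository secrets are configured
+      # (names taken from the env wiring in this workflow): OPENAI_API_KEY,
+      # OPENAI_API_BASE, LLM_ID, and CONFIDENT_API_KEY.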
+
+      - name: Set OpenAI API Key
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> $GITHUB_ENV
+
+      - name: Set OpenAI API Base
+        env:
+          OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
+        run: echo "OPENAI_API_BASE=$OPENAI_API_BASE" >> $GITHUB_ENV
+
+      - name: Set LLM
+        env:
+          LLM_ID: ${{ secrets.LLM_ID }}
+        run: echo "LLM_ID=$LLM_ID" >> $GITHUB_ENV
+
+      - name: Login to Confident AI
+        env:
+          CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
+        run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY"
+
+      - name: Run DeepEval Test Run
+        run: poetry run deepeval test run examples/agent/evaluation/test_llm_app.py -i
diff --git a/examples/agent/evaluation/test_llm_app.py b/examples/agent/evaluation/test_llm_app.py
new file mode 100644
index 0000000..65209b1
--- /dev/null
+++ b/examples/agent/evaluation/test_llm_app.py
@@ -0,0 +1,129 @@
+import os
+import logging
+import pytest
+import datetime as dt
+from typing import Any
+
+from deepeval import assert_test
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.test_case import LLMTestCase, ToolCall
+from deepeval.metrics import TaskCompletionMetric
+from langfuse import Langfuse
+from langfuse.api import TraceWithDetails
+from langchain_openai import ChatOpenAI
+from deepeval.dataset import EvaluationDataset
+
+
+class DeepEvalOpenAI(DeepEvalBaseLLM):
+    def __init__(self, model):
+        self.model = model
+
+    def load_model(self):
+        return self.model
+
+    def generate(self, prompt: str) -> str:
+        chat_model = self.load_model()
+        return chat_model.invoke(prompt).content
+
+    async def a_generate(self, prompt: str) -> str:
+        chat_model = self.load_model()
+        res = await chat_model.ainvoke(prompt)
+        return res.content
+
+    def get_model_name(self):
+        return "Custom OpenAI-Compatible Model"
+
+
+# Fetch traces from Langfuse within the lookback window
+def fetch_traces(langfuse_cli: Any, lookback_minutes: int) -> list[TraceWithDetails]:
+    now_timestamp = dt.datetime.now(dt.timezone.utc)
+    from_timestamp = now_timestamp - dt.timedelta(minutes=lookback_minutes)
+    try:
+        response = langfuse_cli.fetch_traces(from_timestamp=from_timestamp, to_timestamp=now_timestamp)
+        return response.data
+    except Exception as e:
+        logging.error(f"Failed to get traces: {e}")
+        return []
+
+
+# Wrap a custom LLM via the LangChain SDK
+def get_model(model_name: str) -> DeepEvalBaseLLM:
+    model = ChatOpenAI(
+        model=model_name,
+        temperature=0,
+        max_tokens=None,
+        timeout=None,
+        max_retries=2,
+        api_key=os.getenv("OPENAI_API_KEY"),
+        base_url=os.getenv("OPENAI_API_BASE"),
+    )
+    return DeepEvalOpenAI(model=model)
+
+
+# Get keys for your project from the project settings page
+os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-xxxxxx"  # your langfuse public key
+os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-xxxxxx"  # your langfuse secret key
+os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx"  # your langfuse host
+os.environ["DEEPEVAL_RESULTS_FOLDER"] = "/Users/deepeval_result"  # local path for saving evaluation results
+
+llm = get_model(model_name=os.getenv("LLM_ID"))
+
+metric = TaskCompletionMetric(
+    threshold=0.7,
+    model=llm,
+    include_reason=True
+)
+
+langfuse = Langfuse()
+lookback_minutes = 30
+traces = fetch_traces(langfuse_cli=langfuse, lookback_minutes=lookback_minutes)
+logging.info(f"Fetched {len(traces)} traces for last {lookback_minutes} minutes.")
+
+test_cases = []
+
+for t in traces:
+    tools_called_map = {}
+    tools_called_list = []
+    actual_output = ""
+    user_input = t.input["messages"]
+
+    if isinstance(t.output, str):
+        logging.error(t)
+    elif isinstance(t.output, dict) and "messages" in t.output:
+        for message in t.output["messages"]:
+            tool_calls = message.get("tool_calls", [])
+            if isinstance(tool_calls, list) and len(tool_calls) > 0:
+                for tool_call in tool_calls:
+                    tools_called_map[tool_call["id"]] = ToolCall(
+                        name=tool_call["name"],
+                        input_parameters=tool_call["args"],
+                        output=None,
+                    )
+            if message["type"] == "tool":
+                tool_call_id = message.get("tool_call_id")
+                if tool_call_id in tools_called_map:
+                    tools_called_map[tool_call_id].output = message["content"]
+            if message["type"] == "ai" and message.get("response_metadata", {}).get("finish_reason") == "stop":
+                actual_output = message["content"]
+
+    for _, v in tools_called_map.items():
+        tools_called_list.append(v)
+
+    test_case = LLMTestCase(
+        input=user_input,
+        actual_output=actual_output,
+        tools_called=tools_called_list,
+    )
+    test_cases.append(test_case)
+
+# Build the dataset once, after all test cases have been collected
+dataset = EvaluationDataset(test_cases=test_cases)
+
+logging.info(f"Got {len(test_cases)} test cases.")
+
+
+# Loop through test cases
+@pytest.mark.parametrize("test_case", dataset)
+def test_llm_app(test_case: LLMTestCase):
+    assert_test(test_case, [metric])
+
+# RUN CMD
+# deepeval test run examples/agent/evaluation/test_llm_app.py -i
diff --git a/examples/agent/langgraph-agent.py b/examples/agent/langgraph-agent.py
new file mode 100644
index 0000000..e34f71d
--- /dev/null
+++ b/examples/agent/langgraph-agent.py
@@ -0,0 +1,52 @@
+import os
+import asyncio
+
+from langchain_openai import ChatOpenAI
+from langgraph.prebuilt import create_react_agent
+from langchain_mcp_adapters.client import MultiServerMCPClient
+from langfuse.callback import CallbackHandler
+
+
+# ReAct agent + MCP tools
+async def multi_tool_demo(model: ChatOpenAI, query: str, config: dict):
+    async with MultiServerMCPClient({
+        "math": {
+            "command": "python",
+            # Make sure to update this to the full absolute path of your math_server.py file
+            "args": ["math_server.py"],
+            "transport": "stdio",
+        },
+    }) as client:
+        agent = create_react_agent(model, client.get_tools())
+        try:
+            response = await agent.ainvoke({"messages": query}, config=config)
+            print(f"\nTool call results (query: {query}):")
+            for m in response['messages']:
+                m.pretty_print()
+        except Exception as e:
+            print(f"Tool call failed: {e}")
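+
+# A minimal sketch of running this demo end-to-end (paths and endpoint values
+# are assumptions; adjust them to your layout):
+#
+#   pip install -r examples/agent/requirements.txt
+#   export OPENAI_API_KEY=sk-...
+#   export OPENAI_API_BASE=https://your-openai-compatible-endpoint/v1
+#   python examples/agent/langgraph-agent.py
+#
+# The ReAct agent is expected to answer the riddle below by chaining calls to
+# the add/subtract/multiply/divide tools served over stdio by math_server.py.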
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-***" # your langfuse secret key + os.environ["LANGFUSE_HOST"] = "http://xx.xx.xx.xx" # your langfuse host + + query = "今有雉兔同笼,上有三十五头,下有九十四足,问雉兔各几何?(请使用我给你提供的工具)" + + # init model + model = ChatOpenAI( + model="", + api_key=os.getenv("OPENAI_API_KEY"), + base_url=os.getenv("OPENAI_API_BASE"), + ) + + # Initialize Langfuse CallbackHandler for Langchain (tracing) + langfuse_handler = CallbackHandler() + config = {"callbacks": [langfuse_handler]} + + # invoke agent + async def run_tools(): + await multi_tool_demo(model=model, query=query, config=config) + + asyncio.run(run_tools()) diff --git a/examples/agent/math_server.py b/examples/agent/math_server.py new file mode 100644 index 0000000..2910908 --- /dev/null +++ b/examples/agent/math_server.py @@ -0,0 +1,33 @@ +from mcp.server.fastmcp import FastMCP + +mcp = FastMCP("Math") + + +@mcp.tool() +def add(a: int, b: int) -> int: + """Add two numbers""" + return a + b + + +@mcp.tool() +def subtract(a: int, b: int) -> int: + """Subtract b from a""" + return a - b + + +@mcp.tool() +def multiply(a: int, b: int) -> int: + """Multiply two numbers""" + return a * b + + +@mcp.tool() +def divide(a: int, b: int) -> float: + """Divide a by b""" + if b == 0: + raise ValueError("Division by zero is not allowed.") + return a / b + + +if __name__ == "__main__": + mcp.run(transport="stdio") diff --git a/examples/agent/requirements.txt b/examples/agent/requirements.txt new file mode 100644 index 0000000..4bbb3ac --- /dev/null +++ b/examples/agent/requirements.txt @@ -0,0 +1,7 @@ +langchain-openai +langgraph +langchain-mcp-adapters +langfuse +mcp +deepeval +pytest \ No newline at end of file