MODEL_URL ?=
MODEL_NAME ?=
MODEL_DIR ?= models

# Download $(MODEL_URL) to $(MODEL_DIR)/$(MODEL_NAME).
#
# The payload is written to a ".tmp" sibling first and renamed into place only
# when curl succeeds. Both the curl output and the rename use the same
# $(MODEL_DIR)-qualified paths; previously the rename omitted $(MODEL_DIR), so
# it always failed (silently, because of 2>/dev/null) and the fallback `rm`
# deleted the freshly downloaded file together with any existing model.
# If either step fails, the partial download and the target are removed.
.PHONY: download-model
download-model:
	mkdir -p $(MODEL_DIR)
	curl -H "Cache-Control: no-cache" -s -S -L -f $(MODEL_URL) -z $(MODEL_DIR)/$(MODEL_NAME) -o $(MODEL_DIR)/$(MODEL_NAME).tmp && \
	mv -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME) 2>/dev/null || \
	rm -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME)

# Convenience target for the default model. $(MAKE) is used instead of a
# literal `make` so flags and the jobserver propagate to the sub-make.
.PHONY: download-model-mistral # default model
download-model-mistral:
	MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf $(MAKE) -f Makefile download-model
deploy with langserve diff --git a/recipes/natural_language_processing/text-to-json/requirements.txt b/recipes/natural_language_processing/text-to-json/requirements.txt new file mode 100644 index 00000000..5065c924 --- /dev/null +++ b/recipes/natural_language_processing/text-to-json/requirements.txt @@ -0,0 +1,42 @@ +aiohttp==3.9.5 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==4.3.0 +attrs==23.2.0 +certifi==2024.2.2 +charset-normalizer==3.3.2 +dataclasses-json==0.6.4 +fastapi==0.110.2 +frozenlist==1.4.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.0 +idna==3.7 +jsonpatch==1.33 +jsonpointer==2.4 +langchain==0.1.16 +langchain-community==0.0.34 +langchain-core==0.1.45 +langchain-text-splitters==0.0.1 +langserve==0.1.0 +langsmith==0.1.49 +marshmallow==3.21.1 +multidict==6.0.5 +mypy-extensions==1.0.0 +numpy==1.26.4 +orjson==3.10.1 +packaging==23.2 +pathlib==1.0.1 +pydantic==2.7.0 +pydantic_core==2.18.1 +PyYAML==6.0.1 +requests==2.31.0 +sniffio==1.3.1 +SQLAlchemy==2.0.29 +starlette==0.37.2 +tenacity==8.2.3 +typing-inspect==0.9.0 +typing_extensions==4.11.0 +urllib3==2.2.1 +validators==0.28.1 +yarl==1.9.4 diff --git a/recipes/natural_language_processing/text-to-json/source/main.py b/recipes/natural_language_processing/text-to-json/source/main.py new file mode 100644 index 00000000..26d238af --- /dev/null +++ b/recipes/natural_language_processing/text-to-json/source/main.py @@ -0,0 +1,76 @@ +from langchain_core.callbacks import StreamingStdOutCallbackHandler +# from langchain.chains import LLMChain +# from langchain_core.prompts.prompt import PromptTemplate +from langchain_openai import OpenAI +from langchain_core.output_parsers import JsonOutputParser +from fastapi import FastAPI +import tiktoken +import os +import json +# import streamlit as st +from pprint import pprint +from pathlib import Path +# from langchain_text_splitters import RecursiveJsonSplitter + +app = FastAPI() +model_path = os.getenv("MODEL_PATH", default="/locallm/models") +model_name = 
model_name = os.getenv("MODEL_NAME", default="mistral-7b-instruct-v0.1.Q4_K_M.gguf")
model_port = os.getenv("MODEL_PORT", default=8001)
model_server_ip = os.getenv("MODEL_SERVER_ENDPOINT", default="http://localhost")
model = f"{model_path}/{model_name}"
base_model_service = f"{model_server_ip}:{model_port}"
v1_model_service = f"{base_model_service}/v1"

revision = os.getenv("MODEL_REVISION", default="no_timm")

# Directory holding the JSON schema files, relative to the working directory.
SCHEMA_DIR = Path("schemas")


def log_template(string: str) -> None:
    """Pretty-print the rendered prompt between two banner lines.

    Args:
        string: The fully rendered prompt text.
    """
    print("==================== PROMPT TEMPLATE ====================")
    pprint(string)
    print("================== END PROMPT TEMPLATE ==================")


# App function
def count_tokens(string: str) -> int:
    """Return the number of tokens in *string* under tiktoken's ``p50k_base``.

    NOTE(review): presumably only an estimate for the served model, which is
    configured as Mistral elsewhere in this module -- confirm.

    Args:
        string: Text to tokenize.

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.get_encoding("p50k_base")
    return len(encoding.encode(string))


# MS Function
def initialize_model_client() -> OpenAI:
    """Build the LangChain ``OpenAI`` client pointed at the local model server.

    The client targets ``v1_model_service`` and streams generated tokens to
    stdout through ``StreamingStdOutCallbackHandler``.

    Returns:
        The configured ``OpenAI`` client.
    """
    callbacks = [StreamingStdOutCallbackHandler()]
    return OpenAI(
        base_url=v1_model_service,
        api_key="sk-no-key-required",
        tiktoken_model_name="mistral",
        temperature=0.9,
        callbacks=callbacks,
        max_tokens=4000,
    )


def _resolve_schema_path(file_name: str) -> Path:
    """Map *file_name* to a path inside ``SCHEMA_DIR``, rejecting escapes.

    Args:
        file_name: Schema file name, relative to ``SCHEMA_DIR``.

    Returns:
        The resolved path of the schema file.

    Raises:
        ValueError: If the resolved path is not located inside ``SCHEMA_DIR``
            (e.g. ``"../secret"``, an absolute path, or an empty name).
    """
    schema_dir = SCHEMA_DIR.resolve()
    schema_path = (schema_dir / file_name).resolve()
    # `file_name` arrives from an HTTP request, so it must not be allowed to
    # climb out of the schema directory.
    if schema_dir not in schema_path.parents:
        raise ValueError(f"Invalid schema file name: {file_name!r}")
    return schema_path


# NOTE(review): `model` is declared as a route parameter, so FastAPI will try
# to source it from the request; confirm this is intended -- HTTP callers
# presumably should be served by the module-level `model_client` instead.
@app.post("/populate_json_schema")
def no_download_json_chain(model: OpenAI, file_name: str, input: str):
    """Ask the model to fill in a stored JSON schema from free-form text.

    Args:
        model: Client used to run the prompt (anything exposing ``invoke``).
        file_name: Name of a schema file inside ``schemas/``.
        input: Free-form text describing the object to generate.

    Returns:
        Whatever ``model.invoke`` returns for the rendered prompt.

    Raises:
        ValueError: If *file_name* points outside the schema directory.
        FileNotFoundError: If the schema file does not exist.
        json.JSONDecodeError: If the schema file is not valid JSON.
    """
    schema_path = _resolve_schema_path(file_name)
    with open(schema_path, "r", encoding="utf-8") as f:
        json_schema = json.load(f)
    json_schema_string = json.dumps(json_schema)
    print("schema tokens: ", count_tokens(json_schema_string))
    # dropping chunk splitting --> moving to a model with bigger token input
    template = """You are a world class engineer, who specializes in generating JSON objects. You will be provided text describing something, along with a JSON schema. Generate a JSON object from the schema based on the content of the text you are provided. Considering all possible cases, including but not limited to, text input missing the fields required in the schema, and irrelevant sections of the text input.
    %JSON schema
    {json_schema_string}
    %User input:
    {input}"""
    template = template.format(json_schema_string=json_schema_string, input=input)
    log_template(template)
    print("template tokens: ", count_tokens(template))
    return model.invoke(template)


model_client = initialize_model_client()

if __name__ == "__main__":
    # Manual smoke test. Kept out of import time so that loading the FastAPI
    # app does not trigger a model call (it needs a running model server).
    test = no_download_json_chain(model_client, "employee.json", "My name is Gregory Pereira. I work in the Emereging Technologies department and the Platform and Services team. I like apples, and long walks on the beach.")
    pprint(test)
b/recipes/natural_language_processing/text-to-json/source/schemas/computer.json new file mode 100644 index 00000000..0a583d3f --- /dev/null +++ b/recipes/natural_language_processing/text-to-json/source/schemas/computer.json @@ -0,0 +1,78 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Computer", + "type": "object", + "properties": { + "brand": { + "type": "string", + "description": "The brand or manufacturer of the computer" + }, + "model": { + "type": "string", + "description": "The model of the computer" + }, + "processor": { + "type": "object", + "description": "Details about the processor of the computer", + "properties": { + "manufacturer": { + "type": "string", + "description": "The manufacturer of the processor" + }, + "model": { + "type": "string", + "description": "The model of the processor" + }, + "cores": { + "type": "integer", + "minimum": 1, + "description": "The number of processor cores" + }, + "clock_speed": { + "type": "number", + "minimum": 0, + "description": "The clock speed of the processor in GHz" + } + }, + "required": ["manufacturer", "model", "cores", "clock_speed"] + }, + "ram": { + "type": "object", + "description": "Details about the RAM of the computer", + "properties": { + "size_gb": { + "type": "number", + "minimum": 0, + "description": "The size of RAM in gigabytes" + }, + "type": { + "type": "string", + "description": "The type of RAM (e.g., DDR4)" + } + }, + "required": ["size_gb", "type"] + }, + "storage": { + "type": "object", + "description": "Details about the storage of the computer", + "properties": { + "type": { + "type": "string", + "description": "The type of storage (e.g., SSD, HDD)" + }, + "capacity_gb": { + "type": "number", + "minimum": 0, + "description": "The capacity of storage in gigabytes" + } + }, + "required": ["type", "capacity_gb"] + }, + "price": { + "type": "number", + "minimum": 0, + "description": "The price of the computer in USD" + } + }, + "required": ["brand", "model",
"processor", "ram", "storage", "price"] + } \ No newline at end of file diff --git a/recipes/natural_language_processing/text-to-json/source/schemas/employee.json b/recipes/natural_language_processing/text-to-json/source/schemas/employee.json new file mode 100644 index 00000000..5ad3d67f --- /dev/null +++ b/recipes/natural_language_processing/text-to-json/source/schemas/employee.json @@ -0,0 +1,59 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://github.com/containers/ai-lab-recipes/recipes/json-to-text/employee.schema.json", + "title": "employee", + "description": "Acme's Employee Information", + "type": "object", + "properties": { + "name": { + "description": "The employee's full name", + "type": "string" + }, + "employeeId": { + "description": "The unique identifier for an employee", + "type": "integer" + }, + "title": { + "description": "An Identifier for what position the employee holds within the company", + "type": "string" + }, + "manager": { + "description": "Who sits above the employee in the Org chart and is responsible for managing them.", + "type": "object", + "items": { + "$ref": "#" + } + }, + "teams": { + "description": "Which products, services or other initiatives is this employee responsible for contributing to.", + "type": "array", + "items": { + "type": "object", + "properties": { + "teamName": { + "description": "A name used to refer to and distinguish between teams.", + "type": "string" + }, + "teamId": { + "description": "A unique integer used to identify a team.", + "type": "integer" + }, + "leader": { + "description": "The one who is responsible for guiding the team.", + "items": { + "$ref": "#" + } + }, + "description": { + "description": "A short blurb giving information on the team.", + "type": "string" + } + }, + "required": ["teamName", "teamId"] + }, + "minItems": 1, + "uniqueItems": true + } + }, + "required": ["employeeId", "title", "teams"] +} \ No newline at end of file diff --git
import validators.url
from urllib.error import URLError, HTTPError
import urllib.request


def download_json_file(url: str, file_name: str) -> bool:
    """Download *url* to the local path *file_name*.

    The URL is checked with ``validators.url`` before any request is made.
    Failures are reported on stdout rather than raised, so the return value
    is the only way for a caller to tell whether the file was written.

    Args:
        url: Address of the JSON document to fetch.
        file_name: Destination path passed to ``urllib.request.urlretrieve``.

    Returns:
        ``True`` if the download completed, ``False`` if the URL was rejected
        or the download failed.
    """
    if not validators.url(url):
        # Previously an invalid URL was ignored without any feedback.
        print(f"Invalid URL: {url!r}")
        return False

    try:
        urllib.request.urlretrieve(url, file_name)
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(f"HTTP Error: {e.code}, {e.reason}")
    except URLError as e:
        print(f"URL Error: {e.reason}")
    except Exception as e:
        # Best-effort helper: report anything else (e.g. an unwritable
        # destination) instead of propagating it.
        print(f"An unexpected error occurred: {e}")
    else:
        return True
    return False