From c32c660222fcfa831996e19d7789b92ef39ad921 Mon Sep 17 00:00:00 2001
From: tjtanaa
Date: Wed, 30 Oct 2024 14:45:25 +0000
Subject: [PATCH 1/9] add optimum intel (OpenVINO) support

---
 libs/infinity_emb/Dockerfile.intel_auto       | 132 ++++++++++++++++++
 .../infinity_emb/_optional_imports.py         |   1 +
 .../transformer/embedder/optimum.py           |  50 ++++---
 .../infinity_emb/transformer/utils_optimum.py |  86 ++++++++----
 4 files changed, 218 insertions(+), 51 deletions(-)
 create mode 100644 libs/infinity_emb/Dockerfile.intel_auto

diff --git a/libs/infinity_emb/Dockerfile.intel_auto b/libs/infinity_emb/Dockerfile.intel_auto
new file mode 100644
index 00000000..2a8ffe99
--- /dev/null
+++ b/libs/infinity_emb/Dockerfile.intel_auto
@@ -0,0 +1,132 @@
+# Autogenerated warning:
+# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
+# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd
+
+FROM ubuntu:22.04 AS base
+
+ENV PYTHONUNBUFFERED=1 \
+    \
+    # pip
+    PIP_NO_CACHE_DIR=off \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    \
+    # make poetry create the virtual environment in the project's root
+    # it gets named `.venv`
+    POETRY_VIRTUALENVS_CREATE="true" \
+    POETRY_VIRTUALENVS_IN_PROJECT="true" \
+    # do not ask any interactive question
+    POETRY_NO_INTERACTION=1 \
+    EXTRAS="all" \
+    PYTHON="python3.11"
+RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
+WORKDIR /app
+
+FROM base as builder
+# Set the working directory for the app
+# Define the version of Poetry to install (default is 1.7.1)
+# Define the directory to install Poetry to (default is /opt/poetry)
+ARG POETRY_VERSION=1.8.4
+ARG POETRY_HOME=/opt/poetry
+# Create a Python virtual environment for Poetry and install it
+RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=$POETRY_HOME POETRY_VERSION=$POETRY_VERSION $PYTHON -
+ENV PATH=$POETRY_HOME/bin:$PATH
+# Test if Poetry is installed in the expected path
+RUN echo "Poetry version:" && poetry --version
+# Copy the rest of the app source code (this layer will be invalidated and rebuilt whenever the source code changes)
+COPY poetry.lock poetry.toml pyproject.toml README.md /app/
+# Install dependencies only
+#
+# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
+COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
+RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"
+
+RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]"
+
+COPY infinity_emb infinity_emb
+# Install dependency with infinity_emb package
+# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
+COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
+RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu"
+
+#
+
+
+FROM builder as testing
+# install lint and test dependencies
+# "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all"
+COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
+RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu"
+
+# # lint
+# # RUN poetry 
run ruff check . +# # RUN poetry run mypy . +# # pytest +# COPY tests tests +# # run end to end tests because of duration of build in github ci. +# # Run tests/end_to_end on TARGETPLATFORM x86_64 otherwise run tests/end_to_end_gpu +# # poetry run python -m pytest tests/end_to_end -x # TODO: does not work. +# RUN if [ -z "$TARGETPLATFORM" ]; then \ +# ARCH=$(uname -m); \ +# if [ "$ARCH" = "x86_64" ]; then \ +# TARGETPLATFORM="linux/amd64"; \ +# elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \ +# TARGETPLATFORM="linux/arm64"; \ +# else \ +# echo "Unsupported architecture: $ARCH"; exit 1; \ +# fi; \ +# fi; \ +# echo "Running tests on TARGETPLATFORM=$TARGETPLATFORM"; \ +# if [ "$TARGETPLATFORM" = "linux/arm64" ] ; then \ +# poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \ +# else \ +# poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py tests/end_to_end/test_sentence_transformers.py -m "not performance" -x ; \ +# fi +# RUN echo "all tests passed" > "test_results.txt" + + +# # Use a multi-stage build -> production version, with download +# FROM base AS tested-builder +# COPY --from=builder /app /app +# # force testing stage to run +# COPY --from=testing /app/test_results.txt /app/test_results.txt +# ENV HF_HOME=/app/.cache/huggingface +# ENV PATH=/app/.venv/bin:$PATH +# # do nothing +# RUN echo "copied all files" + + +# Export with tensorrt, not recommended. +# docker buildx build --target=production-tensorrt -f Dockerfile . +# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt +# ENV PYTHONUNBUFFERED=1 \ +# PIP_NO_CACHE_DIR=off \ +# PYTHON="python3.11" +# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y +# COPY --from=builder /app /app +# # force testing stage to run +# COPY --from=testing /app/test_results.txt /app/test_results.txt +# ENV HF_HOME=/app/.cache/torch +# ENV PATH=/app/.venv/bin:$PATH +# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*" +# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH} +# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH} +# ENTRYPOINT ["infinity_emb"] + + +# # Use a multi-stage build -> production version, with download +# # docker buildx build --target=production-with-download \ +# # --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small . +# FROM tested-builder AS production-with-download +# # collect model name and engine from build args +# ARG MODEL_NAME +# RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi +# ARG ENGINE +# RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi +# # will exit with 3 if model is downloaded # TODO: better exit code +# RUN infinity_emb v2 --model-id $MODEL_NAME --engine $ENGINE --preload-only || [ $? 
-eq 3 ] +# ENTRYPOINT ["infinity_emb"] + +# # Use a multi-stage build -> production version +# FROM tested-builder AS production +# ENTRYPOINT ["infinity_emb"] diff --git a/libs/infinity_emb/infinity_emb/_optional_imports.py b/libs/infinity_emb/infinity_emb/_optional_imports.py index 63193aac..886aeef8 100644 --- a/libs/infinity_emb/infinity_emb/_optional_imports.py +++ b/libs/infinity_emb/infinity_emb/_optional_imports.py @@ -68,6 +68,7 @@ def _raise_error(self) -> None: "optimum.neuron", "", ) +CHECK_OPTIMUM_INTEL = OptionalImports("optimum.intel", "optimum") CHECK_PIL = OptionalImports("PIL", "vision") CHECK_POSTHOG = OptionalImports("posthog", "server") CHECK_PYDANTIC = OptionalImports("pydantic", "server") diff --git a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py index e32639de..fcc47f1b 100644 --- a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py @@ -14,7 +14,7 @@ from infinity_emb.transformer.utils_optimum import ( cls_token_pooling, device_to_onnx, - get_onnx_files, + # get_onnx_files, mean_pooling, normalize, optimize_model, @@ -25,42 +25,52 @@ from optimum.onnxruntime import ( # type: ignore[import-untyped] ORTModelForFeatureExtraction, ) + from infinity_emb.transformer.utils_optimum import get_onnx_files except (ImportError, RuntimeError, Exception) as ex: CHECK_ONNXRUNTIME.mark_dirty(ex) + +if CHECK_OPTIMUM_INTEL.is_available: + try: + from optimum.intel import OVModelForFeatureExtraction as ORTModelForFeatureExtraction # type: ignore[import-untyped] + + except (ImportError, RuntimeError, Exception) as ex: + CHECK_OPTIMUM_INTEL.mark_dirty(ex) + if CHECK_TRANSFORMERS.is_available: from transformers import AutoConfig, AutoTokenizer # type: ignore[import-untyped] class OptimumEmbedder(BaseEmbedder): def __init__(self, *, engine_args: EngineArgs): - CHECK_ONNXRUNTIME.mark_required() + # CHECK_ONNXRUNTIME.mark_required() provider = device_to_onnx(engine_args.device) - onnx_file = get_onnx_files( - model_name_or_path=engine_args.model_name_or_path, - revision=engine_args.revision, - use_auth_token=True, - prefer_quantized="cpu" in provider.lower(), - ) + if CHECK_ONNXRUNTIME.is_available(): + onnx_file = get_onnx_files( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + use_auth_token=True, + prefer_quantized="cpu" in provider.lower(), + ) + self.model = optimize_model( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + trust_remote_code=engine_args.trust_remote_code, + execution_provider=provider, + file_name=onnx_file.as_posix(), + optimize_model=not os.environ.get( + "INFINITY_ONNX_DISABLE_OPTIMIZE", False + ), # TODO: make this env variable public + model_class=ORTModelForFeatureExtraction, + ) + self.model.use_io_binding = False self.pooling = ( mean_pooling if engine_args.pooling_method == PoolingMethod.mean else cls_token_pooling ) - self.model = optimize_model( - model_name_or_path=engine_args.model_name_or_path, - revision=engine_args.revision, - trust_remote_code=engine_args.trust_remote_code, - execution_provider=provider, - file_name=onnx_file.as_posix(), - optimize_model=not os.environ.get( - "INFINITY_ONNX_DISABLE_OPTIMIZE", False - ), # TODO: make this env variable public - model_class=ORTModelForFeatureExtraction, - ) - self.model.use_io_binding = False self.tokenizer = AutoTokenizer.from_pretrained( engine_args.model_name_or_path, 
diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py index aa5bc712..d9344c66 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py @@ -87,12 +87,7 @@ def optimize_model( revision (Optional[str], optional): The revision to use. Defaults to None. trust_remote_code (bool, optional): Whether to trust the remote code. Defaults to True. """ - CHECK_ONNXRUNTIME.mark_required() - path_folder = ( - Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path - ) - OPTIMIZED_SUFFIX = "_optimized.onnx" - files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}")) + if execution_provider == "TensorrtExecutionProvider": return model_class.from_pretrained( model_name_or_path, @@ -110,14 +105,33 @@ def optimize_model( # "trt_int8_enable": "quantize" in file_name, }, ) + + CHECK_ONNXRUNTIME.mark_required() + + files_optimized = [] + + if CHECK_OPTIMUM_INTEL.is_available(): # Optimum Intel OpenVINO path + path_folder = ( + Path(HUGGINGFACE_HUB_CACHE) / "infinity_openvino" / execution_provider / model_name_or_path + ) + OPTIMIZED_PREFIX="openvino_model" + files_optimized = list(path_folder.glob(f"**/{OPTIMIZED_PREFIX}*")) + else: # Optimum onnx cpu path + path_folder = ( + Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path + ) + OPTIMIZED_SUFFIX = "_optimized.onnx" + files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}")) + if files_optimized: + print("files_optimized: ", files_optimized) file_optimized = files_optimized[0] logger.info(f"Optimized model found at {file_optimized}, skipping optimization") return model_class.from_pretrained( file_optimized.parent.as_posix(), revision=revision, trust_remote_code=trust_remote_code, - provider=execution_provider, + provider=execution_provider, # will be ignored by optimum intel file_name=file_optimized.name, ) @@ -126,39 +140,49 @@ def optimize_model( revision=revision, trust_remote_code=trust_remote_code, provider=execution_provider, - file_name=file_name, + file_name=file_name ) if not optimize_model or execution_provider == "TensorrtExecutionProvider": return unoptimized_model try: logger.info("Optimizing model") + if CHECK_OPTIMUM_INTEL.is_available(): + model = OVModelForFeatureExtraction.from_pretrained( + model_id, + export=True, + ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + ) + model.save_pretrained(path_folder.as_posix()) # save the model - optimizer = ORTOptimizer.from_pretrained(unoptimized_model) + else: + optimizer = ORTOptimizer.from_pretrained(unoptimized_model) + + is_gpu = "cpu" not in execution_provider.lower() + optimization_config = OptimizationConfig( + optimization_level=99, + optimize_with_onnxruntime_only=False, + optimize_for_gpu=is_gpu, + fp16=is_gpu, + # enable_gelu_approximation=True, + # enable_gemm_fast_gelu_fusion=True, # might not work + ) - is_gpu = "cpu" not in execution_provider.lower() - optimization_config = OptimizationConfig( - optimization_level=99, - optimize_with_onnxruntime_only=False, - optimize_for_gpu=is_gpu, - fp16=is_gpu, - # enable_gelu_approximation=True, - # enable_gemm_fast_gelu_fusion=True, # might not work - ) + optimized_model_path = optimizer.optimize( + optimization_config=optimization_config, + save_dir=path_folder.as_posix(), + # if larger than 2gb use external data format + one_external_file=True, + ) 
+ + model = model_class.from_pretrained( + optimized_model_path, + revision=revision, + trust_remote_code=trust_remote_code, + provider=execution_provider, + file_name=Path(file_name).name.replace(".onnx", OPTIMIZED_SUFFIX), + ) - optimized_model_path = optimizer.optimize( - optimization_config=optimization_config, - save_dir=path_folder.as_posix(), - # if larger than 2gb use external data format - one_external_file=True, - ) - model = model_class.from_pretrained( - optimized_model_path, - revision=revision, - trust_remote_code=trust_remote_code, - provider=execution_provider, - file_name=Path(file_name).name.replace(".onnx", OPTIMIZED_SUFFIX), - ) except Exception as e: logger.warning(f"Optimization failed with {e}. Going to use the unoptimized model.") model = unoptimized_model From e7279519cb17c75b2cf1f5298a1b911d38986f0b Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sun, 3 Nov 2024 16:41:50 +0000 Subject: [PATCH 2/9] add optimum-intel code path --- libs/infinity_emb/infinity_emb/primitives.py | 1 + .../transformer/embedder/optimum.py | 44 +++++++- .../infinity_emb/transformer/utils_optimum.py | 100 ++++++++++++++---- 3 files changed, 121 insertions(+), 24 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/primitives.py b/libs/infinity_emb/infinity_emb/primitives.py index 66bcf512..1e727447 100644 --- a/libs/infinity_emb/infinity_emb/primitives.py +++ b/libs/infinity_emb/infinity_emb/primitives.py @@ -106,6 +106,7 @@ def default_value(): class Device(EnumType): cpu = "cpu" + openvino = "openvino" cuda = "cuda" mps = "mps" tensorrt = "tensorrt" diff --git a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py index fcc47f1b..feddabd5 100644 --- a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py @@ -6,7 +6,11 @@ import numpy as np -from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TRANSFORMERS +from infinity_emb._optional_imports import ( + CHECK_ONNXRUNTIME, + CHECK_TRANSFORMERS, + CHECK_OPTIMUM_INTEL, +) from infinity_emb.args import EngineArgs from infinity_emb.primitives import EmbeddingReturnType, PoolingMethod from infinity_emb.transformer.abstract import BaseEmbedder @@ -33,21 +37,52 @@ if CHECK_OPTIMUM_INTEL.is_available: try: - from optimum.intel import OVModelForFeatureExtraction as ORTModelForFeatureExtraction # type: ignore[import-untyped] + from optimum.intel import OVModelForFeatureExtraction # type: ignore[import-untyped] + from infinity_emb.transformer.utils_optimum import get_openvino_files except (ImportError, RuntimeError, Exception) as ex: CHECK_OPTIMUM_INTEL.mark_dirty(ex) + if CHECK_TRANSFORMERS.is_available: from transformers import AutoConfig, AutoTokenizer # type: ignore[import-untyped] class OptimumEmbedder(BaseEmbedder): def __init__(self, *, engine_args: EngineArgs): - # CHECK_ONNXRUNTIME.mark_required() provider = device_to_onnx(engine_args.device) + self.provider = provider + print(f"provider: {provider}") + print("CHECK_ONNXRUNTIME: ", CHECK_ONNXRUNTIME.is_available) + + if provider == "OpenVINOExecutionProvider": + CHECK_OPTIMUM_INTEL.mark_required() + filename = "" + try: + openvino_file = get_openvino_files( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + use_auth_token=True, + ) + filename = openvino_file.as_posix() + except Exception as e: # show error then let the optimum intel compress on the fly + print(str(e)) - if 
CHECK_ONNXRUNTIME.is_available(): + self.model = optimize_model( + model_name_or_path=engine_args.model_name_or_path, + revision=engine_args.revision, + trust_remote_code=engine_args.trust_remote_code, + execution_provider=provider, + file_name=filename, + optimize_model=not os.environ.get( + "INFINITY_ONNX_DISABLE_OPTIMIZE", False + ), # TODO: make this env variable public + model_class=OVModelForFeatureExtraction, + ) + print(type(self.model)) + + elif provider == "CPUExecutionProvider": + CHECK_ONNXRUNTIME.mark_required() onnx_file = get_onnx_files( model_name_or_path=engine_args.model_name_or_path, revision=engine_args.revision, @@ -71,7 +106,6 @@ def __init__(self, *, engine_args: EngineArgs): mean_pooling if engine_args.pooling_method == PoolingMethod.mean else cls_token_pooling ) - self.tokenizer = AutoTokenizer.from_pretrained( engine_args.model_name_or_path, revision=engine_args.revision, diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py index d9344c66..0bc85fe1 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py @@ -8,10 +8,11 @@ from huggingface_hub import HfApi, HfFolder # type: ignore from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore -from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH +from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH, CHECK_OPTIMUM_INTEL from infinity_emb.log_handler import logger from infinity_emb.primitives import Device + if CHECK_ONNXRUNTIME.is_available: try: from optimum.modeling_base import OptimizedModel # type: ignore @@ -23,6 +24,14 @@ except (ImportError, RuntimeError, Exception) as ex: CHECK_ONNXRUNTIME.mark_dirty(ex) +if CHECK_OPTIMUM_INTEL.is_available: + try: + from optimum.intel import OVModelForFeatureExtraction # type: ignore[import-untyped] + + except (ImportError, RuntimeError, Exception) as ex: + CHECK_OPTIMUM_INTEL.mark_dirty(ex) + + if CHECK_TORCH.is_available: import torch @@ -51,6 +60,8 @@ def normalize(input_array, p=2, dim=1, eps=1e-12): def device_to_onnx(device: Device) -> str: if device == Device.cpu: return "CPUExecutionProvider" + elif device == Device.openvino: + return "OpenVINOExecutionProvider" elif device == Device.cuda: return "CUDAExecutionProvider" elif device == Device.mps: @@ -87,7 +98,7 @@ def optimize_model( revision (Optional[str], optional): The revision to use. Defaults to None. trust_remote_code (bool, optional): Whether to trust the remote code. Defaults to True. 
""" - + if execution_provider == "TensorrtExecutionProvider": return model_class.from_pretrained( model_name_or_path, @@ -106,32 +117,44 @@ def optimize_model( }, ) - CHECK_ONNXRUNTIME.mark_required() - files_optimized = [] - if CHECK_OPTIMUM_INTEL.is_available(): # Optimum Intel OpenVINO path + print(f"model_class: {model_class}") + + if execution_provider == "OpenVINOExecutionProvider": # Optimum Intel OpenVINO path + CHECK_OPTIMUM_INTEL.mark_required() path_folder = ( - Path(HUGGINGFACE_HUB_CACHE) / "infinity_openvino" / execution_provider / model_name_or_path + Path(HUGGINGFACE_HUB_CACHE) + / "infinity_openvino" + / execution_provider + / model_name_or_path ) - OPTIMIZED_PREFIX="openvino_model" + OPTIMIZED_PREFIX = "openvino_model" files_optimized = list(path_folder.glob(f"**/{OPTIMIZED_PREFIX}*")) - else: # Optimum onnx cpu path + elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path + CHECK_ONNXRUNTIME.mark_required() path_folder = ( Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path ) OPTIMIZED_SUFFIX = "_optimized.onnx" files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}")) - + else: + raise ValueError( + f"Does not support {execution_provider}." + "Optimum engine only support `OpenVINOExecutionProvider` " + "and `CPUExecutionProvider`." + ) + if files_optimized: - print("files_optimized: ", files_optimized) + # print("files_optimized: ", files_optimized) file_optimized = files_optimized[0] + logger.info(file_optimized.name) logger.info(f"Optimized model found at {file_optimized}, skipping optimization") return model_class.from_pretrained( file_optimized.parent.as_posix(), revision=revision, trust_remote_code=trust_remote_code, - provider=execution_provider, # will be ignored by optimum intel + provider=execution_provider, # will be ignored by optimum intel file_name=file_optimized.name, ) @@ -140,21 +163,25 @@ def optimize_model( revision=revision, trust_remote_code=trust_remote_code, provider=execution_provider, - file_name=file_name + file_name=file_name, ) if not optimize_model or execution_provider == "TensorrtExecutionProvider": return unoptimized_model try: logger.info("Optimizing model") - if CHECK_OPTIMUM_INTEL.is_available(): + if execution_provider == "OpenVINOExecutionProvider": model = OVModelForFeatureExtraction.from_pretrained( - model_id, - export=True, - ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + model_name_or_path, + export=True, + # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 + # ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + ov_config={ + "INFERENCE_PRECISION_HINT": "bf16" + }, # fp16 for now as it has better precision than bf16 ) - model.save_pretrained(path_folder.as_posix()) # save the model + model.save_pretrained(path_folder.as_posix()) # save the model - else: + elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path optimizer = ORTOptimizer.from_pretrained(unoptimized_model) is_gpu = "cpu" not in execution_provider.lower() @@ -181,12 +208,19 @@ def optimize_model( provider=execution_provider, file_name=Path(file_name).name.replace(".onnx", OPTIMIZED_SUFFIX), ) - + else: + raise ValueError( + f"Does not support {execution_provider}." + "Optimum engine only support `OpenVINOExecutionProvider` " + "and `CPUExecutionProvider`." + ) except Exception as e: logger.warning(f"Optimization failed with {e}. 
Going to use the unoptimized model.")
         model = unoptimized_model
 
+    print(type(model))
+
     return model
@@ -239,3 +273,31 @@ def get_onnx_files(
         return onnx_files[0]
     else:
         raise ValueError(f"No onnx files found for {model_name_or_path} and revision {revision}")
+
+
+def get_openvino_files(
+    *,
+    model_name_or_path: str,
+    revision: Union[str, None] = None,
+    use_auth_token: Union[bool, str] = True,
+) -> Path:
+    """gets the openvino files from the repo"""
+    repo_files = _list_all_repo_files(
+        model_name_or_path=model_name_or_path,
+        revision=revision,
+        use_auth_token=use_auth_token,
+    )
+    pattern = "**/openvino_model.*"
+    openvino_files = [p for p in repo_files if p.match(pattern)]
+
+    if len(openvino_files) > 1:
+        logger.info(f"Found {len(openvino_files)} onnx files: {openvino_files}")
+        openvino_file = openvino_files[-1]
+        logger.info(f"Using {openvino_file} as the model")
+        return openvino_file
+    elif len(openvino_files) == 1:
+        return openvino_files[0]
+    else:
+        raise ValueError(
+            f"No openvino files found for {model_name_or_path} and revision {revision}"
+        )

From 06cc5e3b1a8d83f21670a044bcac4725c765b9ad Mon Sep 17 00:00:00 2001
From: tjtanaa
Date: Tue, 5 Nov 2024 04:16:59 +0000
Subject: [PATCH 3/9] remove print functions

---
 libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py | 3 ---
 libs/infinity_emb/infinity_emb/transformer/utils_optimum.py    | 2 --
 2 files changed, 5 deletions(-)

diff --git a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py
index feddabd5..648f4932 100644
--- a/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py
+++ b/libs/infinity_emb/infinity_emb/transformer/embedder/optimum.py
@@ -52,8 +52,6 @@ class OptimumEmbedder(BaseEmbedder):
     def __init__(self, *, engine_args: EngineArgs):
         provider = device_to_onnx(engine_args.device)
         self.provider = provider
-        print(f"provider: {provider}")
-        print("CHECK_ONNXRUNTIME: ", CHECK_ONNXRUNTIME.is_available)
 
         if provider == "OpenVINOExecutionProvider":
             CHECK_OPTIMUM_INTEL.mark_required()
@@ -79,7 +77,6 @@ def __init__(self, *, engine_args: EngineArgs):
                 ), # TODO: make this env variable public
                 model_class=OVModelForFeatureExtraction,
             )
-            print(type(self.model))
 
         elif provider == "CPUExecutionProvider":
             CHECK_ONNXRUNTIME.mark_required()
diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
index 0bc85fe1..f8103bc1 100644
--- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
+++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -119,8 +119,6 @@ def optimize_model(
 
     files_optimized = []
 
-    print(f"model_class: {model_class}")
-
     if execution_provider == "OpenVINOExecutionProvider":  # Optimum Intel OpenVINO path
         CHECK_OPTIMUM_INTEL.mark_required()
         path_folder = (

From 087d7146b004263e53c91dea95937314541527a4 Mon Sep 17 00:00:00 2001
From: tjtanaa
Date: Tue, 5 Nov 2024 15:53:46 +0000
Subject: [PATCH 4/9] add openvino cpu support to Dockerfile.cpu

---
 libs/infinity_emb/Docker.template.yaml | 1 +
 libs/infinity_emb/Dockerfile.cpu_auto  | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/libs/infinity_emb/Docker.template.yaml b/libs/infinity_emb/Docker.template.yaml
index c4c8ae02..9bc1f33a 100644
--- a/libs/infinity_emb/Docker.template.yaml
+++ b/libs/infinity_emb/Docker.template.yaml
@@ -16,6 +16,7 @@ cpu:
     # "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
     COPY 
requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" + RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" amd: # 2 . command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto diff --git a/libs/infinity_emb/Dockerfile.cpu_auto b/libs/infinity_emb/Dockerfile.cpu_auto index 4da2bf59..78546c34 100644 --- a/libs/infinity_emb/Dockerfile.cpu_auto +++ b/libs/infinity_emb/Dockerfile.cpu_auto @@ -40,12 +40,14 @@ COPY poetry.lock poetry.toml pyproject.toml README.md /app/ # "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" +RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" COPY infinity_emb infinity_emb # Install dependency with infinity_emb package # "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu" +RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" # @@ -55,6 +57,7 @@ FROM builder as testing # "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu" +RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" # lint RUN poetry run ruff check . 
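
Note for reviewers: Dockerfile.cpu_auto is generated, so the template change above is the source of truth. Assuming the same jinja2 invocation that Docker.template.yaml documents for the amd variant, the cpu variant would be regenerated with:

    jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
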
From a73316d6fbb934ca2ab01d4bfe390b3761f9c8ec Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 6 Nov 2024 09:39:10 +0000 Subject: [PATCH 5/9] fix optimum optimized weight code path --- .../infinity_emb/transformer/utils_optimum.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py index f8103bc1..13640da7 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py @@ -117,7 +117,9 @@ def optimize_model( }, ) - files_optimized = [] + file_optimized = '' + + logger.info(f"file_name: {file_name}") if execution_provider == "OpenVINOExecutionProvider": # Optimum Intel OpenVINO path CHECK_OPTIMUM_INTEL.mark_required() @@ -128,7 +130,11 @@ def optimize_model( / model_name_or_path ) OPTIMIZED_PREFIX = "openvino_model" - files_optimized = list(path_folder.glob(f"**/{OPTIMIZED_PREFIX}*")) + files_optimized = sorted(list(path_folder.glob(f"**/{OPTIMIZED_PREFIX}*"))) + if files_optimized: + file_optimized = files_optimized[-1] + if file_name: + file_optimized = file_name elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path CHECK_ONNXRUNTIME.mark_required() path_folder = ( @@ -136,6 +142,10 @@ def optimize_model( ) OPTIMIZED_SUFFIX = "_optimized.onnx" files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}")) + if files_optimized: + file_optimized = files_optimized[0] + + file_name=file_optimized.name else: raise ValueError( f"Does not support {execution_provider}." @@ -143,17 +153,15 @@ def optimize_model( "and `CPUExecutionProvider`." ) - if files_optimized: + if file_optimized: # print("files_optimized: ", files_optimized) - file_optimized = files_optimized[0] - logger.info(file_optimized.name) logger.info(f"Optimized model found at {file_optimized}, skipping optimization") return model_class.from_pretrained( - file_optimized.parent.as_posix(), + file_optimized.parent.as_posix() if not isinstance(file_optimized, str) else model_name_or_path, revision=revision, trust_remote_code=trust_remote_code, provider=execution_provider, # will be ignored by optimum intel - file_name=file_optimized.name, + file_name=file_optimized.name if not isinstance(file_optimized, str) else file_optimized, ) unoptimized_model = model_class.from_pretrained( @@ -285,7 +293,7 @@ def get_openvino_files( revision=revision, use_auth_token=use_auth_token, ) - pattern = "**/openvino_model.*" + pattern = "**openvino_model.*" openvino_files = [p for p in repo_files if p.match(pattern)] if len(openvino_files) > 1: From 295e840f00462bd37d15c6eaf079abf7a10d79c3 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 7 Nov 2024 07:39:58 +0000 Subject: [PATCH 6/9] add openvino inference hinting as extra arguments --- .../infinity_emb/transformer/utils_optimum.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py index 13640da7..e7e6d983 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py @@ -117,7 +117,9 @@ def optimize_model( }, ) - file_optimized = '' + file_optimized: Union[str, list] = '' + + extra_args = {} logger.info(f"file_name: {file_name}") @@ -135,6 +137,13 @@ def optimize_model( file_optimized = files_optimized[-1] if file_name: 
file_optimized = file_name
+
+        extra_args = {
+            "ov_config":{
+                "INFERENCE_PRECISION_HINT": "bf16"
+            }
+        }
+
     elif execution_provider == "CPUExecutionProvider":  # Optimum onnx cpu path
         CHECK_ONNXRUNTIME.mark_required()
         path_folder = (
@@ -144,8 +153,6 @@ def optimize_model(
         )
         OPTIMIZED_SUFFIX = "_optimized.onnx"
         files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))
         if files_optimized:
             file_optimized = files_optimized[0]
-
-            file_name=file_optimized.name
     else:
         raise ValueError(
             f"Does not support {execution_provider}."
@@ -161,7 +168,8 @@ def optimize_model(
             revision=revision,
             trust_remote_code=trust_remote_code,
             provider=execution_provider,  # will be ignored by optimum intel
-            file_name=file_optimized.name if not isinstance(file_optimized, str) else file_optimized,
+            file_name=file_optimized.name if not isinstance(file_optimized, str) else file_optimized, 
+            **extra_args
         )
@@ -225,8 +233,6 @@ def optimize_model(
     except Exception as e:
         logger.warning(f"Optimization failed with {e}. Going to use the unoptimized model.")
         model = unoptimized_model
 
-    print(type(model))
-
     return model

From 75351014aa210c875373c42c44107e55c7a8792a Mon Sep 17 00:00:00 2001
From: tjtanaa
Date: Sat, 16 Nov 2024 06:24:13 +0000
Subject: [PATCH 7/9] fix loading optimized openvino model

---
 .../infinity_emb/transformer/utils_optimum.py | 80 +++++++++++++++----
 1 file changed, 63 insertions(+), 17 deletions(-)

diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
index e7e6d983..190baf26 100644
--- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
+++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -25,11 +25,22 @@
 if CHECK_OPTIMUM_INTEL.is_available:
-    try:
-        from optimum.intel import OVModelForFeatureExtraction  # type: ignore[import-untyped]
+    from optimum.intel import (
+        OVModelForFeatureExtraction,  # type: ignore[import-untyped]
+        OVWeightQuantizationConfig,
+        OVConfig,
+        OVQuantizer,
+    )
+    # try:
+    #     from optimum.intel import (
+    #         OVModelForFeatureExtraction,  # type: ignore[import-untyped]
+    #         OVWeightQuantizationConfig,
+    #         OVConfig,
+    #         OVQuantizer,
+    #     )
 
-    except (ImportError, RuntimeError, Exception) as ex:
-        CHECK_OPTIMUM_INTEL.mark_dirty(ex)
+    # except (ImportError, RuntimeError, Exception) as ex:
+    #     CHECK_OPTIMUM_INTEL.mark_dirty(ex)
 
 
 if CHECK_TORCH.is_available:
     import torch
@@ -117,7 +128,7 @@ def optimize_model(
         },
     )
 
-    file_optimized: Union[str, list] = ''
+    file_optimized: Path | str = ""
 
     extra_args = {}
 
@@ -138,12 +149,8 @@ def optimize_model(
         if file_name:
             file_optimized = file_name
 
-        extra_args = {
-            "ov_config":{
-                "INFERENCE_PRECISION_HINT": "bf16"
-            }
-        }
-
+        extra_args = {"ov_config": {"INFERENCE_PRECISION_HINT": "bf16"}}
+
     elif execution_provider == "CPUExecutionProvider":  # Optimum onnx cpu path
         CHECK_ONNXRUNTIME.mark_required()
         path_folder = (
@@ -164,12 +171,19 @@ def optimize_model(
     if file_optimized:
         # print("files_optimized: ", files_optimized)
         logger.info(f"Optimized model found at {file_optimized}, skipping optimization")
         return model_class.from_pretrained(
-            file_optimized.parent.as_posix() if not isinstance(file_optimized, str) else model_name_or_path,
+            file_optimized.parent.as_posix()
+            if not isinstance(file_optimized, str)
+            else model_name_or_path,
             revision=revision,
             trust_remote_code=trust_remote_code,
             provider=execution_provider,  # will be ignored by optimum intel
-            file_name=file_optimized.name if not isinstance(file_optimized, str) else file_optimized, 
-            **extra_args
+ file_name=file_optimized.name + if not isinstance(file_optimized, str) + else file_optimized, + # **extra_args, + ov_config={ + "INFERENCE_PRECISION_HINT": "bf16" + }, # fp16 for now as it has better precision than bf16, ) unoptimized_model = model_class.from_pretrained( @@ -184,7 +198,17 @@ def optimize_model( try: logger.info("Optimizing model") if execution_provider == "OpenVINOExecutionProvider": - model = OVModelForFeatureExtraction.from_pretrained( + logger.info("Optimizing model OpenVINOExecutionProvider") + # model = OVModelForFeatureExtraction.from_pretrained( + # model_name_or_path, + # export=True, + # # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 + # # ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + # ov_config={ + # "INFERENCE_PRECISION_HINT": "bf16" + # }, # fp16 for now as it has better precision than bf16 + # ) + ov_model = OVModelForFeatureExtraction.from_pretrained( model_name_or_path, export=True, # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 @@ -193,7 +217,29 @@ def optimize_model( "INFERENCE_PRECISION_HINT": "bf16" }, # fp16 for now as it has better precision than bf16 ) - model.save_pretrained(path_folder.as_posix()) # save the model + quantizer = OVQuantizer.from_pretrained(ov_model, task="feature-extraction", export=True) + ov_config = OVConfig( + quantization_config=OVWeightQuantizationConfig( + bits=4, + sym=False, + ratio=1.0, + group_size=128, + all_layers=None, + ) + ) + # print("ov_config.dtype: ", ov_config.dtype) + quantizer.quantize(ov_config=ov_config, save_directory=path_folder.as_posix()) + model = OVModelForFeatureExtraction.from_pretrained( + path_folder.as_posix(), + # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 + # ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 + ov_config={ + "INFERENCE_PRECISION_HINT": "bf16" + }, # fp16 for now as it has better precision than bf16, + export=False, + ) + logger.info("Successfully load optimized model OpenVINOExecutionProvider") + # model.save_pretrained(path_folder.as_posix()) # save the model elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path optimizer = ORTOptimizer.from_pretrained(unoptimized_model) @@ -300,7 +346,7 @@ def get_openvino_files( use_auth_token=use_auth_token, ) pattern = "**openvino_model.*" - openvino_files = [p for p in repo_files if p.match(pattern)] + openvino_files = sorted([p for p in repo_files if p.match(pattern)]) if len(openvino_files) > 1: logger.info(f"Found {len(openvino_files)} onnx files: {openvino_files}") From 7fda9bf604e25d45388bec1b927c8a01fb12a53b Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 19 Nov 2024 17:05:52 +0000 Subject: [PATCH 8/9] reproduced performant openvino support --- libs/infinity_emb/Docker.template.yaml | 10 ++++++- libs/infinity_emb/Dockerfile.cpu_auto | 30 +++++++++++++++++-- .../infinity_emb/transformer/utils_optimum.py | 19 ++---------- .../transformer/embedder/test_optimum.py | 23 ++++++++++++++ 4 files changed, 61 insertions(+), 21 deletions(-) diff --git a/libs/infinity_emb/Docker.template.yaml b/libs/infinity_emb/Docker.template.yaml index 9bc1f33a..e231e48b 100644 --- a/libs/infinity_emb/Docker.template.yaml +++ b/libs/infinity_emb/Docker.template.yaml @@ -15,8 +15,16 @@ cpu: main_install: | # "RUN poetry install --no-interaction --no-ansi --no-root --extras 
\"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh + RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" - RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" + RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops amd: # 2 . command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto diff --git a/libs/infinity_emb/Dockerfile.cpu_auto b/libs/infinity_emb/Dockerfile.cpu_auto index 78546c34..0a7b230f 100644 --- a/libs/infinity_emb/Dockerfile.cpu_auto +++ b/libs/infinity_emb/Dockerfile.cpu_auto @@ -39,15 +39,31 @@ COPY poetry.lock poetry.toml pyproject.toml README.md /app/ # # "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" -RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops COPY infinity_emb infinity_emb # Install dependency with infinity_emb package # "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu" -RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops # @@ -56,8 +72,16 @@ FROM builder as testing # install lint and test dependencies # "RUN poetry install --no-interaction --no-ansi --extras \"${EXTRAS}\" --with lint,test && poetry cache clear pypi --all" COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh +RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu" -RUN poetry run python -m pip install --upgrade --upgrade-strategy eager "optimum[openvino]" +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu 
https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + openvino-tokenizers[transformers]==2024.5.* \ + openvino==2024.5.* \ + nncf>=2.11.0 \ + sentence_transformers==3.1.1 \ + openai \ + "transformers>4.45" \ + einops # lint RUN poetry run ruff check . diff --git a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py index 190baf26..1aafa87c 100644 --- a/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/utils_optimum.py @@ -168,7 +168,6 @@ def optimize_model( ) if file_optimized: - # print("files_optimized: ", files_optimized) logger.info(f"Optimized model found at {file_optimized}, skipping optimization") return model_class.from_pretrained( file_optimized.parent.as_posix() @@ -180,10 +179,7 @@ def optimize_model( file_name=file_optimized.name if not isinstance(file_optimized, str) else file_optimized, - # **extra_args, - ov_config={ - "INFERENCE_PRECISION_HINT": "bf16" - }, # fp16 for now as it has better precision than bf16, + **extra_args, ) unoptimized_model = model_class.from_pretrained( @@ -199,15 +195,6 @@ def optimize_model( logger.info("Optimizing model") if execution_provider == "OpenVINOExecutionProvider": logger.info("Optimizing model OpenVINOExecutionProvider") - # model = OVModelForFeatureExtraction.from_pretrained( - # model_name_or_path, - # export=True, - # # ov_config={"INFERENCE_PRECISION_HINT": "fp32"} # fp16 for now as it has better precision than bf16 - # # ov_config={"INFERENCE_PRECISION_HINT": "fp16"} # fp16 for now as it has better precision than bf16 - # ov_config={ - # "INFERENCE_PRECISION_HINT": "bf16" - # }, # fp16 for now as it has better precision than bf16 - # ) ov_model = OVModelForFeatureExtraction.from_pretrained( model_name_or_path, export=True, @@ -227,7 +214,6 @@ def optimize_model( all_layers=None, ) ) - # print("ov_config.dtype: ", ov_config.dtype) quantizer.quantize(ov_config=ov_config, save_directory=path_folder.as_posix()) model = OVModelForFeatureExtraction.from_pretrained( path_folder.as_posix(), @@ -239,7 +225,6 @@ def optimize_model( export=False, ) logger.info("Successfully load optimized model OpenVINOExecutionProvider") - # model.save_pretrained(path_folder.as_posix()) # save the model elif execution_provider == "CPUExecutionProvider": # Optimum onnx cpu path optimizer = ORTOptimizer.from_pretrained(unoptimized_model) @@ -349,7 +334,7 @@ def get_openvino_files( openvino_files = sorted([p for p in repo_files if p.match(pattern)]) if len(openvino_files) > 1: - logger.info(f"Found {len(openvino_files)} onnx files: {openvino_files}") + logger.info(f"Found {len(openvino_files)} openvino files: {openvino_files}") openvino_file = openvino_files[-1] logger.info(f"Using {openvino_file} as the model") return openvino_file diff --git a/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py b/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py index c612d305..1ccef7a8 100644 --- a/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py +++ b/libs/infinity_emb/tests/unit_test/transformer/embedder/test_optimum.py @@ -25,3 +25,26 @@ def test_embedder_optimum(size="large"): cosine_sim = np.dot(r, e) / (np.linalg.norm(e) * np.linalg.norm(r)) assert cosine_sim > 0.94 np.testing.assert_allclose(embeds, embeds_orig, atol=0.25) + + +def 
test_embedder_optimum_openvino_cpu(size="large"): + model = OptimumEmbedder( + engine_args=EngineArgs(model_name_or_path=f"BAAI/bge-{size}-en-v1.5", device="openvino") + ) + st_model = SentenceTransformer(model_name_or_path=f"BAAI/bge-{size}-en-v1.5", device="cpu") + + sentences = ["This is awesome.", "I am depressed."] + + encode_pre = model.encode_pre(sentences) + encode_core = model.encode_core(encode_pre) + embeds = model.encode_post(encode_core) + + embeds_orig = st_model.encode(sentences) + + assert len(embeds) == len(sentences) + + for r, e in zip(embeds, embeds_orig): + cosine_sim = np.dot(r, e) / (np.linalg.norm(e) * np.linalg.norm(r)) + assert cosine_sim > 0.94 + np.testing.assert_allclose(embeds, embeds_orig, atol=0.25) + From 81756bd13c09f674b7d41dc649824f58415a6dc8 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 22 Nov 2024 09:02:20 +0000 Subject: [PATCH 9/9] install openvino 2024.5 from stable release instead --- libs/infinity_emb/Docker.template.yaml | 10 ++++++++- libs/infinity_emb/Dockerfile.cpu_auto | 30 +++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/libs/infinity_emb/Docker.template.yaml b/libs/infinity_emb/Docker.template.yaml index e231e48b..c7d6fd64 100644 --- a/libs/infinity_emb/Docker.template.yaml +++ b/libs/infinity_emb/Docker.template.yaml @@ -17,7 +17,7 @@ cpu: COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" - RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ openvino-tokenizers[transformers]==2024.5.* \ openvino==2024.5.* \ nncf>=2.11.0 \ @@ -25,6 +25,14 @@ cpu: openai \ "transformers>4.45" \ einops + # RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ + # openvino-tokenizers[transformers]==2024.5.* \ + # openvino==2024.5.* \ + # nncf>=2.11.0 \ + # sentence_transformers==3.1.1 \ + # openai \ + # "transformers>4.45" \ + # einops amd: # 2 . 
command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto diff --git a/libs/infinity_emb/Dockerfile.cpu_auto b/libs/infinity_emb/Dockerfile.cpu_auto index 0a7b230f..6c1fea82 100644 --- a/libs/infinity_emb/Dockerfile.cpu_auto +++ b/libs/infinity_emb/Dockerfile.cpu_auto @@ -41,7 +41,7 @@ COPY poetry.lock poetry.toml pyproject.toml README.md /app/ COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu" -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ openvino-tokenizers[transformers]==2024.5.* \ openvino==2024.5.* \ nncf>=2.11.0 \ @@ -49,6 +49,14 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.op openai \ "transformers>4.45" \ einops +# RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +# openvino-tokenizers[transformers]==2024.5.* \ +# openvino==2024.5.* \ +# nncf>=2.11.0 \ +# sentence_transformers==3.1.1 \ +# openai \ +# "transformers>4.45" \ +# einops COPY infinity_emb infinity_emb # Install dependency with infinity_emb package @@ -56,7 +64,7 @@ COPY infinity_emb infinity_emb COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/cpu" -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ openvino-tokenizers[transformers]==2024.5.* \ openvino==2024.5.* \ nncf>=2.11.0 \ @@ -64,6 +72,14 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.op openai \ "transformers>4.45" \ einops +# RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +# openvino-tokenizers[transformers]==2024.5.* \ +# openvino==2024.5.* \ +# nncf>=2.11.0 \ +# sentence_transformers==3.1.1 \ +# openai \ +# "transformers>4.45" \ +# einops # @@ -74,7 +90,7 @@ FROM builder as testing COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh RUN apt update -y && apt install git -y RUN ./requirements_install_from_poetry.sh --with lint,test "https://download.pytorch.org/whl/cpu" -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +RUN 
PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ openvino-tokenizers[transformers]==2024.5.* \ openvino==2024.5.* \ nncf>=2.11.0 \ @@ -82,6 +98,14 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.op openai \ "transformers>4.45" \ einops +# RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" poetry run python -m pip install -U --pre optimum-intel@git+https://github.com/huggingface/optimum-intel.git \ +# openvino-tokenizers[transformers]==2024.5.* \ +# openvino==2024.5.* \ +# nncf>=2.11.0 \ +# sentence_transformers==3.1.1 \ +# openai \ +# "transformers>4.45" \ +# einops # lint RUN poetry run ruff check .
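
With the series applied, OpenVINO is selected through the existing optimum engine via the new Device.openvino value. A minimal usage sketch — assuming the v2 CLI's --engine/--device flags map onto InferenceEngine.optimum and Device.openvino the same way the unit test above constructs OptimumEmbedder programmatically:

    # build the CPU image with the optimum-intel/OpenVINO dependencies baked in
    docker buildx build -f libs/infinity_emb/Dockerfile.cpu_auto -t infinity-openvino libs/infinity_emb
    # serve an embedding model on the OpenVINOExecutionProvider
    infinity_emb v2 --model-id BAAI/bge-large-en-v1.5 --engine optimum --device openvino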