diff --git a/dev-requirements.txt b/dev-requirements.txt index 7aa26658..95175f8a 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -321,6 +321,7 @@ tabulate==0.9.0 # via sphinx-toolbox tomli==2.2.1 # via + # labthings-fastapi (pyproject.toml) # coverage # flake8-pyproject # mypy diff --git a/docs/source/blobs.rst b/docs/source/blobs.rst index 4bcdb8f8..a03da48d 100644 --- a/docs/source/blobs.rst +++ b/docs/source/blobs.rst @@ -7,14 +7,14 @@ Blob input/output If interactions require only simple data types that can easily be represented in JSON, very little thought needs to be given to data types - strings and numbers will be converted to and from JSON automatically, and your Python code should only ever see native Python datatypes whether it's running on the server or a remote client. However, if you want to transfer larger data objects such as images, large arrays or other binary data, you will need to use a `.Blob` object. -`.Blob` objects are not part of the Web of Things specification, which doesn't give much consideration to returning large or complicated datatypes. In LabThings-FastAPI, the `.Blob` mechanism is intended to provide an efficient way to work with arbitrary binary data. If it's used to transfer data between two Things on the same server, the data should not be copied or otherwise iterated over - and when it must be transferred over the network it can be done using a binary transfer, rather than embedding in JSON with base64 encoding. +`.Blob` objects are not part of the Web of Things specification, which doesn't give much consideration to returning large or complicated datatypes. In LabThings-FastAPI, the `.Blob` mechanism is intended to provide an efficient way to work with arbitrary binary data. If a `.Blob` is passed between two Things on the same server, the data will not be copied - and when it must be transferred over the network it can be done using a binary transfer, rather than embedding in JSON with base64 encoding. 
-A `.Blob` consists of some data and a MIME type, which sets how the data should be interpreted. It is best to create a subclass of `.Blob` with the content type set: this makes it clear what kind of data is in the `.Blob`. In the future, it might be possible to add functionality to `.Blob` subclasses, for example to make it simple to obtain an image object from a `.Blob` containing JPEG data. However, this will not currently work across both client and server code. +A `.Blob` consists of some data and a MIME type, which sets how the data should be interpreted. It is best to create a subclass of `.Blob` with the ``media_type`` set: this makes it clear what kind of data is in the `.Blob`. In the future, it might be possible to add functionality to `.Blob` subclasses, for example to make it simple to obtain an image object from a `.Blob` containing JPEG data. However, this will not currently work across both client and server code. Creating and using `.Blob` objects ------------------------------------------------ -Blobs can be created from binary data that is in memory (a `bytes` object) with `.Blob.from_bytes`, on disk (with `.Blob.from_temporary_directory` or `.Blob.from_file`), or using a URL as a placeholder. The intention is that the code that uses a `.Blob` should not need to know which of these is the case, and should be able to use the same code regardless of how the data is stored. +Blobs can be created from binary data that is in memory (a `bytes` object) with `.Blob.from_bytes`, or on disk (with `.Blob.from_temporary_directory` or `.Blob.from_file`). A `.Blob` may also point to remote data (see `.Blob.from_url`). Code that uses a `.Blob` should not need to know how the data is stored, as the interface is the same in each case.
Blobs offer three ways to access their data: @@ -122,7 +122,7 @@ On the client, we can use the `capture_image` action directly (as before), or we HTTP interface and serialization -------------------------------- -`.Blob` objects are subclasses of `pydantic.BaseModel`, which means they can be serialized to JSON and deserialized from JSON. When this happens, the `.Blob` is represented as a JSON object with `.Blob.url` and `.Blob.content_type` fields. The `.Blob.url` field is a link to the data. The `.Blob.content_type` field is a string representing the MIME type of the data. It is worth noting that models may be nested: this means an action may return many `.Blob` objects in its output, either as a list or as fields in a `pydantic.BaseModel` subclass. Each `.Blob` in the output will be serialized to JSON with its URL and content type, and the client can then download the data from the URL, one download per `.Blob` object. +`.Blob` objects can be serialized to JSON and deserialized from JSON. When this happens, the `.Blob` is represented as a JSON object with ``href`` and ``media_type`` fields. The ``href`` field is a link to the data. The ``media_type`` field is a string representing the MIME type of the data. It is worth noting that models may be nested: this means an action may return many `.Blob` objects in its output, either as a list or as fields in a `pydantic.BaseModel` subclass. Each `.Blob` in the output will be serialized to JSON with its URL and content type, and the client can then download the data from the URL, one download per `.Blob` object. When a `.Blob` is serialized, the URL is generated with a unique ID to allow it to be downloaded. The URL is not guaranteed to be permanent, and should not be used as a long-term reference to the data.
For `.Blob` objects that are part of the output of an action, the URL will expire after 5 minutes (or the retention time set for the action), and the data will no longer be available for download after that time. @@ -136,7 +136,7 @@ It may be possible to have actions return binary data directly in the future, bu .. note:: - Serialising or deserialising `.Blob` objects requires access to the `.BlobDataManager`\ . As there is no way to pass this in to the relevant methods at serialisation/deserialisation time, we use context variables to access them. This means that a `.blob_serialisation_context_manager` should be used to set (and then clear) those context variables. This is done by the `.BlobIOContextDep` dependency on the relevant endpoints (currently any endpoint that may return the output of an action). + Serialising or deserialising `.Blob` objects generates URLs, which are specific to the HTTP request. This means that `.Blob` objects cannot be serialised or deserialised outside the context of an HTTP request handler, so if code in an Action or Property attempts to turn a `.Blob` into JSON, it is likely to raise exceptions. For more detail on this mechanism, see `.middleware.url_for`\ . 
Memory management and retention diff --git a/pyproject.toml b/pyproject.toml index 72e72adc..41797675 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dev = [ "sphinx>=7.2", "sphinx-autoapi", "sphinx-toolbox", + "tomli; python_version < '3.11'", "codespell", ] @@ -171,5 +172,8 @@ check-return-types = false check-class-attributes = false # prefer docstrings on the attributes check-yield-types = false # use type annotations instead +[tool.codespell] +ignore-words-list = ["ser"] + [project.scripts] labthings-server = "labthings_fastapi.server.cli:serve_from_cli" diff --git a/src/labthings_fastapi/actions.py b/src/labthings_fastapi/actions.py index 95b9910f..ff8091de 100644 --- a/src/labthings_fastapi/actions.py +++ b/src/labthings_fastapi/actions.py @@ -39,6 +39,8 @@ from fastapi import FastAPI, HTTPException, Request, Body, BackgroundTasks from pydantic import BaseModel, create_model +from labthings_fastapi.middleware.url_for import URLFor + from .base_descriptor import BaseDescriptor from .logs import add_thing_log_destination from .utilities import model_to_dict, wrap_plain_types_in_rootmodel @@ -47,10 +49,8 @@ from .exceptions import ( InvocationCancelledError, InvocationError, - NoBlobManagerError, NotConnectedToServerError, ) -from .outputs.blob import BlobIOContextDep, blobdata_to_url_ctx from . import invocation_contexts from .utilities.introspection import ( EmptyInput, @@ -149,23 +149,7 @@ def id(self) -> uuid.UUID: @property def output(self) -> Any: - """Return value of the Action. If the Action is still running, returns None. - - :raise NoBlobManagerError: If this is called in a context where the blob - manager context variables are not available. This stops errors being raised - later once the blob is returned and tries to serialise. If the errors - happen during serialisation the stack-trace will not clearly identify - the route with the missing dependency. 
- """ - try: - blobdata_to_url_ctx.get() - except LookupError as e: - raise NoBlobManagerError( - "An invocation output has been requested from a api route that " - "doesn't have a BlobIOContextDep dependency. This dependency is needed " - " for blobs to identify their url." - ) from e - + """Return value of the Action. If the Action is still running, returns None.""" with self._status_lock: return self._return_value @@ -225,25 +209,20 @@ def cancel(self) -> None: """ self.cancel_hook.set() - def response(self, request: Optional[Request] = None) -> InvocationModel: + def response(self) -> InvocationModel: """Generate a representation of the invocation suitable for HTTP. When an invocation is polled, we return a JSON object that includes its status, any log entries, a return value (if completed), and a link to poll for updates. - :param request: is used to generate the ``href`` in the response, which - should retrieve an updated version of this response. - :return: an `.InvocationModel` representing this `.Invocation`. """ - if request: - href = str(request.url_for("action_invocation", id=self.id)) - else: - href = f"{ACTION_INVOCATIONS_PATH}/{self.id}" links = [ - LinkElement(rel="self", href=href), - LinkElement(rel="output", href=href + "/output"), + LinkElement(rel="self", href=URLFor("action_invocation", id=self.id)), + LinkElement( + rel="output", href=URLFor("action_invocation_output", id=self.id) + ), ] # The line below confuses MyPy because self.action **evaluates to** a Descriptor # object (i.e. we don't call __get__ on the descriptor). 
@@ -251,7 +230,7 @@ def response(self, request: Optional[Request] = None) -> InvocationModel: status=self.status, id=self.id, action=self.thing.path + self.action.name, # type: ignore[call-overload] - href=href, + href=URLFor("action_invocation", id=self.id), timeStarted=self._start_time, timeCompleted=self._end_time, timeRequested=self._request_time, @@ -442,7 +421,7 @@ def list_invocations( :return: A list of invocations, optionally filtered by Thing and/or Action. """ return [ - i.response(request=request) + i.response() for i in self.invocations if thing is None or i.thing == thing if action is None or i.action == action # type: ignore[call-overload] @@ -470,25 +449,19 @@ def attach_to_app(self, app: FastAPI) -> None: """ @app.get(ACTION_INVOCATIONS_PATH, response_model=list[InvocationModel]) - def list_all_invocations( - request: Request, _blob_manager: BlobIOContextDep - ) -> list[InvocationModel]: + def list_all_invocations(request: Request) -> list[InvocationModel]: return self.list_invocations(request=request) @app.get( ACTION_INVOCATIONS_PATH + "/{id}", responses={404: {"description": "Invocation ID not found"}}, ) - def action_invocation( - id: uuid.UUID, request: Request, _blob_manager: BlobIOContextDep - ) -> InvocationModel: + def action_invocation(id: uuid.UUID, request: Request) -> InvocationModel: """Return a description of a specific action. :param id: The action's ID (from the path). :param request: FastAPI dependency for the request object, used to find URLs via ``url_for``. - :param _blob_manager: FastAPI dependency that enables `.Blob` objects - to be serialised. :return: Details of the invocation. 
@@ -497,7 +470,7 @@ def action_invocation( """ try: with self._invocations_lock: - return self._invocations[id].response(request=request) + return self._invocations[id].response() except KeyError as e: raise HTTPException( status_code=404, @@ -518,17 +491,13 @@ def action_invocation( 503: {"description": "No result is available for this invocation"}, }, ) - def action_invocation_output( - id: uuid.UUID, _blob_manager: BlobIOContextDep - ) -> Any: + def action_invocation_output(id: uuid.UUID) -> Any: """Get the output of an action invocation. This returns just the "output" component of the action invocation. If the output is a file, it will return the file. :param id: The action's ID (from the path). - :param _blob_manager: FastAPI dependency that enables `.Blob` objects - to be serialised. :return: The output of the invocation, as a `pydantic.BaseModel` instance. If this is a `.Blob`, it may be returned directly. @@ -806,8 +775,6 @@ def add_to_fastapi(self, app: FastAPI, thing: Thing) -> None: # The solution below is to manually add the annotation, before passing # the function to the decorator. def start_action( - _blob_manager: BlobIOContextDep, - request: Request, body: Any, # This annotation will be overwritten below. 
id: NonWarningInvocationID, background_tasks: BackgroundTasks, @@ -822,7 +789,7 @@ def start_action( id=id, ) background_tasks.add_task(action_manager.expire_invocations) - return action.response(request=request) + return action.response() if issubclass(self.input_model, EmptyInput): annotation = Body(default_factory=StrictEmptyInput) @@ -884,7 +851,7 @@ def start_action( ), summary=f"All invocations of {self.name}.", ) - def list_invocations(_blob_manager: BlobIOContextDep) -> list[InvocationModel]: + def list_invocations() -> list[InvocationModel]: action_manager = thing._thing_server_interface._action_manager return action_manager.list_invocations(self, thing) diff --git a/src/labthings_fastapi/client/__init__.py b/src/labthings_fastapi/client/__init__.py index 1b45d4c4..8fbc2024 100644 --- a/src/labthings_fastapi/client/__init__.py +++ b/src/labthings_fastapi/client/__init__.py @@ -13,9 +13,9 @@ import httpx from urllib.parse import urlparse, urljoin -from pydantic import BaseModel +from pydantic import BaseModel, TypeAdapter -from .outputs import ClientBlobOutput +from ..outputs.blob import Blob, RemoteBlobData from ..exceptions import ( FailedToInvokeActionError, ServerActionError, @@ -206,16 +206,14 @@ def invoke_action(self, path: str, **kwargs: Any) -> Any: """ for k in kwargs.keys(): value = kwargs[k] - if isinstance(value, ClientBlobOutput): - # ClientBlobOutput objects may be used as input to a subsequent - # action. When this is done, they should be serialised to a dict - # with `href` and `media_type` keys, as done below. - # Ideally this should be replaced with `Blob` and the use of - # `pydantic` models to serialise action inputs. + if isinstance(value, Blob): + # Blob objects may be used as input to a subsequent + # action. When this is done, they should be serialised by + # pydantic, to a dictionary that includes href and media_type. # # Note that the blob will not be uploaded: we rely on the blob # still existing on the server. 
- kwargs[k] = {"href": value.href, "media_type": value.media_type} + kwargs[k] = TypeAdapter(Blob).dump_python(value) response = self.client.post(urljoin(self.path, path), json=kwargs) if response.is_error: message = _construct_failed_to_invoke_message(path, response) @@ -228,10 +226,12 @@ def invoke_action(self, path: str, **kwargs: Any) -> Any: and "href" in invocation["output"] and "media_type" in invocation["output"] ): - return ClientBlobOutput( - media_type=invocation["output"]["media_type"], - href=invocation["output"]["href"], - client=self.client, + return Blob( + RemoteBlobData( + media_type=invocation["output"]["media_type"], + href=invocation["output"]["href"], + client=self.client, + ) ) return invocation["output"] message = _construct_invocation_error_message(invocation) diff --git a/src/labthings_fastapi/client/outputs.py b/src/labthings_fastapi/client/outputs.py deleted file mode 100644 index 09c4962a..00000000 --- a/src/labthings_fastapi/client/outputs.py +++ /dev/null @@ -1,77 +0,0 @@ -"""A client-side implementation of `.Blob`. - -.. note:: - - In the future, both client and server code are planned to use `.Blob` to - represent binary data, or data held in a file. - -When a `.ThingClient` returns data to a client that matches the schema of a `.Blob` -(specifically, it needs an `href` and a `media_type`), we convert it into a -`.ClientBlobOutput` object. This is a work-a-like for `.Blob`, meaning it can -be saved to a file or have its contents accessed in the same ways. -""" - -import io -from typing import Optional -import httpx - - -class ClientBlobOutput: - """An output from LabThings best returned as a file. - - This object is returned by a client when the output is not serialised to JSON. - It may be either retrieved to memory using `.ClientBlobOutput.content`, or - saved to a file using `.ClientBlobOutput.save`. - - .. note:: - - In the future, it is planned to replace this with `.Blob` as used on - server-side code. 
The ``.content`` and ``.save()`` methods should be - identical between the two. - """ - - media_type: str - download_url: str - - def __init__( - self, media_type: str, href: str, client: Optional[httpx.Client] = None - ) -> None: - """Create a ClientBlobOutput to wrap a link to a downloadable file. - - :param media_type: the MIME type of the remote file. - :param href: the URL where it may be downloaded. - :param client: if supplied, this `httpx.Client` will be used to - download the data. - """ - self.media_type = media_type - self.href = href - self.client = client or httpx.Client() - - @property - def content(self) -> bytes: - """The binary data, as a `bytes` object.""" - return self.client.get(self.href).content - - def save(self, filepath: str) -> None: - """Save the output to a file. - - This may remove the need to hold the output in memory, though for now it - simply retrieves the output into memory, then writes it to a file. - - :param filepath: the file will be saved at this location. - """ - with open(filepath, "wb") as f: - f.write(self.content) - - def open(self) -> io.IOBase: - """Open the output as a binary file-like object. - - Internally, this will download the file to memory, and wrap the - resulting `bytes` object in an `io.BytesIO` object to allow it to - function as a file-like object. - - To work with the data on disk, use `.ClientBlobOutput.save` instead. - - :return: a file-like object containing the downloaded data. 
- """ - return io.BytesIO(self.content) diff --git a/src/labthings_fastapi/invocations.py b/src/labthings_fastapi/invocations.py index 7ed10e93..21f156e2 100644 --- a/src/labthings_fastapi/invocations.py +++ b/src/labthings_fastapi/invocations.py @@ -13,6 +13,8 @@ from pydantic import BaseModel, ConfigDict, model_validator +from labthings_fastapi.middleware.url_for import URLFor + from .thing_description._model import Links @@ -91,7 +93,7 @@ class GenericInvocationModel(BaseModel, Generic[InputT, OutputT]): status: InvocationStatus id: uuid.UUID action: str - href: str + href: URLFor timeStarted: Optional[datetime] timeRequested: Optional[datetime] timeCompleted: Optional[datetime] diff --git a/src/labthings_fastapi/middleware/url_for.py b/src/labthings_fastapi/middleware/url_for.py index 09ab2c77..f34f453c 100644 --- a/src/labthings_fastapi/middleware/url_for.py +++ b/src/labthings_fastapi/middleware/url_for.py @@ -187,9 +187,7 @@ def __get_pydantic_core_schema__( return core_schema.no_info_wrap_validator_function( cls._validate, AnyUrl.__get_pydantic_core_schema__(AnyUrl, handler), - serialization=core_schema.to_string_ser_schema( # codespell:ignore ser - when_used="always" - ), + serialization=core_schema.to_string_ser_schema(when_used="always"), ) @classmethod @@ -199,9 +197,9 @@ def _validate(cls, value: Any, handler: Callable[[Any], Self]) -> Self: :param value: The value to validate. :param handler: The handler to convert the value if needed. :return: The validated URLFor instance. - :raises TypeError: if the value is not a URLFor instance. + :raises ValueError: if the value is not a URLFor instance. 
""" if isinstance(value, cls): return value else: - raise TypeError("URLFor instances may not be created from strings.") + raise ValueError("URLFor instances may not be created from strings.") diff --git a/src/labthings_fastapi/outputs/blob.py b/src/labthings_fastapi/outputs/blob.py index a873b045..4f117db4 100644 --- a/src/labthings_fastapi/outputs/blob.py +++ b/src/labthings_fastapi/outputs/blob.py @@ -1,4 +1,4 @@ -"""BLOB Output Module. +r"""BLOB Output Module. The ``.Blob`` class is used when you need to return something file-like that can't easily (or efficiently) be converted to JSON. This is useful for returning large objects @@ -36,64 +36,120 @@ def get_image(self) -> MyImageBlob: action outputs may be retrieved multiple times after the action has completed, possibly concurrently. Creating a temp folder and making a file inside it with `.Blob.from_temporary_directory` is the safest way to deal with this. + +**Serialisation** + +`.Blob` objects are serialised to a JSON representation that includes a download +``href``\ . This is generated using `.middleware.url_for` which uses a context +variable to pass the function that generates URLs to the serialiser code. That +context variable is available in every response handler function in the FastAPI +app - but it is not, in general, available in action or property code (because +actions and properties run their code in separate threads). The sequence of events +that leads to a `Blob` being downloaded as a result of an action is roughly: + +* A `POST` request invokes the action. + * `.middleware.url_for.url_for_middleware` makes `url_for` accessible via + a context variable + * A `201` response is returned that includes an ``href`` to poll the action. + * Action code is run in a separate thread (without `url_for` in the context): + * The action creates a `.Blob` object. 
+ * The function that creates the `.Blob` object also creates a `.BlobData` + object as a property of the `.Blob` + * The `.BlobData` object's constructor adds it to the ``blob_manager`` and + sets its ``id`` property accordingly. + * The `.Blob` is returned by the action. + * The output value of the action is stored in the `.Invocation` thread. +* A `GET` request polls the action. Once it has completed: + * `.middleware.url_for.url_for_middleware` makes `url_for` accessible via + a context variable + * The `.Invocation` model is returned, which includes the `.Blob` in the + ``output`` field. + * FastAPI serialises the invocation model, which in turn serialises the `.Blob` + and uses ``url_for`` to generate a valid download ``href`` including the ``id`` + of the `.BlobData` object. +* A further `GET` request actually downloads the `.Blob`\ . + +This slightly complicated sequence ensures that we only ever send URLs back to the +client using `url_for` from the current `.fastapi.Request` object. That means the +URL used should be consistent with the URL of the request - so if an action is +started by a client using one IP address or DNS name, and polled by a different +client, each client will get a download ``href`` that matches the address they are +already using. + +In the future, it may be possible to respond directly with the `.Blob` data to +the original `POST` request, however this only works for quick actions so for now +we use the sequence above, which will work for both quick and slow actions. 
""" from __future__ import annotations -from contextvars import ContextVar +from collections.abc import Callable import io import os import re import shutil from typing import ( - Annotated, Any, - AsyncGenerator, - Callable, Literal, Mapping, - Optional, ) from warnings import warn from weakref import WeakValueDictionary -from typing_extensions import TypeAlias from tempfile import TemporaryDirectory import uuid -from fastapi import FastAPI, Depends, Request +from fastapi import FastAPI from fastapi.responses import FileResponse, Response +import httpx from pydantic import ( BaseModel, - create_model, - model_serializer, - model_validator, + GetCoreSchemaHandler, + GetJsonSchemaHandler, ) -from starlette.exceptions import HTTPException -from typing_extensions import Self, Protocol, runtime_checkable +from pydantic.json_schema import JsonSchemaValue +from pydantic_core import core_schema +from typing_extensions import Self +from labthings_fastapi.middleware.url_for import url_for -@runtime_checkable -class BlobData(Protocol): - """The interface for the data store of a Blob. +class BlobData: + """The data store of a Blob. `.Blob` objects can represent their data in various ways. Each of those options must provide three ways to access the data, which are the `content` property, the `save()` method, and the `open()` method. - This protocol defines the interface needed by any data store used by a + This base class defines the interface needed by any data store used by a `.Blob`. - Objects that are used on the server will additionally need to implement the - [`ServerSideBlobData`](#labthings_fastapi.outputs.blob.ServerSideBlobData) protocol, - which adds a `response()` method and `id` property. + Blobs that store their data locally should subclass `.LocalBlobData` + which adds a `response()` method and `id` property, appropriate for data + that would need to be downloaded from a server. It also takes care of + generating a download URL when it's needed. 
""" + def __init__(self, media_type: str) -> None: + """Initialise a `.BlobData` object. + + :param media_type: the MIME type of the data. + """ + self._media_type = media_type + @property def media_type(self) -> str: - """The MIME type of the data, e.g. 'image/png' or 'application/json'. + """The MIME type of the data, e.g. 'image/png' or 'application/json'.""" + return self._media_type + + def get_href(self) -> str: + """Return the URL to download the blob. + The implementation of this method for local blobs will need + `.url_for.url_for` and thus it should only be called in a response + handler when the `.middleware.url_for` middleware is enabled. + + :return: the URL as a string. :raises NotImplementedError: always, as this must be implemented by subclasses. """ - raise NotImplementedError("media_type property must be implemented.") + raise NotImplementedError("get_href must be implemented.") @property def content(self) -> bytes: @@ -107,44 +163,134 @@ def save(self, filename: str) -> None: """Save the data to a file. :param filename: the path where the file should be saved. + :raises NotImplementedError: always, as this must be implemented by subclasses. """ - ... # pragma: no cover + raise NotImplementedError("save must be implemented.") def open(self) -> io.IOBase: """Return a file-like object that may be read from. :return: an open file-like object. + :raises NotImplementedError: always, as this must be implemented by subclasses. """ - ... # pragma: no cover + raise NotImplementedError("open must be implemented.") -class ServerSideBlobData(BlobData, Protocol): - """A BlobData protocol for server-side use, i.e. including `response()`. +class RemoteBlobData(BlobData): + r"""A BlobData subclass that references remote data via a URL. - `.Blob` objects returned by actions must use `.BlobData` objects - that can be downloaded.
This protocol extends the `.BlobData` protocol to - include a `.ServerSideBlobData.response` method that returns a - `fastapi.Response` object. + This `.BlobData` implementation will download data lazily, and + provides it in the three ways defined by `.BlobData`\ . It + does not cache downloaded data: if the `.content` attribute is + accessed multiple times, the data will be downloaded again each + time. - See `.BlobBytes` or `.BlobFile` for concrete implementations. + .. note:: + + This class is rarely instantiated directly. It is usually best to use + `.Blob.from_url` on a `.Blob` subclass. """ - id: Optional[uuid.UUID] = None - """A unique identifier for this BlobData object. + def __init__( + self, media_type: str, href: str, client: httpx.Client | None = None + ) -> None: + """Create a reference to remote `.Blob` data. + + :param media_type: the MIME type of the data. + :param href: the URL where it may be downloaded. + :param client: if supplied, this `httpx.Client` will be used to + download the data. + """ + super().__init__(media_type=media_type) + self._href = href + self._client = client or httpx.Client() + + def get_href(self) -> str: + """Return the URL to download the data. + + :return: the URL as a string. + """ + return self._href + + @property + def content(self) -> bytes: + """The binary data, as a `bytes` object.""" + return self._client.get(self._href).content - The ID is set when the BlobData object is added to the BlobDataManager. - It is used to retrieve the BlobData object from the manager. + def save(self, filepath: str) -> None: + """Save the output to a file. + + Note that the current implementation retrieves the data into + memory in its entirety, and saves to file afterwards. + + :param filepath: the file will be saved at this location. + """ + with open(filepath, "wb") as f: + f.write(self.content) + + def open(self) -> io.IOBase: + """Open the output as a binary file-like object. 
+ + Internally, this will download the file to memory, and wrap the + resulting `bytes` object in an `io.BytesIO` object to allow it to + function as a file-like object. + + To work with the data on disk, use `save` instead. + + :return: a file-like object containing the downloaded data. + """ + return io.BytesIO(self.content) + + +class LocalBlobData(BlobData): + """A BlobData subclass where the data is stored locally. + + `.Blob` objects can reference data by a URL, or can wrap data + held in memory or on disk. For the non-URL options, we need to register the + data with the `.BlobManager` and allow it to be downloaded. This class takes + care of registering with the `.BlobManager` and adds the `.response` method + that must be overridden by subclasses to allow downloading. + + See `.BlobBytes` or `.BlobFile` for concrete implementations. """ + def __init__(self, media_type: str) -> None: + """Initialise the LocalBlobData object. + + :param media_type: the MIME type of the data. + """ + super().__init__(media_type=media_type) + self._id = blob_data_manager.add_blob(self) + + @property + def id(self) -> uuid.UUID: + """A unique identifier for this BlobData object. + + The ID is set when the BlobData object is added to the `BlobDataManager` + during initialisation. + """ + return self._id + + def get_href(self) -> str: + r"""Return a URL where this data may be downloaded. + + Note that this should only be called in a response handler, as it + relies on `.url_for.url_for`\ . + + :return: the URL as a string. + """ + return str(url_for("download_blob", blob_id=self.id)) + def response(self) -> Response: """Return a`fastapi.Response` object that sends binary data. :return: a response that streams the data from disk or memory. + :raises NotImplementedError: always, as this must be implemented by subclasses. """ - ... 
# pragma: no cover + raise NotImplementedError -class BlobBytes: +class BlobBytes(LocalBlobData): """A `.Blob` that holds its data in memory as a `bytes` object. `.Blob` objects use objects conforming to the `.BlobData` protocol to @@ -157,20 +303,21 @@ class BlobBytes: `.Blob.from_bytes` on a `.Blob` subclass. """ - id: Optional[uuid.UUID] = None - """A unique ID to identify the data in a `.BlobManager`.""" + _id: uuid.UUID def __init__(self, data: bytes, media_type: str) -> None: """Create a `.BlobBytes` object. - `.BlobBytes` objects wrap data stored in memory as `bytes`. They - are not usually instantiated directly, but made using `.Blob.from_bytes`. + .. note:: + + This class is rarely instantiated directly. It is usually best to use + `.Blob.from_bytes` on a `.Blob` subclass. :param data: is the data to be wrapped. :param media_type: is the MIME type of the data. """ + super().__init__(media_type=media_type) self._bytes = data - self.media_type = media_type @property def content(self) -> bytes: @@ -202,12 +349,8 @@ def response(self) -> Response: return Response(content=self._bytes, media_type=self.media_type) -class BlobFile: - """A `.Blob` that holds its data in a file. - - `.Blob` objects use objects conforming to the `.BlobData` protocol to - store their data either on disk or in a file. This implements the protocol - using a file on disk. +class BlobFile(LocalBlobData): + """A `.BlobData` backed by a file on disk. Only the filepath is retained by default. If you are using e.g. a temporary directory, you should add the `.TemporaryDirectory` as an instance attribute, @@ -216,12 +359,9 @@ class BlobFile: .. note:: This class is rarely instantiated directly. It is usually best to use - `.Blob.from_temporary_directory` on a `.Blob` subclass. + `.Blob.from_file` on a `.Blob` subclass. 
""" - id: Optional[uuid.UUID] = None - """A unique ID to identify the data in a `.BlobManager`.""" - def __init__(self, file_path: str, media_type: str, **kwargs: Any) -> None: r"""Create a `.BlobFile` to wrap data stored on disk. @@ -237,10 +377,10 @@ def __init__(self, file_path: str, media_type: str, **kwargs: Any) -> None: :raise IOError: if the file specified does not exist. """ + super().__init__(media_type=media_type) if not os.path.exists(file_path): raise IOError("Tried to return a file that doesn't exist.") self._file_path = file_path - self.media_type = media_type for key, val in kwargs.items(): setattr(self, key, val) @@ -287,36 +427,16 @@ def response(self) -> Response: return FileResponse(self._file_path, media_type=self.media_type) -class Blob(BaseModel): - """A container for binary data that may be retrieved over HTTP. - - See :ref:`blobs` for more information on how to use this class. - - A `.Blob` may be created to hold data using the class methods - `.Blob.from_bytes`, `.Blob.from_file` or `.Blob.from_temporary_directory`. - The constructor will attempt to deserialise a Blob from a URL - (see `__init__` method) and is unlikely to be used except in code - internal to LabThings. +class BlobModel(BaseModel): + """A model for JSON-serialised `.Blob` objects. - You are strongly advised to use a subclass of this class that specifies the - `.Blob.media_type` attribute, as this will propagate to the auto-generated - documentation. + This model describes the JSON representation of a `.Blob` + and does not offer any useful functionality. """ href: str - """The URL where the data may be retrieved. - - `.Blob` objects on a `.ThingServer` are assigned a URL when they are - serialised to JSON. This allows them to be downloaded as binary data in a - separate HTTP request. - - `.Blob` objects created by a `.ThingClient` contain a URL pointing to the - data, which will be downloaded when it is required. 
- - `.Blob` objects that store their data in a file or in memory will have the - ``href`` attribute set to the special value `blob://local`. - """ - media_type: str = "*/*" + """The URL where the data may be retrieved.""" + media_type: str """The MIME type of the data. This should be overridden in subclasses.""" rel: Literal["output"] = "output" """The relation of this link to the host object. @@ -330,129 +450,252 @@ class Blob(BaseModel): ) """This description is added to the serialised `.Blob`.""" - _data: Optional[ServerSideBlobData] = None - """This object holds the data, either in memory or as a file. - If `_data` is `None`, then the Blob has not been deserialised yet, and the - `href` should point to a valid address where the data may be downloaded. +def parse_media_type(media_type: str) -> tuple[str, str]: + """Parse a media type string into its type and subtype. + + :param media_type: the media type string to parse. + + :return: a tuple of (type, subtype), each of which is a non-empty string. + :raises ValueError: if the media type is invalid. + """ + # Ignore surrounding whitespace and parameters (after a ;) + media_type = media_type.strip().split(";")[0] + # We expect a type and subtype separated with a / + parts = media_type.split("/") + if len(parts) != 2: + raise ValueError( + f"Invalid media type: {media_type} must contain exactly one '/'." + ) + main_type = parts[0].strip() + sub_type = parts[1].strip() + if len(main_type) == 0 or len(sub_type) == 0: + raise ValueError( + f"Invalid media type: {media_type} must have both type and subtype." + ) + if main_type == "*" and sub_type != "*": + raise ValueError( + f"Invalid media type: {media_type} has no type but has a subtype." + ) + return main_type, sub_type + + +def match_media_types(media_type: str, pattern: str) -> bool: + """Check if a media type matches a pattern. + + The pattern may include wildcards, e.g. ``image/*`` or ``*/*``. + + :param media_type: the media type to check. 
+ :param pattern: the pattern to match against. + + :return: True if the media type matches the pattern, False otherwise. + """ + type_a, subtype_a = parse_media_type(media_type) + type_b, subtype_b = parse_media_type(pattern) + if type_b != "*" and type_a != type_b: + return False + if subtype_b != "*" and subtype_a != subtype_b: + return False + return True + + +class Blob: + r"""A container for binary data that may be retrieved over HTTP. + + See :ref:`blobs` for more information on how to use this class. + + A `.Blob` may be created to hold data using the class methods + `.Blob.from_bytes`, `.Blob.from_file` or `.Blob.from_temporary_directory`\ . + It may also reference remote data, using `.Blob.from_url`\ , though this + is currently only used on the client side. + The constructor requires a `.BlobData` instance, so the methods mentioned + previously are likely a more convenient way to instantiate a `.Blob`\ . + + You are strongly advised to use a subclass of this class that specifies the + `.Blob.media_type` attribute, as this will propagate to the auto-generated + documentation and make the return type of your action clearer. + + This class is `pydantic` compatible, in that it provides a schema, validator + and serialiser. However, it may use `.url_for.url_for` during serialisation, + so it should only be serialised in a request handler function. This + functionality is intended for use by LabThings library functions only. + Validation and serialisation behaviour is described in the docstrings of + `.Blob._validate` and `.Blob._serialize`. """ - @model_validator(mode="after") - def retrieve_data(self) -> Self: - r"""Retrieve the data from the URL. - - When a `.Blob` is created using its constructor, `pydantic` - will attempt to deserialise it by retrieving the data from the URL - specified in `.Blob.href`. Currently, this must be a URL pointing to a - `.Blob` that already exists on this server, and any other URL will - cause a `LookupError`. 
- - This validator will only work if the function to resolve URLs to - `.BlobData` objects - has been set in the context variable `.blob.url_to_blobdata_ctx`\ . - This is done when actions are being invoked over HTTP by the - `.BlobIOContextDep` dependency. - - :return: the `.Blob` object (i.e. ``self``), after retrieving the data. - - :raise ValueError: if the ``href`` is set as ``"blob://local"`` but - the ``_data`` attribute has not been set. This happens when the - `.Blob` is being constructed using `.Blob.from_bytes` or similar. - :raise LookupError: if the `.Blob` is being constructed from a URL - and the URL does not correspond to a `.BlobData` instance that - exists on this server (i.e. one that has been previously created - and added to the `.BlobManager` as the result of a previous action). - """ - if self.href == "blob://local": - if self._data: - return self - raise ValueError("Blob objects must have data if the href is blob://local") + media_type: str = "*/*" + """The MIME type of the data. This should be overridden in subclasses.""" + description: str | None = None + """An optional description that may be added to the serialised `.Blob`.""" + _data: BlobData + """This object stores the data - in memory, on disk, or at a URL.""" + + def __init__(self, data: BlobData, description: str | None = None) -> None: + """Create a `.Blob` object wrapping the given data. + + :param data: the `.BlobData` object that stores the data. + :param description: an optional description of the blob. + + :raise ValueError: if the media_type of the data does not match + the media_type of the `.Blob` subclass. + """ + super().__init__() + self._data = data + if description is not None: + self.description = description + if not match_media_types(data.media_type, self.media_type): + raise ValueError( + f"Blob data media_type '{data.media_type}' does not match " + f"Blob media_type '{self.media_type}'." 
+ ) + # The data may have a more specific media_type, so we use that + # in preference to the default defined by the class. + self.media_type = data.media_type + + @classmethod + def __get_pydantic_core_schema__( + cls, source: type[Any], handler: GetCoreSchemaHandler + ) -> core_schema.CoreSchema: + """Get the pydantic core schema for this type. + + This magic method allows `pydantic` to serialise `.Blob` + instances, and generate a JSONSchema for them. + + We tell `pydantic` to base its handling of `Blob` on the + `.BlobModel` schema, with custom validation and serialisation. + Validation and serialisation behaviour is described in the docstrings + of `.Blob._validate` and `.Blob._serialize`. + + The JSONSchema is generated for `.BlobModel` but is then refined + in `__get_pydantic_json_schema__` to include the ``media_type`` + and ``description`` defaults. + + :param source: The source type being converted. + :param handler: The pydantic core schema handler. + :return: The pydantic core schema for the `.Blob` type. + """ + return core_schema.no_info_wrap_validator_function( + cls._validate, + BlobModel.__get_pydantic_core_schema__(BlobModel, handler), + serialization=core_schema.wrap_serializer_function_ser_schema( + cls._serialize, + is_field_serializer=False, + info_arg=False, + when_used="always", + ), + ) + + @classmethod + def __get_pydantic_json_schema__( + cls, core_schema: core_schema.CoreSchema, handler: GetJsonSchemaHandler + ) -> JsonSchemaValue: + """Customise the JSON Schema to include the media_type. + + :param core_schema: The core schema for the Blob type. + :param handler: The pydantic JSON schema handler. + :return: The JSON schema for the Blob type, with media_type included. 
+ """ + json_schema = handler(core_schema) + json_schema = handler.resolve_ref_schema(json_schema) + # Set the title to the class name, not BlobModel + json_schema["title"] = cls.__name__ + # Add the media_type default value from this class + json_schema["properties"]["media_type"]["default"] = cls.media_type + # If the media_type is specific, add a const constraint + # This shows that only this media_type is valid + if "*" not in cls.media_type: + json_schema["properties"]["media_type"]["const"] = [cls.media_type] + # Add the default description + if cls.description is not None: + json_schema["properties"]["description"]["default"] = cls.description + return json_schema + + @classmethod + def _validate(cls, value: Any, handler: Callable[[Any], BlobModel]) -> Self: + r"""Validate and convert a value to a `.Blob` instance. + + :param value: The input value, as passed in or loaded from JSON. + :param handler: A function that runs the validation logic of BlobModel. + + If the value is already a `.Blob`, it will be returned directly. + Otherwise, we first validate the input using the `.BlobModel` schema. + + When a `.Blob` is validated, we check to see if the URL given + as its ``href`` looks like a `.Blob` download URL on this server. If + it does, the returned object will hold a reference to the local data. + + If we can't match the URL to a `.Blob` on this server, we will raise + an error. Handling of `.Blob` input is currently experimental, and + limited to passing the output of one Action as input to a subsequent + one. + + :return: a `.Blob` object pointing to the data. + + :raise ValueError: if the ``href`` does not contain a valid Blob ID, or + if the Blob ID is not found on this server. 
+ """ + # If the value is already a Blob, return it directly + if isinstance(value, cls): + return value + # We start by validating the input, which should fit a `BlobModel` + # (this validator is wrapping the BlobModel schema) + model = handler(value) + id = url_to_id(model.href) + if not id: + raise ValueError("Blob URLs must contain a Blob ID.") try: - url_to_blobdata = url_to_blobdata_ctx.get() - self._data = url_to_blobdata(self.href) - self.href = "blob://local" - except LookupError as e: - raise LookupError( - "Blobs may only be created from URLs passed in over HTTP." - f"The URL in question was {self.href}." - ) from e - return self - - @model_serializer(mode="plain", when_used="always") - def to_dict(self) -> Mapping[str, str]: - r"""Serialise the Blob to a dictionary and make it downloadable. - - When `pydantic` serialises this object, - it will call this method to convert it to a dictionary. There is a - significant side-effect, which is that we will add the blob to the - `.BlobDataManager` so it can be downloaded. - - This serialiser will only work if the function to assign URLs to - `.BlobData` objects has been set in the context variable - `.blobdata_to_url_ctx`\ . - This is done when actions are being returned over HTTP by the - `.BlobIOContextDep` dependency. + data = blob_data_manager.get_blob(id) + return cls(data) + except KeyError as error: + raise ValueError(f"Blob ID {id} wasn't found on this server.") from error + + @classmethod + def _serialize( + cls, obj: Self, handler: Callable[[BlobModel], Mapping[str, str]] + ) -> Mapping[str, str]: + """Serialise the Blob to a dictionary. + See `.Blob.to_blobmodel` for a description of how we serialise. + + :param obj: the `.Blob` instance to serialise. + :param handler: the handler (provided by pydantic) takes a BlobModel + and converts it to a dictionary. The handler runs the serialiser of + the core schema we've wrapped, in this case the BlobModel serialiser. 
:return: a JSON-serialisable dictionary with a URL that allows the `.Blob` to be downloaded from the `.BlobManager`. + """ + return handler(obj.to_blobmodel()) - :raise LookupError: if the context variable providing access to the - `.BlobManager` is not available. This usually means the `.Blob` is - being serialised somewhere other than the output of an action. - """ - if self.href == "blob://local": - try: - blobdata_to_url = blobdata_to_url_ctx.get() - # MyPy seems to miss that `self.data` is a property, hence the ignore - href = blobdata_to_url(self.data) # type: ignore[arg-type] - except LookupError as e: - raise LookupError( - "Blobs may only be serialised inside the " - "context created by BlobIOContextDep." - ) from e - else: - href = self.href - return { - "href": href, - "media_type": self.media_type, - "rel": self.rel, - "description": self.description, - } - - @classmethod - def default_media_type(cls) -> str: - """Return the default media type. + def to_blobmodel(self) -> BlobModel: + r"""Represent the `.Blob` as a `.BlobModel` to get ready to serialise. - `.Blob` should generally be subclassed to define the default media type, - as this forms part of the auto-generated documentation. Using the - `.Blob` class directly will result in a media type of `*/*`, which makes - it unclear what format the output is in. + When `pydantic` serialises this object, we first generate a `.BlobModel` + with just the information to be serialised. + We use `.url_for.url_for` to generate the URL, so this will error if + it is serialised anywhere other than a request handler with the + middleware from `.middleware.url_for` enabled. - :return: the default media type as a MIME type string, e.g. ``image/png``. + :return: a `.BlobModel` holding a URL that allows + the `.Blob` to be downloaded from the `.BlobManager`. 
""" - return cls.model_fields["media_type"].get_default() + data = { + "href": self.data.get_href(), + "media_type": self.media_type, + } + if self.description is not None: + data["description"] = self.description + return BlobModel(**data) @property - def data(self) -> ServerSideBlobData: + def data(self) -> BlobData: """The data store for this Blob. - `.Blob` objects may hold their data in various ways, defined by the - `.ServerSideBlobData` protocol. This property returns the data store - for this `.Blob`. - - If the `.Blob` has not yet been downloaded, there may be no data - held locally, in which case this function will raise an exception. - It is recommended to use the `.Blob.content` property or `.Blob.save` or `.Blob.open` methods rather than accessing this property directly. :return: the data store wrapping data on disk or in memory. - - :raise ValueError: if there is no data stored on disk or in memory. """ - if self._data is None: - raise ValueError("This Blob has no data.") return self._data @property @@ -504,10 +747,7 @@ def from_bytes(cls, data: bytes) -> Self: :return: a `.Blob` wrapping the supplied data. """ - return cls.model_construct( # type: ignore[return-value] - href="blob://local", - _data=BlobBytes(data, media_type=cls.default_media_type()), - ) + return cls(BlobBytes(data, media_type=cls.media_type)) @classmethod def from_temporary_directory(cls, folder: TemporaryDirectory, file: str) -> Self: @@ -529,11 +769,10 @@ def from_temporary_directory(cls, folder: TemporaryDirectory, file: str) -> Self :return: a `.Blob` wrapping the file. 
""" file_path = os.path.join(folder.name, file) - return cls.model_construct( # type: ignore[return-value] - href="blob://local", - _data=BlobFile( + return cls( + BlobFile( file_path, - media_type=cls.default_media_type(), + media_type=cls.media_type, # Prevent the temporary directory from being cleaned up _temporary_directory=folder, ), @@ -558,9 +797,30 @@ def from_file(cls, file: str) -> Self: :return: a `.Blob` object referencing the specified file. """ - return cls.model_construct( # type: ignore[return-value] - href="blob://local", - _data=BlobFile(file, media_type=cls.default_media_type()), + return cls( + BlobFile(file, media_type=cls.media_type), + ) + + @classmethod + def from_url(cls, href: str, client: httpx.Client | None = None) -> Self: + """Create a `.Blob` that references data at a URL. + + This is the recommended way to create a `.Blob` that references + data held remotely. It should ideally be called on a subclass + of `.Blob` that has set ``media_type``. + + :param href: the URL where the data may be downloaded. + :param client: if supplied, this `httpx.Client` will be used to + download the data. + + :return: a `.Blob` object referencing the specified URL. + """ + return cls( + RemoteBlobData( + media_type=cls.media_type, + href=href, + client=client, + ), ) def response(self) -> Response: @@ -570,8 +830,16 @@ def response(self) -> Response: that returns the data over HTTP. :return: an HTTP response that streams data from memory or file. + :raise NotImplementedError: if the data is not local. It's not currently + possible to serve remote data via the `.BlobManager`. """ - return self.data.response() + data = self.data + if isinstance(data, LocalBlobData): + return data.response() + else: + raise NotImplementedError( + "Currently, only local BlobData can be served over HTTP." 
+ ) def blob_type(media_type: str) -> type[Blob]: @@ -585,10 +853,9 @@ def blob_type(media_type: str) -> type[Blob]: class MyImageBlob(Blob): media_type = "image/png" - :param media_type: will be the default value of the ``media_type`` property - on the `.Blob` subclass. + :param media_type: the media type that the new `.Blob` subclass will use. - :return: a subclass of `.Blob` with the specified default media type. + :return: a subclass of `.Blob` with the specified media type. :raise ValueError: if the media type contains ``'`` or ``\``. """ @@ -600,14 +867,12 @@ class MyImageBlob(Blob): ) if "'" in media_type or "\\" in media_type: raise ValueError("media_type must not contain single quotes or backslashes") - return create_model( + return type( f"{media_type.replace('/', '_')}_blob", - __base__=Blob, - media_type=(eval(f"Literal[r'{media_type}']"), media_type), # noqa: S307 - # This can't be done with `literal_eval` as that does not support subscripts. - # Basic sanitisation is done above by removing backslashes and single quotes, - # and using a raw string. However, the long term solution is to remove this - # function in favour of subclassing Blob, as recommended in the docs. + (Blob,), + { + "media_type": media_type, + }, ) @@ -620,7 +885,7 @@ class BlobDataManager: reference, and will be expired by the `.ActionManager`. Note that the `.BlobDataManager` does not work with `.Blob` objects directly, - it holds only the `.ServerSideBlobData` object, which is where the data is + it holds only the `.LocalBlobData` object, which is where the data is stored. This means you should not rely on any custom attributes of a `.Blob` subclass being preserved when the `.Blob` is passed from one action to another. 
@@ -629,49 +894,46 @@ class BlobDataManager: def __init__(self) -> None: """Initialise a BlobDataManager object.""" - self._blobs: WeakValueDictionary[uuid.UUID, ServerSideBlobData] = ( + self._blobs: WeakValueDictionary[uuid.UUID, LocalBlobData] = ( WeakValueDictionary() ) - def add_blob(self, blob: ServerSideBlobData) -> uuid.UUID: + def add_blob(self, blob: LocalBlobData) -> uuid.UUID: """Add a `.Blob` to the manager, generating a unique ID. - This function adds a `.ServerSideBlobData` object to the + This function adds a `.LocalBlobData` object to the `.BlobDataManager`. It will retain a weak reference to the - `.ServerSideBlobData` object: you are responsible for ensuring + `.LocalBlobData` object: you are responsible for ensuring the data is not garbage collected, for example by including the parent `.Blob` in the output of an action. - :param blob: a `.ServerSideBlobData` object that holds the data + :param blob: a `.LocalBlobData` object that holds the data being added. :return: a unique ID identifying the data. This forms part of the URL to download the data. - :raise ValueError: if the `.ServerSideBlobData` object already + :raise ValueError: if the `.LocalBlobData` object already has an ``id`` attribute but is not in the dictionary of data. This suggests the object has been added to another `.BlobDataManager`, which should never happen. """ - if hasattr(blob, "id") and blob.id is not None: - if blob.id in self._blobs: - return blob.id - else: - raise ValueError( - f"BlobData already has an ID {blob.id} " - "but was not found in this BlobDataManager" - ) - blob.id = uuid.uuid4() - self._blobs[blob.id] = blob - return blob.id - - def get_blob(self, blob_id: uuid.UUID) -> ServerSideBlobData: + if blob in self._blobs.values(): + raise ValueError( + "BlobData objects may only be added to the manager once! " + "This is a LabThings bug." 
+ ) + id = uuid.uuid4() + self._blobs[id] = blob + return id + + def get_blob(self, blob_id: uuid.UUID) -> LocalBlobData: """Retrieve a `.Blob` from the manager. :param blob_id: the unique ID assigned when the data was added to this `.BlobDataManager`. - :return: the `.ServerSideBlobData` object holding the data. + :return: the `.LocalBlobData` object holding the data. """ return self._blobs[blob_id] @@ -679,7 +941,7 @@ def download_blob(self, blob_id: uuid.UUID) -> Response: """Download a `.Blob`. This function returns a `fastapi.Response` allowing the data to be - downloaded, using the `.ServerSideBlobData.response` method. + downloaded, using the `.LocalBlobData.response` method. :param blob_id: the unique ID assigned when the data was added to this `.BlobDataManager`. @@ -694,94 +956,31 @@ def attach_to_app(self, app: FastAPI) -> None: """Attach the BlobDataManager to a FastAPI app. Add an endpoint to a FastAPI application that will serve the content of - the `.ServerSideBlobData` objects in response to ``GET`` requests. + the `.LocalBlobData` objects in response to ``GET`` requests. :param app: the `fastapi.FastAPI` application to which we are adding the endpoint. """ - app.get("/blob/{blob_id}")(self.download_blob) + app.get( + "/blob/{blob_id}", + name="download_blob", + )(self.download_blob) blob_data_manager = BlobDataManager() """A global register of all BlobData objects.""" -blobdata_to_url_ctx = ContextVar[Callable[[ServerSideBlobData], str]]("blobdata_to_url") -"""This context variable gives access to a function that makes BlobData objects -downloadable, by assigning a URL and adding them to the -[`BlobDataManager`](#labthings_fastapi.outputs.blob.BlobDataManager). - -It is only available within a -[`blob_serialisation_context_manager`](#labthings_fastapi.outputs.blob.blob_serialisation_context_manager) -because it requires access to the `BlobDataManager` and the `url_for` function -from the FastAPI app. 
-""" - -url_to_blobdata_ctx = ContextVar[Callable[[str], ServerSideBlobData]]("url_to_blobdata") -"""This context variable gives access to a function that makes BlobData objects -from a URL, by retrieving them from the -[`BlobDataManager`](#labthings_fastapi.outputs.blob.BlobDataManager). - -It is only available within a -[`blob_serialisation_context_manager`](#labthings_fastapi.outputs.blob.blob_serialisation_context_manager) -because it requires access to the `BlobDataManager`. -""" +def url_to_id(url: str) -> uuid.UUID | None: + """Extract the blob ID from a URL. + Currently, this checks for a UUID at the end of a URL. In the future, + it might check if the URL refers to this server. -async def blob_serialisation_context_manager( - request: Request, -) -> AsyncGenerator[BlobDataManager, None]: - r"""Set context variables to allow blobs to be [de]serialised. - - In order to serialise a `.Blob` to a JSON-serialisable dictionary, we must - add it to the `.BlobDataManager` and use that to generate a URL. This - requires that the serialisation code (which may be nested deep within a - `pydantic.BaseModel`) has access to the `.BlobDataManager` and also the - `fastapi.Request.url_for` method. At time of writing, there was not an - obvious way to pass these functions in to the serialisation code. - - Similar problems exist for blobs used as input: the validator needs to - retrieve the data from the `.BlobDataManager` but does not have access. - - This async context manager yields the `.BlobDataManager`, but more - importantly it sets the `.url_to_blobdata_ctx` and `blobdata_to_url_ctx` - context variables, which may be accessed by the code within `.Blob` to - correctly add and retrieve `.ServerSideBlobData` objects to and from the - `.BlobDataManager`\ . - - This function will usually be called from a FastAPI dependency. See - :ref:`dependencies` for more on that mechanism. 
- - :param request: the `fastapi.Request` object, used to access the server - and ``url_for`` method. - - :yield: the `.BlobDataManager`. This is usually ignored. + :param url: a URL such as those returned by `.LocalBlobData.get_href`. + :return: the UUID blob ID extracted from the URL, or `None` if no blob ID was found. """ - url_for = request.url_for - - def blobdata_to_url(blob: ServerSideBlobData) -> str: - blob_id = blob_data_manager.add_blob(blob) - return str(url_for("download_blob", blob_id=blob_id)) - - def url_to_blobdata(url: str) -> ServerSideBlobData: - m = re.search(r"blob/([0-9a-z\-]+)", url) - if not m: - raise HTTPException( - status_code=404, detail="Could not find blob ID in href" - ) - invocation_id = uuid.UUID(m.group(1)) - return blob_data_manager.get_blob(invocation_id) - - t1 = blobdata_to_url_ctx.set(blobdata_to_url) - t2 = url_to_blobdata_ctx.set(url_to_blobdata) - try: - yield blob_data_manager - finally: - blobdata_to_url_ctx.reset(t1) - url_to_blobdata_ctx.reset(t2) - - -BlobIOContextDep: TypeAlias = Annotated[ - BlobDataManager, Depends(blob_serialisation_context_manager) -] -"""A dependency that enables `.Blob` to be serialised and deserialised.""" + m = re.search(r"blob/([0-9a-z\-]+)", url) + if not m: + return None + return uuid.UUID(m.group(1)) diff --git a/src/labthings_fastapi/server/__init__.py b/src/labthings_fastapi/server/__init__.py index eb7434ff..716a7917 100644 --- a/src/labthings_fastapi/server/__init__.py +++ b/src/labthings_fastapi/server/__init__.py @@ -18,6 +18,7 @@ from collections.abc import Mapping, Sequence from types import MappingProxyType +from ..middleware.url_for import url_for_middleware from ..thing_slots import ThingSlot from ..utilities import class_attributes @@ -86,6 +87,7 @@ def __init__( self._config = ThingServerConfig(things=things, settings_folder=settings_folder) self.app = FastAPI(lifespan=self.lifespan) self._set_cors_middleware() + self._set_url_for_middleware() self.settings_folder = settings_folder or "./settings" self.action_manager = 
ActionManager() self.action_manager.attach_to_app(self.app) @@ -129,6 +131,15 @@ def _set_cors_middleware(self) -> None: allow_headers=["*"], ) + def _set_url_for_middleware(self) -> None: + """Add middleware to support `url_for` in Pydantic models. + + This middleware adds a request state variable that allows + `labthings_fastapi.server.URLFor` instances to be serialised + using FastAPI's `url_for` function. + """ + self.app.middleware("http")(url_for_middleware) + @property def things(self) -> Mapping[str, Thing]: """Return a dictionary of all the things. diff --git a/src/labthings_fastapi/thing_description/_model.py b/src/labthings_fastapi/thing_description/_model.py index 83fae777..b4ee2144 100644 --- a/src/labthings_fastapi/thing_description/_model.py +++ b/src/labthings_fastapi/thing_description/_model.py @@ -39,6 +39,8 @@ ) from pydantic import AnyUrl, BaseModel, Field, ConfigDict, AfterValidator +from labthings_fastapi.middleware.url_for import URLFor + class Version(BaseModel): """Version info for a Thing. 
@@ -240,7 +242,7 @@ class Form(BaseModel, Generic[OpT]): model_config = ConfigDict(extra="allow") - href: AnyUri + href: URLFor | AnyUri op: Optional[Union[OpT, List[OpT]]] = None contentType: Optional[str] = None contentCoding: Optional[str] = None @@ -296,7 +298,7 @@ class LinkElement(BaseModel): model_config = ConfigDict(extra="allow") - href: AnyUri + href: URLFor | AnyUri type: Optional[str] = None rel: Optional[str] = None anchor: Optional[AnyUri] = None @@ -439,7 +441,7 @@ class WotTdSchema16October2019(BaseModel): version: Optional[Version] = None links: Links = None forms: Optional[List[Form[RootOp]]] = Field(None, min_length=1) - base: Optional[AnyUri] = None + base: Optional[URLFor | AnyUri] = None securityDefinitions: Dict[str, SecurityScheme] support: Optional[AnyUri] = None created: Optional[datetime] = None diff --git a/tests/test_blob_output.py b/tests/test_blob_output.py index 8a155624..06337761 100644 --- a/tests/test_blob_output.py +++ b/tests/test_blob_output.py @@ -6,16 +6,23 @@ from tempfile import TemporaryDirectory from uuid import uuid4 +import fastapi from fastapi.testclient import TestClient +from pydantic import TypeAdapter from pydantic_core import PydanticSerializationError import pytest import labthings_fastapi as lt -from labthings_fastapi.client.outputs import ClientBlobOutput -from labthings_fastapi.testing import create_thing_without_server +from labthings_fastapi.exceptions import FailedToInvokeActionError +from labthings_fastapi.testing import create_thing_without_server, use_dummy_url_for class TextBlob(lt.blob.Blob): - media_type: str = "text/plain" + media_type = "text/plain" + + +class VagueTextBlob(lt.blob.Blob): + media_type = "text/*" + description = "This URL will download some vague text data." 
class ThingOne(lt.Thing): @@ -83,33 +90,126 @@ def client(): yield client -def test_blobdata_protocol(): - """Check the definition of the blobdata protocol, and the implementations.""" - - class BadBlob(lt.blob.BlobData): - pass +@pytest.mark.filterwarnings("ignore:.*removed in v0.1.0.*:DeprecationWarning") +def test_blob_type(): + """Check we can't put dodgy values into a blob output model""" + with pytest.raises(ValueError): + lt.blob.blob_type(media_type="text/plain\\'DROP TABLES") + M = lt.blob.blob_type(media_type="text/plain") + assert M.from_bytes(b"").media_type == "text/plain" - bad_blob = BadBlob() - assert not isinstance(bad_blob, lt.blob.ServerSideBlobData) +@pytest.mark.parametrize( + ("media_type", "expected"), + [ + ("text/plain", ("text", "plain")), + ("text/plain; charset=utf-8", ("text", "plain")), + ("text/*", ("text", "*")), + ("*/*", ("*", "*")), + ], +) +def test_media_type_parsing(media_type, expected): + """Check that media type parsing works as expected.""" + assert lt.blob.parse_media_type(media_type) == expected + + +@pytest.mark.parametrize( + ("media_type", "msg"), + [ + ("too/many/slashes", "exactly one '/'"), + ("/leadingslash", "both type and subtype"), + ("*/plain", "has no type"), + ], +) +def test_invalid_media_type_parsing(media_type, msg): + """Check that invalid media types raise an error.""" + with pytest.raises(ValueError, match=msg): + lt.blob.parse_media_type(media_type) + + +@pytest.mark.parametrize( + ("data_media_type", "blob_media_type", "expected"), + [ + ("text/plain", "text/plain", True), + ("text/html", "text/*", True), + ("image/png", "image/*", True), + ("application/json", "*/*", True), + ("text/plain", "text/html", False), + ("image/jpeg", "image/png", False), + ("application/xml", "application/json", False), + ("text/plain", "image/*", False), + ], +) +def test_media_type_matching(data_media_type, blob_media_type, expected): + """Check that media type matching works as expected.""" + assert 
lt.blob.match_media_types(data_media_type, blob_media_type) is expected + + +def test_blobdata_base_class(): + """Check that BlobData/LocalBlobData abstract methods raise the right error.""" + bd = lt.blob.BlobData("*/*") + with pytest.raises(NotImplementedError): + _ = bd.content + with pytest.raises(NotImplementedError): + _ = bd.open() + with pytest.raises(NotImplementedError): + bd.save("somefile") + with pytest.raises(NotImplementedError): + _ = bd.get_href() + lbd = lt.blob.LocalBlobData(media_type="text/plain") + with pytest.raises(NotImplementedError): + _ = lbd.content with pytest.raises(NotImplementedError): - _ = bad_blob.media_type + _ = lbd.open() with pytest.raises(NotImplementedError): - _ = bad_blob.content + lbd.save("somefile") + with pytest.raises(NotImplementedError): + _ = lbd.response() + + +def test_blob_schema(): + """Check that the Blob schema is as expected.""" + schema = TypeAdapter(TextBlob).json_schema() + assert schema["title"] == "TextBlob" + assert schema["type"] == "object" + assert "href" in schema["properties"] + assert "media_type" in schema["properties"] + assert schema["properties"]["media_type"]["default"] == "text/plain" + # Since media_type is specific, it should have a const constraint + assert schema["properties"]["media_type"]["const"] == ["text/plain"] + + # Check that a vague blob type has no const constraint + # This is because multiple media types are valid - it ends with * + schema = TypeAdapter(VagueTextBlob).json_schema() + assert "const" not in schema["properties"]["media_type"] + assert schema["properties"]["description"]["default"] == ( + "This URL will download some vague text data." 
+ ) -@pytest.mark.filterwarnings("ignore:.*removed in v0.1.0.*:DeprecationWarning") -def test_blob_type(): - """Check we can't put dodgy values into a blob output model""" +def test_blob_initialisation(): + """Check that blobs can be initialised correctly.""" + data = lt.blob.BlobBytes(b"Test data", media_type="text/plain") + blob = TextBlob(data, description="A test blob") + assert blob.content == b"Test data" + assert blob.media_type == "text/plain" + assert blob.description == "A test blob" + + # Check that the media type is refined if the data is more + # specific than the Blob class + vague_blob = VagueTextBlob(data) + assert vague_blob.content == b"Test data" + assert vague_blob.media_type == "text/plain" + + # Check we get an error if the media type doesn't match + data_bad = lt.blob.BlobBytes(b"Bad data", media_type="image/png") with pytest.raises(ValueError): - lt.blob.blob_type(media_type="text/plain\\'DROP TABLES") - M = lt.blob.blob_type(media_type="text/plain") - assert M.from_bytes(b"").media_type == "text/plain" + _ = TextBlob(data_bad) def test_blob_creation(): - """Check that blobs can be created in three ways""" + """Check that blobs can be created in four ways""" TEXT = b"Test input" # Create a blob from a file in a temporary directory td = TemporaryDirectory() @@ -130,37 +230,89 @@ def test_blob_creation(): with pytest.raises(IOError): _ = TextBlob.from_temporary_directory(td, "nonexistent") - # Finally, check we can make a blob from a bytes object, no file. + # Check we can make a blob from a bytes object, no file. 
blob = TextBlob.from_bytes(TEXT) assert blob.content == TEXT + # Check we can make a blob from a URL + blob = TextBlob.from_url(href="https://example.com/blob") + assert blob.to_blobmodel().href == "https://example.com/blob" + + +def test_blob_data_manager(): + """Check blobs appear in the data manager.""" + blob = TextBlob.from_bytes(b"Some Data") + data = blob.data + assert isinstance(data, lt.blob.LocalBlobData) + id = data.id + assert lt.blob.blob_data_manager.get_blob(id) is data + with pytest.raises(ValueError): + lt.blob.blob_data_manager.add_blob(data) + del data + del blob + # Check that the blob doesn't linger due to references + with pytest.raises(KeyError): + lt.blob.blob_data_manager.get_blob(id) def test_blob_serialisation(): """Check that blobs may be serialised.""" blob = TextBlob.from_bytes(b"Some data") - # Can't serialise a blob (with data) without a BlobDataManager + # Can't serialise a blob (with data) without url_for in the context with pytest.raises(PydanticSerializationError): - blob.model_dump() + TypeAdapter(TextBlob).dump_python(blob) # Fake the required context variable, and it should work - try: - token = lt.outputs.blob.blobdata_to_url_ctx.set(lambda b: "https://example/") - data = blob.model_dump() - assert data["href"] == "https://example/" - assert data["media_type"] == "text/plain" - finally: - lt.outputs.blob.blobdata_to_url_ctx.reset(token) - - # Blobs that already refer to a remote URL should serialise without error - # though there's currently no way to create one on the server. 
- remoteblob = TextBlob.model_construct( - media_type="text/plain", + with use_dummy_url_for(): + data = TypeAdapter(TextBlob).dump_python(blob) + assert data["href"].startswith("urlfor://download_blob/?blob_id=") + assert data["media_type"] == "text/plain" + + vagueblob = VagueTextBlob.from_bytes(b"Some data") + # VagueTextBlob has a customised description that should be included + with use_dummy_url_for(): + data = TypeAdapter(VagueTextBlob).dump_python(vagueblob) + assert data["href"].startswith("urlfor://download_blob/?blob_id=") + assert data["media_type"] == "text/*" + assert data["description"] == "This URL will download some vague text data." + + # Blobs that already refer to a remote URL should serialise without error. + remoteblob = TextBlob.from_url( href="https://example/", ) - data = remoteblob.model_dump() + data = TypeAdapter(TextBlob).dump_python(remoteblob) assert data["href"] == "https://example/" assert data["media_type"] == "text/plain" +def test_blob_download(): + """Check that blob downloading works as expected.""" + # We use a bare FastAPI app to do an isolated test + app = fastapi.FastAPI() + # # This is needed to generate download URLs + # app.middleware("http")(url_for.url_for_middleware) + + # @app.get("/blob_json/") + # def blob_json() -> TextBlob: + # return TextBlob.from_bytes(b"Blob JSON!") + + @app.get("/download_blob/") + def download_blob(): + blob = TextBlob.from_bytes(b"Download me!") + return blob.response() + + with TestClient(app) as client: + response = client.get("/download_blob/") + assert response.status_code == 200 + assert response.content == b"Download me!" + assert response.headers["content-type"].startswith("text/plain") + + # Remote blobs can't be downloaded yet. There's no need to do this + # within a server, as the error is the same either way (and it's + # much easier to catch here). 
+ remote = TextBlob.from_url(href="https://example.com/remote_blob") + with pytest.raises(NotImplementedError): + _ = remote.response() + + def test_blob_output_client(client): """Test that blob outputs work as expected when used over HTTP.""" tc = lt.ThingClient.from_url("/thing_one/", client=client) @@ -182,7 +334,6 @@ def test_blob_output_inserver(client): def check_blob(output, expected_content: bytes): """Test that a BlobOutput can be retrieved in three ways""" - print(f"Testing blob output {output} which has attributes {output.__dict__}") assert output.content == expected_content with TemporaryDirectory() as dir: output.save(os.path.join(dir, "test_output")) @@ -219,18 +370,21 @@ def test_blob_input(client): # Check that a bad URL results in an error. # This URL is not totally bad - it follows the right form, but the # UUID is not found on the server. - bad_blob = ClientBlobOutput( - media_type="text/plain", href=f"http://nonexistent.local/blob/{uuid4()}" + bad_blob = lt.blob.Blob( + lt.blob.RemoteBlobData( + media_type="text/plain", href=f"http://nonexistent.local/blob/{uuid4()}" + ) ) - with pytest.raises(LookupError): + with pytest.raises(FailedToInvokeActionError, match="wasn't found"): tc.passthrough_blob(blob=bad_blob) # Try again with a totally bogus URL - bad_blob = ClientBlobOutput( - media_type="text/plain", href="http://nonexistent.local/totally_bogus" + bad_blob = lt.blob.Blob( + lt.blob.RemoteBlobData( + media_type="text/plain", href="http://nonexistent.local/totally_bogus" + ) ) - - msg = "Error when invoking action passthrough_blob: Could not find blob ID in href" - with pytest.raises(lt.exceptions.FailedToInvokeActionError, match=msg): + msg = "must contain a Blob ID" + with pytest.raises(FailedToInvokeActionError, match=msg): tc.passthrough_blob(blob=bad_blob) # Check that the same thing works on the server side diff --git a/tests/test_middleware_url_for.py b/tests/test_middleware_url_for.py index e8f0dc88..09ef9d23 100644 --- 
a/tests/test_middleware_url_for.py +++ b/tests/test_middleware_url_for.py @@ -2,7 +2,7 @@ import threading import pytest -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError from pydantic_core import PydanticSerializationError from fastapi import FastAPI from starlette.testclient import TestClient @@ -64,11 +64,12 @@ def test_validation(): assert m.url is u # Trying to initialise with anything else should raise an error - with pytest.raises(TypeError): + msg = "URLFor instances may not be created from strings" + with pytest.raises(ValidationError, match=msg): _ = ModelWithURL(url="https://example.com") - with pytest.raises(TypeError): + with pytest.raises(ValidationError): _ = ModelWithURL(url="endpoint_name") - with pytest.raises(TypeError): + with pytest.raises(ValidationError): _ = ModelWithURL(url=None)