diff --git a/README.md b/README.md index 05c4f017..405e9d67 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,11 @@ docker run -p 8080:8080 -e 'GITHUB_OAUTH_KEY=YOURKEY' \ Or to use your GitHub personal access token, you can just set `GITHUB_API_TOKEN`. +## S3 buckets +Files in S3 buckets can be accessed by their S3 URI like `s3://bucket/path/to/key`. This works directly for public buckets. If you want to access private buckets, you need to provide the S3 authentication credentials to the Docker container or in your environment. +For the Docker container this can be done by setting the [environment variables](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#environment-variables) with `-e AWS_ACCESS_KEY_ID=my_secret_id -e AWS_SECRET_ACCESS_KEY=my_secret_key`. +Or you can provide the [shared credentials file](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#shared-credentials-file) to the user running nbviewer (in Docker with a volume). + ## GitHub Enterprise To use nbviewer on your own GitHub Enterprise instance you need to set `GITHUB_API_URL`. 
diff --git a/nbviewer/app.py b/nbviewer/app.py index c883d9b7..4398ed09 100644 --- a/nbviewer/app.py +++ b/nbviewer/app.py @@ -197,6 +197,10 @@ class NBViewer(Application): default_value="nbviewer.providers.local.handlers.LocalFileHandler", help="The Tornado handler to use for viewing notebooks found on a local filesystem", ).tag(config=True) + s3_handler = Unicode( + default_value="nbviewer.providers.s3.handlers.S3Handler", + help="The Tornado handler to use for viewing notebooks from amazon S3", + ).tag(config=True) url_handler = Unicode( default_value="nbviewer.providers.url.handlers.URLHandler", help="The Tornado handler to use for viewing notebooks accessed via URL", @@ -625,6 +629,7 @@ def init_tornado_application(self): github_user_handler=self.github_user_handler, index_handler=self.index_handler, local_handler=self.local_handler, + s3_handler=self.s3_handler, url_handler=self.url_handler, user_gists_handler=self.user_gists_handler, ) diff --git a/nbviewer/providers/__init__.py b/nbviewer/providers/__init__.py index 7f00972c..1a67d134 100644 --- a/nbviewer/providers/__init__.py +++ b/nbviewer/providers/__init__.py @@ -6,12 +6,12 @@ # ----------------------------------------------------------------------------- default_providers = [ - "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist"] + "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist", "s3"] ] default_rewrites = [ "nbviewer.providers.{}".format(prov) - for prov in ["gist", "github", "dropbox", "huggingface", "url"] + for prov in ["gist", "github", "dropbox", "huggingface", "s3", "url"] ] @@ -83,7 +83,7 @@ def _load_provider_feature(feature, providers, **handler_names): try: # Ex: handler_names['url_handler'] handler_names[provider_handler_key] - except KeyError: + except KeyError as e: continue else: # Ex: provider_handlers['url_handler'] = handler_names['url_handler'] diff --git a/nbviewer/providers/s3/__init__.py b/nbviewer/providers/s3/__init__.py new file 
mode 100644 index 00000000..9701609e --- /dev/null +++ b/nbviewer/providers/s3/__init__.py @@ -0,0 +1,3 @@ +from .handlers import default_handlers +from .handlers import S3Handler +from .handlers import uri_rewrites diff --git a/nbviewer/providers/s3/handlers.py b/nbviewer/providers/s3/handlers.py new file mode 100644 index 00000000..751e215b --- /dev/null +++ b/nbviewer/providers/s3/handlers.py @@ -0,0 +1,149 @@ +# ----------------------------------------------------------------------------- +# Copyright (C) Jupyter Development Team +# +# Distributed under the terms of the BSD License. The full license is in +# the file COPYING, distributed as part of this software. +# ----------------------------------------------------------------------------- +import errno +import io +import os +from datetime import datetime +from urllib.parse import urlparse + +import boto3 +import botocore +from tornado import iostream +from tornado import web + +from .. import _load_handler_from_location +from ...utils import url_path_join +from ..base import cached +from ..base import RenderingHandler + + +class S3Handler(RenderingHandler): + """Renderer for s3:// + + Serving notebooks from S3 buckets + """ + + def initialize(self, **kwargs): + self.s3_client = boto3.client("s3") + self._downloadable_data = None + self._downloaded_path = None + super().initialize(**kwargs) + + async def download(self, path): + """Download the notebook""" + headers = await self.get_notebook_headers(path) + filename = os.path.basename(path) + self.set_header("Content-Length", headers["ContentLength"]) + # Escape commas to workaround Chrome issue with commas in download filenames + self.set_header( + "Content-Disposition", + "attachment; filename={};".format(filename.replace(",", "_")), + ) + if self._downloaded_path == path and self._downloadable_data is not None: + content = self._downloadable_data + else: + content = await self.read_s3_file(path) + + if isinstance(content, bytes): + content = [content] + 
for chunk in content: + try: + self.write(chunk) + await self.flush() + except iostream.StreamClosedError: + return + + async def get_notebook_data(self, path): + """Get additional notebook data""" + is_download = self.get_query_arguments("download") + if is_download: + await self.download(path) + return + + return path + + async def get_notebook_headers(self, path): + """Get the size of a notebook file.""" + o = urlparse(path) + bucket = o.netloc + key = o.path[1:] + self.log.debug("Getting headers for %s from %s", key, bucket) + try: + head = self.s3_client.head_object(Bucket=bucket, Key=key) + except botocore.exceptions.ClientError as ex: + if ex.response["Error"]["Code"] == "404": + self.log.info("The notebook %s does not exist.", path) + raise web.HTTPError(404) + raise ex + return head + + async def read_s3_file(self, path): + """Download the notebook file from s3.""" + o = urlparse(path) + bucket = o.netloc + key = o.path[1:] + s3_file = io.BytesIO() + self.log.debug("Reading %s from %s", key, bucket) + try: + self.s3_client.download_fileobj(bucket, key, s3_file) + except botocore.exceptions.ClientError as ex: + if ex.response["Error"]["Code"] == "404": + self.log.info("The notebook %s does not exist.", path) + raise web.HTTPError(404) + raise ex + s3_file.seek(0) + self.log.debug("Done downloading.") + self._downloadable_data = s3_file.read().decode("utf-8") + self._downloaded_path = path + return self._downloadable_data + + async def deliver_notebook(self, path): + nbdata = await self.read_s3_file(path) + + # Explanation of some kwargs passed into `finish_notebook`: + # breadcrumbs: list of dict + # Breadcrumb 'name' and 'url' to render as links at the top of the notebook page + # title: str + # Title to use as the HTML page title (i.e., text on the browser tab) + await self.finish_notebook( + nbdata, + download_url="?download", + msg="file from s3: %s" % path, + public=False, + breadcrumbs=[], + title=os.path.basename(path), + ) + + @cached + async def 
get(self, path): + """Get an s3 notebook + + Parameters + ========== + path: str + s3 uri + """ + fullpath = await self.get_notebook_data(path) + + # get_notebook_data returns None if a directory is to be shown or a notebook is to be downloaded, + # i.e. if no notebook is supposed to be rendered, making deliver_notebook inappropriate + if fullpath is not None: + await self.deliver_notebook(fullpath) + + +def default_handlers(handlers=[], **handler_names): + """Tornado handlers""" + + s3_handler = _load_handler_from_location(handler_names["s3_handler"]) + + return handlers + [(r"/(s3%3A//.*)", s3_handler, {})] + + +def uri_rewrites(rewrites=[]): + return [ + (r"^(s3://.*)$", "{0}"), + ] diff --git a/nbviewer/providers/s3/tests/__init__.py b/nbviewer/providers/s3/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nbviewer/providers/s3/tests/test_s3.py b/nbviewer/providers/s3/tests/test_s3.py new file mode 100644 index 00000000..5b7fcb53 --- /dev/null +++ b/nbviewer/providers/s3/tests/test_s3.py @@ -0,0 +1,96 @@ +# ----------------------------------------------------------------------------- +# Copyright (C) Jupyter Development Team +# +# Distributed under the terms of the BSD License. The full license is in +# the file COPYING, distributed as part of this software. 
+# ----------------------------------------------------------------------------- +import io +import json +from copy import deepcopy +from unittest.mock import patch + +import boto3 +import requests + +from ....tests.base import FormatHTMLMixin +from ....tests.base import NBViewerTestCase + + +MOCK_NOTEBOOK = { + "cells": [ + { + "cell_type": "code", + "execution_count": None, + "id": "b0939771-a810-4ee0-b440-dbbaeb4f1653", + "metadata": {}, + "outputs": [], + "source": [], + }, + { + "cell_type": "code", + "execution_count": None, + "id": "cc0d476a-d09c-4919-8dd2-c8d67f7431b3", + "metadata": {}, + "outputs": [], + "source": [], + }, + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3", + }, + "language_info": { + "codemirror_mode": {"name": "ipython", "version": 3}, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12", + }, + }, + "nbformat": 4, + "nbformat_minor": 5, +} + + +class MockBoto3: + def download_fileobj(self, Bucket, Key, fileobj): + """Mock downloading fileobjects""" + data = deepcopy(MOCK_NOTEBOOK) + data["cells"][0]["source"] = [f"print({Bucket})", f"print({Key})"] + bin_data = json.dumps(data).encode("utf-8") + fileobj.write(bin_data) + + def head_object(self, Bucket, Key): + """Mock getting key headers""" + output_file = io.BytesIO() + f = self.download_fileobj(Bucket, Key, output_file) + f.seek(0) + return {"ContentLength": len(f.read())} + + +""" +# This test won't work because the server is started through subprocess.POpen, so we can't mock boto3. 
+ +class S3TestCase(NBViewerTestCase): + + @patch("boto3.client") + def test_url(self, mock_boto3_client): + mockBoto3 = MockBoto3() + mock_boto3_client.return_value = mockBoto3 + with patch.object(mockBoto3, 'download_fileobj') as mock_download: + bucket="my_bucket" + key="my_file.ipynb" + url = self.url(f"s3%3A//{bucket}/{key}") + r = requests.get(url) + self.assertEqual(r.status_code, 200) + args = mock_download.call_args_list[-1][:2] + self.assertEqual(args, (bucket, key)) + + +class FormatHTMLLocalFileDefaultTestCase(S3TestCase, FormatHTMLMixin): + pass +""" diff --git a/requirements.in b/requirements.in index b39a0d85..67ab6b93 100644 --- a/requirements.in +++ b/requirements.in @@ -1,5 +1,6 @@ elasticsearch ipython>=8 +boto3 jupyter_client jupyter_server>=0.2.0 markdown>=3.0,==3.1.1 # pin until we workaround #909, which is a regression in 3.2 diff --git a/requirements.txt b/requirements.txt index 8dcd6d9c..1ca7018a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,12 @@ beautifulsoup4==4.11.1 # via nbconvert bleach==5.0.1 # via nbconvert +boto3==1.23.3 + # via -r requirements.in +botocore==1.26.3 + # via + # boto3 + # s3transfer certifi==2022.12.7 # via elastic-transport cffi==1.15.1 @@ -48,6 +54,10 @@ jinja2==3.1.2 # via # jupyter-server # nbconvert +jmespath==1.0.0 + # via + # boto3 + # botocore jsonschema==4.17.0 # via nbformat jupyter-client==7.4.4 @@ -130,11 +140,15 @@ pyparsing==3.0.9 pyrsistent==0.19.2 # via jsonschema python-dateutil==2.8.2 - # via jupyter-client + # via + # botocore + # jupyter-client pyzmq==24.0.1 # via # jupyter-client # jupyter-server +s3transfer==0.5.2 + # via boto3 send2trash==1.8.0 # via jupyter-server six==1.16.0 @@ -171,7 +185,9 @@ traitlets==5.5.0 # nbconvert # nbformat urllib3==1.26.12 - # via elastic-transport + # via + # botocore + # elastic-transport wcwidth==0.2.5 # via prompt-toolkit webencodings==0.5.1