diff --git a/nbviewer/app.py b/nbviewer/app.py index 5e8b05a4..7501f7aa 100644 --- a/nbviewer/app.py +++ b/nbviewer/app.py @@ -197,6 +197,10 @@ class NBViewer(Application): default_value="nbviewer.providers.local.handlers.LocalFileHandler", help="The Tornado handler to use for viewing notebooks found on a local filesystem", ).tag(config=True) + fsspec_handler = Unicode( + default_value="nbviewer.providers.fsspec.handlers.FsspecHandler", + help="The Tornado handler to use for viewing notebooks found on a fsspec filesystem", + ).tag(config=True) url_handler = Unicode( default_value="nbviewer.providers.url.handlers.URLHandler", help="The Tornado handler to use for viewing notebooks accessed via URL", @@ -632,6 +636,7 @@ def init_tornado_application(self): github_user_handler=self.github_user_handler, index_handler=self.index_handler, local_handler=self.local_handler, + fsspec_handler=self.fsspec_handler, url_handler=self.url_handler, user_gists_handler=self.user_gists_handler, ) diff --git a/nbviewer/providers/__init__.py b/nbviewer/providers/__init__.py index 7f00972c..d8ed80fb 100644 --- a/nbviewer/providers/__init__.py +++ b/nbviewer/providers/__init__.py @@ -6,12 +6,12 @@ # ----------------------------------------------------------------------------- default_providers = [ - "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist"] + "nbviewer.providers.{}".format(prov) for prov in ["url", "github", "gist", "fsspec"] ] default_rewrites = [ "nbviewer.providers.{}".format(prov) - for prov in ["gist", "github", "dropbox", "huggingface", "url"] + for prov in ["gist", "github", "dropbox", "huggingface", "fsspec", "url"] ] diff --git a/nbviewer/providers/fsspec/__init__.py b/nbviewer/providers/fsspec/__init__.py new file mode 100644 index 00000000..ee7b5cb1 --- /dev/null +++ b/nbviewer/providers/fsspec/__init__.py @@ -0,0 +1,4 @@ +from .handlers import default_handlers +from .handlers import uri_rewrites + +__all__ = ["default_handlers", "uri_rewrites"] 
diff --git a/nbviewer/providers/fsspec/handlers.py b/nbviewer/providers/fsspec/handlers.py
new file mode 100644
index 00000000..a1ce8fd3
--- /dev/null
+++ b/nbviewer/providers/fsspec/handlers.py
@@ -0,0 +1,325 @@
+# -----------------------------------------------------------------------------
+# Copyright (C) Jupyter Development Team
+#
+# Distributed under the terms of the BSD License. The full license is in
+# the file COPYING, distributed as part of this software.
+# -----------------------------------------------------------------------------
+"""Provider that serves notebooks read through fsspec filesystems."""
+import errno
+import os
+import re
+
+import fsspec
+from tornado import iostream
+from tornado import web
+
+from .. import _load_handler_from_location
+from ...utils import url_path_join
+from ..base import cached
+from ..base import RenderingHandler
+
+# Environment values (lowercased, stripped) that switch a boolean option off.
+_FALSE_VALUES = {"", "0", "false", "no", "off"}
+
+# Number of bytes read and flushed per step by FsspecHandler.download.
+_DOWNLOAD_CHUNK_SIZE = 64 * 1024
+
+
+def _env_flag(name, default):
+    """Return the boolean value of the environment variable ``name``.
+
+    ``default`` is returned when the variable is unset. Any other value is
+    true unless it is listed in ``_FALSE_VALUES``. Using the raw string as a
+    flag would treat "false" and "0" as true, since every non-empty string
+    is truthy.
+    """
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() not in _FALSE_VALUES
+
+
+def _is_permission_error(ex):
+    """Tell whether the OSError ``ex`` reports a denied access.
+
+    The type is checked in addition to errno because a PermissionError that
+    was built from a message alone has ``errno`` set to None.
+    """
+    return isinstance(ex, PermissionError) or ex.errno == errno.EACCES
+
+
+# Protocols that are never served here, even when listed in FSSPEC_ALLOWED.
+# "fsspec" precedes "url" in the default rewrites, so a rule for them would
+# presumably shadow the url provider.
+IGNORED_PROTOCOLS = {"http", "https"}
+# Protocols that may be browsed: comma separated names from FSSPEC_ALLOWED.
+ALLOWED_PROTOCOLS = [
+    name
+    for name in (
+        item.strip() for item in os.environ.get("FSSPEC_ALLOWED", "s3").split(",")
+    )
+    if name and name not in IGNORED_PROTOCOLS
+]
+# Directory listings are enabled unless FSSPEC_ALLOW_DIR_LISTING is a false value.
+ALLOWED_DIR_LISTING = _env_flag("FSSPEC_ALLOW_DIR_LISTING", True)
+
+
+def build_url(protocol, url, *args):
+    """Join ``url`` and any further segments below ``/fsspec/{protocol}``."""
+    return url_path_join(f"/fsspec/{protocol}", url, *args)
+
+
+class FsspecHandler(RenderingHandler):
+    """Serve notebooks, directory listings and downloads from fsspec filesystems."""
+
+    async def download(self, fs, url):
+        """Download the file at the given url
+
+        The content is written in chunks of ``_DOWNLOAD_CHUNK_SIZE`` bytes so
+        that the whole file is never held in memory at once.
+
+        Parameters
+        ==========
+        fs: fsspec.AbstractFileSystem
+            Filesystem object
+        url: str
+            URL of the file
+        """
+        with fs.open(url, "rb") as f:
+            info = fs.info(url)
+            filename = os.path.basename(url)
+
+            self.set_header("Content-Length", info["size"])
+            # Escape commas to workaround Chrome issue with commas in download filenames
+            self.set_header(
+                "Content-Disposition",
+                "attachment; filename={};".format(filename.replace(",", "_")),
+            )
+
+            try:
+                for chunk in iter(lambda: f.read(_DOWNLOAD_CHUNK_SIZE), b""):
+                    self.write(chunk)
+                    await self.flush()
+            except iostream.StreamClosedError:
+                # The client closed the connection; nothing is left to send.
+                return
+
+    def can_show(self, protocol, url):
+        """
+        Generally determine whether the given path is displayable.
+        This function is useful for failing fast - further checks may
+        be applied at notebook render to confirm a file may be shown.
+
+        A path is displayable when its protocol is in ``ALLOWED_PROTOCOLS``
+        and the url is not empty.
+        """
+        return protocol in ALLOWED_PROTOCOLS and bool(url)
+
+    async def get_notebook_data(self, fs, protocol, url):
+        """Answer listing and download requests, or name the notebook to render.
+
+        Returns
+        =======
+        str or None
+            ``url`` if a notebook is to be rendered by the caller, None if the
+            response (directory listing or download) was produced here.
+
+        Raises
+        ======
+        tornado.web.HTTPError
+            404 if ``can_show`` rejects the protocol or url.
+        """
+        if not self.can_show(protocol, url):
+            self.log.info("Path: '%s' is not visible from within nbviewer", url)
+            raise web.HTTPError(404)
+
+        if ALLOWED_DIR_LISTING and fs.isdir(url):
+            html = self.show_dir(fs, protocol, url)
+            await self.cache_and_finish(html)
+            return None
+
+        if self.get_query_arguments("download"):
+            await self.download(fs, url)
+            return None
+
+        return url
+
+    async def deliver_notebook(self, fs, protocol, path):
+        """Read the notebook at ``path`` and render it.
+
+        Raises
+        ======
+        tornado.web.HTTPError
+            404 if access to the file is denied.
+        OSError
+            Any other error raised while reading is propagated.
+        """
+        try:
+            # fsspec opens in "rb" unless told otherwise, and the encoding is
+            # only applied in text mode, so ask for text mode explicitly.
+            with fs.open(path, "r", encoding="utf-8") as f:
+                nbdata = f.read()
+        except OSError as ex:
+            if _is_permission_error(ex):
+                # can't read the file, so don't give away it exists
+                self.log.info("Path : '%s' is not readable from within nbviewer", path)
+                raise web.HTTPError(404)
+            raise
+
+        # Explanation of some kwargs passed into `finish_notebook`:
+        # breadcrumbs: list of dict
+        #     Breadcrumb 'name' and 'url' to render as links at the top of the notebook page
+        # title: str
+        #     Title to use as the HTML page title (i.e., text on the browser tab)
+        breadcrumbs = [{"url": build_url(protocol, path, "../"), "name": "Up"}]
+        self.log.info("Rendering notebook from path: %s", path)
+        await self.finish_notebook(
+            nbdata,
+            download_url="?download",
+            msg="file from location %s" % path,
+            public=False,
+            breadcrumbs=breadcrumbs,
+            title=os.path.basename(path),
+        )
+
+    @cached
+    async def get(self, protocol, url):
+        """Get a directory listing, rendered notebook, or raw file
+        at the given path based on the type and URL query parameters.
+
+        If the path points to an accessible directory, render its contents.
+        If the path points to an accessible notebook file, render it.
+        If the path points to an accessible file and the URL contains a
+        'download' query parameter, respond with the file as a download.
+
+        Parameters
+        ==========
+        protocol: str
+            Protocol of the file
+        url: str
+            URL of the file
+        """
+        if not self.can_show(protocol, url):
+            self.log.info("Path: '%s' is not visible from within nbviewer", url)
+            raise web.HTTPError(404)
+
+        fs = fsspec.filesystem(protocol, use_listings_cache=False)
+        fs.invalidate_cache(url)
+
+        fullpath = await self.get_notebook_data(fs, protocol, url)
+
+        # get_notebook_data returns None if a directory is to be shown or a notebook is to be downloaded,
+        # i.e. if no notebook is supposed to be rendered, making deliver_notebook inappropriate
+        if fullpath:
+            await self.deliver_notebook(fs, protocol, fullpath)
+
+    # Make available to increase modularity for subclassing
+    # E.g. so subclasses can implement templates with custom logic
+    # without having to copy-paste the entire show_dir method
+    def render_dirview_template(self, entries, breadcrumbs, title, **namespace):
+        """
+        breadcrumbs: list of dict
+            Breadcrumb 'name' and 'url' to render as links at the top of the notebook page
+        title: str
+            Title to use as the HTML page title (i.e., text on the browser tab)
+        """
+        return self.render_template(
+            "dirview.html",
+            entries=entries,
+            breadcrumbs=breadcrumbs,
+            title=title,
+            **namespace,
+        )
+
+    def show_dir(self, fs, protocol, url):
+        """Render the directory view template for a given filesystem path.
+
+        Sub-directories are listed first, then ``.ipynb`` files, each group
+        sorted by name. Other entries are logged and left out.
+
+        Parameters
+        ==========
+        fs: fsspec.AbstractFileSystem
+            Filesystem object
+        protocol: str
+            Protocol of the filesystem, used to build the entry links
+        url: str
+            URL of the directory
+
+        Returns
+        =======
+        str
+            Rendered HTML
+
+        Raises
+        ======
+        tornado.web.HTTPError
+            404 if access to the directory is denied.
+        OSError
+            Any other listing error is propagated.
+        """
+        try:
+            contents = fs.listdir(url)
+        except OSError as ex:
+            if _is_permission_error(ex):
+                # can't access the dir, so don't give away its presence
+                self.log.info(
+                    "Contents of path: '%s' cannot be listed from within nbviewer",
+                    url,
+                )
+                raise web.HTTPError(404)
+            # Swallowing other errors would leave ``contents`` unbound below.
+            raise
+
+        dirs = []
+        ipynbs = []
+        for info in contents:
+            name = os.path.basename(info["name"])
+            # No "modtime" is set on the entries built here.
+            entry = {"name": name, "url": build_url(protocol, url, name)}
+
+            if info["type"] == "directory":
+                entry["class"] = "fa fa-folder-open"
+                dirs.append(entry)
+            elif info["type"] == "file" and name.endswith(".ipynb"):
+                entry["class"] = "fa fa-book"
+                ipynbs.append(entry)
+            else:
+                self.log.info("Ignored: %s", info)
+
+        dirs.sort(key=lambda e: e["name"])
+        ipynbs.sort(key=lambda e: e["name"])
+
+        breadcrumbs = [{"url": build_url(protocol, url, "../"), "name": "Up"}]
+        return self.render_dirview_template(
+            entries=dirs + ipynbs,
+            breadcrumbs=breadcrumbs,
+            title=url_path_join(url, "/"),
+        )
+
+
+def default_handlers(handlers=None, **handler_names):
+    """Tornado handlers
+
+    Returns a new list: ``handlers`` followed by the fsspec route. The named
+    groups of the route supply the ``protocol`` and ``url`` arguments of
+    ``FsspecHandler.get``.
+    """
+    handlers = [] if handlers is None else handlers
+    fsspec_handler = _load_handler_from_location(handler_names["fsspec_handler"])
+    route = r"/fsspec/(?P<protocol>[^/]+)/(?P<url>.*)"
+    return handlers + [(route, fsspec_handler, {})]
+
+
+def uri_rewrites(rewrites=None):
+    """Return ``rewrites`` extended by one rule per allowed protocol.
+
+    Each rule maps ``{protocol}://{path}`` to ``/fsspec/{protocol}/{path}``.
+    A new list is returned and the argument is left unmodified, so repeated
+    calls do not pile up duplicate rules.
+    """
+    rewrites = [] if rewrites is None else rewrites
+    return rewrites + [
+        (f"^{re.escape(protocol)}://(.*?)$", f"/fsspec/{protocol}" + "/{0}")
+        for protocol in ALLOWED_PROTOCOLS
+    ]
diff --git a/nbviewer/templates/dirview.html b/nbviewer/templates/dirview.html index 423657f9..4d22fd30 100644 --- a/nbviewer/templates/dirview.html +++ b/nbviewer/templates/dirview.html @@ -26,7 +26,9 @@ {% endif %} - {{ entry.modtime }} + {% if entry.modtime %} + {{ entry.modtime }} + {% endif %} {% endfor %} diff --git a/requirements.in b/requirements.in index b39a0d85..8cff6c9a 100644 --- a/requirements.in +++ b/requirements.in @@ -10,3 +10,5 @@ pycurl pylibmc statsd tornado>=6.0 +s3fs +fsspec diff --git a/requirements.txt b/requirements.txt index 8dcd6d9c..40c4b2ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,20 @@ # -# This file is autogenerated by pip-compile with python 3.10 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: # -# pip-compile +# pip-compile --output-file=requirements.txt requirements.in # + +aiobotocore==2.12.3 + # via s3fs +aiohttp==3.9.5 + # via + # aiobotocore + # s3fs +aioitertools==0.11.0 + # via aiobotocore +aiosignal==1.3.1 + # via aiohttp anyio==3.6.2 # via jupyter-server argon2-cffi==21.3.0 @@ -12,14 +23,20 @@ argon2-cffi-bindings==21.2.0 # via argon2-cffi asttokens==2.1.0 # via stack-data +async-timeout==4.0.3 + # via aiohttp attrs==22.1.0 - # via jsonschema + # via + # aiohttp + # jsonschema backcall==0.2.0 # via ipython beautifulsoup4==4.11.1 # via nbconvert bleach==5.0.1 # via nbconvert +botocore==1.34.69 + # via aiobotocore certifi==2022.12.7 # via elastic-transport cffi==1.15.1 @@ -38,8 +55,18 @@ executing==1.2.0 # via stack-data fastjsonschema==2.16.2 # via nbformat +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.3.1 + # via + # -r requirements.in + # s3fs idna==3.4 - # via anyio + # via + # anyio + # yarl ipython==8.6.0 # via -r requirements.in jedi==0.18.1 @@ -48,6 +75,8 @@ jinja2==3.1.2 # via # jupyter-server # nbconvert +jmespath==1.0.1 + # via botocore jsonschema==4.17.0 # via nbformat
jupyter-client==7.4.4 @@ -75,6 +104,10 @@ matplotlib-inline==0.1.6 # via ipython mistune==2.0.4 # via nbconvert +multidict==6.0.5 + # via + # aiohttp + # yarl nbclient==0.7.0 # via nbconvert nbconvert==7.2.3 @@ -130,11 +163,15 @@ pyparsing==3.0.9 pyrsistent==0.19.2 # via jsonschema python-dateutil==2.8.2 - # via jupyter-client + # via + # botocore + # jupyter-client pyzmq==24.0.1 # via # jupyter-client # jupyter-server +s3fs==2024.3.1 + # via -r requirements.in send2trash==1.8.0 # via jupyter-server six==1.16.0 @@ -171,7 +208,9 @@ traitlets==5.5.0 # nbconvert # nbformat urllib3==1.26.12 - # via elastic-transport + # via + # botocore + # elastic-transport wcwidth==0.2.5 # via prompt-toolkit webencodings==0.5.1 @@ -180,6 +219,10 @@ webencodings==0.5.1 # tinycss2 websocket-client==1.4.2 # via jupyter-server +wrapt==1.16.0 + # via aiobotocore +yarl==1.9.4 + # via aiohttp # The following packages are considered to be unsafe in a requirements file: # setuptools