From d5f272f4e4e0ed87193f80b34d86861611357ca4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:21:07 -0400 Subject: [PATCH 01/13] Use mkdocs-material for Zarr-Python documentation --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- .github/labeler.yml | 2 +- .gitignore | 1 - .readthedocs.yaml | 16 +- changes/{2921.bugfix.rst => 2921.bugfix.md} | 0 changes/{3021.feature.rst => 3021.feature.md} | 0 changes/{3066.feature.rst => 3066.feature.md} | 0 changes/{3068.bugfix.rst => 3068.bugfix.md} | 0 changes/{3081.feature.rst => 3081.feature.md} | 0 changes/{3082.feature.rst => 3082.feature.md} | 0 changes/{3100.bugfix.rst => 3100.bugfix.md} | 0 changes/{3103.bugfix.rst => 3103.bugfix.md} | 0 changes/README.md | 2 +- docs/Makefile | 231 ------ docs/_static/custom.css | 110 --- docs/_static/custom.js | 17 - docs/_static/index_api.svg | 97 --- docs/_static/index_contribute.svg | 76 -- docs/_static/index_getting_started.svg | 66 -- docs/_static/index_user_guide.svg | 67 -- docs/about.rst | 24 - docs/api/abc/codec.md | 18 + docs/api/abc/metadata.md | 5 + docs/api/abc/store.md | 13 + docs/api/api_async.md | 32 + docs/api/api_sync.md | 32 + docs/api/array.md | 2 + docs/api/codecs.md | 17 + docs/api/config.md | 5 + docs/api/convenience.md | 10 + docs/api/create.md | 19 + docs/api/deprecated/convenience.md | 14 + docs/api/deprecated/creation.md | 15 + docs/api/errors.md | 10 + docs/api/group.md | 2 + docs/api/load.md | 5 + docs/api/open.md | 9 + docs/api/registry.md | 12 + docs/api/save.md | 7 + docs/api/storage.md | 19 + docs/api/testing.md | 56 ++ docs/conf.py | 379 ---------- docs/contributing.md | 257 +++++++ docs/developers/contributing.rst | 386 ---------- docs/developers/index.rst | 9 - docs/developers/roadmap.rst | 696 ------------------ docs/index.md | 230 ++++++ docs/index.rst | 113 --- docs/overrides/main.html | 9 + docs/overrides/stylesheets/extra.css | 52 ++ docs/quickstart.rst | 209 ------ docs/release-notes.md | 
210 ++++++ docs/release-notes.rst | 269 ------- docs/talks/scipy2019/submission.rst | 144 ---- docs/user-guide/arrays.md | 688 +++++++++++++++++ docs/user-guide/arrays.rst | 658 ----------------- docs/user-guide/attributes.md | 29 + docs/user-guide/attributes.rst | 30 - docs/user-guide/config.md | 92 +++ docs/user-guide/config.rst | 91 --- docs/user-guide/consolidated_metadata.md | 117 +++ docs/user-guide/consolidated_metadata.rst | 116 --- .../{extending.rst => extending.md} | 68 +- docs/user-guide/gpu.md | 32 + docs/user-guide/gpu.rst | 37 - docs/user-guide/groups.md | 172 +++++ docs/user-guide/groups.rst | 172 ----- docs/user-guide/index.rst | 30 - docs/user-guide/installation.md | 50 ++ docs/user-guide/installation.rst | 54 -- docs/user-guide/performance.md | 263 +++++++ docs/user-guide/performance.rst | 278 ------- docs/user-guide/storage.md | 140 ++++ docs/user-guide/storage.rst | 148 ---- docs/user-guide/v3_migration.md | 220 ++++++ docs/user-guide/v3_migration.rst | 238 ------ mkdocs.yml | 154 ++++ pyproject.toml | 26 +- 78 files changed, 3067 insertions(+), 4812 deletions(-) rename changes/{2921.bugfix.rst => 2921.bugfix.md} (100%) rename changes/{3021.feature.rst => 3021.feature.md} (100%) rename changes/{3066.feature.rst => 3066.feature.md} (100%) rename changes/{3068.bugfix.rst => 3068.bugfix.md} (100%) rename changes/{3081.feature.rst => 3081.feature.md} (100%) rename changes/{3082.feature.rst => 3082.feature.md} (100%) rename changes/{3100.bugfix.rst => 3100.bugfix.md} (100%) rename changes/{3103.bugfix.rst => 3103.bugfix.md} (100%) delete mode 100644 docs/Makefile delete mode 100644 docs/_static/custom.css delete mode 100644 docs/_static/custom.js delete mode 100644 docs/_static/index_api.svg delete mode 100644 docs/_static/index_contribute.svg delete mode 100644 docs/_static/index_getting_started.svg delete mode 100644 docs/_static/index_user_guide.svg delete mode 100644 docs/about.rst create mode 100644 docs/api/abc/codec.md create mode 100644 
docs/api/abc/metadata.md create mode 100644 docs/api/abc/store.md create mode 100644 docs/api/api_async.md create mode 100644 docs/api/api_sync.md create mode 100644 docs/api/array.md create mode 100644 docs/api/codecs.md create mode 100644 docs/api/config.md create mode 100644 docs/api/convenience.md create mode 100644 docs/api/create.md create mode 100644 docs/api/deprecated/convenience.md create mode 100644 docs/api/deprecated/creation.md create mode 100644 docs/api/errors.md create mode 100644 docs/api/group.md create mode 100644 docs/api/load.md create mode 100644 docs/api/open.md create mode 100644 docs/api/registry.md create mode 100644 docs/api/save.md create mode 100644 docs/api/storage.md create mode 100644 docs/api/testing.md delete mode 100644 docs/conf.py create mode 100644 docs/contributing.md delete mode 100644 docs/developers/contributing.rst delete mode 100644 docs/developers/index.rst delete mode 100644 docs/developers/roadmap.rst create mode 100644 docs/index.md delete mode 100644 docs/index.rst create mode 100644 docs/overrides/main.html create mode 100644 docs/overrides/stylesheets/extra.css delete mode 100644 docs/quickstart.rst create mode 100644 docs/release-notes.md delete mode 100644 docs/release-notes.rst delete mode 100644 docs/talks/scipy2019/submission.rst create mode 100644 docs/user-guide/arrays.md delete mode 100644 docs/user-guide/arrays.rst create mode 100644 docs/user-guide/attributes.md delete mode 100644 docs/user-guide/attributes.rst create mode 100644 docs/user-guide/config.md delete mode 100644 docs/user-guide/config.rst create mode 100644 docs/user-guide/consolidated_metadata.md delete mode 100644 docs/user-guide/consolidated_metadata.rst rename docs/user-guide/{extending.rst => extending.md} (52%) create mode 100644 docs/user-guide/gpu.md delete mode 100644 docs/user-guide/gpu.rst create mode 100644 docs/user-guide/groups.md delete mode 100644 docs/user-guide/groups.rst delete mode 100644 docs/user-guide/index.rst create 
mode 100644 docs/user-guide/installation.md delete mode 100644 docs/user-guide/installation.rst create mode 100644 docs/user-guide/performance.md delete mode 100644 docs/user-guide/performance.rst create mode 100644 docs/user-guide/storage.md delete mode 100644 docs/user-guide/storage.rst create mode 100644 docs/user-guide/v3_migration.md delete mode 100644 docs/user-guide/v3_migration.rst create mode 100644 mkdocs.yml diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 9b64c97d0a..c36428b300 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -3,7 +3,7 @@ TODO: * [ ] Add unit tests and/or doctests in docstrings * [ ] Add docstrings and API docs for any new/modified user-facing classes and functions -* [ ] New/modified features documented in `docs/user-guide/*.rst` +* [ ] New/modified features documented in `docs/user-guide/*.md` * [ ] Changes documented as a new file in `changes/` * [ ] GitHub Actions have all passed * [ ] Test coverage is 100% (Codecov passes) diff --git a/.github/labeler.yml b/.github/labeler.yml index ede89c9d35..7eb74211ea 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,4 +1,4 @@ needs release notes: - all: - changed-files: - - all-globs-to-all-files: '!changes/*.rst' + - all-globs-to-all-files: '!changes/*.md' diff --git a/.gitignore b/.gitignore index 1b2b63e651..cfc455b715 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ coverage.xml # Sphinx documentation docs/_build/ -docs/api docs/data data data.zip diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6253a7196f..05239456a0 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -11,16 +11,8 @@ build: then towncrier build --version Unreleased --yes; fi + commands: + - mamba install -c conda-forge -c nodefaults hatch + - hatch env run --env docs build + - hatch env run --env docs readthedocs -sphinx: - configuration: docs/conf.py - fail_on_warning: true - -formats: all - -python: - 
install: - - method: pip - path: . - extra_requirements: - - docs diff --git a/changes/2921.bugfix.rst b/changes/2921.bugfix.md similarity index 100% rename from changes/2921.bugfix.rst rename to changes/2921.bugfix.md diff --git a/changes/3021.feature.rst b/changes/3021.feature.md similarity index 100% rename from changes/3021.feature.rst rename to changes/3021.feature.md diff --git a/changes/3066.feature.rst b/changes/3066.feature.md similarity index 100% rename from changes/3066.feature.rst rename to changes/3066.feature.md diff --git a/changes/3068.bugfix.rst b/changes/3068.bugfix.md similarity index 100% rename from changes/3068.bugfix.rst rename to changes/3068.bugfix.md diff --git a/changes/3081.feature.rst b/changes/3081.feature.md similarity index 100% rename from changes/3081.feature.rst rename to changes/3081.feature.md diff --git a/changes/3082.feature.rst b/changes/3082.feature.md similarity index 100% rename from changes/3082.feature.rst rename to changes/3082.feature.md diff --git a/changes/3100.bugfix.rst b/changes/3100.bugfix.md similarity index 100% rename from changes/3100.bugfix.rst rename to changes/3100.bugfix.md diff --git a/changes/3103.bugfix.rst b/changes/3103.bugfix.md similarity index 100% rename from changes/3103.bugfix.rst rename to changes/3103.bugfix.md diff --git a/changes/README.md b/changes/README.md index 74ed9f94a9..889a52baa4 100644 --- a/changes/README.md +++ b/changes/README.md @@ -1,7 +1,7 @@ Writing a changelog entry ------------------------- -Please put a new file in this directory named `xxxx.<type>.rst`, where +Please put a new file in this directory named `xxxx.<type>.md`, where - `xxxx` is the pull request number associated with this entry - `<type>` is one of: diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index f42ee840e9..0000000000 --- a/docs/Makefile +++ /dev/null @@ -1,231 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line.
-SPHINXOPTS = -W --keep-going -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) - $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from https://www.sphinx-doc.org/) -endif - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help -help: - @echo "Please use \`make <target>' where <target> is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " epub3 to make an epub3" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all
changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - @echo " dummy to check syntax errors of document sources" - -.PHONY: clean -clean: - rm -rf $(BUILDDIR)/* - rm -rf $(BUILDDIR)/../api - -.PHONY: html -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -.PHONY: dirhtml -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -.PHONY: singlehtml -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -.PHONY: pickle -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -.PHONY: json -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -.PHONY: htmlhelp -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." 
- -.PHONY: qthelp -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/zarr.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/zarr.qhc" - -.PHONY: applehelp -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -.PHONY: devhelp -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/zarr" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/zarr" - @echo "# devhelp" - -.PHONY: epub -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -.PHONY: epub3 -epub3: - $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 - @echo - @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." - -.PHONY: latex -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -.PHONY: latexpdf -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
- -.PHONY: latexpdfja -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: text -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -.PHONY: man -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -.PHONY: texinfo -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -.PHONY: info -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -.PHONY: gettext -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -.PHONY: changes -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -.PHONY: linkcheck -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -.PHONY: doctest -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." 
- -.PHONY: coverage -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -.PHONY: xml -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -.PHONY: pseudoxml -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." - -.PHONY: dummy -dummy: - $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy - @echo - @echo "Build finished. Dummy builder generates no files." diff --git a/docs/_static/custom.css b/docs/_static/custom.css deleted file mode 100644 index 1d32606f9a..0000000000 --- a/docs/_static/custom.css +++ /dev/null @@ -1,110 +0,0 @@ -@import url('https://fonts.googleapis.com/css2?family=Lato:ital,wght@0,400;0,700;0,900;1,400;1,700;1,900&family=Open+Sans:ital,wght@0,400;0,600;1,400;1,600&display=swap'); - -body { - font-family: 'Open Sans', sans-serif; -} - -pre, code { - font-size: 100%; - line-height: 155%; -} - -/* Style the active version button. - -- dev: orange -- stable: green -- old, PR: red - -Colors from: - -Wong, B. Points of view: Color blindness. -Nat Methods 8, 441 (2011). 
https://doi.org/10.1038/nmeth.1618 -*/ - -/* If the active version has the name "dev", style it orange */ -#version_switcher_button[data-active-version-name*="dev"] { - background-color: #E69F00; - border-color: #E69F00; - color:#000000; -} - -/* green for `stable` */ -#version_switcher_button[data-active-version-name*="stable"] { - background-color: #009E73; - border-color: #009E73; -} - -/* red for `old` */ -#version_switcher_button:not([data-active-version-name*="stable"], [data-active-version-name*="dev"], [data-active-version-name=""]) { - background-color: #980F0F; - border-color: #980F0F; -} - -/* Main page overview cards */ - -.sd-card { - background: #fff; - border-radius: 0; - padding: 30px 10px 20px 10px; - margin: 10px 0px; -} - -.sd-card .sd-card-header { - text-align: center; -} - -.sd-card .sd-card-header .sd-card-text { - margin: 0px; -} - -.sd-card .sd-card-img-top { - height: 52px; - width: 52px; - margin-left: auto; - margin-right: auto; -} - -.sd-card .sd-card-header { - border: none; - background-color: white; - font-size: var(--pst-font-size-h5); - font-weight: bold; - padding: 2.5rem 0rem 0.5rem 0rem; -} - -.sd-card .sd-card-footer { - border: none; - background-color: white; -} - -.sd-card .sd-card-footer .sd-card-text { - max-width: 220px; - margin-left: auto; - margin-right: auto; -} - -/* Dark theme tweaking */ -html[data-theme=dark] .sd-card img[src*='.svg'] { - filter: invert(0.82) brightness(0.8) contrast(1.2); -} - -/* Main index page overview cards */ -html[data-theme=dark] .sd-card { - background-color:var(--pst-color-background); -} - -html[data-theme=dark] .sd-shadow-sm { - box-shadow: 0 .1rem 1rem rgba(250, 250, 250, .6) !important -} - -html[data-theme=dark] .sd-card .sd-card-header { - background-color:var(--pst-color-background); -} - -html[data-theme=dark] .sd-card .sd-card-footer { - background-color:var(--pst-color-background); -} - -html[data-theme=dark] h1 { - color: var(--pst-color-primary); -} diff --git 
a/docs/_static/custom.js b/docs/_static/custom.js deleted file mode 100644 index 52f1cba9e0..0000000000 --- a/docs/_static/custom.js +++ /dev/null @@ -1,17 +0,0 @@ -// handle redirects -(() => { - let anchorMap = { - "installation": "installation.html", - "getting-started": "getting_started.html#getting-started", - "highlights": "getting_started.html#highlights", - "contributing": "contributing.html", - "projects-using-zarr": "getting_started.html#projects-using-zarr", - "contents": "getting_started.html#contents", - "indices-and-tables": "api.html#indices-and-tables" - } - - let hash = window.location.hash.substring(1); - if (hash && hash in anchorMap) { - window.location.replace(anchorMap[hash]); - } -})(); diff --git a/docs/_static/index_api.svg b/docs/_static/index_api.svg deleted file mode 100644 index 69f7ba1d2d..0000000000 --- a/docs/_static/index_api.svg +++ /dev/null @@ -1,97 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - diff --git a/docs/_static/index_contribute.svg b/docs/_static/index_contribute.svg deleted file mode 100644 index de3d902379..0000000000 --- a/docs/_static/index_contribute.svg +++ /dev/null @@ -1,76 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - diff --git a/docs/_static/index_getting_started.svg b/docs/_static/index_getting_started.svg deleted file mode 100644 index 2d36622cb7..0000000000 --- a/docs/_static/index_getting_started.svg +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - diff --git a/docs/_static/index_user_guide.svg b/docs/_static/index_user_guide.svg deleted file mode 100644 index bd17053517..0000000000 --- a/docs/_static/index_user_guide.svg +++ /dev/null @@ -1,67 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - diff --git a/docs/about.rst b/docs/about.rst deleted file mode 100644 index 7a0af998c0..0000000000 --- a/docs/about.rst +++ /dev/null @@ -1,24 +0,0 @@ -About -===== - -Zarr is a format for the storage of chunked, compressed, 
N-dimensional arrays -inspired by `HDF5 `_, `h5py -`_ and `bcolz `_. - -These documents describe the Zarr-Python implementation. More information -about the Zarr format can be found on the `main website `_. - -Projects using Zarr -------------------- - -If you are using Zarr-Python, we would `love to hear about it -`_. - -Funding -------- -The project is fiscally sponsored by `NumFOCUS `_, a US -501(c)(3) public charity, and development is supported by the -`MRC Centre for Genomics and Global Health `_ -and the `Chan Zuckerberg Initiative `_. - -.. _NumCodecs: https://numcodecs.readthedocs.io/ diff --git a/docs/api/abc/codec.md b/docs/api/abc/codec.md new file mode 100644 index 0000000000..88e35d01bf --- /dev/null +++ b/docs/api/abc/codec.md @@ -0,0 +1,18 @@ +--- +title: codec +--- + +## Attributes + +::: zarr.abc.codec.CodecInput +::: zarr.abc.codec.CodecOutput + +## Classes + +::: zarr.abc.codec.ArrayArrayCodec +::: zarr.abc.codec.ArrayBytesCodec +::: zarr.abc.codec.ArrayBytesCodecPartialDecodeMixin +::: zarr.abc.codec.ArrayBytesCodecPartialEncodeMixin +::: zarr.abc.codec.BaseCodec +::: zarr.abc.codec.BytesBytesCodec +::: zarr.abc.codec.CodecPipeline diff --git a/docs/api/abc/metadata.md b/docs/api/abc/metadata.md new file mode 100644 index 0000000000..00c670d3e4 --- /dev/null +++ b/docs/api/abc/metadata.md @@ -0,0 +1,5 @@ +--- +title: metadata +--- + +::: zarr.abc.metadata.Metadata diff --git a/docs/api/abc/store.md b/docs/api/abc/store.md new file mode 100644 index 0000000000..bb8ceebb91 --- /dev/null +++ b/docs/api/abc/store.md @@ -0,0 +1,13 @@ +--- +title: store +--- + +## Classes + +::: zarr.abc.store.ByteGetter +::: zarr.abc.store.ByteSetter +::: zarr.abc.store.Store + +## Functions + +::: zarr.abc.store.set_or_delete diff --git a/docs/api/api_async.md b/docs/api/api_async.md new file mode 100644 index 0000000000..1380469cff --- /dev/null +++ b/docs/api/api_async.md @@ -0,0 +1,32 @@ +--- +title: asynchronous +--- + +::: zarr.api.asynchronous.array +::: 
zarr.api.asynchronous.consolidate_metadata +::: zarr.api.asynchronous.copy +::: zarr.api.asynchronous.copy_all +::: zarr.api.asynchronous.copy_store +::: zarr.api.asynchronous.create +::: zarr.api.asynchronous.create_array +::: zarr.api.asynchronous.create_hierarchy +::: zarr.api.asynchronous.empty +::: zarr.api.asynchronous.empty_like +::: zarr.api.asynchronous.from_array +::: zarr.api.asynchronous.full +::: zarr.api.asynchronous.full_like +::: zarr.api.asynchronous.group +::: zarr.api.asynchronous.load +::: zarr.api.asynchronous.ones +::: zarr.api.asynchronous.ones_like +::: zarr.api.asynchronous.open +::: zarr.api.asynchronous.open_array +::: zarr.api.asynchronous.open_consolidated +::: zarr.api.asynchronous.open_group +::: zarr.api.asynchronous.open_like +::: zarr.api.asynchronous.save +::: zarr.api.asynchronous.save_array +::: zarr.api.asynchronous.save_group +::: zarr.api.asynchronous.tree +::: zarr.api.asynchronous.zeros +::: zarr.api.asynchronous.zeros_like diff --git a/docs/api/api_sync.md b/docs/api/api_sync.md new file mode 100644 index 0000000000..730be3449e --- /dev/null +++ b/docs/api/api_sync.md @@ -0,0 +1,32 @@ +--- +title: synchronous +--- + +::: zarr.api.synchronous.array +::: zarr.api.synchronous.consolidate_metadata +::: zarr.api.synchronous.copy +::: zarr.api.synchronous.copy_all +::: zarr.api.synchronous.copy_store +::: zarr.api.synchronous.create +::: zarr.api.synchronous.create_array +::: zarr.api.synchronous.create_hierarchy +::: zarr.api.synchronous.empty +::: zarr.api.synchronous.empty_like +::: zarr.api.synchronous.from_array +::: zarr.api.synchronous.full +::: zarr.api.synchronous.full_like +::: zarr.api.synchronous.group +::: zarr.api.synchronous.load +::: zarr.api.synchronous.ones +::: zarr.api.synchronous.ones_like +::: zarr.api.synchronous.open +::: zarr.api.synchronous.open_array +::: zarr.api.synchronous.open_consolidated +::: zarr.api.synchronous.open_group +::: zarr.api.synchronous.open_like +::: zarr.api.synchronous.save +::: 
zarr.api.synchronous.save_array +::: zarr.api.synchronous.save_group +::: zarr.api.synchronous.tree +::: zarr.api.synchronous.zeros +::: zarr.api.synchronous.zeros_like diff --git a/docs/api/array.md b/docs/api/array.md new file mode 100644 index 0000000000..ff61cb1fe2 --- /dev/null +++ b/docs/api/array.md @@ -0,0 +1,2 @@ +::: zarr.Array +::: zarr.AsyncArray diff --git a/docs/api/codecs.md b/docs/api/codecs.md new file mode 100644 index 0000000000..f2793875f4 --- /dev/null +++ b/docs/api/codecs.md @@ -0,0 +1,17 @@ +--- +title: codecs +--- + +::: zarr.codecs.BloscCname +::: zarr.codecs.BloscCodec +::: zarr.codecs.BloscShuffle +::: zarr.codecs.BytesCodec +::: zarr.codecs.Crc32cCodec +::: zarr.codecs.Endian +::: zarr.codecs.GzipCodec +::: zarr.codecs.ShardingCodec +::: zarr.codecs.ShardingCodecIndexLocation +::: zarr.codecs.TransposeCodec +::: zarr.codecs.VLenBytesCodec +::: zarr.codecs.VLenUTF8Codec +::: zarr.codecs.ZstdCodec diff --git a/docs/api/config.md b/docs/api/config.md new file mode 100644 index 0000000000..30803918f5 --- /dev/null +++ b/docs/api/config.md @@ -0,0 +1,5 @@ +--- +title: config +--- + +::: zarr.config diff --git a/docs/api/convenience.md b/docs/api/convenience.md new file mode 100644 index 0000000000..f2614e3724 --- /dev/null +++ b/docs/api/convenience.md @@ -0,0 +1,10 @@ +--- +title: convenience +--- + +::: zarr.consolidate_metadata +::: zarr.copy +::: zarr.copy_all +::: zarr.copy_store +::: zarr.print_debug_info +::: zarr.tree diff --git a/docs/api/create.md b/docs/api/create.md new file mode 100644 index 0000000000..971e9c293c --- /dev/null +++ b/docs/api/create.md @@ -0,0 +1,19 @@ +--- +title: create +--- + +::: zarr.array +::: zarr.create +::: zarr.create_array +::: zarr.create_group +::: zarr.create_hierarchy +::: zarr.empty +::: zarr.empty_like +::: zarr.full +::: zarr.full_like +::: zarr.from_array +::: zarr.group +::: zarr.ones +::: zarr.ones_like +::: zarr.zeros +::: zarr.zeros_like diff --git a/docs/api/deprecated/convenience.md 
b/docs/api/deprecated/convenience.md new file mode 100644 index 0000000000..f7629b7ac6 --- /dev/null +++ b/docs/api/deprecated/convenience.md @@ -0,0 +1,14 @@ +!!! warning "Deprecated" + This sub-module is deprecated. All functions here are defined in the top level zarr namespace instead. + +::: zarr.convenience.consolidate_metadata +::: zarr.convenience.copy +::: zarr.convenience.copy_all +::: zarr.convenience.copy_store +::: zarr.convenience.load +::: zarr.convenience.open +::: zarr.convenience.open_consolidated +::: zarr.convenience.save +::: zarr.convenience.save_array +::: zarr.convenience.save_group +::: zarr.convenience.tree diff --git a/docs/api/deprecated/creation.md b/docs/api/deprecated/creation.md new file mode 100644 index 0000000000..1f87c7cf2e --- /dev/null +++ b/docs/api/deprecated/creation.md @@ -0,0 +1,15 @@ +!!! warning "Deprecated" + This sub-module is deprecated. All functions here are defined in the top level zarr namespace instead. + +::: zarr.creation.array +::: zarr.creation.create +::: zarr.creation.empty +::: zarr.creation.empty_like +::: zarr.creation.full +::: zarr.creation.full_like +::: zarr.creation.ones +::: zarr.creation.ones_like +::: zarr.creation.open_array +::: zarr.creation.open_like +::: zarr.creation.zeros +::: zarr.creation.zeros_like diff --git a/docs/api/errors.md b/docs/api/errors.md new file mode 100644 index 0000000000..dfdd74c07a --- /dev/null +++ b/docs/api/errors.md @@ -0,0 +1,10 @@ +--- +title: errors +--- + +::: zarr.errors.BaseZarrError +::: zarr.errors.ContainsArrayAndGroupError +::: zarr.errors.ContainsArrayError +::: zarr.errors.ContainsGroupError +::: zarr.errors.MetadataValidationError +::: zarr.errors.NodeTypeValidationError diff --git a/docs/api/group.md b/docs/api/group.md new file mode 100644 index 0000000000..0cf9372de2 --- /dev/null +++ b/docs/api/group.md @@ -0,0 +1,2 @@ +::: zarr.Group +::: zarr.AsyncGroup diff --git a/docs/api/load.md b/docs/api/load.md new file mode 100644 index 
0000000000..d6463ca976 --- /dev/null +++ b/docs/api/load.md @@ -0,0 +1,5 @@ +--- +title: load +--- + +::: zarr.load diff --git a/docs/api/open.md b/docs/api/open.md new file mode 100644 index 0000000000..c59f896129 --- /dev/null +++ b/docs/api/open.md @@ -0,0 +1,9 @@ +--- +title: open +--- + +::: zarr.open +::: zarr.open_array +::: zarr.open_consolidated +::: zarr.open_group +::: zarr.open_like diff --git a/docs/api/registry.md b/docs/api/registry.md new file mode 100644 index 0000000000..97262f8920 --- /dev/null +++ b/docs/api/registry.md @@ -0,0 +1,12 @@ +--- +title: registry +--- + +::: zarr.registry.get_buffer_class +::: zarr.registry.get_codec_class +::: zarr.registry.get_ndbuffer_class +::: zarr.registry.get_pipeline_class +::: zarr.registry.register_buffer +::: zarr.registry.register_codec +::: zarr.registry.register_ndbuffer +::: zarr.registry.register_pipeline diff --git a/docs/api/save.md b/docs/api/save.md new file mode 100644 index 0000000000..c611d10a4c --- /dev/null +++ b/docs/api/save.md @@ -0,0 +1,7 @@ +--- +title: save +--- + +::: zarr.save +::: zarr.save_array +::: zarr.save_group diff --git a/docs/api/storage.md b/docs/api/storage.md new file mode 100644 index 0000000000..ac8e94158b --- /dev/null +++ b/docs/api/storage.md @@ -0,0 +1,19 @@ +--- +title: storage +--- + +## Attributes + +::: zarr.storage.StoreLike + +## Classes + +::: zarr.storage.FsspecStore +::: zarr.storage.GpuMemoryStore +::: zarr.storage.LocalStore +::: zarr.storage.LoggingStore +::: zarr.storage.MemoryStore +::: zarr.storage.ObjectStore +::: zarr.storage.StorePath +::: zarr.storage.WrapperStore +::: zarr.storage.ZipStore \ No newline at end of file diff --git a/docs/api/testing.md b/docs/api/testing.md new file mode 100644 index 0000000000..257d0ff59b --- /dev/null +++ b/docs/api/testing.md @@ -0,0 +1,56 @@ +--- +title: testing +--- + +## Buffer + +::: zarr.testing.buffer.NDBufferUsingTestNDArrayLike +::: zarr.testing.buffer.StoreExpectingTestBuffer +::: 
zarr.testing.buffer.TestBuffer + +## Stateful + +::: zarr.testing.stateful.MAX_BINARY_SIZE +::: zarr.testing.stateful.SyncStoreWrapper +::: zarr.testing.stateful.ZarrHierarchyStateMachine +::: zarr.testing.stateful.ZarrStoreStateMachine +::: zarr.testing.stateful.split_prefix_name + +## Store + +::: zarr.testing.store.StoreTests + +## Strategies + +::: zarr.testing.strategies.array_names +::: zarr.testing.strategies.array_shapes +::: zarr.testing.strategies.attrs +::: zarr.testing.strategies.compressors +::: zarr.testing.strategies.node_names +::: zarr.testing.strategies.short_node_names +::: zarr.testing.strategies.stores +::: zarr.testing.strategies.zarr_formats +::: zarr.testing.strategies.zarr_key_chars +::: zarr.testing.strategies.array_metadata +::: zarr.testing.strategies.arrays +::: zarr.testing.strategies.basic_indices +::: zarr.testing.strategies.chunk_shapes +::: zarr.testing.strategies.clear_store +::: zarr.testing.strategies.dimension_names +::: zarr.testing.strategies.end_slices +::: zarr.testing.strategies.is_negative_slice +::: zarr.testing.strategies.key_ranges +::: zarr.testing.strategies.keys +::: zarr.testing.strategies.np_array_and_chunks +::: zarr.testing.strategies.numpy_arrays +::: zarr.testing.strategies.orthogonal_indices +::: zarr.testing.strategies.paths +::: zarr.testing.strategies.safe_unicode_for_dtype +::: zarr.testing.strategies.shard_shapes +::: zarr.testing.strategies.simple_arrays +::: zarr.testing.strategies.v2_dtypes +::: zarr.testing.strategies.v3_dtypes + +## Utils + +::: zarr.testing.utils.assert_bytes_equal diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 9bb1c48901..0000000000 --- a/docs/conf.py +++ /dev/null @@ -1,379 +0,0 @@ -#!/usr/bin/env python3 -# -# zarr documentation build configuration file, created by -# sphinx-quickstart on Mon May 2 21:40:09 2016. -# -# This file is execfile()d with the current directory set to its -# containing dir. 
-# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - - -import os -import sys -from importlib.metadata import version as get_version -from typing import Any - -import sphinx -import sphinx.application - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.append(os.path.abspath("..")) - - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.viewcode", - "sphinx.ext.intersphinx", - 'autoapi.extension', - "numpydoc", - "sphinx_issues", - "sphinx_copybutton", - "sphinx_design", - 'sphinx_reredirects', -] - -issues_github_path = "zarr-developers/zarr-python" - -autoapi_dirs = ['../src/zarr'] -autoapi_add_toctree_entry = False -autoapi_generate_api_docs = True -autoapi_member_order = "groupwise" -autoapi_root = "api" -autoapi_keep_files = True -autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] - -def skip_submodules( - app: sphinx.application.Sphinx, - what: str, - name: str, - obj: object, - skip: bool, - options: dict[str, Any] - ) -> bool: - # Skip documenting zarr.codecs submodules - # codecs are documented in the main zarr.codecs namespace - if what == "module" and name.startswith("zarr.codecs.") or name.startswith("zarr.core"): - skip = True - return skip - -# Add any paths that contain templates here, 
relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The encoding of source files. -# source_encoding = 'utf-8-sig' - -# The main toctree document. -main_doc = "index" - -# General information about the project. -project = "zarr" -copyright = "2025, Zarr Developers" -author = "Zarr Developers" - -version = get_version("zarr") -release = get_version("zarr") - -redirects = { - "spec": "https://zarr-specs.readthedocs.io", - "spec/v1": 'https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html', - "spec/v2": "https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html", - "spec/v3": "https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html", - "license": "https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt", - "tutorial": "user-guide", - "getting-started": "quickstart", - "roadmap": "developers/roadmap.html", - "installation": "user-guide/installation.html", - "api": "api/zarr/index", - "release": "release-notes" -} - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "talks"] - -# The reST default role (used for this markup: `text`) to use for all -# documents. 
-# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = "pydata_sphinx_theme" - -html_favicon = "_static/logo1.png" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -html_theme_options = { - "github_url": "https://github.com/zarr-developers/zarr-python", - "twitter_url": "https://twitter.com/zarr_dev", - "icon_links": [ - { - "name": "Zarr Dev", - "url": "https://zarr.dev/", - "icon": "_static/logo1.png", - "type": "local", - }, - ], - "collapse_navigation": True, - "navigation_with_keys": False, - "announcement": "Zarr-Python 3 is here! Check out the release announcement here.", -} - -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] - -# The name for this set of Sphinx documents. -# " v documentation" by default. -# html_title = 'zarr v@@' - -# A shorter title for the navigation bar. Default is the same as html_title. 
-# html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -html_logo = "_static/logo_horizontal.svg" - - -def setup(app: sphinx.application.Sphinx) -> None: - app.add_css_file("custom.css") - app.connect("autoapi-skip-member", skip_submodules) - - -# The name of an image file (relative to this directory) to use as a favicon of -# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] -html_js_files = [ - "custom.js", -] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -# html_extra_path = [] - -# If not None, a 'Last updated on:' timestamp is inserted at every page -# bottom, using the given strftime format. -# The empty string is equivalent to '%b %d, %Y'. -# html_last_updated_fmt = None - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -html_sidebars = {"tutorial": []} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_domain_indices = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. 
-# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None - -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' -# html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# 'ja' uses this config value. -# 'zh' user can custom change `jieba` dictionary path. -# html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -# html_search_scorer = 'scorer.js' - -# Output file base name for HTML help builder. -htmlhelp_basename = "zarrdoc" - -maximum_signature_line_length = 80 - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - #'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - #'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - #'preamble': '', - # Latex figure (float) alignment - #'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). 
-latex_documents = [ - (main_doc, "zarr.tex", "Zarr-Python", author, "manual"), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# If true, show page references after internal links. -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. -# latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(main_doc, "zarr", "Zarr-Python", [author], 1)] - -# If true, show URL addresses after external links. -# man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - main_doc, - "zarr", - "Zarr-Python", - author, - "zarr", - "One line description of project.", - "Miscellaneous", - ), -] - -# Documents to append as an appendix to all manuals. -# texinfo_appendices = [] - -# If false, no module index is generated. -# texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -# texinfo_no_detailmenu = False - - -# Example configuration for intersphinx: refer to the Python standard library. 
-# use in refs e.g: -# :ref:`comparison manual ` -intersphinx_mapping = { - "python": ("https://docs.python.org/3/", None), - "numpy": ("https://numpy.org/doc/stable/", None), - "numcodecs": ("https://numcodecs.readthedocs.io/en/stable/", None), - "obstore": ("https://developmentseed.org/obstore/latest/", None), -} - - -# sphinx-copybutton configuration -copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " -copybutton_line_continuation_character = "\\" -copybutton_prompt_is_regexp = True diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000000..b53bffe8a0 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,257 @@ +# Contributing + +Zarr is a community maintained project. We welcome contributions in the form of bug reports, bug fixes, documentation, enhancement proposals and more. This page provides information on how best to contribute. + +## Asking for help + +If you have a question about how to use Zarr, please post your question on StackOverflow using the ["zarr" tag](https://stackoverflow.com/questions/tagged/zarr). If you don't get a response within a day or two, feel free to raise a [GitHub issue](https://github.com/zarr-developers/zarr-python/issues/new) including a link to your StackOverflow question. We will try to respond to questions as quickly as possible, but please bear in mind that there may be periods where we have limited time to answer questions due to other commitments. + +## Bug reports + +If you find a bug, please raise a [GitHub issue](https://github.com/zarr-developers/zarr-python/issues/new). Please include the following items in a bug report: + +1. A minimal, self-contained snippet of Python code reproducing the problem. You can format the code nicely using markdown, e.g.: + +```python +import zarr +g = zarr.group() +# etc. +``` + +2. An explanation of why the current behaviour is wrong/not desired, and what you expect instead. + +3. 
Information about the version of Zarr, along with versions of dependencies and the Python interpreter, and installation information. The version of Zarr can be obtained from the `zarr.__version__` property. Please also state how Zarr was installed, e.g., "installed via pip into a virtual environment", or "installed using conda". Information about other packages installed can be obtained by executing `pip freeze` (if using pip to install packages) or `conda env export` (if using conda to install packages) from the operating system command prompt. The version of the Python interpreter can be obtained by running a Python interactive session, e.g.: + +``` +$ python + Python 3.12.7 | packaged by conda-forge | (main, Oct 4 2024, 15:57:01) [Clang 17.0.6 ] on darwin +``` + +## Enhancement proposals + +If you have an idea about a new feature or some other improvement to Zarr, please raise a [GitHub issue](https://github.com/zarr-developers/zarr-python/issues/new) first to discuss. + +We very much welcome ideas and suggestions for how to improve Zarr, but please bear in mind that we are likely to be conservative in accepting proposals for new features. The reasons for this are that we would like to keep the Zarr code base lean and focused on a core set of functionalities, and available time for development, review and maintenance of new features is limited. But if you have a great idea, please don't let that stop you from posting it on GitHub, just please don't be offended if we respond cautiously. + +## Contributing code and/or documentation + +### Forking the repository + +The Zarr source code is hosted on GitHub at the following location: + +* [https://github.com/zarr-developers/zarr-python](https://github.com/zarr-developers/zarr-python) + +You will need your own fork to work on the code. Go to the link above and hit the ["Fork"](https://github.com/zarr-developers/zarr-python/fork) button. 
Then clone your fork to your local machine: + +```bash +$ git clone git@github.com:your-user-name/zarr-python.git +$ cd zarr-python +$ git remote add upstream git@github.com:zarr-developers/zarr-python.git +``` + +### Creating a development environment + +To work with the Zarr source code, it is recommended to use [hatch](https://hatch.pypa.io/latest/index.html) to create and manage development environments. Hatch will automatically install all Zarr dependencies using the same versions as are used by the core developers and continuous integration services. Assuming you have a Python 3 interpreter already installed, and you have cloned the Zarr source code and your current working directory is the root of the repository, you can do something like the following: + +```bash +$ pip install hatch +$ hatch env show # list all available environments +``` + +To verify that your development environment is working, you can run the unit tests for one of the test environments, e.g.: + +```bash +$ hatch env run --env test.py3.12-2.1-optional run-pytest +``` + +### Creating a branch + +Before you do any new work or submit a pull request, please open an issue on GitHub to report the bug or propose the feature you'd like to add. + +It's best to synchronize your fork with the upstream repository, then create a new, separate branch for each piece of work you want to do. E.g.: + +```bash +git checkout main +git fetch upstream +git checkout -b shiny-new-feature upstream/main +git push -u origin shiny-new-feature +``` + +This changes your working directory to the 'shiny-new-feature' branch. Keep any changes in this branch specific to one bug or feature so it is clear what the branch brings to Zarr. + +To update this branch with latest code from Zarr, you can retrieve the changes from the main branch and perform a rebase: + +```bash +git fetch upstream +git rebase upstream/main +``` + +This will replay your commits on top of the latest Zarr git main. 
If this leads to merge conflicts, these need to be resolved before submitting a pull request. Alternatively, you can merge the changes in from upstream/main instead of rebasing, which can be simpler: + +```bash +git pull upstream main +``` + +Again, any conflicts need to be resolved before submitting a pull request. + +### Running the test suite + +Zarr includes a suite of unit tests. The simplest way to run the unit tests is to activate your development environment (see [creating a development environment](#creating-a-development-environment) above) and invoke: + +```bash +$ hatch env run --env test.py3.12-2.1-optional run-pytest +``` + +All tests are automatically run via GitHub Actions for every pull request and must pass before code can be accepted. Test coverage is also collected automatically via the Codecov service. + +> **Note:** Previous versions of Zarr-Python made extensive use of doctests. These tests were not maintained during the 3.0 refactor but may be brought back in the future. See issue #2614 for more details. + +### Code standards - using pre-commit + +All code must conform to the PEP8 standard. Regarding line length, lines up to 100 characters are allowed, although please try to keep under 90 wherever possible. + +`Zarr` uses a set of `pre-commit` hooks and the `pre-commit` bot to format, type-check, and prettify the codebase. `pre-commit` can be installed locally by running: + +```bash +$ python -m pip install pre-commit +``` + +The hooks can be installed locally by running: + +```bash +$ pre-commit install +``` + +This would run the checks every time a commit is created locally. These checks will also run on every commit pushed to an open PR, resulting in some automatic styling fixes by the `pre-commit` bot. 
The checks will by default only run on the files modified by a commit, but the checks can be triggered for all the files by running: + +```bash +$ pre-commit run --all-files +``` + +If you would like to skip the failing checks and push the code for further discussion, use the `--no-verify` option with `git commit`. + +### Test coverage + +> **Note:** Test coverage for Zarr-Python 3 is currently not at 100%. This is a known issue and help is welcome to bring test coverage back to 100%. See issue #2613 for more details. + +Zarr strives to maintain 100% test coverage under the latest Python stable release. Both unit tests and docstring doctests are included when computing coverage. Running: + +```bash +$ hatch env run --env test.py3.12-2.1-optional run-coverage +``` + +will automatically run the test suite with coverage and produce an XML coverage report. This should be 100% before code can be accepted into the main code base. + +You can also generate an HTML coverage report by running: + +```bash +$ hatch env run --env test.py3.12-2.1-optional run-coverage-html +``` + +When submitting a pull request, coverage will also be collected across all supported Python versions via the Codecov service, and will be reported back within the pull request. Codecov coverage must also be 100% before code can be accepted. + +### Documentation + +Docstrings for user-facing classes and functions should follow the [numpydoc](https://numpydoc.readthedocs.io/en/stable/format.html#docstring-standard) standard, including sections for Parameters and Examples. All examples should run and pass as doctests under Python 3.11. + +Zarr uses mkdocs for documentation, hosted on readthedocs.org. Documentation is written in the Markdown markup language (.md files) in the `docs` folder. The documentation consists both of prose and API documentation.
All user-facing classes and functions are included in the API documentation, under the `docs/api` folder using the [mkdocstrings](https://mkdocstrings.github.io/) extension. Add any new public functions or classes to the relevant markdown file in `docs/api/*.md`. Any new features or important usage information should be included in the user-guide (`docs/user-guide`). Any changes should also be included as a new file in the `changes` directory. + +The documentation can be built locally by running: + +```bash +$ hatch --env docs run build +``` + +The resulting built documentation will be available in the `docs/_build/html` folder. + +Hatch can also be used to serve a continuously updating version of the documentation during development at [http://0.0.0.0:8000/](http://0.0.0.0:8000/). This can be done by running: + +```bash +$ hatch --env docs run serve +``` + +### Changelog + +zarr-python uses [towncrier](https://towncrier.readthedocs.io/en/stable/tutorial.html) to manage release notes. Most pull requests should include at least one news fragment describing the changes. To add a release note, you'll need the GitHub issue or pull request number and the type of your change (`feature`, `bugfix`, `doc`, `removal`, `misc`). With that, run `towncrier create` with your development environment, which will prompt you for the issue number, change type, and the news text: + +```bash +towncrier create +``` + +Alternatively, you can manually create the files in the `changes` directory using the naming convention `{issue-number}.{change-type}.md`. + +See the [towncrier](https://towncrier.readthedocs.io/en/stable/tutorial.html) docs for more. + +## Development best practices, policies and procedures + +The following information is mainly for core developers, but may also be of interest to contributors. + +### Merging pull requests + +Pull requests submitted by an external contributor should be reviewed and approved by at least one core developer before being merged.
Ideally, pull requests submitted by a core developer should be reviewed and approved by at least one other core developer before being merged. + +Pull requests should not be merged until all CI checks have passed (GitHub Actions, Codecov) against code that has had the latest main merged in. + +### Compatibility and versioning policies + +#### Versioning + +Versions of this library are identified by a triplet of integers with the form `<major>.<minor>.<patch>`, for example `3.0.4`. A release of `zarr-python` is associated with a new version identifier. That new identifier is generated by incrementing exactly one of the components of the previous version identifier by 1. When incrementing the `major` component of the version identifier, the `minor` and `patch` components are reset to 0. When incrementing the minor component, the patch component is reset to 0. + +Releases are classified by the library changes contained in that release. This classification determines which component of the version identifier is incremented on release. + +* **major** releases (for example, `2.18.0` -> `3.0.0`) are for changes that will require extensive adaptation efforts from many users and downstream projects. For example, breaking changes to widely-used user-facing APIs should only be applied in a major release. + + Users and downstream projects should carefully consider the impact of a major release before adopting it. In advance of a major release, developers should communicate the scope of the upcoming changes, and help users prepare for them. + +* **minor** releases (for example, `3.0.0` -> `3.1.0`) are for changes that do not require significant effort from most users or downstream projects to respond to. API changes are possible in minor releases if the burden on users imposed by those changes is sufficiently small. + + For example, a recently released API may need fixes or refinements that are breaking, but low impact due to the recency of the feature.
Such API changes are permitted in a minor release. + + Minor releases are safe for most users and downstream projects to adopt. + +* **patch** releases (for example, `3.1.0` -> `3.1.1`) are for changes that contain no breaking or behaviour changes for downstream projects or users. Examples of changes suitable for a patch release are bugfixes and documentation improvements. + + Users should always feel safe upgrading to the latest patch release. + +Note that this versioning scheme is not consistent with [Semantic Versioning](https://semver.org/). Contrary to SemVer, the Zarr library may release breaking changes in `minor` releases, or even `patch` releases under exceptional circumstances. But we should strive to avoid doing so. + +A better model for our versioning scheme is [Intended Effort Versioning](https://jacobtomlinson.dev/effver/), or "EffVer". The guiding principle of EffVer is to categorize releases based on the *expected effort required to upgrade to that release*. + +Zarr developers should make changes as smooth as possible for users. This means making backwards-compatible changes wherever possible. When a backwards-incompatible change is necessary, users should be notified well in advance, e.g. via informative deprecation warnings. + +#### Data format compatibility + +The Zarr library is an implementation of a file format standard defined externally -- see the [Zarr specifications website](https://zarr-specs.readthedocs.io) for the list of Zarr file format specifications. + +If an existing Zarr format version changes, or a new version of the Zarr format is released, then the Zarr library will generally require changes. It is very likely that a new Zarr format will require extensive breaking changes to the Zarr library, and so support for a new Zarr format in the Zarr library will almost certainly come in a new `major` release.
When the Zarr library adds support for a new Zarr format, there may be a period of accelerated changes as developers refine newly added APIs and deprecate old APIs. In such a transitional phase breaking changes may be more frequent than usual. + +### Release procedure + +> **Note:** Most of the release process is now handled by a GitHub workflow which should automatically push a release to PyPI if a tag is pushed. + +#### Pre-release + +1. Make sure that all pull requests which will be included in the release have been properly documented as changelog files in `changes`. +2. Run `towncrier build --version x.y.z` to create the changelog. + +#### Releasing + +To make a new release, go to https://github.com/zarr-developers/zarr-python/releases and click "Draft a new release". Choose a version number prefixed with a `v` (e.g. `v0.0.0`). For pre-releases, include the appropriate suffix (e.g. `v0.0.0a1` or `v0.0.0rc2`). + +Set the description of the release to: + +``` +See release notes https://zarr.readthedocs.io/en/stable/release-notes.html#release-0-0-0 +``` + +substituting the correct version numbers. For pre-release versions, the URL should omit the pre-release suffix, e.g. "a1" or "rc1". + +Click on "Generate release notes" to auto-fill the description. + +After creating the release, the documentation will be built on https://readthedocs.io. Full releases will be available under [/stable](https://zarr.readthedocs.io/en/stable) while pre-releases will be available under [/latest](https://zarr.readthedocs.io/en/latest). + +#### Post-release + +- Review and merge the pull request on the [conda-forge feedstock](https://github.com/conda-forge/zarr-feedstock) that will be automatically generated. +- Create a new "Unreleased" section in the release notes diff --git a/docs/developers/contributing.rst b/docs/developers/contributing.rst deleted file mode 100644 index fa65f71d48..0000000000 --- a/docs/developers/contributing.rst +++ /dev/null @@ -1,386 +0,0 @@ -..
_dev-guide-contributing: - -Contributing to Zarr -==================== - -Zarr is a community maintained project. We welcome contributions in the form of bug -reports, bug fixes, documentation, enhancement proposals and more. This page provides -information on how best to contribute. - -Asking for help ---------------- - -If you have a question about how to use Zarr, please post your question on -StackOverflow using the `"zarr" tag `_. -If you don't get a response within a day or two, feel free to raise a `GitHub issue -`_ including a link to your StackOverflow -question. We will try to respond to questions as quickly as possible, but please bear -in mind that there may be periods where we have limited time to answer questions -due to other commitments. - -Bug reports ------------ - -If you find a bug, please raise a `GitHub issue -`_. Please include the following items in -a bug report: - -1. A minimal, self-contained snippet of Python code reproducing the problem. You can - format the code nicely using markdown, e.g.:: - - - ```python - import zarr - g = zarr.group() - # etc. - ``` - -2. An explanation of why the current behaviour is wrong/not desired, and what you - expect instead. - -3. Information about the version of Zarr, along with versions of dependencies and the - Python interpreter, and installation information. The version of Zarr can be obtained - from the ``zarr.__version__`` property. Please also state how Zarr was installed, - e.g., "installed via pip into a virtual environment", or "installed using conda". - Information about other packages installed can be obtained by executing ``pip freeze`` - (if using pip to install packages) or ``conda env export`` (if using conda to install - packages) from the operating system command prompt. 
The version of the Python - interpreter can be obtained by running a Python interactive session, e.g.:: - - $ python - Python 3.12.7 | packaged by conda-forge | (main, Oct 4 2024, 15:57:01) [Clang 17.0.6 ] on darwin - -Enhancement proposals ---------------------- - -If you have an idea about a new feature or some other improvement to Zarr, please raise a -`GitHub issue `_ first to discuss. - -We very much welcome ideas and suggestions for how to improve Zarr, but please bear in -mind that we are likely to be conservative in accepting proposals for new features. The -reasons for this are that we would like to keep the Zarr code base lean and focused on -a core set of functionalities, and available time for development, review and maintenance -of new features is limited. But if you have a great idea, please don't let that stop -you from posting it on GitHub, just please don't be offended if we respond cautiously. - -Contributing code and/or documentation --------------------------------------- - -Forking the repository -~~~~~~~~~~~~~~~~~~~~~~ - -The Zarr source code is hosted on GitHub at the following location: - -* `https://github.com/zarr-developers/zarr-python `_ - -You will need your own fork to work on the code. Go to the link above and hit -the `"Fork" `_ button. -Then clone your fork to your local machine:: - - $ git clone git@github.com:your-user-name/zarr-python.git - $ cd zarr-python - $ git remote add upstream git@github.com:zarr-developers/zarr-python.git - -Creating a development environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To work with the Zarr source code, it is recommended to use -`hatch `_ to create and manage development -environments. Hatch will automatically install all Zarr dependencies using the same -versions as are used by the core developers and continuous integration services. 
-Assuming you have a Python 3 interpreter already installed, and you have cloned the -Zarr source code and your current working directory is the root of the repository, -you can do something like the following:: - - $ pip install hatch - $ hatch env show # list all available environments - -To verify that your development environment is working, you can run the unit tests -for one of the test environments, e.g.:: - - $ hatch env run --env test.py3.12-2.1-optional run-pytest - -Creating a branch -~~~~~~~~~~~~~~~~~ - -Before you do any new work or submit a pull request, please open an issue on GitHub to -report the bug or propose the feature you'd like to add. - -It's best to synchronize your fork with the upstream repository, then create a -new, separate branch for each piece of work you want to do. E.g.:: - - git checkout main - git fetch upstream - git checkout -b shiny-new-feature upstream/main - git push -u origin shiny-new-feature - -This changes your working directory to the 'shiny-new-feature' branch. Keep any changes in -this branch specific to one bug or feature so it is clear what the branch brings to -Zarr. - -To update this branch with latest code from Zarr, you can retrieve the changes from -the main branch and perform a rebase:: - - git fetch upstream - git rebase upstream/main - -This will replay your commits on top of the latest Zarr git main. If this leads to -merge conflicts, these need to be resolved before submitting a pull request. -Alternatively, you can merge the changes in from upstream/main instead of rebasing, -which can be simpler:: - - git pull upstream main - -Again, any conflicts need to be resolved before submitting a pull request. - -Running the test suite -~~~~~~~~~~~~~~~~~~~~~~ - -Zarr includes a suite of unit tests. 
The simplest way to run the unit tests -is to activate your development environment -(see `creating a development environment`_ above) and invoke:: - - $ hatch env run --env test.py3.12-2.1-optional run-pytest - -All tests are automatically run via GitHub Actions for every pull -request and must pass before code can be accepted. Test coverage is -also collected automatically via the Codecov service. - -.. note:: - Previous versions of Zarr-Python made extensive use of doctests. These tests were - not maintained during the 3.0 refactor but may be brought back in the future. - See :issue:`2614` for more details. - -Code standards - using pre-commit -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -All code must conform to the PEP8 standard. Regarding line length, lines up to 100 -characters are allowed, although please try to keep under 90 wherever possible. - -``Zarr`` uses a set of ``pre-commit`` hooks and the ``pre-commit`` bot to format, -type-check, and prettify the codebase. ``pre-commit`` can be installed locally by -running:: - - $ python -m pip install pre-commit - -The hooks can be installed locally by running:: - - $ pre-commit install - -This would run the checks every time a commit is created locally. These checks will also run -on every commit pushed to an open PR, resulting in some automatic styling fixes by the -``pre-commit`` bot. The checks will by default only run on the files modified by a commit, -but the checks can be triggered for all the files by running:: - - $ pre-commit run --all-files - -If you would like to skip the failing checks and push the code for further discussion, use -the ``--no-verify`` option with ``git commit``. - - -Test coverage -~~~~~~~~~~~~~ - -.. note:: - Test coverage for Zarr-Python 3 is currently not at 100%. This is a known issue and help - is welcome to bring test coverage back to 100%. See :issue:`2613` for more details. 
- -Zarr strives to maintain 100% test coverage under the latest Python stable release -Both unit tests and docstring doctests are included when computing coverage. Running:: - - $ hatch env run --env test.py3.12-2.1-optional run-coverage - -will automatically run the test suite with coverage and produce a XML coverage report. -This should be 100% before code can be accepted into the main code base. - -You can also generate an HTML coverage report by running:: - - $ hatch env run --env test.py3.12-2.1-optional run-coverage-html - -When submitting a pull request, coverage will also be collected across all supported -Python versions via the Codecov service, and will be reported back within the pull -request. Codecov coverage must also be 100% before code can be accepted. - -Documentation -~~~~~~~~~~~~~ - -Docstrings for user-facing classes and functions should follow the -`numpydoc -`_ -standard, including sections for Parameters and Examples. All examples -should run and pass as doctests under Python 3.11. - -Zarr uses Sphinx for documentation, hosted on readthedocs.org. Documentation is -written in the RestructuredText markup language (.rst files) in the ``docs`` folder. -The documentation consists both of prose and API documentation. All user-facing classes -and functions are included in the API documentation, under the ``docs/api`` folder -using the `autodoc `_ -extension to sphinx. Any new features or important usage information should be included in the -user-guide (``docs/user-guide``). Any changes should also be included as a new file in the -:file:`changes` directory. - -The documentation can be built locally by running:: - - $ hatch --env docs run build - -The resulting built documentation will be available in the ``docs/_build/html`` folder. - -Hatch can also be used to serve continuously updating version of the documentation -during development at `http://0.0.0.0:8000/ `_. This can be done by running:: - - $ hatch --env docs run serve - -.. 
_changelog: - -Changelog -~~~~~~~~~ - -zarr-python uses `towncrier`_ to manage release notes. Most pull requests should -include at least one news fragment describing the changes. To add a release -note, you'll need the GitHub issue or pull request number and the type of your -change (``feature``, ``bugfix``, ``doc``, ``removal``, ``misc``). With that, run -```towncrier create``` with your development environment, which will prompt you -for the issue number, change type, and the news text:: - - towncrier create - -Alternatively, you can manually create the files in the ``changes`` directory -using the naming convention ``{issue-number}.{change-type}.rst``. - -See the `towncrier`_ docs for more. - -.. _towncrier: https://towncrier.readthedocs.io/en/stable/tutorial.html - -Development best practices, policies and procedures ---------------------------------------------------- - -The following information is mainly for core developers, but may also be of interest to -contributors. - -Merging pull requests -~~~~~~~~~~~~~~~~~~~~~ - -Pull requests submitted by an external contributor should be reviewed and approved by at least -one core developer before being merged. Ideally, pull requests submitted by a core developer -should be reviewed and approved by at least one other core developer before being merged. - -Pull requests should not be merged until all CI checks have passed (GitHub Actions -Codecov) against code that has had the latest main merged in. - -Compatibility and versioning policies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Versioning -"""""""""" -Versions of this library are identified by a triplet of integers with the form -``..``, for example ``3.0.4``. A release of ``zarr-python`` is associated with a new -version identifier. That new identifier is generated by incrementing exactly one of the components of -the previous version identifier by 1. 
When incrementing the ``major`` component of the version identifier, -the ``minor`` and ``patch`` components is reset to 0. When incrementing the minor component, -the patch component is reset to 0. - -Releases are classified by the library changes contained in that release. This classification -determines which component of the version identifier is incremented on release. - -* ``major`` releases (for example, ``2.18.0`` -> ``3.0.0``) are for changes that will - require extensive adaptation efforts from many users and downstream projects. - For example, breaking changes to widely-used user-facing APIs should only be applied in a major release. - - - Users and downstream projects should carefully consider the impact of a major release before - adopting it. - In advance of a major release, developers should communicate the scope of the upcoming changes, - and help users prepare for them. - -* ``minor`` releases (or example, ``3.0.0`` -> ``3.1.0``) are for changes that do not require - significant effort from most users or downstream downstream projects to respond to. API changes - are possible in minor releases if the burden on users imposed by those changes is sufficiently small. - - For example, a recently released API may need fixes or refinements that are breaking, but low impact - due to the recency of the feature. Such API changes are permitted in a minor release. - - - Minor releases are safe for most users and downstream projects to adopt. - - -* ``patch`` releases (for example, ``3.1.0`` -> ``3.1.1``) are for changes that contain no breaking - or behaviour changes for downstream projects or users. Examples of changes suitable for a patch release are - bugfixes and documentation improvements. - - - Users should always feel safe upgrading to a the latest patch release. - -Note that this versioning scheme is not consistent with `Semantic Versioning `_. 
-Contrary to SemVer, the Zarr library may release breaking changes in ``minor`` releases, or even -``patch`` releases under exceptional circumstances. But we should strive to avoid doing so. - -A better model for our versioning scheme is `Intended Effort Versioning `_, -or "EffVer". The guiding principle off EffVer is to categorize releases based on the *expected effort -required to upgrade to that release*. - -Zarr developers should make changes as smooth as possible for users. This means making -backwards-compatible changes wherever possible. When a backwards-incompatible change is necessary, -users should be notified well in advance, e.g. via informative deprecation warnings. - -Data format compatibility -^^^^^^^^^^^^^^^^^^^^^^^^^ - -The Zarr library is an implementation of a file format standard defined externally -- -see the `Zarr specifications website `_ for the list of -Zarr file format specifications. - - -If an existing Zarr format version changes, or a new version of the Zarr format is released, then -the Zarr library will generally require changes. It is very likely that a new Zarr format will -require extensive breaking changes to the Zarr library, and so support for a new Zarr format in the -Zarr library will almost certainly come in new ``major`` release. -When the Zarr library adds support for a new Zarr format, there may be a period of accelerated -changes as developers refine newly added APIs and deprecate old APIs. In such a transitional phase -breaking changes may be more frequent than usual. - - -Release procedure -~~~~~~~~~~~~~~~~~ - -.. note:: - - Most of the release process is now handled by GitHub workflow which should - automatically push a release to PyPI if a tag is pushed. - -Pre-release -""""""""""" -1. Make sure that all pull requests which will be included in the release - have been properly documented as changelog files in :file:`changes`. -2. Run ``towncrier build --version x.y.z`` to create the changelog. 
- -Releasing -""""""""" -To make a new release, go to -https://github.com/zarr-developers/zarr-python/releases and -click "Draft a new release". Choose a version number prefixed -with a `v` (e.g. `v0.0.0`). For pre-releases, include the -appropriate suffix (e.g. `v0.0.0a1` or `v0.0.0rc2`). - - -Set the description of the release to:: - - See release notes https://zarr.readthedocs.io/en/stable/release-notes.html#release-0-0-0 - -replacing the correct version numbers. For pre-release versions, -the URL should omit the pre-release suffix, e.g. "a1" or "rc1". - -Click on "Generate release notes" to auto-file the description. - -After creating the release, the documentation will be built on -https://readthedocs.io. Full releases will be available under -`/stable `_ while -pre-releases will be available under -`/latest `_. - -Post-release -"""""""""""" - -- Review and merge the pull request on the - `conda-forge feedstock `_ that will be - automatically generated. -- Create a new "Unreleased" section in the release notes diff --git a/docs/developers/index.rst b/docs/developers/index.rst deleted file mode 100644 index 4bccb3a469..0000000000 --- a/docs/developers/index.rst +++ /dev/null @@ -1,9 +0,0 @@ - -Developer's Guide ------------------ - -.. toctree:: - :maxdepth: 1 - - contributing - roadmap diff --git a/docs/developers/roadmap.rst b/docs/developers/roadmap.rst deleted file mode 100644 index d9fc32b775..0000000000 --- a/docs/developers/roadmap.rst +++ /dev/null @@ -1,696 +0,0 @@ -Roadmap -======= - -- Status: active -- Author: Joe Hamman -- Created On: October 31, 2023 -- Input from: - - - Davis Bennett / @d-v-b - - Norman Rzepka / @normanrz - - Deepak Cherian @dcherian - - Brian Davis / @monodeldiablo - - Oliver McCormack / @olimcc - - Ryan Abernathey / @rabernat - - Jack Kelly / @JackKelly - - Martin Durrant / @martindurant - -.. note:: - - This document was written in the early stages of the 3.0 refactor. 
Some - aspects of the design have changed since this was originally written. - Questions and discussion about the contents of this document should be directed to - `this GitHub Discussion `__. - -Introduction ------------- - -This document lays out a design proposal for version 3.0 of the -`Zarr-Python `__ package. A -specific focus of the design is to bring Zarr-Python’s API up to date -with the `Zarr V3 -specification `__, -with the hope of enabling the development of the many features and -extensions that motivated the V3 Spec. The ideas presented here are -expected to result in a major release of Zarr-Python (version 3.0) -including significant a number of breaking API changes. For clarity, -“V3” will be used to describe the version of the Zarr specification and -“3.0” will be used to describe the release tag of the Zarr-Python -project. - -Current status of V3 in Zarr-Python -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -During the development of the V3 Specification, a `prototype -implementation `__ -was added to the Zarr-Python library. Since that implementation, the V3 -spec evolved in significant ways and as a result, the Zarr-Python -library is now out of sync with the approved spec. Downstream libraries -(e.g. `Xarray `__) have added support -for this implementation and will need to migrate to the accepted spec -when its available in Zarr-Python. - -Goals ------ - -- Provide a complete implementation of Zarr V3 through the Zarr-Python - API -- Clear the way for exciting extensions / ZEPs - (i.e. `sharding `__, - `variable chunking `__, - etc.) -- Provide a developer API that can be used to implement and register V3 - extensions -- Improve the performance of Zarr-Python by streamlining the interface - between the Store layer and higher level APIs (e.g. Groups and - Arrays) -- Clean up the internal and user facing APIs -- Improve code quality and robustness (e.g. 
achieve 100% type hint - coverage) -- Align the Zarr-Python array API with the `array API - Standard `__ - -Examples of what 3.0 will enable? ---------------------------------- - -1. Reading and writing V3 spec-compliant groups and arrays -2. V3 extensions including sharding and variable chunking. -3. Improved performance by leveraging concurrency when - creating/reading/writing to stores (imagine a - ``create_hierarchy(zarr_objects)`` function). -4. User-developed extensions (e.g. storage-transformers) can be - registered with Zarr-Python at runtime - -Non-goals (of this document) ----------------------------- - -- Implementation of any unaccepted Zarr V3 extensions -- Major revisions to the Zarr V3 spec - -Requirements ------------- - -1. Read and write spec compliant V2 and V3 data -2. Limit unnecessary traffic to/from the store -3. Cleanly define the Array/Group/Store abstractions -4. Cleanly define how V2 will be supported going forward -5. Provide a clear roadmap to help users upgrade to 3.0 -6. Developer tools / hooks for registering extensions - -Design ------- - -Async API -~~~~~~~~~ - -Zarr-Python is an IO library. As such, supporting concurrent action -against the storage layer is critical to achieving acceptable -performance. The Zarr-Python 2 was not designed with asynchronous -computation in mind and as a result has struggled to effectively -leverage the benefits of concurrency. At one point, ``getitems`` and -``setitems`` support was added to the Zarr store model but that is only -used for operating on a set of chunks in a single variable. - -With Zarr-Python 3.0, we have the opportunity to revisit this design. -The proposal here is as follows: - -1. The ``Store`` interface will be entirely async. -2. On top of the async ``Store`` interface, we will provide an - ``AsyncArray`` and ``AsyncGroup`` interface. -3. Finally, the primary user facing API will be synchronous ``Array`` - and ``Group`` classes that wrap the async equivalents. 
- -**Examples** - -- **Store** - - .. code:: python - - class Store: - ... - async def get(self, key: str) -> bytes: - ... - async def get_partial_values(self, key_ranges: List[Tuple[str, Tuple[int, Optional[int]]]]) -> bytes: - ... - # (no sync interface here) - -- **Array** - - .. code:: python - - class AsyncArray: - ... - - async def getitem(self, selection: Selection) -> np.ndarray: - # the core logic for getitem goes here - - class Array: - _async_array: AsyncArray - - def __getitem__(self, selection: Selection) -> np.ndarray: - return sync(self._async_array.getitem(selection)) - -- **Group** - - .. code:: python - - class AsyncGroup: - ... - - async def create_group(self, path: str, **kwargs) -> AsyncGroup: - # the core logic for create_group goes here - - class Group: - _async_group: AsyncGroup - - def create_group(self, path: str, **kwargs) -> Group: - return sync(self._async_group.create_group(path, **kwargs)) - - **Internal Synchronization API** - -With the ``Store`` and core ``AsyncArray``/ ``AsyncGroup`` classes being -predominantly async, Zarr-Python will need an internal API to provide a -synchronous API. The proposal here is to use the approach in -`fsspec `__ -to provide a high-level ``sync`` function that takes an ``awaitable`` -and runs it in its managed IO Loop / thread. - -| **FAQ** 1. Why two levels of Arrays/groups? a. First, this is an - intentional decision and departure from the current Zarrita - implementation b. The idea is that users rarely want to mix - interfaces. Either they are working within an async context (currently - quite rare) or they are in a typical synchronous context. c. Splitting - the two will allow us to clearly define behavior on the ``AsyncObj`` - and simply wrap it in the ``SyncObj``. 2. What if a store is only has - a synchronous backend? a. First off, this is expected to be a fairly - rare occurrence. Most storage backends have async interfaces. b. 
But - in the event a storage backend doesn’t have a async interface, there - is nothing wrong with putting synchronous code in ``async`` methods. - There are approaches to enabling concurrent action through wrappers - like AsyncIO’s ``loop.run_in_executor`` (`ref - 1 `__, - `ref 2 `__, `ref - 3 `__, - `ref - 4 `__. -| 3. Will Zarr help manage the async contexts encouraged by some - libraries - (e.g. `AioBotoCore `__)? - a. Many async IO libraries require entering an async context before - interacting with the API. We expect some experimentation to be needed - here but the initial design will follow something close to what fsspec - does (`example in - s3fs `__). - 4. Why not provide a synchronous Store interface? a. We could but this - design is simpler. It would mean supporting it in the ``AsyncGroup`` - and ``AsyncArray`` classes which, may be more trouble than its worth. - Storage backends that do not have an async API will be encouraged to - wrap blocking calls in an async wrapper - (e.g. ``loop.run_in_executor``). - -Store API -~~~~~~~~~ - -The ``Store`` API is specified directly in the V3 specification. All V3 -stores should implement this abstract API, omitting Write and List -support as needed. As described above, all stores will be expected to -expose the required methods as async methods. - -**Example** - -.. code:: python - - class ReadWriteStore: - ... - async def get(self, key: str) -> bytes: - ... - - async def get_partial_values(self, key_ranges: List[Tuple[str, int, int]) -> bytes: - ... - - async def set(self, key: str, value: Union[bytes, bytearray, memoryview]) -> None: - ... # required for writable stores - - async def set_partial_values(self, key_start_values: List[Tuple[str, int, Union[bytes, bytearray, memoryview]]]) -> None: - ... # required for writable stores - - async def list(self) -> List[str]: - ... # required for listable stores - - async def list_prefix(self, prefix: str) -> List[str]: - ... 
# required for listable stores - - async def list_dir(self, prefix: str) -> List[str]: - ... # required for listable stores - - # additional (optional methods) - async def getsize(self, prefix: str) -> int: - ... - - async def rename(self, src: str, dest: str) -> None - ... - - -Recognizing that there are many Zarr applications today that rely on the -``MutableMapping`` interface supported by Zarr-Python 2, a wrapper store -will be developed to allow existing stores to plug directly into this -API. - -Array API -~~~~~~~~~ - -The user facing array interface will implement a subset of the `Array -API Standard `__. Most of the -computational parts of the Array API Standard don’t fit into Zarr right -now. That’s okay. What matters most is that we ensure we can give -downstream applications a compliant API. - -*Note, Zarr already does most of this so this is more about formalizing -the relationship than a substantial change in API.* - -+------------------------+------------------------+-------------------------+-------------------------+ -| | Included | Not Included | Unknown / Maybe Possible| -+========================+========================+=========================+=========================+ -| **Attributes** | ``dtype`` | ``mT`` | ``device`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``ndim`` | ``T`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``shape`` | | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``size`` | | | -+------------------------+------------------------+-------------------------+-------------------------+ -| **Methods** | ``__getitem__`` | ``__array_namespace__`` | ``to_device`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``__setitem__`` | ``__abs__`` | ``__bool__`` | 
-+------------------------+------------------------+-------------------------+-------------------------+ -| | ``__eq__`` | ``__add__`` | ``__complex__`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``__bool__`` | ``__and__`` | ``__dlpack__`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__floordiv__`` | ``__dlpack_device__`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__ge__`` | ``__float__`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__gt__`` | ``__index__`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__invert__`` | ``__int__`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__le__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__lshift__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__lt__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__matmul__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__mod__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__mul__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__ne__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__neg__`` | | 
-+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__or__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__pos__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__pow__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__rshift__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__sub__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__truediv__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | | ``__xor__`` | | -+------------------------+------------------------+-------------------------+-------------------------+ -| **Creation functions** | ``zeros`` | | ``arange`` | -| (``zarr.creation``) | | | | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``zeros_like`` | | ``asarray`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``ones`` | | ``eye`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``ones_like`` | | ``from_dlpack`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``full`` | | ``linspace`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``full_like`` | | ``meshgrid`` | -+------------------------+------------------------+-------------------------+-------------------------+ -| | ``empty`` | | ``tril`` | 
-+------------------------+------------------------+-------------------------+-------------------------+ -| | ``empty_like`` | | ``triu`` | -+------------------------+------------------------+-------------------------+-------------------------+ - -In addition to the core array API defined above, the Array class should -have the following Zarr specific properties: - -- ``.metadata`` (see Metadata Interface below) -- ``.attrs`` - (pulled from metadata object) -- ``.info`` - (repolicated from existing property †) - -*† In Zarr-Python 2, the info property listed the store to identify -initialized chunks. By default this will be turned off in 3.0 but will -be configurable.* - -**Indexing** - -Zarr-Python currently supports ``__getitem__`` style indexing and the -special ``oindex`` and ``vindex`` indexers. These are not part of the -current Array API standard (see -`data-apis/array-api#669 `__) -but they have been `proposed as a -NEP `__. -Zarr-Python will maintain these in 3.0. - -We are also exploring a new high-level indexing API that will enabled -optimized batch/concurrent loading of many chunks. We expect this to be -important to enable performant loading of data in the context of -sharding. See `this -discussion `__ -for more detail. - -Concurrent indexing across multiple arrays will be possible using the -AsyncArray API. - -**Async and Sync Array APIs** - -Most the logic to support Zarr Arrays will live in the ``AsyncArray`` -class. There are a few notable differences that should be called out. - -=============== ============ -Sync Method Async Method -=============== ============ -``__getitem__`` ``getitem`` -``__setitem__`` ``setitem`` -``__eq__`` ``equals`` -=============== ============ - -**Metadata interface** - -Zarr-Python 2.\* closely mirrors the V2 spec metadata schema in the -Array and Group classes. In 3.0, we plan to move the underlying metadata -representation to a separate interface (e.g. ``Array.metadata``). 
This -interface will return either a ``V2ArrayMetadata`` or -``V3ArrayMetadata`` object (both will inherit from a parent -``ArrayMetadataABC`` class. The ``V2ArrayMetadata`` and -``V3ArrayMetadata`` classes will be responsible for producing valid JSON -representations of their metadata, and yielding a consistent view to the -``Array`` or ``Group`` class. - -Group API -~~~~~~~~~ - -The main question is how closely we should follow the existing -Zarr-Python implementation / ``MutableMapping`` interface. The table -below shows the primary ``Group`` methods in Zarr-Python 2 and attempts -to identify if and how they would be implemented in 3.0. - -+---------------------+------------------+------------------+-----------------------+ -| V2 Group Methods | ``AsyncGroup`` | ``Group`` | ``h5py_compat.Group`` | -+=====================+==================+==================+=======================+ -| ``__len__`` | ``length`` | ``__len__`` | ``__len__`` | -+---------------------+------------------+------------------+-----------------------+ -| ``__iter__`` | ``__aiter__`` | ``__iter__`` | ``__iter__`` | -+---------------------+------------------+------------------+-----------------------+ -| ``__contains__`` | ``contains`` | ``__contains__`` | ``__contains__`` | -+---------------------+------------------+------------------+-----------------------+ -| ``__getitem__`` | ``getitem`` | ``__getitem__`` | ``__getitem__`` | -+---------------------+------------------+------------------+-----------------------+ -| ``__enter__`` | N/A | N/A | ``__enter__`` | -+---------------------+------------------+------------------+-----------------------+ -| ``__exit__`` | N/A | N/A | ``__exit__`` | -+---------------------+------------------+------------------+-----------------------+ -| ``group_keys`` | ``group_keys`` | ``group_keys`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``groups`` | ``groups`` | ``groups`` | N/A | 
-+---------------------+------------------+------------------+-----------------------+ -| ``array_keys`` | ``array_key`` | ``array_keys`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``arrays`` | ``arrays`` | ``arrays`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``visit`` | ? | ? | ``visit`` | -+---------------------+------------------+------------------+-----------------------+ -| ``visitkeys`` | ? | ? | ? | -+---------------------+------------------+------------------+-----------------------+ -| ``visitvalues`` | ? | ? | ? | -+---------------------+------------------+------------------+-----------------------+ -| ``visititems`` | ? | ? | ``visititems`` | -+---------------------+------------------+------------------+-----------------------+ -| ``tree`` | ``tree`` | ``tree`` | ``Both`` | -+---------------------+------------------+------------------+-----------------------+ -| ``create_group`` | ``create_group`` | ``create_group`` | ``create_group`` | -+---------------------+------------------+------------------+-----------------------+ -| ``require_group`` | N/A | N/A | ``require_group`` | -+---------------------+------------------+------------------+-----------------------+ -| ``create_groups`` | ? | ? | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``require_groups`` | ? | ? | ? 
| -+---------------------+------------------+------------------+-----------------------+ -| ``create_dataset`` | N/A | N/A | ``create_dataset`` | -+---------------------+------------------+------------------+-----------------------+ -| ``require_dataset`` | N/A | N/A | ``require_dataset`` | -+---------------------+------------------+------------------+-----------------------+ -| ``create`` | ``create_array`` | ``create_array`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``empty`` | ``empty`` | ``empty`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``zeros`` | ``zeros`` | ``zeros`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``ones`` | ``ones`` | ``ones`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``full`` | ``full`` | ``full`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``array`` | ``create_array`` | ``create_array`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``empty_like`` | ``empty_like`` | ``empty_like`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``zeros_like`` | ``zeros_like`` | ``zeros_like`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``ones_like`` | ``ones_like`` | ``ones_like`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``full_like`` | ``full_like`` | ``full_like`` | N/A | -+---------------------+------------------+------------------+-----------------------+ -| ``move`` | ``move`` | ``move`` | ``move`` | -+---------------------+------------------+------------------+-----------------------+ - -**``zarr.h5compat.Group``** --- -Zarr-Python 2.\* made an attempt to align its API 
with that of -`h5py `__. With 3.0, we will -relax this alignment in favor of providing an explicit compatibility -module (``zarr.h5py_compat``). This module will expose the ``Group`` and -``Dataset`` APIs that map to Zarr-Python’s ``Group`` and ``Array`` -objects. - -Creation API -~~~~~~~~~~~~ - -Zarr-Python 2.\* bundles together the creation and serialization of Zarr -objects. Zarr-Python 3.\* will make it possible to create objects in -memory separate from serializing them. This will specifically enable -writing hierarchies of Zarr objects in a single batch step. For example: - -.. code:: python - - - arr1 = Array(shape=(10, 10), path="foo/bar", dtype="i4", store=store) - arr2 = Array(shape=(10, 10), path="foo/spam", dtype="f8", store=store) - - arr1.save() - arr2.save() - - # or equivalently - - zarr.save_many([arr1 ,arr2]) - -*Note: this batch creation API likely needs additional design effort -prior to implementation.* - -Plugin API -~~~~~~~~~~ - -Zarr V3 was designed to be extensible at multiple layers. Zarr-Python -will support these extensions through a combination of `Abstract Base -Classes `__ (ABCs) and -`Entrypoints `__. - -**ABCs** - -Zarr V3 will expose Abstract base classes for the following objects: - -- ``Store``, ``ReadStore``, ``ReadWriteStore``, ``ReadListStore``, and - ``ReadWriteListStore`` -- ``BaseArray``, ``SynchronousArray``, and ``AsynchronousArray`` -- ``BaseGroup``, ``SynchronousGroup``, and ``AsynchronousGroup`` -- ``Codec``, ``ArrayArrayCodec``, ``ArrayBytesCodec``, - ``BytesBytesCodec`` - -**Entrypoints** - -Lots more thinking here but the idea here is to provide entrypoints for -``data type``, ``chunk grid``, ``chunk key encoding``, ``codecs``, -``storage_transformers`` and ``stores``. 
These might look something -like: - -:: - - entry_points=""" - [zarr.codecs] - blosc_codec=codec_plugin:make_blosc_codec - zlib_codec=codec_plugin:make_zlib_codec - """ - -Python type hints and static analysis -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Target 100% Mypy coverage in 3.0 source. - -Observability -~~~~~~~~~~~~~ - -A persistent problem in Zarr-Python is diagnosing problems that span -many parts of the stack. To address this in 3.0, we will add a basic -logging framework that can be used to debug behavior at various levels -of the stack. We propose to add the separate loggers for the following -namespaces: - -- ``array`` -- ``group`` -- ``store`` -- ``codec`` - -These should be documented such that users know how to activate them and -developers know how to use them when developing extensions. - -Dependencies -~~~~~~~~~~~~ - -Today, Zarr-Python has the following required dependencies: - -.. code:: python - - dependencies = [ - 'asciitree', - 'numpy>=1.20,!=1.21.0', - 'fasteners', - 'numcodecs>=0.10.0', - ] - -What other dependencies should be considered? - -1. Attrs - Zarrita makes extensive use of the Attrs library -2. Fsspec - Zarrita has a hard dependency on Fsspec. This could be - easily relaxed though. - -Breaking changes relative to Zarr-Python 2.\* ---------------------------------------------- - -1. H5py compat moved to a stand alone module? -2. ``Group.__getitem__`` support moved to ``Group.members.__getitem__``? -3. Others? - -Open questions --------------- - -1. How to treat V2 - - a. Note: Zarrita currently implements a separate ``V2Array`` and - ``V3Array`` classes. This feels less than ideal. - b. We could easily convert metadata from v2 to the V3 Array, but what - about writing? - c. Ideally, we don’t have completely separate code paths. But if its - too complicated to support both within one interface, its probably - better. - -2. How and when to remove the current implementation of V3. - - a. 
It’s hidden behind a hard-to-use feature flag so we probably don’t - need to do anything. - -3. How to model runtime configuration? -4. Which extensions belong in Zarr-Python and which belong in separate - packages? - - a. We don’t need to take a strong position on this here. It’s likely - that someone will want to put Sharding in. That will be useful to - develop in parallel because it will give us a good test case for - the plugin interface. - -Testing -------- - -Zarr-python 3.0 adds a major new dimension to Zarr: Async support. This -also comes with a compatibility risk, we will need to thoroughly test -support in key execution environments. Testing plan: - Reuse the -existing test suite for testing the ``v3`` API. - ``xfail`` tests that -expose breaking changes with ``3.0 - breaking change`` description. This -will help identify additional and/or unintentional breaking changes - -Rework tests that were only testing internal APIs. - Add a set of -functional / integration tests targeting real-world workflows in various -contexts (e.g. w/ Dask) - -Development process -------------------- - -Zarr-Python 3.0 will introduce a number of new APIs and breaking changes -to existing APIs. In order to facilitate ongoing support for Zarr-Python -2.*, we will take on the following development process: - -- Create a ``v3`` branch that can be use for developing the core - functionality apart from the ``main`` branch. This will allow us to - support ongoing work and bug fixes on the ``main`` branch. -- Put the ``3.0`` APIs inside a ``zarr.v3`` module. Imports from this - namespace will all be new APIs that users can develop and test - against once the ``v3`` branch is merged to ``main``. -- Kickstart the process by pulling in the current state of ``zarrita`` - - which has many of the features described in this design. 
-- Release a series of 2.\* releases with the ``v3`` namespace -- When ``v3`` is complete, move contents of ``v3`` to the package root - -**Milestones** - -Below are a set of specific milestones leading toward the completion of -this process. As work begins, we expect this list to grow in -specificity. - -1. Port current version of Zarrita to Zarr-Python -2. Formalize Async interface by splitting ``Array`` and ``Group`` - objects into Sync and Async versions -3. Implement “fancy” indexing operations on the ``AsyncArray`` -4. Implement an abstract base class for the ``Store`` interface and a - wrapper ``Store`` to make use of existing ``MutableMapping`` stores. -5. Rework the existing unit test suite to use the ``v3`` namespace. -6. Develop a plugin interface for extensions -7. Develop a set of functional and integration tests -8. Work with downstream libraries (Xarray, Dask, etc.) to test new APIs - -TODOs ------ - -The following subjects are not covered in detail above but perhaps -should be. Including them here so they are not forgotten. - -1. [Store] Should Zarr provide an API for caching objects after first - read/list/etc. Read only stores? -2. [Array] buffer protocol support -3. [Array] ``meta_array`` support -4. [Extensions] Define how Zarr-Python will consume the various plugin - types -5. [Misc] H5py compatibility requires a bit more work and a champion to - drive it forward. -6. [Misc] Define ``chunk_store`` API in 3.0 -7. [Misc] Define ``synchronizer`` API in 3.0 - -References ----------- - -1. `Zarr-Python - repository `__ -2. `Zarr core specification (version 3.0) — Zarr specs - documentation `__ -3. `Zarrita repository `__ -4. `Async-Zarr `__ -5. 
`Zarr-Python Discussion - Topic `__ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000000..240f939d90 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,230 @@ +# Zarr-Python + +**Useful links**: +[Source Repository](https://github.com/zarr-developers/zarr-python) | +[Issue Tracker](https://github.com/zarr-developers/zarr-python/issues) | +[Developer Chat](https://ossci.zulipchat.com/) | +[Zarr specifications](https://zarr-specs.readthedocs.io) + + +Zarr is a powerful library for storage of n-dimensional arrays, supporting chunking, +compression, and various backends, making it a versatile choice for scientific and +large-scale data. + +Zarr-Python is a Python library for reading and writing Zarr groups and arrays. Highlights include: + +* Specification support for both Zarr format 2 and 3. +* Create and read from N-dimensional arrays using NumPy-like semantics. +* Flexible storage enables reading and writing from local, cloud and in-memory stores. +* High performance: Enables fast I/O with support for asynchronous I/O and multi-threading. +* Extensible: Customizable with user-defined codecs and stores. + +## Installation + +Zarr requires Python 3.11 or higher. You can install it via `pip`: + +```bash +pip install zarr +``` + +or `conda`: + +```bash +conda install --channel conda-forge zarr +``` + +## Quick Start 🚀 + +This section will help you get up and running with +the Zarr library in Python to efficiently manage and analyze multi-dimensional arrays. 
+ +### Creating an Array + +To get started, you can create a simple Zarr array: + +```python +import zarr +import numpy as np + +# Create a 2D Zarr array +z = zarr.create_array( + store="data/example-1.zarr", + shape=(100, 100), + chunks=(10, 10), + dtype="f4" +) + +# Assign data to the array +z[:, :] = np.random.random((100, 100)) +z.info + +# Type : Array +# Zarr format : 3 +# Data type : DataType.float32 +# Shape : (100, 100) +# Chunk shape : (10, 10) +# Order : C +# Read-only : False +# Store type : LocalStore +# Codecs : [{'endian': }, {'level': 0, 'checksum': False}] +# No. bytes : 40000 (39.1K) +``` + +Here, we created a 2D array of shape `(100, 100)`, chunked into blocks of +`(10, 10)`, and filled it with random floating-point data. This array was +written to a `LocalStore` in the `data/example-1.zarr` directory. + +#### Compression and Filters + +Zarr supports data compression and filters. For example, to use Blosc compression: + +```python +z = zarr.create_array( + "data/example-3.zarr", + mode="w", shape=(100, 100), + chunks=(10, 10), dtype="f4", + compressors=zarr.codecs.BloscCodec(cname="zstd", clevel=3, shuffle=zarr.codecs.BloscShuffle.shuffle) +) +z[:, :] = np.random.random((100, 100)) +z.info +# Type : Array +# Zarr format : 3 +# Data type : DataType.float32 +# Shape : (100, 100) +# Chunk shape : (10, 10) +# Order : C +# Read-only : False +# Store type : LocalStore +# Codecs : [{'endian': }, {'level': 0, 'checksum': False}] +# No. bytes : 40000 (39.1K) +``` + +This compresses the data using the Zstandard codec with shuffle enabled for better compression. 
+ +### Hierarchical Groups + +Zarr allows you to create hierarchical groups, similar to directories: + +```python +# Create nested groups and add arrays +root = zarr.group("data/example-2.zarr") +foo = root.create_group(name="foo") +bar = root.create_array( + name="bar", shape=(100, 10), chunks=(10, 10), dtype="f4" +) +spam = foo.create_array(name="spam", shape=(10,), dtype="i4") + +# Assign values +bar[:, :] = np.random.random((100, 10)) +spam[:] = np.arange(10) + +# print the hierarchy +root.tree() +# / +# ├── bar (100, 10) float32 +# └── foo +# └── spam (10,) int32 +``` + +This creates a root group containing an array `bar` and a subgroup `foo`, which in turn holds the array `spam`. + +#### Batch Hierarchy Creation + +Zarr provides tools for creating a collection of arrays and groups with a single function call. +Suppose we want to copy existing groups and arrays into a new storage backend: + +```python +# Create nested groups and add arrays +root = zarr.group("data/example-3.zarr", attributes={'name': 'root'}) +foo = root.create_group(name="foo") +bar = root.create_array( + name="bar", shape=(100, 10), chunks=(10, 10), dtype="f4" +) +nodes = {'': root.metadata} | {k: v.metadata for k,v in root.members()} +print(nodes) +from zarr.storage import MemoryStore +new_nodes = dict(zarr.create_hierarchy(store=MemoryStore(), nodes=nodes)) +new_root = new_nodes[''] +assert new_root.attrs == root.attrs +``` + +Note that `zarr.create_hierarchy` will only initialize arrays and groups -- copying array data must +be done in a separate step. + +### Persistent Storage + +Zarr supports persistent storage to disk or cloud-compatible backends. While examples above +utilized a `zarr.storage.LocalStore`, a number of other storage options are available.
+ +Zarr integrates seamlessly with cloud object storage such as Amazon S3 and Google Cloud Storage +using external libraries like [s3fs](https://s3fs.readthedocs.io) or +[gcsfs](https://gcsfs.readthedocs.io): + +```python +import s3fs + +z = zarr.create_array("s3://example-bucket/foo", mode="w", shape=(100, 100), chunks=(10, 10), dtype="f4") +z[:, :] = np.random.random((100, 100)) +``` + +A single-file store can also be created using the `zarr.storage.ZipStore`: + +```python +# Store the array in a ZIP file +store = zarr.storage.ZipStore("data/example-3.zip", mode='w') + +z = zarr.create_array( + store=store, + mode="w", + shape=(100, 100), + chunks=(10, 10), + dtype="f4" +) + +# write to the array +z[:, :] = np.random.random((100, 100)) + +# the ZipStore must be explicitly closed +store.close() +``` + +To open an existing array from a ZIP file: + +```python +# Open the ZipStore in read-only mode +store = zarr.storage.ZipStore("data/example-3.zip", read_only=True) + +z = zarr.open_array(store, mode='r') + +# read the data as a NumPy Array +z[:] +# array([[0.66734236, 0.15667458, 0.98720884, ..., 0.36229587, 0.67443246, +# 0.34315267], +# [0.65787303, 0.9544212 , 0.4830079 , ..., 0.33097172, 0.60423803, +# 0.45621237], +# [0.27632037, 0.9947008 , 0.42434934, ..., 0.94860053, 0.6226942 , +# 0.6386924 ], +# ..., +# [0.12854576, 0.934397 , 0.19524333, ..., 0.11838563, 0.4967675 , +# 0.43074256], +# [0.82029045, 0.4671437 , 0.8090906 , ..., 0.7814118 , 0.42650765, +# 0.95929915], +# [0.4335856 , 0.7565437 , 0.7828931 , ..., 0.48119593, 0.66220033, +# 0.6652362 ]], shape=(100, 100), dtype=float32) +``` + +Read more about Zarr's storage options in the [User Guide](user-guide/storage.md). + +## Project Status + +More information about the Zarr format can be found on the [main website](https://zarr.dev). + +If you are using Zarr-Python, we would [love to hear about it](https://github.com/zarr-developers/community/issues/19). 
+ +### Funding and Support +The project is fiscally sponsored by [NumFOCUS](https://numfocus.org/), a US +501(c)(3) public charity, and development has been supported by the +[MRC Centre for Genomics and Global Health](https://www.cggh.org) +and the [Chan Zuckerberg Initiative](https://chanzuckerberg.com/). + +[Donate to Zarr](https://numfocus.org/donate-to-zarr) to support the project! diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 83d427e290..0000000000 --- a/docs/index.rst +++ /dev/null @@ -1,113 +0,0 @@ -.. _zarr_docs_mainpage: - -*********** -Zarr-Python -*********** - -.. toctree:: - :maxdepth: 1 - :hidden: - - quickstart - user-guide/index - API reference - release-notes - developers/index - about - -**Version**: |version| - -**Useful links**: -`Source Repository `_ | -`Issue Tracker `_ | -`Developer Chat `_ | -`Zarr specifications `_ - -Zarr-Python is a Python library for reading and writing Zarr groups and arrays. Highlights include: - -* Specification support for both Zarr format 2 and 3. -* Create and read from N-dimensional arrays using NumPy-like semantics. -* Flexible storage enables reading and writing from local, cloud and in-memory stores. -* High performance: Enables fast I/O with support for asynchronous I/O and multi-threading. -* Extensible: Customizable with user-defined codecs and stores. - -.. grid:: 2 - - .. grid-item-card:: - :img-top: _static/index_getting_started.svg - - Quick Start - ^^^^^^^^^^^ - - New to Zarr? Check out the quick start guide. It contains a brief - introduction to Zarr's main concepts and links to additional tutorials. - - +++ - - .. button-ref:: quickstart - :expand: - :color: dark - :click-parent: - - To the Quick Start - - .. grid-item-card:: - :img-top: _static/index_user_guide.svg - - Guide - ^^^^^ - - A detailed guide for how to use Zarr-Python. - - +++ - - .. button-ref:: user-guide/index - :expand: - :color: dark - :click-parent: - - To the user guide - - .. 
grid-item-card:: - :img-top: _static/index_api.svg - - API Reference - ^^^^^^^^^^^^^ - - The reference guide contains a detailed description of the functions, - modules, and objects included in Zarr. The reference describes how the - methods work and which parameters can be used. It assumes that you have an - understanding of the key concepts. - - +++ - - .. button-ref:: api/zarr/index - :expand: - :color: dark - :click-parent: - - To the API reference - - .. grid-item-card:: - :img-top: _static/index_contribute.svg - - Contributor's Guide - ^^^^^^^^^^^^^^^^^^^ - - Want to contribute to Zarr? We welcome contributions in the form of bug reports, - bug fixes, documentation, enhancement proposals and more. The contributing guidelines - will guide you through the process of improving Zarr. - - +++ - - .. button-ref:: developers/contributing - :expand: - :color: dark - :click-parent: - - To the contributor's guide - - -**Download documentation**: `PDF/Zipped HTML `_ - -.. _NumCodecs: https://numcodecs.readthedocs.io diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 0000000000..d61a1f54dc --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,9 @@ + +{% extends "base.html" %} + +{% block outdated %} + You're not viewing the latest version. + + Click here to go to latest. 
+ +{% endblock %} diff --git a/docs/overrides/stylesheets/extra.css b/docs/overrides/stylesheets/extra.css new file mode 100644 index 0000000000..3e2ef3d330 --- /dev/null +++ b/docs/overrides/stylesheets/extra.css @@ -0,0 +1,52 @@ +:root, +[data-md-color-scheme="default"] { + /* --md-primary-fg-color: #cf3f02; + --md-default-fg-color: #443f3f; */ + --boxShadowD: 0px 12px 24px 0px rgba(68, 63, 63, 0.08), + 0px 0px 4px 0px rgba(68, 63, 63, 0.08); +} +body { + margin: 0; + padding: 0; + /* font-size: 16px; */ +} +h1, +h2, +h3, +h4, +h5, +h6 { + font-family: var(--md-heading-font); + font-weight: bold; +} +.md-typeset h1, +.md-typeset h2 { + font-weight: normal; + color: var(--md-default-fg-color); +} +.md-typeset h3, +.md-typeset h4 { + font-weight: bold; + color: var(--md-default-fg-color); +} +.md-button, +.md-typeset .md-button { + font-family: var(--md-heading-font); +} +.md-content .supheading { + font-family: var(--md-heading-font); + text-transform: uppercase; + color: var(--md-primary-fg-color); + font-size: 0.75rem; + font-weight: bold; +} + +.md-header__button.md-logo img, +.md-header__button.md-logo svg { + height: 2rem; + width: auto; +} + +.md-header { + padding: 0.2rem 0; +} diff --git a/docs/quickstart.rst b/docs/quickstart.rst deleted file mode 100644 index 66bdae2a2e..0000000000 --- a/docs/quickstart.rst +++ /dev/null @@ -1,209 +0,0 @@ -.. only:: doctest - - >>> import shutil - >>> shutil.rmtree('data', ignore_errors=True) - >>> - >>> import numpy as np - >>> np.random.seed(0) - -Quickstart -========== - -Welcome to the Zarr-Python Quickstart guide! This page will help you get up and running with -the Zarr library in Python to efficiently manage and analyze multi-dimensional arrays. - -Zarr is a powerful library for storage of n-dimensional arrays, supporting chunking, -compression, and various backends, making it a versatile choice for scientific and -large-scale data. - -Installation ------------- - -Zarr requires Python 3.11 or higher. 
You can install it via `pip`: - -.. code-block:: bash - - pip install zarr - -or `conda`: - -.. code-block:: bash - - conda install --channel conda-forge zarr - -Creating an Array ------------------ - -To get started, you can create a simple Zarr array:: - - >>> import zarr - >>> import numpy as np - >>> - >>> # Create a 2D Zarr array - >>> z = zarr.create_array( - ... store="data/example-1.zarr", - ... shape=(100, 100), - ... chunks=(10, 10), - ... dtype="f4" - ... ) - >>> - >>> # Assign data to the array - >>> z[:, :] = np.random.random((100, 100)) - >>> z.info - Type : Array - Zarr format : 3 - Data type : DataType.float32 - Shape : (100, 100) - Chunk shape : (10, 10) - Order : C - Read-only : False - Store type : LocalStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] - No. bytes : 40000 (39.1K) - -Here, we created a 2D array of shape ``(100, 100)``, chunked into blocks of -``(10, 10)``, and filled it with random floating-point data. This array was -written to a ``LocalStore`` in the ``data/example-1.zarr`` directory. - -Compression and Filters -~~~~~~~~~~~~~~~~~~~~~~~ - -Zarr supports data compression and filters. For example, to use Blosc compression:: - - >>> z = zarr.create_array( - ... "data/example-3.zarr", - ... mode="w", shape=(100, 100), - ... chunks=(10, 10), dtype="f4", - ... compressors=zarr.codecs.BloscCodec(cname="zstd", clevel=3, shuffle=zarr.codecs.BloscShuffle.shuffle) - ... ) - >>> z[:, :] = np.random.random((100, 100)) - >>> - >>> z.info - Type : Array - Zarr format : 3 - Data type : DataType.float32 - Shape : (100, 100) - Chunk shape : (10, 10) - Order : C - Read-only : False - Store type : LocalStore - Codecs : [{'endian': }, {'level': 0, 'checksum': False}] - No. bytes : 40000 (39.1K) - -This compresses the data using the Zstandard codec with shuffle enabled for better compression. 
- -Hierarchical Groups -------------------- - -Zarr allows you to create hierarchical groups, similar to directories:: - - >>> # Create nested groups and add arrays - >>> root = zarr.group("data/example-2.zarr") - >>> foo = root.create_group(name="foo") - >>> bar = root.create_array( - ... name="bar", shape=(100, 10), chunks=(10, 10), dtype="f4" - ... ) - >>> spam = foo.create_array(name="spam", shape=(10,), dtype="i4") - >>> - >>> # Assign values - >>> bar[:, :] = np.random.random((100, 10)) - >>> spam[:] = np.arange(10) - >>> - >>> # print the hierarchy - >>> root.tree() - / - ├── bar (100, 10) float32 - └── foo - └── spam (10,) int32 - - -This creates a group with two datasets: ``foo`` and ``bar``. - -Batch Hierarchy Creation -~~~~~~~~~~~~~~~~~~~~~~~~ - -Zarr provides tools for creating a collection of arrays and groups with a single function call. -Suppose we want to copy existing groups and arrays into a new storage backend: - - >>> # Create nested groups and add arrays - >>> root = zarr.group("data/example-3.zarr", attributes={'name': 'root'}) - >>> foo = root.create_group(name="foo") - >>> bar = root.create_array( - ... name="bar", shape=(100, 10), chunks=(10, 10), dtype="f4" - ... ) - >>> nodes = {'': root.metadata} | {k: v.metadata for k,v in root.members()} - >>> print(nodes) - >>> from zarr.storage import MemoryStore - >>> new_nodes = dict(zarr.create_hierarchy(store=MemoryStore(), nodes=nodes)) - >>> new_root = new_nodes[''] - >>> assert new_root.attrs == root.attrs - -Note that :func:`zarr.create_hierarchy` will only initialize arrays and groups -- copying array data must -be done in a separate step. - -Persistent Storage ------------------- - -Zarr supports persistent storage to disk or cloud-compatible backends. While examples above -utilized a :class:`zarr.storage.LocalStore`, a number of other storage options are available. 
- -Zarr integrates seamlessly with cloud object storage such as Amazon S3 and Google Cloud Storage -using external libraries like `s3fs `_ or -`gcsfs `_:: - - >>> import s3fs # doctest: +SKIP - >>> - >>> z = zarr.create_array("s3://example-bucket/foo", mode="w", shape=(100, 100), chunks=(10, 10), dtype="f4") # doctest: +SKIP - >>> z[:, :] = np.random.random((100, 100)) # doctest: +SKIP - -A single-file store can also be created using the the :class:`zarr.storage.ZipStore`:: - - >>> # Store the array in a ZIP file - >>> store = zarr.storage.ZipStore("data/example-3.zip", mode='w') - >>> - >>> z = zarr.create_array( - ... store=store, - ... mode="w", - ... shape=(100, 100), - ... chunks=(10, 10), - ... dtype="f4" - ... ) - >>> - >>> # write to the array - >>> z[:, :] = np.random.random((100, 100)) - >>> - >>> # the ZipStore must be explicitly closed - >>> store.close() - -To open an existing array from a ZIP file:: - - >>> # Open the ZipStore in read-only mode - >>> store = zarr.storage.ZipStore("data/example-3.zip", read_only=True) - >>> - >>> z = zarr.open_array(store, mode='r') - >>> - >>> # read the data as a NumPy Array - >>> z[:] - array([[0.66734236, 0.15667458, 0.98720884, ..., 0.36229587, 0.67443246, - 0.34315267], - [0.65787303, 0.9544212 , 0.4830079 , ..., 0.33097172, 0.60423803, - 0.45621237], - [0.27632037, 0.9947008 , 0.42434934, ..., 0.94860053, 0.6226942 , - 0.6386924 ], - ..., - [0.12854576, 0.934397 , 0.19524333, ..., 0.11838563, 0.4967675 , - 0.43074256], - [0.82029045, 0.4671437 , 0.8090906 , ..., 0.7814118 , 0.42650765, - 0.95929915], - [0.4335856 , 0.7565437 , 0.7828931 , ..., 0.48119593, 0.66220033, - 0.6652362 ]], shape=(100, 100), dtype=float32) - -Read more about Zarr's storage options in the :ref:`User Guide `. 
- -Next Steps ----------- - -Now that you're familiar with the basics, explore the following resources: - -- `User Guide `_ -- `API Reference `_ diff --git a/docs/release-notes.md b/docs/release-notes.md new file mode 100644 index 0000000000..0f576b4f4e --- /dev/null +++ b/docs/release-notes.md @@ -0,0 +1,210 @@ +# Release notes + +## 3.0.8 (2025-05-19) + +> **Warning** +> In versions 3.0.0 to 3.0.7 opening arrays or groups with `mode='a'` (the default for many builtin functions) +> would cause any existing paths in the store to be deleted. This is fixed in 3.0.8, and +> we recommend all users upgrade to avoid this bug that could cause unintentional data loss. + +### Features + +- Added a `print_debug_info` function for bug reports. ([#2913](https://github.com/zarr-developers/zarr-python/issues/2913)) + +### Bugfixes + +- Fix a bug that prevented the number of initialized chunks being counted properly. ([#2862](https://github.com/zarr-developers/zarr-python/issues/2862)) +- Fixed sharding with GPU buffers. ([#2978](https://github.com/zarr-developers/zarr-python/issues/2978)) +- Fix structured `dtype` fill value serialization for consolidated metadata ([#2998](https://github.com/zarr-developers/zarr-python/issues/2998)) +- It is now possible to specify no compressor when creating a zarr format 2 array. + This can be done by passing `compressor=None` to the various array creation routines. + + The default behaviour of automatically choosing a suitable default compressor remains if the compressor argument is not given. + To reproduce the behaviour in previous zarr-python versions when `compressor=None` was passed, pass `compressor='auto'` instead. ([#3039](https://github.com/zarr-developers/zarr-python/issues/3039)) +- Fixed the typing of `dimension_names` arguments throughout so that it now accepts iterables that contain `None` alongside `str`. 
([#3045](https://github.com/zarr-developers/zarr-python/issues/3045)) +- Using various functions to open data with `mode='a'` no longer deletes existing data in the store. ([#3062](https://github.com/zarr-developers/zarr-python/issues/3062)) +- Internally use `typesize` constructor parameter for `numcodecs.blosc.Blosc` to improve compression ratios back to the v2-package levels. ([#2962](https://github.com/zarr-developers/zarr-python/issues/2962)) +- Specifying the memory order of Zarr format 2 arrays using the `order` keyword argument has been fixed. ([#2950](https://github.com/zarr-developers/zarr-python/issues/2950)) + +### Misc + +- [#2972](https://github.com/zarr-developers/zarr-python/issues/2972), [#3027](https://github.com/zarr-developers/zarr-python/issues/3027), [#3049](https://github.com/zarr-developers/zarr-python/issues/3049) + +## 3.0.7 (2025-04-22) + +### Features + +- Add experimental ObjectStore storage class based on obstore. ([#1661](https://github.com/zarr-developers/zarr-python/issues/1661)) +- Add `zarr.from_array` using concurrent streaming of source data ([#2622](https://github.com/zarr-developers/zarr-python/issues/2622)) + +### Bugfixes + +- 0-dimensional arrays are now returning a scalar. Therefore, the return type of `__getitem__` changed + to NDArrayLikeOrScalar. This change is to make the behavior of 0-dimensional arrays consistent with + `numpy` scalars. ([#2718](https://github.com/zarr-developers/zarr-python/issues/2718)) +- Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization ([#2802](https://github.com/zarr-developers/zarr-python/issues/2802)) +- Fixes `ConsolidatedMetadata` serialization of `nan`, `inf`, and `-inf` to be + consistent with the behavior of `ArrayMetadata`. ([#2996](https://github.com/zarr-developers/zarr-python/issues/2996)) + +### Improved Documentation + +- Updated the 3.0 migration guide to include the removal of "." 
syntax for getting group members. ([#2991](https://github.com/zarr-developers/zarr-python/issues/2991), [#2997](https://github.com/zarr-developers/zarr-python/issues/2997)) + +### Misc + +- Define a new versioning policy based on Effective Effort Versioning. This replaces the old Semantic + Versioning-based policy. ([#2924](https://github.com/zarr-developers/zarr-python/issues/2924), [#2910](https://github.com/zarr-developers/zarr-python/issues/2910)) +- Make warning filters in the tests more specific, so warnings emitted by tests added in the future + are more likely to be caught instead of ignored. ([#2714](https://github.com/zarr-developers/zarr-python/issues/2714)) +- Avoid an unnecessary memory copy when writing Zarr to a local file ([#2944](https://github.com/zarr-developers/zarr-python/issues/2944)) + +## 3.0.6 (2025-03-20) + +### Bugfixes + +- Restore functionality of `del z.attrs['key']` to actually delete the key. ([#2908](https://github.com/zarr-developers/zarr-python/issues/2908)) + +## 3.0.5 (2025-03-07) + +### Bugfixes + +- Fixed a bug where `StorePath` creation would not apply standard path normalization to the `path` parameter, + which led to the creation of arrays and groups with invalid keys. ([#2850](https://github.com/zarr-developers/zarr-python/issues/2850)) +- Prevent update_attributes calls from deleting old attributes ([#2870](https://github.com/zarr-developers/zarr-python/issues/2870)) + +### Misc + +- [#2796](https://github.com/zarr-developers/zarr-python/issues/2796) + +## 3.0.4 (2025-02-23) + +### Features + +- Adds functions for concurrently creating multiple arrays and groups. ([#2665](https://github.com/zarr-developers/zarr-python/issues/2665)) + +### Bugfixes + +- Fixed a bug where `ArrayV2Metadata` could save `filters` as an empty array. ([#2847](https://github.com/zarr-developers/zarr-python/issues/2847)) +- Fix a bug when setting values of a smaller last chunk. 
([#2851](https://github.com/zarr-developers/zarr-python/issues/2851)) + +### Misc + +- [#2828](https://github.com/zarr-developers/zarr-python/issues/2828) + +## 3.0.3 (2025-02-14) + +### Features + +- Improves performance of FsspecStore.delete_dir for remote filesystems supporting concurrent/batched deletes, e.g., s3fs. ([#2661](https://github.com/zarr-developers/zarr-python/issues/2661)) +- Added `zarr.config.enable_gpu` to update Zarr's configuration to use GPUs. ([#2751](https://github.com/zarr-developers/zarr-python/issues/2751)) +- Avoid reading chunks during writes where possible. [#757](https://github.com/zarr-developers/zarr-python/issues/757) ([#2784](https://github.com/zarr-developers/zarr-python/issues/2784)) +- `LocalStore` learned to `delete_dir`. This makes array and group deletes more efficient. ([#2804](https://github.com/zarr-developers/zarr-python/issues/2804)) +- Add `zarr.testing.strategies.array_metadata` to generate ArrayV2Metadata and ArrayV3Metadata instances. ([#2813](https://github.com/zarr-developers/zarr-python/issues/2813)) +- Add arbitrary `shards` to Hypothesis strategy for generating arrays. ([#2822](https://github.com/zarr-developers/zarr-python/issues/2822)) + +### Bugfixes + +- Fixed bug with Zarr using device memory, instead of host memory, for storing metadata when using GPUs. ([#2751](https://github.com/zarr-developers/zarr-python/issues/2751)) +- The array returned by `zarr.empty` and an empty `zarr.core.buffer.cpu.NDBuffer` will now be filled with the + specified fill value, or with zeros if no fill value is provided. + This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes. ([#2755](https://github.com/zarr-developers/zarr-python/issues/2755)) +- Fix zip-store path checking for stores with directories listed as files. 
([#2758](https://github.com/zarr-developers/zarr-python/issues/2758)) +- Use removeprefix rather than replace when removing filename prefixes in `FsspecStore.list` ([#2778](https://github.com/zarr-developers/zarr-python/issues/2778)) +- Enable automatic removal of `needs release notes` with labeler action ([#2781](https://github.com/zarr-developers/zarr-python/issues/2781)) +- Use the proper label config ([#2785](https://github.com/zarr-developers/zarr-python/issues/2785)) +- Alters the behavior of `create_array` to ensure that any groups implied by the array's name are created if they do not already exist. Also simplifies the type signature for any function that takes an ArrayConfig-like object. ([#2795](https://github.com/zarr-developers/zarr-python/issues/2795)) +- Initialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types ([#2799](https://github.com/zarr-developers/zarr-python/issues/2799)) +- Ensure utf8 compliant strings are used to construct numpy arrays in property-based tests ([#2801](https://github.com/zarr-developers/zarr-python/issues/2801)) +- Fix pickling for ZipStore ([#2807](https://github.com/zarr-developers/zarr-python/issues/2807)) +- Update numcodecs to not overwrite codec configuration ever. Closes [#2800](https://github.com/zarr-developers/zarr-python/issues/2800). ([#2811](https://github.com/zarr-developers/zarr-python/issues/2811)) +- Fix fancy indexing (e.g. arr[5, [0, 1]]) with the sharding codec ([#2817](https://github.com/zarr-developers/zarr-python/issues/2817)) + +### Improved Documentation + +- Added new user guide on GPU. ([#2751](https://github.com/zarr-developers/zarr-python/issues/2751)) + +## 3.0.2 (2025-01-31) + +### Features + +- Test `getsize()` and `getsize_prefix()` in `StoreTests`. 
([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Test that a `ValueError` is raised for invalid byte range syntax in `StoreTests`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Separate instantiating and opening a store in `StoreTests`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Add a test for using Stores as context managers in `StoreTests`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Implemented `LoggingStore.open()`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- `LoggingStore` is now a generic class. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Change StoreTest's `test_store_repr`, `test_store_supports_writes`, + `test_store_supports_partial_writes`, and `test_store_supports_listing` + to be implemented using `@abstractmethod`, rather than raising `NotImplementedError`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Test the error raised for invalid buffer arguments in `StoreTests`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Test that data can be written to a store that's not yet open using the store.set method in `StoreTests`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Adds a new function `init_array` for initializing an array in storage, and refactors `create_array` + to use `init_array`. `create_array` takes two new parameters: `data`, an optional array-like object, and `write_data`, a bool which defaults to `True`. + If `data` is given to `create_array`, then the `dtype` and `shape` attributes of `data` are used to define the + corresponding attributes of the resulting Zarr array. Additionally, if `data` is given and `write_data` is `True`, + then the values in `data` will be written to the newly created array. 
([#2761](https://github.com/zarr-developers/zarr-python/issues/2761)) + +### Bugfixes + +- Wrap sync fsspec filesystems with `AsyncFileSystemWrapper`. ([#2533](https://github.com/zarr-developers/zarr-python/issues/2533)) +- Added backwards compatibility for Zarr format 2 structured arrays. ([#2681](https://github.com/zarr-developers/zarr-python/issues/2681)) +- Update equality for `LoggingStore` and `WrapperStore` such that 'other' must also be a `LoggingStore` or `WrapperStore` respectively, rather than only checking the types of the stores they wrap. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Ensure that `ZipStore` is open before getting or setting any values. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Use stdout rather than stderr as the default stream for `LoggingStore`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Match the errors raised by read only stores in `StoreTests`. ([#2693](https://github.com/zarr-developers/zarr-python/issues/2693)) +- Fixed `ZipStore` to make sure the correct attributes are saved when instances are pickled. + This fixes a previous bug that prevented using `ZipStore` with a `ProcessPoolExecutor`. ([#2762](https://github.com/zarr-developers/zarr-python/issues/2762)) +- Updated the optional test dependencies to include `botocore` and `fsspec`. ([#2768](https://github.com/zarr-developers/zarr-python/issues/2768)) +- Fixed the fsspec tests to skip if `botocore` is not installed. + Previously they would have failed with an import error. ([#2768](https://github.com/zarr-developers/zarr-python/issues/2768)) +- Optimize full chunk writes. ([#2782](https://github.com/zarr-developers/zarr-python/issues/2782)) + +### Improved Documentation + +- Changed the machinery for creating changelog entries. + Now individual entries should be added as files to the `changes` directory in the `zarr-python` repository, instead of directly to the changelog file. 
([#2736](https://github.com/zarr-developers/zarr-python/issues/2736)) + +### Other + +- Created a type alias `ChunkKeyEncodingLike` to model the union of `ChunkKeyEncoding` instances and the dict form of the + parameters of those instances. `ChunkKeyEncodingLike` should be used by high-level functions to provide a convenient + way for creating `ChunkKeyEncoding` objects. ([#2763](https://github.com/zarr-developers/zarr-python/issues/2763)) + +## 3.0.1 (Jan. 17, 2025) + +* Implement `zarr.from_array` using concurrent streaming ([#2622](https://github.com/zarr-developers/zarr-python/issues/2622)). + +### Bug fixes + +* Fixes `order` argument for Zarr format 2 arrays ([#2679](https://github.com/zarr-developers/zarr-python/issues/2679)). +* Fixes a bug that prevented reading Zarr format 2 data with consolidated + metadata written using `zarr-python` version 2 ([#2694](https://github.com/zarr-developers/zarr-python/issues/2694)). +* Ensure that compressor=None results in no compression when writing Zarr + format 2 data ([#2708](https://github.com/zarr-developers/zarr-python/issues/2708)). +* Fix for empty consolidated metadata dataset: backwards compatibility with + Zarr-Python 2 ([#2695](https://github.com/zarr-developers/zarr-python/issues/2695)). + +### Documentation + +* Add v3.0.0 release announcement banner ([#2677](https://github.com/zarr-developers/zarr-python/issues/2677)). +* Quickstart guide alignment with V3 API ([#2697](https://github.com/zarr-developers/zarr-python/issues/2697)). +* Fix doctest failures related to numcodecs 0.15 ([#2727](https://github.com/zarr-developers/zarr-python/issues/2727)). + +### Other + +* Removed some unnecessary files from the source distribution + to reduce its size. ([#2686](https://github.com/zarr-developers/zarr-python/issues/2686)). +* Enable codecov in GitHub actions ([#2682](https://github.com/zarr-developers/zarr-python/issues/2682)). 
+* Speed up hypothesis tests ([#2650](https://github.com/zarr-developers/zarr-python/issues/2650)). +* Remove multiple imports for an import name ([#2723](https://github.com/zarr-developers/zarr-python/issues/2723)). + +## 3.0.0 (Jan. 9, 2025) + +3.0.0 is a new major release of Zarr-Python, with many breaking changes. +See the [v3 migration guide](user-guide/v3_migration.md) for a listing of what's changed. + +Normal release note service will resume with further releases in the 3.0.0 +series. + +Release notes for the zarr-python 2.x and 1.x releases can be found here: +https://zarr.readthedocs.io/en/support-v2/release.html diff --git a/docs/release-notes.rst b/docs/release-notes.rst deleted file mode 100644 index a89046dd6d..0000000000 --- a/docs/release-notes.rst +++ /dev/null @@ -1,269 +0,0 @@ -Release notes -============= - -.. towncrier release notes start - -3.0.8 (2025-05-19) ------------------- - -.. warning:: - - In versions 3.0.0 to 3.0.7 opening arrays or groups with ``mode='a'`` (the default for many builtin functions) - would cause any existing paths in the store to be deleted. This is fixed in 3.0.8, and - we recommend all users upgrade to avoid this bug that could cause unintentional data loss. - -Features -~~~~~~~~ - -- Added a `print_debug_info` function for bug reports. (:issue:`2913`) - - -Bugfixes -~~~~~~~~ - -- Fix a bug that prevented the number of initialized chunks being counted properly. (:issue:`2862`) -- Fixed sharding with GPU buffers. (:issue:`2978`) -- Fix structured `dtype` fill value serialization for consolidated metadata (:issue:`2998`) -- It is now possible to specify no compressor when creating a zarr format 2 array. - This can be done by passing ``compressor=None`` to the various array creation routines. - - The default behaviour of automatically choosing a suitable default compressor remains if the compressor argument is not given. 
- To reproduce the behaviour in previous zarr-python versions when ``compressor=None`` was passed, pass ``compressor='auto'`` instead. (:issue:`3039`) -- Fixed the typing of ``dimension_names`` arguments throughout so that it now accepts iterables that contain `None` alongside `str`. (:issue:`3045`) -- Using various functions to open data with ``mode='a'`` no longer deletes existing data in the store. (:issue:`3062`) -- Internally use `typesize` constructor parameter for :class:`numcodecs.blosc.Blosc` to improve compression ratios back to the v2-package levels. (:issue:`2962`) -- Specifying the memory order of Zarr format 2 arrays using the ``order`` keyword argument has been fixed. (:issue:`2950`) - - -Misc -~~~~ - -- :issue:`2972`, :issue:`3027`, :issue:`3049` - - -3.0.7 (2025-04-22) ------------------- - -Features -~~~~~~~~ - -- Add experimental ObjectStore storage class based on obstore. (:issue:`1661`) -- Add ``zarr.from_array`` using concurrent streaming of source data (:issue:`2622`) - - -Bugfixes -~~~~~~~~ - -- 0-dimensional arrays are now returning a scalar. Therefore, the return type of ``__getitem__`` changed - to NDArrayLikeOrScalar. This change is to make the behavior of 0-dimensional arrays consistent with - ``numpy`` scalars. (:issue:`2718`) -- Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization (:issue:`2802`) -- Fixes `ConsolidatedMetadata` serialization of `nan`, `inf`, and `-inf` to be - consistent with the behavior of `ArrayMetadata`. (:issue:`2996`) - - -Improved Documentation -~~~~~~~~~~~~~~~~~~~~~~ - -- Updated the 3.0 migration guide to include the removal of "." syntax for getting group members. (:issue:`2991`, :issue:`2997`) - - -Misc -~~~~ -- Define a new versioning policy based on Effective Effort Versioning. This replaces the old Semantic - Versioning-based policy. 
(:issue:`2924`, :issue:`2910`) -- Make warning filters in the tests more specific, so warnings emitted by tests added in the future - are more likely to be caught instead of ignored. (:issue:`2714`) -- Avoid an unnecessary memory copy when writing Zarr to a local file (:issue:`2944`) - - -3.0.6 (2025-03-20) ------------------- - -Bugfixes -~~~~~~~~ - -- Restore functionality of `del z.attrs['key']` to actually delete the key. (:issue:`2908`) - - -3.0.5 (2025-03-07) ------------------- - -Bugfixes -~~~~~~~~ - -- Fixed a bug where ``StorePath`` creation would not apply standard path normalization to the ``path`` parameter, - which led to the creation of arrays and groups with invalid keys. (:issue:`2850`) -- Prevent update_attributes calls from deleting old attributes (:issue:`2870`) - - -Misc -~~~~ - -- :issue:`2796` - -3.0.4 (2025-02-23) ------------------- - -Features -~~~~~~~~ - -- Adds functions for concurrently creating multiple arrays and groups. (:issue:`2665`) - -Bugfixes -~~~~~~~~ - -- Fixed a bug where ``ArrayV2Metadata`` could save ``filters`` as an empty array. (:issue:`2847`) -- Fix a bug when setting values of a smaller last chunk. (:issue:`2851`) - -Misc -~~~~ - -- :issue:`2828` - - -3.0.3 (2025-02-14) ------------------- - -Features -~~~~~~~~ - -- Improves performance of FsspecStore.delete_dir for remote filesystems supporting concurrent/batched deletes, e.g., s3fs. (:issue:`2661`) -- Added :meth:`zarr.config.enable_gpu` to update Zarr's configuration to use GPUs. (:issue:`2751`) -- Avoid reading chunks during writes where possible. :issue:`757` (:issue:`2784`) -- :py:class:`LocalStore` learned to ``delete_dir``. This makes array and group deletes more efficient. (:issue:`2804`) -- Add `zarr.testing.strategies.array_metadata` to generate ArrayV2Metadata and ArrayV3Metadata instances. (:issue:`2813`) -- Add arbitrary `shards` to Hypothesis strategy for generating arrays. 
(:issue:`2822`) - - -Bugfixes -~~~~~~~~ - -- Fixed bug with Zarr using device memory, instead of host memory, for storing metadata when using GPUs. (:issue:`2751`) -- The array returned by ``zarr.empty`` and an empty ``zarr.core.buffer.cpu.NDBuffer`` will now be filled with the - specified fill value, or with zeros if no fill value is provided. - This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes. (:issue:`2755`) -- Fix zip-store path checking for stores with directories listed as files. (:issue:`2758`) -- Use removeprefix rather than replace when removing filename prefixes in `FsspecStore.list` (:issue:`2778`) -- Enable automatic removal of `needs release notes` with labeler action (:issue:`2781`) -- Use the proper label config (:issue:`2785`) -- Alters the behavior of ``create_array`` to ensure that any groups implied by the array's name are created if they do not already exist. Also simplifies the type signature for any function that takes an ArrayConfig-like object. (:issue:`2795`) -- Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types (:issue:`2799`) -- Ensure utf8 compliant strings are used to construct numpy arrays in property-based tests (:issue:`2801`) -- Fix pickling for ZipStore (:issue:`2807`) -- Update numcodecs to not overwrite codec configuration ever. Closes :issue:`2800`. (:issue:`2811`) -- Fix fancy indexing (e.g. arr[5, [0, 1]]) with the sharding codec (:issue:`2817`) - - -Improved Documentation -~~~~~~~~~~~~~~~~~~~~~~ - -- Added new user guide on :ref:`user-guide-gpu`. (:issue:`2751`) - - -3.0.2 (2025-01-31) ------------------- - -Features -~~~~~~~~ - -- Test ``getsize()`` and ``getsize_prefix()`` in ``StoreTests``. (:issue:`2693`) -- Test that a ``ValueError`` is raised for invalid byte range syntax in ``StoreTests``. 
(:issue:`2693`) -- Separate instantiating and opening a store in ``StoreTests``. (:issue:`2693`) -- Add a test for using Stores as a context managers in ``StoreTests``. (:issue:`2693`) -- Implemented ``LogingStore.open()``. (:issue:`2693`) -- ``LoggingStore`` is now a generic class. (:issue:`2693`) -- Change StoreTest's ``test_store_repr``, ``test_store_supports_writes``, - ``test_store_supports_partial_writes``, and ``test_store_supports_listing`` - to to be implemented using ``@abstractmethod``, rather raising ``NotImplementedError``. (:issue:`2693`) -- Test the error raised for invalid buffer arguments in ``StoreTests``. (:issue:`2693`) -- Test that data can be written to a store that's not yet open using the store.set method in ``StoreTests``. (:issue:`2693`) -- Adds a new function ``init_array`` for initializing an array in storage, and refactors ``create_array`` - to use ``init_array``. ``create_array`` takes two new parameters: ``data``, an optional array-like object, and ``write_data``, a bool which defaults to ``True``. - If ``data`` is given to ``create_array``, then the ``dtype`` and ``shape`` attributes of ``data`` are used to define the - corresponding attributes of the resulting Zarr array. Additionally, if ``data`` given and ``write_data`` is ``True``, - then the values in ``data`` will be written to the newly created array. (:issue:`2761`) - - -Bugfixes -~~~~~~~~ - -- Wrap sync fsspec filesystems with ``AsyncFileSystemWrapper``. (:issue:`2533`) -- Added backwards compatibility for Zarr format 2 structured arrays. (:issue:`2681`) -- Update equality for ``LoggingStore`` and ``WrapperStore`` such that 'other' must also be a ``LoggingStore`` or ``WrapperStore`` respectively, rather than only checking the types of the stores they wrap. (:issue:`2693`) -- Ensure that ``ZipStore`` is open before getting or setting any values. (:issue:`2693`) -- Use stdout rather than stderr as the default stream for ``LoggingStore``. 
(:issue:`2693`) -- Match the errors raised by read only stores in ``StoreTests``. (:issue:`2693`) -- Fixed ``ZipStore`` to make sure the correct attributes are saved when instances are pickled. - This fixes a previous bug that prevent using ``ZipStore`` with a ``ProcessPoolExecutor``. (:issue:`2762`) -- Updated the optional test dependencies to include ``botocore`` and ``fsspec``. (:issue:`2768`) -- Fixed the fsspec tests to skip if ``botocore`` is not installed. - Previously they would have failed with an import error. (:issue:`2768`) -- Optimize full chunk writes. (:issue:`2782`) - - -Improved Documentation -~~~~~~~~~~~~~~~~~~~~~~ - -- Changed the machinery for creating changelog entries. - Now individual entries should be added as files to the `changes` directory in the `zarr-python` repository, instead of directly to the changelog file. (:issue:`2736`) - -Other -~~~~~ - -- Created a type alias ``ChunkKeyEncodingLike`` to model the union of ``ChunkKeyEncoding`` instances and the dict form of the - parameters of those instances. ``ChunkKeyEncodingLike`` should be used by high-level functions to provide a convenient - way for creating ``ChunkKeyEncoding`` objects. (:issue:`2763`) - - -3.0.1 (Jan. 17, 2025) ---------------------- - -* Implement ``zarr.from_array`` using concurrent streaming (:issue:`2622`). - -Bug fixes -~~~~~~~~~ -* Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`). - -* Fixes a bug that prevented reading Zarr format 2 data with consolidated - metadata written using ``zarr-python`` version 2 (:issue:`2694`). - -* Ensure that compressor=None results in no compression when writing Zarr - format 2 data (:issue:`2708`). - -* Fix for empty consolidated metadata dataset: backwards compatibility with - Zarr-Python 2 (:issue:`2695`). - -Documentation -~~~~~~~~~~~~~ -* Add v3.0.0 release announcement banner (:issue:`2677`). - -* Quickstart guide alignment with V3 API (:issue:`2697`). 
- -* Fix doctest failures related to numcodecs 0.15 (:issue:`2727`). - -Other -~~~~~ -* Removed some unnecessary files from the source distribution - to reduce its size. (:issue:`2686`). - -* Enable codecov in GitHub actions (:issue:`2682`). - -* Speed up hypothesis tests (:issue:`2650`). - -* Remove multiple imports for an import name (:issue:`2723`). - - -.. _release_3.0.0: - -3.0.0 (Jan. 9, 2025) --------------------- - -3.0.0 is a new major release of Zarr-Python, with many breaking changes. -See the :ref:`v3 migration guide` for a listing of what's changed. - -Normal release note service will resume with further releases in the 3.0.0 -series. - -Release notes for the zarr-python 2.x and 1.x releases can be found here: -https://zarr.readthedocs.io/en/support-v2/release.html diff --git a/docs/talks/scipy2019/submission.rst b/docs/talks/scipy2019/submission.rst deleted file mode 100644 index 57fd925b1f..0000000000 --- a/docs/talks/scipy2019/submission.rst +++ /dev/null @@ -1,144 +0,0 @@ -Zarr - scalable storage of tensor data for use in parallel and distributed computing -==================================================================================== - -SciPy 2019 submission. - - -Short summary -------------- - -Many scientific problems involve computing over large N-dimensional -typed arrays of data, and reading or writing data is often the major -bottleneck limiting speed or scalability. The Zarr project is -developing a simple, scalable approach to storage of such data in a -way that is compatible with a range of approaches to distributed and -parallel computing. We describe the Zarr protocol and data storage -format, and the current state of implementations for various -programming languages including Python. We also describe current uses -of Zarr in malaria genomics, the Human Cell Atlas, and the Pangeo -project. 
- - -Abstract --------- - -Background -~~~~~~~~~~ - -Across a broad range of scientific disciplines, data are naturally -represented and stored as N-dimensional typed arrays, also known as -tensors. The volume of data being generated is outstripping our -ability to analyse it, and scientific communities are looking for ways -to leverage modern multi-core CPUs and distributed computing -platforms, including cloud computing. Retrieval and storage of data is -often the major bottleneck, and new approaches to data storage are -needed to accelerate distributed computations and enable them to scale -on a variety of platforms. - -Methods -~~~~~~~ - -We have designed a new storage format and protocol for tensor data -[1_], and have released an open source Python implementation [2_, -3_]. Our approach builds on data storage concepts from HDF5 [4_], -particularly chunking and compression, and hierarchical organisation -of datasets. Key design goals include: a simple protocol and format -that can be implemented in other programming languages; support for -multiple concurrent readers or writers; support for a variety of -parallel computing environments, from multi-threaded execution on a -single CPU to multi-process execution across a multi-node cluster; -pluggable storage subsystem with support for file systems, key-value -databases and cloud object stores; pluggable encoding subsystem with -support for a variety of modern compressors. - -Results -~~~~~~~ - -We illustrate the use of Zarr with examples from several scientific -domains. Zarr is being used within the Pangeo project [5_], which is -building a community platform for big data geoscience. The Pangeo -community have converted a number of existing climate modelling and -satellite observation datasets to Zarr [6_], and have demonstrated -their use in computations using HPC and cloud computing -environments. 
Within the MalariaGEN project [7_], Zarr is used to -store genome variation data from next-generation sequencing of natural -populations of malaria parasites and mosquitoes [8_] and these data -are used as input to analyses of the evolution of these organisms in -response to selective pressure from anti-malarial drugs and -insecticides. Zarr is being used within the Human Cell Atlas (HCA) -project [9_], which is building a reference atlas of healthy human -cell types. This project hopes to leverage this information to better -understand the dysregulation of cellular states that underly human -disease. The Human Cell Atlas uses Zarr as the output data format -because it enables the project to easily generate matrices containing -user-selected subsets of cells. - -Conclusions -~~~~~~~~~~~ - -Zarr is generating interest across a range of scientific domains, and -work is ongoing to establish a community process to support further -development of the specifications and implementations in other -programming languages [10_, 11_, 12_] and building interoperability -with a similar project called N5 [13_]. Other packages within the -PyData ecosystem, notably Dask [14_], Xarray [15_] and Intake [16_], -have added capability to read and write Zarr, and together these -packages provide a compelling solution for large scale data science -using Python [17_]. Zarr has recently been presented in several -venues, including a webinar for the ESIP Federation tech dive series -[18_], and a talk at the AGU Fall Meeting 2018 [19_]. - - -References -~~~~~~~~~~ - -.. _1: https://zarr.readthedocs.io/en/stable/spec/v2.html -.. _2: https://github.com/zarr-developers/zarr-python -.. _3: https://github.com/zarr-developers/numcodecs -.. _4: https://www.hdfgroup.org/solutions/hdf5/ -.. _5: https://pangeo.io/ -.. _6: https://pangeo.io/catalog.html -.. _7: https://www.malariagen.net/ -.. _8: http://alimanfoo.github.io/2016/09/21/genotype-compression-benchmark.html -.. 
_9: https://www.humancellatlas.org/ -.. _10: https://github.com/constantinpape/z5 -.. _11: https://github.com/lasersonlab/ndarray.scala -.. _12: https://github.com/meggart/ZarrNative.jl -.. _13: https://github.com/saalfeldlab/n5 -.. _14: http://docs.dask.org/en/latest/array-creation.html -.. _15: http://xarray.pydata.org/en/stable/io.html -.. _16: https://github.com/ContinuumIO/intake-xarray -.. _17: http://matthewrocklin.com/blog/work/2018/01/22/pangeo-2 -.. _18: http://wiki.esipfed.org/index.php/Interoperability_and_Technology/Tech_Dive_Webinar_Series#8_March.2C_2018:_.22Zarr:_A_simple.2C_open.2C_scalable_solution_for_big_NetCDF.2FHDF_data_on_the_Cloud.22:_Alistair_Miles.2C_University_of_Oxford. -.. _19: https://agu.confex.com/agu/fm18/meetingapp.cgi/Paper/390015 - - -Authors -------- - -Project contributors are listed in alphabetical order by surname. - -* `Ryan Abernathey `_, Columbia University -* `Stephan Balmer `_, Meteotest -* `Ambrose Carr `_, Chan Zuckerberg Initiative -* `Tim Crone `_, Columbia University -* `Martin Durant `_, Anaconda, inc. 
-* `Jan Funke `_, HHMI Janelia -* `Darren Gallagher `_, Satavia -* `Fabian Gans `_, Max Planck Institute for Biogeochemistry -* `Shikhar Goenka `_, Satavia -* `Joe Hamman `_, NCAR -* `Stephan Hoyer `_, Google -* `Jerome Kelleher `_, University of Oxford -* `John Kirkham `_, HHMI Janelia -* `Alistair Miles `_, University of Oxford -* `Josh Moore `_, University of Dundee -* `Charles Noyes `_, University of Southern California -* `Tarik Onalan `_ -* `Constantin Pape `_, University of Heidelberg -* `Zain Patel `_, University of Cambridge -* `Matthew Rocklin `_, NVIDIA -* `Stephan Saafeld `_, HHMI Janelia -* `Vincent Schut `_, Satelligence -* `Justin Swaney `_, MIT -* `Ryan Williams `_, Chan Zuckerberg Initiative diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md new file mode 100644 index 0000000000..b91f79a008 --- /dev/null +++ b/docs/user-guide/arrays.md @@ -0,0 +1,688 @@ +# Working with arrays + +## Creating an array + +Zarr has several functions for creating arrays. For example: + +```python +import zarr +store = zarr.storage.MemoryStore() +z = zarr.create_array(store=store, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') +z +# +``` + +The code above creates a 2-dimensional array of 32-bit integers with 10000 rows +and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 +columns (and so there will be 100 chunks in total). The data is written to a +`zarr.storage.MemoryStore` (e.g. an in-memory dict). See +[Persistent arrays](#persistent-arrays) for details on storing arrays in other stores. + +For a complete list of array creation routines see the `zarr` +module documentation. + +## Reading and writing data + +Zarr arrays support a similar interface to [NumPy](https://numpy.org/doc/stable/) +arrays for reading and writing data. 
For example, the entire array can be filled +with a scalar value: + +```python +z[:] = 42 +``` + +Regions of the array can also be written to, e.g.: + +```python +import numpy as np + +z[0, :] = np.arange(10000) +z[:, 0] = np.arange(10000) +``` + +The contents of the array can be retrieved by slicing, which will load the +requested region into memory as a NumPy array, e.g.: + +```python +z[0, 0] +# array(0, dtype=int32) +z[-1, -1] +# array(42, dtype=int32) +z[0, :] +# array([ 0, 1, 2, ..., 9997, 9998, 9999], +# shape=(10000,), dtype=int32) +z[:, 0] +# array([ 0, 1, 2, ..., 9997, 9998, 9999], +# shape=(10000,), dtype=int32) +z[:] +# array([[ 0, 1, 2, ..., 9997, 9998, 9999], +# [ 1, 42, 42, ..., 42, 42, 42], +# [ 2, 42, 42, ..., 42, 42, 42], +# ..., +# [9997, 42, 42, ..., 42, 42, 42], +# [9998, 42, 42, ..., 42, 42, 42], +# [9999, 42, 42, ..., 42, 42, 42]], +# shape=(10000, 10000), dtype=int32) +``` + +More information about NumPy-style indexing can be found in the +[NumPy documentation](https://numpy.org/doc/stable/user/basics.indexing.html). + +## Persistent arrays + +In the examples above, compressed data for each chunk of the array was stored in +main memory. Zarr arrays can also be stored on a file system, enabling +persistence of data between sessions. To do this, we can change the store +argument to point to a filesystem path: + +```python +z1 = zarr.create_array(store='data/example-1.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') +``` + +The array above will store its configuration metadata and all compressed chunk +data in a directory called `'data/example-1.zarr'` relative to the current working +directory. The `zarr.create_array` function provides a convenient way +to create a new persistent array or continue working with an existing +array. Note, there is no need to close an array: data are automatically +flushed to disk, and files are automatically closed whenever an array is modified. 
+ +Persistent arrays support the same interface for reading and writing data, +e.g.: + +```python +z1[:] = 42 +z1[0, :] = np.arange(10000) +z1[:, 0] = np.arange(10000) +``` + +Check that the data have been written and can be read again: + +```python +z2 = zarr.open_array('data/example-1.zarr', mode='r') +np.all(z1[:] == z2[:]) +# np.True_ +``` + +If you are just looking for a fast and convenient way to save NumPy arrays to +disk then load back into memory later, the functions +`zarr.save` and `zarr.load` may be +useful. E.g.: + +```python +a = np.arange(10) +zarr.save('data/example-2.zarr', a) +zarr.load('data/example-2.zarr') +# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) +``` + +Please note that there are a number of other options for persistent array +storage, see the Storage Guide for more details. + +## Resizing and appending + +A Zarr array can be resized, which means that any of its dimensions can be +increased or decreased in length. For example: + +```python +z = zarr.create_array(store='data/example-3.zarr', shape=(10000, 10000), dtype='int32',chunks=(1000, 1000)) +z[:] = 42 +z.shape +# (10000, 10000) +z.resize((20000, 10000)) +z.shape +# (20000, 10000) +``` + +Note that when an array is resized, the underlying data are not rearranged in +any way. If one or more dimensions are shrunk, any chunks falling outside the +new array shape will be deleted from the underlying store. + +`zarr.Array.append` is provided as a convenience function, which can be +used to append data to any axis. E.g.: + +```python +a = np.arange(10000000, dtype='int32').reshape(10000, 1000) +z = zarr.create_array(store='data/example-4.zarr', shape=a.shape, dtype=a.dtype, chunks=(1000, 100)) +z[:] = a +z.shape +# (10000, 1000) +z.append(a) +# (20000, 1000) +z.append(np.vstack([a, a]), axis=1) +# (20000, 2000) +z.shape +# (20000, 2000) +``` + +## Compressors + +A number of different compressors can be used with Zarr. Zarr includes Blosc, +Zstandard and Gzip compressors. 
Additional compressors are available through +a separate package called [NumCodecs](https://numcodecs.readthedocs.io/) which provides various +compressor libraries including LZ4, Zlib, BZ2 and LZMA. +Different compressors can be provided via the `compressors` keyword +argument accepted by all array creation functions. For example: + +```python +compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=3, shuffle=zarr.codecs.BloscShuffle.bitshuffle) +data = np.arange(100000000, dtype='int32').reshape(10000, 10000) +z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) +z[:] = data +z.compressors +# (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) +``` + +This array above will use Blosc as the primary compressor, using the Zstandard +algorithm (compression level 3) internally within Blosc, and with the +bit-shuffle filter applied. + +When using a compressor, it can be useful to get some diagnostics on the +compression ratio. Zarr arrays provide the `zarr.Array.info` property +which can be used to print useful diagnostics, e.g.: + +```python +z.info +# Type : Array +# Zarr format : 3 +# Data type : DataType.int32 +# Fill value : 0 +# Shape : (10000, 10000) +# Chunk shape : (1000, 1000) +# Order : C +# Read-only : False +# Store type : LocalStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) +# No. 
bytes : 400000000 (381.5M) +``` + +The `zarr.Array.info_complete` method inspects the underlying store and +prints additional diagnostics, e.g.: + +```python +z.info_complete() +# Type : Array +# Zarr format : 3 +# Data type : DataType.int32 +# Fill value : 0 +# Shape : (10000, 10000) +# Chunk shape : (1000, 1000) +# Order : C +# Read-only : False +# Store type : LocalStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) +# No. bytes : 400000000 (381.5M) +# No. bytes stored : 3558573 +# Storage ratio : 112.4 +# Chunks Initialized : 100 +``` + +!!! note + `zarr.Array.info_complete` will inspect the underlying store and may + be slow for large arrays. Use `zarr.Array.info` if detailed storage + statistics are not needed. + +If you don't specify a compressor, by default Zarr uses the Zstandard +compressor. + +In addition to Blosc and Zstandard, other compression libraries can also be used. For example, +here is an array using Gzip compression, level 1: + +```python +data = np.arange(100000000, dtype='int32').reshape(10000, 10000) +z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1)) +z[:] = data +z.compressors +# (GzipCodec(level=1),) +``` + +Here is an example using LZMA from [NumCodecs](https://numcodecs.readthedocs.io/) with a custom filter pipeline including LZMA's +built-in delta filter: + +```python +import lzma +from numcodecs.zarr3 import LZMA + +lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), dict(id=lzma.FILTER_LZMA2, preset=1)] +compressors = LZMA(filters=lzma_filters) +data = np.arange(100000000, dtype='int32').reshape(10000, 10000) +z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) +z.compressors +# (LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, 
{'id': 33, 'preset': 1}]}),) +``` + +The default compressor can be changed using Zarr's +configuration system, e.g.: + +```python +with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): + z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) +z.filters +# () +z.compressors +# (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) +``` + +To disable compression, set `compressors=None` when creating an array, e.g.: + +```python +z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) +z.compressors +# () +``` + +## Filters + +In some cases, compression can be improved by transforming the data in some +way. For example, if nearby values tend to be correlated, then shuffling the +bytes within each numerical value or storing the difference between adjacent +values may increase compression ratio. Some compressors provide built-in filters +that apply transformations to the data prior to compression. For example, the +Blosc compressor has built-in implementations of byte- and bit-shuffle filters, +and the LZMA compressor has a built-in implementation of a delta +filter. However, to provide additional flexibility for implementing and using +filters in combination with different compressors, Zarr also provides a +mechanism for configuring filters outside of the primary compressor. 
+ +Here is an example using a delta filter with the Blosc compressor: + +```python +from numcodecs.zarr3 import Delta + +filters = [Delta(dtype='int32')] +compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=1, shuffle=zarr.codecs.BloscShuffle.shuffle) +data = np.arange(100000000, dtype='int32').reshape(10000, 10000) +z = zarr.create_array(store='data/example-9.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), filters=filters, compressors=compressors) +z.info +# Type : Array +# Zarr format : 3 +# Data type : DataType.int32 +# Fill value : 0 +# Shape : (10000, 10000) +# Chunk shape : (1000, 1000) +# Order : C +# Read-only : False +# Store type : LocalStore +# Filters : (Delta(codec_name='numcodecs.delta', codec_config={'dtype': 'int32'}),) +# Serializer : BytesCodec(endian=) +# Compressors : (BloscCodec(typesize=4, cname=, clevel=1, shuffle=, blocksize=0),) +# No. bytes : 400000000 (381.5M) +``` + +For more information about available filter codecs, see the [Numcodecs](https://numcodecs.readthedocs.io/) documentation. + +## Advanced indexing + +Zarr arrays support several methods for advanced or "fancy" +indexing, which enable a subset of data items to be extracted or updated in an +array without loading the entire array into memory. + +Note that although this functionality is similar to some of the advanced +indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr +API for advanced indexing is different from both NumPy and h5py**, so please +read this section carefully. For a complete description of the indexing API, +see the documentation for the `zarr.Array` class. + +### Indexing with coordinate arrays + +Items from a Zarr array can be extracted by providing an integer array of +coordinates. 
E.g.: + +```python +data = np.arange(10) ** 2 +z = zarr.create_array(store='data/example-10.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +z[:] +# array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) +z.get_coordinate_selection([2, 5]) +# array([ 4, 25]) +``` + +Coordinate arrays can also be used to update data, e.g.: + +```python +z.set_coordinate_selection([2, 5], [-1, -2]) +z[:] +# array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) +``` + +For multidimensional arrays, coordinates must be provided for each dimension, +e.g.: + +```python +data = np.arange(15).reshape(3, 5) +z = zarr.create_array(store='data/example-11.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +z[:] +# array([[ 0, 1, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, 13, 14]]) +z.get_coordinate_selection(([0, 2], [1, 3])) +# array([ 1, 13]) +z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) +z[:] +# array([[ 0, -1, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, -2, 14]]) +``` + +For convenience, coordinate indexing is also available via the `vindex` +property, as well as the square bracket operator, e.g.: + +```python +z.vindex[[0, 2], [1, 3]] +# array([-1, -2]) +z.vindex[[0, 2], [1, 3]] = [-3, -4] +z[:] +# array([[ 0, -3, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, -4, 14]]) +z[[0, 2], [1, 3]] +# array([-3, -4]) +``` + +When the indexing arrays have different shapes, they are broadcast together. +That is, the following two calls are equivalent: + +```python +z[1, [1, 3]] +# array([6, 8]) +z[[1, 1], [1, 3]] +# array([6, 8]) +``` + +### Indexing with a mask array + +Items can also be extracted by providing a Boolean mask. 
E.g.: + +```python +data = np.arange(10) ** 2 +z = zarr.create_array(store='data/example-12.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +z[:] +# array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) +sel = np.zeros_like(z, dtype=bool) +sel[2] = True +sel[5] = True +z.get_mask_selection(sel) +# array([ 4, 25]) +z.set_mask_selection(sel, [-1, -2]) +z[:] +# array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) +``` + +Here's a multidimensional example: + +```python +data = np.arange(15).reshape(3, 5) +z = zarr.create_array(store='data/example-13.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +z[:] +# array([[ 0, 1, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, 13, 14]]) +sel = np.zeros_like(z, dtype=bool) +sel[0, 1] = True +sel[2, 3] = True +z.get_mask_selection(sel) +# array([ 1, 13]) +z.set_mask_selection(sel, [-1, -2]) +z[:] +# array([[ 0, -1, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, -2, 14]]) +``` + +For convenience, mask indexing is also available via the `vindex` property, +e.g.: + +```python +z.vindex[sel] +# array([-1, -2]) +z.vindex[sel] = [-3, -4] +z[:] +# array([[ 0, -3, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, -4, 14]]) +``` + +Mask indexing is conceptually the same as coordinate indexing, and is +implemented internally via the same machinery. Both styles of indexing allow +selecting arbitrary items from an array, also known as point selection. + +### Orthogonal indexing + +Zarr arrays also support methods for orthogonal indexing, which allows +selections to be made along each dimension of an array independently. For +example, this allows selecting a subset of rows and/or columns from a +2-dimensional array. 
E.g.: + +```python +data = np.arange(15).reshape(3, 5) +z = zarr.create_array(store='data/example-14.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +z[:] +# array([[ 0, 1, 2, 3, 4], +# [ 5, 6, 7, 8, 9], +# [10, 11, 12, 13, 14]]) +z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows +# array([[ 0, 1, 2, 3, 4], +# [10, 11, 12, 13, 14]]) +z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns +# array([[ 1, 3], +# [ 6, 8], +# [11, 13]]) +z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 3] +# array([[ 1, 3], +# [11, 13]]) +``` + +Data can also be modified, e.g.: + +```python +z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) +``` + +For convenience, the orthogonal indexing functionality is also available via the +`oindex` property, e.g.: + +```python +data = np.arange(15).reshape(3, 5) +z = zarr.create_array(store='data/example-15.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +z.oindex[[0, 2], :] # select first and third rows +# array([[ 0, 1, 2, 3, 4], +# [10, 11, 12, 13, 14]]) +z.oindex[:, [1, 3]] # select second and fourth columns +# array([[ 1, 3], +# [ 6, 8], +# [11, 13]]) +z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 3] +# array([[ 1, 3], +# [11, 13]]) +z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] +z[:] +# array([[ 0, -1, 2, -2, 4], +# [ 5, 6, 7, 8, 9], +# [10, -3, 12, -4, 14]]) +``` + +Any combination of integer, slice, 1D integer array and/or 1D Boolean array can +be used for orthogonal indexing. 
+ +If the index contains at most one iterable, and otherwise contains only slices and integers, +orthogonal indexing is also available directly on the array: + +```python +data = np.arange(15).reshape(3, 5) +z = zarr.create_array(store='data/example-16.zarr', shape=data.shape, dtype=data.dtype) +z[:] = data +np.all(z.oindex[[0, 2], :] == z[[0, 2], :]) +# np.True_ +``` + +### Block Indexing + +Zarr also supports block indexing, which allows selections of whole chunks based on their +logical indices along each dimension of an array. For example, this allows selecting +a subset of chunk-aligned rows and/or columns from a 2-dimensional array. E.g.: + +```python +data = np.arange(100).reshape(10, 10) +z = zarr.create_array(store='data/example-17.zarr', shape=data.shape, dtype=data.dtype, chunks=(3, 3)) +z[:] = data +``` + +Retrieve items by specifying their block coordinates: + +```python +z.get_block_selection(1) +# array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], +# [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], +# [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) +``` + +Equivalent slicing: + +```python +z[3:6] +# array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], +# [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], +# [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) +``` + +For convenience, the block selection functionality is also available via the +`blocks` property, e.g.: + +```python +z.blocks[1] +# array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], +# [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], +# [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) +``` + +Block index arrays may be multidimensional to index multidimensional arrays. +For example: + +```python +z.blocks[0, 1:3] +# array([[ 3, 4, 5, 6, 7, 8], +# [13, 14, 15, 16, 17, 18], +# [23, 24, 25, 26, 27, 28]]) +``` + +Data can also be modified. 
Let's start with a simple 2D array: + +```python +z = zarr.create_array(store='data/example-18.zarr', shape=(6, 6), dtype=int, chunks=(2, 2)) +``` + +Set data for a selection of items: + +```python +z.set_block_selection((1, 0), 1) +z[...] +# array([[0, 0, 0, 0, 0, 0], +# [0, 0, 0, 0, 0, 0], +# [1, 1, 0, 0, 0, 0], +# [1, 1, 0, 0, 0, 0], +# [0, 0, 0, 0, 0, 0], +# [0, 0, 0, 0, 0, 0]]) +``` + +For convenience, this functionality is also available via the `blocks` property. +E.g.: + +```python +z.blocks[:, 2] = 7 +z[...] +# array([[0, 0, 0, 0, 7, 7], +# [0, 0, 0, 0, 7, 7], +# [1, 1, 0, 0, 7, 7], +# [1, 1, 0, 0, 7, 7], +# [0, 0, 0, 0, 7, 7], +# [0, 0, 0, 0, 7, 7]]) +``` + +Any combination of integer and slice can be used for block indexing: + +```python +z.blocks[2, 1:3] +# array([[0, 0, 7, 7], +# [0, 0, 7, 7]]) + +root = zarr.create_group('data/example-19.zarr') +foo = root.create_array(name='foo', shape=(1000, 100), chunks=(10, 10), dtype='float32') +bar = root.create_array(name='foo/bar', shape=(100,), dtype='int32') +foo[:, :] = np.random.random((1000, 100)) +bar[:] = np.arange(100) +root.tree() +# / +# └── foo (1000, 100) float32 +``` + +## Sharding + +Using small chunk shapes in very large arrays can lead to a very large number of chunks. +This can become a performance issue for file systems and object storage. +With Zarr format 3, a new sharding feature has been added to address this issue. + +With sharding, multiple chunks can be stored in a single storage object (e.g. a file). +Within a shard, chunks are compressed and serialized separately. +This allows individual chunks to be read independently. +However, when writing data, a full shard must be written in one go for optimal +performance and to avoid concurrency issues. +That means that shards are the units of writing and chunks are the units of reading. +Users need to configure the chunk and shard shapes accordingly. + +Sharded arrays can be created by providing the `shards` parameter to `zarr.create_array`. 
+ +```python +a = zarr.create_array('data/example-20.zarr', shape=(10000, 10000), shards=(1000, 1000), chunks=(100, 100), dtype='uint8') +a[:] = (np.arange(10000 * 10000) % 256).astype('uint8').reshape(10000, 10000) +a.info_complete() +# Type : Array +# Zarr format : 3 +# Data type : DataType.uint8 +# Fill value : 0 +# Shape : (10000, 10000) +# Shard shape : (1000, 1000) +# Chunk shape : (100, 100) +# Order : C +# Read-only : False +# Store type : LocalStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (ZstdCodec(level=0, checksum=False),) +# No. bytes : 100000000 (95.4M) +# No. bytes stored : 3981552 +# Storage ratio : 25.1 +# Shards Initialized : 100 +``` + +In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. +This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total. +Without the `shards` argument, there would be 10,000 chunks stored as individual files. + +## Missing features in 3.0 + +The following features have not been ported to 3.0 yet. + +### Object arrays + +See the Zarr-Python 2 documentation on [Object arrays](https://zarr.readthedocs.io/en/support-v2/tutorial.html#object-arrays) for more details. + +### Fixed-length string arrays + +See the Zarr-Python 2 documentation on [Fixed-length string arrays](https://zarr.readthedocs.io/en/support-v2/tutorial.html#string-arrays) for more details. + +### Datetime and Timedelta arrays + +See the Zarr-Python 2 documentation on [Datetime and Timedelta](https://zarr.readthedocs.io/en/support-v2/tutorial.html#datetimes-and-timedeltas) for more details. + +### Copying and migrating data + +See the Zarr-Python 2 documentation on [Copying and migrating data](https://zarr.readthedocs.io/en/support-v2/tutorial.html#copying-migrating-data) for more details. 
diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst deleted file mode 100644 index 5bd6b1500f..0000000000 --- a/docs/user-guide/arrays.rst +++ /dev/null @@ -1,658 +0,0 @@ -.. only:: doctest - - >>> import shutil - >>> shutil.rmtree('data', ignore_errors=True) - -.. _user-guide-arrays: - -Working with arrays -=================== - -Creating an array ------------------ - -Zarr has several functions for creating arrays. For example:: - - >>> import zarr - >>> store = zarr.storage.MemoryStore() - >>> z = zarr.create_array(store=store, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') - >>> z - - -The code above creates a 2-dimensional array of 32-bit integers with 10000 rows -and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 -columns (and so there will be 100 chunks in total). The data is written to a -:class:`zarr.storage.MemoryStore` (e.g. an in-memory dict). See -:ref:`user-guide-persist` for details on storing arrays in other stores. - -For a complete list of array creation routines see the :mod:`zarr` -module documentation. - -.. _user-guide-array: - -Reading and writing data ------------------------- - -Zarr arrays support a similar interface to `NumPy `_ -arrays for reading and writing data. 
For example, the entire array can be filled -with a scalar value:: - - >>> z[:] = 42 - -Regions of the array can also be written to, e.g.:: - - >>> import numpy as np - >>> - >>> z[0, :] = np.arange(10000) - >>> z[:, 0] = np.arange(10000) - -The contents of the array can be retrieved by slicing, which will load the -requested region into memory as a NumPy array, e.g.:: - - >>> z[0, 0] - array(0, dtype=int32) - >>> z[-1, -1] - array(42, dtype=int32) - >>> z[0, :] - array([ 0, 1, 2, ..., 9997, 9998, 9999], - shape=(10000,), dtype=int32) - >>> z[:, 0] - array([ 0, 1, 2, ..., 9997, 9998, 9999], - shape=(10000,), dtype=int32) - >>> z[:] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [ 1, 42, 42, ..., 42, 42, 42], - [ 2, 42, 42, ..., 42, 42, 42], - ..., - [9997, 42, 42, ..., 42, 42, 42], - [9998, 42, 42, ..., 42, 42, 42], - [9999, 42, 42, ..., 42, 42, 42]], - shape=(10000, 10000), dtype=int32) - -Read more about NumPy-style indexing can be found in the -`NumPy documentation `_. - -.. _user-guide-persist: - -Persistent arrays ------------------ - -In the examples above, compressed data for each chunk of the array was stored in -main memory. Zarr arrays can also be stored on a file system, enabling -persistence of data between sessions. To do this, we can change the store -argument to point to a filesystem path:: - - >>> z1 = zarr.create_array(store='data/example-1.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') - -The array above will store its configuration metadata and all compressed chunk -data in a directory called ``'data/example-1.zarr'`` relative to the current working -directory. The :func:`zarr.create_array` function provides a convenient way -to create a new persistent array or continue working with an existing -array. Note, there is no need to close an array: data are automatically -flushed to disk, and files are automatically closed whenever an array is modified. 
- -Persistent arrays support the same interface for reading and writing data, -e.g.:: - - >>> z1[:] = 42 - >>> z1[0, :] = np.arange(10000) - >>> z1[:, 0] = np.arange(10000) - -Check that the data have been written and can be read again:: - - >>> z2 = zarr.open_array('data/example-1.zarr', mode='r') - >>> np.all(z1[:] == z2[:]) - np.True_ - -If you are just looking for a fast and convenient way to save NumPy arrays to -disk then load back into memory later, the functions -:func:`zarr.save` and :func:`zarr.load` may be -useful. E.g.:: - - >>> a = np.arange(10) - >>> zarr.save('data/example-2.zarr', a) - >>> zarr.load('data/example-2.zarr') - array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - -Please note that there are a number of other options for persistent array -storage, see the :ref:`Storage Guide ` guide for more details. - -.. _user-guide-resize: - -Resizing and appending ----------------------- - -A Zarr array can be resized, which means that any of its dimensions can be -increased or decreased in length. For example:: - - >>> z = zarr.create_array(store='data/example-3.zarr', shape=(10000, 10000), dtype='int32',chunks=(1000, 1000)) - >>> z[:] = 42 - >>> z.shape - (10000, 10000) - >>> z.resize((20000, 10000)) - >>> z.shape - (20000, 10000) - -Note that when an array is resized, the underlying data are not rearranged in -any way. If one or more dimensions are shrunk, any chunks falling outside the -new array shape will be deleted from the underlying store. - -:func:`zarr.Array.append` is provided as a convenience function, which can be -used to append data to any axis. E.g.:: - - >>> a = np.arange(10000000, dtype='int32').reshape(10000, 1000) - >>> z = zarr.create_array(store='data/example-4.zarr', shape=a.shape, dtype=a.dtype, chunks=(1000, 100)) - >>> z[:] = a - >>> z.shape - (10000, 1000) - >>> z.append(a) - (20000, 1000) - >>> z.append(np.vstack([a, a]), axis=1) - (20000, 2000) - >>> z.shape - (20000, 2000) - -.. 
_user-guide-compress: - -Compressors ------------ - -A number of different compressors can be used with Zarr. Zarr includes Blosc, -Zstandard and Gzip compressors. Additional compressors are available through -a separate package called NumCodecs_ which provides various -compressor libraries including LZ4, Zlib, BZ2 and LZMA. -Different compressors can be provided via the ``compressors`` keyword -argument accepted by all array creation functions. For example:: - - >>> compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=3, shuffle=zarr.codecs.BloscShuffle.bitshuffle) - >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) - >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) - >>> z[:] = data - >>> z.compressors - (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) - -This array above will use Blosc as the primary compressor, using the Zstandard -algorithm (compression level 3) internally within Blosc, and with the -bit-shuffle filter applied. - -When using a compressor, it can be useful to get some diagnostics on the -compression ratio. Zarr arrays provide the :attr:`zarr.Array.info` property -which can be used to print useful diagnostics, e.g.:: - - >>> z.info - Type : Array - Zarr format : 3 - Data type : DataType.int32 - Fill value : 0 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Store type : LocalStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) - No. 
bytes : 400000000 (381.5M) - -The :func:`zarr.Array.info_complete` method inspects the underlying store and -prints additional diagnostics, e.g.:: - - >>> z.info_complete() - Type : Array - Zarr format : 3 - Data type : DataType.int32 - Fill value : 0 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Store type : LocalStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) - No. bytes : 400000000 (381.5M) - No. bytes stored : 3558573 - Storage ratio : 112.4 - Chunks Initialized : 100 - -.. note:: - :func:`zarr.Array.info_complete` will inspect the underlying store and may - be slow for large arrays. Use :attr:`zarr.Array.info` if detailed storage - statistics are not needed. - -If you don't specify a compressor, by default Zarr uses the Zstandard -compressor. - -In addition to Blosc and Zstandard, other compression libraries can also be used. For example, -here is an array using Gzip compression, level 1:: - - >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) - >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1)) - >>> z[:] = data - >>> z.compressors - (GzipCodec(level=1),) - -Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's -built-in delta filter:: - - >>> import lzma - >>> from numcodecs.zarr3 import LZMA - >>> - >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), dict(id=lzma.FILTER_LZMA2, preset=1)] - >>> compressors = LZMA(filters=lzma_filters) - >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) - >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) - >>> z.compressors - (LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 
1}]}),) - -The default compressor can be changed by setting the value of the using Zarr's -:ref:`user-guide-config`, e.g.:: - - >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}): - ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2) - >>> z.filters - () - >>> z.compressors - (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) - -To disable compression, set ``compressors=None`` when creating an array, e.g.:: - - >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None) - >>> z.compressors - () - -.. _user-guide-filters: - -Filters -------- - -In some cases, compression can be improved by transforming the data in some -way. For example, if nearby values tend to be correlated, then shuffling the -bytes within each numerical value or storing the difference between adjacent -values may increase compression ratio. Some compressors provide built-in filters -that apply transformations to the data prior to compression. For example, the -Blosc compressor has built-in implementations of byte- and bit-shuffle filters, -and the LZMA compressor has a built-in implementation of a delta -filter. However, to provide additional flexibility for implementing and using -filters in combination with different compressors, Zarr also provides a -mechanism for configuring filters outside of the primary compressor. 
- -Here is an example using a delta filter with the Blosc compressor:: - - >>> from numcodecs.zarr3 import Delta - >>> - >>> filters = [Delta(dtype='int32')] - >>> compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=1, shuffle=zarr.codecs.BloscShuffle.shuffle) - >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000) - >>> z = zarr.create_array(store='data/example-9.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), filters=filters, compressors=compressors) - >>> z.info - Type : Array - Zarr format : 3 - Data type : DataType.int32 - Fill value : 0 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Store type : LocalStore - Filters : (Delta(codec_name='numcodecs.delta', codec_config={'dtype': 'int32'}),) - Serializer : BytesCodec(endian=) - Compressors : (BloscCodec(typesize=4, cname=, clevel=1, shuffle=, blocksize=0),) - No. bytes : 400000000 (381.5M) - -For more information about available filter codecs, see the `Numcodecs -`_ documentation. - -.. _user-guide-indexing: - -Advanced indexing ------------------ - -Zarr arrays support several methods for advanced or "fancy" -indexing, which enable a subset of data items to be extracted or updated in an -array without loading the entire array into memory. - -Note that although this functionality is similar to some of the advanced -indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr -API for advanced indexing is different from both NumPy and h5py**, so please -read this section carefully. For a complete description of the indexing API, -see the documentation for the :class:`zarr.Array` class. - -Indexing with coordinate arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Items from a Zarr array can be extracted by providing an integer array of -coordinates. 
E.g.:: - - >>> data = np.arange(10) ** 2 - >>> z = zarr.create_array(store='data/example-10.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> z[:] - array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) - >>> z.get_coordinate_selection([2, 5]) - array([ 4, 25]) - -Coordinate arrays can also be used to update data, e.g.:: - - >>> z.set_coordinate_selection([2, 5], [-1, -2]) - >>> z[:] - array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) - -For multidimensional arrays, coordinates must be provided for each dimension, -e.g.:: - - >>> data = np.arange(15).reshape(3, 5) - >>> z = zarr.create_array(store='data/example-11.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> z.get_coordinate_selection(([0, 2], [1, 3])) - array([ 1, 13]) - >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) - >>> z[:] - array([[ 0, -1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -2, 14]]) - -For convenience, coordinate indexing is also available via the ``vindex`` -property, as well as the square bracket operator, e.g.:: - - >>> z.vindex[[0, 2], [1, 3]] - array([-1, -2]) - >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] - >>> z[:] - array([[ 0, -3, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -4, 14]]) - >>> z[[0, 2], [1, 3]] - array([-3, -4]) - -When the indexing arrays have different shapes, they are broadcast together. -That is, the following two calls are equivalent:: - - >>> z[1, [1, 3]] - array([6, 8]) - >>> z[[1, 1], [1, 3]] - array([6, 8]) - -Indexing with a mask array -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Items can also be extracted by providing a Boolean mask. 
E.g.:: - - >>> data = np.arange(10) ** 2 - >>> z = zarr.create_array(store='data/example-12.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> z[:] - array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[2] = True - >>> sel[5] = True - >>> z.get_mask_selection(sel) - array([ 4, 25]) - >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[:] - array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) - -Here's a multidimensional example:: - - >>> data = np.arange(15).reshape(3, 5) - >>> z = zarr.create_array(store='data/example-13.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> sel = np.zeros_like(z, dtype=bool) - >>> sel[0, 1] = True - >>> sel[2, 3] = True - >>> z.get_mask_selection(sel) - array([ 1, 13]) - >>> z.set_mask_selection(sel, [-1, -2]) - >>> z[:] - array([[ 0, -1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -2, 14]]) - -For convenience, mask indexing is also available via the ``vindex`` property, -e.g.:: - - >>> z.vindex[sel] - array([-1, -2]) - >>> z.vindex[sel] = [-3, -4] - >>> z[:] - array([[ 0, -3, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, -4, 14]]) - -Mask indexing is conceptually the same as coordinate indexing, and is -implemented internally via the same machinery. Both styles of indexing allow -selecting arbitrary items from an array, also known as point selection. - -Orthogonal indexing -~~~~~~~~~~~~~~~~~~~ - -Zarr arrays also support methods for orthogonal indexing, which allows -selections to be made along each dimension of an array independently. For -example, this allows selecting a subset of rows and/or columns from a -2-dimensional array. 
E.g.:: - - >>> data = np.arange(15).reshape(3, 5) - >>> z = zarr.create_array(store='data/example-14.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> z[:] - array([[ 0, 1, 2, 3, 4], - [ 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14]]) - >>> z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows - array([[ 0, 1, 2, 3, 4], - [10, 11, 12, 13, 14]]) - >>> z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns - array([[ 1, 3], - [ 6, 8], - [11, 13]]) - >>> z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 4] - array([[ 1, 3], - [11, 13]]) - -Data can also be modified, e.g.:: - - >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) - -For convenience, the orthogonal indexing functionality is also available via the -``oindex`` property, e.g.:: - - >>> data = np.arange(15).reshape(3, 5) - >>> z = zarr.create_array(store='data/example-15.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> z.oindex[[0, 2], :] # select first and third rows - array([[ 0, 1, 2, 3, 4], - [10, 11, 12, 13, 14]]) - >>> z.oindex[:, [1, 3]] # select second and fourth columns - array([[ 1, 3], - [ 6, 8], - [11, 13]]) - >>> z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 4] - array([[ 1, 3], - [11, 13]]) - >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] - >>> z[:] - array([[ 0, -1, 2, -2, 4], - [ 5, 6, 7, 8, 9], - [10, -3, 12, -4, 14]]) - -Any combination of integer, slice, 1D integer array and/or 1D Boolean array can -be used for orthogonal indexing. 
- -If the index contains at most one iterable, and otherwise contains only slices and integers, -orthogonal indexing is also available directly on the array:: - - >>> data = np.arange(15).reshape(3, 5) - >>> z = zarr.create_array(store='data/example-16.zarr', shape=data.shape, dtype=data.dtype) - >>> z[:] = data - >>> np.all(z.oindex[[0, 2], :] == z[[0, 2], :]) - np.True_ - -Block Indexing -~~~~~~~~~~~~~~ - -Zarr also support block indexing, which allows selections of whole chunks based on their -logical indices along each dimension of an array. For example, this allows selecting -a subset of chunk aligned rows and/or columns from a 2-dimensional array. E.g.:: - - >>> data = np.arange(100).reshape(10, 10) - >>> z = zarr.create_array(store='data/example-17.zarr', shape=data.shape, dtype=data.dtype, chunks=(3, 3)) - >>> z[:] = data - -Retrieve items by specifying their block coordinates:: - - >>> z.get_block_selection(1) - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - -Equivalent slicing:: - - >>> z[3:6] - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - -For convenience, the block selection functionality is also available via the -`blocks` property, e.g.:: - - >>> z.blocks[1] - array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - -Block index arrays may be multidimensional to index multidimensional arrays. -For example:: - - >>> z.blocks[0, 1:3] - array([[ 3, 4, 5, 6, 7, 8], - [13, 14, 15, 16, 17, 18], - [23, 24, 25, 26, 27, 28]]) - -Data can also be modified. Let's start by a simple 2D array:: - - >>> z = zarr.create_array(store='data/example-18.zarr', shape=(6, 6), dtype=int, chunks=(2, 2)) - -Set data for a selection of items:: - - >>> z.set_block_selection((1, 0), 1) - >>> z[...] 
- array([[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]]) - -For convenience, this functionality is also available via the ``blocks`` property. -E.g.:: - - >>> z.blocks[:, 2] = 7 - >>> z[...] - array([[0, 0, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7], - [1, 1, 0, 0, 7, 7], - [1, 1, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7], - [0, 0, 0, 0, 7, 7]]) - -Any combination of integer and slice can be used for block indexing:: - - >>> z.blocks[2, 1:3] - array([[0, 0, 7, 7], - [0, 0, 7, 7]]) - >>> - >>> root = zarr.create_group('data/example-19.zarr') - >>> foo = root.create_array(name='foo', shape=(1000, 100), chunks=(10, 10), dtype='float32') - >>> bar = root.create_array(name='foo/bar', shape=(100,), dtype='int32') - >>> foo[:, :] = np.random.random((1000, 100)) - >>> bar[:] = np.arange(100) - >>> root.tree() - / - └── foo (1000, 100) float32 - - -.. _user-guide-sharding: - -Sharding --------- - -Using small chunk shapes in very large arrays can lead to a very large number of chunks. -This can become a performance issue for file systems and object storage. -With Zarr format 3, a new sharding feature has been added to address this issue. - -With sharding, multiple chunks can be stored in a single storage object (e.g. a file). -Within a shard, chunks are compressed and serialized separately. -This allows individual chunks to be read independently. -However, when writing data, a full shard must be written in one go for optimal -performance and to avoid concurrency issues. -That means that shards are the units of writing and chunks are the units of reading. -Users need to configure the chunk and shard shapes accordingly. - -Sharded arrays can be created by providing the ``shards`` parameter to :func:`zarr.create_array`. 
- - >>> a = zarr.create_array('data/example-20.zarr', shape=(10000, 10000), shards=(1000, 1000), chunks=(100, 100), dtype='uint8') - >>> a[:] = (np.arange(10000 * 10000) % 256).astype('uint8').reshape(10000, 10000) - >>> a.info_complete() - Type : Array - Zarr format : 3 - Data type : DataType.uint8 - Fill value : 0 - Shape : (10000, 10000) - Shard shape : (1000, 1000) - Chunk shape : (100, 100) - Order : C - Read-only : False - Store type : LocalStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (ZstdCodec(level=0, checksum=False),) - No. bytes : 100000000 (95.4M) - No. bytes stored : 3981552 - Storage ratio : 25.1 - Shards Initialized : 100 - -In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used. -This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total. -Without the ``shards`` argument, there would be 10,000 chunks stored as individual files. - -Missing features in 3.0 ------------------------ - - -The following features have not been ported to 3.0 yet. - -.. _user-guide-objects: - -Object arrays -~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Object arrays `_ for more details. - -.. _user-guide-strings: - -Fixed-length string arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Fixed-length string arrays `_ for more details. - -.. _user-guide-datetime: - -Datetime and Timedelta arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Datetime and Timedelta `_ for more details. - -.. _user-guide-copy: - -Copying and migrating data -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the Zarr-Python 2 documentation on `Copying and migrating data `_ for more details. 
diff --git a/docs/user-guide/attributes.md b/docs/user-guide/attributes.md new file mode 100644 index 0000000000..216720c75f --- /dev/null +++ b/docs/user-guide/attributes.md @@ -0,0 +1,29 @@ +# Working with attributes + +Zarr arrays and groups support custom key/value attributes, which can be useful for +storing application-specific metadata. For example: + +```python +import zarr +store = zarr.storage.MemoryStore() +root = zarr.create_group(store=store) +root.attrs['foo'] = 'bar' +z = root.create_array(name='zzz', shape=(10000, 10000), dtype='int32') +z.attrs['baz'] = 42 +z.attrs['qux'] = [1, 4, 7, 12] +sorted(root.attrs) +# ['foo'] +'foo' in root.attrs +# True +root.attrs['foo'] +# 'bar' +sorted(z.attrs) +# ['baz', 'qux'] +z.attrs['baz'] +# 42 +z.attrs['qux'] +# [1, 4, 7, 12] +``` + +Internally Zarr uses JSON to store array attributes, so attribute values must be +JSON serializable. diff --git a/docs/user-guide/attributes.rst b/docs/user-guide/attributes.rst deleted file mode 100644 index ed48623e29..0000000000 --- a/docs/user-guide/attributes.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _user-guide-attrs: - -Working with attributes -======================= - -Zarr arrays and groups support custom key/value attributes, which can be useful for -storing application-specific metadata. For example:: - - >>> import zarr - >>> store = zarr.storage.MemoryStore() - >>> root = zarr.create_group(store=store) - >>> root.attrs['foo'] = 'bar' - >>> z = root.create_array(name='zzz', shape=(10000, 10000), dtype='int32') - >>> z.attrs['baz'] = 42 - >>> z.attrs['qux'] = [1, 4, 7, 12] - >>> sorted(root.attrs) - ['foo'] - >>> 'foo' in root.attrs - True - >>> root.attrs['foo'] - 'bar' - >>> sorted(z.attrs) - ['baz', 'qux'] - >>> z.attrs['baz'] - 42 - >>> z.attrs['qux'] - [1, 4, 7, 12] - -Internally Zarr uses JSON to store array attributes, so attribute values must be -JSON serializable. 
diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md new file mode 100644 index 0000000000..b471a6c536 --- /dev/null +++ b/docs/user-guide/config.md @@ -0,0 +1,92 @@ +# Runtime configuration + +`zarr.config` is responsible for managing the configuration of zarr and +is based on the [donfig](https://github.com/pytroll/donfig) Python library. + +Configuration values can be set using code like the following: + +```python +import zarr + +zarr.config.set({'array.order': 'F'}) +# + +# revert this change so it doesn't impact the rest of the docs +zarr.config.set({'array.order': 'C'}) +# +``` + +Alternatively, configuration values can be set using environment variables, e.g. +`ZARR_ARRAY__ORDER=F`. + +The configuration can also be read from a YAML file in standard locations. +For more information, see the +[donfig documentation](https://donfig.readthedocs.io/en/latest/). + +Configuration options include the following: + +- Default Zarr format `default_zarr_format` +- Default array order in memory `array.order` +- Default filters, serializers and compressors, e.g. `array.v3_default_filters`, `array.v3_default_serializer`, `array.v3_default_compressors`, `array.v2_default_filters` and `array.v2_default_compressor` +- Whether empty chunks are written to storage `array.write_empty_chunks` +- Async and threading options, e.g. `async.concurrency` and `threading.max_workers` +- Selections of implementations of codecs, codec pipelines and buffers +- Enabling GPU support with `zarr.config.enable_gpu()`. See [GPU support](gpu.md) for more. + +For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, +first register the implementations in the registry and then select them in the config. +For example, an implementation of the bytes codec in a class `'custompackage.NewBytesCodec'`, +requires the value of `codecs.bytes` to be `'custompackage.NewBytesCodec'`. 
+ +This is the current default configuration: + +```python +zarr.config.pprint() +# {'array': {'order': 'C', +# 'v2_default_compressor': {'bytes': {'checksum': False, +# 'id': 'zstd', +# 'level': 0}, +# 'numeric': {'checksum': False, +# 'id': 'zstd', +# 'level': 0}, +# 'string': {'checksum': False, +# 'id': 'zstd', +# 'level': 0}}, +# 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], +# 'numeric': None, +# 'raw': None, +# 'string': [{'id': 'vlen-utf8'}]}, +# 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, +# 'level': 0}, +# 'name': 'zstd'}], +# 'numeric': [{'configuration': {'checksum': False, +# 'level': 0}, +# 'name': 'zstd'}], +# 'string': [{'configuration': {'checksum': False, +# 'level': 0}, +# 'name': 'zstd'}]}, +# 'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []}, +# 'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'}, +# 'numeric': {'configuration': {'endian': 'little'}, +# 'name': 'bytes'}, +# 'string': {'name': 'vlen-utf8'}}, +# 'write_empty_chunks': False}, +# 'async': {'concurrency': 10, 'timeout': None}, +# 'buffer': 'zarr.core.buffer.cpu.Buffer', +# 'codec_pipeline': {'batch_size': 1, +# 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, +# 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', +# 'bytes': 'zarr.codecs.bytes.BytesCodec', +# 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', +# 'endian': 'zarr.codecs.bytes.BytesCodec', +# 'gzip': 'zarr.codecs.gzip.GzipCodec', +# 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', +# 'transpose': 'zarr.codecs.transpose.TransposeCodec', +# 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', +# 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', +# 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, +# 'default_zarr_format': 3, +# 'json_indent': 2, +# 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', +# 'threading': {'max_workers': None}} +``` diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst deleted file mode 100644 index 91ffe50b91..0000000000 
--- a/docs/user-guide/config.rst +++ /dev/null @@ -1,91 +0,0 @@ -.. _user-guide-config: - -Runtime configuration -===================== - -``zarr.config`` is responsible for managing the configuration of zarr and -is based on the `donfig `_ Python library. - -Configuration values can be set using code like the following:: - - >>> import zarr - >>> - >>> zarr.config.set({'array.order': 'F'}) - - >>> - >>> # revert this change so it doesn't impact the rest of the docs - >>> zarr.config.set({'array.order': 'C'}) - - -Alternatively, configuration values can be set using environment variables, e.g. -``ZARR_ARRAY__ORDER=F``. - -The configuration can also be read from a YAML file in standard locations. -For more information, see the -`donfig documentation `_. - -Configuration options include the following: - -- Default Zarr format ``default_zarr_version`` -- Default array order in memory ``array.order`` -- Default filters, serializers and compressors, e.g. ``array.v3_default_filters``, ``array.v3_default_serializer``, ``array.v3_default_compressors``, ``array.v2_default_filters`` and ``array.v2_default_compressor`` -- Whether empty chunks are written to storage ``array.write_empty_chunks`` -- Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers`` -- Selections of implementations of codecs, codec pipelines and buffers -- Enabling GPU support with ``zarr.config.enable_gpu()``. See :ref:`user-guide-gpu` for more. - -For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, -first register the implementations in the registry and then select them in the config. -For example, an implementation of the bytes codec in a class ``'custompackage.NewBytesCodec'``, -requires the value of ``codecs.bytes.name`` to be ``'custompackage.NewBytesCodec'``. 
- -This is the current default configuration:: - - >>> zarr.config.pprint() - {'array': {'order': 'C', - 'v2_default_compressor': {'bytes': {'checksum': False, - 'id': 'zstd', - 'level': 0}, - 'numeric': {'checksum': False, - 'id': 'zstd', - 'level': 0}, - 'string': {'checksum': False, - 'id': 'zstd', - 'level': 0}}, - 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], - 'numeric': None, - 'raw': None, - 'string': [{'id': 'vlen-utf8'}]}, - 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'numeric': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}], - 'string': [{'configuration': {'checksum': False, - 'level': 0}, - 'name': 'zstd'}]}, - 'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []}, - 'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'}, - 'numeric': {'configuration': {'endian': 'little'}, - 'name': 'bytes'}, - 'string': {'name': 'vlen-utf8'}}, - 'write_empty_chunks': False}, - 'async': {'concurrency': 10, 'timeout': None}, - 'buffer': 'zarr.core.buffer.cpu.Buffer', - 'codec_pipeline': {'batch_size': 1, - 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, - 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', - 'bytes': 'zarr.codecs.bytes.BytesCodec', - 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', - 'endian': 'zarr.codecs.bytes.BytesCodec', - 'gzip': 'zarr.codecs.gzip.GzipCodec', - 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', - 'transpose': 'zarr.codecs.transpose.TransposeCodec', - 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', - 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', - 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, - 'default_zarr_format': 3, - 'json_indent': 2, - 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', - 'threading': {'max_workers': None}} diff --git a/docs/user-guide/consolidated_metadata.md b/docs/user-guide/consolidated_metadata.md new file mode 100644 index 0000000000..20c1cf5554 --- /dev/null +++ 
b/docs/user-guide/consolidated_metadata.md @@ -0,0 +1,117 @@ +# Consolidated metadata + +!!! warning + The Consolidated Metadata feature in Zarr-Python is considered experimental for v3 + stores. [zarr-specs#309](https://github.com/zarr-developers/zarr-specs/pull/309) + has proposed a formal extension to the v3 specification to support consolidated metadata. + +Zarr-Python implements the [Consolidated Metadata](https://github.com/zarr-developers/zarr-specs/pull/309) for v2 and v3 stores. +Consolidated metadata can reduce the time needed to load the metadata for an +entire hierarchy, especially when the metadata is being served over a network. +Consolidated metadata essentially stores all the metadata for a hierarchy in the +metadata of the root Group. + +## Usage + +If consolidated metadata is present in a Zarr Group's metadata then it is used +by default. The initial read to open the group will need to communicate with +the store (reading from a file for a `zarr.storage.LocalStore`, making a +network request for a `zarr.storage.FsspecStore`). After that, any subsequent +metadata reads to get child Group or Array nodes will *not* require reads from the store. + +In Python, the consolidated metadata is available on the `.consolidated_metadata` +attribute of the `GroupMetadata` object. 
+ +```python +import zarr + +store = zarr.storage.MemoryStore() +group = zarr.create_group(store=store) +group.create_array(shape=(1,), name='a', dtype='float64') +# +group.create_array(shape=(2, 2), name='b', dtype='float64') +# +group.create_array(shape=(3, 3, 3), name='c', dtype='float64') +# +zarr.consolidate_metadata(store) +# +``` + +If we open that group, the Group's metadata has a `zarr.core.group.ConsolidatedMetadata` +that can be used: + +```python +consolidated = zarr.open_group(store=store) +consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata +from pprint import pprint +pprint(dict(sorted(consolidated_metadata.items()))) +# {'a': ArrayV3Metadata(shape=(1,), +# data_type=, +# chunk_grid=RegularChunkGrid(chunk_shape=(1,)), +# chunk_key_encoding=DefaultChunkKeyEncoding(name='default', +# separator='/'), +# fill_value=np.float64(0.0), +# codecs=(BytesCodec(endian=), +# ZstdCodec(level=0, checksum=False)), +# attributes={}, +# dimension_names=None, +# zarr_format=3, +# node_type='array', +# storage_transformers=()), +# 'b': ArrayV3Metadata(shape=(2, 2), +# data_type=, +# chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), +# chunk_key_encoding=DefaultChunkKeyEncoding(name='default', +# separator='/'), +# fill_value=np.float64(0.0), +# codecs=(BytesCodec(endian=), +# ZstdCodec(level=0, checksum=False)), +# attributes={}, +# dimension_names=None, +# zarr_format=3, +# node_type='array', +# storage_transformers=()), +# 'c': ArrayV3Metadata(shape=(3, 3, 3), +# data_type=, +# chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), +# chunk_key_encoding=DefaultChunkKeyEncoding(name='default', +# separator='/'), +# fill_value=np.float64(0.0), +# codecs=(BytesCodec(endian=), +# ZstdCodec(level=0, checksum=False)), +# attributes={}, +# dimension_names=None, +# zarr_format=3, +# node_type='array', +# storage_transformers=())} +``` + +Operations on the group to get children automatically use the consolidated metadata: + +```python +consolidated['a'] 
# no read / HTTP request to the Store is required +# +``` + +With nested groups, the consolidated metadata is available on the children, recursively: + +```python +child = group.create_group('child', attributes={'kind': 'child'}) +grandchild = child.create_group('child', attributes={'kind': 'grandchild'}) +consolidated = zarr.consolidate_metadata(store) + +consolidated['child'].metadata.consolidated_metadata +# ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group')}, kind='inline', must_understand=False) +``` + +## Synchronization and Concurrency + +Consolidated metadata is intended for read-heavy use cases on slowly changing +hierarchies. For hierarchies where new nodes are constantly being added, +removed, or modified, consolidated metadata may not be desirable. + +1. It will add some overhead to each update operation, since the metadata + would need to be re-consolidated to keep it in sync with the store. +2. Readers using consolidated metadata will regularly see a "past" version + of the metadata, at the time they read the root node with its consolidated + metadata. diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst deleted file mode 100644 index 3c015dcfca..0000000000 --- a/docs/user-guide/consolidated_metadata.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. _user-guide-consolidated-metadata: - -Consolidated metadata -===================== - -.. warning:: - The Consolidated Metadata feature in Zarr-Python is considered experimental for v3 - stores. `zarr-specs#309 `_ - has proposed a formal extension to the v3 specification to support consolidated metadata. - -Zarr-Python implements the `Consolidated Metadata`_ for v2 and v3 stores. 
-Consolidated metadata can reduce the time needed to load the metadata for an -entire hierarchy, especially when the metadata is being served over a network. -Consolidated metadata essentially stores all the metadata for a hierarchy in the -metadata of the root Group. - -Usage ------ - -If consolidated metadata is present in a Zarr Group's metadata then it is used -by default. The initial read to open the group will need to communicate with -the store (reading from a file for a :class:`zarr.storage.LocalStore`, making a -network request for a :class:`zarr.storage.FsspecStore`). After that, any subsequent -metadata reads get child Group or Array nodes will *not* require reads from the store. - -In Python, the consolidated metadata is available on the ``.consolidated_metadata`` -attribute of the ``GroupMetadata`` object. - - >>> import zarr - >>> - >>> store = zarr.storage.MemoryStore() - >>> group = zarr.create_group(store=store) - >>> group.create_array(shape=(1,), name='a', dtype='float64') - - >>> group.create_array(shape=(2, 2), name='b', dtype='float64') - - >>> group.create_array(shape=(3, 3, 3), name='c', dtype='float64') - - >>> zarr.consolidate_metadata(store) - - -If we open that group, the Group's metadata has a :class:`zarr.core.group.ConsolidatedMetadata` -that can be used.: - - >>> consolidated = zarr.open_group(store=store) - >>> consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata - >>> from pprint import pprint - >>> pprint(dict(sorted(consolidated_metadata.items()))) - {'a': ArrayV3Metadata(shape=(1,), - data_type=, - chunk_grid=RegularChunkGrid(chunk_shape=(1,)), - chunk_key_encoding=DefaultChunkKeyEncoding(name='default', - separator='/'), - fill_value=np.float64(0.0), - codecs=(BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)), - attributes={}, - dimension_names=None, - zarr_format=3, - node_type='array', - storage_transformers=()), - 'b': ArrayV3Metadata(shape=(2, 2), - data_type=, - 
chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), - chunk_key_encoding=DefaultChunkKeyEncoding(name='default', - separator='/'), - fill_value=np.float64(0.0), - codecs=(BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)), - attributes={}, - dimension_names=None, - zarr_format=3, - node_type='array', - storage_transformers=()), - 'c': ArrayV3Metadata(shape=(3, 3, 3), - data_type=, - chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), - chunk_key_encoding=DefaultChunkKeyEncoding(name='default', - separator='/'), - fill_value=np.float64(0.0), - codecs=(BytesCodec(endian=), - ZstdCodec(level=0, checksum=False)), - attributes={}, - dimension_names=None, - zarr_format=3, - node_type='array', - storage_transformers=())} - -Operations on the group to get children automatically use the consolidated metadata.: - - >>> consolidated['a'] # no read / HTTP request to the Store is required - - -With nested groups, the consolidated metadata is available on the children, recursively.: - - >>> child = group.create_group('child', attributes={'kind': 'child'}) - >>> grandchild = child.create_group('child', attributes={'kind': 'grandchild'}) - >>> consolidated = zarr.consolidate_metadata(store) - >>> - >>> consolidated['child'].metadata.consolidated_metadata - ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group')}, kind='inline', must_understand=False) - -Synchronization and Concurrency -------------------------------- - -Consolidated metadata is intended for read-heavy use cases on slowly changing -hierarchies. For hierarchies where new nodes are constantly being added, -removed, or modified, consolidated metadata may not be desirable. - -1. It will add some overhead to each update operation, since the metadata - would need to be re-consolidated to keep it in sync with the store. -2. 
Readers using consolidated metadata will regularly see a "past" version - of the metadata, at the time they read the root node with its consolidated - metadata. - -.. _Consolidated Metadata: https://github.com/zarr-developers/zarr-specs/pull/309 diff --git a/docs/user-guide/extending.rst b/docs/user-guide/extending.md similarity index 52% rename from docs/user-guide/extending.rst rename to docs/user-guide/extending.md index 7647703fbb..d950794047 100644 --- a/docs/user-guide/extending.rst +++ b/docs/user-guide/extending.md @@ -1,20 +1,17 @@ - -Extending Zarr -============== +# Extending Zarr Zarr-Python 3 was designed to be extensible. This means that you can extend the library by writing custom classes and plugins. Currently, Zarr can be extended in the following ways: -Custom codecs -------------- +## Custom codecs -.. note:: +!!! note This section explains how custom codecs can be created for Zarr format 3 arrays. For Zarr format 2, codecs should subclass the - `numcodecs.abc.Codec `_ + [numcodecs.abc.Codec](https://numcodecs.readthedocs.io/en/stable/abc.html#numcodecs.abc.Codec) base class and register through - `numcodecs.registry.register_codec `_. + [numcodecs.registry.register_codec](https://numcodecs.readthedocs.io/en/stable/registry.html#numcodecs.registry.register_codec). There are three types of codecs in Zarr: - array-to-array @@ -24,68 +21,65 @@ There are three types of codecs in Zarr: Array-to-array codecs are used to transform the array data before serializing to bytes. Examples include delta encoding or scaling codecs. Array-to-bytes codecs are used for serializing the array data to bytes. In Zarr, the main codec to use for numeric arrays -is the :class:`zarr.codecs.BytesCodec`. Bytes-to-bytes codecs transform the serialized bytestreams +is the `zarr.codecs.BytesCodec`. Bytes-to-bytes codecs transform the serialized bytestreams of the array data. 
Examples include compression codecs, such as -:class:`zarr.codecs.GzipCodec`, :class:`zarr.codecs.BloscCodec` or -:class:`zarr.codecs.ZstdCodec`, and codecs that add a checksum to the bytestream, such as -:class:`zarr.codecs.Crc32cCodec`. +`zarr.codecs.GzipCodec`, `zarr.codecs.BloscCodec` or +`zarr.codecs.ZstdCodec`, and codecs that add a checksum to the bytestream, such as +`zarr.codecs.Crc32cCodec`. Custom codecs for Zarr are implemented by subclassing the relevant base class, see -:class:`zarr.abc.codec.ArrayArrayCodec`, :class:`zarr.abc.codec.ArrayBytesCodec` and -:class:`zarr.abc.codec.BytesBytesCodec`. Most custom codecs should implemented the -``_encode_single`` and ``_decode_single`` methods. These methods operate on single chunks -of the array data. Alternatively, custom codecs can implement the ``encode`` and ``decode`` +`zarr.abc.codec.ArrayArrayCodec`, `zarr.abc.codec.ArrayBytesCodec` and +`zarr.abc.codec.BytesBytesCodec`. Most custom codecs should implement the +`_encode_single` and `_decode_single` methods. These methods operate on single chunks +of the array data. Alternatively, custom codecs can implement the `encode` and `decode` methods, which operate on batches of chunks, in case the codec is intended to implement its own batch processing. Custom codecs should also implement the following methods: -- ``compute_encoded_size``, which returns the byte size of the encoded data given the byte - size of the original data. It should raise ``NotImplementedError`` for codecs with +- `compute_encoded_size`, which returns the byte size of the encoded data given the byte + size of the original data. It should raise `NotImplementedError` for codecs with variable-sized outputs, such as compression codecs. -- ``validate`` (optional), which can be used to check that the codec metadata is compatible with the +- `validate` (optional), which can be used to check that the codec metadata is compatible with the array metadata. It should raise errors if not. 
-- ``resolve_metadata`` (optional), which is important for codecs that change the shape, +- `resolve_metadata` (optional), which is important for codecs that change the shape, dtype or fill value of a chunk. -- ``evolve_from_array_spec`` (optional), which can be useful for automatically filling in +- `evolve_from_array_spec` (optional), which can be useful for automatically filling in codec configuration metadata from the array metadata. To use custom codecs in Zarr, they need to be registered using the -`entrypoint mechanism `_. -Commonly, entrypoints are declared in the ``pyproject.toml`` of your package under the -``[project.entry-points."zarr.codecs"]`` section. Zarr will automatically discover and +[entrypoint mechanism](https://packaging.python.org/en/latest/specifications/entry-points/). +Commonly, entrypoints are declared in the `pyproject.toml` of your package under the +`[project.entry-points."zarr.codecs"]` section. Zarr will automatically discover and load all codecs registered with the entrypoint mechanism from imported modules. -.. code-block:: toml - - [project.entry-points."zarr.codecs"] - "custompackage.fancy_codec" = "custompackage:FancyCodec" +```toml +[project.entry-points."zarr.codecs"] +"custompackage.fancy_codec" = "custompackage:FancyCodec" +``` New codecs need to have their own unique identifier. To avoid naming collisions, it is strongly recommended to prefix the codec identifier with a unique name. For example, -the codecs from ``numcodecs`` are prefixed with ``numcodecs.``, e.g. ``numcodecs.delta``. +the codecs from `numcodecs` are prefixed with `numcodecs.`, e.g. `numcodecs.delta`. -.. note:: +!!! note Note that the extension mechanism for the Zarr format 3 is still under development. Requirements for custom codecs including the choice of codec identifiers might change in the future. It is also possible to register codecs as replacements for existing codecs. 
This might be useful for providing specialized implementations, such as GPU-based codecs. In case of -multiple codecs, the :mod:`zarr.core.config` mechanism can be used to select the preferred +multiple codecs, the `zarr.core.config` mechanism can be used to select the preferred implementation. -Custom stores -------------- +## Custom stores Coming soon. -Custom array buffers --------------------- +## Custom array buffers Coming soon. -Other extensions ----------------- +## Other extensions In the future, Zarr will support writing custom custom data types and chunk grids. diff --git a/docs/user-guide/gpu.md b/docs/user-guide/gpu.md new file mode 100644 index 0000000000..994667b515 --- /dev/null +++ b/docs/user-guide/gpu.md @@ -0,0 +1,32 @@ +# Using GPUs with Zarr + +Zarr can use GPUs to accelerate your workload by running +`zarr.config.enable_gpu`. + +!!! note + `zarr-python` currently supports reading the ndarray data into device (GPU) + memory as the final stage of the codec pipeline. Data will still be read into + or copied to host (CPU) memory for encoding and decoding. + + In the future, codecs will be available for compressing and decompressing data on + the GPU, avoiding the need to move data between the host and device for + compression and decompression. + +## Reading data into device memory + +`zarr.config.enable_gpu` configures Zarr to use GPU memory for the data +buffers used internally by Zarr. + +```python +import zarr +import cupy as cp # doctest: +SKIP +zarr.config.enable_gpu() # doctest: +SKIP +store = zarr.storage.MemoryStore() # doctest: +SKIP +z = zarr.create_array( # doctest: +SKIP + store=store, shape=(100, 100), chunks=(10, 10), dtype="float32", +) +type(z[:10, :10]) # doctest: +SKIP +# cupy.ndarray +``` + +Note that the output type is a `cupy.ndarray` rather than a NumPy array. 
diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst deleted file mode 100644 index 4d3492f8bd..0000000000 --- a/docs/user-guide/gpu.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. _user-guide-gpu: - -Using GPUs with Zarr -==================== - -Zarr can use GPUs to accelerate your workload by running -:meth:`zarr.config.enable_gpu`. - -.. note:: - - `zarr-python` currently supports reading the ndarray data into device (GPU) - memory as the final stage of the codec pipeline. Data will still be read into - or copied to host (CPU) memory for encoding and decoding. - - In the future, codecs will be available compressing and decompressing data on - the GPU, avoiding the need to move data between the host and device for - compression and decompression. - -Reading data into device memory -------------------------------- - -:meth:`zarr.config.enable_gpu` configures Zarr to use GPU memory for the data -buffers used internally by Zarr. - -.. code-block:: python - - >>> import zarr - >>> import cupy as cp # doctest: +SKIP - >>> zarr.config.enable_gpu() # doctest: +SKIP - >>> store = zarr.storage.MemoryStore() # doctest: +SKIP - >>> z = zarr.create_array( # doctest: +SKIP - ... store=store, shape=(100, 100), chunks=(10, 10), dtype="float32", - ... ) - >>> type(z[:10, :10]) # doctest: +SKIP - cupy.ndarray - -Note that the output type is a ``cupy.ndarray`` rather than a NumPy array. diff --git a/docs/user-guide/groups.md b/docs/user-guide/groups.md new file mode 100644 index 0000000000..0a19679a7f --- /dev/null +++ b/docs/user-guide/groups.md @@ -0,0 +1,172 @@ +# Working with groups + +Zarr supports hierarchical organization of arrays via groups. As with arrays, +groups can be stored in memory, on disk, or via other storage systems that +support a similar interface. 
+ +To create a group, use the `zarr.group` function: + +```python +import zarr +store = zarr.storage.MemoryStore() +root = zarr.create_group(store=store) +root +# +``` + +Groups have a similar API to the Group class from [h5py](https://www.h5py.org/). For example, groups can contain other groups: + +```python +foo = root.create_group('foo') +bar = foo.create_group('bar') +``` + +Groups can also contain arrays, e.g.: + +```python +z1 = bar.create_array(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') +z1 +# +``` + +Members of a group can be accessed via the suffix notation, e.g.: + +```python +root['foo'] +# +``` + +The '/' character can be used to access multiple levels of the hierarchy in one +call, e.g.: + +```python +root['foo/bar'] +# +root['foo/bar/baz'] +# +``` + +The `zarr.Group.tree` method can be used to print a tree +representation of the hierarchy, e.g.: + +```python +root.tree() +# / +# └── foo +# └── bar +# └── baz (10000, 10000) int32 +``` + +The `zarr.open_group` function provides a convenient way to create or +re-open a group stored in a directory on the file-system, with sub-groups stored in +sub-directories, e.g.: + +```python +root = zarr.open_group('data/group.zarr', mode='w') +root +# + +z = root.create_array(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') +z +# +``` + +For more information on groups see the `zarr.Group` API docs. + +## Batch Group Creation + +You can also create multiple groups concurrently with a single function call. 
`zarr.create_hierarchy` takes +a `zarr.storage.Store` instance and a dict of `key : metadata` pairs, parses that dict, and +writes metadata documents to storage: + +```python +from zarr import create_hierarchy +from zarr.core.group import GroupMetadata +from zarr.storage import LocalStore +node_spec = {'a/b/c': GroupMetadata()} +nodes_created = dict(create_hierarchy(store=LocalStore(root='data'), nodes=node_spec)) +print(sorted(nodes_created.items(), key=lambda kv: len(kv[0]))) +# [('', ), ('a', ), ('a/b', ), ('a/b/c', )] +``` + +Note that we only specified a single group named `a/b/c`, but 4 groups were created. These additional groups +were created to ensure that the desired node `a/b/c` is connected to the root group `''` by a sequence +of intermediate groups. `zarr.create_hierarchy` normalizes the `nodes` keyword argument to +ensure that the resulting hierarchy is complete, i.e. all groups or arrays are connected to the root +of the hierarchy via intermediate groups. + +Because `zarr.create_hierarchy` concurrently creates metadata documents, it's more efficient +than repeated calls to `create_group` or `create_array`, provided you can statically define +the metadata for the groups and arrays you want to create. + +## Array and group diagnostics + +Diagnostic information about arrays and groups is available via the `info` +property. 
E.g.: + +```python +store = zarr.storage.MemoryStore() +root = zarr.group(store=store) +foo = root.create_group('foo') +bar = foo.create_array(name='bar', shape=1000000, chunks=100000, dtype='int64') +bar[:] = 42 +baz = foo.create_array(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='float32') +baz[:] = 4.2 +root.info +# Name : +# Type : Group +# Zarr format : 3 +# Read-only : False +# Store type : MemoryStore +foo.info +# Name : foo +# Type : Group +# Zarr format : 3 +# Read-only : False +# Store type : MemoryStore +bar.info_complete() +# Type : Array +# Zarr format : 3 +# Data type : DataType.int64 +# Fill value : 0 +# Shape : (1000000,) +# Chunk shape : (100000,) +# Order : C +# Read-only : False +# Store type : MemoryStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (ZstdCodec(level=0, checksum=False),) +# No. bytes : 8000000 (7.6M) +# No. bytes stored : 1614 +# Storage ratio : 4956.6 +# Chunks Initialized : 10 +baz.info +# Type : Array +# Zarr format : 3 +# Data type : DataType.float32 +# Fill value : 0.0 +# Shape : (1000, 1000) +# Chunk shape : (100, 100) +# Order : C +# Read-only : False +# Store type : MemoryStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (ZstdCodec(level=0, checksum=False),) +# No. bytes : 4000000 (3.8M) +``` + +Groups also have the `zarr.Group.tree` method, e.g.: + +```python +root.tree() +# / +# └── foo +# ├── bar (1000000,) int64 +# └── baz (1000, 1000) float32 +``` + +!!! note + `zarr.Group.tree` requires the optional [rich](https://rich.readthedocs.io/en/stable/) + dependency. It can be installed with the `[tree]` extra. diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst deleted file mode 100644 index 99234bad4e..0000000000 --- a/docs/user-guide/groups.rst +++ /dev/null @@ -1,172 +0,0 @@ -.. only:: doctest - - >>> import shutil - >>> shutil.rmtree('data', ignore_errors=True) - -.. 
_user-guide-groups: - -Working with groups -=================== - -Zarr supports hierarchical organization of arrays via groups. As with arrays, -groups can be stored in memory, on disk, or via other storage systems that -support a similar interface. - -To create a group, use the :func:`zarr.group` function:: - - >>> import zarr - >>> store = zarr.storage.MemoryStore() - >>> root = zarr.create_group(store=store) - >>> root - - -Groups have a similar API to the Group class from `h5py -`_. For example, groups can contain other groups:: - - >>> foo = root.create_group('foo') - >>> bar = foo.create_group('bar') - -Groups can also contain arrays, e.g.:: - - >>> z1 = bar.create_array(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') - >>> z1 - - -Members of a group can be accessed via the suffix notation, e.g.:: - - >>> root['foo'] - - -The '/' character can be used to access multiple levels of the hierarchy in one -call, e.g.:: - - >>> root['foo/bar'] - - >>> root['foo/bar/baz'] - - -The :func:`zarr.Group.tree` method can be used to print a tree -representation of the hierarchy, e.g.:: - - >>> root.tree() - / - └── foo - └── bar - └── baz (10000, 10000) int32 - - -The :func:`zarr.open_group` function provides a convenient way to create or -re-open a group stored in a directory on the file-system, with sub-groups stored in -sub-directories, e.g.:: - - >>> root = zarr.open_group('data/group.zarr', mode='w') - >>> root - - >>> - >>> z = root.create_array(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') - >>> z - - -.. TODO: uncomment after __enter__ and __exit__ are implemented -.. Groups can be used as context managers (in a ``with`` statement). -.. If the underlying store has a ``close`` method, it will be called on exit. - -For more information on groups see the :class:`zarr.Group` API docs. - -.. 
_user-guide-diagnostics: - -Batch Group Creation --------------------- - -You can also create multiple groups concurrently with a single function call. :func:`zarr.create_hierarchy` takes -a :class:`zarr.storage.Store` instance and a dict of ``key : metadata`` pairs, parses that dict, and -writes metadata documents to storage: - - >>> from zarr import create_hierarchy - >>> from zarr.core.group import GroupMetadata - >>> from zarr.storage import LocalStore - >>> node_spec = {'a/b/c': GroupMetadata()} - >>> nodes_created = dict(create_hierarchy(store=LocalStore(root='data'), nodes=node_spec)) - >>> print(sorted(nodes_created.items(), key=lambda kv: len(kv[0]))) - [('', ), ('a', ), ('a/b', ), ('a/b/c', )] - -Note that we only specified a single group named ``a/b/c``, but 4 groups were created. These additional groups -were created to ensure that the desired node ``a/b/c`` is connected to the root group ``''`` by a sequence -of intermediate groups. :func:`zarr.create_hierarchy` normalizes the ``nodes`` keyword argument to -ensure that the resulting hierarchy is complete, i.e. all groups or arrays are connected to the root -of the hierarchy via intermediate groups. - -Because :func:`zarr.create_hierarchy` concurrently creates metadata documents, it's more efficient -than repeated calls to :func:`create_group` or :func:`create_array`, provided you can statically define -the metadata for the groups and arrays you want to create. - -Array and group diagnostics ---------------------------- - -Diagnostic information about arrays and groups is available via the ``info`` -property. 
E.g.:: - - >>> store = zarr.storage.MemoryStore() - >>> root = zarr.group(store=store) - >>> foo = root.create_group('foo') - >>> bar = foo.create_array(name='bar', shape=1000000, chunks=100000, dtype='int64') - >>> bar[:] = 42 - >>> baz = foo.create_array(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='float32') - >>> baz[:] = 4.2 - >>> root.info - Name : - Type : Group - Zarr format : 3 - Read-only : False - Store type : MemoryStore - >>> foo.info - Name : foo - Type : Group - Zarr format : 3 - Read-only : False - Store type : MemoryStore - >>> bar.info_complete() - Type : Array - Zarr format : 3 - Data type : DataType.int64 - Fill value : 0 - Shape : (1000000,) - Chunk shape : (100000,) - Order : C - Read-only : False - Store type : MemoryStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (ZstdCodec(level=0, checksum=False),) - No. bytes : 8000000 (7.6M) - No. bytes stored : 1614 - Storage ratio : 4956.6 - Chunks Initialized : 10 - >>> baz.info - Type : Array - Zarr format : 3 - Data type : DataType.float32 - Fill value : 0.0 - Shape : (1000, 1000) - Chunk shape : (100, 100) - Order : C - Read-only : False - Store type : MemoryStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (ZstdCodec(level=0, checksum=False),) - No. bytes : 4000000 (3.8M) - -Groups also have the :func:`zarr.Group.tree` method, e.g.:: - - >>> root.tree() - / - └── foo - ├── bar (1000000,) int64 - └── baz (1000, 1000) float32 - - -.. note:: - - :func:`zarr.Group.tree` requires the optional `rich `_ - dependency. It can be installed with the ``[tree]`` extra. diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst deleted file mode 100644 index c50713332b..0000000000 --- a/docs/user-guide/index.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _user-guide: - -User guide -========== - -.. toctree:: - :maxdepth: 1 - - installation - arrays - groups - attributes - storage - config - v3_migration - -Advanced Topics ---------------- - -.. 
toctree:: - :maxdepth: 1 - - performance - consolidated_metadata - extending - gpu - - -.. Coming soon - async diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md new file mode 100644 index 0000000000..486722cd0d --- /dev/null +++ b/docs/user-guide/installation.md @@ -0,0 +1,50 @@ +# Installation + +## Required dependencies + +Required dependencies include: + +- [Python](https://docs.python.org/3/) (3.11 or later) +- [packaging](https://packaging.pypa.io) (22.0 or later) +- [numpy](https://numpy.org) (1.25 or later) +- [numcodecs[crc32c]](https://numcodecs.readthedocs.io) (0.14 or later) +- [typing_extensions](https://typing-extensions.readthedocs.io) (4.9 or later) +- [donfig](https://donfig.readthedocs.io) (0.8 or later) + +## pip + +Zarr is available on [PyPI](https://pypi.org/project/zarr/). Install it using `pip`: + +```console +$ pip install zarr +``` + +There are a number of optional dependency groups you can install for extra functionality. +These can be installed using `pip install "zarr[<extra>]"`, e.g. `pip install "zarr[gpu]"` + +- `gpu`: support for GPUs +- `remote`: support for reading/writing to remote data stores + +Additional optional dependencies include `rich`, `universal_pathlib`. These must be installed separately. + +## conda + +Zarr is also published to [conda-forge](https://conda-forge.org). Install it using `conda`: + +```console +$ conda install -c conda-forge zarr +``` + +Conda does not support optional dependencies, so you will have to manually install any packages +needed to enable extra functionality. + +## Dependency support + +Zarr has endorsed [Scientific-Python SPEC 0](https://scientific-python.org/specs/spec-0000/) and now follows the version support window as outlined below: + +- Python: 36 months after initial release +- Core package dependencies (e.g. NumPy): 24 months after initial release + +## Development + +To install the latest development version of Zarr, see the [contributing guide](../contributing.md). 
diff --git a/docs/user-guide/installation.rst b/docs/user-guide/installation.rst deleted file mode 100644 index a79f0763cb..0000000000 --- a/docs/user-guide/installation.rst +++ /dev/null @@ -1,54 +0,0 @@ -Installation -============ - -Required dependencies ---------------------- - -Required dependencies include: - -- `Python `_ (3.11 or later) -- `packaging `_ (22.0 or later) -- `numpy `_ (1.25 or later) -- `numcodecs[crc32c] `_ (0.14 or later) -- `typing_extensions `_ (4.9 or later) -- `donfig `_ (0.8 or later) - -pip ---- - -Zarr is available on `PyPI `_. Install it using ``pip``: - -.. code-block:: console - - $ pip install zarr - -There are a number of optional dependency groups you can install for extra functionality. -These can be installed using ``pip install "zarr[]"``, e.g. ``pip install "zarr[gpu]"`` - -- ``gpu``: support for GPUs -- ``remote``: support for reading/writing to remote data stores - -Additional optional dependencies include ``rich``, ``universal_pathlib``. These must be installed separately. - -conda ------ - -Zarr is also published to `conda-forge `_. Install it using ``conda``: - -.. code-block:: console - - $ conda install -c conda-forge zarr - -Conda does not support optional dependencies, so you will have to manually install any packages -needed to enable extra functionality. - -Dependency support ------------------- -Zarr has endorsed `Scientific-Python SPEC 0 `_ and now follows the version support window as outlined below: - -- Python: 36 months after initial release -- Core package dependencies (e.g. NumPy): 24 months after initial release - -Development ------------ -To install the latest development version of Zarr, see the :ref:`contributing guide `. 
diff --git a/docs/user-guide/performance.md b/docs/user-guide/performance.md new file mode 100644 index 0000000000..171b04ca08 --- /dev/null +++ b/docs/user-guide/performance.md @@ -0,0 +1,263 @@ +# Optimizing performance + +## Chunk optimizations + +### Chunk size and shape + +In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide +better performance, at least when using the Blosc compression library. + +The optimal chunk shape will depend on how you want to access the data. E.g., +for a 2-dimensional array, if you only ever take slices along the first +dimension, then chunk across the second dimension. If you know you want to chunk +across an entire dimension you can use the full size of that dimension within the +`chunks` argument, e.g.: + +```python +import zarr +z1 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(100, 10000), dtype='int32') +z1.chunks +# (100, 10000) +``` + +Alternatively, if you only ever take slices along the second dimension, then +chunk across the first dimension, e.g.: + +```python +z2 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 100), dtype='int32') +z2.chunks +# (10000, 100) +``` + +If you require reasonable performance for both access patterns then you need to +find a compromise, e.g.: + +```python +z3 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') +z3.chunks +# (1000, 1000) +``` + +If you are feeling lazy, you can let Zarr guess a chunk shape for your data by +providing `chunks='auto'`, although please note that the algorithm for guessing +a chunk shape is based on simple heuristics and may be far from optimal. 
E.g.: + +```python +z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') +z4.chunks +# (625, 625) +``` + +If you know you are always going to be loading the entire array into memory, you +can turn off chunks by providing `chunks` equal to `shape`, in which case there +will be one single chunk for the array: + +```python +z5 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 10000), dtype='int32') +z5.chunks +# (10000, 10000) +``` + +### Sharding + +If you have large arrays but need small chunks to efficiently access the data, you can +use sharding. Sharding provides a mechanism to store multiple chunks in a single +storage object or file. This can be useful because traditional file systems and object +storage systems may have performance issues storing and accessing many files. +Additionally, small files can be inefficient to store if they are smaller than the +block size of the file system. + +Picking a good combination of chunk shape and shard shape is important for performance. +The chunk shape determines what unit of your data can be read independently, while the +shard shape determines what unit of your data can be written efficiently. + +For an example, consider you have a 100 GB array and need to read small chunks of 1 MB. +Without sharding, each chunk would be one file resulting in 100,000 files. That can +already cause performance issues on some file systems. +With sharding, you could use a shard size of 1 GB. This would result in 1000 chunks per +file and 100 files in total, which seems manageable for most storage systems. +You would still be able to read each 1 MB chunk independently, but you would need to +write your data in 1 GB increments. + +To use sharding, you need to specify the `shards` parameter when creating the array. 
+ +```python +z6 = zarr.create_array(store={}, shape=(10000, 10000, 1000), shards=(1000, 1000, 1000), chunks=(100, 100, 100), dtype='uint8') +z6.info +# Type : Array +# Zarr format : 3 +# Data type : DataType.uint8 +# Fill value : 0 +# Shape : (10000, 10000, 1000) +# Shard shape : (1000, 1000, 1000) +# Chunk shape : (100, 100, 100) +# Order : C +# Read-only : False +# Store type : MemoryStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (ZstdCodec(level=0, checksum=False),) +# No. bytes : 100000000000 (93.1G) +``` + +### Chunk memory layout + +The order of bytes **within each chunk** of an array can be changed via the +`order` config option, to use either C or Fortran layout. For +multi-dimensional arrays, these two layouts may provide different compression +ratios, depending on the correlation structure within the data. E.g.: + +```python +import numpy as np + +a = np.arange(100000000, dtype='int32').reshape(10000, 10000).T +c = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype, config={'order': 'C'}) +c[:] = a +c.info_complete() +# Type : Array +# Zarr format : 3 +# Data type : DataType.int32 +# Fill value : 0 +# Shape : (10000, 10000) +# Chunk shape : (1000, 1000) +# Order : C +# Read-only : False +# Store type : MemoryStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (ZstdCodec(level=0, checksum=False),) +# No. bytes : 400000000 (381.5M) +# No. 
bytes stored : 342588911 +# Storage ratio : 1.2 +# Chunks Initialized : 100 + +with zarr.config.set({'array.order': 'F'}): + f = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype) + f[:] = a +f.info_complete() +# Type : Array +# Zarr format : 3 +# Data type : DataType.int32 +# Fill value : 0 +# Shape : (10000, 10000) +# Chunk shape : (1000, 1000) +# Order : F +# Read-only : False +# Store type : MemoryStore +# Filters : () +# Serializer : BytesCodec(endian=) +# Compressors : (ZstdCodec(level=0, checksum=False),) +# No. bytes : 400000000 (381.5M) +# No. bytes stored : 342588911 +# Storage ratio : 1.2 +# Chunks Initialized : 100 +``` + +In the above example, Fortran order gives a better compression ratio. This is an +artificial example but illustrates the general point that changing the order of +bytes within chunks of an array may improve the compression ratio, depending on +the structure of the data, the compression algorithm used, and which compression +filters (e.g., byte-shuffle) have been applied. + +### Empty chunks + +It is possible to configure how Zarr handles the storage of chunks that are "empty" +(i.e., every element in the chunk is equal to the array's fill value). When creating +an array with `write_empty_chunks=False`, Zarr will check whether a chunk is empty before compression and storage. If a chunk is empty, +then Zarr does not store it, and instead deletes the chunk from storage +if the chunk had been previously stored. + +This optimization prevents storing redundant objects and can speed up reads, but the cost is +added computation during array writes, since the contents of +each chunk must be compared to the fill value, and these advantages are contingent on the content of the array. +If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. 
+In this case, creating an array with `write_empty_chunks=True` (the default) will instruct Zarr to write every chunk without checking for emptiness. + +The following example illustrates the effect of the `write_empty_chunks` flag on +the time required to write an array with different values: + +```python +import zarr +import numpy as np +import time + +def timed_write(write_empty_chunks): + """ + Measure the time required and number of objects created when writing + to a Zarr array with random ints or fill value. + """ + chunks = (8192,) + shape = (chunks[0] * 1024,) + data = np.random.randint(0, 255, shape) + dtype = 'uint8' + arr = zarr.create_array( + f'data/example-{write_empty_chunks}.zarr', + shape=shape, + chunks=chunks, + dtype=dtype, + fill_value=0, + config={'write_empty_chunks': write_empty_chunks} + ) + # initialize all chunks + arr[:] = 100 + result = [] + for value in (data, arr.fill_value): + start = time.time() + arr[:] = value + elapsed = time.time() - start + result.append((elapsed, arr.nchunks_initialized)) + return result + +# log results +for write_empty_chunks in (True, False): + full, empty = timed_write(write_empty_chunks) + print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') +# write_empty_chunks=True: +# Random Data: ..., 1024 objects stored +# Empty Data: ...s, 1024 objects stored +# +# write_empty_chunks=False: +# Random Data: ...s, 1024 objects stored +# Empty Data: ...s, 0 objects stored +``` + +In this example, writing random data is slightly slower with `write_empty_chunks=False`, +but writing empty data is substantially faster and generates far fewer objects in storage. + +### Changing chunk shapes (rechunking) + +Coming soon. + +## Parallel computing and synchronization + +Coming soon. + +## Pickle support + +Zarr arrays and groups can be pickled, as long as the underlying store object can be +pickled. 
With the exception of the `zarr.storage.MemoryStore`, any of the +storage classes provided in the `zarr.storage` module can be pickled. + +If an array or group is backed by a persistent store such as a `zarr.storage.LocalStore`, +`zarr.storage.ZipStore` or `zarr.storage.FsspecStore` then the store data +**are not** pickled. The only thing that is pickled is the necessary parameters to allow the store +to re-open any underlying files or databases upon being unpickled. + +E.g., pickle/unpickle a local store array: + +```python +import pickle +data = np.arange(100000) +z1 = zarr.create_array(store='data/example-2.zarr', shape=data.shape, chunks=data.shape, dtype=data.dtype) +z1[:] = data +s = pickle.dumps(z1) +z2 = pickle.loads(s) +z1 == z2 +# True +np.all(z1[:] == z2[:]) +# np.True_ +``` + +## Configuring Blosc + +Coming soon. diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst deleted file mode 100644 index 88329f11b8..0000000000 --- a/docs/user-guide/performance.rst +++ /dev/null @@ -1,278 +0,0 @@ -.. only:: doctest - - >>> import shutil - >>> shutil.rmtree('data', ignore_errors=True) - -.. _user-guide-performance: - -Optimizing performance -====================== - -.. _user-guide-chunks: - -Chunk optimizations -------------------- - -.. _user-guide-chunks-shape: - -Chunk size and shape -~~~~~~~~~~~~~~~~~~~~ - -In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide -better performance, at least when using the Blosc compression library. - -The optimal chunk shape will depend on how you want to access the data. E.g., -for a 2-dimensional array, if you only ever take slices along the first -dimension, then chunk across the second dimension. 
If you know you want to chunk -across an entire dimension you can use the full size of that dimension within the -``chunks`` argument, e.g.:: - - >>> import zarr - >>> z1 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(100, 10000), dtype='int32') - >>> z1.chunks - (100, 10000) - -Alternatively, if you only ever take slices along the second dimension, then -chunk across the first dimension, e.g.:: - - >>> z2 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 100), dtype='int32') - >>> z2.chunks - (10000, 100) - -If you require reasonable performance for both access patterns then you need to -find a compromise, e.g.:: - - >>> z3 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32') - >>> z3.chunks - (1000, 1000) - -If you are feeling lazy, you can let Zarr guess a chunk shape for your data by -providing ``chunks='auto'``, although please note that the algorithm for guessing -a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: - - >>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32') - >>> z4.chunks - (625, 625) - -If you know you are always going to be loading the entire array into memory, you -can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there -will be one single chunk for the array:: - - >>> z5 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 10000), dtype='int32') - >>> z5.chunks - (10000, 10000) - - -Sharding -~~~~~~~~ - -If you have large arrays but need small chunks to efficiently access the data, you can -use sharding. Sharding provides a mechanism to store multiple chunks in a single -storage object or file. This can be useful because traditional file systems and object -storage systems may have performance issues storing and accessing many files. -Additionally, small files can be inefficient to store if they are smaller than the -block size of the file system. 
- -Picking a good combination of chunk shape and shard shape is important for performance. -The chunk shape determines what unit of your data can be read independently, while the -shard shape determines what unit of your data can be written efficiently. - -For an example, consider you have a 100 GB array and need to read small chunks of 1 MB. -Without sharding, each chunk would be one file resulting in 100,000 files. That can -already cause performance issues on some file systems. -With sharding, you could use a shard size of 1 GB. This would result in 1000 chunks per -file and 100 files in total, which seems manageable for most storage systems. -You would still be able to read each 1 MB chunk independently, but you would need to -write your data in 1 GB increments. - -To use sharding, you need to specify the ``shards`` parameter when creating the array. - - >>> z6 = zarr.create_array(store={}, shape=(10000, 10000, 1000), shards=(1000, 1000, 1000), chunks=(100, 100, 100), dtype='uint8') - >>> z6.info - Type : Array - Zarr format : 3 - Data type : DataType.uint8 - Fill value : 0 - Shape : (10000, 10000, 1000) - Shard shape : (1000, 1000, 1000) - Chunk shape : (100, 100, 100) - Order : C - Read-only : False - Store type : MemoryStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (ZstdCodec(level=0, checksum=False),) - No. bytes : 100000000000 (93.1G) - -.. _user-guide-chunks-order: - -Chunk memory layout -~~~~~~~~~~~~~~~~~~~ - -The order of bytes **within each chunk** of an array can be changed via the -``order`` config option, to use either C or Fortran layout. For -multi-dimensional arrays, these two layouts may provide different compression -ratios, depending on the correlation structure within the data. 
E.g.:: - - >>> import numpy as np - >>> - >>> a = np.arange(100000000, dtype='int32').reshape(10000, 10000).T - >>> c = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype, config={'order': 'C'}) - >>> c[:] = a - >>> c.info_complete() - Type : Array - Zarr format : 3 - Data type : DataType.int32 - Fill value : 0 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : C - Read-only : False - Store type : MemoryStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (ZstdCodec(level=0, checksum=False),) - No. bytes : 400000000 (381.5M) - No. bytes stored : 342588911 - Storage ratio : 1.2 - Chunks Initialized : 100 - >>> with zarr.config.set({'array.order': 'F'}): - ... f = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype) - ... f[:] = a - >>> f.info_complete() - Type : Array - Zarr format : 3 - Data type : DataType.int32 - Fill value : 0 - Shape : (10000, 10000) - Chunk shape : (1000, 1000) - Order : F - Read-only : False - Store type : MemoryStore - Filters : () - Serializer : BytesCodec(endian=) - Compressors : (ZstdCodec(level=0, checksum=False),) - No. bytes : 400000000 (381.5M) - No. bytes stored : 342588911 - Storage ratio : 1.2 - Chunks Initialized : 100 - -In the above example, Fortran order gives a better compression ratio. This is an -artificial example but illustrates the general point that changing the order of -bytes within chunks of an array may improve the compression ratio, depending on -the structure of the data, the compression algorithm used, and which compression -filters (e.g., byte-shuffle) have been applied. - -.. _user-guide-chunks-empty-chunks: - -Empty chunks -~~~~~~~~~~~~ - -It is possible to configure how Zarr handles the storage of chunks that are "empty" -(i.e., every element in the chunk is equal to the array's fill value). When creating -an array with ``write_empty_chunks=False``, Zarr will check whether a chunk is empty before compression and storage. 
If a chunk is empty, -then Zarr does not store it, and instead deletes the chunk from storage -if the chunk had been previously stored. - -This optimization prevents storing redundant objects and can speed up reads, but the cost is -added computation during array writes, since the contents of -each chunk must be compared to the fill value, and these advantages are contingent on the content of the array. -If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. -In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct Zarr to write every chunk without checking for emptiness. - -The following example illustrates the effect of the ``write_empty_chunks`` flag on -the time required to write an array with different values.:: - - >>> import zarr - >>> import numpy as np - >>> import time - >>> - >>> def timed_write(write_empty_chunks): - ... """ - ... Measure the time required and number of objects created when writing - ... to a Zarr array with random ints or fill value. - ... """ - ... chunks = (8192,) - ... shape = (chunks[0] * 1024,) - ... data = np.random.randint(0, 255, shape) - ... dtype = 'uint8' - ... arr = zarr.create_array( - ... f'data/example-{write_empty_chunks}.zarr', - ... shape=shape, - ... chunks=chunks, - ... dtype=dtype, - ... fill_value=0, - ... config={'write_empty_chunks': write_empty_chunks} - ... ) - ... # initialize all chunks - ... arr[:] = 100 - ... result = [] - ... for value in (data, arr.fill_value): - ... start = time.time() - ... arr[:] = value - ... elapsed = time.time() - start - ... result.append((elapsed, arr.nchunks_initialized)) - ... return result - ... # log results - >>> for write_empty_chunks in (True, False): - ... full, empty = timed_write(write_empty_chunks) - ... 
print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') - write_empty_chunks=True: - Random Data: ..., 1024 objects stored - Empty Data: ...s, 1024 objects stored - - write_empty_chunks=False: - Random Data: ...s, 1024 objects stored - Empty Data: ...s, 0 objects stored - - -In this example, writing random data is slightly slower with ``write_empty_chunks=True``, -but writing empty data is substantially faster and generates far fewer objects in storage. - -.. _user-guide-rechunking: - -Changing chunk shapes (rechunking) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Coming soon. - -.. _user-guide-sync: - -Parallel computing and synchronization --------------------------------------- - -Coming soon. - -.. _user-guide-pickle: - -Pickle support --------------- - -Zarr arrays and groups can be pickled, as long as the underlying store object can be -pickled. With the exception of the :class:`zarr.storage.MemoryStore`, any of the -storage classes provided in the :mod:`zarr.storage` module can be pickled. - -If an array or group is backed by a persistent store such as the a :class:`zarr.storage.LocalStore`, -:class:`zarr.storage.ZipStore` or :class:`zarr.storage.FsspecStore` then the store data -**are not** pickled. The only thing that is pickled is the necessary parameters to allow the store -to re-open any underlying files or databases upon being unpickled. - -E.g., pickle/unpickle an local store array:: - - >>> import pickle - >>> data = np.arange(100000) - >>> z1 = zarr.create_array(store='data/example-2.zarr', shape=data.shape, chunks=data.shape, dtype=data.dtype) - >>> z1[:] = data - >>> s = pickle.dumps(z1) - >>> z2 = pickle.loads(s) - >>> z1 == z2 - True - >>> np.all(z1[:] == z2[:]) - np.True_ - -.. _user-guide-tips-blosc: - -Configuring Blosc ------------------ - -Coming soon. 
diff --git a/docs/user-guide/storage.md b/docs/user-guide/storage.md new file mode 100644 index 0000000000..7b9c8ff0a2 --- /dev/null +++ b/docs/user-guide/storage.md @@ -0,0 +1,140 @@ +# Storage guide + +Zarr-Python supports multiple storage backends, including: local file systems, +Zip files, remote stores via [fsspec](https://filesystem-spec.readthedocs.io) (S3, HTTP, etc.), and in-memory stores. In +Zarr-Python 3, stores must implement the abstract store API from +`zarr.abc.store.Store`. + +!!! note + Unlike Zarr-Python 2 where the store interface was built around a generic `MutableMapping` + API, Zarr-Python 3 utilizes a custom store API that utilizes Python's AsyncIO library. + +## Implicit Store Creation + +In most cases, it is not required to create a `Store` object explicitly. Passing a string +to Zarr's top level API will result in the store being created automatically: + +```python +import zarr + +# Implicitly create a writable LocalStore +zarr.create_group(store='data/foo/bar') +# + +# Implicitly create a read-only FsspecStore +zarr.open_group( + store='s3://noaa-nwm-retro-v2-zarr-pds', + mode='r', + storage_options={'anon': True} +) +# > + +# Implicitly creates a MemoryStore +data = {} +zarr.create_group(store=data) +# +``` + +## Explicit Store Creation + +In some cases, it may be helpful to create a store instance directly. Zarr-Python offers five +built-in stores: `zarr.storage.LocalStore`, `zarr.storage.FsspecStore`, +`zarr.storage.ZipStore`, `zarr.storage.MemoryStore`, and `zarr.storage.ObjectStore`. + +### Local Store + +The `zarr.storage.LocalStore` stores data in a nested set of directories on a local +filesystem: + +```python +store = zarr.storage.LocalStore('data/foo/bar', read_only=True) +zarr.open_group(store=store, mode='r') +# +``` + +### Zip Store + +The `zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single +Zip file. 
The [Zip Store specification](https://github.com/zarr-developers/zarr-specs/pull/311) is currently in draft form: + +```python +store = zarr.storage.ZipStore('data.zip', mode='w') +zarr.create_array(store=store, shape=(2,), dtype='float64') +# +``` + +### Remote Store + +The `zarr.storage.FsspecStore` stores the contents of a Zarr hierarchy following the same +logical layout as the `LocalStore`, except the store is assumed to be on a remote storage system +such as cloud object storage (e.g. AWS S3, Google Cloud Storage, Azure Blob Storage). The +`zarr.storage.FsspecStore` is backed by [fsspec](https://filesystem-spec.readthedocs.io) and can support any backend +that implements the [AbstractFileSystem](https://filesystem-spec.readthedocs.io/en/stable/api.html#fsspec.spec.AbstractFileSystem) +API. `storage_options` can be used to configure the fsspec backend: + +```python +store = zarr.storage.FsspecStore.from_url( + 's3://noaa-nwm-retro-v2-zarr-pds', + read_only=True, + storage_options={'anon': True} +) +zarr.open_group(store=store, mode='r') +# > +``` + +### Memory Store + +The `zarr.storage.MemoryStore` is an in-memory store that allows for serialization of +Zarr data (metadata and chunks) to a dictionary: + +```python +data = {} +store = zarr.storage.MemoryStore(data) +# TODO: replace with create_array after #2463 +zarr.create_array(store=store, shape=(2,), dtype='float64') +# +``` + +### Object Store + +`zarr.storage.ObjectStore` stores the contents of the Zarr hierarchy using any ObjectStore +[storage implementation](https://developmentseed.org/obstore/latest/api/store/), including AWS S3 (`obstore.store.S3Store`), Google Cloud Storage (`obstore.store.GCSStore`), and Azure Blob Storage (`obstore.store.AzureStore`). This store is backed by [obstore](https://developmentseed.org/obstore/latest/), which +builds on the production quality Rust library [object_store](https://docs.rs/object_store/latest/object_store/). 
+ +```python +from zarr.storage import ObjectStore +from obstore.store import MemoryStore + +store = ObjectStore(MemoryStore()) +zarr.create_array(store=store, shape=(2,), dtype='float64') +# +``` + +Here's an example of using ObjectStore for accessing remote data: + +```python +from zarr.storage import ObjectStore +from obstore.store import S3Store + +s3_store = S3Store('noaa-nwm-retro-v2-zarr-pds', skip_signature=True, region="us-west-2") +store = zarr.storage.ObjectStore(store=s3_store, read_only=True) +group = zarr.open_group(store=store, mode='r') +group.info +# Name : +# Type : Group +# Zarr format : 2 +# Read-only : True +# Store type : ObjectStore +# No. members : 12 +# No. arrays : 12 +# No. groups : 0 +``` + +!!! warning + The `zarr.storage.ObjectStore` class is experimental. + +## Developing custom stores + +Zarr-Python `zarr.abc.store.Store` API is meant to be extended. The Store Abstract Base +Class includes all of the methods needed to be a fully operational store in Zarr Python. +Zarr also provides a test harness for custom stores: `zarr.testing.store.StoreTests`. diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst deleted file mode 100644 index 4215cbaf20..0000000000 --- a/docs/user-guide/storage.rst +++ /dev/null @@ -1,148 +0,0 @@ -.. only:: doctest - - >>> import shutil - >>> shutil.rmtree('data', ignore_errors=True) - -.. _user-guide-storage: - -Storage guide -============= - -Zarr-Python supports multiple storage backends, including: local file systems, -Zip files, remote stores via fsspec_ (S3, HTTP, etc.), and in-memory stores. In -Zarr-Python 3, stores must implement the abstract store API from -:class:`zarr.abc.store.Store`. - -.. note:: - Unlike Zarr-Python 2 where the store interface was built around a generic ``MutableMapping`` - API, Zarr-Python 3 utilizes a custom store API that utilizes Python's AsyncIO library. 
- -Implicit Store Creation ------------------------ - -In most cases, it is not required to create a ``Store`` object explicitly. Passing a string -to Zarr's top level API will result in the store being created automatically.: - - >>> import zarr - >>> - >>> # Implicitly create a writable LocalStore - >>> zarr.create_group(store='data/foo/bar') - - >>> - >>> # Implicitly create a read-only FsspecStore - >>> zarr.open_group( - ... store='s3://noaa-nwm-retro-v2-zarr-pds', - ... mode='r', - ... storage_options={'anon': True} - ... ) - > - >>> - >>> # Implicitly creates a MemoryStore - >>> data = {} - >>> zarr.create_group(store=data) - - -Explicit Store Creation ------------------------ - -In some cases, it may be helpful to create a store instance directly. Zarr-Python offers four -built-in store: :class:`zarr.storage.LocalStore`, :class:`zarr.storage.FsspecStore`, -:class:`zarr.storage.ZipStore`, :class:`zarr.storage.MemoryStore`, and :class:`zarr.storage.ObjectStore`. - -Local Store -~~~~~~~~~~~ - -The :class:`zarr.storage.LocalStore` stores data in a nested set of directories on a local -filesystem.: - - >>> store = zarr.storage.LocalStore('data/foo/bar', read_only=True) - >>> zarr.open_group(store=store, mode='r') - - -Zip Store -~~~~~~~~~ - -The :class:`zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single -Zip file. The `Zip Store specification`_ is currently in draft form.: - - >>> store = zarr.storage.ZipStore('data.zip', mode='w') - >>> zarr.create_array(store=store, shape=(2,), dtype='float64') - - -Remote Store -~~~~~~~~~~~~ - -The :class:`zarr.storage.FsspecStore` stores the contents of a Zarr hierarchy in following the same -logical layout as the ``LocalStore``, except the store is assumed to be on a remote storage system -such as cloud object storage (e.g. AWS S3, Google Cloud Storage, Azure Blob Store). 
The -:class:`zarr.storage.FsspecStore` is backed by `fsspec`_ and can support any backend -that implements the `AbstractFileSystem `_ -API. ``storage_options`` can be used to configure the fsspec backend.: - - >>> store = zarr.storage.FsspecStore.from_url( - ... 's3://noaa-nwm-retro-v2-zarr-pds', - ... read_only=True, - ... storage_options={'anon': True} - ... ) - >>> zarr.open_group(store=store, mode='r') - > - -Memory Store -~~~~~~~~~~~~ - -The :class:`zarr.storage.MemoryStore` a in-memory store that allows for serialization of -Zarr data (metadata and chunks) to a dictionary.: - - >>> data = {} - >>> store = zarr.storage.MemoryStore(data) - >>> # TODO: replace with create_array after #2463 - >>> zarr.create_array(store=store, shape=(2,), dtype='float64') - - -Object Store -~~~~~~~~~~~~ - -:class:`zarr.storage.ObjectStore` stores the contents of the Zarr hierarchy using any ObjectStore -`storage implementation `_, including AWS S3 (:class:`obstore.store.S3Store`), Google Cloud Storage (:class:`obstore.store.GCSStore`), and Azure Blob Storage (:class:`obstore.store.AzureStore`). This store is backed by `obstore `_, which -builds on the production quality Rust library `object_store `_. - - - >>> from zarr.storage import ObjectStore - >>> from obstore.store import MemoryStore - >>> - >>> store = ObjectStore(MemoryStore()) - >>> zarr.create_array(store=store, shape=(2,), dtype='float64') - - -Here's an example of using ObjectStore for accessing remote data: - - >>> from zarr.storage import ObjectStore - >>> from obstore.store import S3Store - >>> - >>> s3_store = S3Store('noaa-nwm-retro-v2-zarr-pds', skip_signature=True, region="us-west-2") - >>> store = zarr.storage.ObjectStore(store=s3_store, read_only=True) - >>> group = zarr.open_group(store=store, mode='r') - >>> group.info - Name : - Type : Group - Zarr format : 2 - Read-only : True - Store type : ObjectStore - No. members : 12 - No. arrays : 12 - No. groups : 0 - -.. 
warning:: - The :class:`zarr.storage.ObjectStore` class is experimental. - -.. _user-guide-custom-stores: - -Developing custom stores ------------------------- - -Zarr-Python :class:`zarr.abc.store.Store` API is meant to be extended. The Store Abstract Base -Class includes all of the methods needed to be a fully operational store in Zarr Python. -Zarr also provides a test harness for custom stores: :class:`zarr.testing.store.StoreTests`. - -.. _Zip Store Specification: https://github.com/zarr-developers/zarr-specs/pull/311 -.. _fsspec: https://filesystem-spec.readthedocs.io diff --git a/docs/user-guide/v3_migration.md b/docs/user-guide/v3_migration.md new file mode 100644 index 0000000000..9f98b95f22 --- /dev/null +++ b/docs/user-guide/v3_migration.md @@ -0,0 +1,220 @@ +# 3.0 Migration Guide + +Zarr-Python 3 represents a major refactor of the Zarr-Python codebase. Some of the +goals motivating this refactor included: + +* adding support for the Zarr format 3 specification (along with the Zarr format 2 specification) +* cleaning up internal and user facing APIs +* improving performance (particularly in high latency storage environments like + cloud object stores) + +To accommodate this, Zarr-Python 3 introduces a number of changes to the API, including a number +of significant breaking changes and deprecations. + +This page provides a guide explaining breaking changes and deprecations to help you +migrate your code from version 2 to version 3. If we have missed anything, please +open a [GitHub issue](https://github.com/zarr-developers/zarr-python/issues/new) +so we can improve this guide. + +## Compatibility target + +The goals described above necessitated some breaking changes to the API (hence the +major version update), but where possible we have maintained backwards compatibility +in the most widely used parts of the API. This includes the `zarr.Array` and +`zarr.Group` classes and the "top-level API" (e.g. `zarr.open_array` and +`zarr.open_group`). 
+ +## Getting ready for 3.0 + +Before migrating to Zarr-Python 3, we suggest projects that depend on Zarr-Python take +the following actions in order: + +1. Pin the supported Zarr-Python version to `zarr>=2,<3`. This is a best practice + and will protect your users from any incompatibilities that may arise during the + release of Zarr-Python 3. This pin can be removed after migrating to Zarr-Python 3. +2. Limit your imports from the Zarr-Python package. Most of the primary API `zarr.*` + will be compatible in Zarr-Python 3. However, the following breaking API changes are + planned: + + - `numcodecs.*` will no longer be available in `zarr.*`. To migrate, import codecs + directly from `numcodecs`: + + ```python + from numcodecs import Blosc + # instead of: + # from zarr import Blosc + ``` + + - The `zarr.v3_api_available` feature flag is being removed. In Zarr-Python 3 + the v3 API is always available, so you shouldn't need to use this flag. + - The following internal modules are being removed or significantly changed. If + your application relies on imports from any of the below modules, you will need + to either a) modify your application to no longer rely on these imports or b) + vendor the parts of the specific modules that you need. + + * `zarr.attrs` has gone, with no replacement + * `zarr.codecs` has gone, use `numcodecs` instead + * `zarr.context` has gone, with no replacement + * `zarr.core` remains but should be considered private API + * `zarr.hierarchy` has gone, with no replacement (use `zarr.Group` in place of `zarr.hierarchy.Group`) + * `zarr.indexing` has gone, with no replacement + * `zarr.meta` has gone, with no replacement + * `zarr.meta_v1` has gone, with no replacement + * `zarr.sync` has gone, with no replacement + * `zarr.types` has gone, with no replacement + * `zarr.util` has gone, with no replacement + * `zarr.n5` has gone, see below for alternative N5 options + +3. Test that your package works with version 3. +4. 
Update the pin to include `zarr>=3,<4`. + +## Zarr-Python 2 support window + +Zarr-Python 2.x is still available, though we recommend migrating to Zarr-Python 3 for +its performance improvements and new features. Security and bug fixes will be made to +the 2.x series for at least six months following the first Zarr-Python 3 release. +If you need to use the latest Zarr-Python 2 release, you can install it with: + +```console +$ pip install "zarr==2.*" +``` + +!!! note + Development and maintenance of the 2.x release series has moved to the + [support/v2](https://github.com/zarr-developers/zarr-python/tree/support/v2) branch. + Issues and pull requests related to this branch are tagged with the + [V2](https://github.com/zarr-developers/zarr-python/labels/V2) label. + +## Migrating to Zarr-Python 3 + +The following sections provide details on breaking changes in Zarr-Python 3. + +### The Array class + +1. Disallow direct construction - the signature for initializing the `Array` class has changed + significantly. Please use `zarr.create_array` or `zarr.open_array` instead of + directly constructing the `zarr.Array` class. + +2. Defaulting to `zarr_format=3` - newly created arrays will use the version 3 of the + Zarr specification. To continue using version 2, set `zarr_format=2` when creating arrays + or set `default_zarr_version=2` in Zarr's runtime configuration. + +### The Group class + +1. Disallow direct construction - use `zarr.open_group` or `zarr.create_group` + instead of directly constructing the `zarr.Group` class. +2. Most of the h5py compatibility methods are deprecated and will issue warnings if used. + The following functions are drop in replacements that have the same signature and functionality: + + - Use `zarr.Group.create_array` in place of `zarr.Group.create_dataset` + - Use `zarr.Group.require_array` in place of `zarr.Group.require_dataset` +3. Disallow "." syntax for getting group members. 
To get a member of a group named `foo`, + use `group["foo"]` in place of `group.foo`. + +### The Store class + +The Store API has changed significantly in Zarr-Python 3. The most notable changes to the +Store API are: + +#### Store Import Paths + +Several store implementations have moved from the top-level module to `zarr.storage`: + +```diff title="Store import changes from v2 to v3" +# Before (v2) +- from zarr import MemoryStore, DirectoryStore ++ from zarr.storage import MemoryStore, LocalStore # LocalStore replaces DirectoryStore +``` + +Common replacements: + +| v2 Import | v3 Import | +|-------------------------|------------------------------------| +| `zarr.MemoryStore` | `zarr.storage.MemoryStore` | +| `zarr.DirectoryStore` | `zarr.storage.LocalStore` | +| `zarr.TempStore` | Use `tempfile.TemporaryDirectory` | +| | with `LocalStore` | + +1. Replaced the `MutableMapping` base class in favor of a custom abstract base class + (`zarr.abc.store.Store`). +2. Switched to an asynchronous interface for all store methods that result in IO. This + change ensures that all store methods are non-blocking and are as performant as + possible. + +Beyond the changed store interface, a number of deprecated stores were also removed in +Zarr-Python 3. See issue #1274 for more details on the removal of these stores. + +- `N5Store` - see https://github.com/zarr-developers/n5py for an alternative interface to + N5 formatted data. +- `ABSStore` - use the `zarr.storage.FsspecStore` instead along with fsspec's + [adlfs backend](https://github.com/fsspec/adlfs). + +The following stores have been removed altogether. Users who need these stores will have to +implement their own version in zarr-python v3. + +- `DBMStore` +- `LMDBStore` +- `SQLiteStore` +- `MongoDBStore` +- `RedisStore` + +At present, the latter five stores in this list do not have an equivalent in Zarr-Python 3. 
+If you are interested in developing a custom store that targets these backends, see +[developing custom stores](storage.md#developing-custom-stores) or open an +[issue](https://github.com/zarr-developers/zarr-python/issues) to discuss your use case. + +### Dependencies + +When installing using `pip`: + +- The new `remote` dependency group can be used to install a supported version of + `fsspec`, required for remote data access. +- The new `gpu` dependency group can be used to install a supported version of + `cuda`, required for GPU functionality. +- The `jupyter` optional dependency group has been removed, since v3 contains no + jupyter specific functionality. + +### Miscellaneous + +- The keyword argument `zarr_version` available in most creation functions in `zarr` + (e.g. `zarr.create`, `zarr.open`, `zarr.group`, `zarr.array`) has + been deprecated in favor of `zarr_format`. + +## 🚧 Work in Progress 🚧 + +Zarr-Python 3 is still under active development, and is not yet fully complete. +The following list summarizes areas of the codebase that we expect to build out +after the 3.0.0 release. If features listed below are important to your use case +of Zarr-Python, please open (or comment on) a +[GitHub issue](https://github.com/zarr-developers/zarr-python/issues/new). + +- The following functions / methods have not been ported to Zarr-Python 3 yet: + + * `zarr.copy` (issue #2407) + * `zarr.copy_all` (issue #2407) + * `zarr.copy_store` (issue #2407) + * `zarr.Group.move` (issue #2108) + +- The following features (corresponding to function arguments to functions in + `zarr`) have not been ported to Zarr-Python 3 yet. 
Using these features + will raise a warning or a `NotImplementedError`: + + * `cache_attrs` + * `cache_metadata` + * `chunk_store` (issue #2495) + * `meta_array` + * `object_codec` (issue #2617) + * `synchronizer` (issue #1596) + * `dimension_separator` + +- The following features that were supported by Zarr-Python 2 have not been ported + to Zarr-Python 3 yet: + + * Structured arrays / dtypes (issue #2134) + * Fixed-length string dtypes (issue #2347) + * Datetime and timedelta dtypes (issue #2616) + * Object dtypes (issue #2617) + * Ragged arrays (issue #2618) + * Groups and Arrays do not implement `__enter__` and `__exit__` protocols (issue #2619) + * Big Endian dtypes (issue #2324) + * Default filters for object dtypes for Zarr format 2 arrays (issue #2627) diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst deleted file mode 100644 index a6258534e4..0000000000 --- a/docs/user-guide/v3_migration.rst +++ /dev/null @@ -1,238 +0,0 @@ -.. _v3 migration guide: - -3.0 Migration Guide -=================== - -Zarr-Python 3 represents a major refactor of the Zarr-Python codebase. Some of the -goals motivating this refactor included: - -* adding support for the Zarr format 3 specification (along with the Zarr format 2 specification) -* cleaning up internal and user facing APIs -* improving performance (particularly in high latency storage environments like - cloud object stores) - -To accommodate this, Zarr-Python 3 introduces a number of changes to the API, including a number -of significant breaking changes and deprecations. - -This page provides a guide explaining breaking changes and deprecations to help you -migrate your code from version 2 to version 3. If we have missed anything, please -open a `GitHub issue `_ -so we can improve this guide. 
- -Compatibility target --------------------- - -The goals described above necessitated some breaking changes to the API (hence the -major version update), but where possible we have maintained backwards compatibility -in the most widely used parts of the API. This in the :class:`zarr.Array` and -:class:`zarr.Group` classes and the "top-level API" (e.g. :func:`zarr.open_array` and -:func:`zarr.open_group`). - -Getting ready for 3.0 ---------------------- - -Before migrating to Zarr-Python 3, we suggest projects that depend on Zarr-Python take -the following actions in order: - -1. Pin the supported Zarr-Python version to ``zarr>=2,<3``. This is a best practice - and will protect your users from any incompatibilities that may arise during the - release of Zarr-Python 3. This pin can be removed after migrating to Zarr-Python 3. -2. Limit your imports from the Zarr-Python package. Most of the primary API ``zarr.*`` - will be compatible in Zarr-Python 3. However, the following breaking API changes are - planned: - - - ``numcodecs.*`` will no longer be available in ``zarr.*``. To migrate, import codecs - directly from ``numcodecs``: - - .. code-block:: python - - from numcodecs import Blosc - # instead of: - # from zarr import Blosc - - - The ``zarr.v3_api_available`` feature flag is being removed. In Zarr-Python 3 - the v3 API is always available, so you shouldn't need to use this flag. - - The following internal modules are being removed or significantly changed. If - your application relies on imports from any of the below modules, you will need - to either a) modify your application to no longer rely on these imports or b) - vendor the parts of the specific modules that you need. 
- - * ``zarr.attrs`` has gone, with no replacement - * ``zarr.codecs`` has gone, use ``numcodecs`` instead - * ``zarr.context`` has gone, with no replacement - * ``zarr.core`` remains but should be considered private API - * ``zarr.hierarchy`` has gone, with no replacement (use ``zarr.Group`` inplace of ``zarr.hierarchy.Group``) - * ``zarr.indexing`` has gone, with no replacement - * ``zarr.meta`` has gone, with no replacement - * ``zarr.meta_v1`` has gone, with no replacement - * ``zarr.sync`` has gone, with no replacement - * ``zarr.types`` has gone, with no replacement - * ``zarr.util`` has gone, with no replacement - * ``zarr.n5`` has gone, see below for an alternative N5 options - -3. Test that your package works with version 3. -4. Update the pin to include ``zarr>=3,<4``. - -Zarr-Python 2 support window ----------------------------- - -Zarr-Python 2.x is still available, though we recommend migrating to Zarr-Python 3 for -its performance improvements and new features. Security and bug fixes will be made to -the 2.x series for at least six months following the first Zarr-Python 3 release. -If you need to use the latest Zarr-Python 2 release, you can install it with: - -.. code-block:: console - - $ pip install "zarr==2.*" - -.. note:: - Development and maintenance of the 2.x release series has moved to the - `support/v2 `_ branch. - Issues and pull requests related to this branch are tagged with the - `V2 `_ label. - -Migrating to Zarr-Python 3 --------------------------- - -The following sections provide details on breaking changes in Zarr-Python 3. - -The Array class -~~~~~~~~~~~~~~~ - -1. Disallow direct construction - the signature for initializing the ``Array`` class has changed - significantly. Please use :func:`zarr.create_array` or :func:`zarr.open_array` instead of - directly constructing the :class:`zarr.Array` class. - -2. Defaulting to ``zarr_format=3`` - newly created arrays will use the version 3 of the - Zarr specification. 
To continue using version 2, set ``zarr_format=2`` when creating arrays - or set ``default_zarr_version=2`` in Zarr's :ref:`runtime configuration `. - -The Group class -~~~~~~~~~~~~~~~ - -1. Disallow direct construction - use :func:`zarr.open_group` or :func:`zarr.create_group` - instead of directly constructing the :class:`zarr.Group` class. -2. Most of the h5py compatibility methods are deprecated and will issue warnings if used. - The following functions are drop in replacements that have the same signature and functionality: - - - Use :func:`zarr.Group.create_array` in place of :func:`zarr.Group.create_dataset` - - Use :func:`zarr.Group.require_array` in place of :func:`zarr.Group.require_dataset` -3. Disallow "." syntax for getting group members. To get a member of a group named ``foo``, - use ``group["foo"]`` in place of ``group.foo``. - -The Store class -~~~~~~~~~~~~~~~ - -The Store API has changed significant in Zarr-Python 3. The most notable changes to the -Store API are: - -Store Import Paths -^^^^^^^^^^^^^^^^^^ -Several store implementations have moved from the top-level module to ``zarr.storage``: - -.. code-block:: diff - :caption: Store import changes from v2 to v3 - - # Before (v2) - - from zarr import MemoryStore, DirectoryStore - + from zarr.storage import MemoryStore, LocalStore # LocalStore replaces DirectoryStore - -Common replacements: - -+-------------------------+------------------------------------+ -| v2 Import | v3 Import | -+=========================+====================================+ -| ``zarr.MemoryStore`` | ``zarr.storage.MemoryStore`` | -+-------------------------+------------------------------------+ -| ``zarr.DirectoryStore`` | ``zarr.storage.LocalStore`` | -+-------------------------+------------------------------------+ -| ``zarr.TempStore`` | Use ``tempfile.TemporaryDirectory``| -| | with ``LocalStore`` | -+-------------------------+------------------------------------+ - -1. 
Replaced the ``MutableMapping`` base class in favor of a custom abstract base class - (:class:`zarr.abc.store.Store`). -2. Switched to an asynchronous interface for all store methods that result in IO. This - change ensures that all store methods are non-blocking and are as performant as - possible. - -Beyond the changes store interface, a number of deprecated stores were also removed in -Zarr-Python 3. See :issue:`1274` for more details on the removal of these stores. - -- ``N5Store`` - see https://github.com/zarr-developers/n5py for an alternative interface to - N5 formatted data. -- ``ABSStore`` - use the :class:`zarr.storage.FsspecStore` instead along with fsspec's - `adlfs backend `_. - -The following stores have been removed altogether. Users who need these stores will have to -implement their own version in zarr-python v3. - -- ``DBMStore`` -- ``LMDBStore`` -- ``SQLiteStore`` -- ``MongoDBStore`` -- ``RedisStore`` - -At present, the latter five stores in this list do not have an equivalent in Zarr-Python 3. -If you are interested in developing a custom store that targets these backends, see -:ref:`developing custom stores ` or open an -`issue `_ to discuss your use case. - -Dependencies -~~~~~~~~~~~~ - -When installing using ``pip``: - -- The new ``remote`` dependency group can be used to install a supported version of - ``fsspec``, required for remote data access. -- The new ``gpu`` dependency group can be used to install a supported version of - ``cuda``, required for GPU functionality. -- The ``jupyter`` optional dependency group has been removed, since v3 contains no - jupyter specific functionality. - -Miscellaneous -~~~~~~~~~~~~~ - -- The keyword argument ``zarr_version`` available in most creation functions in :mod:`zarr` - (e.g. :func:`zarr.create`, :func:`zarr.open`, :func:`zarr.group`, :func:`zarr.array`) has - been deprecated in favor of ``zarr_format``. 
- -🚧 Work in Progress 🚧 ----------------------- - -Zarr-Python 3 is still under active development, and is not yet fully complete. -The following list summarizes areas of the codebase that we expect to build out -after the 3.0.0 release. If features listed below are important to your use case -of Zarr-Python, please open (or comment on) a -`GitHub issue `_. - -- The following functions / methods have not been ported to Zarr-Python 3 yet: - - * :func:`zarr.copy` (:issue:`2407`) - * :func:`zarr.copy_all` (:issue:`2407`) - * :func:`zarr.copy_store` (:issue:`2407`) - * :func:`zarr.Group.move` (:issue:`2108`) - -- The following features (corresponding to function arguments to functions in - :mod:`zarr`) have not been ported to Zarr-Python 3 yet. Using these features - will raise a warning or a ``NotImplementedError``: - - * ``cache_attrs`` - * ``cache_metadata`` - * ``chunk_store`` (:issue:`2495`) - * ``meta_array`` - * ``object_codec`` (:issue:`2617`) - * ``synchronizer`` (:issue:`1596`) - * ``dimension_separator`` - -- The following features that were supported by Zarr-Python 2 have not been ported - to Zarr-Python 3 yet: - - * Structured arrays / dtypes (:issue:`2134`) - * Fixed-length string dtypes (:issue:`2347`) - * Datetime and timedelta dtypes (:issue:`2616`) - * Object dtypes (:issue:`2617`) - * Ragged arrays (:issue:`2618`) - * Groups and Arrays do not implement ``__enter__`` and ``__exit__`` protocols (:issue:`2619`) - * Big Endian dtypes (:issue:`2324`) - * Default filters for object dtypes for Zarr format 2 arrays (:issue:`2627`) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000000..4ae5155ce9 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,154 @@ +# Based on https://github.com/developmentseed/obspec/blob/main/mkdocs.yml +site_name: zarr-python +repo_name: zarr-developers/zarr-python +repo_url: https://github.com/zarr-developers/zarr-python +site_description: An implementation of chunked, compressed, N-dimensional arrays for Python. 
+site_author: Alistair Miles +site_url: https://zarr.readthedocs.io/ +docs_dir: docs + +extra: + version: + alias: true + provider: mike + +nav: + - "index.md" + - User Guide: + - user-guide/installation.md + - user-guide/arrays.md + - user-guide/groups.md + - user-guide/attributes.md + - user-guide/storage.md + - user-guide/config.md + - user-guide/v3_migration.md + - user-guide/performance.md + - user-guide/extending.md + - user-guide/gpu.md + - user-guide/consolidated_metadata.md + - API Reference: + - api/array.md + - api/group.md + - api/create.md + - api/open.md + - api/load.md + - api/save.md + - api/convenience.md + - api/config.md + - api/codecs.md + - api/errors.md + - api/registry.md + - api/storage.md + - api/testing.md + - API: + - api/api_async.md + - api/api_sync.md + - ABC: + - api/abc/codec.md + - api/abc/metadata.md + - api/abc/store.md + - deprecated: + - api/deprecated/convenience.md + - api/deprecated/creation.md + - release-notes.md + - contributing.md +watch: + - src/zarr + - docs + +theme: + language: en + name: material + custom_dir: docs/overrides + logo: _static/logo_horizontal.svg + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + primary: blue grey + accent: pink + toggle: + icon: material/brightness-7 + name: Switch to dark mode + + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: grey + accent: pink + toggle: + icon: material/brightness-4 + name: Switch to system preference + + font: + text: Roboto + code: Roboto Mono + + features: + - content.code.annotate + - content.code.copy + - navigation.indexes + - navigation.instant + - navigation.tracking + - search.suggest + - search.share + +extra_css: + - overrides/stylesheets/extra.css + +plugins: + - search + - mkdocstrings: + enable_inventory: true + 
handlers: + python: + paths: [src/zarr] + options: + # allow_inspection: false keeps mkdocstrings on static analysis of the + # sources; it never falls back to importing zarr to inspect objects. + allow_inspection: false + docstring_section_style: list + docstring_style: numpy + line_length: 60 + separate_signature: true + show_root_heading: true + show_signature_annotations: true + show_source: false + show_symbol_type_toc: true + signature_crossrefs: true + + inventories: + - https://docs.python.org/3/objects.inv + +# https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 +markdown_extensions: + - admonition + - attr_list + - codehilite: + guess_lang: false + - def_list + - footnotes + - md_in_html + - pymdownx.arithmatex + - pymdownx.betterem + - pymdownx.caret: + insert: false + - pymdownx.details + - pymdownx.escapeall: + hardbreak: true + nbsp: true + - pymdownx.magiclink: + hide_protocol: true + repo_url_shortener: true + - pymdownx.smartsymbols + - pymdownx.superfences + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.tilde + - toc: + permalink: true diff --git a/pyproject.toml b/pyproject.toml index 1f270b435f..256a84c03d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,15 +90,12 @@ remote_tests = [ optional = ["rich", "universal-pathlib"] docs = [ # Doc building - 'sphinx==8.1.3', - 'sphinx-autobuild>=2021.3.14', - 'sphinx-autoapi==3.4.0', - 'sphinx_design', - 'sphinx-issues', - 'sphinx-copybutton', - 'sphinx-reredirects', - 'pydata-sphinx-theme', - 'numpydoc', + "mkdocs-material[imaging]>=9.6.14", + "mkdocs>=1.6.1", + "mkdocstrings>=0.29.1", + "mkdocstrings-python>=1.16.10", + "mike>=2.1.3", + "markdown-exec[ansi]", # Changelog generation 'towncrier', # Optional dependencies to run examples @@ -200,8 +197,9 @@ list-env = "pip list" features = ['docs'] [tool.hatch.envs.docs.scripts] -build = "cd docs && make html" -serve = "sphinx-autobuild docs docs/_build --host 0.0.0.0" +serve = "mkdocs 
serve" +build = "mkdocs build" +readthedocs = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r site $READTHEDOCS_OUTPUT/html" [tool.hatch.envs.upstream] python = "3.13" @@ -443,9 +441,9 @@ checks = [ [tool.towncrier] directory = 'changes' -filename = "docs/release-notes.rst" -underlines = ["-", "~", "^"] -issue_format = ":issue:`{issue}`" +filename = "docs/release-notes.md" +underlines = ["", "", ""] +issue_format = "[#{issue}](https://github.com/zarr-developers/zarr-python/issues/{issue})" [tool.codespell] ignore-words-list = "astroid" From 2d8650bfc7e21c7c2d6981a9499e6707935c0c55 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:26:48 -0400 Subject: [PATCH 02/13] Update towncrier instructions --- docs/contributing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing.md b/docs/contributing.md index b53bffe8a0..8342fd25b2 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -233,7 +233,7 @@ If an existing Zarr format version changes, or a new version of the Zarr format #### Pre-release 1. Make sure that all pull requests which will be included in the release have been properly documented as changelog files in `changes`. -2. Run `towncrier build --version x.y.z` to create the changelog. +2. Run `hatch env run --env docs -- towncrier build --version x.y.z` to create the changelog. 
#### Releasing From 1eb4292a4a0ba526b8b39cfaba1f2a677ef097f9 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:39:26 -0400 Subject: [PATCH 03/13] Update readthedocs config --- .readthedocs.yaml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 05239456a0..76e6b30d1e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,12 +7,19 @@ build: jobs: pre_build: - | - if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; - then - towncrier build --version Unreleased --yes; + if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; then + hatch env run --env docs -- towncrier build --version Unreleased --yes fi - commands: - - mamba install -c conda-forge -c nodefaults hatch - - hatch env run --env docs build - - hatch env run --env docs readthedocs + post_build: + - hatch env run --env docs build + - hatch env run --env docs readthedocs + +conda: + channels: + - conda-forge + dependencies: + - hatch + +mkdocs: + configuration: mkdocs.yml From 631b180586bdd38d12eccee431d039181a765f89 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:43:31 -0400 Subject: [PATCH 04/13] Remove conda section --- .readthedocs.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 76e6b30d1e..f660be9ba2 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -14,12 +14,5 @@ build: - hatch env run --env docs build - hatch env run --env docs readthedocs - -conda: - channels: - - conda-forge - dependencies: - - hatch - mkdocs: configuration: mkdocs.yml From d9f16882c01802b68751b58223312d351900aa97 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:48:15 -0400 Subject: [PATCH 05/13] Install hatch for rtd --- .readthedocs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.readthedocs.yaml 
b/.readthedocs.yaml index f660be9ba2..4ec49ba700 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,6 +7,7 @@ build: jobs: pre_build: - | + python -m pip install hatch if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; then hatch env run --env docs -- towncrier build --version Unreleased --yes fi From 97c6b178035969a26b0d83391d165aaefe091757 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:53:43 -0400 Subject: [PATCH 06/13] Move build step --- .readthedocs.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 4ec49ba700..48f71ca006 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -11,8 +11,9 @@ build: if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; then hatch env run --env docs -- towncrier build --version Unreleased --yes fi - post_build: + build: - hatch env run --env docs build + post_build: - hatch env run --env docs readthedocs mkdocs: From 81f7ba8151114e4cfb04ea2adbf02f83b44f44e3 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 15:58:15 -0400 Subject: [PATCH 07/13] Start a shell with hatch --- .readthedocs.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 48f71ca006..990ca9d449 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,13 +8,11 @@ build: pre_build: - | python -m pip install hatch - if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; then - hatch env run --env docs -- towncrier build --version Unreleased --yes + hatch shell docs + if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; + then + towncrier build --version Unreleased --yes; fi - build: - - hatch env run --env docs build - post_build: - - hatch env run --env docs readthedocs mkdocs: configuration: mkdocs.yml From 1f18a56043392bdd100e052da973abfa758184ce Mon Sep 17 00:00:00 2001 From: Max Jones 
<14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:08:59 -0400 Subject: [PATCH 08/13] Use commands --- .readthedocs.yaml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 990ca9d449..ae609495d5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,15 +4,11 @@ build: os: ubuntu-22.04 tools: python: "3.12" - jobs: - pre_build: - - | - python -m pip install hatch - hatch shell docs - if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; - then - towncrier build --version Unreleased --yes; - fi - -mkdocs: - configuration: mkdocs.yml + commands: + - | + python -m pip install hatch + if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; then + hatch env run --env docs -- towncrier build --version Unreleased --yes + fi + hatch env run --env docs build + hatch env run --env docs readthedocs From d65a2009ce415312517dc0d79c157e6245feb810 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:14:43 -0400 Subject: [PATCH 09/13] Don't use hatch --- .readthedocs.yaml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ae609495d5..96bc537bd3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,11 +4,20 @@ build: os: ubuntu-22.04 tools: python: "3.12" - commands: - - | - python -m pip install hatch - if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; then - hatch env run --env docs -- towncrier build --version Unreleased --yes - fi - hatch env run --env docs build - hatch env run --env docs readthedocs + jobs: + pre_build: + - | + if [ "$READTHEDOCS_VERSION_TYPE" != "tag" ]; + then + towncrier build --version Unreleased --yes; + fi + +mkdocs: + configuration: mkdocs.yml + +python: + install: + - method: pip + path: . 
+ extra_requirements: + - docs From a6ed57e821a6325b0c0294bb893a911314659bbf Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:18:40 -0400 Subject: [PATCH 10/13] Figure out redirects later --- mkdocs.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 1ca6f2cc95..4ae5155ce9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -103,20 +103,6 @@ extra_css: plugins: - search - - redirects: - redirect_maps: - spec: https://zarr-specs.readthedocs.io - spec/v1: https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html - spec/v2: https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html - spec/v3: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html - license: https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt - tutorial: user-guide - getting-started: quickstart - roadmap: developers/roadmap.html - installation: user-guide/installation.html - api: api/zarr/index - release: release-notes.html - release-notes: release-notes.html - mkdocstrings: enable_inventory: true handlers: From 4e92c3fdaceedd1173561d7800620a1b8baffa98 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:22:28 -0400 Subject: [PATCH 11/13] Format docstrings with ruff --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c037f1a99b..85d3240751 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,13 +98,14 @@ docs = [ "mike>=2.1.3", "mkdocs-redirects>=1.2.0", "markdown-exec[ansi]", + "ruff", # Changelog generation 'towncrier', # Optional dependencies to run examples 'numcodecs[msgpack]', 'rich', 's3fs>=2023.10.0', - 'astroid<4' + 'astroid<4', ] From b5aef2740b36a05f5d590180c8e732dce82aeb4f Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:22:38 -0400 Subject: [PATCH 12/13] git ignore docs 
output --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index cfc455b715..f2f41270ca 100644 --- a/.gitignore +++ b/.gitignore @@ -49,7 +49,8 @@ coverage.xml # Django stuff: *.log -# Sphinx documentation +# Documentation +site/ docs/_build/ docs/data data From 0e9238553cf43ad3fbd3cb08c2d01f730bdb3ac1 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:29:53 -0400 Subject: [PATCH 13/13] Specify towncrier start --- docs/release-notes.md | 10 ++++++---- pyproject.toml | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 0f576b4f4e..5c12704237 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,11 +1,13 @@ # Release notes + +<!-- towncrier release notes start --> + ## 3.0.8 (2025-05-19) -> **Warning** -> In versions 3.0.0 to 3.0.7 opening arrays or groups with `mode='a'` (the default for many builtin functions) -> would cause any existing paths in the store to be deleted. This is fixed in 3.0.8, and -> we recommend all users upgrade to avoid this bug that could cause unintentional data loss. +!!! warning + + In versions 3.0.0 to 3.0.7 opening arrays or groups with `mode='a'` (the default for many builtin functions) would cause any existing paths in the store to be deleted. This is fixed in 3.0.8, and we recommend all users upgrade to avoid this bug that could cause unintentional data loss. ### Features diff --git a/pyproject.toml b/pyproject.toml index 85d3240751..bd0f02cdb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -446,6 +446,7 @@ directory = 'changes' filename = "docs/release-notes.md" underlines = ["", "", ""] issue_format = "[#{issue}](https://github.com/zarr-developers/zarr-python/issues/{issue})" +start_string = "<!-- towncrier release notes start -->\n" [tool.codespell] ignore-words-list = "astroid"