Skip to content

Notebook -> Docs: Replace output with file content #772

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/modules/ROOT/pages/tutorials/fastrp-and-knn.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,10 @@ print(f"Nodes compared: {result['nodesCompared']}")
print(f"Mean similarity: {result['similarityDistribution']['mean']}")
----

----
<_io.TextIOWrapper name='examples/test.txt' mode='r' encoding='UTF-8'>
----

As we can see, the mean similarity between nodes is quite high. This is
because we have a small example with no long paths between nodes, which
leads to many similar FastRP node embeddings.
Expand Down
6 changes: 5 additions & 1 deletion examples/fastrp-and-knn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,11 @@
"cell_type": "code",
"execution_count": null,
"id": "9b132f95",
"metadata": {},
"metadata": {
"tags": [
"replace-output-with:test.txt"
]
},
"outputs": [],
"source": [
"# Run kNN and write back to db (we skip memory estimation this time...)\n",
Expand Down
1 change: 1 addition & 0 deletions examples/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ABCDE
29 changes: 3 additions & 26 deletions scripts/nb2doc/convert.sh
Original file line number Diff line number Diff line change
@@ -1,28 +1,5 @@
#!/bin/bash
# Converts the example notebooks into AsciiDoc tutorial pages.
# NOTE(review): this span looks like a rendered diff hunk that shows the old
# nbconvert loop together with the new Python call -- confirm which of the two
# is meant to remain before running it as-is.

# Target directory for the generated .adoc files and source directory of the notebooks.
DOC_DIR=doc/modules/ROOT/pages/tutorials
NB_DIR=examples

# Convert every notebook found directly under NB_DIR, one nbconvert call each.
for notebook in ${NB_DIR}/*.ipynb
do
# File name up to its first dot; only used for the progress message below.
docfile=$(basename ${notebook} | cut -d. -f1)
echo "${notebook} -> ${DOC_DIR}/${docfile}.adoc"

# --no-prompt
#   Skips the "In/Out" lines before each cell
# --ClearMetadataPreprocessor.enabled=True
#   Cleans the "ipython3" language replacing it with "Python"
#   (for Asciidoc code cells)
# --ASCIIDocExporter.file_extension=.adoc
#   If not set, the extension is .asciidoc

jupyter nbconvert \
--to asciidoc \
--template=scripts/nb2doc/asciidoc-template \
--output-dir ${DOC_DIR} \
--ASCIIDocExporter.file_extension=.adoc \
--no-prompt \
--ClearMetadataPreprocessor.enabled=True \
${notebook}
done

# Run the Python converter with the same output (-o) and input (-i) directories.
python ./scripts/nb2doc/convert_notebooks.py \
-o "doc/modules/ROOT/pages/tutorials" \
-i "examples/"
104 changes: 104 additions & 0 deletions scripts/nb2doc/convert_notebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# reasons for not using nbconvert cli tool:
# * cannot keep output based on a given tag

import argparse
import logging.config
import re
import sys
from pathlib import Path

import nbconvert
from nbconvert.preprocessors import Preprocessor

# Cell tag that requests an output replacement; group 1 captures the file name.
REPLACE_CELL_OUTPUT_TAG_PATTTERN = r"replace-output-with\:(.*)"
# Key under which a cell's metadata stores its list of tags.
METADATA_TAG_KEY = "tags"

# Directory holding the AsciiDoc template, relative to the working directory.
TEMPLATE_DIR = Path("scripts/nb2doc/asciidoc-template")

# Send log records to stdout. The handler has to be attached to the root
# logger and the logger level lowered to INFO, otherwise the progress messages
# emitted with `logger.info` are silently dropped (the default level is WARNING).
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
# Same record layout that `logging.basicConfig()` would install.
handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)


class OutputReplacerPreprocessor(Preprocessor):
    """
    Replaces the outputs of tagged code cells with the content of a file.

    Expected tag format: `replace-output-with:<file>`, for example
    `replace-output-with:test.txt`. `<file>` is resolved relative to the
    `replace_base_dir` given to the constructor.
    """

    def __init__(self, replace_base_dir: Path, **kw):
        """
        :param replace_base_dir: directory that the file names in tags are relative to
        :param kw: passed through unchanged to `Preprocessor`
        """
        self._replace_base_dir = replace_base_dir
        super().__init__(**kw)

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Replace the outputs of `cell` when it carries a replace tag.

        Cells without a matching tag are returned untouched.

        :param cell: notebook cell being processed (modified in place)
        :param resources: conversion resources, returned unchanged
        :param cell_index: index of the cell in the notebook (unused)
        :return: the `(cell, resources)` pair
        :raises ValueError: if the cell has more than one replace tag, or a
            replace tag without a file name
        """
        replace_tags = [
            tag for tag in cell["metadata"].get(METADATA_TAG_KEY, []) if re.match(REPLACE_CELL_OUTPUT_TAG_PATTTERN, tag)
        ]
        if not replace_tags:
            return cell, resources

        if len(replace_tags) > 1:
            raise ValueError(
                f"Expected one or zero tags matching `{REPLACE_CELL_OUTPUT_TAG_PATTTERN}`. But got `{replace_tags}`"
            )

        # Everything after the first colon is the file name, so a name that
        # itself contains a colon is kept intact.
        new_output_file_name = replace_tags[0].partition(":")[2].strip()
        if not new_output_file_name:
            raise ValueError(f"Tag `{replace_tags[0]}` does not name a file to take the output from")

        new_output_file = self._replace_base_dir.joinpath(new_output_file_name)
        logger.info(f"Replace output with content from: {new_output_file}")

        # Read the text of the file; calling str() on an open file handle would
        # only embed the handle's repr in the generated document.
        new_output = new_output_file.read_text(encoding="utf-8")

        # TODO: confirm the output schema against
        # https://nbformat.readthedocs.io/en/latest/format_description.html#display-data
        cell.outputs = [
            {
                "output_type": "display_data",
                "data": {"text/plain": new_output},
                "metadata": {},
            }
        ]
        cell.execution_count = None
        return cell, resources


def to_output_file(input_file: Path, output_dir: Path) -> Path:
    """
    Return the path of the AsciiDoc file generated for a notebook.

    Only the final suffix of the file name is exchanged for `.adoc`; an
    `.ipynb` that happens to occur earlier in the name is left alone.

    :param input_file: path of the notebook being converted
    :param output_dir: directory the converted file is written to
    :return: `output_dir` joined with the notebook's name, ending in `.adoc`
    """
    return output_dir.joinpath(input_file.with_suffix(".adoc").name)


def main(input_path: Path, output_dir: Path) -> None:
    """
    Convert one notebook, or every notebook directly inside a directory, to AsciiDoc.

    :param input_path: a single notebook file, or a directory that is scanned
        (non-recursively) for `*.ipynb` files
    :param output_dir: directory that receives one converted file per notebook;
        it is created if it does not exist yet
    """
    if input_path.is_file():
        notebooks = [input_path]
        # Files referenced by replace tags are looked up next to the notebook,
        # not "inside" the notebook file itself.
        replace_base_dir = input_path.parent
    else:
        # Sorted so that the conversion order (and the log) is reproducible.
        notebooks = sorted(f for f in input_path.iterdir() if f.is_file() and f.suffix == ".ipynb")
        replace_base_dir = input_path

    exporter = nbconvert.ASCIIDocExporter(template_file=str(TEMPLATE_DIR.joinpath("index.adoc.j2")))
    # Skips the "In/Out" lines before each cell
    exporter.exclude_input_prompt = True
    exporter.exclude_output_prompt = True

    # Keep the cell tags while clearing the other metadata: the output replacer
    # registered below reads them.
    # NOTE(review): the mask is passed as a bare string here -- confirm that
    # nbconvert accepts it in this form.
    metadata_cleaner = nbconvert.preprocessors.ClearMetadataPreprocessor(preserve_cell_metadata_mask=METADATA_TAG_KEY)
    output_replacer = OutputReplacerPreprocessor(replace_base_dir=replace_base_dir)

    exporter.register_preprocessor(metadata_cleaner, enabled=True)
    exporter.register_preprocessor(output_replacer, enabled=True)

    logger.info(f"Converting {len(notebooks)} notebooks.")

    output_dir.mkdir(parents=True, exist_ok=True)

    for notebook in notebooks:
        output_file = to_output_file(notebook, output_dir)
        logger.info(f"Converting notebook from `{notebook}` to: `{output_file}`")

        # `from_filename` returns a `(body, resources)` pair; only the body is written.
        converted, _ = exporter.from_filename(str(notebook))
        output_file.write_text(converted, encoding="utf-8")


if __name__ == "__main__":
    # Command-line entry point: both options are required.
    parser = argparse.ArgumentParser(description="Convert Jupyter notebooks to AsciiDoc pages.")
    parser.add_argument("-o", "--output", required=True, help="directory to write the result to")
    # `main` accepts a directory as well as a single file, so the help text says so.
    parser.add_argument(
        "-i",
        "--input",
        required=True,
        help="path to a notebook file, or to a directory containing notebooks",
    )

    args = parser.parse_args()

    main(Path(args.input), Path(args.output))