diff --git a/Makefile b/Makefile
index 46f0202db..cc9b90255 100644
--- a/Makefile
+++ b/Makefile
@@ -337,7 +337,7 @@ examples-pip-install:
cd examples && python setup.py install
examples-test: examples-pip-install
- cd examples && pytest --no-success-flaky-report --benchmark-disable -n auto --durations=5 . --cov=compiler_gym --cov-report=xml:$(COV_REPORT) $(PYTEST_ARGS)
+ cd examples && pytest --nbmake --no-success-flaky-report --benchmark-disable -n auto --durations=5 . --cov=compiler_gym --cov-report=xml:$(COV_REPORT) $(PYTEST_ARGS)
# Note we export $CI=1 so that the tests always run as if within the CI
# environment. This is to ensure that the reported coverage matches that of
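
The `--nbmake` flag comes from the nbmake pytest plugin, pinned in tests/requirements.txt below; it makes pytest collect the example notebooks themselves and execute them as test cases. A minimal sketch of the same run outside of Make (the --nbmake-timeout value is an assumption here, mirroring the 900-second execution timeout added to the notebooks below):

    import pytest

    # Collect every notebook under examples/ and execute it as a test case.
    # --nbmake-timeout caps per-notebook cell execution time, in seconds.
    raise SystemExit(pytest.main(["--nbmake", "--nbmake-timeout=900", "examples/"]))
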
diff --git a/examples/getting-started.ipynb b/examples/getting-started.ipynb
index ae2920f05..8864adce7 100644
--- a/examples/getting-started.ipynb
+++ b/examples/getting-started.ipynb
@@ -27,6 +27,10 @@
"id": "AidRbcu8Pwxh"
},
"source": [
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
"# CompilerGym Getting Started\n",
"\n",
"CompilerGym is a toolkit for applying reinforcement learning to compiler optimization tasks. This document provides a short walkthrough of the key concepts, using the codesize reduction task of a production-grade compiler as an example. It will take about 20 minutes to work through. Lets get started!"
@@ -535,10 +539,13 @@
"name": "CompilerGym Getting Started.ipynb",
"provenance": []
},
+ "execution": {
+ "timeout": 900
+ },
"kernelspec": {
- "display_name": "Python (compiler_gym)",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
- "name": "compiler_gym"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -550,7 +557,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.8"
+ "version": "3.8.11"
}
},
"nbformat": 4,
diff --git a/examples/requirements.txt b/examples/requirements.txt
index 820ca2a4e..70543e830 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -3,12 +3,11 @@ dgl==0.6.1
geneticalgorithm>=1.0.2
hydra-core==1.1.0
keras==2.6.0
-matplotlib>=3.3.0
+matplotlib>=3.5.0
nevergrad>=0.4.3
-numpy~=1.19.2 # Pin version for tensorflow.
opentuner>=0.8.5
pandas>=1.1.5
-ray[default,rllib]==1.8.0
+ray[default,rllib]==1.9.0
submitit>=1.2.0
submitit>=1.2.0
tensorflow==2.6.1
diff --git a/examples/rllib.ipynb b/examples/rllib.ipynb
index 6a24977f8..b512b78f8 100644
--- a/examples/rllib.ipynb
+++ b/examples/rllib.ipynb
@@ -22,6 +22,10 @@
"id": "gsrdt9HooN9K"
},
"source": [
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
"# Using CompilerGym environments with RLlib\n",
"\n",
"In this notebook we will use [RLlib](https://docs.ray.io/en/master/rllib.html) to train an agent for CompilerGym's [LLVM environment](https://facebookresearch.github.io/CompilerGym/llvm/index.html). RLlib is a popular library for scalable reinforcement learning, built on [Ray](https://docs.ray.io/en/master/index.html). It provides distributed implementations of several standard reinforcement learning algorithms.\n",
@@ -178,17 +182,17 @@
"from itertools import islice\n",
"\n",
"with make_env() as env:\n",
- " # The two datasets we will be using:\n",
- " npb = env.datasets[\"npb-v0\"]\n",
- " chstone = env.datasets[\"chstone-v0\"]\n",
+ " # The two datasets we will be using:\n",
+ " npb = env.datasets[\"npb-v0\"]\n",
+ " chstone = env.datasets[\"chstone-v0\"]\n",
"\n",
- " # Each dataset has a `benchmarks()` method that returns an iterator over the\n",
- " # benchmarks within the dataset. Here we will use iterator sliceing to grab a \n",
- " # handful of benchmarks for training and validation.\n",
- " train_benchmarks = list(islice(npb.benchmarks(), 55))\n",
- " train_benchmarks, val_benchmarks = train_benchmarks[:50], train_benchmarks[50:]\n",
- " # We will use the entire chstone-v0 dataset for testing.\n",
- " test_benchmarks = list(chstone.benchmarks())\n",
+ " # Each dataset has a `benchmarks()` method that returns an iterator over the\n",
+ " # benchmarks within the dataset. Here we will use iterator sliceing to grab a \n",
+ " # handful of benchmarks for training and validation.\n",
+ " train_benchmarks = list(islice(npb.benchmarks(), 55))\n",
+ " train_benchmarks, val_benchmarks = train_benchmarks[:50], train_benchmarks[50:]\n",
+ " # We will use the entire chstone-v0 dataset for testing.\n",
+ " test_benchmarks = list(chstone.benchmarks())\n",
"\n",
"print(\"Number of benchmarks for training:\", len(train_benchmarks))\n",
"print(\"Number of benchmarks for validation:\", len(val_benchmarks))\n",
@@ -217,11 +221,11 @@
"from compiler_gym.wrappers import CycleOverBenchmarks\n",
"\n",
"def make_training_env(*args) -> compiler_gym.envs.CompilerEnv:\n",
- " \"\"\"Make a reinforcement learning environment that cycles over the\n",
- " set of training benchmarks in use.\n",
- " \"\"\"\n",
- " del args # Unused env_config argument passed by ray\n",
- " return CycleOverBenchmarks(make_env(), train_benchmarks)\n",
+ " \"\"\"Make a reinforcement learning environment that cycles over the\n",
+ " set of training benchmarks in use.\n",
+ " \"\"\"\n",
+ " del args # Unused env_config argument passed by ray\n",
+ " return CycleOverBenchmarks(make_env(), train_benchmarks)\n",
"\n",
"tune.register_env(\"compiler_gym\", make_training_env)"
]
@@ -241,12 +245,12 @@
"# Lets cycle through a few calls to reset() to demonstrate that this environment\n",
"# selects a new benchmark for each episode.\n",
"with make_training_env() as env:\n",
- " env.reset()\n",
- " print(env.benchmark)\n",
- " env.reset()\n",
- " print(env.benchmark)\n",
- " env.reset()\n",
- " print(env.benchmark)"
+ " env.reset()\n",
+ " print(env.benchmark)\n",
+ " env.reset()\n",
+ " print(env.benchmark)\n",
+ " env.reset()\n",
+ " print(env.benchmark)"
]
},
{
@@ -278,7 +282,7 @@
"\n",
"# (Re)Start the ray runtime.\n",
"if ray.is_initialized():\n",
- " ray.shutdown()\n",
+ " ray.shutdown()\n",
"ray.init(include_dashboard=False, ignore_reinit_error=True)\n",
"\n",
"tune.register_env(\"compiler_gym\", make_training_env)\n",
@@ -366,18 +370,18 @@
"# performance on a set of benchmarks.\n",
"\n",
"def run_agent_on_benchmarks(benchmarks):\n",
- " \"\"\"Run agent on a list of benchmarks and return a list of cumulative rewards.\"\"\"\n",
- " with make_env() as env:\n",
+ " \"\"\"Run agent on a list of benchmarks and return a list of cumulative rewards.\"\"\"\n",
" rewards = []\n",
- " for i, benchmark in enumerate(benchmarks, start=1):\n",
- " observation, done = env.reset(benchmark=benchmark), False\n",
- " while not done:\n",
- " action = agent.compute_action(observation)\n",
- " observation, _, done, _ = env.step(action)\n",
- " rewards.append(env.episode_reward)\n",
- " print(f\"[{i}/{len(benchmarks)}] {env.state}\")\n",
+ " with make_env() as env:\n",
+ " for i, benchmark in enumerate(benchmarks, start=1):\n",
+ " observation, done = env.reset(benchmark=benchmark), False\n",
+ " while not done:\n",
+ " action = agent.compute_action(observation)\n",
+ " observation, _, done, _ = env.step(action)\n",
+ " rewards.append(env.episode_reward)\n",
+ " print(f\"[{i}/{len(benchmarks)}] {env.state}\")\n",
"\n",
- " return rewards\n",
+ " return rewards\n",
"\n",
"# Evaluate agent performance on the validation set.\n",
"val_rewards = run_agent_on_benchmarks(val_benchmarks)"
@@ -413,14 +417,15 @@
"outputs": [],
"source": [
"# Finally lets plot our results to see how we did!\n",
+ "%matplotlib inline\n",
"from matplotlib import pyplot as plt\n",
"\n",
"def plot_results(x, y, name, ax):\n",
- " plt.sca(ax)\n",
- " plt.bar(range(len(y)), y)\n",
- " plt.ylabel(\"Reward (higher is better)\")\n",
- " plt.xticks(range(len(x)), x, rotation = 90)\n",
- " plt.title(f\"Performance on {name} set\")\n",
+ " plt.sca(ax)\n",
+ " plt.bar(range(len(y)), y)\n",
+ " plt.ylabel(\"Reward (higher is better)\")\n",
+ " plt.xticks(range(len(x)), x, rotation = 90)\n",
+ " plt.title(f\"Performance on {name} set\")\n",
"\n",
"fig, (ax1, ax2) = plt.subplots(1, 2)\n",
"fig.set_size_inches(13, 3)\n",
@@ -446,14 +451,27 @@
"provenance": [],
"toc_visible": true
},
+ "execution": {
+ "timeout": 900
+ },
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
"name": "python3"
},
"language_info": {
- "name": "python"
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.11"
}
},
"nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
}
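
The notebook body changes are mostly re-indentation, plus an explicit `%matplotlib inline` so the plotting cell renders reliably under a non-interactive executor. For reference, roughly what nbmake does with the metadata above, sketched directly against nbclient (kernel name and timeout taken from this diff):

    import nbformat
    from nbclient import NotebookClient

    # Execute the notebook top to bottom using the python3 kernel and the
    # 900-second timeout named in its metadata.
    nb = nbformat.read("examples/rllib.ipynb", as_version=4)
    NotebookClient(nb, timeout=900, kernel_name="python3").execute()
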
diff --git a/tests/requirements.txt b/tests/requirements.txt
index a93d91ec0..ec1de6175 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,4 +1,5 @@
flaky==3.7.0
+nbmake==0.10
psutil==5.8.0 # Implicit dependency of pytest-xdist
pytest==6.2.5
pytest-benchmark==3.4.1