rasbt · rasbt · Mar 8, 2025 · Mar 8, 2025 · Mar 8, 2025 · Mar 8, 2025
diff --git a/.github/workflows/basic-tests-linux-uv.yml b/.github/workflows/basic-tests-linux-uv.yml
@@ -60,3 +60,9 @@ jobs:
           pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
           pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
           pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
+
+      - name: Test Selected Bonus Materials
+        shell: bash
+        run: |
+          source .venv/bin/activate
+          pytest ch02/05_bpe-from-scratch/tests/tests.py
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+
 # Configs and keys
 ch05/07_gpt_to_llama/config.json
 ch07/02_dataset-utilities/config.json
@@ -63,13 +64,16 @@ ch07/01_main-chapter-code/Smalltestmodel-sft-standalone.pth
 ch07/01_main-chapter-code/gpt2/
 
 # Datasets
+the-verdict.txt
+
 appendix-E/01_main-chapter-code/sms_spam_collection.zip
 appendix-E/01_main-chapter-code/sms_spam_collection
 appendix-E/01_main-chapter-code/train.csv
 appendix-E/01_main-chapter-code/test.csv
 appendix-E/01_main-chapter-code/validation.csv
 
 ch02/01_main-chapter-code/number-data.txt
+ch02/05_bpe-from-scratch/the-verdict.txt
 
 ch05/03_bonus_pretraining_on_gutenberg/gutenberg
 ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
@@ -107,7 +111,9 @@ ch02/05_bpe-from-scratch/bpe_merges.txt
 ch02/05_bpe-from-scratch/encoder.json
 ch02/05_bpe-from-scratch/vocab.bpe
 ch02/05_bpe-from-scratch/vocab.json
-
+encoder.json
+vocab.bpe
+vocab.json
 
 # Other
 ch0?/0?_user_interface/.chainlit/

diff --git a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
@@ -67,7 +67,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tiktoken version: 0.7.0\n"
+      "tiktoken version: 0.9.0\n"
      ]
     }
    ],
@@ -180,8 +180,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Fetching encoder.json: 1.04Mit [00:00, 4.13Mit/s]                                                   \n",
-      "Fetching vocab.bpe: 457kit [00:00, 2.56Mit/s]                                                       \n"
+      "Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s]                                                   \n",
+      "Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s]                                                       \n"
      ]
     }
    ],
@@ -256,10 +256,18 @@
    "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/sebastian/Developer/LLMs-from-scratch/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
-       "'4.48.0'"
+       "'4.49.0'"
       ]
      },
      "execution_count": 12,
@@ -423,7 +431,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
+      "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
      ]
     }
    ],
@@ -451,7 +459,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+    "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()"
    ]
   },
@@ -473,7 +481,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3.39 ms ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -499,7 +507,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1.08 ms ± 5.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+      "901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
      ]
     }
    ],
@@ -532,7 +540,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "10.2 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -550,7 +558,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "10 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -575,7 +583,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3.79 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -593,7 +601,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3.83 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -619,7 +627,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1.59 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+      "9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -644,7 +652,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.16"
   }
  },
  "nbformat": 4,