Skip to content

Fix BPE bonus materials #561

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/basic-tests-linux-uv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,9 @@ jobs:
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb

- name: Test Selected Bonus Materials
shell: bash
run: |
source .venv/bin/activate
pytest ch02/05_bpe-from-scratch/tests/tests.py
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# Configs and keys
ch05/07_gpt_to_llama/config.json
ch07/02_dataset-utilities/config.json
Expand Down Expand Up @@ -63,13 +64,16 @@ ch07/01_main-chapter-code/Smalltestmodel-sft-standalone.pth
ch07/01_main-chapter-code/gpt2/

# Datasets
the-verdict.txt

appendix-E/01_main-chapter-code/sms_spam_collection.zip
appendix-E/01_main-chapter-code/sms_spam_collection
appendix-E/01_main-chapter-code/train.csv
appendix-E/01_main-chapter-code/test.csv
appendix-E/01_main-chapter-code/validation.csv

ch02/01_main-chapter-code/number-data.txt
ch02/05_bpe-from-scratch/the-verdict.txt

ch05/03_bonus_pretraining_on_gutenberg/gutenberg
ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
Expand Down Expand Up @@ -107,7 +111,9 @@ ch02/05_bpe-from-scratch/bpe_merges.txt
ch02/05_bpe-from-scratch/encoder.json
ch02/05_bpe-from-scratch/vocab.bpe
ch02/05_bpe-from-scratch/vocab.json

encoder.json
vocab.bpe
vocab.json

# Other
ch0?/0?_user_interface/.chainlit/
Expand Down
36 changes: 22 additions & 14 deletions ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"tiktoken version: 0.7.0\n"
"tiktoken version: 0.9.0\n"
]
}
],
Expand Down Expand Up @@ -180,8 +180,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Fetching encoder.json: 1.04Mit [00:00, 4.13Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.56Mit/s] \n"
"Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s] \n",
"Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s] \n"
]
}
],
Expand Down Expand Up @@ -256,10 +256,18 @@
"id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sebastian/Developer/LLMs-from-scratch/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"data": {
"text/plain": [
"'4.48.0'"
"'4.49.0'"
]
},
"execution_count": 12,
Expand Down Expand Up @@ -423,7 +431,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
]
}
],
Expand Down Expand Up @@ -451,7 +459,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
"with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
" raw_text = f.read()"
]
},
Expand All @@ -473,7 +481,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3.39 ms ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -499,7 +507,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.08 ms ± 5.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
Expand Down Expand Up @@ -532,7 +540,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"10.2 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -550,7 +558,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"10 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -575,7 +583,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3.79 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -593,7 +601,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"3.83 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
"3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -619,7 +627,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.59 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
"9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
Expand All @@ -644,7 +652,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.16"
}
},
"nbformat": 4,
Expand Down
Loading