Define a simpler PenguinDataset in Exercise 1 notebook #73
Merged: 9 commits, Jul 9, 2024
176 changes: 162 additions & 14 deletions exercises/01_penguin_classification.ipynb
@@ -55,9 +55,13 @@
"source": [
"### Task 2: creating a ``torch.utils.data.Dataset``\n",
"\n",
"The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n",
"\n",
"- Why is a class representation helpful?\n",
"\n",
"All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n",
"\n",
"Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n",
"Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation).\n",
"\n",
"- Open the file ``src/ml_workshop/_penguins.py``.\n",
"- Let's examine, and discuss, each of the methods together.\n",
@@ -70,9 +74,118 @@
"- Review and discuss the class arguments.\n",
" - ``input_keys``— ...\n",
" - ``target_keys``— ...\n",
" - ``train``— ...\n",
" - ``x_tfms``— ...\n",
" - ``y_tfms``— ..."
" - ``train``— ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import List, Tuple, Any\n",
"\n",
"# Import some useful functions here; see https://pytorch.org/docs/stable/torch.html\n",
"# `tensor` and `eye` construct tensors, and the lower-precision\n",
"# float32 dtype is commonly used for performance.\n",
"# Task 4: add imports here\n",
"# from torch import tensor, eye, float32\n",
"\n",
"from torch.utils.data import Dataset\n",
"\n",
"from palmerpenguins import load_penguins\n",
"\n",
"\n",
"class PenguinDataset(Dataset):\n",
" \"\"\"Penguin dataset class.\n",
"\n",
" Parameters\n",
" ----------\n",
" input_keys : List[str]\n",
" The column titles to use in the input feature vectors.\n",
" target_keys : List[str]\n",
" The column titles to use in the target feature vectors.\n",
" train : bool\n",
" If ``True``, this object will serve as the training set, and if\n",
" ``False``, the validation set.\n",
"\n",
" Notes\n",
" -----\n",
" The validation split contains 10 male and 10 female penguins of each\n",
" species.\n",
"\n",
" \"\"\"\n",
"\n",
" def __init__(\n",
" self,\n",
" input_keys: List[str],\n",
" target_keys: List[str],\n",
" train: bool,\n",
" ):\n",
" \"\"\"Build ``PenguinDataset``.\"\"\"\n",
" self.input_keys = input_keys\n",
" self.target_keys = target_keys\n",
"\n",
" data = load_penguins()\n",
" data = (\n",
" data.loc[~data.isna().any(axis=1)]\n",
" .sort_values(by=sorted(data.keys()))\n",
" .reset_index(drop=True)\n",
" )\n",
" # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n",
" data.sex = (data.sex == \"male\").astype(float)\n",
" self.full_df = data\n",
"\n",
" valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n",
" n=10,\n",
" random_state=123,\n",
" )\n",
" # The training items are simply the items *not* in the valid split\n",
" train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n",
"\n",
" self.split = {\"train\": train_df, \"valid\": valid_df}[\n",
"            \"train\" if train else \"valid\"\n",
" ]\n",
"\n",
" def __len__(self) -> int:\n",
" \"\"\"Return the length of requested split.\n",
"\n",
" Returns\n",
" -------\n",
" int\n",
" The number of items in the dataset.\n",
"\n",
" \"\"\"\n",
" return len(self.split)\n",
"\n",
" def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n",
" \"\"\"Return an input-target pair.\n",
"\n",
" Parameters\n",
" ----------\n",
" idx : int\n",
" Index of the input-target pair to return.\n",
"\n",
" Returns\n",
" -------\n",
" in_feats : Any\n",
" Inputs.\n",
" target : Any\n",
" Targets.\n",
"\n",
" \"\"\"\n",
"        # Select row ``idx`` of the dataframe, keeping only the\n",
"        # relevant feature columns (provided as ``input_keys``)\n",
" feats = tuple(self.split.iloc[idx][self.input_keys])\n",
"\n",
" # this gives a 'species' i.e. one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n",
" tgts = tuple(self.split.iloc[idx][self.target_keys])\n",
"\n",
" # Task 4 - Exercise #1: convert the features to PyTorch Tensors\n",
"\n",
" # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n",
"\n",
" return feats, tgts"
]
},
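One way the two Task 4 exercises in ``__getitem__`` above might be completed — a sketch, not the workshop's reference solution; the species ordering and the helper name are assumptions:

```python
from torch import tensor, eye, float32

# Assumed alphabetical species ordering; check it against the real dataframe.
SPECIES = ("Adelie", "Chinstrap", "Gentoo")


def to_tensors(feats, tgts):
    """Convert a (features, targets) tuple pair into PyTorch tensors."""
    # Exercise #1: numeric features -> a float32 tensor.
    in_feats = tensor(feats, dtype=float32)
    # Exercise #2: the species string -> a one-hot vector, taken as the
    # matching row of the identity matrix.
    target = eye(len(SPECIES), dtype=float32)[SPECIES.index(tgts[0])]
    return in_feats, target


x, y = to_tensors((39.1, 3750.0), ("Adelie",))
print(y)  # tensor([1., 0., 0.])
```

The same two lines could be dropped into ``__getitem__`` in place of the two "Task 4" comments, with ``feats`` and ``tgts`` as already defined there.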
{
@@ -97,8 +210,6 @@
"metadata": {},
"outputs": [],
"source": [
"from ml_workshop import PenguinDataset\n",
"\n",
"data_set = PenguinDataset(\n",
" input_keys=[\"bill_length_mm\", \"body_mass_g\"],\n",
" target_keys=[\"species\"],\n",
@@ -117,7 +228,12 @@
"source": [
"- Can we give these items to a neural network, or do they need to be transformed first?\n",
" - Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n",
" - We must represent these data as ``torch.Tensor``s."
"    - We must represent these data as ``torch.Tensor``s. Tensors are PyTorch's fundamental data abstraction: the equivalent of NumPy arrays, with added support for GPU acceleration. See the [PyTorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html).\n",
"    - The targets are tuples of strings, e.g. ``('Gentoo',)``.\n",
"    - One idea is to represent the classes as ordinal values, i.e. [1], [2], or [3]. But this implies that the class encoded by 1 is closer to 2 than it is to 3, which is undesirable for categorical data. One-hot encoding avoids this by representing each species independently:\\\n",
" \"A\" — [1, 0, 0]\\\n",
" \"B\" — [0, 1, 0]\\\n",
" \"C\" — [0, 0, 1]"
]
},
{
@@ -126,14 +242,46 @@
"source": [
"### Task 4: Applying transforms to the data\n",
"\n",
"A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects (i.e., functions) and applies them to the incoming data.\n",
"Modify the `PenguinDataset` class above so that the tuples of numbers are converted to PyTorch `torch.Tensor`s and the string targets are converted to one-hot vectors.\n",
"\n",
"- Begin by importing the relevant PyTorch functions.\n",
"- Apply the transformations inside the `__getitem__()` method above.\n",
"\n",
"Then create a training and validation set.\n",
"\n",
" - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n",
" - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n",
" \n",
"For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n",
"\n",
"- Is this solution general?\n",
"\n",
"These transforms can be very useful for mapping between file paths and tensors of images, etc.\n",
"A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the ``src/ml_workshop/_penguins.py`` file.\n",
"\n",
"These transforms can be very useful for mapping between file paths and tensors of images, etc.\n"
]
},
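The ``Compose`` pattern described above can be sketched without torchvision at all: chain plain callables and apply them in order. The class below is a minimal stand-in for ``torchvision.transforms.Compose``, and the rescaling step is an invented example:

```python
from torch import tensor, float32


class Compose:
    """Minimal stand-in for ``torchvision.transforms.Compose``."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, data):
        # Apply each transform in turn to the incoming data.
        for transform in self.transforms:
            data = transform(data)
        return data


x_tfms = Compose(
    [
        lambda row: tensor(row, dtype=float32),  # tuple -> float32 tensor
        lambda t: t / 1000.0,  # invented rescaling step
    ]
)
print(x_tfms((39.1, 3750.0)))
```

Because each stage is just a callable, the same pipeline object can be passed around (e.g. as an ``x_tfms`` argument) and applied inside ``__getitem__``.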
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply transforms we need to PenguinDataset to convert input data and target class to tensors. \n",
"# See Task 4 exercise comments above.\n",
"\n",
"# Create train_set\n",
"\n",
"# Create valid_set\n"
]
},
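The train/validation split used by ``PenguinDataset`` — sample a fixed number of rows per (species, sex) group, then take the complement for training — can be seen in isolation on a small hypothetical frame (toy data, one row per group rather than ten):

```python
import pandas as pd

toy = pd.DataFrame(
    {
        "species": ["A", "A", "A", "A", "B", "B", "B", "B"],
        "sex": ["male", "female", "male", "female"] * 2,
        "mass": range(8),
    }
)

# One row per (species, sex) group goes to validation...
valid_df = toy.groupby(by=["species", "sex"]).sample(n=1, random_state=123)

# ...and the training rows are everything *not* chosen above.
train_df = toy.loc[~toy.index.isin(valid_df.index)]

print(len(valid_df), len(train_df))  # 4 4
```

Fixing ``random_state`` makes the split reproducible, so the training and validation ``Dataset`` objects always agree on which rows they own.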
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (Optional) Task 4b\n",
"\n",
"- Note: here we create a training and validation set.\n",
" - We allow the model to learn directly from the training set — i.e. we fit the function to these data.\n",
" - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n",
"- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data."
"Apply the `torchvision.transforms.Compose` transformations instead of hard-coding the conversions as above."
]
},
{
@@ -417,7 +565,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.12.2"
}
},
"nbformat": 4,