From d1927404e774e7747f63c218f973184989023c63 Mon Sep 17 00:00:00 2001 From: ayush chaurasia Date: Mon, 28 Jul 2025 16:37:26 +0530 Subject: [PATCH] add notebook --- .../workflow.ipynb | 1157 +++++++++++++++++ 1 file changed, 1157 insertions(+) create mode 100644 docs/src/examples/geneva/ecommerce-multi-index-routing/workflow.ipynb diff --git a/docs/src/examples/geneva/ecommerce-multi-index-routing/workflow.ipynb b/docs/src/examples/geneva/ecommerce-multi-index-routing/workflow.ipynb new file mode 100644 index 0000000..588375b --- /dev/null +++ b/docs/src/examples/geneva/ecommerce-multi-index-routing/workflow.ipynb @@ -0,0 +1,1157 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "id": "ecfe1790-ecf0-44df-928e-c31173b0ac5a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.10/site-packages/lancedb/__init__.py:238: UserWarning: lance is not fork-safe. If you are using multiprocessing, use spawn instead.\n", + " warnings.warn(\n", + "/opt/conda/lib/python3.10/site-packages/lance/__init__.py:168: UserWarning: lance is not fork-safe. If you are using multiprocessing, use spawn instead.\n", + " warnings.warn(\n", + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install --upgrade geneva lancedb google-genai kubernetes \"ray[default]\" rerankers -q" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "537f53ab-39cf-459c-9ff2-f51eaad95286", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", + "100 23.0G 100 23.0G 0 0 221M 0 0:01:46 0:01:46 --:--:-- 209M\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "#!/bin/bash\n", + "!curl -L -o fashion-product-images-dataset.zip\\\n", + " https://www.kaggle.com/api/v1/datasets/download/paramaggarwal/fashion-product-images-dataset\n", + "\n", + "!unzip -q fashion-product-images-dataset.zip \n", + "#### TOY DATASET ###3\n", + "#!/bin/bash\n", + "#!curl -L -o fashion-product-images-small.zip\\\n", + "# https://www.kaggle.com/api/v1/datasets/download/paramaggarwal/fashion-product-images-small\n", + "# !unzip -q fashion-product-images-small.zip -d fashion-dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "73e34039-92b8-4a80-b8c1-a2316e2c0da6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!sudo rm -r db" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8dc98a95-87bd-4519-b5ea-518e139a08dc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import time\n", + "t1 = time.time()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d154607-c42c-4a20-bbd0-586fdcb0a7ac", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", 
+ "import io\n", + "import geneva\n", + "import lancedb\n", + "import concurrent.futures\n", + "\n", + "\n", + "import pandas as pd\n", + "import geneva as gv\n", + "import pyarrow as pa\n", + "\n", + "from pathlib import Path\n", + "from PIL import Image\n", + "from google import genai\n", + "\n", + "\n", + "import torch\n", + "from transformers import CLIPProcessor, CLIPModel\n", + "\n", + "# === CONFIG ===\n", + "os.environ[\"API_KEY\"] = os.getenv(\"GEMINI_API_KEY\") or \"...\"\n", + "\n", + "\n", + "IMG_DIR = Path(\"fashion-dataset/images\")\n", + "STYLE_CSV = Path(\"fashion-dataset/styles.csv\")\n", + "DB_PATH = \"./db\"\n", + "TABLE_NAME = \"products\"\n", + "INSERT_FRAG_SIZE = 10000" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "539b21bf-219d-4adf-b697-1c5b706fc04b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "44417\n" + ] + }, + { + "data": { + "text/plain": [ + "44412" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(STYLE_CSV, on_bad_lines='skip')\n", + "df = df.dropna(subset=[\"id\", \"productDisplayName\"]) \n", + "df = df.drop_duplicates(subset=[\"id\"], keep=\"first\") \n", + "df = df.dropna(subset=[\"id\", \"productDisplayName\"]) # set to 100 for testing\n", + "print(len(df))\n", + "def generate_rows(df, img_dir):\n", + " for _, row in df.iterrows():\n", + " img_path = img_dir / f\"{row['id']}.jpg\"\n", + " if not img_path.exists():\n", + " continue\n", + " with open(img_path, \"rb\") as f:\n", + " yield {\n", + " \"id\": int(row[\"id\"]),\n", + " \"description\": row[\"productDisplayName\"],\n", + " \"image_bytes\": f.read()\n", + " }\n", + "\n", + "db = lancedb.connect(DB_PATH)\n", + "if TABLE_NAME in db.table_names():\n", + " db.drop_table(TABLE_NAME)\n", + " \n", + "data_stream = generate_rows(df, IMG_DIR)\n", + "table = None\n", + "\n", + "rows = []\n", + "for row in 
data_stream:\n", + " rows.append(row)\n", + " if len(rows) == INSERT_FRAG_SIZE:\n", + " if table:\n", + " table.add(rows)\n", + " else:\n", + " table = db.create_table(TABLE_NAME, data=rows)\n", + " rows = []\n", + "if rows:\n", + " table.add(rows)\n", + " \n", + "len(table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1a4850d-1cc4-4256-8881-1f9be541902c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "# === FEATURE ENGINEERING with Geneva UDFs ===\n", + "table = gv.connect(DB_PATH).open_table(TABLE_NAME)\n", + "\n", + "# Simple tag extractors\n", + "@gv.udf\n", + "def color_tags(description: str)-> str:\n", + " colors = [\"black\", \"white\", \"red\", \"blue\", \"green\", \"yellow\", \"pink\", \"brown\"]\n", + " return \" , \".join([c for c in colors if c in description.lower()])\n", + "\n", + "\n", + "\n", + "@gv.udf(data_type=pa.string())\n", + "def occasion_tagger(batch: pa.RecordBatch) -> pa.Array:\n", + " _gemini = genai.Client(api_key=\"...\")\n", + " descriptions = batch.column(\"description\").to_pylist()\n", + "\n", + " def call(desc: str) -> str:\n", + " prompt = (\n", + " f\"Based on the following product description, describe the most suitable \"\n", + " f\"occasion(s) to wear this dress in ≤25 words:\\n\\n{desc}\"\n", + " )\n", + " resp = _gemini.models.generate_content(\n", + " model=\"gemini-2.5-flash-lite\",\n", + " contents=prompt,\n", + " config={\"temperature\": 0.0},\n", + " )\n", + " return resp.text.strip() if resp.text else None\n", + "\n", + " with concurrent.futures.ThreadPoolExecutor(max_workers=80) as exec:\n", + " occasions = list(exec.map(call, descriptions))\n", + "\n", + " return pa.array(occasions, type=pa.string())\n", + "\n", + " \n", + "\n", + "@gv.udf(data_type=pa.string())\n", + "def summarizer(batch: pa.RecordBatch) -> pa.Array:\n", + " _gemini = genai.Client(api_key=\"...\")\n", + " descriptions = batch.column(\"description\").to_pylist()\n", + " \n", + " def 
call(desc: str) -> str:\n", + " resp = _gemini.models.generate_content(\n", + " model=\"gemini-2.5-flash-lite\",\n", + " contents=f\"Summarize in ≤25 words:\\n{desc}\",\n", + " config={\"temperature\": 0.0},\n", + " )\n", + " return resp.text.strip() if resp.text else None\n", + " \n", + " with concurrent.futures.ThreadPoolExecutor(max_workers=80) as exec:\n", + " summaries = list(exec.map(call, descriptions))\n", + " \n", + " return pa.array(summaries, type=pa.string())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "182c6074-6a82-4e51-8ce3-92c34f775c4d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.add_columns({\n", + " \"color_tags\": color_tags,\n", + " \"occasion\": occasion_tagger,\n", + " \"summary\": summarizer\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "57c464c1-4694-4a7d-a052-d6171b6e804f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "db = geneva.connect(DB_PATH)\n", + "if TABLE_NAME in db.table_names():\n", + " table = db[TABLE_NAME]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "714f496f-b3ae-4c82-8c27-b30f68416afd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "id: int64\n", + "description: string\n", + "image_bytes: binary\n", + "color_tags: string\n", + " -- field metadata --\n", + " virtual_column.platform.python_version: '3.10.18'\n", + " virtual_column.platform.system: 'Linux'\n", + " virtual_column: 'true'\n", + " virtual_column.udf: '_udfs/682cde9a5fae61d4a646676e925c720ccf8752b2aa6a' + 20\n", + " virtual_column.udf_inputs: '[\"description\"]'\n", + " virtual_column.platform.arch: 'x86_64'\n", + " virtual_column.udf_name: 'color_tags'\n", + " virtual_column.udf_backend: 'DockerUDFSpecV1'\n", + "occasion: string\n", + " -- field metadata --\n", + " virtual_column: 'true'\n", + " virtual_column.platform.system: 'Linux'\n", + " virtual_column.udf_backend: 'DockerUDFSpecV1'\n", + 
" virtual_column.udf: '_udfs/78df167965d047a24e1d56bdb4860a4db5559020d570' + 20\n", + " virtual_column.platform.arch: 'x86_64'\n", + " virtual_column.platform.python_version: '3.10.18'\n", + " virtual_column.udf_inputs: 'null'\n", + " virtual_column.udf_name: 'occasion_tagger'\n", + "summary: string\n", + " -- field metadata --\n", + " virtual_column.udf: '_udfs/4b4610b4dcc9cf7b8a8f8176427e26940a6d0ae5bf18' + 20\n", + " virtual_column.udf_inputs: 'null'\n", + " virtual_column: 'true'\n", + " virtual_column.udf_name: 'summarizer'\n", + " virtual_column.udf_backend: 'DockerUDFSpecV1'\n", + " virtual_column.platform.system: 'Linux'\n", + " virtual_column.platform.arch: 'x86_64'\n", + " virtual_column.platform.python_version: '3.10.18'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.schema" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ca9b5af9-faba-497a-8e76-878ee7246711", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import math\n", + "\n", + "CONCURRENCY = 9 # REDUCE to 4 ON COLAB \n", + "\n", + "BATCH_SIZE = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "289fe85e-e7f9-4502-aec8-33e2fdaff70f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e413bb3ef72144caac523cc20fd76a7b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cluster nodes provisioned: | 0 [00:00]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "geneva compute context not ready\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0dac32f4a9d8474b91facd32306effc3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Workers scheduled: | 0 [00:00]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "\u001b[90m[\u001b[0m2025-07-28T10:32:32Z \u001b[33mWARN \u001b[0m lance::dataset::write::insert\u001b[90m]\u001b[0m No existing dataset at /home/jupyter/semantic_router/db/geneva_jobs.lance, it will be created\n", + "/opt/conda/lib/python3.10/site-packages/lancedb/__init__.py:238: UserWarning: lance is not fork-safe. If you are using multiprocessing, use spawn instead.\n", + " warnings.warn(\n", + "/opt/conda/lib/python3.10/site-packages/lance/__init__.py:168: UserWarning: lance is not fork-safe. If you are using multiprocessing, use spawn instead.\n", + " warnings.warn(\n", + "/opt/conda/lib/python3.10/site-packages/lancedb/__init__.py:238: UserWarning: lance is not fork-safe. If you are using multiprocessing, use spawn instead.\n", + " warnings.warn(\n", + "/opt/conda/lib/python3.10/site-packages/lance/__init__.py:168: UserWarning: lance is not fork-safe. If you are using multiprocessing, use spawn instead.\n", + " warnings.warn(\n", + "2025-07-28 10:32:37,770\tINFO worker.py:1918 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32m127.0.0.1:8265 \u001b[39m\u001b[22m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1d6fda12e4144ca98909b71da3fa542f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Workers started: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "46bf2402d2214c718815b35a08945471", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches checkpointed: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e950d38de47c4f3dbc8a69ba46860e98", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fragments written: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:32:42Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:32:42Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:32:42Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. 
This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:32:43Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:32:43Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n" + ] + }, + { + "data": { + "text/plain": [ + "'e00a3a97-7b06-4cd4-9137-819ad0fd8cc1'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:38:04Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:38:04Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:38:04Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:38:04Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. 
This operation is not stable yet, please use with caution.\n", + "\u001b[36m(run_ray_add_column_remote pid=2287045)\u001b[0m \u001b[90m[\u001b[0m2025-07-28T10:38:04Z \u001b[33mWARN \u001b[0m lance::dataset::transaction\u001b[90m]\u001b[0m Building manifest with DataReplacement operation. This operation is not stable yet, please use with caution.\n" + ] + } + ], + "source": [ + "table.backfill(\"color_tags\", batch_size=BATCH_SIZE, concurrency=CONCURRENCY)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2d04933e-682c-4a91-89fc-490ac2114833", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d4de7d7703744ada8f482645b9381ae0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cluster nodes provisioned: | 0 [00:00]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "679fd68968fa424c9cbb852e33a44863", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Workers scheduled: | 0 [00:00]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0f3a4b045f2443e98f6a839701b965ff", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Workers started: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "237ff4aa04984b63a7959def7af45528", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches checkpointed: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7fa4c3760764452c928cc08f0a9edc5a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fragments written: 0%| | 0/5 [00:00 1\u001b[0m 
\u001b[43mtable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackfill\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msummary\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mBATCH_SIZE\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconcurrency\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mCONCURRENCY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwhere\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m1=1\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/geneva/table.py:380\u001b[0m, in \u001b[0;36mTable.backfill\u001b[0;34m(self, col_name, input_columns, udf, where, concurrency, intra_applier_concurrency, refresh_status_secs, **kwargs)\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;66;03m# Kick off the job\u001b[39;00m\n\u001b[1;32m 370\u001b[0m fut \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbackfill_async(\n\u001b[1;32m 371\u001b[0m col_name,\n\u001b[1;32m 372\u001b[0m input_columns\u001b[38;5;241m=\u001b[39minput_columns,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 378\u001b[0m )\n\u001b[0;32m--> 380\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mfut\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrefresh_status_secs\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 381\u001b[0m \u001b[38;5;66;03m# wait for the backfill to complete, updating statuses\u001b[39;00m\n\u001b[1;32m 382\u001b[0m cs\u001b[38;5;241m.\u001b[39mget_status()\n\u001b[1;32m 383\u001b[0m 
fut\u001b[38;5;241m.\u001b[39mstatus()\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/geneva/runners/ray/pipeline.py:801\u001b[0m, in \u001b[0;36mRayJobFuture.done\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 799\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdone\u001b[39m(\u001b[38;5;28mself\u001b[39m, timeout: \u001b[38;5;28mfloat\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.0\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n\u001b[1;32m 800\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus()\n\u001b[0;32m--> 801\u001b[0m ready, _ \u001b[38;5;241m=\u001b[39m \u001b[43mray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mray_obj_ref\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 802\u001b[0m done \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mbool\u001b[39m(ready)\n\u001b[1;32m 803\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m done:\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/_private/auto_init_hook.py:22\u001b[0m, in \u001b[0;36mwrap_auto_init..auto_init_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(fn)\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mauto_init_wrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 21\u001b[0m auto_init_ray()\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/_private/client_mode_hook.py:104\u001b[0m, in \u001b[0;36mclient_mode_hook..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(ray, func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m)(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/_private/worker.py:3089\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(ray_waitables, num_returns, timeout, fetch_local)\u001b[0m\n\u001b[1;32m 3087\u001b[0m timeout \u001b[38;5;241m=\u001b[39m timeout \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;241m10\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m6\u001b[39m\n\u001b[1;32m 3088\u001b[0m timeout_milliseconds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(timeout \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m1000\u001b[39m)\n\u001b[0;32m-> 3089\u001b[0m ready_ids, 
remaining_ids \u001b[38;5;241m=\u001b[39m \u001b[43mworker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcore_worker\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3090\u001b[0m \u001b[43m \u001b[49m\u001b[43mray_waitables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3091\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_returns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3092\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout_milliseconds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3093\u001b[0m \u001b[43m \u001b[49m\u001b[43mfetch_local\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3094\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3095\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ready_ids, remaining_ids\n", + "File \u001b[0;32mpython/ray/_raylet.pyx:3512\u001b[0m, in \u001b[0;36mray._raylet.CoreWorker.wait\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpython/ray/includes/common.pxi:83\u001b[0m, in \u001b[0;36mray._raylet.check_status\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "table.backfill(\"summary\", batch_size=BATCH_SIZE, concurrency=CONCURRENCY, where=\"1=1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "716bb917-a1aa-4868-8c98-fbdaca465e9a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.search().limit(3).to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0445b330-ab65-409b-8025-940aac400fd6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torchvision.transforms as transforms\n", + "import pyarrow as pa\n", + "import geneva as gv\n", + "import torch\n", + "from transformers import AutoTokenizer, AutoModel\n", + "\n", + "@gv.udf(data_type=pa.list_(pa.float32(), 512),\n", + " cuda=True\n", + " )\n", + "class EmbedImage:\n", + " def __init__(self):\n", + " self.ready = False\n", + "\n", + " def setup(self):\n", 
+ " self.model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\").cuda()\n", + " self.transform = transforms.Compose([\n", + " transforms.Resize((224, 224)),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(\n", + " (0.48145466, 0.4578275, 0.40821073),\n", + " (0.26862954, 0.26130258, 0.27577711)\n", + " )\n", + " ])\n", + " self.ready = True\n", + "\n", + "\n", + " def __call__(self, batch: pa.RecordBatch) -> pa.Array:\n", + " if not self.ready:\n", + " self.setup()\n", + " \n", + " img_bytes = batch.column(\"image_bytes\").to_pylist()\n", + " \n", + " def preprocess(b):\n", + " img = Image.open(io.BytesIO(b)).convert(\"RGB\")\n", + " return self.transform(img)\n", + "\n", + " with concurrent.futures.ThreadPoolExecutor(max_workers=50) as exe:\n", + " tensors = list(exe.map(preprocess, img_bytes))\n", + " \n", + " whole = torch.stack(tensors).cuda(non_blocking=True)\n", + "\n", + " with torch.no_grad(), torch.cuda.amp.autocast():\n", + " feats = self.model.get_image_features(pixel_values=whole)\n", + "\n", + " out = [v.cpu().tolist() for v in feats]\n", + " return pa.array(out, type=pa.list_(pa.float32(), 512))\n", + " \n", + "\n", + "\n", + "\n", + "@gv.udf(\n", + " data_type=pa.list_(pa.float32(), 768),\n", + " cuda=True\n", + ")\n", + "class EmbedText:\n", + " def __init__(self, column: str):\n", + " self.ready = False\n", + " self.column = column\n", + "\n", + " def setup(self):\n", + " self.tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", + " self.model = AutoModel.from_pretrained(\"BAAI/bge-base-en-v1.5\").cuda()\n", + " self.ready = True\n", + "\n", + " def __call__(self, batch: pa.RecordBatch) -> pa.Array:\n", + " if not self.ready:\n", + " self.setup()\n", + " texts = batch.column(self.column).to_pylist()\n", + " inputs = self.tokenizer(\n", + " texts,\n", + " return_tensors=\"pt\",\n", + " padding=True,\n", + " truncation=True,\n", + " max_length=256\n", + " )\n", + " inputs = {k: v.cuda() for k, v in 
inputs.items()}\n", + " with torch.no_grad():\n", + " out = self.model(**inputs)\n", + " pooled = out.last_hidden_state.mean(dim=1)\n", + " embeddings = pooled.cpu().tolist()\n", + " return pa.array(embeddings, type=pa.list_(pa.float32(), 768))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce41cc0-9d95-4173-97d1-2a29b40f075b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.add_columns({\n", + " \"image_embedding\": EmbedImage(), \n", + " \"summary_embedding\": EmbedText(\"summary\"),\n", + " \"occasion_embedding\": EmbedText(\"occasion\")\n", + "})\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dacc973-566f-4127-a3a7-a168d38bd38d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.backfill(\"summary_embedding\", batch_size=BATCH_SIZE, where=\"1=1\", concurrency=CONCURRENCY )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f201d95e-23e5-4188-a6a3-c5c1ebdc4e4a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.backfill(\"occasion_embedding\", batch_size=BATCH_SIZE, concurrency=CONCURRENCY, where=\"1=1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ead7da89-5468-4b9e-89d9-4814a5928386", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.backfill(\"image_embedding\", batch_size=BATCH_SIZE, concurrency=CONCURRENCY, where=\"1=1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f447ce-6424-479a-b4cd-be3b1a48c889", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.search().limit(4).to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb807cd-32dd-4e9d-aca7-39828e412b0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.create_fts_index(\"summary\")\n", + "table.create_fts_index(\"occasion\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "db35d78b-356d-49b3-b36c-60c539b9048f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "table.create_index(vector_column_name=\"summary_embedding\", num_sub_vectors=128)\n", + "table.create_index(vector_column_name=\"occasion_embedding\", num_sub_vectors=128)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61673ab0-71af-444f-b732-5ad75d5b5ce6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "t2 = time.time()\n", + "t = t2-t1\n", + "t" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbc59315-3902-42de-b5c9-4d22b1d23150", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import io\n", + "import lancedb\n", + "import pyarrow as pa\n", + "from google import genai\n", + "from transformers import CLIPProcessor, CLIPModel\n", + "import torch\n", + "from typing import List, Optional\n", + "import matplotlib.pyplot as plt\n", + "from PIL import Image\n", + "from lancedb.rerankers import ColbertReranker\n", + "\n", + "\n", + "\n", + "genai_client = genai.Client(api_key=\"...\")\n", + "\n", + "# === CONNECT TO LANCEDB ===\n", + "db = lancedb.connect(DB_PATH)\n", + "tbl = db.open_table(\"products\")\n", + "\n", + "# === CLIP FOR QUERY EMBEDDING ===\n", + "clip = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\").cuda()\n", + "proc = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n", + "\n", + "# === FEATURE DESCRIPTIONS FOR ROUTING ===\n", + "FEATURE_DESCRIPTIONS = {\n", + " \"summary_embedding\": \"semantic intent captured from product summary\",\n", + " \"occasion_embedding\": \"semantic intent captured from occasion description\"\n", + "}\n", + "VECTOR_FEATURES = set(FEATURE_DESCRIPTIONS.keys())\n", + "\n", + "# === QUERY ROUTER USING GEMINI ===\n", + "def choose_feature(query: str) -> str:\n", + " options = \"\".join(f\"- `{f}`: {d}\" for f, d in FEATURE_DESCRIPTIONS.items())\n", + " prompt = (\n", + " 
f\"Given the user query:{query}\"\n", + " f\"Which one of the following features best matches the query intent? \"\n", + " f\"Choose exactly one option (only return the feature name as plain string without formatting):{options}\"\n", + " )\n", + " resp = genai_client.models.generate_content(\n", + " model=\"gemini-2.5-flash-lite\",\n", + " contents=prompt,\n", + " config={\"temperature\": 0.0}\n", + " )\n", + " return resp.text.strip()\n", + "\n", + "# === COLOR EXTRACTION ===\n", + "COLOR_LIST = [\"black\",\"white\",\"red\",\"blue\",\"green\",\"yellow\",\"pink\",\"orange\",\"grey\",\"brown\"]\n", + "def extract_color(query: str) -> Optional[str]:\n", + " q = query.lower()\n", + " for c in COLOR_LIST:\n", + " if re.search(rf\"\b{c}\b\", q):\n", + " return c\n", + " return None\n", + "\n", + "# === QUERY EMBEDDING FUNCTION ===\n", + "def embed_query(query: str) -> List[float]:\n", + " inputs = proc(text=[query], return_tensors=\"pt\", padding=True)\n", + " inputs = {k: v.cuda() for k, v in inputs.items()}\n", + " with torch.no_grad():\n", + " emb = clip.get_text_features(**inputs)[0]\n", + " return emb.cpu().tolist()\n", + "\n", + "reranker_occasion = ColbertReranker(column=\"occasion\")\n", + "reranker_summary = ColbertReranker(column=\"summary\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a934e63-4963-461e-97ad-6325049e9221", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "# === MAIN SEARCH FUNCTION (Hybrid Only) ===\n", + "def search(query: str,\n", + " k: int = 5,\n", + " color_filter: Optional[str] = None) -> List[dict]:\n", + " # Route to feature\n", + " feature = choose_feature(query)\n", + " print(f\"[Router] Selected feature: {feature}\")\n", + "\n", + " # Extract color filter\n", + " color = extract_color(query)\n", + " if color:\n", + " print(f\"[Filter] Color detected: {color}\")\n", + "\n", + " # Always hybrid search on vector feature\n", + " search_input = embed_query(query)\n", + " reranker = 
reranker_summary if feature==\"summary_embedding\" else reranker_occasion\n", + " qb = tbl.search(query_type=\"hybrid\", vector_column_name=feature).vector(search_input).text(query).rerank(reranker)\n", + " \n", + " #qb = tbl.search(search_input, query_type=\"vector\", vector_column_name=feature)\n", + " if color:\n", + " print(\"filter :\", color)\n", + " qb = qb.where(f\"color_tags like '{color}'\", prefilter=True)\n", + "\n", + " qb = qb.limit(k*2) # overfetch and rerank\n", + " return qb.to_list()[:k]\n", + "\n", + "# === DISPLAY RESULTS FUNCTION ===\n", + "def display_results(results: List[dict], cols: int = 4):\n", + " n = len(results)\n", + " rows = (n + cols - 1) // cols\n", + " fig, axes = plt.subplots(rows, cols, figsize=(cols * 3, rows * 3))\n", + " axes = axes.flatten()\n", + " for ax in axes[n:]:\n", + " ax.axis('off')\n", + " for i, item in enumerate(results):\n", + " img = Image.open(io.BytesIO(item['image_bytes']))\n", + " axes[i].imshow(img)\n", + " axes[i].set_title(item.get('description','')[:15], fontsize=8)\n", + " axes[i].axis('off')\n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "# === EXAMPLE USAGE ===\n", + "if __name__ == \"__main__\":\n", + " queries = [\n", + " \"black jacket\",\n", + " \"shirt or t-shirt for business meeting\"\n", + " ]\n", + " for q in queries:\n", + " print(f\"Query: {q}\")\n", + " res = search(q, k=5, color_filter=None)\n", + " print([r[\"occasion\"] for r in res])\n", + " display_results(res, cols=3)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb375cb5-0151-4c74-9ead-c9755ac7629c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f04d09e4-4474-4419-89cd-199b123d8ad9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "\n", + "import lancedb\n", + "from lancedb.pydantic import LanceModel, Vector\n", + "\n", + "\n", + "\n", + "class Documents(LanceModel):\n", + " 
class Documents(LanceModel):
    """Row schema for the hybrid-search smoke test: a 2-d vector plus text."""
    vec: Vector(2)
    text: str


data = [
    {"text": "rebel spaceships striking from a hidden base", "vec": [0, 0]},
    {"text": "have won their first victory against the evil Galactic Empire", "vec": [0, 0]},
    {"text": "during the battle rebel spies managed to steal secret plans", "vec": [0, 0]},
    # BUG FIX: this row used the key "vector", which does not match the
    # schema field name "vec", so the row did not conform to the schema.
    {"text": "to the Empire's ultimate weapon the Death Star", "vec": [0, 0]},
]
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table("df", schema=Documents, mode="overwrite")
# Ingest the rows; vectors are supplied explicitly in `data` (no
# embedding function is registered, so there is no auto-vectorization).
table.add(data)
# A full-text-search index on `text` is required before hybrid search.
table.create_fts_index("text")
# Hybrid search with the default re-ranker.
table.search(query_type="hybrid", vector_column_name="vec").text("text").vector([0, 0]).to_pandas()