diff --git a/docs/docs/integrations/providers/superlinked.mdx b/docs/docs/integrations/providers/superlinked.mdx
new file mode 100644
index 0000000000000..72c62f47b1ebf
--- /dev/null
+++ b/docs/docs/integrations/providers/superlinked.mdx
@@ -0,0 +1,140 @@
+---
+title: Superlinked
+description: LangChain integration package for the Superlinked retrieval stack
+---
+
+import Link from '@docusaurus/Link';
+
+### Overview
+
+Superlinked enables context-aware retrieval using multiple space types (text similarity, categorical, numerical, recency, and more). The `langchain-superlinked` package provides a LangChain-native `SuperlinkedRetriever` that plugs directly into your RAG chains.
+
+### Links
+
+- [Integration repository](https://github.com/superlinked/langchain-superlinked)
+- [Superlinked core repository](https://links.superlinked.com/langchain_repo_sl)
+- [Article: Build RAG using LangChain & Superlinked](https://links.superlinked.com/langchain_article)
+
+### Install
+
+```bash
+pip install -U langchain-superlinked superlinked
+```
+
+### Quickstart
+
+```python
+import superlinked.framework as sl
+from langchain_superlinked import SuperlinkedRetriever
+
+# 1) Define schema
+class DocumentSchema(sl.Schema):
+    id: sl.IdField
+    content: sl.String
+
+doc_schema = DocumentSchema()
+
+# 2) Define space and index
+text_space = sl.TextSimilaritySpace(
+    text=doc_schema.content, model="sentence-transformers/all-MiniLM-L6-v2"
+)
+doc_index = sl.Index([text_space])
+
+# 3) Define query
+query = (
+    sl.Query(doc_index)
+    .find(doc_schema)
+    .similar(text_space.text, sl.Param("query_text"))
+    .select([doc_schema.content])
+    .limit(sl.Param("limit"))
+)
+
+# 4) Minimal app setup
+source = sl.InMemorySource(schema=doc_schema)
+executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index])
+app = executor.run()
+source.put([
+    {"id": "1", "content": "Machine learning algorithms process data efficiently."},
+    {"id": "2", "content": "Natural language processing understands human language."},
+])
+
+# 5) LangChain retriever
+retriever = SuperlinkedRetriever(
+    sl_client=app, sl_query=query, page_content_field="content"
+)
+
+# Search
+docs = retriever.invoke("artificial intelligence", limit=2)
+for d in docs:
+    print(d.page_content)
+```
+
+### What the retriever expects (App and Query)
+
+The retriever takes two core inputs:
+
+- `sl_client`: a Superlinked App created by running an executor (e.g., `InMemoryExecutor(...).run()`)
+- `sl_query`: a `QueryDescriptor` returned by chaining `sl.Query(...).find(...).similar(...).select(...).limit(...)`
+
+Minimal setup:
+
+```python
+import superlinked.framework as sl
+from langchain_superlinked import SuperlinkedRetriever
+
+class Doc(sl.Schema):
+    id: sl.IdField
+    content: sl.String
+
+doc = Doc()
+space = sl.TextSimilaritySpace(text=doc.content, model="sentence-transformers/all-MiniLM-L6-v2")
+index = sl.Index([space])
+
+query = (
+    sl.Query(index)
+    .find(doc)
+    .similar(space.text, sl.Param("query_text"))
+    .select([doc.content])
+    .limit(sl.Param("limit"))
+)
+
+source = sl.InMemorySource(schema=doc)
+app = sl.InMemoryExecutor(sources=[source], indices=[index]).run()
+
+retriever = SuperlinkedRetriever(sl_client=app, sl_query=query, page_content_field="content")
+```
+
+Note: For a persistent vector DB, pass `vector_database=...` to the executor (e.g., Qdrant) before `.run()`.
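+
+As a minimal sketch of that swap, reusing `source`, `index`, and `query` from the setup above (the Qdrant URL and API key are placeholders you would replace with your own):
+
+```python
+# Sketch: swap in persistent Qdrant storage; the retriever wiring stays the same.
+qdrant_vector_db = sl.QdrantVectorDatabase(
+    url="https://your-qdrant-cluster.qdrant.io",  # placeholder: your Qdrant URL
+    api_key="your-api-key-here",  # placeholder: your API key
+)
+
+executor = sl.InMemoryExecutor(
+    sources=[source], indices=[index], vector_database=qdrant_vector_db
+)
+app = executor.run()
+
+retriever = SuperlinkedRetriever(sl_client=app, sl_query=query, page_content_field="content")
+```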
+ +### Use within a chain + +```python +from langchain_core.runnables import RunnablePassthrough +from langchain_core.prompts import ChatPromptTemplate +from langchain_openai import ChatOpenAI + +def format_docs(docs): + return "\n\n".join(doc.page_content for doc in docs) + +prompt = ChatPromptTemplate.from_template( + """ + Answer based on context:\n\nContext: {context}\nQuestion: {question} + """ +) + +chain = ({"context": retriever | format_docs, "question": RunnablePassthrough()} + | prompt + | ChatOpenAI()) + +answer = chain.invoke("How does machine learning work?") +``` + +### Resources + +- PyPI: langchain-superlinked +- PyPI: superlinked +- Source repository +- Superlinked core repository +- Build RAG using LangChain & Superlinked (article) + + diff --git a/docs/docs/integrations/retrievers/superlinked.ipynb b/docs/docs/integrations/retrievers/superlinked.ipynb new file mode 100644 index 0000000000000..c6fb15c819abf --- /dev/null +++ b/docs/docs/integrations/retrievers/superlinked.ipynb @@ -0,0 +1,1292 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "7fb27b941602401d91542211134fc71a", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "---\n", + "sidebar_label: SuperlinkedRetriever\n", + "---\n" + ] + }, + { + "cell_type": "raw", + "id": "8ce296f3", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "# SuperlinkedRetriever\n", + "\n", + "> [Superlinked](https://github.com/superlinked/superlinked) is a library for building context-aware vector search applications. It provides multi-modal vector spaces that can handle text similarity, categorical similarity, recency, and numerical values with flexible weighting strategies.\n", + "\n", + "This will help you get started with the SuperlinkedRetriever [retriever](/docs/concepts/retrievers/). For detailed documentation of all SuperlinkedRetriever features and configurations head to the [API reference](https://python.langchain.com/api_reference/superlinked/retrievers/langchain_superlinked.retrievers.SuperlinkedRetriever.html).\n", + "\n", + "### Further reading\n", + "\n", + "- External article: [Build RAG using LangChain & Superlinked](https://links.superlinked.com/langchain_article)\n", + "- Integration repo: [superlinked/langchain-superlinked](https://github.com/superlinked/langchain-superlinked)\n", + "- Superlinked core repo: [superlinked/superlinked](https://links.superlinked.com/langchain_repo_sl)\n", + "\n", + "### Integration details\n", + "\n", + "| Retriever | Source | Package |\n", + "| :--- | :--- | :---: |\n", + "[SuperlinkedRetriever](https://python.langchain.com/api_reference/superlinked/retrievers/langchain_superlinked.retrievers.SuperlinkedRetriever.html) | Multi-modal vector search | langchain-superlinked |\n", + "\n", + "## Setup\n", + "\n", + "The SuperlinkedRetriever requires the `langchain-superlinked` package and its peer dependency `superlinked`. 
You can install these with:\n", + "\n", + "```bash\n", + "pip install -U langchain-superlinked superlinked\n", + "```\n", + "\n", + "No API keys are required for basic usage as Superlinked can run in-memory or with local vector databases.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efd00169", + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: Set up for vector database usage\n", + "# import os\n", + "# os.environ[\"QDRANT_API_KEY\"] = \"your-api-key\" # For Qdrant\n", + "# No setup required for in-memory usage\n" + ] + }, + { + "cell_type": "markdown", + "id": "aff64151", + "metadata": {}, + "source": [ + "### App and Query: what the retriever needs\n", + "\n", + "The retriever requires:\n", + "\n", + "- `sl_client`: a Superlinked App created by an executor's `.run()`\n", + "- `sl_query`: a `QueryDescriptor` built via `sl.Query(...).find(...).similar(...).select(...).limit(...)`\n", + "\n", + "Minimal example:\n", + "\n", + "```python\n", + "import superlinked.framework as sl\n", + "from langchain_superlinked import SuperlinkedRetriever\n", + "\n", + "class Doc(sl.Schema):\n", + " id: sl.IdField\n", + " content: sl.String\n", + "\n", + "doc = Doc()\n", + "space = sl.TextSimilaritySpace(text=doc.content, model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "index = sl.Index([space])\n", + "\n", + "query = (\n", + " sl.Query(index)\n", + " .find(doc)\n", + " .similar(space.text, sl.Param(\"query_text\"))\n", + " .select([doc.content])\n", + " .limit(sl.Param(\"limit\"))\n", + ")\n", + "\n", + "source = sl.InMemorySource(schema=doc)\n", + "app = sl.InMemoryExecutor(sources=[source], indices=[index]).run()\n", + "\n", + "retriever = SuperlinkedRetriever(sl_client=app, sl_query=query, page_content_field=\"content\")\n", + "```\n", + "\n", + "For a production setup, create the executor with a vector DB (e.g., Qdrant) and pass it as `vector_database=...` before calling `.run()`.\n" + ] + }, + { + "cell_type": "raw", + "id": "acae54e37e7d407bbb7b55eff062a284", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## Instantiation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "346a75b0", + "metadata": {}, + "outputs": [], + "source": [ + "import superlinked.framework as sl\n", + "from langchain_superlinked import SuperlinkedRetriever\n", + "\n", + "\n", + "# 1. Define Schema\n", + "class DocumentSchema(sl.Schema):\n", + " id: sl.IdField\n", + " content: sl.String\n", + "\n", + "\n", + "doc_schema = DocumentSchema()\n", + "\n", + "# 2. Define Space and Index\n", + "text_space = sl.TextSimilaritySpace(\n", + " text=doc_schema.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + ")\n", + "doc_index = sl.Index([text_space])\n", + "\n", + "# 3. Define Query\n", + "query = (\n", + " sl.Query(doc_index)\n", + " .find(doc_schema)\n", + " .similar(text_space.text, sl.Param(\"query_text\"))\n", + " .select([doc_schema.content])\n", + " .limit(sl.Param(\"limit\"))\n", + ")\n", + "\n", + "# 4. 
Set up data and app\n", + "documents = [\n", + " {\n", + " \"id\": \"doc1\",\n", + " \"content\": \"Machine learning algorithms can process large datasets efficiently.\",\n", + " },\n", + " {\n", + " \"id\": \"doc2\",\n", + " \"content\": \"Natural language processing enables computers to understand human language.\",\n", + " },\n", + " {\n", + " \"id\": \"doc3\",\n", + " \"content\": \"Deep learning models require significant computational resources.\",\n", + " },\n", + " {\n", + " \"id\": \"doc4\",\n", + " \"content\": \"Artificial intelligence is transforming various industries.\",\n", + " },\n", + " {\n", + " \"id\": \"doc5\",\n", + " \"content\": \"Neural networks are inspired by biological brain structures.\",\n", + " },\n", + "]\n", + "\n", + "source = sl.InMemorySource(schema=doc_schema)\n", + "executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index])\n", + "app = executor.run()\n", + "source.put(documents)\n", + "\n", + "# 5. Create Retriever\n", + "retriever = SuperlinkedRetriever(\n", + " sl_client=app, sl_query=query, page_content_field=\"content\", k=3\n", + ")" + ] + }, + { + "cell_type": "raw", + "id": "9a63283cbaf04dbcab1f6479b197f3a8", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## Usage\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcc79d71", + "metadata": {}, + "outputs": [], + "source": [ + "# Basic usage\n", + "results = retriever.invoke(\"artificial intelligence and machine learning\", limit=2)\n", + "for i, doc in enumerate(results, 1):\n", + " print(f\"Document {i}:\")\n", + " print(f\"Content: {doc.page_content}\")\n", + " print(f\"Metadata: {doc.metadata}\")\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4afdd5ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Override k parameter at query time\n", + "more_results = retriever.invoke(\"neural networks and deep learning\", k=4)\n", + "print(f\"Retrieved {len(more_results)} documents:\")\n", + "for i, doc in enumerate(more_results, 1):\n", + " print(f\"{i}. 
{doc.page_content[:50]}...\")" + ] + }, + { + "cell_type": "raw", + "id": "8dd0d8092fe74a7c96281538738b07e2", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## Use within a chain\n", + "\n", + "Like other retrievers, SuperlinkedRetriever can be incorporated into LLM applications via [chains](/docs/how_to/sequence/).\n", + "\n", + "We will need a LLM or chat model:\n", + "\n", + "import ChatModelTabs from \"@theme/ChatModelTabs\";\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a42e63cf", + "metadata": {}, + "outputs": [], + "source": [ + "# pip install -qU langchain-openai\n", + "import getpass\n", + "import os\n", + "\n", + "if not os.environ.get(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter your OpenAI API key: \")\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0158acc1", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain import hub\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "\n", + "prompt = hub.pull(\"rlm/rag-prompt\")\n", + "\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "\n", + "rag_chain = (\n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "rag_chain.invoke(\"What is machine learning and how does it work?\")" + ] + }, + { + "cell_type": "raw", + "id": "72eea5119410473aa328ad9291626812", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all SuperlinkedRetriever features and configurations, head to the [API reference](https://python.langchain.com/api_reference/superlinked/retrievers/langchain_superlinked.retrievers.SuperlinkedRetriever.html).\n" + ] + }, + { + "cell_type": "markdown", + "id": "f76bb133", + "metadata": {}, + "source": [ + "\"\"\"\n", + "SuperlinkedRetriever Usage Examples\n", + "\n", + "This file demonstrates how to use the SuperlinkedRetriever with different\n", + "space configurations to showcase its flexibility across various use cases.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31a9d4f5", + "metadata": {}, + "outputs": [], + "source": [ + "import superlinked.framework as sl\n", + "from datetime import datetime, timedelta\n", + "from typing import Optional, List, Dict, Any\n", + "from langchain_core.documents import Document\n", + "\n", + "from langchain_superlinked import SuperlinkedRetriever" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a608192", + "metadata": {}, + "outputs": [], + "source": [ + "def example_1_simple_text_search():\n", + " \"\"\"\n", + " Example 1: Simple text-based semantic search\n", + " Use case: Basic document retrieval based on content similarity\n", + " \"\"\"\n", + " print(\"=== Example 1: Simple Text Search ===\")\n", + "\n", + " # 1. Define Schema\n", + " class DocumentSchema(sl.Schema):\n", + " id: sl.IdField\n", + " content: sl.String\n", + "\n", + " doc_schema = DocumentSchema()\n", + "\n", + " # 2. 
Define Space and Index\n", + " text_space = sl.TextSimilaritySpace(\n", + " text=doc_schema.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " doc_index = sl.Index([text_space])\n", + "\n", + " # 3. Define Query\n", + " query = (\n", + " sl.Query(doc_index)\n", + " .find(doc_schema)\n", + " .similar(text_space.text, sl.Param(\"query_text\"))\n", + " .select([doc_schema.content])\n", + " .limit(sl.Param(\"limit\"))\n", + " )\n", + "\n", + " # 4. Set up data and app using executor pattern\n", + " documents = [\n", + " {\n", + " \"id\": \"doc1\",\n", + " \"content\": \"Machine learning algorithms can process large datasets efficiently.\",\n", + " },\n", + " {\n", + " \"id\": \"doc2\",\n", + " \"content\": \"Natural language processing enables computers to understand human language.\",\n", + " },\n", + " {\n", + " \"id\": \"doc3\",\n", + " \"content\": \"Deep learning models require significant computational resources.\",\n", + " },\n", + " {\n", + " \"id\": \"doc4\",\n", + " \"content\": \"Data science combines statistics, programming, and domain expertise.\",\n", + " },\n", + " {\n", + " \"id\": \"doc5\",\n", + " \"content\": \"Artificial intelligence is transforming various industries.\",\n", + " },\n", + " ]\n", + "\n", + " # Create source and executor\n", + " source = sl.InMemorySource(schema=doc_schema)\n", + " executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index])\n", + " app = executor.run()\n", + "\n", + " # Add data to the source after the app is running\n", + " source.put(documents)\n", + "\n", + " # 5. Create Retriever\n", + " retriever = SuperlinkedRetriever(\n", + " sl_client=app, sl_query=query, page_content_field=\"content\"\n", + " )\n", + "\n", + " # 6. Use the retriever\n", + " results = retriever.invoke(\"artificial intelligence and machine learning\", limit=3)\n", + "\n", + " print(f\"Query: 'artificial intelligence and machine learning'\")\n", + " print(f\"Found {len(results)} documents:\")\n", + " for i, doc in enumerate(results, 1):\n", + " print(f\" {i}. {doc.page_content}\")\n", + " print()\n", + "\n", + "\n", + "def example_2_multi_space_blog_search():\n", + " \"\"\"\n", + " Example 2: Multi-space blog post search\n", + " Use case: Blog search with content, category, and recency\n", + " \"\"\"\n", + " print(\"=== Example 2: Multi-Space Blog Search ===\")\n", + "\n", + " # 1. Define Schema\n", + " class BlogPostSchema(sl.Schema):\n", + " id: sl.IdField\n", + " title: sl.String\n", + " content: sl.String\n", + " category: sl.String\n", + " published_date: sl.Timestamp\n", + " view_count: sl.Integer\n", + "\n", + " blog_schema = BlogPostSchema()\n", + "\n", + " # 2. 
Define Multiple Spaces\n", + " # Text similarity for content\n", + " content_space = sl.TextSimilaritySpace(\n", + " text=blog_schema.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " # Title similarity\n", + " title_space = sl.TextSimilaritySpace(\n", + " text=blog_schema.title, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " # Category similarity\n", + " category_space = sl.CategoricalSimilaritySpace(\n", + " category_input=blog_schema.category,\n", + " categories=[\"technology\", \"science\", \"business\", \"health\", \"travel\"],\n", + " )\n", + "\n", + " # Recency (favor recent posts)\n", + " recency_space = sl.RecencySpace(\n", + " timestamp=blog_schema.published_date,\n", + " period_time_list=[\n", + " sl.PeriodTime(timedelta(days=30)), # Last month\n", + " sl.PeriodTime(timedelta(days=90)), # Last 3 months\n", + " sl.PeriodTime(timedelta(days=365)), # Last year\n", + " ],\n", + " )\n", + "\n", + " # Popularity (based on view count)\n", + " popularity_space = sl.NumberSpace(\n", + " number=blog_schema.view_count,\n", + " min_value=0,\n", + " max_value=10000,\n", + " mode=sl.Mode.MAXIMUM,\n", + " )\n", + "\n", + " # 3. Create Index\n", + " blog_index = sl.Index(\n", + " [content_space, title_space, category_space, recency_space, popularity_space]\n", + " )\n", + "\n", + " # 4. Define Query with multiple weighted spaces\n", + " blog_query = (\n", + " sl.Query(\n", + " blog_index,\n", + " weights={\n", + " content_space: sl.Param(\"content_weight\"),\n", + " title_space: sl.Param(\"title_weight\"),\n", + " category_space: sl.Param(\"category_weight\"),\n", + " recency_space: sl.Param(\"recency_weight\"),\n", + " popularity_space: sl.Param(\"popularity_weight\"),\n", + " },\n", + " )\n", + " .find(blog_schema)\n", + " .similar(content_space.text, sl.Param(\"query_text\"))\n", + " .select(\n", + " [\n", + " blog_schema.title,\n", + " blog_schema.content,\n", + " blog_schema.category,\n", + " blog_schema.published_date,\n", + " blog_schema.view_count,\n", + " ]\n", + " )\n", + " .limit(sl.Param(\"limit\"))\n", + " )\n", + "\n", + " # 5. 
Sample blog data\n", + " from datetime import datetime\n", + "\n", + " # Convert datetime objects to unix timestamps (integers) as required by Timestamp schema field\n", + " blog_posts = [\n", + " {\n", + " \"id\": \"post1\",\n", + " \"title\": \"Introduction to Machine Learning\",\n", + " \"content\": \"Machine learning is revolutionizing how we process data and make predictions.\",\n", + " \"category\": \"technology\",\n", + " \"published_date\": int((datetime.now() - timedelta(days=5)).timestamp()),\n", + " \"view_count\": 1500,\n", + " },\n", + " {\n", + " \"id\": \"post2\",\n", + " \"title\": \"The Future of AI in Healthcare\",\n", + " \"content\": \"Artificial intelligence is transforming medical diagnosis and treatment.\",\n", + " \"category\": \"health\",\n", + " \"published_date\": int((datetime.now() - timedelta(days=15)).timestamp()),\n", + " \"view_count\": 2300,\n", + " },\n", + " {\n", + " \"id\": \"post3\",\n", + " \"title\": \"Business Analytics with Python\",\n", + " \"content\": \"Learn how to use Python for business data analysis and visualization.\",\n", + " \"category\": \"business\",\n", + " \"published_date\": int((datetime.now() - timedelta(days=45)).timestamp()),\n", + " \"view_count\": 980,\n", + " },\n", + " {\n", + " \"id\": \"post4\",\n", + " \"title\": \"Deep Learning Neural Networks\",\n", + " \"content\": \"Understanding neural networks and their applications in modern AI.\",\n", + " \"category\": \"technology\",\n", + " \"published_date\": int((datetime.now() - timedelta(days=2)).timestamp()),\n", + " \"view_count\": 3200,\n", + " },\n", + " ]\n", + "\n", + " # Create source and executor\n", + " source = sl.InMemorySource(schema=blog_schema)\n", + " executor = sl.InMemoryExecutor(sources=[source], indices=[blog_index])\n", + " app = executor.run()\n", + "\n", + " # Add data to the source after the app is running\n", + " source.put(blog_posts)\n", + "\n", + " # 6. Create Retriever\n", + " retriever = SuperlinkedRetriever(\n", + " sl_client=app,\n", + " sl_query=blog_query,\n", + " page_content_field=\"content\",\n", + " metadata_fields=[\"title\", \"category\", \"published_date\", \"view_count\"],\n", + " )\n", + "\n", + " # 7. Demonstrate different weighting strategies\n", + " scenarios = [\n", + " {\n", + " \"name\": \"Content-focused search\",\n", + " \"params\": {\n", + " \"content_weight\": 1.0,\n", + " \"title_weight\": 0.3,\n", + " \"category_weight\": 0.1,\n", + " \"recency_weight\": 0.2,\n", + " \"popularity_weight\": 0.1,\n", + " \"limit\": 3,\n", + " },\n", + " },\n", + " {\n", + " \"name\": \"Recent posts prioritized\",\n", + " \"params\": {\n", + " \"content_weight\": 0.5,\n", + " \"title_weight\": 0.2,\n", + " \"category_weight\": 0.1,\n", + " \"recency_weight\": 1.0,\n", + " \"popularity_weight\": 0.1,\n", + " \"limit\": 3,\n", + " },\n", + " },\n", + " {\n", + " \"name\": \"Popular posts with category emphasis\",\n", + " \"params\": {\n", + " \"content_weight\": 0.6,\n", + " \"title_weight\": 0.3,\n", + " \"category_weight\": 0.8,\n", + " \"recency_weight\": 0.3,\n", + " \"popularity_weight\": 0.9,\n", + " \"limit\": 3,\n", + " },\n", + " },\n", + " ]\n", + "\n", + " query_text = \"machine learning and AI applications\"\n", + "\n", + " for scenario in scenarios:\n", + " print(f\"\\n--- {scenario['name']} ---\")\n", + " print(f\"Query: '{query_text}'\")\n", + "\n", + " results = retriever.invoke(query_text, **scenario[\"params\"])\n", + "\n", + " for i, doc in enumerate(results, 1):\n", + " print(\n", + " f\" {i}. 
{doc.metadata['title']} (Category: {doc.metadata['category']}, Views: {doc.metadata['view_count']})\"\n", + " )\n", + "\n", + " print()\n", + "\n", + "\n", + "def example_3_ecommerce_product_search():\n", + " \"\"\"\n", + " Example 3: E-commerce product search\n", + " Use case: Product search with price range, brand preference, and ratings\n", + " \"\"\"\n", + " print(\"=== Example 3: E-commerce Product Search ===\")\n", + "\n", + " # 1. Define Schema\n", + " class ProductSchema(sl.Schema):\n", + " id: sl.IdField\n", + " name: sl.String\n", + " description: sl.String\n", + " brand: sl.String\n", + " price: sl.Float\n", + " rating: sl.Float\n", + " category: sl.String\n", + "\n", + " product_schema = ProductSchema()\n", + "\n", + " # 2. Define Spaces\n", + " description_space = sl.TextSimilaritySpace(\n", + " text=product_schema.description, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " name_space = sl.TextSimilaritySpace(\n", + " text=product_schema.name, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " brand_space = sl.CategoricalSimilaritySpace(\n", + " category_input=product_schema.brand,\n", + " categories=[\"Apple\", \"Samsung\", \"Sony\", \"Nike\", \"Adidas\", \"Canon\"],\n", + " )\n", + "\n", + " category_space = sl.CategoricalSimilaritySpace(\n", + " category_input=product_schema.category,\n", + " categories=[\"electronics\", \"clothing\", \"sports\", \"photography\"],\n", + " )\n", + "\n", + " # Price space (lower prices get higher scores in MINIMUM mode)\n", + " price_space = sl.NumberSpace(\n", + " number=product_schema.price,\n", + " min_value=10.0,\n", + " max_value=2000.0,\n", + " mode=sl.Mode.MINIMUM, # Favor lower prices\n", + " )\n", + "\n", + " # Rating space (higher ratings get higher scores)\n", + " rating_space = sl.NumberSpace(\n", + " number=product_schema.rating,\n", + " min_value=1.0,\n", + " max_value=5.0,\n", + " mode=sl.Mode.MAXIMUM, # Favor higher ratings\n", + " )\n", + "\n", + " # 3. Create Index\n", + " product_index = sl.Index(\n", + " [\n", + " description_space,\n", + " name_space,\n", + " brand_space,\n", + " category_space,\n", + " price_space,\n", + " rating_space,\n", + " ]\n", + " )\n", + "\n", + " # 4. Define Query\n", + " product_query = (\n", + " sl.Query(\n", + " product_index,\n", + " weights={\n", + " description_space: sl.Param(\"description_weight\"),\n", + " name_space: sl.Param(\"name_weight\"),\n", + " brand_space: sl.Param(\"brand_weight\"),\n", + " category_space: sl.Param(\"category_weight\"),\n", + " price_space: sl.Param(\"price_weight\"),\n", + " rating_space: sl.Param(\"rating_weight\"),\n", + " },\n", + " )\n", + " .find(product_schema)\n", + " .similar(description_space.text, sl.Param(\"query_text\"))\n", + " .select(\n", + " [\n", + " product_schema.name,\n", + " product_schema.description,\n", + " product_schema.brand,\n", + " product_schema.price,\n", + " product_schema.rating,\n", + " product_schema.category,\n", + " ]\n", + " )\n", + " .limit(sl.Param(\"limit\"))\n", + " )\n", + "\n", + " # 5. 
Sample product data\n", + " products = [\n", + " {\n", + " \"id\": \"prod1\",\n", + " \"name\": \"Wireless Bluetooth Headphones\",\n", + " \"description\": \"High-quality wireless headphones with noise cancellation and long battery life.\",\n", + " \"brand\": \"Sony\",\n", + " \"price\": 299.99,\n", + " \"rating\": 4.5,\n", + " \"category\": \"electronics\",\n", + " },\n", + " {\n", + " \"id\": \"prod2\",\n", + " \"name\": \"Professional DSLR Camera\",\n", + " \"description\": \"Full-frame DSLR camera perfect for professional photography and videography.\",\n", + " \"brand\": \"Canon\",\n", + " \"price\": 1299.99,\n", + " \"rating\": 4.8,\n", + " \"category\": \"photography\",\n", + " },\n", + " {\n", + " \"id\": \"prod3\",\n", + " \"name\": \"Running Shoes\",\n", + " \"description\": \"Comfortable running shoes with excellent cushioning and support for athletes.\",\n", + " \"brand\": \"Nike\",\n", + " \"price\": 129.99,\n", + " \"rating\": 4.3,\n", + " \"category\": \"sports\",\n", + " },\n", + " {\n", + " \"id\": \"prod4\",\n", + " \"name\": \"Smartphone with 5G\",\n", + " \"description\": \"Latest smartphone with 5G connectivity, advanced camera, and all-day battery.\",\n", + " \"brand\": \"Samsung\",\n", + " \"price\": 899.99,\n", + " \"rating\": 4.6,\n", + " \"category\": \"electronics\",\n", + " },\n", + " {\n", + " \"id\": \"prod5\",\n", + " \"name\": \"Bluetooth Speaker\",\n", + " \"description\": \"Portable Bluetooth speaker with waterproof design and rich sound quality.\",\n", + " \"brand\": \"Sony\",\n", + " \"price\": 79.99,\n", + " \"rating\": 4.2,\n", + " \"category\": \"electronics\",\n", + " },\n", + " ]\n", + "\n", + " # Create source and executor\n", + " source = sl.InMemorySource(schema=product_schema)\n", + " executor = sl.InMemoryExecutor(sources=[source], indices=[product_index])\n", + " app = executor.run()\n", + "\n", + " # Add data to the source after the app is running\n", + " source.put(products)\n", + "\n", + " # 6. Create Retriever\n", + " retriever = SuperlinkedRetriever(\n", + " sl_client=app,\n", + " sl_query=product_query,\n", + " page_content_field=\"description\",\n", + " metadata_fields=[\"name\", \"brand\", \"price\", \"rating\", \"category\"],\n", + " )\n", + "\n", + " # 7. 
Demonstrate different search strategies\n", + " scenarios = [\n", + " {\n", + " \"name\": \"Quality-focused search (high ratings matter most)\",\n", + " \"query\": \"wireless audio device\",\n", + " \"params\": {\n", + " \"description_weight\": 0.7,\n", + " \"name_weight\": 0.5,\n", + " \"brand_weight\": 0.2,\n", + " \"category_weight\": 0.3,\n", + " \"price_weight\": 0.1,\n", + " \"rating_weight\": 1.0, # Prioritize high ratings\n", + " \"limit\": 3,\n", + " },\n", + " },\n", + " {\n", + " \"name\": \"Budget-conscious search (price matters most)\",\n", + " \"query\": \"electronics device\",\n", + " \"params\": {\n", + " \"description_weight\": 0.6,\n", + " \"name_weight\": 0.4,\n", + " \"brand_weight\": 0.1,\n", + " \"category_weight\": 0.2,\n", + " \"price_weight\": 1.0, # Prioritize lower prices\n", + " \"rating_weight\": 0.3,\n", + " \"limit\": 3,\n", + " },\n", + " },\n", + " {\n", + " \"name\": \"Brand-focused search (brand loyalty)\",\n", + " \"query\": \"sony products\",\n", + " \"params\": {\n", + " \"description_weight\": 0.5,\n", + " \"name_weight\": 0.3,\n", + " \"brand_weight\": 1.0, # Prioritize specific brand\n", + " \"category_weight\": 0.2,\n", + " \"price_weight\": 0.2,\n", + " \"rating_weight\": 0.4,\n", + " \"limit\": 3,\n", + " },\n", + " },\n", + " ]\n", + "\n", + " for scenario in scenarios:\n", + " print(f\"\\n--- {scenario['name']} ---\")\n", + " print(f\"Query: '{scenario['query']}'\")\n", + "\n", + " results = retriever.invoke(scenario[\"query\"], **scenario[\"params\"])\n", + "\n", + " for i, doc in enumerate(results, 1):\n", + " metadata = doc.metadata\n", + " print(\n", + " f\" {i}. {metadata['name']} ({metadata['brand']}) - ${metadata['price']} - ⭐{metadata['rating']}\"\n", + " )\n", + "\n", + " print()\n", + "\n", + "\n", + "def example_4_news_article_search():\n", + " \"\"\"\n", + " Example 4: News article search with sentiment and topics\n", + " Use case: News search with content, sentiment, topic categorization, and recency\n", + " \"\"\"\n", + " print(\"=== Example 4: News Article Search ===\")\n", + "\n", + " # 1. Define Schema\n", + " class NewsArticleSchema(sl.Schema):\n", + " id: sl.IdField\n", + " headline: sl.String\n", + " content: sl.String\n", + " topic: sl.String\n", + " sentiment_score: sl.Float # -1 (negative) to 1 (positive)\n", + " published_at: sl.Timestamp\n", + " source: sl.String\n", + "\n", + " news_schema = NewsArticleSchema()\n", + "\n", + " # 2. 
Define Spaces\n", + " content_space = sl.TextSimilaritySpace(\n", + " text=news_schema.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " headline_space = sl.TextSimilaritySpace(\n", + " text=news_schema.headline, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " topic_space = sl.CategoricalSimilaritySpace(\n", + " category_input=news_schema.topic,\n", + " categories=[\n", + " \"technology\",\n", + " \"politics\",\n", + " \"business\",\n", + " \"sports\",\n", + " \"entertainment\",\n", + " \"science\",\n", + " ],\n", + " )\n", + "\n", + " source_space = sl.CategoricalSimilaritySpace(\n", + " category_input=news_schema.source,\n", + " categories=[\"Reuters\", \"BBC\", \"CNN\", \"TechCrunch\", \"Bloomberg\"],\n", + " )\n", + "\n", + " # Sentiment space (can be configured to prefer positive or negative news)\n", + " sentiment_space = sl.NumberSpace(\n", + " number=news_schema.sentiment_score,\n", + " min_value=-1.0,\n", + " max_value=1.0,\n", + " mode=sl.Mode.MAXIMUM, # Default to preferring positive news\n", + " )\n", + "\n", + " # Recency space\n", + " recency_space = sl.RecencySpace(\n", + " timestamp=news_schema.published_at,\n", + " period_time_list=[\n", + " sl.PeriodTime(timedelta(hours=6)), # Last 6 hours\n", + " sl.PeriodTime(timedelta(days=1)), # Last day\n", + " sl.PeriodTime(timedelta(days=7)), # Last week\n", + " ],\n", + " )\n", + "\n", + " # 3. Create Index\n", + " news_index = sl.Index(\n", + " [\n", + " content_space,\n", + " headline_space,\n", + " topic_space,\n", + " source_space,\n", + " sentiment_space,\n", + " recency_space,\n", + " ]\n", + " )\n", + "\n", + " # 4. Define Query\n", + " news_query = (\n", + " sl.Query(\n", + " news_index,\n", + " weights={\n", + " content_space: sl.Param(\"content_weight\"),\n", + " headline_space: sl.Param(\"headline_weight\"),\n", + " topic_space: sl.Param(\"topic_weight\"),\n", + " source_space: sl.Param(\"source_weight\"),\n", + " sentiment_space: sl.Param(\"sentiment_weight\"),\n", + " recency_space: sl.Param(\"recency_weight\"),\n", + " },\n", + " )\n", + " .find(news_schema)\n", + " .similar(content_space.text, sl.Param(\"query_text\"))\n", + " .select(\n", + " [\n", + " news_schema.headline,\n", + " news_schema.content,\n", + " news_schema.topic,\n", + " news_schema.sentiment_score,\n", + " news_schema.published_at,\n", + " news_schema.source,\n", + " ]\n", + " )\n", + " .limit(sl.Param(\"limit\"))\n", + " )\n", + "\n", + " # 5. 
Sample news data\n", + " # Convert datetime objects to unix timestamps (integers) as required by Timestamp schema field\n", + " news_articles = [\n", + " {\n", + " \"id\": \"news1\",\n", + " \"headline\": \"Major Breakthrough in AI Research Announced\",\n", + " \"content\": \"Scientists have developed a new artificial intelligence model that shows remarkable improvements in natural language understanding.\",\n", + " \"topic\": \"technology\",\n", + " \"sentiment_score\": 0.8,\n", + " \"published_at\": int((datetime.now() - timedelta(hours=2)).timestamp()),\n", + " \"source\": \"TechCrunch\",\n", + " },\n", + " {\n", + " \"id\": \"news2\",\n", + " \"headline\": \"Stock Market Faces Volatility Amid Economic Concerns\",\n", + " \"content\": \"Financial markets experienced significant fluctuations today as investors react to new economic data and policy announcements.\",\n", + " \"topic\": \"business\",\n", + " \"sentiment_score\": -0.3,\n", + " \"published_at\": int((datetime.now() - timedelta(hours=8)).timestamp()),\n", + " \"source\": \"Bloomberg\",\n", + " },\n", + " {\n", + " \"id\": \"news3\",\n", + " \"headline\": \"New Climate Research Shows Promising Results\",\n", + " \"content\": \"Recent studies indicate that innovative climate technologies are showing positive environmental impact and could help address climate change.\",\n", + " \"topic\": \"science\",\n", + " \"sentiment_score\": 0.6,\n", + " \"published_at\": int((datetime.now() - timedelta(hours=12)).timestamp()),\n", + " \"source\": \"Reuters\",\n", + " },\n", + " {\n", + " \"id\": \"news4\",\n", + " \"headline\": \"Tech Companies Report Strong Quarterly Earnings\",\n", + " \"content\": \"Several major technology companies exceeded expectations in their quarterly earnings reports, driven by AI and cloud computing growth.\",\n", + " \"topic\": \"technology\",\n", + " \"sentiment_score\": 0.7,\n", + " \"published_at\": int((datetime.now() - timedelta(hours=4)).timestamp()),\n", + " \"source\": \"CNN\",\n", + " },\n", + " ]\n", + "\n", + " # Create source and executor\n", + " source = sl.InMemorySource(schema=news_schema)\n", + " executor = sl.InMemoryExecutor(sources=[source], indices=[news_index])\n", + " app = executor.run()\n", + "\n", + " # Add data to the source after the app is running\n", + " source.put(news_articles)\n", + "\n", + " # 6. Create Retriever\n", + " retriever = SuperlinkedRetriever(\n", + " sl_client=app,\n", + " sl_query=news_query,\n", + " page_content_field=\"content\",\n", + " metadata_fields=[\n", + " \"headline\",\n", + " \"topic\",\n", + " \"sentiment_score\",\n", + " \"published_at\",\n", + " \"source\",\n", + " ],\n", + " )\n", + "\n", + " # 7. 
Demonstrate different news search strategies\n", + " print(f\"Query: 'artificial intelligence developments'\")\n", + "\n", + " # Recent technology news\n", + " results = retriever.invoke(\n", + " \"artificial intelligence developments\",\n", + " content_weight=0.8,\n", + " headline_weight=0.6,\n", + " topic_weight=0.4,\n", + " source_weight=0.2,\n", + " sentiment_weight=0.3,\n", + " recency_weight=1.0, # Prioritize recent news\n", + " limit=2,\n", + " )\n", + "\n", + " print(\"\\nRecent Technology News:\")\n", + " for i, doc in enumerate(results, 1):\n", + " metadata = doc.metadata\n", + " published_timestamp = metadata[\"published_at\"]\n", + " # Convert unix timestamp back to datetime for display calculation\n", + " published_time = datetime.fromtimestamp(published_timestamp)\n", + " hours_ago = (datetime.now() - published_time).total_seconds() / 3600\n", + " sentiment = (\n", + " \"πŸ“ˆ Positive\"\n", + " if metadata[\"sentiment_score\"] > 0\n", + " else \"πŸ“‰ Negative\"\n", + " if metadata[\"sentiment_score\"] < 0\n", + " else \"➑️ Neutral\"\n", + " )\n", + "\n", + " print(f\" {i}. {metadata['headline']}\")\n", + " print(f\" Source: {metadata['source']} | {sentiment} | {hours_ago:.1f}h ago\")\n", + "\n", + " print()\n", + "\n", + "\n", + "def demonstrate_langchain_integration():\n", + " \"\"\"\n", + " Example 5: Integration with LangChain RAG pipeline\n", + " Shows how to use the SuperlinkedRetriever in a complete RAG workflow\n", + " \"\"\"\n", + " print(\"=== Example 5: LangChain RAG Integration ===\")\n", + "\n", + " # This would typically be used with an actual LLM\n", + " # For demo purposes, we'll just show the retrieval part\n", + "\n", + " # Quick setup of a simple retriever\n", + " class FAQSchema(sl.Schema):\n", + " id: sl.IdField\n", + " question: sl.String\n", + " answer: sl.String\n", + " category: sl.String\n", + "\n", + " faq_schema = FAQSchema()\n", + "\n", + " text_space = sl.TextSimilaritySpace(\n", + " text=faq_schema.question, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " category_space = sl.CategoricalSimilaritySpace(\n", + " category_input=faq_schema.category,\n", + " categories=[\"technical\", \"billing\", \"general\", \"account\"],\n", + " )\n", + "\n", + " faq_index = sl.Index([text_space, category_space])\n", + "\n", + " faq_query = (\n", + " sl.Query(\n", + " faq_index,\n", + " weights={\n", + " text_space: sl.Param(\"text_weight\"),\n", + " category_space: sl.Param(\"category_weight\"),\n", + " },\n", + " )\n", + " .find(faq_schema)\n", + " .similar(text_space.text, sl.Param(\"query_text\"))\n", + " .select([faq_schema.question, faq_schema.answer, faq_schema.category])\n", + " .limit(sl.Param(\"limit\"))\n", + " )\n", + "\n", + " # Sample FAQ data\n", + " faqs = [\n", + " {\n", + " \"id\": \"faq1\",\n", + " \"question\": \"How do I reset my password?\",\n", + " \"answer\": \"You can reset your password by clicking 'Forgot Password' on the login page and following the email instructions.\",\n", + " \"category\": \"account\",\n", + " },\n", + " {\n", + " \"id\": \"faq2\",\n", + " \"question\": \"Why is my API not working?\",\n", + " \"answer\": \"Check your API key, rate limits, and ensure you're using the correct endpoint URL.\",\n", + " \"category\": \"technical\",\n", + " },\n", + " {\n", + " \"id\": \"faq3\",\n", + " \"question\": \"How do I upgrade my subscription?\",\n", + " \"answer\": \"Visit the billing section in your account settings to upgrade your plan.\",\n", + " \"category\": \"billing\",\n", + " },\n", + " ]\n", + 
"\n", + " # Create source and executor\n", + " source = sl.InMemorySource(schema=faq_schema)\n", + " executor = sl.InMemoryExecutor(sources=[source], indices=[faq_index])\n", + " app = executor.run()\n", + "\n", + " # Add data to the source after the app is running\n", + " source.put(faqs)\n", + "\n", + " retriever = SuperlinkedRetriever(\n", + " sl_client=app,\n", + " sl_query=faq_query,\n", + " page_content_field=\"answer\",\n", + " metadata_fields=[\"question\", \"category\"],\n", + " )\n", + "\n", + " # Simulate a RAG query\n", + " user_question = \"I can't access the API\"\n", + "\n", + " print(f\"User Question: '{user_question}'\")\n", + " print(\"Retrieving relevant context...\")\n", + "\n", + " context_docs = retriever.invoke(\n", + " user_question, text_weight=1.0, category_weight=0.3, limit=2\n", + " )\n", + "\n", + " print(\"\\nRetrieved Context:\")\n", + " for i, doc in enumerate(context_docs, 1):\n", + " print(f\" {i}. Q: {doc.metadata['question']}\")\n", + " print(f\" A: {doc.page_content}\")\n", + " print(f\" Category: {doc.metadata['category']}\")\n", + "\n", + " print(\n", + " \"\\n[In a real RAG setup, this context would be passed to an LLM to generate a response]\"\n", + " )\n", + " print()\n", + "\n", + "\n", + "def example_6_qdrant_vector_database():\n", + " \"\"\"\n", + " Example 6: Same retriever with Qdrant vector database\n", + " Use case: Production deployment with persistent vector storage\n", + "\n", + " This demonstrates that SuperlinkedRetriever is vector database agnostic.\n", + " The SAME retriever code works with Qdrant (or Redis, MongoDB) by only\n", + " changing the executor configuration, not the retriever implementation.\n", + " \"\"\"\n", + " print(\"=== Example 6: Qdrant Vector Database ===\")\n", + "\n", + " # 1. Define Schema (IDENTICAL to Example 1)\n", + " class DocumentSchema(sl.Schema):\n", + " id: sl.IdField\n", + " content: sl.String\n", + "\n", + " doc_schema = DocumentSchema()\n", + "\n", + " # 2. Define Space and Index (IDENTICAL to Example 1)\n", + " text_space = sl.TextSimilaritySpace(\n", + " text=doc_schema.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + " )\n", + "\n", + " doc_index = sl.Index([text_space])\n", + "\n", + " # 3. Define Query (IDENTICAL to Example 1)\n", + " query = (\n", + " sl.Query(doc_index)\n", + " .find(doc_schema)\n", + " .similar(text_space.text, sl.Param(\"query_text\"))\n", + " .select([doc_schema.content])\n", + " .limit(sl.Param(\"limit\"))\n", + " )\n", + "\n", + " # 4. Configure Qdrant Vector Database (ONLY DIFFERENCE!)\n", + " print(\"πŸ”§ Configuring Qdrant vector database...\")\n", + " try:\n", + " qdrant_vector_db = sl.QdrantVectorDatabase(\n", + " url=\"https://your-qdrant-cluster.qdrant.io\", # Replace with your Qdrant URL\n", + " api_key=\"your-api-key-here\", # Replace with your API key\n", + " default_query_limit=10,\n", + " vector_precision=sl.Precision.FLOAT16,\n", + " )\n", + " print(\"Qdrant configuration created (credentials needed for actual connection)\")\n", + " except Exception as e:\n", + " print(f\"Qdrant not configured (expected without credentials): {e}\")\n", + " print(\"Using in-memory fallback for demonstration...\")\n", + " qdrant_vector_db = None\n", + "\n", + " # 5. 
Set up data and app (SLIGHT DIFFERENCE - vector database parameter)\n", + " documents = [\n", + " {\n", + " \"id\": \"doc1\",\n", + " \"content\": \"Machine learning algorithms can process large datasets efficiently.\",\n", + " },\n", + " {\n", + " \"id\": \"doc2\",\n", + " \"content\": \"Natural language processing enables computers to understand human language.\",\n", + " },\n", + " {\n", + " \"id\": \"doc3\",\n", + " \"content\": \"Deep learning models require significant computational resources.\",\n", + " },\n", + " {\n", + " \"id\": \"doc4\",\n", + " \"content\": \"Data science combines statistics, programming, and domain expertise.\",\n", + " },\n", + " {\n", + " \"id\": \"doc5\",\n", + " \"content\": \"Artificial intelligence is transforming various industries.\",\n", + " },\n", + " ]\n", + "\n", + " # Create source and executor with Qdrant (or fallback to in-memory)\n", + " source = sl.InMemorySource(schema=doc_schema)\n", + "\n", + " if qdrant_vector_db:\n", + " # Production setup with Qdrant\n", + " executor = sl.InMemoryExecutor(\n", + " sources=[source],\n", + " indices=[doc_index],\n", + " vector_database=qdrant_vector_db, # This makes it use Qdrant!\n", + " )\n", + " storage_type = \"Qdrant (persistent)\"\n", + " else:\n", + " # Fallback to in-memory for demo\n", + " executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index])\n", + " storage_type = \"In-Memory (fallback)\"\n", + "\n", + " app = executor.run()\n", + "\n", + " # Add data to the source after the app is running\n", + " source.put(documents)\n", + "\n", + " # 6. Create Retriever (IDENTICAL CODE!)\n", + " retriever = SuperlinkedRetriever(\n", + " sl_client=app, sl_query=query, page_content_field=\"content\"\n", + " )\n", + "\n", + " # 7. Use the retriever (IDENTICAL CODE!)\n", + " results = retriever.invoke(\"artificial intelligence and machine learning\", limit=3)\n", + "\n", + " print(f\"Vector Storage: {storage_type}\")\n", + " print(f\"Query: 'artificial intelligence and machine learning'\")\n", + " print(f\"Found {len(results)} documents:\")\n", + " for i, doc in enumerate(results, 1):\n", + " print(f\" {i}. 
{doc.page_content}\")\n", + "\n", + " print(\n", + " \"\\nKey Insight: Same SuperlinkedRetriever code works with any vector database!\"\n", + " )\n", + " print(\n", + " \"Only executor configuration changes, retriever implementation stays identical\"\n", + " )\n", + " print(\"Switch between in-memory β†’ Qdrant β†’ Redis β†’ MongoDB without code changes\")\n", + " print()\n", + "\n", + "\n", + "def main():\n", + " \"\"\"\n", + " Run all examples to demonstrate the flexibility of SuperlinkedRetriever\n", + " \"\"\"\n", + " print(\"SuperlinkedRetriever Examples\")\n", + " print(\"=\" * 50)\n", + " print(\"This file demonstrates how the SuperlinkedRetriever can be used\")\n", + " print(\"with different space configurations for various use cases.\\n\")\n", + "\n", + " try:\n", + " example_1_simple_text_search()\n", + " example_2_multi_space_blog_search()\n", + " example_3_ecommerce_product_search()\n", + " example_4_news_article_search()\n", + " demonstrate_langchain_integration()\n", + " example_6_qdrant_vector_database()\n", + "\n", + " print(\"All examples completed successfully!\")\n", + "\n", + " except Exception as e:\n", + " print(f\"Error running examples: {e}\")\n", + " print(\"Make sure you have 'superlinked' package installed:\")\n", + " print(\"pip install superlinked\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/docs/integrations/retrievers/superlinked_examples.ipynb b/docs/docs/integrations/retrievers/superlinked_examples.ipynb new file mode 100644 index 0000000000000..4d27243989a9a --- /dev/null +++ b/docs/docs/integrations/retrievers/superlinked_examples.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SuperlinkedRetriever Examples\n", + "\n", + "This notebook demonstrates how to build a Superlinked App and Query Descriptor and use them with the LangChain `SuperlinkedRetriever`.\n", + "\n", + "Install the integration from PyPI:\n", + "\n", + "```bash\n", + "pip install -U langchain-superlinked superlinked\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Install the integration and its peer dependency:\n", + "\n", + "```bash\n", + "pip install -U langchain-superlinked superlinked\n", + "```\n", + "\n", + "## Instantiation\n", + "\n", + "See below for creating a Superlinked App (`sl_client`) and a `QueryDescriptor` (`sl_query`), then wiring them into `SuperlinkedRetriever`.\n", + "\n", + "## Usage\n", + "\n", + "Call `retriever.invoke(query_text, **params)` to retrieve `Document` objects. Examples below show single-space and multi-space setups.\n", + "\n", + "## Use within a chain\n", + "\n", + "The retriever can be used in LangChain chains by piping it into your prompt and model. 
See the main Superlinked retriever page for a full RAG example.\n", + "\n", + "## API reference\n", + "\n", + "Refer to the API docs:\n", + "\n", + "- https://python.langchain.com/api_reference/superlinked/retrievers/langchain_superlinked.retrievers.SuperlinkedRetriever.html\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import superlinked.framework as sl\n", + "from langchain_superlinked import SuperlinkedRetriever\n", + "from datetime import timedelta\n", + "\n", + "\n", + "# Define schema\n", + "class DocumentSchema(sl.Schema):\n", + " id: sl.IdField\n", + " content: sl.String\n", + "\n", + "\n", + "doc_schema = DocumentSchema()\n", + "\n", + "# Space + index\n", + "text_space = sl.TextSimilaritySpace(\n", + " text=doc_schema.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + ")\n", + "doc_index = sl.Index([text_space])\n", + "\n", + "# Query descriptor\n", + "query = (\n", + " sl.Query(doc_index)\n", + " .find(doc_schema)\n", + " .similar(text_space.text, sl.Param(\"query_text\"))\n", + " .select([doc_schema.content])\n", + " .limit(sl.Param(\"limit\"))\n", + ")\n", + "\n", + "# Minimal app\n", + "source = sl.InMemorySource(schema=doc_schema)\n", + "executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index])\n", + "app = executor.run()\n", + "\n", + "# Data\n", + "source.put(\n", + " [\n", + " {\"id\": \"1\", \"content\": \"Machine learning algorithms process data efficiently.\"},\n", + " {\n", + " \"id\": \"2\",\n", + " \"content\": \"Natural language processing understands human language.\",\n", + " },\n", + " {\"id\": \"3\", \"content\": \"Deep learning models require significant compute.\"},\n", + " ]\n", + ")\n", + "\n", + "# Retriever\n", + "retriever = SuperlinkedRetriever(\n", + " sl_client=app, sl_query=query, page_content_field=\"content\"\n", + ")\n", + "\n", + "retriever.invoke(\"artificial intelligence\", limit=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Multi-space example (blog posts)\n", + "class BlogPostSchema(sl.Schema):\n", + " id: sl.IdField\n", + " title: sl.String\n", + " content: sl.String\n", + " category: sl.String\n", + " published_date: sl.Timestamp\n", + "\n", + "\n", + "blog = BlogPostSchema()\n", + "\n", + "content_space = sl.TextSimilaritySpace(\n", + " text=blog.content, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + ")\n", + "title_space = sl.TextSimilaritySpace(\n", + " text=blog.title, model=\"sentence-transformers/all-MiniLM-L6-v2\"\n", + ")\n", + "cat_space = sl.CategoricalSimilaritySpace(\n", + " category_input=blog.category, categories=[\"technology\", \"science\", \"business\"]\n", + ")\n", + "recency_space = sl.RecencySpace(\n", + " timestamp=blog.published_date,\n", + " period_time_list=[\n", + " sl.PeriodTime(timedelta(days=30)),\n", + " sl.PeriodTime(timedelta(days=90)),\n", + " ],\n", + ")\n", + "\n", + "blog_index = sl.Index([content_space, title_space, cat_space, recency_space])\n", + "\n", + "blog_query = (\n", + " sl.Query(\n", + " blog_index,\n", + " weights={\n", + " content_space: sl.Param(\"content_weight\"),\n", + " title_space: sl.Param(\"title_weight\"),\n", + " cat_space: sl.Param(\"category_weight\"),\n", + " recency_space: sl.Param(\"recency_weight\"),\n", + " },\n", + " )\n", + " .find(blog)\n", + " .similar(content_space.text, sl.Param(\"query_text\"))\n", + " .select([blog.title, blog.content, blog.category, blog.published_date])\n", + " 
.limit(sl.Param(\"limit\"))\n", + ")\n", + "\n", + "source = sl.InMemorySource(schema=blog)\n", + "app = sl.InMemoryExecutor(sources=[source], indices=[blog_index]).run()\n", + "\n", + "from datetime import datetime\n", + "\n", + "source.put(\n", + " [\n", + " {\n", + " \"id\": \"p1\",\n", + " \"title\": \"Intro to ML\",\n", + " \"content\": \"Machine learning 101\",\n", + " \"category\": \"technology\",\n", + " \"published_date\": int((datetime.now() - timedelta(days=5)).timestamp()),\n", + " },\n", + " {\n", + " \"id\": \"p2\",\n", + " \"title\": \"AI in Healthcare\",\n", + " \"content\": \"Transforming diagnosis\",\n", + " \"category\": \"science\",\n", + " \"published_date\": int((datetime.now() - timedelta(days=15)).timestamp()),\n", + " },\n", + " ]\n", + ")\n", + "\n", + "blog_retriever = SuperlinkedRetriever(\n", + " sl_client=app,\n", + " sl_query=blog_query,\n", + " page_content_field=\"content\",\n", + " metadata_fields=[\"title\", \"category\", \"published_date\"],\n", + ")\n", + "\n", + "blog_retriever.invoke(\n", + " \"machine learning\", content_weight=1.0, recency_weight=0.5, limit=2\n", + ")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/retrievers/superlinked_examples.py b/docs/docs/integrations/retrievers/superlinked_examples.py new file mode 100644 index 0000000000000..7c0d2bf8337a4 --- /dev/null +++ b/docs/docs/integrations/retrievers/superlinked_examples.py @@ -0,0 +1,954 @@ +""" +SuperlinkedRetriever Usage Examples + +This file demonstrates how to use the SuperlinkedRetriever with different +space configurations to showcase its flexibility across various use cases. +""" +# ruff: noqa: T201, E501 +# mypy: ignore-errors + +from datetime import datetime, timedelta + +import superlinked.framework as sl + +from langchain_superlinked import SuperlinkedRetriever + + +def example_1_simple_text_search(): + """ + Example 1: Simple text-based semantic search + Use case: Basic document retrieval based on content similarity + """ + print("=== Example 1: Simple Text Search ===") + + # 1. Define Schema + class DocumentSchema(sl.Schema): + id: sl.IdField + content: sl.String + + doc_schema = DocumentSchema() + + # 2. Define Space and Index + text_space = sl.TextSimilaritySpace( + text=doc_schema.content, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + doc_index = sl.Index([text_space]) + + # 3. Define Query + query = ( + sl.Query(doc_index) + .find(doc_schema) + .similar(text_space.text, sl.Param("query_text")) + .select([doc_schema.content]) + .limit(sl.Param("limit")) + ) + + # 4. Set up data and app using executor pattern + documents = [ + { + "id": "doc1", + "content": "Machine learning algorithms can process large datasets efficiently.", + }, + { + "id": "doc2", + "content": "Natural language processing enables computers to understand human language.", + }, + { + "id": "doc3", + "content": "Deep learning models require significant computational resources.", + }, + { + "id": "doc4", + "content": "Data science combines statistics, programming, and domain expertise.", + }, + { + "id": "doc5", + "content": "Artificial intelligence is transforming various industries.", + }, + ] + + # Create source and executor + source = sl.InMemorySource(schema=doc_schema) + executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index]) + app = executor.run() + + # Add data to the source after the app is running + source.put(documents) + + # 5. 
Create Retriever + retriever = SuperlinkedRetriever( + sl_client=app, sl_query=query, page_content_field="content" + ) + + # 6. Use the retriever + results = retriever.invoke("artificial intelligence and machine learning", limit=3) + + print("Query: 'artificial intelligence and machine learning'") + print(f"Found {len(results)} documents:") + for i, doc in enumerate(results, 1): + print(f" {i}. {doc.page_content}") + print() + + +def example_2_multi_space_blog_search(): + """ + Example 2: Multi-space blog post search + Use case: Blog search with content, category, and recency + """ + print("=== Example 2: Multi-Space Blog Search ===") + + # 1. Define Schema + class BlogPostSchema(sl.Schema): + id: sl.IdField + title: sl.String + content: sl.String + category: sl.String + published_date: sl.Timestamp + view_count: sl.Integer + + blog_schema = BlogPostSchema() + + # 2. Define Multiple Spaces + # Text similarity for content + content_space = sl.TextSimilaritySpace( + text=blog_schema.content, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + # Title similarity + title_space = sl.TextSimilaritySpace( + text=blog_schema.title, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + # Category similarity + category_space = sl.CategoricalSimilaritySpace( + category_input=blog_schema.category, + categories=["technology", "science", "business", "health", "travel"], + ) + + # Recency (favor recent posts) + recency_space = sl.RecencySpace( + timestamp=blog_schema.published_date, + period_time_list=[ + sl.PeriodTime(timedelta(days=30)), # Last month + sl.PeriodTime(timedelta(days=90)), # Last 3 months + sl.PeriodTime(timedelta(days=365)), # Last year + ], + ) + + # Popularity (based on view count) + popularity_space = sl.NumberSpace( + number=blog_schema.view_count, + min_value=0, + max_value=10000, + mode=sl.Mode.MAXIMUM, + ) + + # 3. Create Index + blog_index = sl.Index( + [content_space, title_space, category_space, recency_space, popularity_space] + ) + + # 4. Define Query with multiple weighted spaces + blog_query = ( + sl.Query( + blog_index, + weights={ + content_space: sl.Param("content_weight"), + title_space: sl.Param("title_weight"), + category_space: sl.Param("category_weight"), + recency_space: sl.Param("recency_weight"), + popularity_space: sl.Param("popularity_weight"), + }, + ) + .find(blog_schema) + .similar(content_space.text, sl.Param("query_text")) + .select( + [ + blog_schema.title, + blog_schema.content, + blog_schema.category, + blog_schema.published_date, + blog_schema.view_count, + ] + ) + .limit(sl.Param("limit")) + ) + + # 5. 
Sample blog data + from datetime import datetime + + # Convert datetime objects to unix timestamps (integers) as required by Timestamp schema field + blog_posts = [ + { + "id": "post1", + "title": "Introduction to Machine Learning", + "content": "Machine learning is revolutionizing how we process data and make predictions.", + "category": "technology", + "published_date": int((datetime.now() - timedelta(days=5)).timestamp()), + "view_count": 1500, + }, + { + "id": "post2", + "title": "The Future of AI in Healthcare", + "content": "Artificial intelligence is transforming medical diagnosis and treatment.", + "category": "health", + "published_date": int((datetime.now() - timedelta(days=15)).timestamp()), + "view_count": 2300, + }, + { + "id": "post3", + "title": "Business Analytics with Python", + "content": "Learn how to use Python for business data analysis and visualization.", + "category": "business", + "published_date": int((datetime.now() - timedelta(days=45)).timestamp()), + "view_count": 980, + }, + { + "id": "post4", + "title": "Deep Learning Neural Networks", + "content": "Understanding neural networks and their applications in modern AI.", + "category": "technology", + "published_date": int((datetime.now() - timedelta(days=2)).timestamp()), + "view_count": 3200, + }, + ] + + # Create source and executor + source = sl.InMemorySource(schema=blog_schema) + executor = sl.InMemoryExecutor(sources=[source], indices=[blog_index]) + app = executor.run() + + # Add data to the source after the app is running + source.put(blog_posts) + + # 6. Create Retriever + retriever = SuperlinkedRetriever( + sl_client=app, + sl_query=blog_query, + page_content_field="content", + metadata_fields=["title", "category", "published_date", "view_count"], + ) + + # 7. Demonstrate different weighting strategies + scenarios = [ + { + "name": "Content-focused search", + "params": { + "content_weight": 1.0, + "title_weight": 0.3, + "category_weight": 0.1, + "recency_weight": 0.2, + "popularity_weight": 0.1, + "limit": 3, + }, + }, + { + "name": "Recent posts prioritized", + "params": { + "content_weight": 0.5, + "title_weight": 0.2, + "category_weight": 0.1, + "recency_weight": 1.0, + "popularity_weight": 0.1, + "limit": 3, + }, + }, + { + "name": "Popular posts with category emphasis", + "params": { + "content_weight": 0.6, + "title_weight": 0.3, + "category_weight": 0.8, + "recency_weight": 0.3, + "popularity_weight": 0.9, + "limit": 3, + }, + }, + ] + + query_text = "machine learning and AI applications" + + for scenario in scenarios: + print(f"\n--- {scenario['name']} ---") + print(f"Query: '{query_text}'") + + results = retriever.invoke(query_text, **scenario["params"]) + + for i, doc in enumerate(results, 1): + print( + f" {i}. {doc.metadata['title']} (Category: {doc.metadata['category']}, Views: {doc.metadata['view_count']})" + ) + + print() + + +def example_3_ecommerce_product_search(): + """ + Example 3: E-commerce product search + Use case: Product search with price range, brand preference, and ratings + """ + print("=== Example 3: E-commerce Product Search ===") + + # 1. Define Schema + class ProductSchema(sl.Schema): + id: sl.IdField + name: sl.String + description: sl.String + brand: sl.String + price: sl.Float + rating: sl.Float + category: sl.String + + product_schema = ProductSchema() + + # 2. 
Define Spaces + description_space = sl.TextSimilaritySpace( + text=product_schema.description, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + name_space = sl.TextSimilaritySpace( + text=product_schema.name, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + brand_space = sl.CategoricalSimilaritySpace( + category_input=product_schema.brand, + categories=["Apple", "Samsung", "Sony", "Nike", "Adidas", "Canon"], + ) + + category_space = sl.CategoricalSimilaritySpace( + category_input=product_schema.category, + categories=["electronics", "clothing", "sports", "photography"], + ) + + # Price space (lower prices get higher scores in MINIMUM mode) + price_space = sl.NumberSpace( + number=product_schema.price, + min_value=10.0, + max_value=2000.0, + mode=sl.Mode.MINIMUM, # Favor lower prices + ) + + # Rating space (higher ratings get higher scores) + rating_space = sl.NumberSpace( + number=product_schema.rating, + min_value=1.0, + max_value=5.0, + mode=sl.Mode.MAXIMUM, # Favor higher ratings + ) + + # 3. Create Index + product_index = sl.Index( + [ + description_space, + name_space, + brand_space, + category_space, + price_space, + rating_space, + ] + ) + + # 4. Define Query + product_query = ( + sl.Query( + product_index, + weights={ + description_space: sl.Param("description_weight"), + name_space: sl.Param("name_weight"), + brand_space: sl.Param("brand_weight"), + category_space: sl.Param("category_weight"), + price_space: sl.Param("price_weight"), + rating_space: sl.Param("rating_weight"), + }, + ) + .find(product_schema) + .similar(description_space.text, sl.Param("query_text")) + .select( + [ + product_schema.name, + product_schema.description, + product_schema.brand, + product_schema.price, + product_schema.rating, + product_schema.category, + ] + ) + .limit(sl.Param("limit")) + ) + + # 5. Sample product data + products = [ + { + "id": "prod1", + "name": "Wireless Bluetooth Headphones", + "description": "High-quality wireless headphones with noise cancellation and long battery life.", + "brand": "Sony", + "price": 299.99, + "rating": 4.5, + "category": "electronics", + }, + { + "id": "prod2", + "name": "Professional DSLR Camera", + "description": "Full-frame DSLR camera perfect for professional photography and videography.", + "brand": "Canon", + "price": 1299.99, + "rating": 4.8, + "category": "photography", + }, + { + "id": "prod3", + "name": "Running Shoes", + "description": "Comfortable running shoes with excellent cushioning and support for athletes.", + "brand": "Nike", + "price": 129.99, + "rating": 4.3, + "category": "sports", + }, + { + "id": "prod4", + "name": "Smartphone with 5G", + "description": "Latest smartphone with 5G connectivity, advanced camera, and all-day battery.", + "brand": "Samsung", + "price": 899.99, + "rating": 4.6, + "category": "electronics", + }, + { + "id": "prod5", + "name": "Bluetooth Speaker", + "description": "Portable Bluetooth speaker with waterproof design and rich sound quality.", + "brand": "Sony", + "price": 79.99, + "rating": 4.2, + "category": "electronics", + }, + ] + + # Create source and executor + source = sl.InMemorySource(schema=product_schema) + executor = sl.InMemoryExecutor(sources=[source], indices=[product_index]) + app = executor.run() + + # Add data to the source after the app is running + source.put(products) + + # 6. 
Create Retriever + retriever = SuperlinkedRetriever( + sl_client=app, + sl_query=product_query, + page_content_field="description", + metadata_fields=["name", "brand", "price", "rating", "category"], + ) + + # 7. Demonstrate different search strategies + scenarios = [ + { + "name": "Quality-focused search (high ratings matter most)", + "query": "wireless audio device", + "params": { + "description_weight": 0.7, + "name_weight": 0.5, + "brand_weight": 0.2, + "category_weight": 0.3, + "price_weight": 0.1, + "rating_weight": 1.0, + "limit": 3, + }, + }, + { + "name": "Budget-conscious search (price matters most)", + "query": "electronics device", + "params": { + "description_weight": 0.6, + "name_weight": 0.4, + "brand_weight": 0.1, + "category_weight": 0.2, + "price_weight": 1.0, + "rating_weight": 0.3, + "limit": 3, + }, + }, + { + "name": "Brand-focused search (brand loyalty)", + "query": "sony products", + "params": { + "description_weight": 0.5, + "name_weight": 0.3, + "brand_weight": 1.0, + "category_weight": 0.2, + "price_weight": 0.2, + "rating_weight": 0.4, + "limit": 3, + }, + }, + ] + + for scenario in scenarios: + print(f"\n--- {scenario['name']} ---") + print(f"Query: '{scenario['query']}'") + + results = retriever.invoke(scenario["query"], **scenario["params"]) + + for i, doc in enumerate(results, 1): + metadata = doc.metadata + print( + f" {i}. {metadata['name']} ({metadata['brand']}) - ${metadata['price']} - ⭐{metadata['rating']}" + ) + + print() + + +def example_4_news_article_search(): + """ + Example 4: News article search with sentiment and topics + Use case: News search with content, sentiment, topic categorization, and recency + """ + print("=== Example 4: News Article Search ===") + + # 1. Define Schema + class NewsArticleSchema(sl.Schema): + id: sl.IdField + headline: sl.String + content: sl.String + topic: sl.String + sentiment_score: sl.Float # -1 (negative) to 1 (positive) + published_at: sl.Timestamp + source: sl.String + + news_schema = NewsArticleSchema() + + # 2. Define Spaces + content_space = sl.TextSimilaritySpace( + text=news_schema.content, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + headline_space = sl.TextSimilaritySpace( + text=news_schema.headline, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + topic_space = sl.CategoricalSimilaritySpace( + category_input=news_schema.topic, + categories=[ + "technology", + "politics", + "business", + "sports", + "entertainment", + "science", + ], + ) + + source_space = sl.CategoricalSimilaritySpace( + category_input=news_schema.source, + categories=["Reuters", "BBC", "CNN", "TechCrunch", "Bloomberg"], + ) + + # Sentiment space (can be configured to prefer positive or negative news) + sentiment_space = sl.NumberSpace( + number=news_schema.sentiment_score, + min_value=-1.0, + max_value=1.0, + mode=sl.Mode.MAXIMUM, # Default to preferring positive news + ) + + # Recency space + recency_space = sl.RecencySpace( + timestamp=news_schema.published_at, + period_time_list=[ + sl.PeriodTime(timedelta(hours=6)), # Last 6 hours + sl.PeriodTime(timedelta(days=1)), # Last day + sl.PeriodTime(timedelta(days=7)), # Last week + ], + ) + + # 3. Create Index + news_index = sl.Index( + [ + content_space, + headline_space, + topic_space, + source_space, + sentiment_space, + recency_space, + ] + ) + + # 4. 
Define Query + news_query = ( + sl.Query( + news_index, + weights={ + content_space: sl.Param("content_weight"), + headline_space: sl.Param("headline_weight"), + topic_space: sl.Param("topic_weight"), + source_space: sl.Param("source_weight"), + sentiment_space: sl.Param("sentiment_weight"), + recency_space: sl.Param("recency_weight"), + }, + ) + .find(news_schema) + .similar(content_space.text, sl.Param("query_text")) + .select( + [ + news_schema.headline, + news_schema.content, + news_schema.topic, + news_schema.sentiment_score, + news_schema.published_at, + news_schema.source, + ] + ) + .limit(sl.Param("limit")) + ) + + # 5. Sample news data + # Convert datetime objects to unix timestamps (integers) as required by Timestamp schema field + news_articles = [ + { + "id": "news1", + "headline": "Major Breakthrough in AI Research Announced", + "content": "Scientists have developed a new artificial intelligence model that shows remarkable improvements in natural language understanding.", + "topic": "technology", + "sentiment_score": 0.8, + "published_at": int((datetime.now() - timedelta(hours=2)).timestamp()), + "source": "TechCrunch", + }, + { + "id": "news2", + "headline": "Stock Market Faces Volatility Amid Economic Concerns", + "content": "Financial markets experienced significant fluctuations today as investors react to new economic data and policy announcements.", + "topic": "business", + "sentiment_score": -0.3, + "published_at": int((datetime.now() - timedelta(hours=8)).timestamp()), + "source": "Bloomberg", + }, + { + "id": "news3", + "headline": "New Climate Research Shows Promising Results", + "content": "Recent studies indicate that innovative climate technologies are showing positive environmental impact and could help address climate change.", + "topic": "science", + "sentiment_score": 0.6, + "published_at": int((datetime.now() - timedelta(hours=12)).timestamp()), + "source": "Reuters", + }, + { + "id": "news4", + "headline": "Tech Companies Report Strong Quarterly Earnings", + "content": "Several major technology companies exceeded expectations in their quarterly earnings reports, driven by AI and cloud computing growth.", + "topic": "technology", + "sentiment_score": 0.7, + "published_at": int((datetime.now() - timedelta(hours=4)).timestamp()), + "source": "CNN", + }, + ] + + # Create source and executor + source = sl.InMemorySource(schema=news_schema) + executor = sl.InMemoryExecutor(sources=[source], indices=[news_index]) + app = executor.run() + + # Add data to the source after the app is running + source.put(news_articles) + + # 6. Create Retriever + retriever = SuperlinkedRetriever( + sl_client=app, + sl_query=news_query, + page_content_field="content", + metadata_fields=[ + "headline", + "topic", + "sentiment_score", + "published_at", + "source", + ], + ) + + # 7. 
Demonstrate different news search strategies + print("Query: 'artificial intelligence developments'") + + # Recent technology news + results = retriever.invoke( + "artificial intelligence developments", + content_weight=0.8, + headline_weight=0.6, + topic_weight=0.4, + source_weight=0.2, + sentiment_weight=0.3, + recency_weight=1.0, # Prioritize recent news + limit=2, + ) + + print("\nRecent Technology News:") + for i, doc in enumerate(results, 1): + metadata = doc.metadata + published_timestamp = metadata["published_at"] + # Convert unix timestamp back to datetime for display calculation + published_time = datetime.fromtimestamp(published_timestamp) + hours_ago = (datetime.now() - published_time).total_seconds() / 3600 + sentiment = ( + "πŸ“ˆ Positive" + if metadata["sentiment_score"] > 0 + else "πŸ“‰ Negative" + if metadata["sentiment_score"] < 0 + else "➑️ Neutral" + ) + + print(f" {i}. {metadata['headline']}") + print(f" Source: {metadata['source']} | {sentiment} | {hours_ago:.1f}h ago") + + print() + + +def demonstrate_langchain_integration(): + """ + Example 5: Integration with LangChain RAG pipeline + Shows how to use the SuperlinkedRetriever in a complete RAG workflow + """ + print("=== Example 5: LangChain RAG Integration ===") + + # This would typically be used with an actual LLM + # For demo purposes, we'll just show the retrieval part + + # Quick setup of a simple retriever + class FAQSchema(sl.Schema): + id: sl.IdField + question: sl.String + answer: sl.String + category: sl.String + + faq_schema = FAQSchema() + + text_space = sl.TextSimilaritySpace( + text=faq_schema.question, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + category_space = sl.CategoricalSimilaritySpace( + category_input=faq_schema.category, + categories=["technical", "billing", "general", "account"], + ) + + faq_index = sl.Index([text_space, category_space]) + + faq_query = ( + sl.Query( + faq_index, + weights={ + text_space: sl.Param("text_weight"), + category_space: sl.Param("category_weight"), + }, + ) + .find(faq_schema) + .similar(text_space.text, sl.Param("query_text")) + .select([faq_schema.question, faq_schema.answer, faq_schema.category]) + .limit(sl.Param("limit")) + ) + + # Sample FAQ data + faqs = [ + { + "id": "faq1", + "question": "How do I reset my password?", + "answer": "You can reset your password by clicking 'Forgot Password' on the login page and following the email instructions.", + "category": "account", + }, + { + "id": "faq2", + "question": "Why is my API not working?", + "answer": "Check your API key, rate limits, and ensure you're using the correct endpoint URL.", + "category": "technical", + }, + { + "id": "faq3", + "question": "How do I upgrade my subscription?", + "answer": "Visit the billing section in your account settings to upgrade your plan.", + "category": "billing", + }, + ] + + # Create source and executor + source = sl.InMemorySource(schema=faq_schema) + executor = sl.InMemoryExecutor(sources=[source], indices=[faq_index]) + app = executor.run() + + # Add data to the source after the app is running + source.put(faqs) + + retriever = SuperlinkedRetriever( + sl_client=app, + sl_query=faq_query, + page_content_field="answer", + metadata_fields=["question", "category"], + ) + + # Simulate a RAG query + user_question = "I can't access the API" + + print(f"User Question: '{user_question}'") + print("Retrieving relevant context...") + + context_docs = retriever.invoke( + user_question, text_weight=1.0, category_weight=0.3, limit=2 + ) + + print("\nRetrieved Context:") + 
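+    # Each returned Document carries the FAQ answer as page_content and the
+    # matching question/category as metadata (per page_content_field and
+    # metadata_fields configured on the retriever above).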
for i, doc in enumerate(context_docs, 1): + print(f" {i}. Q: {doc.metadata['question']}") + print(f" A: {doc.page_content}") + print(f" Category: {doc.metadata['category']}") + + print( + "\n[In a real RAG setup, this context would be passed to an LLM to generate a response]" + ) + print() + + +def example_6_qdrant_vector_database(): + """ + Example 6: Same retriever with Qdrant vector database + Use case: Production deployment with persistent vector storage + + This demonstrates that SuperlinkedRetriever is vector database agnostic. + The SAME retriever code works with Qdrant (or Redis, MongoDB) by only + changing the executor configuration, not the retriever implementation. + """ + print("=== Example 6: Qdrant Vector Database ===") + + # 1. Define Schema (IDENTICAL to Example 1) + class DocumentSchema(sl.Schema): + id: sl.IdField + content: sl.String + + doc_schema = DocumentSchema() + + # 2. Define Space and Index (IDENTICAL to Example 1) + text_space = sl.TextSimilaritySpace( + text=doc_schema.content, model="sentence-transformers/all-MiniLM-L6-v2" + ) + + doc_index = sl.Index([text_space]) + + # 3. Define Query (IDENTICAL to Example 1) + query = ( + sl.Query(doc_index) + .find(doc_schema) + .similar(text_space.text, sl.Param("query_text")) + .select([doc_schema.content]) + .limit(sl.Param("limit")) + ) + + # 4. Configure Qdrant Vector Database (ONLY DIFFERENCE!) + print("πŸ”§ Configuring Qdrant vector database...") + try: + qdrant_vector_db = sl.QdrantVectorDatabase( + url="https://your-qdrant-cluster.qdrant.io", # Replace with your Qdrant URL + api_key="your-api-key-here", # Replace with your API key + default_query_limit=10, + vector_precision=sl.Precision.FLOAT16, + ) + print( + "βœ… Qdrant configuration created (credentials needed for actual connection)" + ) + except Exception as e: + print(f"⚠️ Qdrant not configured (expected without credentials): {e}") + print("πŸ“ Using in-memory fallback for demonstration...") + qdrant_vector_db = None + + # 5. Set up data and app (SLIGHT DIFFERENCE - vector database parameter) + documents = [ + { + "id": "doc1", + "content": "Machine learning algorithms can process large datasets efficiently.", + }, + { + "id": "doc2", + "content": "Natural language processing enables computers to understand human language.", + }, + { + "id": "doc3", + "content": "Deep learning models require significant computational resources.", + }, + { + "id": "doc4", + "content": "Data science combines statistics, programming, and domain expertise.", + }, + { + "id": "doc5", + "content": "Artificial intelligence is transforming various industries.", + }, + ] + + # Create source and executor with Qdrant (or fallback to in-memory) + source = sl.InMemorySource(schema=doc_schema) + + if qdrant_vector_db: + # Production setup with Qdrant + executor = sl.InMemoryExecutor( + sources=[source], + indices=[doc_index], + vector_database=qdrant_vector_db, # πŸ‘ˆ This makes it use Qdrant! + ) + storage_type = "Qdrant (persistent)" + else: + # Fallback to in-memory for demo + executor = sl.InMemoryExecutor(sources=[source], indices=[doc_index]) + storage_type = "In-Memory (fallback)" + + app = executor.run() + + # Add data to the source after the app is running + source.put(documents) + + # 6. Create Retriever (IDENTICAL CODE!) + retriever = SuperlinkedRetriever( + sl_client=app, sl_query=query, page_content_field="content" + ) + + # 7. Use the retriever (IDENTICAL CODE!) 
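+    # Keyword arguments passed to invoke() (such as `limit` here) are forwarded
+    # to the Superlinked query and fill the corresponding sl.Param placeholders
+    # defined when the query was built.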
+    results = retriever.invoke("artificial intelligence and machine learning", limit=3)
+
+    print(f"πŸ“Š Vector Storage: {storage_type}")
+    print("πŸ” Query: 'artificial intelligence and machine learning'")
+    print(f"πŸ“„ Found {len(results)} documents:")
+    for i, doc in enumerate(results, 1):
+        print(f"  {i}. {doc.page_content}")
+
+    print(
+        "\nβœ… Key Insight: Same SuperlinkedRetriever code works with any vector database!"
+    )
+    print(
+        "βœ… Only executor configuration changes, retriever implementation stays identical"
+    )
+    print("βœ… Switch between in-memory β†’ Qdrant β†’ Redis β†’ MongoDB without code changes")
+    print()
+
+
+def main():
+    """
+    Run all examples to demonstrate the flexibility of SuperlinkedRetriever
+    """
+    print("SuperlinkedRetriever Examples")
+    print("=" * 50)
+    print("This file demonstrates how the SuperlinkedRetriever can be used")
+    print("with different space configurations for various use cases.\n")
+
+    try:
+        example_1_simple_text_search()
+        example_2_multi_space_blog_search()
+        example_3_ecommerce_product_search()
+        example_4_news_article_search()
+        demonstrate_langchain_integration()
+        example_6_qdrant_vector_database()
+
+        print("πŸŽ‰ All examples completed successfully!")
+
+    except Exception as e:
+        print(f"❌ Error running examples: {e}")
+        print("Make sure the 'superlinked' and 'langchain-superlinked' packages are installed:")
+        print("pip install langchain-superlinked superlinked")
+
+
+if __name__ == "__main__":
+    main()
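+
+# Minimal usage sketch, assuming the standard LangChain Runnable interface that
+# retrievers implement also applies here: batched and asynchronous calls take the
+# same query string and keyword arguments as invoke(), e.g.
+#
+#   docs_per_query = retriever.batch(["machine learning", "data science"])
+#   docs = await retriever.ainvoke("artificial intelligence", limit=3)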