diff --git a/.env.example b/.env.example index 25cf7ef..970e6d4 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,46 @@ -# Elastic Cloud Configuration -# Copy this file to .env and fill in your actual credentials +# Hybrid RAG Pipeline Environment Configuration +# Copy this file to .env and fill in your actual values -# Authentication Option 2: API Key (recommended for production) -# Use either username/password OR api_key, not both -ELASTIC_API_KEY=elastic-search-api-key-for-elasticsearch-index \ No newline at end of file +# =================================================================== +# AWS CONFIGURATION +# =================================================================== +# AWS credentials for S3 access (both source and destination) +AWS_ACCESS_KEY_ID=your-aws-access-key-id +AWS_SECRET_ACCESS_KEY=your-aws-secret-access-key +AWS_REGION=us-east-1 + +# =================================================================== +# UNSTRUCTURED API CONFIGURATION +# =================================================================== +# Get your API key from: https://unstructured.io +UNSTRUCTURED_API_KEY=your-unstructured-api-key +UNSTRUCTURED_API_URL=https://platform.unstructuredapp.io/api/v1 + +# =================================================================== +# ELASTICSEARCH CONFIGURATION +# =================================================================== +# Elasticsearch Cloud host URL (without https://) +# Example: my-cluster-abc123.es.us-east-1.aws.found.io:9243 +ELASTICSEARCH_HOST=your-elasticsearch-host-url + +# Elasticsearch API key (base64 encoded) +# Generate this in Kibana: Stack Management > API Keys +ELASTICSEARCH_API_KEY=your-elasticsearch-api-key + +# =================================================================== +# PIPELINE DATA SOURCES +# =================================================================== +# S3 bucket containing Bose product PDFs (manuals, troubleshooting, MSDS) +S3_SOURCE_BUCKET=example-data-bose-headphones + +# 
Elasticsearch index containing synthetic sales data +ELASTICSEARCH_INDEX=sales-records + +# =================================================================== +# OPTIONAL: ADVANCED CONFIGURATION +# =================================================================== +# AWS Session Token (only needed for temporary credentials) +# AWS_SESSION_TOKEN=your-session-token + +# Custom S3 endpoint (only needed for S3-compatible services) +# S3_ENDPOINT_URL=https://s3.amazonaws.com diff --git a/.gitignore b/.gitignore index 9bec4dc..17f2318 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,33 @@ .env -venv/ \ No newline at end of file +venv/ +.env.backup +__pycache__/ +*.pyc +*.pyo +*.pyd +.DS_Store +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +output-downloads-*/ + +hybrid_rag_pipeline.py.bak +hybrid_rag_pipeline_enriched.ipynb.bak + +feedback_r1.md +feedback_r1_cleaned.md +feedback_r2.md +feedback_r2_cleaned.md \ No newline at end of file diff --git a/README.md b/README.md index 5d2059c..c7341be 100644 --- a/README.md +++ b/README.md @@ -1,209 +1,357 @@ -# Notebook Processing Tools +# Hybrid RAG Pipeline over Multiple Data Sources -This directory contains tools for processing Jupyter notebooks and setting up data sources for hybrid RAG pipelines. +A comprehensive hybrid Retrieval-Augmented Generation (RAG) pipeline that processes multiple data sources using the Unstructured API to create a unified knowledge base for customer support applications. -## remove_images.py +## Overview -A Python script that uses regular expressions to remove embedded base64-encoded images from Python files that were converted from Jupyter notebooks using `jupytext`. +This project demonstrates how to build a hybrid RAG system that combines: -### Features +1. **Technical Documentation** (PDFs from S3) - Product manuals, troubleshooting guides, MSDS documents +2. 
**Sales Data** (Elasticsearch) - Customer interactions, product information, sales records +3. **Unified Processing** - NER enrichment, chunking, embedding, and vector storage -- Removes base64 data URL images (e.g., `![Screenshot 1](data:image/png;base64,...)`) -- Cleans up extra empty lines left behind after image removal -- Can either overwrite the original file or create a new cleaned file -- Provides detailed feedback on the number of images found and removed +The pipeline processes both structured and unstructured data sources in parallel, enriches them with Named Entity Recognition (NER), and deposits the results into a unified Elasticsearch index for RAG applications. -### Usage +## Architecture -```bash -# Remove images from a file (overwrites original) -python remove_images.py +### Parallel Workflow Processing -# Remove images and save to a new file -python remove_images.py ``` +┌─────────────────┐ ┌──────────────────────────────────────────────────────┐ +│ S3 PDFs │ │ UNSTRUCTURED API PROCESSING │ +│ (Tech Manuals, │────────────────────┤ │ +│ Safety Docs) │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +└─────────────────┘ │ │Connect │ │ Route │ │Transform│ │ Chunk │ │ + │ │ ↓ │ │ ↓ │ │ ↓ │ │ ↓ │ │ +┌─────────────────┐ WORKFLOW 1 │ │ S3 Src │→ │VLM Auto │→ │Elements │→ │By Title │ │ +│ Elasticsearch │────────────────────┤ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ +│ (Sales Records) │ │ │ +└─────────────────┘ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ + │ │Connect │ │ Route │ │Transform│ │ + WORKFLOW 2 │ │ ↓ │ │ ↓ │ │ ↓ │ │ + │ │ ES Src │→ │VLM Auto │→ │Elements │──────────────┤ + │ └─────────┘ └─────────┘ └─────────┘ │ + │ │ + │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ + │ │ Enrich │ │ Embed │ │ Persist │ │ + │ │ ↓ │ │ ↓ │ │ ↓ │ │ + │ │OpenAI │→ │OpenAI │→ │ ES │ │ + │ │ NER │ │text-emb │ │customer-│ │ + │ │ │ │ -3-small│ │support │ │ + │ └─────────┘ └─────────┘ └─────────┘ │ + └──────────────────────────────────────────────────────┘ + │ + 
┌──────────────────────────────────▼───────────────────┐ + │ UNIFIED KNOWLEDGE BASE │ + │ Elasticsearch: customer-support │ + │ │ + │ • PDF content (manuals, troubleshooting) │ + │ • Sales data (customer interactions, products) │ + │ • Consistent chunking & embeddings │ + │ • Ready for hybrid RAG queries │ + └───────────────────────────────────────────────────────┘ +``` + +### Unstructured's 7-Stage Pipeline: +1. **Connect**: Source connectors (S3, Elasticsearch) ingest data +2. **Route**: Auto partitioning strategy selects optimal processing (VLM for complex docs) +3. **Transform**: Documents converted to Unstructured's canonical JSON schema +4. **Chunk**: By-title chunking creates semantically coherent retrieval units +5. **Enrich**: Optional NER extraction adds metadata and entities +6. **Embed**: OpenAI embeddings enable semantic similarity search +7. **Persist**: Destination connector writes processed data to vector database + +## Features + +### 🔧 **Smart Elasticsearch Preprocessing** +- **Index Validation**: Automatically checks for required `sales-records-consolidated` index +- **Data Verification**: Ensures source data exists before processing +- **Fresh Destination**: Automatically recreates `customer-support` index for clean runs +- **Error Handling**: Fails fast with clear error messages if prerequisites aren't met + +### 🚀 **Parallel Workflow Processing** +- **S3 Source Connector**: Processes PDFs with VLM (Vision Language Model) parsing +- **Elasticsearch Source Connector**: Ingests sales records with rich NER data +- **Unified Destination**: Both workflows deposit into the same `customer-support` index + +### 🎯 **Advanced Processing Pipeline** +- **VLM Partitioner**: Uses GPT-4o for intelligent document parsing +- **Smart Chunker**: Context-aware chunking with title-based segmentation +- **Vector Embedder**: OpenAI text-embedding-3-small for semantic search +- **NER Enrichment**: Extracts named entities (people, places, organizations, etc.) 
+ +### 📊 **Best Practices Implementation** +- **Context Managers**: Proper resource management with `UnstructuredClient` +- **Modern API Usage**: Uses `CreateWorkflowRequest` and `CreateWorkflow` objects +- **Error Handling**: Comprehensive exception handling with clear feedback +- **Logging**: Detailed progress tracking with emoji indicators + +## Quick Start + +### Prerequisites + +1. **Environment Setup**: + ```bash + # Clone the repository + git clone <repository-url> + cd rag-over-hybrid-data-sources + + # Create and activate virtual environment + python3 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + + # Install dependencies + pip install -r requirements.txt + ``` + +2. **Configuration**: + ```bash + # Copy environment template + cp .env.example .env + + # Edit .env with your credentials: + # - UNSTRUCTURED_API_KEY=your-unstructured-api-key + # - ELASTICSEARCH_HOST=https://your-cluster.es.io:443 + # - ELASTICSEARCH_API_KEY=your-elasticsearch-api-key + # - AWS_ACCESS_KEY_ID=your-aws-access-key + # - AWS_SECRET_ACCESS_KEY=your-aws-secret-key + # - S3_SOURCE_BUCKET=your-pdf-bucket + # - S3_DESTINATION_BUCKET=your-output-bucket + ``` -### Examples +### Running the Pipeline +#### Option 1: Python Script ```bash -# Clean the converted notebook file in-place -python remove_images.py ../donor-notebooks/S3_to_Qdrant_Workflow_using_Unstructured_API.py +# Activate virtual environment +source venv/bin/activate -# Create a cleaned copy -python remove_images.py notebook.py cleaned_notebook.py +# Run the pipeline +python hybrid_rag_pipeline.py ``` -### Requirements +#### Option 2: Jupyter Notebook +```bash +# Start Jupyter +jupyter lab -- Python 3.6+ -- No external dependencies (uses only standard library modules: `re`, `sys`, `os`, `pathlib`) +# Open and run hybrid_rag_pipeline_enriched.ipynb +``` -### How it works +## Data Sources Setup -The script uses a regular expression pattern to identify and remove markdown-style image references with base64 data 
URLs: +### 1. Elasticsearch Sales Data -```python -image_pattern = r'!\[.*?\]\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)' +The pipeline requires a `sales-records-consolidated` index with sales data. You can create this using the provided preprocessing tools: + +```bash +# Run Elasticsearch preprocessing to create sample data +python elasticsearch_index_preprocessing.py ``` -This pattern matches: -- `![...]` - Markdown image syntax -- `(data:image/...)` - Data URL with image MIME type -- `;base64,` - Base64 encoding indicator -- `[A-Za-z0-9+/=]+` - Base64 encoded data +This creates: +- `sales-records` - Raw sales data (100 records) +- `sales-records-consolidated` - Processed sales data optimized for RAG +- `customer-support` - Empty destination index (created fresh each run) -## Workflow Example +### 2. S3 Technical Documentation -1. Convert Jupyter notebook to Python file using `jupytext`: - ```bash - jupytext --to py notebook.ipynb - ``` +Upload your PDF documents to an S3 bucket. The pipeline supports: +- Product manuals +- Troubleshooting guides +- MSDS documents +- Technical specifications -2. Remove embedded images from the converted file: - ```bash - python remove_images.py notebook.py - ``` +Supported S3 URL formats: +- `s3://bucket-name/path/` +- `https://bucket-name.s3.region.amazonaws.com/path/` +- Raw bucket names: `bucket-name/path` -The result is a clean Python file without embedded base64 images, making it more readable and reducing file size significantly. +## Pipeline Workflow -## elasticsearch_setup.py +### Step 0: Elasticsearch Preprocessing +- ✅ Validates `sales-records-consolidated` exists and has data +- ✅ Deletes and recreates fresh `customer-support` index +- ❌ Fails with clear error if source data is missing -A comprehensive Python script that creates and populates an Elasticsearch index with NER-rich synthetic sales data for Bose products. This data serves as one of the source connectors in a hybrid RAG pipeline. 
+### Step 1: Source Connectors +- Creates S3 source connector for PDFs +- Creates Elasticsearch source connector for sales data -### Features +### Step 2: Destination Connector +- Creates Elasticsearch destination connector for `customer-support` index -- **Elastic Cloud Integration**: Connects directly to your Elasticsearch Cloud deployment -- **NER-Optimized Data**: Generates synthetic sales records rich in named entities (people, organizations, locations, prices, dates) -- **Semantic Text Support**: Uses `semantic_text` field type for enhanced search capabilities -- **Bose Product Focus**: Covers SoundSport, OpenAudio, and QuietComfort product lines -- **Realistic Sales Scenarios**: Creates contextual sales interactions with detailed customer information +### Step 3: Workflow Creation +- Creates parallel workflows for S3 and Elasticsearch sources +- Both workflows use identical processing nodes: + - VLM Partitioner (GPT-4o) + - Smart Chunker (title-based) + - Vector Embedder (OpenAI) + - NER Enrichment (OpenAI) -### Setup +### Step 4: Execution +- Runs both workflows in parallel +- Monitors job status (optional) +- Reports completion status -1. 
**Install Dependencies**: - ```bash - pip install -r requirements.txt +## Project Structure + +``` +rag-over-hybrid-data-sources/ +├── hybrid_rag_pipeline.py # Main pipeline code +├── hybrid_rag_pipeline_enriched.py # Generated enriched version +├── hybrid_rag_pipeline_enriched.ipynb # Jupyter notebook +├── elasticsearch_index_preprocessing.py # ES data setup +├── requirements.txt # Python dependencies +├── README.md # This file +├── notebook-processing/ # Documentation pipeline +│ ├── enrich_and_convert.py # Notebook generation script +│ ├── markdown_blocks.yaml # Markdown content +│ └── README.md # Documentation workflow +├── elastic-search-index-setup/ # ES setup tools +│ ├── create_consolidated_index.py +│ ├── create_nonconsolidated_index.py +│ └── verify_elasticsearch_data.py +└── elasticsearch-example-data/ # Sample data + ├── consolidated_examples.json + └── nonconsolidated_examples.json +``` + +## Configuration Options + +### Environment Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `UNSTRUCTURED_API_KEY` | Your Unstructured API key | `your-api-key` | +| `ELASTICSEARCH_HOST` | Elasticsearch cluster URL | `https://cluster.es.io:443` | +| `ELASTICSEARCH_API_KEY` | Elasticsearch API key | `your-es-api-key` | +| `ELASTICSEARCH_INDEX` | Source sales data index | `sales-records-consolidated` | +| `AWS_ACCESS_KEY_ID` | AWS access key | `AKIA...` | +| `AWS_SECRET_ACCESS_KEY` | AWS secret key | `your-secret-key` | +| `S3_SOURCE_BUCKET` | S3 bucket with PDFs | `my-docs-bucket/manuals/` | +| `S3_DESTINATION_BUCKET` | S3 output bucket | `my-output-bucket` | + +### Processing Parameters + +- **Chunking**: 1500 chars with 2048 max, title-based segmentation +- **Embedding Model**: OpenAI text-embedding-3-small +- **VLM Model**: GPT-4o for document parsing +- **NER Model**: OpenAI NER extraction + +## Monitoring and Debugging + +### Pipeline Status +The pipeline provides detailed status updates: +- `:white_check_mark:` Success 
indicators +- `:x:` Error indicators +- `📊` Progress information +- `🔍` Validation steps + +### Common Issues + +1. **Missing Sales Data**: + ``` + ❌ Index 'sales-records-consolidated' does not exist. There is no data to use. ``` + **Solution**: Run `python elasticsearch_index_preprocessing.py` -2. **Configure Environment**: - ```bash - # Copy the template and add your credentials - cp env_template.txt .env - # Edit .env and add your ELASTIC_API_KEY +2. **Empty Sales Index**: ``` + ❌ Index 'sales-records-consolidated' is empty. There is no data to use. + ``` + **Solution**: Verify data was properly indexed -3. **Run the Setup**: - ```bash - python elasticsearch_setup.py +3. **S3 Access Issues**: ``` + :x: Error creating S3 source connector: Access Denied + ``` + **Solution**: Check AWS credentials and bucket permissions -### Generated Data Structure - -Each sales record contains rich named entities perfect for NER extraction: - -- **PERSON**: Customer names, sales representatives -- **ORG**: Retailers (Best Buy, Target, Amazon, etc.) -- **LOCATION**: Cities and regions across the US -- **MONEY**: Product prices and revenue potential -- **DATE**: Timestamps, quarters, months -- **PRODUCT**: Bose product lines and specific models - -### Example Generated Record - -```json -{ - "customer_name": "Jennifer Martinez", - "sales_representative": "Michael Chen", - "product_model": "SoundSport Free", - "price": 149, - "retailer": "Best Buy", - "location_city": "New York, NY", - "interaction_text": "Customer Jennifer Martinez from New York, NY called to inquire about purchasing the SoundSport Free. Sales rep Michael Chen provided detailed product information and quoted $149. Customer is comparing with similar products at Best Buy.", - "text": "Customer Jennifer Martinez from New York, NY called to inquire about purchasing the SoundSport Free. Sales rep Michael Chen provided detailed product information and quoted $149. Customer is comparing with similar products at Best Buy." 
-} -``` +## Advanced Usage -### Integration with Unstructured Workflow +### Custom NER Configuration +The NER enrichment node can be customized by updating the `settings` in `create_workflow_nodes()`: + +```python +ner_enrichment_node = WorkflowNode( + name="NER_Enrichment", + subtype="openai_ner", + type="prompter", + settings={ + "prompt": "Extract named entities focusing on products, customers, and locations..." + } +) +``` -This Elasticsearch index can be used as a source connector in the Unstructured Workflow Endpoint alongside S3 technical documentation to create a hybrid RAG system: +### Multiple S3 Sources +To process multiple S3 buckets, modify the S3 source connector creation or create additional workflows. -1. **S3 Source**: Technical manuals, troubleshooting guides, MSDS PDFs -2. **Elasticsearch Source**: Synthetic sales data (this script) -3. **NER Enrichment**: Extract named entities from both sources -4. **Qdrant Destination**: Combined processed data for RAG queries +### Custom Elasticsearch Mapping +The `customer-support` index mapping can be customized in the `run_elasticsearch_preprocessing()` function. -### Requirements +## Development Workflow -See `requirements.txt` for Python dependencies: -- elasticsearch>=8.0.0 -- python-dotenv>=0.19.0 -- faker>=15.0.0 +### Notebook Content Management -## verify_elasticsearch_data.py +**Important**: Do not edit the Jupyter notebook directly! -A comprehensive verification script that inspects and validates the synthetic sales data in your Elasticsearch index. Use this script to confirm that data was successfully uploaded and is ready for NER processing. +Instead, follow this workflow: -### Features +1. **Edit Code**: Modify `hybrid_rag_pipeline.py` +2. **Edit Documentation**: Update `notebook-processing/markdown_blocks.yaml` +3. 
**Regenerate**: Run `python notebook-processing/enrich_and_convert.py` -- **Connection Testing**: Verifies Elasticsearch cluster connectivity -- **Index Validation**: Confirms the index exists and contains data -- **Data Statistics**: Provides comprehensive metrics on document count, index size, and distribution -- **Sample Document Display**: Shows actual records with key fields -- **NER Readiness Check**: Validates that data contains rich named entities -- **Search Query Testing**: Tests various search patterns to ensure data accessibility +This process: +- Replaces `[[MD:HANDLE]]` placeholders with markdown content +- Generates `hybrid_rag_pipeline_enriched.py` +- Converts to `hybrid_rag_pipeline_enriched.ipynb` using jupytext -### Usage +### Testing ```bash -python verify_elasticsearch_data.py -``` +# Test Elasticsearch connection +python elastic-search-index-setup/simple_check.py -### What It Checks - -1. **Basic Connectivity**: Tests connection to your Elasticsearch cluster -2. **Index Existence**: Confirms the `sales-records` index exists -3. **Document Count**: Reports total number of indexed documents -4. **Data Distribution**: Analyzes breakdown by: - - Product lines (SoundSport, OpenAudio, QuietComfort) - - Product models - - Retailers (Best Buy, Target, Amazon, etc.) - - Geographic regions - - Interaction types - - Price and revenue statistics - - Temporal distribution (by year) -5. **NER Entity Validation**: Confirms presence of: - - Person names (customers, sales reps) - - Organizations (retailers) - - Locations (cities, regions) - - Monetary values (prices) - - Dates (timestamps) - - Rich text content -6. 
**Search Functionality**: Tests sample queries to verify data is searchable - -### Sample Output +# Verify data setup +python elastic-search-index-setup/verify_elasticsearch_data.py +# Run pipeline in test mode +python hybrid_rag_pipeline.py ``` -🚀 Starting Elasticsearch Data Verification -============================================================ -🔧 Testing Elasticsearch connection... -✅ Connected to Elasticsearch cluster: instance-0000000000 - Version: 8.11.0 - Cluster: 2371b9a1d2ad40c590fd1e22652a8236 - -✅ Index 'sales-records' exists -📊 Getting statistics for index 'sales-records'... - 📄 Total documents: 500 - 💾 Index size: 245,760 bytes (0.23 MB) - 🔧 Primary shards: 12 - -📋 Retrieving 5 sample documents... -📄 Document 1: - 🆔 ID: abc123-def456 - 👤 Customer: Jennifer Martinez - 🏷️ Product: SoundSport Free - 💰 Price: $149 - 🏪 Retailer: Best Buy - 📍 Location: New York, NY - 📅 Date: 2023-11-15T14:30:00 - 📝 Text: Customer Jennifer Martinez from New York, NY called to inquire about purchasing the SoundSport... -``` \ No newline at end of file + +## API Reference + +### Core Functions + +- `run_elasticsearch_preprocessing()` - Validates and prepares ES indices +- `create_s3_source_connector()` - Creates S3 PDF source +- `create_elasticsearch_source_connector()` - Creates ES sales source +- `create_elasticsearch_destination_connector()` - Creates ES destination +- `create_parallel_workflows()` - Sets up processing workflows +- `run_workflow()` - Executes workflows +- `poll_job_status()` - Monitors job progress + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make changes following the development workflow +4. Test thoroughly +5. Submit a pull request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + +## Support + +For questions or issues: +1. Check the troubleshooting section above +2. Review Unstructured API documentation +3. 
Open an issue in the repository + +--- + +**Note**: This pipeline demonstrates advanced RAG techniques using the Unstructured API. It's designed for educational and development purposes. For production use, consider additional error handling, monitoring, and security measures. + diff --git a/elastic-search-index-setup/SETUP_INSTRUCTIONS.md b/elastic-search-index-setup/SETUP_INSTRUCTIONS.md index 7a2cb83..11e2419 100644 --- a/elastic-search-index-setup/SETUP_INSTRUCTIONS.md +++ b/elastic-search-index-setup/SETUP_INSTRUCTIONS.md @@ -94,10 +94,20 @@ python simple_check.py **Expected Output**: Should show connection success or indicate if index doesn't exist yet. ### 3.2 Create Index and Upload Synthetic Data -Run the main setup script: + +**Choose your data structure approach:** + +#### Option A: Consolidated Structure (Recommended for RAG) +```bash +python create_consolidated_index.py +``` +**Best for**: RAG use cases where context preservation is critical + +#### Option B: Non-Consolidated Structure (Good for Analytics) ```bash -python elasticsearch_setup.py +python create_nonconsolidated_index.py ``` +**Best for**: Traditional database queries and business intelligence **What this script does**: - ✅ Tests connection to Elasticsearch diff --git a/elastic-search-index-setup/create_consolidated_index.py b/elastic-search-index-setup/create_consolidated_index.py new file mode 100644 index 0000000..665841c --- /dev/null +++ b/elastic-search-index-setup/create_consolidated_index.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +""" +Create Consolidated Elasticsearch Index for RAG +Combines multiple fields into a single text field to preserve context during Unstructured processing. + +This approach ensures that when Unstructured processes the data: +1. All context (product, customer, location, etc.) is preserved in each Text element +2. The resulting embeddings contain complete contextual information +3. 
RAG queries can access full context without losing field relationships +""" + +import os +import uuid +import random +from datetime import datetime, timedelta +from faker import Faker +from dotenv import load_dotenv +from elasticsearch import Elasticsearch + +# Load environment variables from base directory (one level up) +load_dotenv(dotenv_path="../.env") + +class ConsolidatedBoseSalesDataGenerator: + """Generate consolidated sales data optimized for RAG processing""" + + def __init__(self): + self.fake = Faker(['en_US']) + Faker.seed(42) # For reproducible data + + # Bose product data + self.products = { + 'SoundSport': { + 'models': ['SoundSport Free', 'SoundSport Wireless', 'SoundSport Pulse'], + 'price_range': (129, 199), + 'category': 'Sports Earbuds', + 'features': ['sweat-resistant', 'secure fit', 'wireless', 'noise isolation'] + }, + 'OpenAudio': { + 'models': ['OpenAudio Sport', 'OpenAudio Ultra', 'OpenAudio Pro'], + 'price_range': (149, 249), + 'category': 'Open-Ear Audio', + 'features': ['open-ear design', 'situational awareness', 'comfortable fit', 'premium audio'] + }, + 'QuietComfort': { + 'models': ['QuietComfort 45', 'QuietComfort Ultra', 'QuietComfort Earbuds'], + 'price_range': (199, 429), + 'category': 'Noise Cancelling', + 'features': ['world-class noise cancellation', 'premium comfort', 'long battery life', 'crystal clear calls'] + } + } + + # Rich entity data for NER + self.retailers = [ + "Best Buy", "Target", "Amazon", "Walmart", "Costco", "B&H Photo", + "Guitar Center", "Sam's Club", "Newegg", "Adorama", "Crutchfield" + ] + + self.sales_reps = [ + "Jennifer Martinez", "Michael Chen", "Sarah Johnson", "David Rodriguez", + "Emily Wilson", "Robert Taylor", "Lisa Anderson", "James Thompson", + "Maria Garcia", "Christopher Lee", "Amanda Davis", "Daniel Brown" + ] + + self.regions = [ + "Northeast", "Southeast", "Midwest", "Southwest", "West Coast", + "Pacific Northwest", "Mountain West", "Great Lakes", "Mid-Atlantic", "Gulf Coast" + ] + + 
self.cities = [ + "New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ", + "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX", "Austin, TX", + "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH", "Charlotte, NC", "Seattle, WA", + "Denver, CO", "Washington, DC", "Boston, MA", "Nashville, TN", "Detroit, MI" + ] + + self.interaction_types = [ + "purchase_inquiry", "product_comparison", "pricing_discussion", + "sales_consultation", "order_processing", "upsell_opportunity", + "customer_preferences", "warranty_inquiry", "bulk_order_request", + "promotional_campaign", "seasonal_sale", "loyalty_program_enrollment" + ] + + def generate_consolidated_record(self) -> dict: + """Generate a single sales record with consolidated text field""" + + # Generate individual field data + product_line = random.choice(list(self.products.keys())) + product_info = self.products[product_line] + model = random.choice(product_info['models']) + price = random.randint(*product_info['price_range']) + features = random.sample(product_info['features'], k=random.randint(1, 3)) + + customer_name = self.fake.name() + sales_rep = random.choice(self.sales_reps) + retailer = random.choice(self.retailers) + city = random.choice(self.cities) + region = random.choice(self.regions) + interaction_type = random.choice(self.interaction_types) + + # Generate timestamp from 2016 onwards + start_date = datetime(2016, 1, 1) + random_date = self.fake.date_time_between(start_date=start_date, end_date='now') + + customer_segment = random.choice(["Consumer", "Business", "Education", "Government"]) + sales_channel = random.choice(["Direct", "Retail Partner", "Online", "Phone"]) + lead_source = random.choice(["Website", "Advertisement", "Referral", "Trade Show", "Cold Call"]) + deal_stage = random.choice(["Prospect", "Qualified", "Proposal", "Negotiation", "Closed Won", "Closed Lost"]) + + # Create contextual interaction based on type + interaction_details = 
self._generate_interaction_text( + interaction_type, customer_name, sales_rep, model, price, retailer, city, features + ) + + # **KEY CHANGE: Create consolidated text field with ALL context** + consolidated_text = f""" +SALES RECORD - {random_date.strftime('%B %d, %Y')} + +Customer Information: +- Name: {customer_name} +- Location: {city} +- Segment: {customer_segment} +- Lead Source: {lead_source} + +Product Details: +- Product Line: {product_line} +- Model: {model} +- Category: {product_info['category']} +- Price: ${price} +- Key Features: {', '.join(features)} + +Sales Information: +- Sales Representative: {sales_rep} +- Retailer: {retailer} +- Region: {region} +- Channel: {sales_channel} +- Deal Stage: {deal_stage} +- Interaction Type: {interaction_type.replace('_', ' ').title()} + +Conversation Summary: +{interaction_details} + +Temporal Context: +- Date: {random_date.strftime('%B %d, %Y')} +- Quarter: Q{random_date.month//3 + 1} {random_date.year} +- Day of Week: {random_date.strftime('%A')} + +Revenue Information: +- Unit Price: ${price} +- Potential Deal Value: ${price * random.randint(1, 5)} +- Priority: {random.choice(['High', 'Medium', 'Low'])} + """.strip() + + # Return document with consolidated text + minimal metadata for Elasticsearch + return { + "id": str(uuid.uuid4()), + "timestamp": random_date.isoformat(), + "document_type": "sales_record", + "product_line": product_line, # Keep for filtering/aggregation + "region": region, # Keep for filtering/aggregation + "consolidated_text": consolidated_text, # **This is what Unstructured will process** + "record_date": random_date.strftime('%Y-%m-%d'), + "quarter": f"Q{random_date.month//3 + 1}", + "year": random_date.year + } + + def _generate_interaction_text(self, interaction_type, customer_name, sales_rep, model, price, retailer, city, features): + """Generate detailed interaction text based on type""" + + interaction_templates = { + "purchase_inquiry": f"Customer {customer_name} from {city} contacted 
{sales_rep} to inquire about purchasing the {model}. The customer was particularly interested in the {', '.join(features[:2])} features. {sales_rep} provided detailed product specifications and quoted ${price}. Customer mentioned they had seen similar products at {retailer} but was impressed with the Bose quality and features.", + + "product_comparison": f"{sales_rep} conducted a comprehensive product comparison session with {customer_name}. The customer was deciding between the {model} and competitor products. Key selling points discussed included {', '.join(features)} which differentiate Bose from competitors. The ${price} price point was justified through superior audio quality and build reliability. Customer appreciated the detailed comparison and is considering the purchase.", + + "sales_consultation": f"In-depth consultation with {customer_name} in the {city} area. {sales_rep} assessed customer needs and recommended the {model} based on their lifestyle and audio preferences. Highlighted features included {', '.join(features)}. Discussed ${price} pricing structure and available financing options. Customer showed strong interest and requested follow-up information.", + + "order_processing": f"Order successfully processed for {customer_name}: {random.randint(1, 3)} units of {model} at ${price} each. {sales_rep} confirmed shipping details to {city} and explained warranty coverage. Customer opted for expedited shipping and was provided with tracking information. Partnership with {retailer} ensured competitive pricing and reliable delivery.", + + "promotional_campaign": f"Q{random.randint(1,4)} promotional outreach to {customer_name}. {sales_rep} presented special pricing on {model} - limited time offer at ${price - random.randint(10, 30)} (regularly ${price}). Emphasized exclusive features: {', '.join(features)}. 
Customer expressed interest and plans to visit {retailer} location this weekend to experience the product firsthand.", + + "warranty_inquiry": f"{customer_name} contacted {sales_rep} regarding warranty coverage for their {model} purchased from {retailer}. {sales_rep} reviewed the comprehensive warranty terms and explained the repair/replacement process. Customer was satisfied with the coverage and expressed loyalty to the Bose brand. Discussed potential upgrade paths and new features in latest models.", + + "upsell_opportunity": f"{sales_rep} identified upsell opportunity with existing customer {customer_name}. Customer currently owns an older Bose model and was introduced to the {model} with enhanced features: {', '.join(features)}. The ${price} upgrade investment was positioned as worthwhile for the improved experience. Customer is considering the upgrade and requested a demo unit.", + + "bulk_order_request": f"Corporate customer {customer_name} from {city} requested bulk pricing for {model} units. {sales_rep} prepared enterprise quotation for {random.randint(10, 50)} units at discounted rate. Discussed features relevant to business use: {', '.join(features)}. Partnership with {retailer} enables volume discounts and dedicated support. Proposal under review by customer's procurement team." + } + + return interaction_templates.get( + interaction_type, + f"{sales_rep} assisted {customer_name} with {interaction_type.replace('_', ' ')} regarding {model}. Discussed ${price} pricing and key features: {', '.join(features)}. Customer interaction was positive and follow-up scheduled." 
def main():
    """Create the consolidated ``sales-records`` index and fill it with synthetic records.

    Steps: connect with an API key, delete any existing index of the same
    name, create it with the consolidated-text mapping, generate and
    bulk-index ``num_records`` records, refresh the index, then print the
    document count and one sample record.

    Environment:
        ELASTIC_API_KEY / ELASTICSEARCH_API_KEY: API key (one is required).
        ELASTICSEARCH_HOST: optional cluster host, with or without a scheme.
            When unset, the previously hard-coded cluster URL is used.

    Returns:
        bool: True when the index was created and populated, False otherwise.
    """
    from elasticsearch.helpers import bulk

    print("🚀 Creating Consolidated Elasticsearch Index for RAG")
    print("=" * 60)

    # ELASTIC_API_KEY is the historical name; .env.example now documents
    # ELASTICSEARCH_API_KEY, so accept either.
    api_key = os.getenv('ELASTIC_API_KEY') or os.getenv('ELASTICSEARCH_API_KEY')
    if not api_key:
        print("❌ Error: ELASTIC_API_KEY (or ELASTICSEARCH_API_KEY) must be provided in .env file")
        return False

    # .env.example asks for the host without "https://"; tolerate both forms.
    host = os.getenv('ELASTICSEARCH_HOST')
    if host and "://" not in host:
        host = f"https://{host}"
    es_url = host or "https://2371b9a1d2ad40c590fd1e22652a8236.us-central1.gcp.cloud.es.io:443"

    es = Elasticsearch(
        es_url,
        api_key=api_key,
        request_timeout=60
    )

    index_name = "sales-records"  # Use same index name as API key permissions
    num_records = 100

    try:
        # Delete existing index if it exists
        if es.indices.exists(index=index_name):
            print(f"🗑️ Deleting existing index: {index_name}")
            es.indices.delete(index=index_name)

        # Create index with simple mapping optimized for consolidated text
        print(f"🔧 Creating consolidated index: {index_name}")
        mapping = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 1
            },
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "timestamp": {"type": "date"},
                    "document_type": {"type": "keyword"},
                    "product_line": {"type": "keyword"},
                    "region": {"type": "keyword"},
                    "consolidated_text": {
                        "type": "text",
                        "analyzer": "standard"  # This is the field Unstructured will process
                    },
                    "record_date": {"type": "date"},
                    "quarter": {"type": "keyword"},
                    "year": {"type": "integer"}
                }
            }
        }

        es.indices.create(index=index_name, body=mapping)
        print(f"✅ Created index with consolidated text mapping")

        # Generate and index data
        generator = ConsolidatedBoseSalesDataGenerator()
        print(f"🔄 Generating {num_records} consolidated sales records...")

        records = []
        for i in range(num_records):
            if i % 25 == 0 and i > 0:
                print(f" Generated {i}/{num_records} records...")
            records.append(generator.generate_consolidated_record())

        print(f"✅ Generated {len(records)} consolidated records")

        # Bulk index
        print("📤 Bulk indexing consolidated records...")
        actions = [
            {"_index": index_name, "_id": record["id"], "_source": record}
            for record in records
        ]

        # With the default raise_on_error=True a failed document raises,
        # so the returned error list is always empty here.
        success_count, _ = bulk(es, actions, chunk_size=50)

        print(f"✅ Successfully indexed {success_count} consolidated records")

        # Newly indexed documents only become visible to count/search after a
        # refresh; without it the numbers below can be stale or zero.
        es.indices.refresh(index=index_name)

        # Verify
        count_response = es.count(index=index_name)
        total_docs = count_response['count']
        print(f"📊 Total documents in consolidated index: {total_docs}")

        # Show sample
        sample_response = es.search(
            index=index_name,
            body={"size": 1, "_source": ["consolidated_text", "product_line", "region"]},
        )

        if sample_response['hits']['hits']:
            sample = sample_response['hits']['hits'][0]['_source']
            print(f"\n📋 Sample Consolidated Record:")
            print(f" Product Line: {sample['product_line']}")
            print(f" Region: {sample['region']}")
            print(f" Consolidated Text Preview:")
            text_preview = sample['consolidated_text'][:300] + "..."
            print(f" {text_preview}")

        print(f"\n🎉 CONSOLIDATED INDEX READY!")
        print(f"✅ Index: {index_name}")
        print(f"✅ Records: {total_docs}")
        print(f"✅ Each record contains ALL context in 'consolidated_text' field")
        print(f"✅ Ready for Unstructured Workflow processing with full context preservation")
        return True

    except Exception as e:
        # Top-level boundary of a CLI script: report and signal failure.
        print(f"❌ Error: {e}")
        return False
class ElasticsearchConfig:
    """Configuration for the Elasticsearch connection and data setup.

    Values are read from environment variables when the object is created:

    * ``ELASTIC_API_KEY`` (or ``ELASTICSEARCH_API_KEY``) for API-key auth, or
      ``ELASTIC_USERNAME`` + ``ELASTIC_PASSWORD`` for basic auth.
    * ``ELASTICSEARCH_HOST`` (host with or without scheme), then
      ``ELASTIC_CLOUD_ID``, to choose the cluster. When neither is set,
      ``DEFAULT_URL`` is used.

    Raises:
        ValueError: if neither an API key nor a username/password pair is set.
    """

    # Cluster used when neither ELASTICSEARCH_HOST nor ELASTIC_CLOUD_ID is set.
    DEFAULT_URL = "https://2371b9a1d2ad40c590fd1e22652a8236.us-central1.gcp.cloud.es.io:443"

    def __init__(self):
        # Load from environment variables
        self.cloud_id = os.getenv('ELASTIC_CLOUD_ID')
        self.username = os.getenv('ELASTIC_USERNAME')
        self.password = os.getenv('ELASTIC_PASSWORD')
        # ELASTIC_API_KEY is the historical name; .env.example documents
        # ELASTICSEARCH_API_KEY, so fall back to it.
        self.api_key = os.getenv('ELASTIC_API_KEY') or os.getenv('ELASTICSEARCH_API_KEY')
        self.host = os.getenv('ELASTICSEARCH_HOST')

        # Index configuration
        self.index_name = "sales-records"
        self.num_synthetic_records = 100

        # Validate required credentials
        self._validate_credentials()

    def _validate_credentials(self):
        """Raise ``ValueError`` unless an API key or a username/password pair is set."""
        has_basic_auth = bool(self.username and self.password)
        if not (has_basic_auth or self.api_key):
            raise ValueError(
                "Either ELASTIC_USERNAME/ELASTIC_PASSWORD or ELASTIC_API_KEY "
                "(or ELASTICSEARCH_API_KEY) must be provided in .env file"
            )

    def _connection_target(self) -> Dict[str, str]:
        """Return the client keyword argument that selects the cluster."""
        if self.host:
            # .env.example asks for the host without "https://"; accept both.
            url = self.host if "://" in self.host else f"https://{self.host}"
            return {"hosts": url}
        if self.cloud_id:
            return {"cloud_id": self.cloud_id}
        return {"hosts": self.DEFAULT_URL}

    def get_client(self) -> "Elasticsearch":
        """Create and return an Elasticsearch client for Elastic Cloud.

        API-key authentication is preferred when a key is configured;
        otherwise username/password basic auth is used.
        """
        if self.api_key:
            # Use API key authentication (recommended for production)
            auth = {"api_key": self.api_key}
        else:
            # Use username/password authentication
            auth = {"basic_auth": (self.username, self.password)}

        return Elasticsearch(
            **self._connection_target(),
            **auth,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )
class BoseSalesDataGenerator:
    """Generate NER-rich synthetic sales data for Bose products."""

    def __init__(self):
        self.fake = Faker(['en_US'])
        # NOTE: this seeds Faker only (customer names, timestamps). Fields
        # drawn through the `random` module and the uuid4 ids are not seeded.
        Faker.seed(42)  # For reproducible data

        # Bose product data
        self.products = {
            'SoundSport': {
                'models': ['SoundSport Free', 'SoundSport Wireless', 'SoundSport Pulse'],
                'price_range': (129, 199),
                'category': 'Sports Earbuds'
            },
            'OpenAudio': {
                'models': ['OpenAudio Sport', 'OpenAudio Ultra', 'OpenAudio Pro'],
                'price_range': (149, 249),
                'category': 'Open-Ear Audio'
            },
            'QuietComfort': {
                'models': ['QuietComfort 45', 'QuietComfort Ultra', 'QuietComfort Earbuds'],
                'price_range': (199, 429),
                'category': 'Noise Cancelling'
            }
        }

        # Rich entity data for NER
        self.retailers = [
            "Best Buy", "Target", "Amazon", "Walmart", "Costco", "B&H Photo",
            "Guitar Center", "Sam's Club", "Newegg", "Adorama", "Crutchfield"
        ]

        self.sales_reps = [
            "Jennifer Martinez", "Michael Chen", "Sarah Johnson", "David Rodriguez",
            "Emily Wilson", "Robert Taylor", "Lisa Anderson", "James Thompson",
            "Maria Garcia", "Christopher Lee", "Amanda Davis", "Daniel Brown"
        ]

        self.regions = [
            "Northeast", "Southeast", "Midwest", "Southwest", "West Coast",
            "Pacific Northwest", "Mountain West", "Great Lakes", "Mid-Atlantic", "Gulf Coast"
        ]

        self.cities = [
            "New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ",
            "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX", "Austin, TX",
            "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH", "Charlotte, NC", "Seattle, WA",
            "Denver, CO", "Washington, DC", "Boston, MA", "Nashville, TN", "Detroit, MI"
        ]

        self.interaction_types = [
            "purchase_inquiry", "product_comparison", "pricing_discussion",
            "sales_consultation", "order_processing", "upsell_opportunity",
            "customer_preferences", "warranty_inquiry", "bulk_order_request",
            "promotional_campaign", "seasonal_sale", "loyalty_program_enrollment"
        ]

    @staticmethod
    def _quarter_label(moment: datetime) -> str:
        """Return the calendar quarter of *moment* as ``"Q1"`` .. ``"Q4"``.

        Months 1-3 map to Q1, 4-6 to Q2, 7-9 to Q3 and 10-12 to Q4. The
        former ``month // 3 + 1`` put March, June and September one quarter
        too late and turned December into "Q5".
        """
        return f"Q{(moment.month - 1) // 3 + 1}"

    def generate_sales_record(self) -> Dict[str, Any]:
        """Generate a single NER-rich sales record.

        Returns:
            A dict whose keys match the fields of the index mapping
            (id, timestamp, customer/product/retailer fields, interaction
            text, calendar fields and deal metadata).
        """

        # Select random product
        product_line = random.choice(list(self.products.keys()))
        product_info = self.products[product_line]
        model = random.choice(product_info['models'])
        price = random.randint(*product_info['price_range'])

        # Generate rich entities for NER extraction
        customer_name = self.fake.name()
        sales_rep = random.choice(self.sales_reps)
        retailer = random.choice(self.retailers)
        city = random.choice(self.cities)
        region = random.choice(self.regions)

        # Generate realistic interaction text with rich named entities
        interaction_type = random.choice(self.interaction_types)

        # Create contextual sales interaction text
        interaction_texts = {
            "purchase_inquiry": f"Customer {customer_name} from {city} called to inquire about purchasing the {model}. Sales rep {sales_rep} provided detailed product information and quoted ${price}. Customer is comparing with similar products at {retailer}.",

            "product_comparison": f"{sales_rep} helped {customer_name} compare the {model} against competitors. Discussed the superior noise cancellation technology and ${price} price point. Customer mentioned they saw it at {retailer} for a higher price.",

            "sales_consultation": f"Consultation session with {customer_name} in {region} region. {sales_rep} recommended the {model} based on customer's active lifestyle needs. Discussed ${price} pricing and available financing options through {retailer}.",

            "order_processing": f"Order processed for {customer_name}: 2 units of {model} at ${price} each. Shipping to {city}. Sales rep {sales_rep} confirmed delivery timeline and warranty coverage. Partner retailer: {retailer}.",

            "promotional_campaign": f"Q4 promotional campaign in {region}: {sales_rep} contacted {customer_name} about special pricing on {model}. Limited time offer at ${price - 20} (originally ${price}). Customer interested, will visit {retailer} this weekend."
        }

        # Select appropriate interaction text or generate generic one
        if interaction_type in interaction_texts:
            interaction_text = interaction_texts[interaction_type]
        else:
            interaction_text = f"{sales_rep} assisted {customer_name} with {interaction_type.replace('_', ' ')} for {model}. Discussed ${price} pricing and availability at {retailer} in {city}."

        # Generate timestamp from 2016 onwards (products launched after 2015)
        start_date = datetime(2016, 1, 1)
        random_date = self.fake.date_time_between(start_date=start_date, end_date='now')

        return {
            "id": str(uuid.uuid4()),
            "timestamp": random_date.isoformat(),
            "customer_name": customer_name,
            "sales_representative": sales_rep,
            "product_line": product_line,
            "product_model": model,
            "product_category": product_info['category'],
            "price": price,
            "retailer": retailer,
            "location_city": city,
            "region": region,
            "interaction_type": interaction_type,
            "interaction_text": interaction_text,
            "text": interaction_text,  # Duplicate for semantic_text field
            "quarter": self._quarter_label(random_date),
            "year": random_date.year,
            "month": random_date.strftime("%B"),
            "day_of_week": random_date.strftime("%A"),
            "customer_segment": random.choice(["Consumer", "Business", "Education", "Government"]),
            "sales_channel": random.choice(["Direct", "Retail Partner", "Online", "Phone"]),
            "lead_source": random.choice(["Website", "Advertisement", "Referral", "Trade Show", "Cold Call"]),
            "deal_stage": random.choice(["Prospect", "Qualified", "Proposal", "Negotiation", "Closed Won", "Closed Lost"]),
            "revenue_potential": price * random.randint(1, 5),  # Potential for multiple units
            "customer_priority": random.choice(["High", "Medium", "Low"]),
            "follow_up_required": random.choice([True, False]),
            "notes": f"Additional context: Customer expressed interest in {product_line} series. {sales_rep} to follow up within 48 hours."
        }

    def generate_bulk_data(self, num_records: int) -> List[Dict[str, Any]]:
        """Generate *num_records* sales records, printing progress every 100 records."""
        print(f"🔄 Generating {num_records} synthetic sales records...")

        records = []
        for i in range(num_records):
            if i % 100 == 0 and i > 0:
                print(f" Generated {i}/{num_records} records...")
            records.append(self.generate_sales_record())

        print(f"✅ Generated {len(records)} records successfully")
        return records
class ElasticsearchManager:
    """Manage Elasticsearch operations for Bose sales data.

    Wraps the client built by ``config.get_client()`` and the configured
    index name. Every public method reports progress with ``print`` and
    signals failure through its return value instead of raising.
    """

    def __init__(self, config: ElasticsearchConfig):
        self.config = config
        self.es = config.get_client()
        self.index_name = config.index_name

    def test_connection(self) -> bool:
        """Test the connection by checking access to the configured index.

        Returns:
            True when the ``exists`` call succeeds (whether or not the index
            exists), False when it raises.
        """
        try:
            print("🔧 Testing Elasticsearch connection...")
            # Test connection by checking if we can access our specific index
            # This works with index-specific API keys
            exists = self.es.indices.exists(index=self.index_name)
            print(f"✅ Successfully connected to Elasticsearch")
            print(f" Index '{self.index_name}' exists: {exists}")
            return True
        except Exception as e:
            print(f"❌ Connection failed: {e}")
            return False

    def create_index_mapping(self) -> bool:
        """Create the index with a mapping optimized for NER and search.

        When the index already exists, a mapping update is attempted and a
        failure of that update is tolerated; the existing index is then used
        as is.

        Returns:
            True when the index is usable afterwards, False on error.
        """

        def ner_text(with_keyword: bool = False) -> Dict[str, Any]:
            # Text field analyzed without stop-word removal; optionally with
            # an exact-match ``.keyword`` sub-field for aggregations.
            field: Dict[str, Any] = {"type": "text", "analyzer": "ner_analyzer"}
            if with_keyword:
                field["fields"] = {"keyword": {"type": "keyword"}}
            return field

        try:
            print(f"🔧 Creating index: {self.index_name}")

            # Define mapping optimized for NER entities and search
            mapping = {
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 1,
                    "analysis": {
                        "analyzer": {
                            "ner_analyzer": {
                                "type": "standard",
                                "stopwords": "_none_"  # Keep all words for NER
                            }
                        }
                    }
                },
                "mappings": {
                    "properties": {
                        "id": {"type": "keyword"},
                        "timestamp": {"type": "date"},
                        "customer_name": ner_text(with_keyword=True),
                        "sales_representative": ner_text(with_keyword=True),
                        "product_line": {"type": "keyword"},
                        "product_model": {"type": "keyword"},
                        "product_category": {"type": "keyword"},
                        "price": {"type": "float"},
                        "retailer": ner_text(with_keyword=True),
                        "location_city": ner_text(with_keyword=True),
                        "region": {"type": "keyword"},
                        "interaction_type": {"type": "keyword"},
                        "interaction_text": ner_text(),  # Full text optimized for NER
                        "text": ner_text(),  # Duplicate text field for compatibility
                        "quarter": {"type": "keyword"},
                        "year": {"type": "integer"},
                        "month": {"type": "keyword"},
                        "day_of_week": {"type": "keyword"},
                        "customer_segment": {"type": "keyword"},
                        "sales_channel": {"type": "keyword"},
                        "lead_source": {"type": "keyword"},
                        "deal_stage": {"type": "keyword"},
                        "revenue_potential": {"type": "float"},
                        "customer_priority": {"type": "keyword"},
                        "follow_up_required": {"type": "boolean"},
                        "notes": ner_text()
                    }
                }
            }

            # Check if index exists, if so, just update mapping
            if self.es.indices.exists(index=self.index_name):
                print(f" ✅ Index '{self.index_name}' already exists, updating mapping...")
                try:
                    # Try to update mapping for existing index
                    self.es.indices.put_mapping(index=self.index_name, body=mapping["mappings"])
                    print(f"✅ Updated mapping for existing index: {self.index_name}")
                except Exception as e:
                    print(f" ⚠️ Could not update mapping (this is often okay): {e}")
                    print(f" ✅ Using existing index: {self.index_name}")
                return True

            # Create new index only if it doesn't exist
            self.es.indices.create(index=self.index_name, body=mapping)
            print(f"✅ Created new index: {self.index_name}")
            return True

        except Exception as e:
            print(f"❌ Error creating index: {e}")
            return False

    def bulk_index_data(self, records: List[Dict[str, Any]]) -> bool:
        """Bulk index sales records, using each record's ``id`` as document id.

        Per-document failures are collected and the first three are printed.

        Returns:
            True when at least one document was indexed, False otherwise.
        """
        try:
            print(f"📤 Bulk indexing {len(records)} records...")

            # Prepare bulk data
            actions = [
                {"_index": self.index_name, "_id": record["id"], "_source": record}
                for record in records
            ]

            # Execute bulk index. raise_on_error=False makes the helper return
            # failed items instead of raising on the first failing chunk, which
            # is what the reporting below relies on.
            success_count, failed_items = bulk(
                self.es,
                actions,
                chunk_size=100,
                request_timeout=60,
                max_retries=3,
                initial_backoff=2,
                max_backoff=600,
                raise_on_error=False
            )

            print(f"✅ Successfully indexed {success_count} records")

            if failed_items:
                print(f"⚠️ Failed to index {len(failed_items)} records")
                for item in failed_items[:3]:  # Show first 3 failures
                    if 'index' in item:
                        error_info = item['index']
                        error = error_info.get('error', {})
                        # The error may be a structured dict or a plain string.
                        reason = error.get('reason', 'unknown error') if isinstance(error, dict) else error
                        print(f" - Document ID: {error_info.get('_id', 'unknown')}")
                        print(f" Error: {reason}")
                    else:
                        print(f" - {item}")

            return success_count > 0

        except BulkIndexError as e:
            print(f"❌ Bulk indexing error: {e}")
            return False
        except Exception as e:
            print(f"❌ Error during bulk indexing: {e}")
            return False

    def verify_data(self) -> Dict[str, Any]:
        """Verify indexed data and return statistics.

        Returns:
            ``{"total_docs": 0}`` for an empty index; otherwise a dict with
            ``total_docs``, ``products`` and ``retailers`` (terms buckets) and
            ``avg_price`` (rounded to 2 decimals, or None when no document has
            a price); ``{"error": message}`` when any call fails.
        """
        try:
            print("🔍 Verifying indexed data...")

            # Refresh index to ensure all data is searchable
            self.es.indices.refresh(index=self.index_name)

            # Get basic stats
            count_response = self.es.count(index=self.index_name)
            total_docs = count_response['count']

            print(f"📊 Total documents: {total_docs}")

            if total_docs == 0:
                return {"total_docs": 0}

            # Sample a few documents
            sample_response = self.es.search(
                index=self.index_name,
                body={"size": 3, "sort": [{"timestamp": {"order": "desc"}}]}
            )

            print("📋 Sample documents:")
            for i, hit in enumerate(sample_response['hits']['hits'], 1):
                source = hit['_source']
                # .get(): the index may also hold documents with another shape
                # (the consolidated script writes to the same index name).
                print(f" {i}. {source.get('customer_name', 'unknown')} - {source.get('product_model', 'unknown')} - ${source.get('price', 'n/a')}")
                print(f" {source.get('interaction_text', '')[:100]}...")

            # Aggregation stats
            agg_response = self.es.search(
                index=self.index_name,
                body={
                    "size": 0,
                    "aggs": {
                        "by_product": {
                            "terms": {"field": "product_line", "size": 10}
                        },
                        "by_retailer": {
                            "terms": {"field": "retailer.keyword", "size": 5}
                        },
                        "avg_price": {
                            "avg": {"field": "price"}
                        }
                    }
                }
            )

            aggregations = agg_response['aggregations']
            # The avg aggregation yields None when no document has the field.
            raw_avg = aggregations['avg_price']['value']
            stats = {
                "total_docs": total_docs,
                "products": aggregations['by_product']['buckets'],
                "retailers": aggregations['by_retailer']['buckets'],
                "avg_price": round(raw_avg, 2) if raw_avg is not None else None
            }

            avg_display = stats['avg_price'] if stats['avg_price'] is not None else "n/a"
            print(f"📈 Statistics:")
            print(f" Average price: ${avg_display}")
            print(f" Top products: {', '.join([b['key'] for b in stats['products'][:3]])}")
            print(f" Top retailers: {', '.join([b['key'] for b in stats['retailers'][:3]])}")

            return stats

        except Exception as e:
            print(f"❌ Error verifying data: {e}")
            return {"error": str(e)}
def main():
    """Run the full setup: connect, create the index, generate, index and verify data.

    Returns:
        bool: True when every step (including verification) succeeded,
        False as soon as one step fails. The script's exit code is derived
        from this value.
    """
    print("🚀 Starting Elasticsearch Setup for Bose Sales Data")
    print("=" * 60)

    try:
        # Initialize configuration
        print("⚙️ Loading configuration...")
        config = ElasticsearchConfig()
        print(f" Index name: {config.index_name}")
        print(f" Records to generate: {config.num_synthetic_records}")

        # Initialize Elasticsearch manager
        es_manager = ElasticsearchManager(config)

        # Test connection
        if not es_manager.test_connection():
            print("❌ Cannot proceed without valid Elasticsearch connection")
            return False

        # Create index with mapping
        if not es_manager.create_index_mapping():
            print("❌ Failed to create index mapping")
            return False

        # Generate synthetic data
        data_generator = BoseSalesDataGenerator()
        sales_records = data_generator.generate_bulk_data(config.num_synthetic_records)

        # Index data
        if not es_manager.bulk_index_data(sales_records):
            print("❌ Failed to index data")
            return False

        # Verify results
        stats = es_manager.verify_data()
        if "error" in stats:
            # verify_data reports failures through an "error" key; do not
            # announce a successful setup with "0 sales records indexed".
            print(f"❌ Verification failed: {stats['error']}")
            return False

        print("\n" + "=" * 60)
        print("🎉 SETUP COMPLETE!")
        print("=" * 60)
        print(f"✅ Elasticsearch index '{config.index_name}' created successfully")
        print(f"✅ {stats.get('total_docs', 0)} sales records indexed")
        print(f"📊 Ready for use as Unstructured Workflow source connector")
        print("\nNext steps:")
        print("1. Use this index as an Elasticsearch source connector")
        print("2. Configure NER enrichment workflow node")
        print("3. Process through your hybrid RAG pipeline")

        return True

    except Exception as e:
        # Top-level boundary of a CLI script: report and signal failure.
        print(f"❌ Setup failed: {e}")
        return False
def main():
    """Verify consolidated data structure and context preservation.

    Prints the document count, two sample records (first 15 lines of their
    ``consolidated_text``), the distribution by product line, region and
    quarter, and the hit counts of a few full-text searches.

    Environment:
        ELASTIC_API_KEY / ELASTICSEARCH_API_KEY: API key (one is required).
        ELASTICSEARCH_HOST: optional cluster host, with or without a scheme.
            When unset, the previously hard-coded cluster URL is used.

    Returns:
        bool: True when verification ran to completion, False otherwise.
    """
    print("🔍 Verifying Consolidated RAG Data")
    print("=" * 50)

    # ELASTIC_API_KEY is the historical name; .env.example documents
    # ELASTICSEARCH_API_KEY, so accept either.
    api_key = os.getenv('ELASTIC_API_KEY') or os.getenv('ELASTICSEARCH_API_KEY')
    if not api_key:
        print("❌ Error: ELASTIC_API_KEY (or ELASTICSEARCH_API_KEY) must be provided in .env file")
        return False

    # .env.example asks for the host without "https://"; tolerate both forms.
    host = os.getenv('ELASTICSEARCH_HOST')
    if host and "://" not in host:
        host = f"https://{host}"
    es_url = host or "https://2371b9a1d2ad40c590fd1e22652a8236.us-central1.gcp.cloud.es.io:443"

    es = Elasticsearch(
        es_url,
        api_key=api_key,
        request_timeout=60
    )

    index_name = "sales-records"  # Use same index name as API key permissions

    try:
        # Check if index exists
        if not es.indices.exists(index=index_name):
            print(f"❌ Index '{index_name}' does not exist. Run create_consolidated_index.py first.")
            return False

        # Get document count
        count_response = es.count(index=index_name)
        total_docs = count_response['count']
        print(f"📊 Total consolidated documents: {total_docs}")

        if total_docs == 0:
            print("❌ No documents found in consolidated index")
            return False

        # Get sample documents
        sample_response = es.search(
            index=index_name,
            body={
                "size": 2,
                "_source": ["consolidated_text", "product_line", "region", "record_date"],
                "sort": [{"timestamp": {"order": "desc"}}]
            },
        )

        print(f"\n📋 Sample Consolidated Records:")
        print("=" * 50)

        preview_limit = 15  # number of text lines shown per record
        for i, hit in enumerate(sample_response['hits']['hits'], 1):
            source = hit['_source']
            print(f"\n🔸 RECORD {i}:")
            print(f" Product Line: {source['product_line']}")
            print(f" Region: {source['region']}")
            print(f" Date: {source['record_date']}")
            print(f"\n 📝 CONSOLIDATED TEXT (Full Context):")
            print(f" {'-' * 45}")

            # Show the full consolidated text with formatting
            lines = source['consolidated_text'].split('\n')
            for line in lines[:preview_limit]:  # Show first 15 lines
                if line.strip():
                    print(f" {line}")

            if len(lines) > preview_limit:
                print(f" ... ({len(lines) - preview_limit} more lines)")

            print(f" {'-' * 45}")

        # Show aggregations
        agg_response = es.search(
            index=index_name,
            body={
                "size": 0,
                "aggs": {
                    "by_product": {
                        "terms": {"field": "product_line", "size": 5}
                    },
                    "by_region": {
                        "terms": {"field": "region", "size": 5}
                    },
                    "by_quarter": {
                        "terms": {"field": "quarter", "size": 8}
                    }
                }
            }
        )
        aggregations = agg_response['aggregations']

        print(f"\n📈 Data Distribution:")
        print("=" * 30)

        print("🎧 By Product Line:")
        for bucket in aggregations['by_product']['buckets']:
            print(f" • {bucket['key']}: {bucket['doc_count']} records")

        print("\n🌍 By Region:")
        for bucket in aggregations['by_region']['buckets'][:5]:
            print(f" • {bucket['key']}: {bucket['doc_count']} records")

        print("\n📅 By Quarter:")
        for bucket in aggregations['by_quarter']['buckets'][:5]:
            print(f" • {bucket['key']}: {bucket['doc_count']} records")

        # Test search functionality
        print(f"\n🔍 Testing Context-Aware Search:")
        print("=" * 35)

        search_tests = [
            "noise cancellation",
            "Best Buy",
            "warranty",
            "New York"
        ]

        for search_term in search_tests:
            search_response = es.search(
                index=index_name,
                body={
                    "size": 1,
                    "query": {
                        "match": {
                            "consolidated_text": search_term
                        }
                    },
                    "_source": ["product_line", "region"]
                }
            )

            hits = search_response['hits']['total']['value']
            if hits > 0:
                sample_hit = search_response['hits']['hits'][0]['_source']
                print(f" 🔎 '{search_term}': {hits} matches (e.g., {sample_hit['product_line']} in {sample_hit['region']})")
            else:
                print(f" 🔎 '{search_term}': {hits} matches")

        print(f"\n" + "=" * 50)
        print("🎉 CONSOLIDATED DATA VERIFICATION COMPLETE")
        print("=" * 50)
        print("✅ Context Preservation: Each record contains ALL relevant information")
        print("✅ RAG Ready: Unstructured will process 'consolidated_text' field")
        print("✅ No Context Loss: Customer, product, location, and conversation details preserved")
        print("✅ Search Ready: Full-text search across all contextual information")
        print("\n🚀 Ready for Unstructured Workflow processing!")
        return True

    except Exception as e:
        # Top-level boundary of a CLI script: report and signal failure.
        print(f"❌ Error: {e}")
        return False
Ready: Unstructured will process 'consolidated_text' field") + print("✅ No Context Loss: Customer, product, location, and conversation details preserved") + print("✅ Search Ready: Full-text search across all contextual information") + print("\n🚀 Ready for Unstructured Workflow processing!") + + except Exception as e: + print(f"❌ Error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/elasticsearch-example-data/README.md b/elasticsearch-example-data/README.md new file mode 100644 index 0000000..76b319d --- /dev/null +++ b/elasticsearch-example-data/README.md @@ -0,0 +1,98 @@ +# Elasticsearch Data Structure Examples + +This directory contains examples of two different approaches for structuring synthetic sales data in Elasticsearch for RAG (Retrieval-Augmented Generation) use cases. + +## 📊 **Data Structure Comparison** + +### 1. Non-Consolidated Structure (`nonconsolidated_examples.json`) + +**Use Case**: Traditional database-like structure with separate fields for each data element. + +**Characteristics**: +- Each piece of information is stored in its own field +- Easy to query specific fields (e.g., all records from "Best Buy") +- Good for analytics and aggregations +- Follows normalized database principles + +**⚠️ Problem with Unstructured Processing**: +When Unstructured processes this data and creates separate Text elements for each field, **context is lost**. For example: +- One Text element might contain: "Jennifer Martinez" +- Another Text element might contain: "SoundSport Free" +- Another Text element might contain: "$149" + +The resulting embeddings won't understand that Jennifer Martinez was interested in the SoundSport Free at $149. + +### 2. Consolidated Structure (`consolidated_examples.json`) + +**Use Case**: RAG-optimized structure where all context is preserved in a single text field. 
+ +**Characteristics**: +- All relevant information combined into a single `consolidated_text` field +- Maintains complete context relationships +- Optimized for embedding generation and RAG queries +- Minimal metadata fields for filtering/aggregation + +**✅ Benefits for RAG**: +When Unstructured processes the `consolidated_text` field, each Text element contains **complete context**: +- Customer information, product details, sales information, and conversation summary all in one coherent text block +- Vector embeddings capture full relationships between entities +- RAG queries have access to complete context without information loss + +## 🔧 **Generation Scripts** + +### Non-Consolidated Data +- **Script**: `create_nonconsolidated_index.py` +- **Index**: `sales-records` (separate fields structure) +- **Fields**: 20+ individual fields (customer_name, product_model, price, etc.) + +### Consolidated Data +- **Script**: `create_consolidated_index.py` +- **Index**: `sales-records` (consolidated structure) +- **Key Field**: `consolidated_text` (contains all context) + +## 📋 **Example Comparison** + +### Non-Consolidated Record +```json +{ + "customer_name": "Jennifer Martinez", + "product_model": "SoundSport Free", + "price": 149, + "retailer": "Best Buy", + "interaction_text": "Customer called to inquire about purchasing..." +} +``` +**Problem**: When processed by Unstructured, context between fields is lost. + +### Consolidated Record +```json +{ + "consolidated_text": "SALES RECORD - November 15, 2023\n\nCustomer Information:\n- Name: Jennifer Martinez\n- Location: New York, NY\n...\n\nProduct Details:\n- Model: SoundSport Free\n- Price: $149\n...\n\nConversation Summary:\nCustomer Jennifer Martinez contacted Michael Chen about the SoundSport Free..." +} +``` +**Solution**: All context preserved in single field for complete RAG understanding. 
+ +## 🎯 **Recommendation** + +**For RAG Use Cases**: Use the **consolidated structure** to ensure context preservation and optimal embedding quality. + +**For Analytics**: Use the **non-consolidated structure** for traditional database queries and business intelligence. + +**For Hybrid Approaches**: You can maintain both structures - use consolidated for RAG processing and non-consolidated for analytics dashboards. + +## 🚀 **Usage** + +1. **Choose your approach** based on use case +2. **Run the appropriate script**: + - `python create_nonconsolidated_index.py` (separate fields) + - `python create_consolidated_index.py` (RAG-optimized) +3. **Configure Unstructured Workflow** to process the appropriate field(s) +4. **Verify results** with the corresponding verification scripts + +## 📁 **Files in This Directory** + +- `nonconsolidated_examples.json` - 3 example records with separate fields +- `consolidated_examples.json` - 3 example records with consolidated text +- `README.md` - This documentation file + +Both examples contain the same underlying sales data, just structured differently for different use cases. 
\ No newline at end of file diff --git a/elasticsearch-example-data/consolidated_examples.json b/elasticsearch-example-data/consolidated_examples.json new file mode 100644 index 0000000..7dee738 --- /dev/null +++ b/elasticsearch-example-data/consolidated_examples.json @@ -0,0 +1,35 @@ +[ + { + "id": "consolidated-001", + "timestamp": "2023-11-15T14:30:00", + "document_type": "sales_record", + "product_line": "SoundSport", + "region": "Northeast", + "record_date": "2023-11-15", + "quarter": "Q4", + "year": 2023, + "consolidated_text": "SALES RECORD - November 15, 2023\n\nCustomer Information:\n- Name: Jennifer Martinez\n- Location: New York, NY\n- Segment: Consumer\n- Lead Source: Website\n\nProduct Details:\n- Product Line: SoundSport\n- Model: SoundSport Free\n- Category: Sports Earbuds\n- Price: $149\n- Key Features: sweat-resistant, secure fit, wireless\n\nSales Information:\n- Sales Representative: Michael Chen\n- Retailer: Best Buy\n- Region: Northeast\n- Channel: Phone\n- Deal Stage: Qualified\n- Interaction Type: Purchase Inquiry\n\nConversation Summary:\nCustomer Jennifer Martinez from New York, NY contacted Michael Chen to inquire about purchasing the SoundSport Free. The customer was particularly interested in the sweat-resistant, secure fit features. Michael Chen provided detailed product specifications and quoted $149. 
Customer mentioned they had seen similar products at Best Buy but was impressed with the Bose quality and features.\n\nTemporal Context:\n- Date: November 15, 2023\n- Quarter: Q4 2023\n- Day of Week: Wednesday\n\nRevenue Information:\n- Unit Price: $149\n- Potential Deal Value: $447\n- Priority: High" + }, + { + "id": "consolidated-002", + "timestamp": "2024-03-22T09:15:00", + "document_type": "sales_record", + "product_line": "QuietComfort", + "region": "West Coast", + "record_date": "2024-03-22", + "quarter": "Q1", + "year": 2024, + "consolidated_text": "SALES RECORD - March 22, 2024\n\nCustomer Information:\n- Name: Robert Taylor\n- Location: Los Angeles, CA\n- Segment: Business\n- Lead Source: Referral\n\nProduct Details:\n- Product Line: QuietComfort\n- Model: QuietComfort Ultra\n- Category: Noise Cancelling\n- Price: $379\n- Key Features: world-class noise cancellation, premium comfort, long battery life\n\nSales Information:\n- Sales Representative: Sarah Johnson\n- Retailer: Amazon\n- Region: West Coast\n- Channel: Direct\n- Deal Stage: Proposal\n- Interaction Type: Product Comparison\n\nConversation Summary:\nSarah Johnson conducted a comprehensive product comparison session with Robert Taylor. The customer was deciding between the QuietComfort Ultra and competitor products. Key selling points discussed included world-class noise cancellation, premium comfort, long battery life which differentiate Bose from competitors. The $379 price point was justified through superior audio quality and build reliability. 
Customer appreciated the detailed comparison and is considering the purchase.\n\nTemporal Context:\n- Date: March 22, 2024\n- Quarter: Q1 2024\n- Day of Week: Friday\n\nRevenue Information:\n- Unit Price: $379\n- Potential Deal Value: $1137\n- Priority: Medium" + }, + { + "id": "consolidated-003", + "timestamp": "2022-08-10T16:45:00", + "document_type": "sales_record", + "product_line": "OpenAudio", + "region": "Midwest", + "record_date": "2022-08-10", + "quarter": "Q3", + "year": 2022, + "consolidated_text": "SALES RECORD - August 10, 2022\n\nCustomer Information:\n- Name: Emily Wilson\n- Location: Chicago, IL\n- Segment: Consumer\n- Lead Source: Advertisement\n\nProduct Details:\n- Product Line: OpenAudio\n- Model: OpenAudio Pro\n- Category: Open-Ear Audio\n- Price: $229\n- Key Features: open-ear design, situational awareness, comfortable fit\n\nSales Information:\n- Sales Representative: David Rodriguez\n- Retailer: Costco\n- Region: Midwest\n- Channel: Retail Partner\n- Deal Stage: Closed Won\n- Interaction Type: Order Processing\n\nConversation Summary:\nOrder successfully processed for Emily Wilson: 2 units of OpenAudio Pro at $229 each. David Rodriguez confirmed shipping details to Chicago, IL and explained warranty coverage. Customer opted for expedited shipping and was provided with tracking information. 
Partnership with Costco ensured competitive pricing and reliable delivery.\n\nTemporal Context:\n- Date: August 10, 2022\n- Quarter: Q3 2022\n- Day of Week: Wednesday\n\nRevenue Information:\n- Unit Price: $229\n- Potential Deal Value: $458\n- Priority: Low" + } +] \ No newline at end of file diff --git a/elasticsearch-example-data/nonconsolidated_examples.json b/elasticsearch-example-data/nonconsolidated_examples.json new file mode 100644 index 0000000..30580e3 --- /dev/null +++ b/elasticsearch-example-data/nonconsolidated_examples.json @@ -0,0 +1,86 @@ +[ + { + "id": "example-001", + "timestamp": "2023-11-15T14:30:00", + "customer_name": "Jennifer Martinez", + "sales_representative": "Michael Chen", + "product_line": "SoundSport", + "product_model": "SoundSport Free", + "product_category": "Sports Earbuds", + "price": 149, + "retailer": "Best Buy", + "location_city": "New York, NY", + "region": "Northeast", + "interaction_type": "purchase_inquiry", + "interaction_text": "Customer Jennifer Martinez from New York, NY called to inquire about purchasing the SoundSport Free. Sales rep Michael Chen provided detailed product information and quoted $149. Customer is comparing with similar products at Best Buy.", + "text": "Customer Jennifer Martinez from New York, NY called to inquire about purchasing the SoundSport Free. Sales rep Michael Chen provided detailed product information and quoted $149. Customer is comparing with similar products at Best Buy.", + "quarter": "Q4", + "year": 2023, + "month": "November", + "day_of_week": "Wednesday", + "customer_segment": "Consumer", + "sales_channel": "Phone", + "lead_source": "Website", + "deal_stage": "Qualified", + "revenue_potential": 447, + "customer_priority": "High", + "follow_up_required": true, + "notes": "Additional context: Customer expressed interest in SoundSport series. Michael Chen to follow up within 48 hours." 
+ }, + { + "id": "example-002", + "timestamp": "2024-03-22T09:15:00", + "customer_name": "Robert Taylor", + "sales_representative": "Sarah Johnson", + "product_line": "QuietComfort", + "product_model": "QuietComfort Ultra", + "product_category": "Noise Cancelling", + "price": 379, + "retailer": "Amazon", + "location_city": "Los Angeles, CA", + "region": "West Coast", + "interaction_type": "product_comparison", + "interaction_text": "Sarah Johnson helped Robert Taylor compare the QuietComfort Ultra against competitors. Discussed the superior noise cancellation technology and $379 price point. Customer mentioned they saw it at Amazon for a higher price.", + "text": "Sarah Johnson helped Robert Taylor compare the QuietComfort Ultra against competitors. Discussed the superior noise cancellation technology and $379 price point. Customer mentioned they saw it at Amazon for a higher price.", + "quarter": "Q1", + "year": 2024, + "month": "March", + "day_of_week": "Friday", + "customer_segment": "Business", + "sales_channel": "Direct", + "lead_source": "Referral", + "deal_stage": "Proposal", + "revenue_potential": 1137, + "customer_priority": "Medium", + "follow_up_required": false, + "notes": "Additional context: Customer expressed interest in QuietComfort series. Sarah Johnson to follow up within 48 hours." + }, + { + "id": "example-003", + "timestamp": "2022-08-10T16:45:00", + "customer_name": "Emily Wilson", + "sales_representative": "David Rodriguez", + "product_line": "OpenAudio", + "product_model": "OpenAudio Pro", + "product_category": "Open-Ear Audio", + "price": 229, + "retailer": "Costco", + "location_city": "Chicago, IL", + "region": "Midwest", + "interaction_type": "order_processing", + "interaction_text": "Order processed for Emily Wilson: 2 units of OpenAudio Pro at $229 each. Shipping to Chicago, IL. Sales rep David Rodriguez confirmed delivery timeline and warranty coverage. 
Partner retailer: Costco.", + "text": "Order processed for Emily Wilson: 2 units of OpenAudio Pro at $229 each. Shipping to Chicago, IL. Sales rep David Rodriguez confirmed delivery timeline and warranty coverage. Partner retailer: Costco.", + "quarter": "Q3", + "year": 2022, + "month": "August", + "day_of_week": "Wednesday", + "customer_segment": "Consumer", + "sales_channel": "Retail Partner", + "lead_source": "Advertisement", + "deal_stage": "Closed Won", + "revenue_potential": 458, + "customer_priority": "Low", + "follow_up_required": true, + "notes": "Additional context: Customer expressed interest in OpenAudio series. David Rodriguez to follow up within 48 hours." + } +] \ No newline at end of file diff --git a/elasticsearch_index_preprocessing.py b/elasticsearch_index_preprocessing.py new file mode 100644 index 0000000..da38c03 --- /dev/null +++ b/elasticsearch_index_preprocessing.py @@ -0,0 +1,964 @@ +#!/usr/bin/env python3 +""" +Elasticsearch Index Preprocessing Tool + +This script handles Elasticsearch index management for the Hybrid RAG Pipeline: +- Creates synthetic sales data for testing +- Manages index creation and deletion +- Downloads and uploads index data locally +- Provides backup and restore functionality + +ELASTICSEARCH API KEY SETUP REQUIREMENTS: +======================================== +Before running this script, you must create an Elasticsearch API key with the following permissions: + +{ + "sales-records-full-access": { + "cluster": [], + "indices": [ + { + "names": [ + "sales-records", + "sales-records-consolidated", + "customer-support" + ], + "privileges": [ + "create_index", + "delete_index", + "manage", + "write", + "read", + "view_index_metadata", + "monitor" + ], + "allow_restricted_indices": false + } + ], + "applications": [], + "run_as": [], + "metadata": {}, + "transient_metadata": { + "enabled": true + } + } +} + +SETUP INSTRUCTIONS: +================== +1. Log into your Elasticsearch deployment dashboard +2. 
Go to Security > API Keys +3. Create a new API key with the above permissions +4. Set the following environment variables in your .env file: + - ELASTICSEARCH_HOST=https://your-deployment.es.io:443 + - ELASTICSEARCH_API_KEY=your-api-key-here + +USAGE: +====== +- Run as standalone: python elasticsearch_index_preprocessing.py +- Import functions: from elasticsearch_index_preprocessing import run_elasticsearch_preprocessing + +""" +import os +import sys +import time + +import json +import uuid +import random +from pathlib import Path +from datetime import datetime, timedelta +from typing import List, Dict, Any, Optional +from faker import Faker +from dotenv import load_dotenv + + +# Import Elasticsearch for preprocessing +try: + from elasticsearch import Elasticsearch + from elasticsearch.helpers import bulk, BulkIndexError +except ImportError: + print("❌ elasticsearch package not found. Install with: pip install elasticsearch") + sys.exit(1) + + +# Elasticsearch Configuration +ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST", "your-elasticsearch-host") +ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY", "your-elasticsearch-api-key") +ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX", "sales-records-consolidated") # Updated to use consolidated index + + +# ============================================================================ +# ELASTICSEARCH PREPROCESSING FUNCTIONS +# ============================================================================ + +def get_elasticsearch_client(): + """Get Elasticsearch client with API key authentication""" + return Elasticsearch( + ELASTICSEARCH_HOST, + api_key=ELASTICSEARCH_API_KEY, + request_timeout=60, + max_retries=3, + retry_on_timeout=True + ) + +def delete_elasticsearch_indices(): + """Delete the contents of sales-records, sales-records-consolidated, and customer-support indices""" + print("🗑️ Deleting Elasticsearch indices...") + es = get_elasticsearch_client() + + indices_to_delete = ["sales-records", 
"sales-records-consolidated", "customer-support"] + + for index_name in indices_to_delete: + try: + if es.indices.exists(index=index_name): + es.indices.delete(index=index_name) + print(f" ✅ Successfully deleted {index_name} index") + else: + print(f" ℹ️ Index {index_name} does not exist") + except Exception as e: + print(f" ❌ Failed to delete index {index_name}: {e}") + +class BoseSalesDataGenerator: + """Generate NER-rich synthetic sales data for Bose products""" + + def __init__(self): + self.fake = Faker(['en_US']) + Faker.seed(42) # For reproducible data + + # Bose product data + self.products = { + 'SoundSport': { + 'models': ['SoundSport Free', 'SoundSport Wireless', 'SoundSport Pulse'], + 'price_range': (129, 199), + 'category': 'Sports Earbuds', + 'features': ['sweat-resistant', 'secure fit', 'wireless', 'noise isolation'] + }, + 'OpenAudio': { + 'models': ['OpenAudio Sport', 'OpenAudio Ultra', 'OpenAudio Pro'], + 'price_range': (149, 249), + 'category': 'Open-Ear Audio', + 'features': ['open-ear design', 'situational awareness', 'comfortable fit', 'premium audio'] + }, + 'QuietComfort': { + 'models': ['QuietComfort 45', 'QuietComfort Ultra', 'QuietComfort Earbuds'], + 'price_range': (199, 429), + 'category': 'Noise Cancelling', + 'features': ['world-class noise cancellation', 'premium comfort', 'long battery life', 'crystal clear calls'] + } + } + + # Rich entity data for NER + self.retailers = [ + "Best Buy", "Target", "Amazon", "Walmart", "Costco", "B&H Photo", + "Guitar Center", "Sam's Club", "Newegg", "Adorama", "Crutchfield" + ] + + self.sales_reps = [ + "Jennifer Martinez", "Michael Chen", "Sarah Johnson", "David Rodriguez", + "Emily Wilson", "Robert Taylor", "Lisa Anderson", "James Thompson", + "Maria Garcia", "Christopher Lee", "Amanda Davis", "Daniel Brown" + ] + + self.regions = [ + "Northeast", "Southeast", "Midwest", "Southwest", "West Coast", + "Pacific Northwest", "Mountain West", "Great Lakes", "Mid-Atlantic", "Gulf Coast" + ] + + 
self.cities = [ + "New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ", + "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX", "Austin, TX", + "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH", "Charlotte, NC", "Seattle, WA", + "Denver, CO", "Washington, DC", "Boston, MA", "Nashville, TN", "Detroit, MI" + ] + + self.interaction_types = [ + "purchase_inquiry", "product_comparison", "pricing_discussion", + "sales_consultation", "order_processing", "upsell_opportunity", + "customer_preferences", "warranty_inquiry", "bulk_order_request", + "promotional_campaign", "seasonal_sale", "loyalty_program_enrollment" + ] + + def generate_sales_record(self) -> Dict[str, Any]: + """Generate a single NER-rich sales record""" + + # Select random product + product_line = random.choice(list(self.products.keys())) + product_info = self.products[product_line] + model = random.choice(product_info['models']) + price = random.randint(*product_info['price_range']) + + # Generate rich entities for NER extraction + customer_name = self.fake.name() + sales_rep = random.choice(self.sales_reps) + retailer = random.choice(self.retailers) + city = random.choice(self.cities) + region = random.choice(self.regions) + + # Generate realistic interaction text with rich named entities + interaction_type = random.choice(self.interaction_types) + + # Create contextual sales interaction text + interaction_texts = { + "purchase_inquiry": f"Customer {customer_name} from {city} called to inquire about purchasing the {model}. Sales rep {sales_rep} provided detailed product information and quoted ${price}. Customer is comparing with similar products at {retailer}.", + + "product_comparison": f"{sales_rep} helped {customer_name} compare the {model} against competitors. Discussed the superior noise cancellation technology and ${price} price point. 
Customer mentioned they saw it at {retailer} for a higher price.", + + "sales_consultation": f"Consultation session with {customer_name} in {region} region. {sales_rep} recommended the {model} based on customer's active lifestyle needs. Discussed ${price} pricing and available financing options through {retailer}.", + + "order_processing": f"Order processed for {customer_name}: 2 units of {model} at ${price} each. Shipping to {city}. Sales rep {sales_rep} confirmed delivery timeline and warranty coverage. Partner retailer: {retailer}.", + + "promotional_campaign": f"Q4 promotional campaign in {region}: {sales_rep} contacted {customer_name} about special pricing on {model}. Limited time offer at ${price - 20} (originally ${price}). Customer interested, will visit {retailer} this weekend." + } + + # Select appropriate interaction text or generate generic one + if interaction_type in interaction_texts: + interaction_text = interaction_texts[interaction_type] + else: + interaction_text = f"{sales_rep} assisted {customer_name} with {interaction_type.replace('_', ' ')} for {model}. Discussed ${price} pricing and availability at {retailer} in {city}." 
+ + # Generate timestamp from 2016 onwards (products launched after 2015) + start_date = datetime(2016, 1, 1) + random_date = self.fake.date_time_between(start_date=start_date, end_date='now') + + return { + "id": str(uuid.uuid4()), + "timestamp": random_date.isoformat(), + "customer_name": customer_name, + "sales_representative": sales_rep, + "product_line": product_line, + "product_model": model, + "product_category": product_info['category'], + "price": price, + "retailer": retailer, + "location_city": city, + "region": region, + "interaction_type": interaction_type, + "interaction_text": interaction_text, + "text": interaction_text, # Duplicate for semantic_text field + "quarter": f"Q{random_date.month//3 + 1}", + "year": random_date.year, + "month": random_date.strftime("%B"), + "day_of_week": random_date.strftime("%A"), + "customer_segment": random.choice(["Consumer", "Business", "Education", "Government"]), + "sales_channel": random.choice(["Direct", "Retail Partner", "Online", "Phone"]), + "lead_source": random.choice(["Website", "Advertisement", "Referral", "Trade Show", "Cold Call"]), + "deal_stage": random.choice(["Prospect", "Qualified", "Proposal", "Negotiation", "Closed Won", "Closed Lost"]), + "revenue_potential": price * random.randint(1, 5), # Potential for multiple units + "customer_priority": random.choice(["High", "Medium", "Low"]), + "follow_up_required": random.choice([True, False]), + "notes": f"Additional context: Customer expressed interest in {product_line} series. {sales_rep} to follow up within 48 hours." 
+ } + +def create_nonconsolidated_index(): + """Create the non-consolidated sales-records index with synthetic data""" + print("🔧 Creating non-consolidated sales-records index...") + es = get_elasticsearch_client() + index_name = "sales-records" + + try: + # Define mapping optimized for NER entities and search + mapping = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1, + "analysis": { + "analyzer": { + "ner_analyzer": { + "type": "standard", + "stopwords": "_none_" # Keep all words for NER + } + } + } + }, + "mappings": { + "properties": { + "id": {"type": "keyword"}, + "timestamp": {"type": "date"}, + "customer_name": { + "type": "text", + "analyzer": "ner_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "sales_representative": { + "type": "text", + "analyzer": "ner_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "product_line": {"type": "keyword"}, + "product_model": {"type": "keyword"}, + "product_category": {"type": "keyword"}, + "price": {"type": "float"}, + "retailer": { + "type": "text", + "analyzer": "ner_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "location_city": { + "type": "text", + "analyzer": "ner_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "region": {"type": "keyword"}, + "interaction_type": {"type": "keyword"}, + "interaction_text": { + "type": "text", + "analyzer": "ner_analyzer" # Full text optimized for NER + }, + "text": { + "type": "text", + "analyzer": "ner_analyzer" # Duplicate text field for compatibility + }, + "quarter": {"type": "keyword"}, + "year": {"type": "integer"}, + "month": {"type": "keyword"}, + "day_of_week": {"type": "keyword"}, + "customer_segment": {"type": "keyword"}, + "sales_channel": {"type": "keyword"}, + "lead_source": {"type": "keyword"}, + "deal_stage": {"type": "keyword"}, + "revenue_potential": {"type": "float"}, + "customer_priority": {"type": "keyword"}, + "follow_up_required": {"type": "boolean"}, + "notes": { + "type": 
"text", + "analyzer": "ner_analyzer" + } + } + } + } + + # Create index + es.indices.create(index=index_name, body=mapping) + print(f" ✅ Created index: {index_name}") + + # Generate and index synthetic data + generator = BoseSalesDataGenerator() + print(" 🔄 Generating 100 synthetic sales records...") + + records = [] + for i in range(100): + records.append(generator.generate_sales_record()) + + print(f" ✅ Generated {len(records)} records") + + # Bulk index + print(" 📤 Bulk indexing records...") + actions = [] + for record in records: + action = { + "_index": index_name, + "_id": record["id"], + "_source": record + } + actions.append(action) + + success_count, failed_items = bulk( + es, + actions, + chunk_size=100, + request_timeout=60, + max_retries=3, + initial_backoff=2, + max_backoff=600 + ) + + print(f" ✅ Successfully indexed {success_count} records") + + # Refresh index to make documents searchable immediately + es.indices.refresh(index=index_name) + print(f" 🔄 Refreshed index to make documents searchable") + + return True + + except Exception as e: + print(f" ❌ Error creating non-consolidated index: {e}") + return False + +class ConsolidatedBoseSalesDataGenerator: + """Generate consolidated sales data optimized for RAG processing""" + + def __init__(self): + self.fake = Faker(['en_US']) + Faker.seed(42) # For reproducible data + + # Bose product data + self.products = { + 'SoundSport': { + 'models': ['SoundSport Free', 'SoundSport Wireless', 'SoundSport Pulse'], + 'price_range': (129, 199), + 'category': 'Sports Earbuds', + 'features': ['sweat-resistant', 'secure fit', 'wireless', 'noise isolation'] + }, + 'OpenAudio': { + 'models': ['OpenAudio Sport', 'OpenAudio Ultra', 'OpenAudio Pro'], + 'price_range': (149, 249), + 'category': 'Open-Ear Audio', + 'features': ['open-ear design', 'situational awareness', 'comfortable fit', 'premium audio'] + }, + 'QuietComfort': { + 'models': ['QuietComfort 45', 'QuietComfort Ultra', 'QuietComfort Earbuds'], + 
'price_range': (199, 429), + 'category': 'Noise Cancelling', + 'features': ['world-class noise cancellation', 'premium comfort', 'long battery life', 'crystal clear calls'] + } + } + + # Rich entity data for NER + self.retailers = [ + "Best Buy", "Target", "Amazon", "Walmart", "Costco", "B&H Photo", + "Guitar Center", "Sam's Club", "Newegg", "Adorama", "Crutchfield" + ] + + self.sales_reps = [ + "Jennifer Martinez", "Michael Chen", "Sarah Johnson", "David Rodriguez", + "Emily Wilson", "Robert Taylor", "Lisa Anderson", "James Thompson", + "Maria Garcia", "Christopher Lee", "Amanda Davis", "Daniel Brown" + ] + + self.regions = [ + "Northeast", "Southeast", "Midwest", "Southwest", "West Coast", + "Pacific Northwest", "Mountain West", "Great Lakes", "Mid-Atlantic", "Gulf Coast" + ] + + self.cities = [ + "New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ", + "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX", "Austin, TX", + "Jacksonville, FL", "Fort Worth, TX", "Columbus, OH", "Charlotte, NC", "Seattle, WA", + "Denver, CO", "Washington, DC", "Boston, MA", "Nashville, TN", "Detroit, MI" + ] + + self.interaction_types = [ + "purchase_inquiry", "product_comparison", "pricing_discussion", + "sales_consultation", "order_processing", "upsell_opportunity", + "customer_preferences", "warranty_inquiry", "bulk_order_request", + "promotional_campaign", "seasonal_sale", "loyalty_program_enrollment" + ] + + def consolidate_from_source_record(self, source_record: dict) -> dict: + """Convert a source record to consolidated format""" + + # Extract data from source record + product_line = source_record['product_line'] + product_info = self.products.get(product_line, {'category': 'Unknown', 'features': []}) + + # Parse timestamp + timestamp = datetime.fromisoformat(source_record['timestamp'].replace('Z', '+00:00')) + + # Generate features based on product line + if product_line in self.products: + features = 
random.sample(self.products[product_line]['features'], k=random.randint(1, 3)) + else: + features = ['premium quality', 'reliable performance'] + + # Create detailed interaction text based on the original interaction + interaction_details = self._generate_interaction_text( + source_record['interaction_type'], + source_record['customer_name'], + source_record['sales_representative'], + source_record['product_model'], + source_record['price'], + source_record['retailer'], + source_record['location_city'], + features + ) + + # Create consolidated text field with ALL context + consolidated_text = f"""SALES RECORD - {timestamp.strftime('%B %d, %Y')} + +Customer Information: +- Name: {source_record['customer_name']} +- Location: {source_record['location_city']} +- Segment: {source_record['customer_segment']} +- Lead Source: {source_record['lead_source']} + +Product Details: +- Product Line: {source_record['product_line']} +- Model: {source_record['product_model']} +- Category: {source_record['product_category']} +- Price: ${source_record['price']} +- Key Features: {', '.join(features)} + +Sales Information: +- Sales Representative: {source_record['sales_representative']} +- Retailer: {source_record['retailer']} +- Region: {source_record['region']} +- Channel: {source_record['sales_channel']} +- Deal Stage: {source_record['deal_stage']} +- Interaction Type: {source_record['interaction_type'].replace('_', ' ').title()} + +Conversation Summary: +{interaction_details} + +Temporal Context: +- Date: {timestamp.strftime('%B %d, %Y')} +- Quarter: {source_record['quarter']} {source_record['year']} +- Day of Week: {source_record['day_of_week']} + +Revenue Information: +- Unit Price: ${source_record['price']} +- Potential Deal Value: ${source_record['revenue_potential']} +- Priority: {source_record['customer_priority']}""".strip() + + # Return consolidated document + return { + "id": str(uuid.uuid4()), + "timestamp": source_record['timestamp'], + "document_type": "sales_record", + 
"product_line": source_record['product_line'], + "region": source_record['region'], + "consolidated_text": consolidated_text, + "record_date": timestamp.strftime('%Y-%m-%d'), + "quarter": source_record['quarter'], + "year": source_record['year'] + } + + def _generate_interaction_text(self, interaction_type, customer_name, sales_rep, model, price, retailer, city, features): + """Generate detailed interaction text based on type""" + + interaction_templates = { + "purchase_inquiry": f"Customer {customer_name} from {city} contacted {sales_rep} to inquire about purchasing the {model}. The customer was particularly interested in the {', '.join(features[:2])} features. {sales_rep} provided detailed product specifications and quoted ${price}. Customer mentioned they had seen similar products at {retailer} but was impressed with the Bose quality and features.", + + "product_comparison": f"{sales_rep} conducted a comprehensive product comparison session with {customer_name}. The customer was deciding between the {model} and competitor products. Key selling points discussed included {', '.join(features)} which differentiate Bose from competitors. The ${price} price point was justified through superior audio quality and build reliability. Customer appreciated the detailed comparison and is considering the purchase.", + + "sales_consultation": f"In-depth consultation with {customer_name} in the {city} area. {sales_rep} assessed customer needs and recommended the {model} based on their lifestyle and audio preferences. Highlighted features included {', '.join(features)}. Discussed ${price} pricing structure and available financing options. Customer showed strong interest and requested follow-up information.", + + "order_processing": f"Order successfully processed for {customer_name}: {random.randint(1, 3)} units of {model} at ${price} each. {sales_rep} confirmed shipping details to {city} and explained warranty coverage. 
Customer opted for expedited shipping and was provided with tracking information. Partnership with {retailer} ensured competitive pricing and reliable delivery.", + + "promotional_campaign": f"Q{random.randint(1,4)} promotional outreach to {customer_name}. {sales_rep} presented special pricing on {model} - limited time offer at ${price - random.randint(10, 30)} (regularly ${price}). Emphasized exclusive features: {', '.join(features)}. Customer expressed interest and plans to visit {retailer} location this weekend to experience the product firsthand.", + } + + return interaction_templates.get( + interaction_type, + f"{sales_rep} assisted {customer_name} with {interaction_type.replace('_', ' ')} regarding {model}. Discussed ${price} pricing and key features: {', '.join(features)}. Customer interaction was positive and follow-up scheduled." + ) + +def create_consolidated_index(): + """Pull from sales-records and create consolidated sales-records-consolidated index""" + print("🔧 Creating consolidated sales-records-consolidated index...") + es = get_elasticsearch_client() + source_index = "sales-records" + target_index = "sales-records-consolidated" + + try: + # Check if source index exists + if not es.indices.exists(index=source_index): + print(f" ❌ Source index {source_index} does not exist") + return False + + # Create target index mapping + mapping = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1 + }, + "mappings": { + "properties": { + "id": {"type": "keyword"}, + "timestamp": {"type": "date"}, + "document_type": {"type": "keyword"}, + "product_line": {"type": "keyword"}, + "region": {"type": "keyword"}, + "consolidated_text": { + "type": "text", + "analyzer": "standard" # This is the field Unstructured will process + }, + "record_date": {"type": "date"}, + "quarter": {"type": "keyword"}, + "year": {"type": "integer"} + } + } + } + + es.indices.create(index=target_index, body=mapping) + print(f" ✅ Created consolidated index: {target_index}") 
+ + # Fetch all records from source index + print(" 🔄 Fetching records from source index...") + response = es.search( + index=source_index, + body={ + "query": {"match_all": {}}, + "size": 1000 # Adjust based on expected data size + } + ) + + source_records = [hit['_source'] for hit in response['hits']['hits']] + print(f" 📥 Retrieved {len(source_records)} records from {source_index}") + + # Convert to consolidated format + generator = ConsolidatedBoseSalesDataGenerator() + consolidated_records = [] + + for record in source_records: + consolidated_record = generator.consolidate_from_source_record(record) + consolidated_records.append(consolidated_record) + + print(f" 🔄 Converted {len(consolidated_records)} records to consolidated format") + + # Bulk index consolidated records + print(" 📤 Bulk indexing consolidated records...") + actions = [] + for record in consolidated_records: + actions.append({ + "_index": target_index, + "_id": record["id"], + "_source": record + }) + + success_count, failed_items = bulk(es, actions, chunk_size=50) + print(f" ✅ Successfully indexed {success_count} consolidated records") + + # Refresh the target index to make documents searchable + es.indices.refresh(index=target_index) + print(f" 🔄 Refreshed consolidated index to make documents searchable") + + # Verify results + count_response = es.count(index=target_index) + total_docs = count_response['count'] + print(f" 📊 Total documents in consolidated index: {total_docs}") + + return True + + except Exception as e: + print(f" ❌ Error creating consolidated index: {e}") + return False + +def run_elasticsearch_preprocessing(): + """Run all Elasticsearch preprocessing steps""" + print("🚀 Starting Elasticsearch preprocessing...") + print("=" * 60) + + # Step 1: Delete existing indices + delete_elasticsearch_indices() + + # Step 2: Create non-consolidated index with synthetic data + if not create_nonconsolidated_index(): + print("❌ Failed to create non-consolidated index") + return False + + # 
Step 3: Create consolidated index from the non-consolidated data + # Step 3: Create consolidated index from the non-consolidated data + if not create_consolidated_index(): + print("❌ Failed to create consolidated index") + return False + + # Step 4: Create customer-support index + if not create_customer_support_index(): + print("❌ Failed to create customer-support index") + return False + print(f"✅ Pipeline will now use consolidated index: {ELASTICSEARCH_INDEX}") + return True + +def download_index_locally(index_name, output_file="index_data.json"): + """Download all documents from an Elasticsearch index to a local JSON file""" + print(f"📥 Downloading index '{index_name}' to local file...") + es = get_elasticsearch_client() + + try: + # Check if index exists + if not es.indices.exists(index=index_name): + print(f" ❌ Index {index_name} does not exist") + return False + + # Get all documents using scroll API for large datasets + print(" 🔄 Fetching documents using scroll API...") + + documents = [] + scroll_response = es.search( + index=index_name, + body={ + "query": {"match_all": {}}, + "size": 1000 # Batch size + }, + scroll='5m' # Keep scroll context alive for 5 minutes + ) + + scroll_id = scroll_response['_scroll_id'] + documents.extend([hit['_source'] for hit in scroll_response['hits']['hits']]) + + # Continue scrolling until no more documents + while len(scroll_response['hits']['hits']) > 0: + scroll_response = es.scroll( + scroll_id=scroll_id, + scroll='5m' + ) + documents.extend([hit['_source'] for hit in scroll_response['hits']['hits']]) + + # Clear the scroll context + es.clear_scroll(scroll_id=scroll_id) + + # Save to local file + with open(output_file, 'w', encoding='utf-8') as f: + json.dump({ + "index_name": index_name, + "document_count": len(documents), + "documents": documents + }, f, indent=2, ensure_ascii=False) + + print(f" ✅ Downloaded {len(documents)} documents to {output_file}") + return True + + except Exception as e: + print(f" ❌ Error 
downloading index: {e}") + return False + +def upload_index_from_file(input_file, new_index_name, mapping=None): + """Create a new index and upload documents from a local JSON file""" + print(f"📤 Creating new index '{new_index_name}' from local file...") + es = get_elasticsearch_client() + + try: + # Load data from file + with open(input_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + documents = data.get('documents', []) + original_count = data.get('document_count', len(documents)) + + print(f" 📄 Loaded {len(documents)} documents from {input_file}") + + # Delete index if it exists + if es.indices.exists(index=new_index_name): + print(f" 🗑️ Deleting existing index: {new_index_name}") + es.indices.delete(index=new_index_name) + + # Create index with mapping + if mapping: + print(f" 🔧 Creating index with custom mapping...") + es.indices.create(index=new_index_name, body=mapping) + else: + print(f" 🔧 Creating index with default mapping...") + # Use a basic mapping similar to the consolidated index + default_mapping = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1 + }, + "mappings": { + "properties": { + "id": {"type": "keyword"}, + "timestamp": {"type": "date"}, + "document_type": {"type": "keyword"}, + "consolidated_text": { + "type": "text", + "analyzer": "standard" + } + } + } + } + es.indices.create(index=new_index_name, body=default_mapping) + + # Bulk index documents + print(f" 📤 Bulk indexing {len(documents)} documents...") + actions = [] + for i, doc in enumerate(documents): + actions.append({ + "_index": new_index_name, + "_id": doc.get('id', str(uuid.uuid4())), + "_source": doc + }) + + success_count, failed_items = bulk(es, actions, chunk_size=100) + + # Refresh index + es.indices.refresh(index=new_index_name) + + # Verify upload + count_response = es.count(index=new_index_name) + final_count = count_response['count'] + + print(f" ✅ Successfully uploaded {success_count} documents") + print(f" 📊 Final document count in 
{new_index_name}: {final_count}") + + if final_count == original_count: + print(f" ✅ Upload verification successful!") + else: + print(f" ⚠️ Document count mismatch: expected {original_count}, got {final_count}") + + return True + + except Exception as e: + print(f" ❌ Error uploading index: {e}") + return False + +def backup_and_restore_index(source_index, target_index, backup_file="index_backup.json"): + """Complete backup and restore workflow""" + print(f"🔄 Backing up '{source_index}' and restoring to '{target_index}'...") + + # Step 1: Download source index + if not download_index_locally(source_index, backup_file): + return False + + # Step 2: Upload to target index + if not upload_index_from_file(backup_file, target_index): + return False + + print(f"✅ Successfully backed up and restored index!") + return True + +def create_customer_support_index(): + """Create the customer-support index for storing customer service interactions""" + print("🔧 Creating customer-support index...") + es = get_elasticsearch_client() + index_name = "customer-support" + + try: + # Define mapping for customer support data + mapping = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1, + "analysis": { + "analyzer": { + "support_analyzer": { + "type": "standard", + "stopwords": "_none_" # Keep all words for better search + } + } + } + }, + "mappings": { + "properties": { + "id": {"type": "keyword"}, + "timestamp": {"type": "date"}, + "customer_id": {"type": "keyword"}, + "customer_name": { + "type": "text", + "analyzer": "support_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "support_agent": { + "type": "text", + "analyzer": "support_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "issue_type": {"type": "keyword"}, + "priority": {"type": "keyword"}, + "status": {"type": "keyword"}, + "product_line": {"type": "keyword"}, + "product_model": {"type": "keyword"}, + "conversation_text": { + "type": "text", + "analyzer": "support_analyzer" + 
}, + "resolution": { + "type": "text", + "analyzer": "support_analyzer" + }, + "satisfaction_rating": {"type": "integer"}, + "follow_up_required": {"type": "boolean"}, + "tags": {"type": "keyword"}, + "escalated": {"type": "boolean"}, + "resolution_time_hours": {"type": "float"} + } + } + } + + # Check if index exists before creating + if es.indices.exists(index=index_name): + print(f" ℹ️ Index {index_name} already exists, skipping creation") + else: + es.indices.create(index=index_name, body=mapping) + print(f" ✅ Created index: {index_name}") + # Refresh index to make it immediately available + es.indices.refresh(index=index_name) + print(f" 🔄 Refreshed index to make it searchable") + + return True + + except Exception as e: + print(f" ❌ Error creating customer-support index: {e}") + return False + + """Create the customer-support index for storing customer service interactions""" + print("🔧 Creating customer-support index...") + es = get_elasticsearch_client() + index_name = "customer-support" + + try: + # Define mapping for customer support data + mapping = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 1, + "analysis": { + "analyzer": { + "support_analyzer": { + "type": "standard", + "stopwords": "_none_" # Keep all words for better search + } + } + } + }, + "mappings": { + "properties": { + "id": {"type": "keyword"}, + "timestamp": {"type": "date"}, + "customer_id": {"type": "keyword"}, + "customer_name": { + "type": "text", + "analyzer": "support_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "support_agent": { + "type": "text", + "analyzer": "support_analyzer", + "fields": {"keyword": {"type": "keyword"}} + }, + "issue_type": {"type": "keyword"}, + "priority": {"type": "keyword"}, + "status": {"type": "keyword"}, + "product_line": {"type": "keyword"}, + "product_model": {"type": "keyword"}, + "conversation_text": { + "type": "text", + "analyzer": "support_analyzer" + }, + "resolution": { + "type": "text", + "analyzer": 
"support_analyzer" + }, + "satisfaction_rating": {"type": "integer"}, + "follow_up_required": {"type": "boolean"}, + "tags": {"type": "keyword"}, + "escalated": {"type": "boolean"}, + "resolution_time_hours": {"type": "float"} + } + } + } + + # Create index + es.indices.create(index=index_name, body=mapping) + print(f" ✅ Created index: {index_name}") + + # Refresh index to make it immediately available + es.indices.refresh(index=index_name) + print(f" 🔄 Refreshed index to make it searchable") + + return True + + except Exception as e: + print(f" ❌ Error creating customer-support index: {e}") + return False + +if __name__ == "__main__": + # Example usage + print("🔧 Elasticsearch Index Preprocessing Tool") + print("=" * 50) + + # Run the standard preprocessing + if run_elasticsearch_preprocessing(): + print("\n" + "=" * 50) + print("✅ Standard preprocessing completed!") + + # Example: Download the consolidated index + print("\n📥 Example: Downloading consolidated index...") + download_index_locally("sales-records-consolidated", "consolidated_backup.json") + + # Example: Create a copy of the index + print("\n📤 Example: Creating index copy...") + upload_index_from_file("consolidated_backup.json", "sales-records-consolidated-copy") + + print("\n✅ All operations completed!") + else: + print("❌ Standard preprocessing failed!") diff --git a/feedback_r1_cleaned.md b/feedback_r1_cleaned.md new file mode 100644 index 0000000..eddea6a --- /dev/null +++ b/feedback_r1_cleaned.md @@ -0,0 +1,84 @@ +# Hybrid RAG Pipeline Notebook Feedback - Action Items + +## ✅ COMPLETED ITEMS + +### Story & Engagement +- [x] **Business Relevance**: Add compelling introduction explaining why enterprises need to unify data across formats +- [x] **Value Proposition**: Explain the complexity of the problem and how Unstructured solves it +- [x] **Tutorial Style**: Transform from documentation to engaging tutorial format + +### Content Organization +- [x] **Remove Redundancy**: Eliminate repeated 
explanations (S3 setup, URL formats, env variables) +- [x] **Consolidate Methods**: Stick to one method for env setup and dependency management instead of multiple options + +## 🔄 PENDING ITEMS + +### Google Colab Compatibility +**Instructions**: Orient notebook towards Google Colab users. Create a space at the top of environment setup for users to paste their environment variables, followed by a concise dotenv section where .env file values (if available) overwrite the pasted defaults. + +- [ ] **Environment Setup**: Create Colab-friendly environment variable input section with dotenv fallback +- [ ] **Dependency Installation**: Remove duplicate `ensure_notebook_deps()` calls +- [ ] **Configuration Order**: Move environment configuration steps BEFORE `load_dotenv()` call + +### Content Focus & Clarity +- [ ] **Remove Error Explanations**: Remove error explanations from markdown text +- [ ] **Reduce Verbosity**: Remove bloated sections that don't add to the core use case +- [ ] **Single Env Method**: Remove multiple environment variable setup methods +- [ ] **Streamline S3 Setup**: Remove bucket creation code - assume users have existing bucket with clear note that S3 source connector documentation has setup details +- [ ] **NER Context**: Explain why NER node is relevant to the use case + +### Technical Corrections +- [ ] **Node Naming**: Fix "Chunking Node" → "Chunker Node", "Embedding Node" → "Embedder Node" in markdown annotations +- [ ] **Unstructured Value**: Clearly articulate what value Unstructured delivers - simple 1-2 sentences with clear benefits, no marketing tone + +### RAG Implementation & Results +- [ ] **Query Functionality**: Add section that queries the final index and returns sample results using LangChain (will require OpenAI API key) +- [ ] **Source Attribution**: Show that results come from both S3 and Elasticsearch sources +- [ ] **RAG Discussion**: Explain how this unified index powers RAG applications +- [ ] **Cell Outputs**: Preserve 
notebook cell outputs for illustration purposes + +### Conclusion & Next Steps +- [ ] **Learning Summary**: Add "What we learned" section +- [ ] **Achievement Summary**: Highlight what was accomplished +- [ ] **Call to Action**: Provide clear next steps for readers + +## 📋 DETAILED ACTION ITEMS + +### 1. Google Colab Environment Setup +**Issue**: Notebook assumes local development environment +**Actions**: +- Create environment variable input section for Colab users +- Add concise dotenv fallback without excessive annotation +- Ensure all dependencies install properly in Colab + +### 2. RAG Implementation +**Issue**: No actual RAG querying demonstrated +**Actions**: +- Add LangChain-based query examples against the unified index +- Show mixed results from both data sources +- Explain how this enables hybrid RAG applications +- Request OpenAI API key for RAG functionality + +### 3. Content Streamlining +**Issue**: Too verbose with redundant sections +**Actions**: +- Remove duplicate environment setup explanations +- Eliminate bucket creation code (assume existing bucket) +- Remove error explanations from markdown +- Focus on core value proposition + +### 4. 
Technical Polish +**Issue**: Minor technical and naming inconsistencies +**Actions**: +- Fix node naming conventions in markdown +- Remove duplicate function calls +- Preserve meaningful cell outputs +- Add clear, non-marketing Unstructured value statements + +## 🎯 SUCCESS CRITERIA + +- [ ] Notebook runs successfully in Google Colab +- [ ] Actual RAG querying with mixed-source results +- [ ] Streamlined content focused on core value +- [ ] Compelling story that teaches and engages +- [ ] Clear environment setup for Colab users \ No newline at end of file diff --git a/hybrid_rag_pipeline_enriched.ipynb b/hybrid_rag_pipeline_enriched.ipynb new file mode 100644 index 0000000..32eae19 --- /dev/null +++ b/hybrid_rag_pipeline_enriched.ipynb @@ -0,0 +1,1759 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f5506c9b", + "metadata": {}, + "source": [ + "# Building a Hybrid RAG System: From Fragmented Data to Unified Intelligence\n", + "\n", + "Picture this: You're a customer support agent, and a customer calls about a product issue. To help them effectively, you need to pull information from multiple sources - product manuals stored as PDFs in cloud storage, their purchase history in your sales database, and previous support interactions scattered across different systems. Each piece of information lives in a different format, in a different system, with different access methods.\n", + "\n", + "This is the reality for most enterprises today, and it's exactly the challenge we're going to solve together.\n", + "\n", + "## The Enterprise Data Challenge\n", + "\n", + "Enterprise data rarely lives in one place or format. Critical information is fragmented across unstructured documents like PDFs and manuals in cloud storage, structured records like sales data in databases, and different formats requiring different processing approaches. 
Traditional RAG systems work well with homogeneous data but struggle when you need to query across diverse data sources simultaneously.\n", + "\n", + "## Why This Matters\n", + "\n", + "When data is scattered, customer support becomes inefficient, decision-making lacks context, and valuable insights remain hidden. A customer asking about a product issue shouldn't require you to manually search through multiple systems to piece together a complete picture.\n", + "\n", + "## The Solution: Unstructured's Complete Gen AI Data Layer\n", + "\n", + "Unstructured isn't just another data processing tool—it's a complete Gen AI data layer solution that transforms how organizations handle unstructured data at scale. Unlike building custom solutions or using fragmented tools, Unstructured provides a unified platform that connects to 30+ data sources, processes 65+ file types with intelligent partitioning and chunking, automatically enriches content with metadata and context, and delivers to 30+ destinations—all while maintaining enterprise-grade security and compliance.\n", + "\n", + "The platform eliminates the complexity of managing multiple tools, custom integrations, and manual data preparation, allowing teams to focus on building AI applications rather than wrestling with data infrastructure. With flexible deployment options from SaaS to bare metal, Unstructured adapts to any infrastructure while providing the observability, automation, and reliability that enterprise AI projects demand.\n", + "\n", + "## What We'll Build Together\n", + "\n", + "In this tutorial, we'll create a hybrid RAG system that processes two different data sources simultaneously: product documentation from S3 and sales records from Elasticsearch. 
Both will flow through the same intelligent processing pipeline and land in a unified, searchable knowledge base.\n", + "\n", + "```\n", + "┌─────────────────┐ ┌─────────────────────────┐\n", + "│ S3 PDFs │──── WORKFLOW 1 ──────────▶│ │\n", + "│ (Product Docs) │ │ Unstructured API │\n", + "└─────────────────┘ │ │\n", + " │ Partition → Chunk → │\n", + "┌─────────────────┐ │ Embed → NER → Store │\n", + "│ Elasticsearch │──── WORKFLOW 2 ──────────▶│ │\n", + "│ (Sales Records) │ │ │\n", + "└─────────────────┘ └────────────┬────────────┘\n", + " │\n", + " ┌────────────▼────────────┐\n", + " │ customer-support │\n", + " │ (Unified Index) │\n", + " └─────────────────────────┘\n", + "```\n", + "\n", + "By the end of this tutorial, you'll have a working system that can answer complex questions by pulling information from both your product documentation and customer data simultaneously." + ] + }, + { + "cell_type": "markdown", + "id": "0d9ec036", + "metadata": {}, + "source": [ + "## Getting Started: Your Unstructured API Key\n", + "\n", + "To follow along with this tutorial, you'll need an Unstructured API key. This gives you access to the complete Gen AI data layer that will process your documents and create your unified knowledge base.\n", + "\n", + "### Sign Up and Get Your API Key\n", + "\n", + "Visit https://platform.unstructured.io to sign up for a free account, navigate to API Keys in the sidebar, generate your API key, and save it for the configuration step below. For Team or Enterprise accounts, make sure you've selected the correct organizational workspace before creating your API key.\n", + "\n", + "**Need help?** Contact Unstructured Support at support@unstructured.io" + ] + }, + { + "cell_type": "markdown", + "id": "dbc5dd57", + "metadata": {}, + "source": [ + "## Configuration: Setting Up Your Environment\n", + "\n", + "Now we'll configure your environment with the necessary API keys and credentials. 
This step ensures your system can connect to all the data sources and services we'll be using." + ] + }, + { + "cell_type": "markdown", + "id": "7eb8befe", + "metadata": {}, + "source": [ + "### Creating a .env File in Google Colab\n", + "\n", + "For better security and organization, we'll create a `.env` file directly in your Colab environment. Run the code cell below to create the file with placeholder values, then edit it with your actual credentials.\n", + "\n", + "After running the code cell, you'll need to replace each placeholder value (like `your-unstructured-api-key`) with your actual API keys and credentials." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7486a269", + "metadata": {}, + "outputs": [], + "source": [ + "def create_dotenv_file():\n", + " \"\"\"Create a .env file with placeholder values for the user to fill in.\"\"\"\n", + " env_content = \"\"\"# Hybrid RAG Pipeline Environment Configuration\n", + "# Fill in your actual values below\n", + "# Configuration - Set these explicitly\n", + "\n", + "# ===================================================================\n", + "# AWS CONFIGURATION\n", + "# ===================================================================\n", + "AWS_ACCESS_KEY_ID=\"your-aws-access-key-id\"\n", + "AWS_SECRET_ACCESS_KEY=\"your-aws-secret-access-key\"\n", + "AWS_REGION=\"us-east-1\"\n", + "\n", + "# ===================================================================\n", + "# UNSTRUCTURED API CONFIGURATION \n", + "# ===================================================================\n", + "UNSTRUCTURED_API_KEY=\"your-unstructured-api-key\"\n", + "UNSTRUCTURED_API_URL=\"https://platform.unstructuredapp.io/api/v1\"\n", + "\n", + "# ===================================================================\n", + "# ELASTICSEARCH CONFIGURATION\n", + "# ===================================================================\n", + "ELASTICSEARCH_HOST=\"https://your-cluster.es.io:443\"\n", + 
"ELASTICSEARCH_API_KEY=\"your-elasticsearch-api-key\"\n", + "\n", + "# ===================================================================\n", + "# PIPELINE DATA SOURCES\n", + "# ===================================================================\n", + "S3_SOURCE_BUCKET=\"your-s3-source-bucket\"\n", + "S3_DESTINATION_BUCKET=\"your-s3-destination-bucket\"\n", + "S3_OUTPUT_PREFIX=\"\"\n", + "ELASTICSEARCH_INDEX=\"sales-records-consolidated\"\n", + "\n", + "# ===================================================================\n", + "# OPENAI API CONFIGURATION \n", + "# ===================================================================\n", + "OPENAI_API_KEY=\"your-openai-api-key\"\n", + "\"\"\"\n", + " \n", + " with open('.env', 'w') as f:\n", + " f.write(env_content)\n", + " \n", + " print(\"✅ Created .env file with placeholder values\")\n", + " print(\"📝 Please edit the .env file and replace the placeholder values with your actual credentials\")\n", + " print(\"🔒 The .env file will be loaded automatically by the pipeline\")\n", + "\n", + "# Create the .env file\n", + "create_dotenv_file()" + ] + }, + { + "cell_type": "markdown", + "id": "bcb2bc85", + "metadata": {}, + "source": [ + "### Installing Required Dependencies\n", + "\n", + "The following code installs the Python packages needed for this tutorial: the Unstructured client, Elasticsearch connector, AWS SDK, and other dependencies." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d1140e1", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "import sys, subprocess\n", + "\n", + "def ensure_notebook_deps() -> None:\n", + " packages = [\n", + " \"jupytext\",\n", + " \"python-dotenv\", \n", + " \"unstructured-client\",\n", + " \"elasticsearch\",\n", + " \"boto3\",\n", + " \"PyYAML\",\n", + " \"langchain\",\n", + " \"langchain-elasticsearch\",\n", + " \"langchain-openai\"\n", + " ]\n", + " try:\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", *packages])\n", + " except Exception:\n", + " # If install fails, continue; imports below will surface actionable errors\n", + " pass\n", + "\n", + "# Install notebook dependencies (safe no-op if present)\n", + "ensure_notebook_deps()\n", + "\n", + "import os\n", + "import time\n", + "import json\n", + "import zipfile\n", + "import tempfile\n", + "import requests\n", + "from pathlib import Path\n", + "from dotenv import load_dotenv\n", + "from urllib.parse import urlparse\n", + "\n", + "import boto3\n", + "from botocore.exceptions import ClientError, NoCredentialsError\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import bulk\n", + "\n", + "from unstructured_client import UnstructuredClient\n", + "from unstructured_client.models.operations import (\n", + " CreateSourceRequest,\n", + " CreateDestinationRequest,\n", + " CreateWorkflowRequest\n", + ")\n", + "from unstructured_client.models.shared import (\n", + " CreateSourceConnector,\n", + " CreateDestinationConnector,\n", + " WorkflowNode,\n", + " WorkflowType,\n", + " CreateWorkflow\n", + ")\n", + "\n", + "# =============================================================================\n", + "# ENVIRONMENT CONFIGURATION\n", + "# =============================================================================\n", + "# Load from .env file if it exists\n", + "load_dotenv()\n", + "\n", + "# 
Configuration constants\n", + "SKIPPED = \"SKIPPED\"\n", + "UNSTRUCTURED_API_URL = os.getenv(\"UNSTRUCTURED_API_URL\", \"https://platform.unstructuredapp.io/api/v1\")\n", + "\n", + "# Get environment variables\n", + "UNSTRUCTURED_API_KEY = os.getenv(\"UNSTRUCTURED_API_KEY\")\n", + "AWS_ACCESS_KEY_ID = os.getenv(\"AWS_ACCESS_KEY_ID\")\n", + "AWS_SECRET_ACCESS_KEY = os.getenv(\"AWS_SECRET_ACCESS_KEY\")\n", + "AWS_REGION = os.getenv(\"AWS_REGION\", \"us-east-1\")\n", + "S3_SOURCE_BUCKET = os.getenv(\"S3_SOURCE_BUCKET\")\n", + "S3_DESTINATION_BUCKET = os.getenv(\"S3_DESTINATION_BUCKET\")\n", + "S3_OUTPUT_PREFIX = os.getenv(\"S3_OUTPUT_PREFIX\", \"\")\n", + "ELASTICSEARCH_HOST = os.getenv(\"ELASTICSEARCH_HOST\")\n", + "ELASTICSEARCH_API_KEY = os.getenv(\"ELASTICSEARCH_API_KEY\")\n", + "ELASTICSEARCH_INDEX = os.getenv(\"ELASTICSEARCH_INDEX\", \"sales-records-consolidated\")\n", + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "\n", + "# Validation\n", + "REQUIRED_VARS = {\n", + " \"UNSTRUCTURED_API_KEY\": UNSTRUCTURED_API_KEY,\n", + " \"AWS_ACCESS_KEY_ID\": AWS_ACCESS_KEY_ID,\n", + " \"AWS_SECRET_ACCESS_KEY\": AWS_SECRET_ACCESS_KEY,\n", + " \"ELASTICSEARCH_HOST\": ELASTICSEARCH_HOST,\n", + " \"ELASTICSEARCH_API_KEY\": ELASTICSEARCH_API_KEY,\n", + " \"S3_SOURCE_BUCKET\": S3_SOURCE_BUCKET,\n", + "}\n", + "\n", + "missing_vars = [key for key, value in REQUIRED_VARS.items() if not value]\n", + "if missing_vars:\n", + " print(f\"❌ Missing required environment variables: {', '.join(missing_vars)}\")\n", + " print(\"Please set these environment variables or create a .env file with your credentials.\")\n", + " raise ValueError(f\"Missing required environment variables: {missing_vars}\")\n", + "\n", + "print(\"✅ Configuration loaded successfully\")" + ] + }, + { + "cell_type": "markdown", + "id": "42e66bab", + "metadata": {}, + "source": [ + "## AWS S3: Your Document Storage\n", + "\n", + "Now that we have our environment configured, let's set up the data sources for our 
hybrid RAG system. First up: your unstructured documents. These PDFs, manuals, and reports need to be accessible via S3, where your product documentation and other unstructured content lives, waiting to be processed into searchable knowledge.\n", + "\n", + "### What You Need\n", + "\n", + "**An existing S3 bucket** containing the documents you want to process. For this tutorial, we'll use sample product manuals, but in production, this would be your actual business documents.\n", + "\n", + "> **Note**: This tutorial assumes you have an existing S3 bucket with documents. For detailed S3 setup instructions, see the [Unstructured S3 source connector documentation](https://docs.unstructured.io/api-reference/api-services/source-connectors/s3).\n", + "\n", + "You'll need an AWS account with S3 access, an IAM user with S3 read permissions for your bucket, and access keys (Access Key ID and Secret Access Key)." + ] + }, + { + "cell_type": "markdown", + "id": "f6faf583", + "metadata": {}, + "source": [ + "## Elasticsearch: Your Business Data Hub\n", + "\n", + "While S3 holds your unstructured documents, Elasticsearch serves a dual purpose in our pipeline. It's both a source of structured business data (your sales records, customer information) and the destination where our unified, processed results will be stored for RAG queries.\n", + "\n", + "### What You Need\n", + "\n", + "**Elasticsearch cluster** with API key authentication from Elastic Cloud (managed service). This gives you the reliability and scalability needed for enterprise applications.\n", + "\n", + "The pipeline uses two indices: `sales-records-consolidated` as the source containing your business data, and `customer-support` as the destination for your unified knowledge base. Both are created automatically by the pipeline.\n", + "\n", + "### Why Consolidated Data Format Matters\n", + "\n", + "Traditional databases store information in separate fields (customer_name, product_id, purchase_date). 
For RAG applications, we consolidate this into a long-form text field that provides full context in each search result. This approach ensures that when someone searches for \"John's headphone purchase,\" they get the complete story in one result.\n", + "\n", + "Example transformation:\n", + "```\n", + "Before: {customer: \"John Doe\", product: \"BH-001\", date: \"2024-01-15\"}\n", + "After: \"customer: John Doe\\nproduct: BH-001\\ndate: 2024-01-15\"\n", + "```\n", + "\n", + "### API Key Permissions\n", + "\n", + "Your Elasticsearch API key needs these permissions:\n", + "\n", + "```json\n", + "{\n", + " \"sales-records-full-access\": {\n", + " \"cluster\": [],\n", + " \"indices\": [\n", + " {\n", + " \"names\": [\n", + " \"sales-records\",\n", + " \"sales-records-consolidated\",\n", + " \"customer-support\"\n", + " ],\n", + " \"privileges\": [\n", + " \"create_index\",\n", + " \"delete_index\",\n", + " \"manage\",\n", + " \"write\",\n", + " \"read\",\n", + " \"view_index_metadata\",\n", + " \"monitor\"\n", + " ],\n", + " \"allow_restricted_indices\": false\n", + " }\n", + " ],\n", + " \"applications\": [],\n", + " \"run_as\": [],\n", + " \"metadata\": {},\n", + " \"transient_metadata\": {\n", + " \"enabled\": true\n", + " }\n", + " }\n", + "}\n", + "```\n", + "\n", + "**Don't have Elasticsearch data yet?** The pipeline includes automatic data setup that creates sample sales records for demonstration. This is done by downloading .ZIP files from github and unzipping them." + ] + }, + { + "cell_type": "markdown", + "id": "9ff47e68", + "metadata": {}, + "source": [ + "## Data Preparation: Setting Up Your Demo Environment\n", + "\n", + "With our infrastructure configured, let's prepare the actual data that will flow through our hybrid RAG system. 
For this demonstration, we've created realistic sample data that represents a typical enterprise scenario, giving you a working example without requiring you to set up your own data sources first.\n", + "\n", + "**Elasticsearch Sales Data**: 100 synthetic sales records with customer information, with consolidated fields optimized for vector search. This represents the kind of structured business data you'd find in any enterprise system.\n", + "\n", + "**S3 Product Documentation**: 9 product manuals downloaded from manufacturer websites and stored in your S3 bucket. These represent the unstructured documents that contain critical product information.\n", + "\n", + "This combination mimics real enterprise scenarios where structured data (sales records) and unstructured documents (manuals) need to be searchable together for effective customer support. The magic happens when we can answer questions like \"What issues have customers reported with the BH-900 headphones?\" by pulling from both the sales records and the product manual simultaneously." 
# Data preparation functions

# Error codes that mean "bucket does not exist": head_bucket reports '404',
# while list_objects_v2 reports 'NoSuchBucket'.
_MISSING_BUCKET_CODES = ('404', 'NoSuchBucket')

# delete_objects accepts at most this many keys per request.
_S3_DELETE_BATCH_SIZE = 1000


def download_file(url: str, local_path: str, timeout: float = 60) -> bool:
    """Download a file from URL to local path.

    Args:
        url: Address to fetch.
        local_path: Destination file; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        True when the file was written, False on any error (which is printed).
    """
    try:
        print(f"📥 Downloading {url}...")
        # Using the response as a context manager releases the connection
        # even when writing the file fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            Path(local_path).parent.mkdir(parents=True, exist_ok=True)

            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        print(f"✅ Downloaded to {local_path}")
        return True

    except Exception as e:
        # Best-effort helper: callers only look at the boolean result.
        print(f"❌ Error downloading {url}: {e}")
        return False


def _reserve_temp_zip_path():
    """Create an empty temporary .zip file and return its path (handle closed)."""
    with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp_file:
        return tmp_file.name


def _remove_quietly(path):
    """Delete *path* if it was created, ignoring filesystem errors."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def setup_elasticsearch_data():
    """Download and load sales data into Elasticsearch index.

    Downloads the sample zip, reads ``mapping.json`` and ``documents.json``
    from it, recreates the ``sales-records-consolidated`` index and bulk
    inserts the documents.

    Returns:
        True when the index contains documents afterwards, False otherwise
        (including on any error, which is printed).
    """
    print("🔧 Setting up Elasticsearch sales data...")

    index_name = "sales-records-consolidated"
    sales_data_url = "https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/sales_records_consolidated.zip"
    tmp_path = None  # set once the temp file exists, so cleanup never hits an unbound name

    try:
        es = Elasticsearch(
            ELASTICSEARCH_HOST,
            api_key=ELASTICSEARCH_API_KEY,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )

        # Close the temp handle before download_file reopens the path for writing.
        tmp_path = _reserve_temp_zip_path()
        if not download_file(sales_data_url, tmp_path):
            return False

        with zipfile.ZipFile(tmp_path, 'r') as zipf:
            mapping_data = json.loads(zipf.read('mapping.json').decode('utf-8'))
            documents = json.loads(zipf.read('documents.json').decode('utf-8'))

        # Always delete existing index if present and reload from zip
        if es.indices.exists(index=index_name):
            print(f"🗑️ Deleting existing index '{index_name}' to reload fresh data...")
            es.indices.delete(index=index_name)

        # Prefer the mapping stored under this index name; otherwise fall back
        # to the first mapping in the export.
        if index_name in mapping_data:
            index_mapping = mapping_data[index_name]
        else:
            index_mapping = next(iter(mapping_data.values()))
        es.indices.create(index=index_name, body=index_mapping)
        print(f"🔧 Created index '{index_name}' with mapping")

        # Bulk insert documents
        actions = (
            {"_index": index_name, "_id": doc["_id"], "_source": doc["_source"]}
            for doc in documents
        )
        success_count, failed_items = bulk(es, actions, chunk_size=100)
        print(f"📝 Inserted {success_count} documents")
        if failed_items:
            print(f"⚠️ {len(failed_items)} documents failed to index")

        # Refresh index and verify
        es.indices.refresh(index=index_name)
        count_response = es.count(index=index_name)
        count_data = count_response.body if hasattr(count_response, 'body') else count_response
        doc_count = count_data['count']

        if doc_count > 0:
            print(f"✅ Successfully loaded {doc_count} documents into '{index_name}' index")
            return True

        print(f"❌ Index '{index_name}' is empty after loading")
        return False

    except Exception as e:
        print(f"❌ Error setting up Elasticsearch data: {e}")
        return False

    finally:
        # Clean up temp file
        _remove_quietly(tmp_path)


def _list_bucket_keys(s3, bucket_name):
    """Return every object key in the bucket, following pagination."""
    keys = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys


def _ensure_bucket(s3, bucket_name):
    """Create the bucket when head_bucket reports that it does not exist."""
    try:
        s3.head_bucket(Bucket=bucket_name)
        print(f"📦 Using existing bucket '{bucket_name}'")
        return
    except ClientError as e:
        if e.response['Error']['Code'] not in _MISSING_BUCKET_CODES:
            raise

    print(f"🔧 Creating bucket '{bucket_name}'...")
    try:
        # us-east-1 is the one region that must not be passed as a LocationConstraint.
        if AWS_REGION == "us-east-1":
            s3.create_bucket(Bucket=bucket_name)
        else:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': AWS_REGION}
            )
        print(f"✅ Created bucket '{bucket_name}'")
    except ClientError as create_error:
        if 'BucketAlreadyOwnedByYou' not in str(create_error):
            raise
        print(f"📦 Bucket '{bucket_name}' already exists and is owned by you")


def _clear_bucket(s3, bucket_name):
    """Delete all objects from the bucket; failures are reported but not fatal."""
    print(f"🗑️ Clearing existing files from bucket '{bucket_name}'...")
    try:
        stale_keys = _list_bucket_keys(s3, bucket_name)
        if not stale_keys:
            print("📁 Bucket was already empty")
            return
        for start in range(0, len(stale_keys), _S3_DELETE_BATCH_SIZE):
            batch = stale_keys[start:start + _S3_DELETE_BATCH_SIZE]
            s3.delete_objects(
                Bucket=bucket_name,
                Delete={'Objects': [{'Key': key} for key in batch]}
            )
        print(f"🗑️ Deleted {len(stale_keys)} existing files")
    except ClientError as e:
        print(f"⚠️ Could not clear bucket (continuing anyway): {e}")


def _upload_pdfs_from_zip(s3, bucket_name, zip_path):
    """Upload every .pdf member of the zip to the bucket; return the number uploaded."""
    uploaded_count = 0
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        pdf_files = [f for f in zipf.namelist() if f.lower().endswith('.pdf')]
        print(f"📊 Found {len(pdf_files)} PDF files in zip")

        for file_name in pdf_files:
            try:
                s3.put_object(
                    Bucket=bucket_name,
                    Key=file_name,
                    Body=zipf.read(file_name),
                    ContentType='application/pdf'
                )
                print(f" 📤 Uploaded: {file_name}")
                uploaded_count += 1
            except Exception as e:
                # One bad file should not abort the remaining uploads.
                print(f" ❌ Failed to upload {file_name}: {e}")
    return uploaded_count


def setup_s3_data():
    """Download and load PDF files into S3 bucket.

    When the configured bucket already holds objects it is left untouched.
    Otherwise the sample zip is downloaded, the bucket is created if needed,
    and every PDF in the zip is uploaded.

    Returns:
        True when the bucket contains files afterwards, False otherwise
        (including on any error, which is printed).
    """
    print("🔧 Setting up S3 PDF data...")

    s3_data_url = "https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/s3_pdfs.zip"
    tmp_path = None  # set once the temp file exists, so cleanup never hits an unbound name

    try:
        # Initialize S3 client
        s3 = boto3.client(
            's3',
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            region_name=AWS_REGION
        )

        bucket_name = S3_SOURCE_BUCKET
        if not bucket_name:
            print("❌ S3_SOURCE_BUCKET not configured")
            return False

        # Check if bucket exists and has data. A missing bucket is expected
        # here and simply means there is nothing to reuse.
        try:
            existing_keys = _list_bucket_keys(s3, bucket_name)
        except ClientError as e:
            if e.response['Error']['Code'] not in _MISSING_BUCKET_CODES:
                raise
            existing_keys = []
        if existing_keys:
            print(f"✅ Bucket '{bucket_name}' already exists with {len(existing_keys)} files")
            return True

        # Download S3 PDFs zip file (handle closed before download_file reopens the path)
        tmp_path = _reserve_temp_zip_path()
        if not download_file(s3_data_url, tmp_path):
            return False

        _ensure_bucket(s3, bucket_name)
        _clear_bucket(s3, bucket_name)
        uploaded_count = _upload_pdfs_from_zip(s3, bucket_name, tmp_path)

        # Verify upload
        actual_count = len(_list_bucket_keys(s3, bucket_name))
        if actual_count > 0:
            print(f"✅ Successfully uploaded {uploaded_count} PDFs to bucket '{bucket_name}'")
            print(f"📊 Bucket now contains {actual_count} files")
            return True

        print(f"❌ Bucket '{bucket_name}' is empty after upload")
        return False

    except NoCredentialsError:
        print("❌ AWS credentials not found. Please check your .env file.")
        return False
    except Exception as e:
        print(f"❌ Error setting up S3 data: {e}")
        return False

    finally:
        # Clean up temp file
        _remove_quietly(tmp_path)


def prepare_data_sources():
    """Prepare both Elasticsearch and S3 data sources.

    Returns:
        True when both sources were set up, False as soon as one fails.
    """
    print("🚀 Preparing data sources...")
    print("=" * 50)

    # Elasticsearch first; S3 is only attempted once it succeeded.
    if not setup_elasticsearch_data():
        print("❌ Failed to setup Elasticsearch data")
        return False

    print()  # Add spacing

    if not setup_s3_data():
        print("❌ Failed to setup S3 data")
        return False

    print()
    print("✅ All data sources prepared successfully!")
    print("=" * 50)
    return True
+ ] + }, + { + "cell_type": "markdown", + "id": "4dcfda60", + "metadata": {}, + "source": [ + "![product-manual-example]()" + ] + }, + { + "cell_type": "markdown", + "id": "c4eb1e31", + "metadata": {}, + "source": [ + "## Elasticsearch Source Connector\n", + "\n", + "Next, we'll connect to your Elasticsearch index containing structured sales data, completing our dual-source setup." + ] + }, + { + "cell_type": "markdown", + "id": "72193ddf", + "metadata": {}, + "source": [ + "### Sales Records Data Structure\n", + "\n", + "The image below shows the structure of the consolidated sales records in your Elasticsearch index. This data represents customer transactions and will be processed alongside the product manuals to create a unified knowledge base." + ] + }, + { + "cell_type": "markdown", + "id": "48354f9b", + "metadata": {}, + "source": [ + "![sales-records-consolidated]()" + ] + }, + { + "cell_type": "markdown", + "id": "c27785eb", + "metadata": {}, + "source": [ + "## Elasticsearch Destination Connector\n", + "\n", + "Finally, we'll create the destination where both data streams will converge: the unified `customer-support` index where all processed data will be stored." 
def _normalize_s3_remote_url(value):
    """Convert a bucket name, s3:// URL, or http(s):// S3 URL to an ``s3://.../`` URL.

    Handles both virtual-hosted (``bucket.s3.<region>.amazonaws.com/prefix``)
    and path-style (``s3.<region>.amazonaws.com/bucket/prefix``) HTTPS URLs.
    The result always ends with a trailing slash.

    Raises:
        ValueError: when a path-style URL contains no bucket segment.
    """
    value = value.strip()

    if value.startswith("s3://"):
        return value if value.endswith("/") else value + "/"

    if value.startswith(("http://", "https://")):
        parsed = urlparse(value)
        host = parsed.netloc
        path = parsed.path or "/"
        if host.startswith(("s3.", "s3-")):
            # Path-style URL: the host is the S3 endpoint itself and the
            # bucket is the first path segment.
            bucket, _, remainder = path.lstrip("/").partition("/")
            if not bucket:
                raise ValueError(f"No bucket name found in S3 URL: {value}")
            path = "/" + remainder
        else:
            # Virtual-hosted URL: the bucket is the host part before ".s3.".
            bucket = host.split(".s3.")[0]
        if not path.endswith("/"):
            path += "/"
        return f"s3://{bucket}{path}"

    return f"s3://{value if value.endswith('/') else value + '/'}"


def create_s3_source_connector():
    """Create an S3 source connector for PDF documents.

    Reads the bucket location from ``S3_SOURCE_BUCKET`` (bucket name, s3://
    URL, or https:// URL) and registers a recursive S3 source.

    Returns:
        The new source connector ID, or None when creation failed (the error
        is printed).
    """
    try:
        if not S3_SOURCE_BUCKET:
            raise ValueError("S3_SOURCE_BUCKET is required (bucket name, s3:// URL, or https:// URL)")
        s3_style = _normalize_s3_remote_url(S3_SOURCE_BUCKET)

        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.sources.create_source(
                request=CreateSourceRequest(
                    create_source_connector=CreateSourceConnector(
                        # Timestamped like the sibling connectors so repeated
                        # runs get distinct, non-empty names.
                        name=f"s3_pdf_source_{int(time.time())}",
                        type="s3",
                        config={
                            "remote_url": s3_style,
                            "recursive": True,
                            "key": AWS_ACCESS_KEY_ID,
                            "secret": AWS_SECRET_ACCESS_KEY,
                        }
                    )
                )
            )

        source_id = response.source_connector_information.id
        print(f"✅ Created S3 PDF source connector: {source_id} -> {s3_style}")
        return source_id

    except Exception as e:
        print(f"❌ Error creating S3 source connector: {e}")
        return None


def create_elasticsearch_source_connector():
    """Create an Elasticsearch source connector for sales data.

    Returns:
        The new source connector ID, or None when creation failed (the error
        is printed).
    """
    try:
        connector = CreateSourceConnector(
            name=f"elasticsearch_sales_source_{int(time.time())}",
            type="elasticsearch",
            config={
                "hosts": [ELASTICSEARCH_HOST],
                "es_api_key": ELASTICSEARCH_API_KEY,
                "index_name": ELASTICSEARCH_INDEX
            }
        )
        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.sources.create_source(
                request=CreateSourceRequest(create_source_connector=connector)
            )

        source_id = response.source_connector_information.id
        print(f"✅ Created Elasticsearch sales source connector: {source_id}")
        return source_id

    except Exception as e:
        print(f"❌ Error creating Elasticsearch source connector: {e}")
        return None


def create_elasticsearch_destination_connector():
    """Create an Elasticsearch destination connector for processed results.

    The destination writes to the ``customer-support`` index.

    Returns:
        The new destination connector ID, or None when creation failed (the
        error is printed).
    """
    try:
        connector = CreateDestinationConnector(
            name=f"elasticsearch_customer_support_destination_{int(time.time())}",
            type="elasticsearch",
            config={
                "hosts": [ELASTICSEARCH_HOST],
                "es_api_key": ELASTICSEARCH_API_KEY,
                "index_name": "customer-support"
            }
        )
        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.destinations.create_destination(
                request=CreateDestinationRequest(create_destination_connector=connector)
            )

        destination_id = response.destination_connector_information.id
        print(f"✅ Created Elasticsearch destination connector: {destination_id}")
        return destination_id

    except Exception as e:
        print(f"❌ Error creating Elasticsearch destination connector: {e}")
        return None
def create_workflow_nodes():
    """Build the four processing nodes shared by both workflows.

    Returns:
        A tuple ``(partitioner, chunker, embedder, ner)`` of WorkflowNode
        objects, in pipeline order.
    """
    partitioner = WorkflowNode(
        name="VLM_Partitioner",
        subtype="vlm",
        type="partition",
        settings={"provider": "openai", "model": "gpt-4o"},
    )
    chunker = WorkflowNode(
        name="Chunker_Node",
        subtype="chunk_by_title",
        type="chunk",
        settings={"new_after_n_chars": 1500, "max_characters": 2048, "overlap": 0},
    )
    embedder = WorkflowNode(
        name="Embedder_Node",
        subtype="openai",
        type="embed",
        settings={"model_name": "text-embedding-3-small"},
    )
    ner = WorkflowNode(
        name="NER_Enrichment",
        type="prompter",
        subtype="openai_ner",
        settings={},
    )
    return partitioner, chunker, embedder, ner


def _submit_workflow(name_prefix, source_id, destination_id, nodes):
    """Register one custom workflow and return the ID assigned to it."""
    with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
        definition = CreateWorkflow(
            name=f"{name_prefix}_{int(time.time())}",
            source_id=source_id,
            destination_id=destination_id,
            workflow_type=WorkflowType.CUSTOM,
            workflow_nodes=list(nodes),
        )
        response = client.workflows.create_workflow(
            request=CreateWorkflowRequest(create_workflow=definition)
        )
        return response.workflow_information.id


def create_parallel_workflows(s3_source_id, elasticsearch_source_id, destination_id):
    """Create one workflow per data source, both writing to the same destination.

    The S3 workflow is only created when ``s3_source_id`` is truthy; the
    Elasticsearch workflow is always created.

    Returns:
        ``(s3_workflow_id, es_workflow_id)``; the first element is None when
        the S3 workflow was skipped. ``(None, None)`` when any step raised
        (the error is printed).
    """
    try:
        pipeline_nodes = create_workflow_nodes()

        # Workflow for the S3 PDFs (optional)
        s3_workflow_id = None
        if s3_source_id:
            s3_workflow_id = _submit_workflow(
                "S3-PDFs-Parallel-Workflow", s3_source_id, destination_id, pipeline_nodes
            )
            print(f"✅ Created S3 PDF workflow: {s3_workflow_id}")

        # Workflow for the Elasticsearch sales data
        es_workflow_id = _submit_workflow(
            "Elasticsearch-Sales-Parallel-Workflow",
            elasticsearch_source_id,
            destination_id,
            pipeline_nodes,
        )
        print(f"✅ Created Elasticsearch sales workflow: {es_workflow_id}")

        return s3_workflow_id, es_workflow_id

    except Exception as e:
        print(f"❌ Error creating parallel workflows: {e}")
        return None, None
def run_workflow(workflow_id, workflow_name):
    """Run a workflow and return job information.

    Args:
        workflow_id: ID of the workflow to start.
        workflow_name: Human-readable label used in status messages.

    Returns:
        The ID of the started job, or None when the request failed (the
        error is printed).
    """
    try:
        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.workflows.run_workflow(
                request={"workflow_id": workflow_id}
            )

        job_id = response.job_information.id
        print(f"✅ Started {workflow_name} job: {job_id}")
        return job_id

    except Exception as e:
        print(f"❌ Error running {workflow_name} workflow: {e}")
        return None


def poll_job_status(job_id, job_name, wait_time=30, max_consecutive_errors=10):
    """Poll job status until completion.

    Args:
        job_id: ID of the job to monitor. A falsy ID returns immediately.
        job_name: Human-readable label used in status messages.
        wait_time: Seconds to sleep between polls.
        max_consecutive_errors: Give up after this many polling errors in a
            row. Pass None to retry forever.

    Returns:
        The job information object once the job leaves the SCHEDULED /
        IN_PROGRESS states, or None when there was nothing to poll or
        polling was abandoned after repeated errors.
    """
    if not job_id:
        # Without an ID every request would fail, so looping is pointless.
        print(f"❌ No job ID provided for {job_name}; nothing to monitor")
        return None

    print(f"⏳ Monitoring {job_name} job status...")
    consecutive_errors = 0

    while True:
        try:
            with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
                response = client.jobs.get_job(
                    request={"job_id": job_id}
                )
            job = response.job_information
            status = job.status
        except Exception as e:
            consecutive_errors += 1
            print(f"❌ Error polling {job_name} job status: {e}")
            if max_consecutive_errors is not None and consecutive_errors >= max_consecutive_errors:
                print(f"❌ Giving up on {job_name} job after {consecutive_errors} consecutive polling errors")
                return None
            time.sleep(wait_time)
            continue

        consecutive_errors = 0  # a successful poll resets the error budget

        if status in ["SCHEDULED", "IN_PROGRESS"]:
            print(f"⏳ {job_name} job status: {status}")
            time.sleep(wait_time)
            continue

        if status == "COMPLETED":
            print(f"✅ {job_name} job completed successfully!")
        elif status == "FAILED":
            print(f"❌ {job_name} job failed!")
        else:
            print(f"❓ Unknown {job_name} job status: {status}")
        return job


def run_elasticsearch_preprocessing():
    """Check and manage Elasticsearch indices for the pipeline.

    Verifies that the ``sales-records-consolidated`` index exists and holds
    documents, then drops and recreates the ``customer-support`` index.

    Returns:
        True when preprocessing finished, False when the source index is
        missing or empty or when any Elasticsearch call failed (the reason
        is printed).
    """
    print("🔧 Running Elasticsearch preprocessing...")

    try:
        es = Elasticsearch(
            ELASTICSEARCH_HOST,
            api_key=ELASTICSEARCH_API_KEY,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )

        sales_index = "sales-records-consolidated"
        print(f"🔍 Checking {sales_index} index...")

        if not es.indices.exists(index=sales_index):
            raise ValueError(f"❌ Index '{sales_index}' does not exist. There is no data to use.")

        doc_count = es.count(index=sales_index)['count']
        if doc_count == 0:
            raise ValueError(f"❌ Index '{sales_index}' is empty. There is no data to use.")

        print(f"✅ Found {doc_count} records in {sales_index}")

        # Handle customer-support index: always start from an empty index.
        support_index = "customer-support"
        print(f"🔍 Checking {support_index} index...")

        if es.indices.exists(index=support_index):
            print(f"🗑️ Deleting existing {support_index} index...")
            es.indices.delete(index=support_index)

        print(f"🔧 Creating fresh {support_index} index...")
        mapping = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 1
            },
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "timestamp": {"type": "date"},
                    "text": {"type": "text", "analyzer": "standard"},
                    "metadata": {"type": "object"}
                }
            }
        }

        es.indices.create(index=support_index, body=mapping)
        es.indices.refresh(index=support_index)

        print(f"✅ Successfully created fresh {support_index} index")
        print("✅ Elasticsearch preprocessing completed successfully")
        return True

    except ValueError as e:
        # Validation failures already carry a complete user-facing message.
        print(str(e))
        return False
    except Exception as e:
        print(f"❌ Error during Elasticsearch preprocessing: {e}")
        return False
def print_pipeline_summary(s3_workflow_id, es_workflow_id, s3_job_id, es_job_id):
    """Print comprehensive pipeline summary.

    Args:
        s3_workflow_id: ID of the S3 workflow, or a falsy value when skipped.
        es_workflow_id: ID of the Elasticsearch workflow.
        s3_job_id: ID of the S3 job, or a falsy value when skipped.
        es_job_id: ID of the Elasticsearch job.
    """
    skipped = "SKIPPED"  # placeholder shown for the optional S3 branch

    print("\n" + "=" * 80)
    print("📊 HYBRID RAG PIPELINE SUMMARY")
    print("=" * 80)
    print(f"📁 S3 Source (PDFs): {S3_SOURCE_BUCKET if s3_workflow_id else skipped}")
    print(f"🔍 Elasticsearch Source: {ELASTICSEARCH_HOST}/{ELASTICSEARCH_INDEX}")
    print(f"📤 Elasticsearch Destination: {ELASTICSEARCH_HOST}/customer-support")
    print()
    print(f"⚙️ S3 PDFs Workflow ID: {s3_workflow_id if s3_workflow_id else skipped}")
    print(f"⚙️ Elasticsearch Sales Workflow ID: {es_workflow_id}")
    print()
    print(f"🚀 S3 PDFs Job ID: {s3_job_id if s3_job_id else skipped}")
    print(f"🚀 Elasticsearch Sales Job ID: {es_job_id}")


def _source_connector_key(metadata):
    """Classify a document's metadata into a source-connector label."""
    if "data_source-record_locator-index_name" in metadata:
        # Elasticsearch source connector
        return f"elasticsearch:{metadata['data_source-record_locator-index_name']}"
    if "data_source-url" in metadata:
        # S3 source connector - group all S3 URLs by bucket
        url = metadata['data_source-url']
        if url.startswith('s3://'):
            # 's3://bucket/key'.split('/') -> ['s3:', '', 'bucket', 'key']
            return f"s3:{url.split('/')[2]}"
        return "s3:unknown"
    if "filename" in metadata and metadata.get('filetype') == 'pdf':
        # PDF files from S3 (fallback detection)
        return "s3:pdfs"
    return "unknown"


def verify_customer_support_results(s3_job_id=None, es_job_id=None):
    """
    Verifies the processed results in the customer-support index, prettyprinting one doc per unique source connector.
    Assumes jobs have already completed successfully.

    The job ID arguments are accepted but not used by the checks. Any error
    is printed rather than raised.
    """
    import pprint

    print("🔍 Verifying processed results in 'customer-support' index (assuming jobs have completed)...")

    try:
        # Initialize Elasticsearch client
        es = Elasticsearch(
            ELASTICSEARCH_HOST,
            api_key=ELASTICSEARCH_API_KEY,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )

        index_name = "customer-support"

        # Check if index exists
        if not es.indices.exists(index=index_name):
            print(f"❌ Index '{index_name}' does not exist. Workflows may not have written results yet.")
            return

        # Get document count
        total_docs = es.count(index=index_name)['count']
        print(f"📊 Total processed documents: {total_docs}")

        if total_docs == 0:
            print("⏳ No documents found yet. Workflows may still be processing or index is empty.")
            print("💡 Check the Unstructured dashboard for job status.")
            return

        print(f"\n📋 Analyzing Source Connectors:")
        print("=" * 40)

        # Use function_score with random_score to sample documents randomly
        sample_response = es.search(
            index=index_name,
            body={
                "size": 50,  # Get more samples to increase chance of seeing all sources
                "_source": ["metadata", "text", "element_id"],
                "query": {
                    "function_score": {
                        "query": {"match_all": {}},
                        "random_score": {}
                    }
                }
            }
        )

        # Map: source_connector_key -> first sampled doc for that connector
        source_connector_map = {}
        unknown_docs = []

        for hit in sample_response['hits']['hits']:
            key = _source_connector_key(hit['_source'].get('metadata', {}))
            if key == "unknown":
                unknown_docs.append(hit)
            else:
                source_connector_map.setdefault(key, hit)

        print(f"🔍 Unique source connectors found: {len(source_connector_map)}")
        for i, (key, doc) in enumerate(source_connector_map.items(), 1):
            print(f"\n--- Source Connector {i} ({key}) ---")
            pprint.pprint(doc['_source'], depth=6, compact=False, sort_dicts=False)

        if unknown_docs:
            print(f"\n❓ Example Unknown Source Document:")
            print("-" * 35)
            unknown_example = unknown_docs[0]['_source']
            metadata = unknown_example.get('metadata', {})
            text = unknown_example.get('text', '')
            print(f" Element ID: {unknown_example.get('element_id', 'N/A')}")
            print(f" Metadata: {metadata}")
            print(f" Text Preview: {text[:200]}..." if len(text) > 200 else f" Text: {text}")
            print(" Metadata prettyprint:")
            pprint.pprint(metadata, depth=6, compact=False, sort_dicts=False)

        # Test search functionality
        print(f"\n🔍 Testing Search Functionality:")
        print("=" * 32)

        for search_term in ["manual", "customer", "product", "support"]:
            search_response = es.search(
                index=index_name,
                body={
                    "size": 1,
                    "query": {
                        "match": {
                            "text": search_term
                        }
                    }
                }
            )

            hits = search_response['hits']['total']['value']
            print(f" 🔎 '{search_term}': {hits} matches")

        print(f"\n" + "=" * 50)
        print("🎉 CUSTOMER-SUPPORT INDEX VERIFICATION")
        print("=" * 50)
        print("✅ Index exists and contains processed documents")
        print("✅ Documents from both source connectors are present (if both completed)")
        print("✅ Text search is functional across processed content")
        print("✅ Ready for hybrid RAG queries!")

    except Exception as e:
        print(f"❌ Error verifying results: {e}")
        print("💡 This is normal if workflows are still processing or if there is a connection issue.")


def main():
    """Main pipeline execution

    Runs data preparation, index preprocessing, connector creation, workflow
    creation and workflow start-up in order, stopping at the first failure.

    Returns:
        ``(s3_job_id, es_job_id)``. When a step fails the function returns
        ``(None, None)`` so callers can always unpack the result.
    """
    failed = (None, None)

    print("🚀 Starting Hybrid RAG Pipeline")

    print("\n📦 Step 0: Data source preparation")
    print("-" * 50)

    if not prepare_data_sources():
        print("❌ Failed to prepare data sources")
        return failed

    print("\n🔧 Step 1: Elasticsearch preprocessing")
    print("-" * 50)

    if not run_elasticsearch_preprocessing():
        print("❌ Failed to complete Elasticsearch preprocessing")
        return failed

    print("\n🔗 Step 2: Creating source connectors")
    print("-" * 50)

    s3_source_id = create_s3_source_connector()
    if not s3_source_id:
        print("❌ Failed to create S3 source connector")
        return failed

    elasticsearch_source_id = create_elasticsearch_source_connector()
    if not elasticsearch_source_id:
        print("❌ Failed to create Elasticsearch source connector")
        return failed

    # Step 3: Create Destination Connector
    print("\n🎯 Step 3: Creating Elasticsearch destination connector")
    print("-" * 50)

    destination_id = create_elasticsearch_destination_connector()
    if not destination_id:
        print("❌ Failed to create destination connector")
        return failed

    # Step 4: Create Workflows
    print("\n⚙️ Step 4: Creating workflows")
    print("-" * 50)

    s3_workflow_id, es_workflow_id = create_parallel_workflows(
        s3_source_id, elasticsearch_source_id, destination_id
    )

    if not es_workflow_id:
        print("❌ Failed to create Elasticsearch workflow")
        return failed

    # Step 5: Run Workflows
    print("\n🚀 Step 5: Running workflows")
    print("-" * 50)

    s3_job_id = None
    if s3_workflow_id:
        s3_job_id = run_workflow(s3_workflow_id, "S3 PDFs")
        if not s3_job_id:
            print("❌ Failed to start S3 workflow")
            return failed

    es_job_id = run_workflow(es_workflow_id, "Elasticsearch Sales")
    if not es_job_id:
        print("❌ Failed to start Elasticsearch workflow")
        return failed

    # Step 6: Pipeline Summary
    print_pipeline_summary(s3_workflow_id, es_workflow_id, s3_job_id, es_job_id)
    return s3_job_id, es_job_id
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61a811ce", + "metadata": {}, + "outputs": [], + "source": [ + "s3_job_id, es_job_id = main()\n", + "\n", + "es_job_info = poll_job_status(es_job_id, \"Elasticsearch Ingest\")\n", + "s3_job_info = poll_job_status(s3_job_id, \"S3 Ingest\")\n", + "print(\"\\n🔍 Verifying processed results\")\n", + "print(\"-\" * 50)\n", + "verify_customer_support_results()" + ] + }, + { + "cell_type": "markdown", + "id": "41b136bb", + "metadata": {}, + "source": [ + "### Unified Knowledge Base Results\n", + "\n", + "After processing both data sources, the pipeline creates a unified `customer-support` index containing processed documents from both S3 PDFs and Elasticsearch sales records. The image below shows the structure of this consolidated knowledge base, ready for RAG queries." + ] + }, + { + "cell_type": "markdown", + "id": "d2826232", + "metadata": {}, + "source": [ + "![customer-support]()" + ] + }, + { + "cell_type": "markdown", + "id": "39611edf", + "metadata": {}, + "source": [ + "## RAG Query Demonstration\n", + "\n", + "Now that your hybrid knowledge base is ready, we'll demonstrate how to query it using RAG (Retrieval-Augmented Generation). This is where you'll see how the system can answer complex questions by pulling relevant information from both your S3 documents and Elasticsearch records.\n", + "\n", + "### OpenAI API Key Required\n", + "\n", + "For the RAG demonstration, you'll need an OpenAI API key to power the language model that generates answers based on your retrieved documents. Visit https://platform.openai.com/api-keys to sign in or create an account and generate a new API key.\n", + "\n", + "The demonstration will show cross-source querying, source attribution, and semantic understanding as your hybrid RAG system answers questions by combining information from multiple data sources." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7d207715", + "metadata": {}, + "source": [ + "### RAG Configuration\n", + "\n", + "**Instructions**: Paste your OpenAI API key below to enable RAG demonstrations. This key will be used to power the language model that generates answers based on your retrieved documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfb47ebf", + "metadata": {}, + "outputs": [], + "source": [ + "# RAG Demonstration Configuration and Queries\n", + "import os\n", + "import json\n", + "\n", + "# LangChain imports for RAG functionality\n", + "from langchain_elasticsearch import ElasticsearchStore\n", + "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "\n", + "print(\"🤖 RAG Query Demonstration Setup\")\n", + "print(\"=\" * 40)\n", + "\n", + "if not OPENAI_API_KEY or OPENAI_API_KEY.startswith(\"your-\"):\n", + " print(\"⚠️ OpenAI API key not configured.\")\n", + " print(\"💡 Please set OPENAI_API_KEY in your .env file with your actual OpenAI API key.\")\n", + " print(\"📝 You can get one at: https://platform.openai.com/api-keys\")\n", + "else:\n", + " print(\"✅ OpenAI API key configured for RAG demonstrations\")\n", + "\n", + "def setup_rag_system():\n", + " \"\"\"Set up the RAG system with Elasticsearch and OpenAI.\"\"\"\n", + " \n", + " if not OPENAI_API_KEY or OPENAI_API_KEY.startswith(\"your-\"):\n", + " print(\"❌ OpenAI API key is required for RAG functionality\")\n", + " print(\"Please set OPENAI_API_KEY in your .env file\")\n", + " return None\n", + " \n", + " # Set OpenAI API key for LangChain\n", + " os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n", + " \n", + " try:\n", + " print(\"🔧 Setting up RAG components...\")\n", + " \n", + " # Initialize embeddings (same model used in processing)\n", + " 
# Maps the upper-cased NER label stored in document metadata to the key of
# the result bucket it is collected under. Insertion order defines the key
# order of the dictionary returned by extract_ner_entities().
_ENTITY_BUCKETS = {
    "PERSON": "people",
    "ORGANIZATION": "organizations",
    "PRODUCT": "products",
    "LOCATION": "locations",
    "DATE": "dates",
}


def extract_ner_entities(docs):
    """Collect named entities from the ``entities-items`` metadata of *docs*.

    Args:
        docs: Iterable of objects exposing a ``metadata`` mapping. The
            ``entities-items`` value may be a list of dicts or a JSON string
            encoding such a list; each dict carries a ``type`` label and an
            ``entity`` name.

    Returns:
        dict: Keys ``people``, ``organizations``, ``products``, ``locations``
        and ``dates``, each mapped to a set of entity names. Labels outside
        the five known types, empty names, non-dict items and metadata that
        is not valid JSON are skipped without affecting the remaining items.
    """
    import json  # function-scope import so the helper works on its own

    entities = {bucket: set() for bucket in _ENTITY_BUCKETS.values()}

    for doc in docs:
        metadata = doc.metadata
        if "entities-items" not in metadata:
            continue

        items = metadata["entities-items"]
        if isinstance(items, str):
            try:
                items = json.loads(items)
            except json.JSONDecodeError:
                # Unparseable metadata for this document only; keep going.
                continue
        if not isinstance(items, (list, tuple)):
            continue

        for item in items:
            if not isinstance(item, dict):
                continue
            bucket = _ENTITY_BUCKETS.get(str(item.get("type", "")).upper())
            name = item.get("entity", "")
            # Require a non-empty string so blank names never reach the
            # printed summary and unhashable values cannot raise.
            if bucket and isinstance(name, str) and name:
                entities[bucket].add(name)

    return entities


def analyze_sources(docs):
    """Split retrieved documents by the data source they originated from.

    Args:
        docs: Iterable of objects exposing a ``metadata`` mapping.

    Returns:
        tuple: ``(s3_docs, es_docs, unknown_docs)`` lists. A document counts
        as Elasticsearch when its metadata has the
        ``data_source-record_locator-index_name`` key, as S3 when its
        ``data_source-url`` is a string containing ``s3://``, and as unknown
        otherwise (including a missing or non-string URL).
    """
    s3_docs, es_docs, unknown_docs = [], [], []

    for doc in docs:
        metadata = doc.metadata
        if "data_source-record_locator-index_name" in metadata:
            es_docs.append(doc)
            continue

        url = metadata.get("data_source-url")
        # A present-but-None URL must not raise on the substring test.
        if isinstance(url, str) and "s3://" in url:
            s3_docs.append(doc)
        else:
            unknown_docs.append(doc)

    return s3_docs, es_docs, unknown_docs
sources\n", + " hybrid_queries = [\n", + " {\n", + " \"query\": \"How do I troubleshoot Bose headphone connectivity issues?\",\n", + " \"description\": \"Product support query targeting S3 PDFs (product manuals)\",\n", + " \"expected_source\": \"S3 (Product Docs)\"\n", + " },\n", + " {\n", + " \"query\": \"Tell me about Daniel Hahn and his purchases\",\n", + " \"description\": \"Customer analysis query targeting Elasticsearch (sales data)\",\n", + " \"expected_source\": \"Elasticsearch (Sales)\"\n", + " },\n", + " {\n", + " \"query\": \"What are the technical specifications for SoundSport Wireless headphones?\",\n", + " \"description\": \"Product specification query targeting S3 PDFs\",\n", + " \"expected_source\": \"S3 (Product Docs)\"\n", + " },\n", + " {\n", + " \"query\": \"Show me customers in San Antonio, TX\",\n", + " \"description\": \"Geographic customer query targeting Elasticsearch\",\n", + " \"expected_source\": \"Elasticsearch (Sales)\"\n", + " },\n", + " {\n", + " \"query\": \"How do I reset wireless headphones to factory settings?\",\n", + " \"description\": \"Technical support query targeting S3 PDFs\",\n", + " \"expected_source\": \"S3 (Product Docs)\"\n", + " },\n", + " {\n", + " \"query\": \"What products does Newegg sell and what are their features?\",\n", + " \"description\": \"Hybrid query targeting BOTH sources (sales + product specs)\",\n", + " \"expected_source\": \"Both S3 and Elasticsearch\"\n", + " },\n", + " {\n", + " \"query\": \"I have a customer who bought Bose headphones and is having connectivity issues. 
What should I tell them?\",\n", + " \"description\": \"Customer support query requiring BOTH customer data AND product manuals\",\n", + " \"expected_source\": \"Both S3 and Elasticsearch\"\n", + " }\n", + " ]\n", + " \n", + " print(\"\\n🧠 Hybrid NER-Enhanced RAG Demonstration\")\n", + " print(\"=\" * 60)\n", + " \n", + " for i, query_info in enumerate(hybrid_queries, 1):\n", + " query = query_info[\"query\"]\n", + " description = query_info[\"description\"]\n", + " expected_source = query_info[\"expected_source\"]\n", + " \n", + " print(f\"\\n{'='*70}\")\n", + " print(f\"Query {i}: {description}\")\n", + " print(f\"📝 Query: {query}\")\n", + " print(f\"🎯 Expected Source: {expected_source}\")\n", + " print(\"=\" * 70)\n", + " \n", + " try:\n", + " # Retrieve documents\n", + " docs = retriever.invoke(query)\n", + " \n", + " if not docs:\n", + " print(\"❌ No documents retrieved\")\n", + " continue\n", + " \n", + " # Analyze sources (keeping your preferred format)\n", + " s3_docs, es_docs, unknown_docs = analyze_sources(docs)\n", + " print(f\"📊 Retrieved {len(docs)} documents:\")\n", + " print(f\" 📄 S3 (Product Docs): {len(s3_docs)}\")\n", + " print(f\" 📊 Elasticsearch (Sales): {len(es_docs)}\")\n", + " print(f\" ❓ Unknown: {len(unknown_docs)}\")\n", + " \n", + " # Check if we hit the expected source\n", + " if expected_source == \"S3 (Product Docs)\" and len(s3_docs) > 0:\n", + " print(\"✅ SUCCESS: Retrieved from expected S3 source!\")\n", + " elif expected_source == \"Elasticsearch (Sales)\" and len(es_docs) > 0:\n", + " print(\"✅ SUCCESS: Retrieved from expected Elasticsearch source!\")\n", + " elif expected_source == \"Both S3 and Elasticsearch\" and len(s3_docs) > 0 and len(es_docs) > 0:\n", + " print(\"✅ SUCCESS: Retrieved from BOTH sources as expected!\")\n", + " elif expected_source.startswith(\"Both\") and (len(s3_docs) > 0 or len(es_docs) > 0):\n", + " print(\"✅ PARTIAL: Retrieved from at least one expected source\")\n", + " else:\n", + " print(\"⚠️ 
UNEXPECTED: Did not retrieve from expected source\")\n", + " \n", + " # Extract and show NER entities\n", + " entities = extract_ner_entities(docs)\n", + " print(f\"\\n🏷️ NER Entities Found:\")\n", + " if entities[\"people\"]:\n", + " print(f\" 👤 People: {', '.join(list(entities['people'])[:3])}\")\n", + " if entities[\"organizations\"]:\n", + " print(f\" 🏢 Organizations: {', '.join(list(entities['organizations'])[:3])}\")\n", + " if entities[\"products\"]:\n", + " print(f\" 📱 Products: {', '.join(list(entities['products'])[:3])}\")\n", + " if entities[\"locations\"]:\n", + " print(f\" 🗺️ Locations: {', '.join(list(entities['locations'])[:3])}\")\n", + " if entities[\"dates\"]:\n", + " print(f\" 📅 Dates: {', '.join(list(entities['dates'])[:3])}\")\n", + " \n", + " # Generate answer\n", + " print(f\"\\n💬 Answer:\")\n", + " answer = rag_chain.invoke(query)\n", + " print(f\"{answer}\")\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Error: {e}\")\n", + " if \"429\" in str(e):\n", + " print(\"⚠️ OpenAI API quota exceeded. 
def run_rag_demonstration():
    """Build the RAG components and, if that succeeds, run the demo queries.

    Prints a banner, calls ``setup_rag_system()``, and passes its result to
    ``demonstrate_hybrid_ner_queries()``. When setup returns a falsy value
    the demonstration is skipped and a notice is printed instead.
    """
    print("\n🚀 Starting Hybrid RAG Demonstration")
    print("=" * 50)

    components = setup_rag_system()

    # Guard clause: nothing to demonstrate without a configured RAG stack.
    if not components:
        print("❌ RAG demonstration skipped due to configuration issues")
        return

    demonstrate_hybrid_ner_queries(components)
recommendation systems. Add more data sources using additional workflows, implement real-time data synchronization, or scale up for production data volumes with monitoring and alerting.\n", + "\n", + "### Try Unstructured Today\n", + "\n", + "Ready to build your own hybrid RAG system? [Sign up for a free trial](https://unstructured.io/?modal=try-for-free) and start transforming your enterprise data into intelligent, searchable knowledge.\n", + "\n", + "**Need help getting started?** Contact our team to schedule a demo and see how Unstructured can solve your specific data challenges." + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "executable": "/usr/bin/env python3", + "main_language": "python", + "notebook_metadata_filter": "-all" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hybrid_rag_pipeline_enriched.py b/hybrid_rag_pipeline_enriched.py new file mode 100644 index 0000000..0c96fa7 --- /dev/null +++ b/hybrid_rag_pipeline_enriched.py @@ -0,0 +1,1512 @@ +#!/usr/bin/env python3 +# %% [markdown] +# # Building a Hybrid RAG System: From Fragmented Data to Unified Intelligence +# +# Picture this: You're a customer support agent, and a customer calls about a product issue. To help them effectively, you need to pull information from multiple sources - product manuals stored as PDFs in cloud storage, their purchase history in your sales database, and previous support interactions scattered across different systems. Each piece of information lives in a different format, in a different system, with different access methods. +# +# This is the reality for most enterprises today, and it's exactly the challenge we're going to solve together. +# +# ## The Enterprise Data Challenge +# +# Enterprise data rarely lives in one place or format. 
Critical information is fragmented across unstructured documents like PDFs and manuals in cloud storage, structured records like sales data in databases, and different formats requiring different processing approaches. Traditional RAG systems work well with homogeneous data but struggle when you need to query across diverse data sources simultaneously. +# +# ## Why This Matters +# +# When data is scattered, customer support becomes inefficient, decision-making lacks context, and valuable insights remain hidden. A customer asking about a product issue shouldn't require you to manually search through multiple systems to piece together a complete picture. +# +# ## The Solution: Unstructured's Complete Gen AI Data Layer +# +# Unstructured isn't just another data processing tool—it's a complete Gen AI data layer solution that transforms how organizations handle unstructured data at scale. Unlike building custom solutions or using fragmented tools, Unstructured provides a unified platform that connects to 30+ data sources, processes 65+ file types with intelligent partitioning and chunking, automatically enriches content with metadata and context, and delivers to 30+ destinations—all while maintaining enterprise-grade security and compliance. +# +# The platform eliminates the complexity of managing multiple tools, custom integrations, and manual data preparation, allowing teams to focus on building AI applications rather than wrestling with data infrastructure. With flexible deployment options from SaaS to bare metal, Unstructured adapts to any infrastructure while providing the observability, automation, and reliability that enterprise AI projects demand. +# +# ## What We'll Build Together +# +# In this tutorial, we'll create a hybrid RAG system that processes two different data sources simultaneously: product documentation from S3 and sales records from Elasticsearch. 
Both will flow through the same intelligent processing pipeline and land in a unified, searchable knowledge base. +# +# ``` +# ┌─────────────────┐ ┌─────────────────────────┐ +# │ S3 PDFs │──── WORKFLOW 1 ──────────▶│ │ +# │ (Product Docs) │ │ Unstructured API │ +# └─────────────────┘ │ │ +# │ Partition → Chunk → │ +# ┌─────────────────┐ │ Embed → NER → Store │ +# │ Elasticsearch │──── WORKFLOW 2 ──────────▶│ │ +# │ (Sales Records) │ │ │ +# └─────────────────┘ └────────────┬────────────┘ +# │ +# ┌────────────▼────────────┐ +# │ customer-support │ +# │ (Unified Index) │ +# └─────────────────────────┘ +# ``` +# +# By the end of this tutorial, you'll have a working system that can answer complex questions by pulling information from both your product documentation and customer data simultaneously. + +# %% [markdown] +# ## Getting Started: Your Unstructured API Key +# +# To follow along with this tutorial, you'll need an Unstructured API key. This gives you access to the complete Gen AI data layer that will process your documents and create your unified knowledge base. +# +# ### Sign Up and Get Your API Key +# +# Visit https://platform.unstructured.io to sign up for a free account, navigate to API Keys in the sidebar, generate your API key, and save it for the configuration step below. For Team or Enterprise accounts, make sure you've selected the correct organizational workspace before creating your API key. +# +# **Need help?** Contact Unstructured Support at support@unstructured.io + +# %% [markdown] +# ## Configuration: Setting Up Your Environment +# +# Now we'll configure your environment with the necessary API keys and credentials. This step ensures your system can connect to all the data sources and services we'll be using. + +# %% [markdown] +# ### Creating a .env File in Google Colab +# +# For better security and organization, we'll create a `.env` file directly in your Colab environment. 
def create_dotenv_file(path='.env', *, overwrite=False):
    """Create a .env file with placeholder values for the user to fill in.

    Args:
        path: Location of the file to create. Defaults to ``.env`` in the
            current working directory.
        overwrite: When false (the default) an existing file is left
            untouched, so re-running the cell cannot replace credentials the
            user has already entered. Pass ``True`` to reset the file to the
            placeholder template.

    Returns:
        None. Progress is reported on stdout.
    """
    env_content = """# Hybrid RAG Pipeline Environment Configuration
# Fill in your actual values below
# Configuration - Set these explicitly

# ===================================================================
# AWS CONFIGURATION
# ===================================================================
AWS_ACCESS_KEY_ID="your-aws-access-key-id"
AWS_SECRET_ACCESS_KEY="your-aws-secret-access-key"
AWS_REGION="us-east-1"

# ===================================================================
# UNSTRUCTURED API CONFIGURATION
# ===================================================================
UNSTRUCTURED_API_KEY="your-unstructured-api-key"
UNSTRUCTURED_API_URL="https://platform.unstructuredapp.io/api/v1"

# ===================================================================
# ELASTICSEARCH CONFIGURATION
# ===================================================================
ELASTICSEARCH_HOST="https://your-cluster.es.io:443"
ELASTICSEARCH_API_KEY="your-elasticsearch-api-key"

# ===================================================================
# PIPELINE DATA SOURCES
# ===================================================================
S3_SOURCE_BUCKET="your-s3-source-bucket"
S3_DESTINATION_BUCKET="your-s3-destination-bucket"
S3_OUTPUT_PREFIX=""
ELASTICSEARCH_INDEX="sales-records-consolidated"

# ===================================================================
# OPENAI API CONFIGURATION
# ===================================================================
OPENAI_API_KEY="your-openai-api-key"
"""

    # 'x' is exclusive-create: the existence check and the creation happen in
    # one step, so an existing file is never truncated by accident.
    mode = 'w' if overwrite else 'x'
    try:
        with open(path, mode, encoding='utf-8') as f:
            f.write(env_content)
    except FileExistsError:
        print(f"ℹ️ {path} already exists - leaving it unchanged")
        print("📝 Pass overwrite=True to reset it to the placeholder values")
        return

    print("✅ Created .env file with placeholder values")
    print("📝 Please edit the .env file and replace the placeholder values with your actual credentials")
    print("🔒 The .env file will be loaded automatically by the pipeline")
"SKIPPED" +UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL", "https://platform.unstructuredapp.io/api/v1") + +# Get environment variables +UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") +AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") +AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") +AWS_REGION = os.getenv("AWS_REGION", "us-east-1") +S3_SOURCE_BUCKET = os.getenv("S3_SOURCE_BUCKET") +S3_DESTINATION_BUCKET = os.getenv("S3_DESTINATION_BUCKET") +S3_OUTPUT_PREFIX = os.getenv("S3_OUTPUT_PREFIX", "") +ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST") +ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY") +ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX", "sales-records-consolidated") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + +# Validation +REQUIRED_VARS = { + "UNSTRUCTURED_API_KEY": UNSTRUCTURED_API_KEY, + "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID, + "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY, + "ELASTICSEARCH_HOST": ELASTICSEARCH_HOST, + "ELASTICSEARCH_API_KEY": ELASTICSEARCH_API_KEY, + "S3_SOURCE_BUCKET": S3_SOURCE_BUCKET, +} + +missing_vars = [key for key, value in REQUIRED_VARS.items() if not value] +if missing_vars: + print(f"❌ Missing required environment variables: {', '.join(missing_vars)}") + print("Please set these environment variables or create a .env file with your credentials.") + raise ValueError(f"Missing required environment variables: {missing_vars}") + +print("✅ Configuration loaded successfully") + +# %% [markdown] +# ## AWS S3: Your Document Storage +# +# Now that we have our environment configured, let's set up the data sources for our hybrid RAG system. First up: your unstructured documents. These PDFs, manuals, and reports need to be accessible via S3, where your product documentation and other unstructured content lives, waiting to be processed into searchable knowledge. +# +# ### What You Need +# +# **An existing S3 bucket** containing the documents you want to process. 
For this tutorial, we'll use sample product manuals, but in production, this would be your actual business documents. +# +# > **Note**: This tutorial assumes you have an existing S3 bucket with documents. For detailed S3 setup instructions, see the [Unstructured S3 source connector documentation](https://docs.unstructured.io/api-reference/api-services/source-connectors/s3). +# +# You'll need an AWS account with S3 access, an IAM user with S3 read permissions for your bucket, and access keys (Access Key ID and Secret Access Key). + +# %% [markdown] +# ## Elasticsearch: Your Business Data Hub +# +# While S3 holds your unstructured documents, Elasticsearch serves a dual purpose in our pipeline. It's both a source of structured business data (your sales records, customer information) and the destination where our unified, processed results will be stored for RAG queries. +# +# ### What You Need +# +# **Elasticsearch cluster** with API key authentication from Elastic Cloud (managed service). This gives you the reliability and scalability needed for enterprise applications. +# +# The pipeline uses two indices: `sales-records-consolidated` as the source containing your business data, and `customer-support` as the destination for your unified knowledge base. Both are created automatically by the pipeline. +# +# ### Why Consolidated Data Format Matters +# +# Traditional databases store information in separate fields (customer_name, product_id, purchase_date). For RAG applications, we consolidate this into a long-form text field that provides full context in each search result. This approach ensures that when someone searches for "John's headphone purchase," they get the complete story in one result. 
+# +# Example transformation: +# ``` +# Before: {customer: "John Doe", product: "BH-001", date: "2024-01-15"} +# After: "customer: John Doe\nproduct: BH-001\ndate: 2024-01-15" +# ``` +# +# ### API Key Permissions +# +# Your Elasticsearch API key needs these permissions: +# +# ```json +# { +# "sales-records-full-access": { +# "cluster": [], +# "indices": [ +# { +# "names": [ +# "sales-records", +# "sales-records-consolidated", +# "customer-support" +# ], +# "privileges": [ +# "create_index", +# "delete_index", +# "manage", +# "write", +# "read", +# "view_index_metadata", +# "monitor" +# ], +# "allow_restricted_indices": false +# } +# ], +# "applications": [], +# "run_as": [], +# "metadata": {}, +# "transient_metadata": { +# "enabled": true +# } +# } +# } +# ``` +# +# **Don't have Elasticsearch data yet?** The pipeline includes automatic data setup that creates sample sales records for demonstration. It does this by downloading ZIP archives from GitHub and extracting them. + +# %% [markdown] +# ## Data Preparation: Setting Up Your Demo Environment +# +# With our infrastructure configured, let's prepare the actual data that will flow through our hybrid RAG system. For this demonstration, we've created realistic sample data that represents a typical enterprise scenario, giving you a working example without requiring you to set up your own data sources first. +# +# **Elasticsearch Sales Data**: 100 synthetic sales records with customer information, with consolidated fields optimized for vector search. This represents the kind of structured business data you'd find in any enterprise system. +# +# **S3 Product Documentation**: 9 product manuals downloaded from manufacturer websites and stored in your S3 bucket. These represent the unstructured documents that contain critical product information. 
def download_file(url: str, local_path: str, timeout: float = 60.0) -> bool:
    """Download a file from URL to local path.

    Args:
        url: Address to fetch with an HTTP GET.
        local_path: Destination file; missing parent directories are created.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the pipeline forever.

    Returns:
        True when the body was written to *local_path*; False when any step
        raised (the error is printed, not propagated).
    """
    try:
        print(f"📥 Downloading {url}...")

        # The context manager releases the streamed connection even when
        # writing the file fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            Path(local_path).parent.mkdir(parents=True, exist_ok=True)

            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        print(f"✅ Downloaded to {local_path}")
        return True

    except Exception as e:
        print(f"❌ Error downloading {url}: {e}")
        return False
# Error codes botocore reports for a bucket that does not exist: HEAD
# requests surface the bare HTTP status, list operations the S3 error name.
_MISSING_BUCKET_CODES = ("404", "NoSuchBucket")


def _count_bucket_objects(s3, bucket_name):
    """Return how many objects the bucket holds, following list pagination."""
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket_name)
    return sum(len(page.get("Contents", [])) for page in pages)


def _ensure_bucket_exists(s3, bucket_name):
    """Create the bucket in AWS_REGION unless it is already present."""
    try:
        s3.head_bucket(Bucket=bucket_name)
        print(f"📦 Using existing bucket '{bucket_name}'")
        return
    except ClientError as e:
        if e.response['Error']['Code'] not in _MISSING_BUCKET_CODES:
            raise

    print(f"🔧 Creating bucket '{bucket_name}'...")
    try:
        # us-east-1 is the default location and must not be passed as an
        # explicit LocationConstraint.
        if AWS_REGION == "us-east-1":
            s3.create_bucket(Bucket=bucket_name)
        else:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': AWS_REGION}
            )
    except ClientError as create_error:
        if 'BucketAlreadyOwnedByYou' not in str(create_error):
            raise
        print(f"📦 Bucket '{bucket_name}' already exists and is owned by you")
    else:
        print(f"✅ Created bucket '{bucket_name}'")


def _empty_bucket(s3, bucket_name):
    """Delete every object in the bucket and return the number removed."""
    deleted = 0
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket_name)
    for page in pages:
        # One listing page holds at most 1000 keys, which is also the
        # per-request limit of delete_objects.
        keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
        if keys:
            s3.delete_objects(Bucket=bucket_name, Delete={'Objects': keys})
            deleted += len(keys)
    return deleted


def _upload_pdfs_from_zip(s3, bucket_name, zip_path):
    """Upload each ``.pdf`` member of the archive; return the success count."""
    uploaded_count = 0
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        pdf_files = [f for f in zipf.namelist() if f.lower().endswith('.pdf')]
        print(f"📊 Found {len(pdf_files)} PDF files in zip")

        for file_name in pdf_files:
            try:
                s3.put_object(
                    Bucket=bucket_name,
                    Key=file_name,
                    Body=zipf.read(file_name),
                    ContentType='application/pdf'
                )
                print(f"   📤 Uploaded: {file_name}")
                uploaded_count += 1
            except Exception as e:
                # Best effort: one bad file must not abort the other uploads.
                print(f"   ❌ Failed to upload {file_name}: {e}")
    return uploaded_count


def setup_s3_data():
    """Download and load PDF files into S3 bucket.

    If the configured ``S3_SOURCE_BUCKET`` already contains objects it is
    used as-is. Otherwise the sample PDF archive is downloaded, the bucket is
    created when missing, any leftover objects are removed, and every PDF in
    the archive is uploaded.

    Returns:
        True when the bucket holds at least one object afterwards; False on
        missing configuration or credentials, a failed download, an empty
        bucket after upload, or any other error (errors are printed, not
        raised).
    """
    print("🔧 Setting up S3 PDF data...")

    # Tracked outside the try so the finally clause never touches a name
    # that was not assigned.
    zip_path = None

    try:
        # Initialize S3 client
        s3 = boto3.client(
            's3',
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            region_name=AWS_REGION
        )

        bucket_name = S3_SOURCE_BUCKET
        if not bucket_name:
            print("❌ S3_SOURCE_BUCKET not configured")
            return False

        # Check if bucket exists and has data. A missing bucket is expected
        # here and simply means there is nothing to reuse.
        try:
            object_count = _count_bucket_objects(s3, bucket_name)
        except ClientError as e:
            if e.response['Error']['Code'] not in _MISSING_BUCKET_CODES:
                raise
            object_count = 0

        if object_count > 0:
            print(f"✅ Bucket '{bucket_name}' already exists with {object_count} files")
            return True

        # Download S3 PDFs zip file
        s3_data_url = "https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/s3_pdfs.zip"

        # Close our handle before download_file() reopens the path for
        # writing; an open handle blocks the second open on Windows.
        fd, zip_path = tempfile.mkstemp(suffix='.zip')
        os.close(fd)
        if not download_file(s3_data_url, zip_path):
            return False

        # Create bucket if it doesn't exist
        _ensure_bucket_exists(s3, bucket_name)

        # Clear existing files in bucket
        print(f"🗑️ Clearing existing files from bucket '{bucket_name}'...")
        try:
            deleted = _empty_bucket(s3, bucket_name)
            if deleted:
                print(f"🗑️ Deleted {deleted} existing files")
            else:
                print("📁 Bucket was already empty")
        except ClientError as e:
            print(f"⚠️ Could not clear bucket (continuing anyway): {e}")

        # Extract and upload files from zip
        uploaded_count = _upload_pdfs_from_zip(s3, bucket_name, zip_path)

        # Verify upload
        actual_count = _count_bucket_objects(s3, bucket_name)
        if actual_count > 0:
            print(f"✅ Successfully uploaded {uploaded_count} PDFs to bucket '{bucket_name}'")
            print(f"📊 Bucket now contains {actual_count} files")
            return True

        print(f"❌ Bucket '{bucket_name}' is empty after upload")
        return False

    except NoCredentialsError:
        print("❌ AWS credentials not found. Please check your .env file.")
        return False
    except Exception as e:
        print(f"❌ Error setting up S3 data: {e}")
        return False

    finally:
        # Clean up temp file
        if zip_path is not None:
            try:
                os.unlink(zip_path)
            except OSError:
                pass


def prepare_data_sources():
    """Prepare both Elasticsearch and S3 data sources.

    Runs ``setup_elasticsearch_data()`` and then ``setup_s3_data()``,
    stopping at the first one that reports failure.

    Returns:
        True when both setups succeeded, False otherwise.
    """
    print("🚀 Preparing data sources...")
    print("=" * 50)

    # Setup Elasticsearch data
    if not setup_elasticsearch_data():
        print("❌ Failed to setup Elasticsearch data")
        return False

    print()  # Add spacing

    # Setup S3 data
    if not setup_s3_data():
        print("❌ Failed to setup S3 data")
        return False

    print()
    print("✅ All data sources prepared successfully!")
    print("=" * 50)
    return True
+ +# %% [markdown] +# ### Example Product Manual Content +# +# The following image shows a sample page from one of the product manuals stored in your S3 bucket. This demonstrates the type of unstructured content that will be processed and made searchable through our RAG system. + +# %% [markdown] +# ![product-manual-example]() + +# %% [markdown] +# ## Elasticsearch Source Connector +# +# Next, we'll connect to your Elasticsearch index containing structured sales data, completing our dual-source setup. + +# %% [markdown] +# ### Sales Records Data Structure +# +# The image below shows the structure of the consolidated sales records in your Elasticsearch index. This data represents customer transactions and will be processed alongside the product manuals to create a unified knowledge base. + +# %% [markdown] +# ![sales-records-consolidated]() + +# %% [markdown] +# ## Elasticsearch Destination Connector +# +# Finally, we'll create the destination where both data streams will converge: the unified `customer-support` index where all processed data will be stored. 

# %%
def normalize_s3_remote_url(value):
    """Convert a bucket reference into the ``s3://bucket/prefix/`` form.

    Args:
        value: A bare bucket name (optionally with a key prefix), an ``s3://``
            URL, or an ``http(s)://`` S3 URL in virtual-hosted style
            (``bucket.s3.region.amazonaws.com``) or path style
            (``s3.region.amazonaws.com/bucket``).

    Returns:
        The normalized URL, always ending in ``/``.

    Raises:
        ValueError: If ``value`` is empty or only whitespace.
    """
    value = (value or "").strip()
    if not value:
        raise ValueError("S3_SOURCE_BUCKET is required (bucket name, s3:// URL, or https:// URL)")

    if value.startswith("s3://"):
        return value if value.endswith("/") else value + "/"

    if value.startswith(("http://", "https://")):
        parsed = urlparse(value)
        host = parsed.netloc
        path = parsed.path or "/"

        if host.startswith(("s3.", "s3-")):
            # Path-style URL: the bucket is the first path segment.
            bucket, _, remainder = path.lstrip("/").partition("/")
            path = "/" + remainder
        else:
            # Virtual-hosted style: the bucket precedes the ".s3." / ".s3-" label.
            bucket = host
            for marker in (".s3.", ".s3-"):
                if marker in host:
                    bucket = host.split(marker)[0]
                    break

        if not path.endswith("/"):
            path += "/"
        return f"s3://{bucket}{path}"

    return f"s3://{value if value.endswith('/') else value + '/'}"


def create_s3_source_connector():
    """Create an S3 source connector for PDF documents.

    Returns:
        The new connector's ID, or None if creation failed (the error is printed).
    """
    try:
        if not S3_SOURCE_BUCKET:
            raise ValueError("S3_SOURCE_BUCKET is required (bucket name, s3:// URL, or https:// URL)")
        s3_style = normalize_s3_remote_url(S3_SOURCE_BUCKET)

        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.sources.create_source(
                request=CreateSourceRequest(
                    create_source_connector=CreateSourceConnector(
                        # Timestamped like the other connectors so reruns get distinct names.
                        name=f"s3_pdf_source_{int(time.time())}",
                        type="s3",
                        config={
                            "remote_url": s3_style,
                            "recursive": True,
                            "key": AWS_ACCESS_KEY_ID,
                            "secret": AWS_SECRET_ACCESS_KEY,
                        }
                    )
                )
            )

        source_id = response.source_connector_information.id
        print(f"✅ Created S3 PDF source connector: {source_id} -> {s3_style}")
        return source_id

    except Exception as e:
        print(f"❌ Error creating S3 source connector: {e}")
        return None


def create_elasticsearch_source_connector():
    """Create an Elasticsearch source connector for sales data.

    Returns:
        The new connector's ID, or None if creation failed (the error is printed).
    """
    try:
        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.sources.create_source(
                request=CreateSourceRequest(
                    create_source_connector=CreateSourceConnector(
                        name=f"elasticsearch_sales_source_{int(time.time())}",
                        type="elasticsearch",
                        config={
                            "hosts": [ELASTICSEARCH_HOST],
                            "es_api_key": ELASTICSEARCH_API_KEY,
                            "index_name": ELASTICSEARCH_INDEX
                        }
                    )
                )
            )

        source_id = response.source_connector_information.id
        print(f"✅ Created Elasticsearch sales source connector: {source_id}")
        return source_id

    except Exception as e:
        print(f"❌ Error creating Elasticsearch source connector: {e}")
        return None


def create_elasticsearch_destination_connector():
    """Create an Elasticsearch destination connector for processed results.

    Processed elements are written to the ``customer-support`` index.

    Returns:
        The new connector's ID, or None if creation failed (the error is printed).
    """
    try:
        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.destinations.create_destination(
                request=CreateDestinationRequest(
                    create_destination_connector=CreateDestinationConnector(
                        name=f"elasticsearch_customer_support_destination_{int(time.time())}",
                        type="elasticsearch",
                        config={
                            "hosts": [ELASTICSEARCH_HOST],
                            "es_api_key": ELASTICSEARCH_API_KEY,
                            "index_name": "customer-support"
                        }
                    )
                )
            )

        destination_id = response.destination_connector_information.id
        print(f"✅ Created Elasticsearch destination connector: {destination_id}")
        return destination_id

    except Exception as e:
        print(f"❌ Error creating Elasticsearch destination connector: {e}")
        return None

# %% [markdown]
# ## Processing Pipeline Configuration
#
# With our connectors in place, we can now configure the intelligent processing pipeline that will transform both data sources. This four-stage pipeline (VLM → Chunker → Embedder → NER) will be applied to both workflows, ensuring consistent processing regardless of data source.

# %%
def create_workflow_nodes():
    """Build the four processing nodes shared by both workflows.

    Returns:
        A tuple ``(vlm_partition_node, chunk_node, embedder_node,
        ner_enrichment_node)`` in pipeline order.
    """
    vlm_partition_node = WorkflowNode(
        name="VLM_Partitioner",
        subtype="vlm",
        type="partition",
        settings={
            "provider": "openai",
            "model": "gpt-4o",
        }
    )

    chunk_node = WorkflowNode(
        name="Chunker_Node",
        subtype="chunk_by_title",
        type="chunk",
        settings={
            "new_after_n_chars": 1500,
            "max_characters": 2048,
            "overlap": 0
        }
    )

    embedder_node = WorkflowNode(
        name="Embedder_Node",
        subtype="openai",
        type="embed",
        settings={
            "model_name": "text-embedding-3-small"
        }
    )

    ner_enrichment_node = WorkflowNode(
        name="NER_Enrichment",
        type="prompter",
        subtype="openai_ner",
        settings={}
    )

    return vlm_partition_node, chunk_node, embedder_node, ner_enrichment_node


def _create_custom_workflow(client, name, source_id, destination_id, nodes):
    """Register one custom workflow with the given nodes and return its ID."""
    workflow = CreateWorkflow(
        name=name,
        source_id=source_id,
        destination_id=destination_id,
        workflow_type=WorkflowType.CUSTOM,
        workflow_nodes=list(nodes)
    )
    response = client.workflows.create_workflow(
        request=CreateWorkflowRequest(create_workflow=workflow)
    )
    return response.workflow_information.id


def create_parallel_workflows(s3_source_id, elasticsearch_source_id, destination_id):
    """Create one workflow per data source, both writing to the same destination.

    Args:
        s3_source_id: ID of the S3 source connector; when falsy the S3 workflow
            is skipped.
        elasticsearch_source_id: ID of the Elasticsearch source connector.
        destination_id: ID of the shared destination connector.

    Returns:
        ``(s3_workflow_id, es_workflow_id)``; ``s3_workflow_id`` is None when
        the S3 workflow was skipped. ``(None, None)`` if any step failed.
    """
    try:
        nodes = create_workflow_nodes()

        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            # Create workflow for S3 PDFs
            s3_workflow_id = None
            if s3_source_id:
                s3_workflow_id = _create_custom_workflow(
                    client,
                    f"S3-PDFs-Parallel-Workflow_{int(time.time())}",
                    s3_source_id,
                    destination_id,
                    nodes,
                )
                print(f"✅ Created S3 PDF workflow: {s3_workflow_id}")

            # Create workflow for Elasticsearch sales data
            es_workflow_id = _create_custom_workflow(
                client,
                f"Elasticsearch-Sales-Parallel-Workflow_{int(time.time())}",
                elasticsearch_source_id,
                destination_id,
                nodes,
            )
            print(f"✅ Created Elasticsearch sales workflow: {es_workflow_id}")

        return s3_workflow_id, es_workflow_id

    except Exception as e:
        print(f"❌ Error creating parallel workflows: {e}")
        return None, None

# %% [markdown]
# ## Creating Parallel Processing Workflows
#
# Now we'll assemble everything into the two parallel workflows shown in our architecture diagram above, connecting each data source to the processing pipeline and unified destination.

# %% [markdown]
# ## Starting Your Processing Jobs
#
# With our workflows configured, it's time to put them into action. This step submits both workflows to the Unstructured API and returns job IDs for monitoring.

# %%
def run_workflow(workflow_id, workflow_name):
    """Start a workflow run.

    Args:
        workflow_id: ID of the workflow to run.
        workflow_name: Human-readable label used in the printed messages.

    Returns:
        The ID of the started job, or None if the run could not be started.
    """
    try:
        with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
            response = client.workflows.run_workflow(
                request={"workflow_id": workflow_id}
            )

        job_id = response.job_information.id
        print(f"✅ Started {workflow_name} job: {job_id}")
        return job_id

    except Exception as e:
        print(f"❌ Error running {workflow_name} workflow: {e}")
        return None


def poll_job_status(job_id, job_name, wait_time=30, max_consecutive_errors=5):
    """Poll a job until it leaves the SCHEDULED / IN_PROGRESS states.

    Args:
        job_id: ID of the job to monitor; a falsy value returns None at once.
        job_name: Human-readable label used in the printed messages.
        wait_time: Seconds to sleep between polls.
        max_consecutive_errors: Give up after this many failed polls in a row;
            pass None to retry forever.

    Returns:
        The job information object once the job completed, failed, or reported
        an unrecognized status; None when there was no job to poll or polling
        kept failing.
    """
    if not job_id:
        # Without an ID every request would fail and the loop would never end.
        print(f"⚠️ No {job_name} job ID provided; nothing to monitor")
        return None

    print(f"⏳ Monitoring {job_name} job status...")

    consecutive_errors = 0
    while True:
        try:
            with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
                response = client.jobs.get_job(
                    request={"job_id": job_id}
                )
            job = response.job_information
            status = job.status
        except Exception as e:
            consecutive_errors += 1
            print(f"❌ Error polling {job_name} job status: {e}")
            if max_consecutive_errors is not None and consecutive_errors >= max_consecutive_errors:
                print(f"❌ Giving up on {job_name} job after {consecutive_errors} consecutive polling errors")
                return None
            time.sleep(wait_time)
            continue

        consecutive_errors = 0  # a successful poll resets the error budget

        if status in ["SCHEDULED", "IN_PROGRESS"]:
            print(f"⏳ {job_name} job status: {status}")
            time.sleep(wait_time)
        elif status == "COMPLETED":
            print(f"✅ {job_name} job completed successfully!")
            return job
        elif status == "FAILED":
            print(f"❌ {job_name} job failed!")
            return job
        else:
            print(f"❓ Unknown {job_name} job status: {status}")
            return job

# %% [markdown]
# ## Monitoring Your Processing Progress
#
# Jobs progress through scheduled, in-progress, completed, or failed states. The `poll_job_status` function checks status every 30 seconds and blocks execution until jobs complete, so you can see exactly what's happening with your data processing. It stops early if polling itself fails several times in a row.

# %% [markdown]
# ## Preparing Your Elasticsearch Environment
#
# Before processing begins, we validate that the `sales-records-consolidated` index exists and contains data, then recreate the `customer-support` index fresh for each run. This preparation step ensures a clean environment and prevents any issues from previous runs.
#
# ### Index Mapping
#
# The destination index uses this structure optimized for RAG applications:
# ```json
# {
#   "id": "keyword",        // Unique document identifier
#   "timestamp": "date",    // Processing timestamp
#   "text": "text",         // Searchable content
#   "metadata": "object"    // Source info and entities
# }
# ```

# %%
def run_elasticsearch_preprocessing():
    """Validate the sales index and recreate the ``customer-support`` index.

    The ``sales-records-consolidated`` index must exist and be non-empty. The
    ``customer-support`` index is deleted when present and created fresh with
    the mapping below.

    Returns:
        True when the destination index was recreated, False on any failed
        check or error (the reason is printed).
    """
    print("🔧 Running Elasticsearch preprocessing...")

    try:
        es = Elasticsearch(
            ELASTICSEARCH_HOST,
            api_key=ELASTICSEARCH_API_KEY,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )

        sales_index = "sales-records-consolidated"
        print(f"🔍 Checking {sales_index} index...")

        if not es.indices.exists(index=sales_index):
            print(f"❌ Index '{sales_index}' does not exist. There is no data to use.")
            return False

        doc_count = es.count(index=sales_index)['count']
        if doc_count == 0:
            print(f"❌ Index '{sales_index}' is empty. There is no data to use.")
            return False

        print(f"✅ Found {doc_count} records in {sales_index}")

        # Handle customer-support index
        support_index = "customer-support"
        print(f"🔍 Checking {support_index} index...")

        if es.indices.exists(index=support_index):
            print(f"🗑️ Deleting existing {support_index} index...")
            es.indices.delete(index=support_index)

        # Create fresh customer-support index
        print(f"🔧 Creating fresh {support_index} index...")
        # NOTE(review): no vector field is declared here, so any embeddings
        # field written by the workflows relies on dynamic mapping — confirm
        # that this suits vector search on the target cluster.
        mapping = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 1
            },
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "timestamp": {"type": "date"},
                    "text": {"type": "text", "analyzer": "standard"},
                    "metadata": {"type": "object"}
                }
            }
        }

        es.indices.create(index=support_index, body=mapping)
        es.indices.refresh(index=support_index)

        print(f"✅ Successfully created fresh {support_index} index")
        print("✅ Elasticsearch preprocessing completed successfully")
        return True

    except Exception as e:
        print(f"❌ Error during Elasticsearch preprocessing: {e}")
        return False

# %% [markdown]
# ## Pipeline Execution Summary
#
# The following summary displays all resources created during pipeline setup: data source paths, connector IDs, workflow IDs, job IDs, and processing status.

# %%
import os


def print_pipeline_summary(s3_workflow_id, es_workflow_id, s3_job_id, es_job_id):
    """Print the sources, destination, workflow IDs and job IDs of this run.

    S3 entries are shown as ``SKIPPED`` when the corresponding ID is falsy.
    """
    skipped = "SKIPPED"

    print("\n" + "=" * 80)
    print("📊 HYBRID RAG PIPELINE SUMMARY")
    print("=" * 80)
    print(f"📁 S3 Source (PDFs): {S3_SOURCE_BUCKET if s3_workflow_id else skipped}")
    print(f"🔍 Elasticsearch Source: {ELASTICSEARCH_HOST}/{ELASTICSEARCH_INDEX}")
    print(f"📤 Elasticsearch Destination: {ELASTICSEARCH_HOST}/customer-support")
    print()
    print(f"⚙️ S3 PDFs Workflow ID: {s3_workflow_id if s3_workflow_id else skipped}")
    print(f"⚙️ Elasticsearch Sales Workflow ID: {es_workflow_id}")
    print()
    print(f"🚀 S3 PDFs Job ID: {s3_job_id if s3_job_id else skipped}")
    print(f"🚀 Elasticsearch Sales Job ID: {es_job_id}")


def classify_source_connector(metadata):
    """Derive a source-connector key from an element's metadata.

    Returns:
        ``"elasticsearch:<index>"`` when an index locator is present,
        ``"s3:<bucket>"`` for ``s3://`` URLs (``"s3:unknown"`` for other URLs),
        ``"s3:pdfs"`` for PDF files without a URL, otherwise ``"unknown"``.
    """
    if "data_source-record_locator-index_name" in metadata:
        return f"elasticsearch:{metadata['data_source-record_locator-index_name']}"

    if "data_source-url" in metadata:
        url = metadata['data_source-url']
        if url.startswith('s3://'):
            # The bucket is the first component after the scheme.
            bucket = url[len('s3://'):].split('/', 1)[0]
            return f"s3:{bucket}"
        return "s3:unknown"

    if "filename" in metadata and metadata.get('filetype') == 'pdf':
        # PDF files from S3 (fallback detection)
        return "s3:pdfs"

    return "unknown"


def verify_customer_support_results(s3_job_id=None, es_job_id=None):
    """Inspect the ``customer-support`` index and print a verification report.

    Prints the document count, one sample document per detected source
    connector, an example of any unclassified document, and hit counts for a
    few test searches. Assumes the jobs have already completed.

    Args:
        s3_job_id: Accepted for call-site symmetry; not used.
        es_job_id: Accepted for call-site symmetry; not used.
    """
    import pprint

    print("🔍 Verifying processed results in 'customer-support' index (assuming jobs have completed)...")

    try:
        # Initialize Elasticsearch client
        es = Elasticsearch(
            ELASTICSEARCH_HOST,
            api_key=ELASTICSEARCH_API_KEY,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )

        index_name = "customer-support"

        # Check if index exists
        if not es.indices.exists(index=index_name):
            print(f"❌ Index '{index_name}' does not exist. Workflows may not have written results yet.")
            return

        # Get document count
        total_docs = es.count(index=index_name)['count']
        print(f"📊 Total processed documents: {total_docs}")

        if total_docs == 0:
            print("⏳ No documents found yet. Workflows may still be processing or index is empty.")
            print("💡 Check the Unstructured dashboard for job status.")
            return

        print(f"\n📋 Analyzing Source Connectors:")
        print("=" * 40)

        # Sample documents randomly so both sources are likely to appear.
        sample_response = es.search(
            index=index_name,
            body={
                "size": 50,  # Get more samples to increase chance of seeing all sources
                "_source": ["metadata", "text", "element_id"],
                "query": {
                    "function_score": {
                        "query": {"match_all": {}},
                        "random_score": {}
                    }
                }
            }
        )

        # Map: source_connector_key -> first sampled document for that connector
        source_connector_map = {}
        unknown_docs = []

        for hit in sample_response['hits']['hits']:
            key = classify_source_connector(hit['_source'].get('metadata', {}))
            if key == "unknown":
                unknown_docs.append(hit)
            else:
                source_connector_map.setdefault(key, hit)

        print(f"🔍 Unique source connectors found: {len(source_connector_map)}")
        for i, (key, doc) in enumerate(source_connector_map.items(), 1):
            print(f"\n--- Source Connector {i} ({key}) ---")
            pprint.pprint(doc['_source'], depth=6, compact=False, sort_dicts=False)

        if unknown_docs:
            print(f"\n❓ Example Unknown Source Document:")
            print("-" * 35)
            unknown_example = unknown_docs[0]['_source']
            metadata = unknown_example.get('metadata', {})
            text = unknown_example.get('text', '')
            print(f" Element ID: {unknown_example.get('element_id', 'N/A')}")
            print(f" Metadata: {metadata}")
            print(f" Text Preview: {text[:200]}..." if len(text) > 200 else f" Text: {text}")
            print(" Metadata prettyprint:")
            pprint.pprint(metadata, depth=6, compact=False, sort_dicts=False)

        # Test search functionality
        print(f"\n🔍 Testing Search Functionality:")
        print("=" * 32)

        for search_term in ("manual", "customer", "product", "support"):
            search_response = es.search(
                index=index_name,
                body={
                    "size": 1,
                    "query": {
                        "match": {
                            "text": search_term
                        }
                    }
                }
            )

            hits = search_response['hits']['total']['value']
            print(f" 🔎 '{search_term}': {hits} matches")

        print(f"\n" + "=" * 50)
        print("🎉 CUSTOMER-SUPPORT INDEX VERIFICATION")
        print("=" * 50)
        print("✅ Index exists and contains processed documents")
        print("✅ Documents from both source connectors are present (if both completed)")
        print("✅ Text search is functional across processed content")
        print("✅ Ready for hybrid RAG queries!")

    except Exception as e:
        print(f"❌ Error verifying results: {e}")
        print("💡 This is normal if workflows are still processing or if there is a connection issue.")

# %% [markdown]
# ## Orchestrating Your Complete Pipeline
#
# The main function coordinates all pipeline steps in logical sequence: data preparation, environment validation, connector setup, workflow creation, execution, and summary reporting.

# %%
def main():
    """Run every pipeline stage in order and start both processing jobs.

    Stages: data preparation, Elasticsearch preprocessing, source connectors,
    destination connector, workflow creation, workflow execution, summary.

    Returns:
        ``(s3_job_id, es_job_id)`` when every stage succeeded. If any stage
        fails, a message is printed and None is returned.
    """
    def announce(heading):
        # Section header followed by the divider used throughout the notebook.
        print(heading)
        print("-" * 50)

    print("🚀 Starting Hybrid RAG Pipeline")

    announce("\n📦 Step 0: Data source preparation")
    if not prepare_data_sources():
        print("❌ Failed to prepare data sources")
        return None

    announce("\n🔧 Step 1: Elasticsearch preprocessing")
    if not run_elasticsearch_preprocessing():
        print("❌ Failed to complete Elasticsearch preprocessing")
        return None

    announce("\n🔗 Step 2: Creating source connectors")
    s3_source_id = create_s3_source_connector()
    if not s3_source_id:
        print("❌ Failed to create S3 source connector")
        return None

    elasticsearch_source_id = create_elasticsearch_source_connector()
    if not elasticsearch_source_id:
        print("❌ Failed to create Elasticsearch source connector")
        return None

    # Step 3: destination connector
    announce("\n🎯 Step 3: Creating Elasticsearch destination connector")
    destination_id = create_elasticsearch_destination_connector()
    if not destination_id:
        print("❌ Failed to create destination connector")
        return None

    # Step 4: workflows
    announce("\n⚙️ Step 4: Creating workflows")
    s3_workflow_id, es_workflow_id = create_parallel_workflows(
        s3_source_id, elasticsearch_source_id, destination_id
    )
    if not es_workflow_id:
        print("❌ Failed to create Elasticsearch workflow")
        return None

    # Step 5: start the jobs
    announce("\n🚀 Step 5: Running workflows")
    s3_job_id = None
    es_job_id = None

    if s3_workflow_id:
        s3_job_id = run_workflow(s3_workflow_id, "S3 PDFs")
        if not s3_job_id:
            print("❌ Failed to start S3 workflow")
            return None

    if es_workflow_id:
        es_job_id = run_workflow(es_workflow_id, "Elasticsearch Sales")
        if not es_job_id:
            print("❌ Failed to start Elasticsearch workflow")
            return None

    # Step 6: summary
    print_pipeline_summary(s3_workflow_id, es_workflow_id, s3_job_id, es_job_id)
    return s3_job_id, es_job_id

# %% [markdown]
# ## Running Your Complete Pipeline
#
# We'll execute the complete pipeline by calling the main function to create all resources and start processing, then monitor the jobs until they complete successfully.

# %%
# main() returns None when a stage fails, so unpack defensively instead of
# crashing with a TypeError.
pipeline_result = main()
s3_job_id, es_job_id = pipeline_result if pipeline_result else (None, None)

es_job_info = None
s3_job_info = None

if es_job_id is None:
    print("❌ Pipeline did not start; skipping job monitoring and verification")
else:
    es_job_info = poll_job_status(es_job_id, "Elasticsearch Ingest")
    if s3_job_id is not None:
        s3_job_info = poll_job_status(s3_job_id, "S3 Ingest")
    print("\n🔍 Verifying processed results")
    print("-" * 50)
    verify_customer_support_results()

# %% [markdown]
# ### Unified Knowledge Base Results
#
# After processing both data sources, the pipeline creates a unified `customer-support` index containing processed documents from both S3 PDFs and Elasticsearch sales records. The image below shows the structure of this consolidated knowledge base, ready for RAG queries.

# %% [markdown]
# ![customer-support]()

# %% [markdown]
# ## RAG Query Demonstration
#
# Now that your hybrid knowledge base is ready, we'll demonstrate how to query it using RAG (Retrieval-Augmented Generation). This is where you'll see how the system can answer complex questions by pulling relevant information from both your S3 documents and Elasticsearch records.
#
# ### OpenAI API Key Required
#
# For the RAG demonstration, you'll need an OpenAI API key to power the language model that generates answers based on your retrieved documents. Visit https://platform.openai.com/api-keys to sign in or create an account and generate a new API key.
#
# The demonstration will show cross-source querying, source attribution, and semantic understanding as your hybrid RAG system answers questions by combining information from multiple data sources.

# %% [markdown]
# ### RAG Configuration
#
# **Instructions**: Paste your OpenAI API key below to enable RAG demonstrations. This key will be used to power the language model that generates answers based on your retrieved documents.

# %%
# RAG Demonstration Configuration and Queries
import os
import json

# LangChain imports for RAG functionality
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

print("🤖 RAG Query Demonstration Setup")
print("=" * 40)

if not OPENAI_API_KEY or OPENAI_API_KEY.startswith("your-"):
    print("⚠️ OpenAI API key not configured.")
    print("💡 Please set OPENAI_API_KEY in your .env file with your actual OpenAI API key.")
    print("📝 You can get one at: https://platform.openai.com/api-keys")
else:
    print("✅ OpenAI API key configured for RAG demonstrations")


def setup_rag_system():
    """Build the retriever, LLM and prompt used by the RAG demonstration.

    Returns:
        A dict with keys ``"retriever"``, ``"llm"`` and ``"prompt"``, or None
        when the OpenAI key is missing/placeholder or setup fails.
    """
    if not OPENAI_API_KEY or OPENAI_API_KEY.startswith("your-"):
        print("❌ OpenAI API key is required for RAG functionality")
        print("Please set OPENAI_API_KEY in your .env file")
        return None

    # Set OpenAI API key for LangChain
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    try:
        print("🔧 Setting up RAG components...")

        # Initialize embeddings (same model used in processing)
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=OPENAI_API_KEY
        )

        # Connect to the index populated by the processing workflows
        vector_store = ElasticsearchStore(
            index_name="customer-support",
            embedding=embeddings,
            es_url=ELASTICSEARCH_HOST,
            es_api_key=ELASTICSEARCH_API_KEY,
            vector_query_field="embeddings",
            query_field="text",
        )

        # Create retriever
        retriever = vector_store.as_retriever(search_kwargs={"k": 5})

        # Initialize LLM
        llm = ChatOpenAI(
            model="gpt-3.5-turbo",
            temperature=0,
            openai_api_key=OPENAI_API_KEY
        )

        # Enhanced prompt template that leverages NER metadata
        prompt = ChatPromptTemplate.from_template("""
Use the following context to answer the question. Pay attention to any entity information (people, organizations, products, locations, dates) and relationships mentioned in the context.

Context:
{context}

Question:
{question}
""")

        print("✅ RAG system ready!")
        return {"retriever": retriever, "llm": llm, "prompt": prompt}

    except ImportError as e:
        print(f"❌ Missing RAG dependencies: {e}")
        print("💡 Install with: pip install langchain langchain-elasticsearch langchain-openai")
        return None
    except Exception as e:
        print(f"❌ Error setting up RAG system: {e}")
        return None


# NER entity type -> key in the dict returned by extract_ner_entities
_ENTITY_TYPE_TO_BUCKET = {
    "PERSON": "people",
    "ORGANIZATION": "organizations",
    "PRODUCT": "products",
    "LOCATION": "locations",
    "DATE": "dates",
}


def extract_ner_entities(docs):
    """Collect named entities from the ``entities-items`` metadata of documents.

    Args:
        docs: Documents exposing a ``metadata`` mapping. ``entities-items`` may
            be a JSON string or an already-decoded list of dicts with ``type``
            and ``entity`` keys.

    Returns:
        A dict mapping ``people``, ``organizations``, ``products``,
        ``locations`` and ``dates`` to sets of entity names. Documents whose
        entity data is malformed are skipped.
    """
    entities = {bucket: set() for bucket in _ENTITY_TYPE_TO_BUCKET.values()}

    for doc in docs:
        metadata = doc.metadata
        if "entities-items" not in metadata:
            continue
        try:
            entity_items = metadata["entities-items"]
            if isinstance(entity_items, str):
                entity_items = json.loads(entity_items)

            for item in entity_items:
                bucket = _ENTITY_TYPE_TO_BUCKET.get(item.get("type", "").upper())
                entity_name = item.get("entity", "")
                if bucket and entity_name:
                    entities[bucket].add(entity_name)
        except (ValueError, TypeError, AttributeError):
            # Malformed entity payload: skip this document, keep the rest.
            continue

    return entities


def analyze_sources(docs):
    """Split retrieved documents by originating source.

    Returns:
        ``(s3_docs, es_docs, unknown_docs)``: documents with an Elasticsearch
        index locator, documents whose source URL contains ``s3://``, and
        everything else.
    """
    s3_docs = []
    es_docs = []
    unknown_docs = []

    for doc in docs:
        metadata = doc.metadata
        if "data_source-record_locator-index_name" in metadata:
            es_docs.append(doc)
        elif "s3://" in metadata.get("data_source-url", ""):
            s3_docs.append(doc)
        else:
            unknown_docs.append(doc)

    return s3_docs, es_docs, unknown_docs


def demonstrate_hybrid_ner_queries(rag_components):
    """Run the sample queries and report sources, entities and answers.

    Args:
        rag_components: The dict returned by ``setup_rag_system``; when falsy
            the function returns immediately.
    """
    if not rag_components:
        return

    retriever = rag_components["retriever"]
    llm = rag_components["llm"]
    prompt = rag_components["prompt"]

    # The answer is generated from the documents already retrieved for the
    # source analysis, so each query is embedded and searched only once.
    answer_chain = prompt | llm | StrOutputParser()

    # Hybrid NER demonstration queries targeting different sources
    hybrid_queries = [
        {
            "query": "How do I troubleshoot Bose headphone connectivity issues?",
            "description": "Product support query targeting S3 PDFs (product manuals)",
            "expected_source": "S3 (Product Docs)"
        },
        {
            "query": "Tell me about Daniel Hahn and his purchases",
            "description": "Customer analysis query targeting Elasticsearch (sales data)",
            "expected_source": "Elasticsearch (Sales)"
        },
        {
            "query": "What are the technical specifications for SoundSport Wireless headphones?",
            "description": "Product specification query targeting S3 PDFs",
            "expected_source": "S3 (Product Docs)"
        },
        {
            "query": "Show me customers in San Antonio, TX",
            "description": "Geographic customer query targeting Elasticsearch",
            "expected_source": "Elasticsearch (Sales)"
        },
        {
            "query": "How do I reset wireless headphones to factory settings?",
            "description": "Technical support query targeting S3 PDFs",
            "expected_source": "S3 (Product Docs)"
        },
        {
            "query": "What products does Newegg sell and what are their features?",
            "description": "Hybrid query targeting BOTH sources (sales + product specs)",
            "expected_source": "Both S3 and Elasticsearch"
        },
        {
            "query": "I have a customer who bought Bose headphones and is having connectivity issues. What should I tell them?",
            "description": "Customer support query requiring BOTH customer data AND product manuals",
            "expected_source": "Both S3 and Elasticsearch"
        }
    ]

    entity_labels = [
        ("people", "👤 People"),
        ("organizations", "🏢 Organizations"),
        ("products", "📱 Products"),
        ("locations", "🗺️ Locations"),
        ("dates", "📅 Dates"),
    ]

    print("\n🧠 Hybrid NER-Enhanced RAG Demonstration")
    print("=" * 60)

    for i, query_info in enumerate(hybrid_queries, 1):
        query = query_info["query"]
        description = query_info["description"]
        expected_source = query_info["expected_source"]

        print(f"\n{'='*70}")
        print(f"Query {i}: {description}")
        print(f"📝 Query: {query}")
        print(f"🎯 Expected Source: {expected_source}")
        print("=" * 70)

        try:
            # Retrieve documents
            docs = retriever.invoke(query)

            if not docs:
                print("❌ No documents retrieved")
                continue

            # Analyze sources
            s3_docs, es_docs, unknown_docs = analyze_sources(docs)
            print(f"📊 Retrieved {len(docs)} documents:")
            print(f" 📄 S3 (Product Docs): {len(s3_docs)}")
            print(f" 📊 Elasticsearch (Sales): {len(es_docs)}")
            print(f" ❓ Unknown: {len(unknown_docs)}")

            # Check if we hit the expected source
            if expected_source == "S3 (Product Docs)" and s3_docs:
                print("✅ SUCCESS: Retrieved from expected S3 source!")
            elif expected_source == "Elasticsearch (Sales)" and es_docs:
                print("✅ SUCCESS: Retrieved from expected Elasticsearch source!")
            elif expected_source == "Both S3 and Elasticsearch" and s3_docs and es_docs:
                print("✅ SUCCESS: Retrieved from BOTH sources as expected!")
            elif expected_source.startswith("Both") and (s3_docs or es_docs):
                print("✅ PARTIAL: Retrieved from at least one expected source")
            else:
                print("⚠️ UNEXPECTED: Did not retrieve from expected source")

            # Extract and show NER entities (sorted so the output is stable)
            entities = extract_ner_entities(docs)
            print(f"\n🏷️ NER Entities Found:")
            for bucket, label in entity_labels:
                if entities[bucket]:
                    print(f" {label}: {', '.join(sorted(entities[bucket])[:3])}")

            # Generate answer
            print(f"\n💬 Answer:")
            answer = answer_chain.invoke({"context": docs, "question": query})
            print(f"{answer}")

        except Exception as e:
            print(f"❌ Error: {e}")
            if "429" in str(e):
                print("⚠️ OpenAI API quota exceeded. Stopping demo.")
                break

    print(f"\n{'='*70}")
    print("🧠 Hybrid NER Demo Complete!")
    print("✅ Demonstrated cross-source retrieval capabilities")
    print("✅ Showed NER metadata integration across data sources")
    print("✅ Validated hybrid RAG architecture")


def run_rag_demonstration():
    """Set up the RAG components and, when available, run the demo queries."""
    print("\n🚀 Starting Hybrid RAG Demonstration")
    print("=" * 50)

    rag_components = setup_rag_system()

    if rag_components:
        demonstrate_hybrid_ner_queries(rag_components)
    else:
        print("❌ RAG demonstration skipped due to configuration issues")

# Run the demonstration
run_rag_demonstration()

# %% [markdown]
# ## What You've Accomplished
#
# **Enterprise Data Integration**: You've learned how to process multiple data formats (PDFs, structured records) in parallel, why consistent processing pipelines matter for unified search, and the value of creating a single searchable knowledge base that spans all your data sources.
#
# **Unstructured API Capabilities**: You've experienced VLM-powered document partitioning for complex layouts, intelligent chunking that preserves document structure, named entity recognition for enhanced search precision, and unified processing across diverse data sources.
+# +# **RAG System Architecture**: You've built a parallel workflow design for scalability and reliability, and worked with vector embeddings for semantic similarity search, source attribution in mixed-data query results, and NER-enhanced query understanding and response generation. +# +# ### Ready to Scale? +# +# Deploy customer support chatbots with comprehensive knowledge access, build internal search tools that surface information from any source, or create automated content recommendation systems. Add more data sources using additional workflows, implement real-time data synchronization, or scale up for production data volumes with monitoring and alerting. +# +# ### Try Unstructured Today +# +# Ready to build your own hybrid RAG system? [Sign up for a free trial](https://unstructured.io/?modal=try-for-free) and start transforming your enterprise data into intelligent, searchable knowledge. +# +# **Need help getting started?** Contact our team to schedule a demo and see how Unstructured can solve your specific data challenges. 
diff --git a/hybrid_rag_pipeline_modular.py b/hybrid_rag_pipeline_modular.py new file mode 100644 index 0000000..94f1298 --- /dev/null +++ b/hybrid_rag_pipeline_modular.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# [[MD:INTRO]] + +# [[MD:API_KEY_SETUP]] + +# [[MD:CONFIG]] + +# [[MD:COLAB_DOTENV_CREATION]] + +# [[CODE:DOTENV_CREATION]] + +# [[MD:DEPENDENCIES_EXPLANATION]] + +# [[CODE:DEPENDENCIES]] + +# [[MD:AWS_S3_SETUP]] + +# [[MD:ELASTICSEARCH_SETUP]] + +# [[MD:DATA_PREPARATION]] + +# [[CODE:DATA_PREPARATION]] + +# [[MD:S3_SOURCE_CONNECTOR]] + +# [[MD:PRODUCT_MANUAL_EXAMPLE_CONTEXT]] + +# [[IMG:PRODUCT_MANUAL_EXAMPLE]] + +# [[MD:ES_SOURCE_CONNECTOR]] + +# [[MD:SALES_RECORDS_CONSOLIDATED_CONTEXT]] + +# [[IMG:SALES_RECORDS_CONSOLIDATED_INDEX]] + +# [[MD:ES_DESTINATION_CONNECTOR]] + +# [[CODE:CONNECTORS]] + +# [[MD:WORKFLOW_NODES]] + +# [[CODE:WORKFLOWS]] + +# [[MD:CREATE_WORKFLOWS]] + +# [[MD:RUN_WORKFLOW]] + +# [[CODE:EXECUTION]] + +# [[MD:JOB_MONITORING]] + +# [[MD:ES_PREPROCESSING]] + +# [[CODE:PREPROCESSING]] + +# [[MD:SUMMARY]] + +# [[CODE:VERIFICATION]] + +# [[MD:MAIN]] + +# [[CODE:MAIN]] + +# [[MD:EXECUTION_FLOW]] + +# [[CODE:EXECUTION_RUNNER]] + +# [[MD:CUSTOMER_SUPPORT_OUTPUT_CONTEXT]] + +# [[IMG:CUSTOMER_SUPPORT_INDEX]] + +# [[MD:RAG_DEMO_SETUP]] + +# [[MD:RAG_DEMO_CONFIG]] + +# [[CODE:RAG_DEMO]] + +# [[MD:CONCLUSION]] diff --git a/notebook-processing/README.md b/notebook-processing/README.md new file mode 100644 index 0000000..fc95807 --- /dev/null +++ b/notebook-processing/README.md @@ -0,0 +1,276 @@ +# Notebook Processing Pipeline + +This folder contains the tooling to turn the base Python script into a rich, documented Jupyter Notebook. + +## Concept + +- The base script (`hybrid_rag_pipeline.py`) contains only code and special placeholders ("handles") that + indicate where markdown should be inserted. 
Handles look like: + + ``` + # [[MD:INTRO]] + # [[MD:CONFIG]] + # [[MD:S3_SOURCE_CONNECTOR]] + # [[MD:ES_SOURCE_CONNECTOR]] + # [[MD:ES_DESTINATION_CONNECTOR]] + # [[MD:WORKFLOW_NODES]] + # [[MD:CREATE_WORKFLOWS]] + # [[MD:RUN_WORKFLOW]] + # [[MD:JOB_MONITORING]] + # [[MD:ES_PREPROCESSING]] + # [[MD:SUMMARY]] + # [[MD:MAIN]] + ``` + +- The markdown for each handle lives in `markdown_blocks.yaml`. +- The enrichment script `enrich_and_convert.py` reads the base file, replaces each handle with the + corresponding markdown block, writes `hybrid_rag_pipeline_enriched.py`, and converts that to + `hybrid_rag_pipeline_enriched.ipynb` using jupytext. + +## Code Modularization (Proposed Enhancement) + +Similar to how markdown content is separated, we can also modularize the Python code into separate script files: + +### Code Script Structure + +Create a `code-scripts/` folder with individual Python files for each logical component: + +``` +notebook-processing/ +├── code-scripts/ +│ ├── __init__.py # Empty file for Python module +│ ├── dependencies.py # ensure_notebook_deps() +│ ├── data_preparation.py # download_file(), setup_*_data(), prepare_data_sources() +│ ├── connectors.py # create_*_connector() functions +│ ├── workflows.py # create_workflow_nodes(), create_parallel_workflows() +│ ├── execution.py # run_workflow(), poll_job_status() +│ ├── preprocessing.py # run_elasticsearch_preprocessing() +│ ├── verification.py # verify_customer_support_results(), print_pipeline_summary() +│ ├── rag_demo.py # RAG demonstration functions +│ └── main.py # main() function +├── code_blocks.yaml # Maps code handles to script files +├── enrich_and_convert.py # Enhanced to handle both MD and CODE handles +└── markdown_blocks.yaml # Existing markdown content +``` + +### Code Handle Format + +In the main pipeline file, use code handles similar to markdown handles: + +```python +#!/usr/bin/env python3 +# [[MD:INTRO]] + +# [[MD:CONFIG]] + +# [[CODE:DEPENDENCIES]] + +# 
[[CODE:DATA_PREPARATION]] + +# [[CODE:CONNECTORS]] + +# [[CODE:WORKFLOWS]] + +# [[CODE:EXECUTION]] + +# [[CODE:PREPROCESSING]] + +# [[CODE:VERIFICATION]] + +# [[CODE:MAIN]] + +# [[MD:EXECUTION_FLOW]] + +# Run the pipeline +s3_job_id, es_job_id = main() + +# Poll both jobs to make sure they have completed before proceeding +es_job_info = poll_job_status(es_job_id, "Elasticsearch Ingest") +s3_job_info = poll_job_status(s3_job_id, "S3 Ingest") + +# Verify the results +print("\n🔍 Verifying processed results") +print("-" * 50) +verify_customer_support_results() +``` + +### Code Blocks Configuration + +`code_blocks.yaml` would map handles to script files: + +```yaml +DEPENDENCIES: code-scripts/dependencies.py +DATA_PREPARATION: code-scripts/data_preparation.py +CONNECTORS: code-scripts/connectors.py +WORKFLOWS: code-scripts/workflows.py +EXECUTION: code-scripts/execution.py +PREPROCESSING: code-scripts/preprocessing.py +VERIFICATION: code-scripts/verification.py +RAG_DEMO: code-scripts/rag_demo.py +MAIN: code-scripts/main.py +``` + +## Image Management (New Feature) + +The pipeline now supports embedding images directly into the notebook using image handles. 
+ +### Image Structure + +Create an `images/` folder to store all notebook images: + +``` +notebook-processing/ +├── images/ +│ ├── architecture-diagram.png # System architecture visualization +│ ├── workflow-flowchart.png # Processing workflow diagram +│ ├── data-flow.png # Data flow illustration +│ └── rag-demo-screenshot.png # RAG demonstration example +├── image_blocks.yaml # Maps image handles to image files +├── code-scripts/ # Code modules (as above) +├── code_blocks.yaml # Code handle mappings +├── enrich_and_convert.py # Enhanced to handle MD, CODE, and IMG handles +└── markdown_blocks.yaml # Markdown content +``` + +### Image Handle Format + +In the main pipeline file, use image handles to embed images: + +```python +#!/usr/bin/env python3 +# [[MD:INTRO]] + +# [[IMG:ARCHITECTURE_DIAGRAM]] + +# [[MD:CONFIG]] + +# [[CODE:DEPENDENCIES]] + +# [[IMG:WORKFLOW_FLOWCHART]] + +# [[CODE:DATA_PREPARATION]] + +# [[MD:EXECUTION_FLOW]] + +# [[IMG:RAG_DEMO_SCREENSHOT]] +``` + +### Image Blocks Configuration + +`image_blocks.yaml` maps image handles to image files: + +```yaml +ARCHITECTURE_DIAGRAM: images/architecture-diagram.png +WORKFLOW_FLOWCHART: images/workflow-flowchart.png +DATA_FLOW: images/data-flow.png +RAG_DEMO_SCREENSHOT: images/rag-demo-screenshot.png +``` + +### Image Processing Features + +**Automatic Resizing**: Images are automatically resized to a maximum width of 800 pixels while maintaining aspect ratio. + +**Base64 Encoding**: Images are converted to base64 and embedded directly in the notebook, making it self-contained. + +**Format Support**: Supports PNG, JPG, JPEG, GIF, SVG, and WebP formats. + +**Smart Handling**: Transparent images (RGBA, P mode) are converted to RGB with a white background. + +### Image Placement + +Images are embedded as markdown cells in the notebook using the format: +```markdown +![Alt text](data:image/png;base64,) +``` + +The enrichment script automatically: +1. Reads the image file from the `images/` folder +2. 
Resizes it to max 800px width (maintaining aspect ratio) +3. Converts it to base64 encoding +4. Embeds it as a markdown cell in the notebook +5. Handles different image formats (PNG, JPG, GIF, SVG, WebP) + +### Benefits + +1. **Separation of Concerns**: Each script file handles one logical area +2. **Easier Maintenance**: Update individual components without touching the main file +3. **Reusability**: Code modules can be imported and used in other projects +4. **Testing**: Individual components can be unit tested separately +5. **Collaboration**: Multiple developers can work on different components +6. **Version Control**: Cleaner diffs when changes are made to specific components +7. **Visual Documentation**: Images are version-controlled and automatically embedded +8. **Portable Notebooks**: Images are embedded as base64, making notebooks self-contained +9. **Optimized Images**: Automatic resizing ensures reasonable file sizes + +## Why this matters + +This makes the notebook content repeatable and reviewable. When you say "update the notebook text," you should: +1. Edit the appropriate markdown block(s) in `markdown_blocks.yaml`. +2. Re-run the enrichment script to regenerate the enriched Python file and notebook. + +For code changes: +1. Edit the appropriate script file in `code-scripts/`. +2. Re-run the enrichment script to regenerate the enriched Python file and notebook. + +For image updates: +1. Replace the image file in the `images/` folder with the same filename. +2. Re-run the enrichment script to regenerate the notebook with the updated image. + +This avoids accidental code edits inside the notebook and keeps documentation, code, and images close yet separate. 
+ +## Usage + +### Basic Usage (Modular Mode - Default) + +Run from the repo root: + +```bash +venv/bin/python \ + notebook-processing/enrich_and_convert.py +``` + +This uses the **modular methodology** by default, processing `hybrid_rag_pipeline_modular.py` with `[[MD:...]]`, `[[CODE:...]]`, and `[[IMG:...]]` handles. + +**Note**: Images are **disabled by default**. Only include images when explicitly requested by the user. + +### Advanced Usage + +```bash +# Use modular approach (default) +python notebook-processing/enrich_and_convert.py --source modular + +# Use original approach (markdown only) +python notebook-processing/enrich_and_convert.py --source original + +# Include images in the output (explicitly enabled) +python notebook-processing/enrich_and_convert.py --include-images + +# Custom output suffix +python notebook-processing/enrich_and_convert.py --output-suffix custom + +# Combine options +python notebook-processing/enrich_and_convert.py --source original --output-suffix legacy --include-images +``` + +### Output Files + +**Modular Mode** (default): +- Source: `hybrid_rag_pipeline_modular.py` (template with handles) +- Output: `hybrid_rag_pipeline_enriched.py` (assembled code) +- Notebook: `hybrid_rag_pipeline_enriched.ipynb` (Jupyter notebook) + +**Original Mode**: +- Source: `hybrid_rag_pipeline.py` (original file with MD handles) +- Output: `hybrid_rag_pipeline_enriched.py` (markdown-enriched) +- Notebook: `hybrid_rag_pipeline_enriched.ipynb` (Jupyter notebook) + +## Notes +- If any `# [[MD:...]]` handle remains unreplaced, the script will exit with an error in strict mode. +- If any `# [[CODE:...]]` handle remains unreplaced, the script will exit with an error in strict mode. +- If any `# [[IMG:...]]` handle remains unreplaced, the script will exit with an error in strict mode. +- Update or add new handles in the base file where you want new documentation sections. 
+- Add the corresponding block in `markdown_blocks.yaml`, script in `code-scripts/`, or image in `images/` using the exact same key. +- Images should be optimized for web display (reasonable file sizes) as they will be embedded in the notebook. +- **Images are disabled by default** - use `--include-images` flag to enable image embedding. +- Images are automatically resized to max 800px width to keep notebook file sizes reasonable. + diff --git a/notebook-processing/code-scripts/__init__.py b/notebook-processing/code-scripts/__init__.py new file mode 100644 index 0000000..1ab298b --- /dev/null +++ b/notebook-processing/code-scripts/__init__.py @@ -0,0 +1 @@ +# This file makes code-scripts a Python module \ No newline at end of file diff --git a/notebook-processing/code-scripts/connectors.py b/notebook-processing/code-scripts/connectors.py new file mode 100644 index 0000000..82563f8 --- /dev/null +++ b/notebook-processing/code-scripts/connectors.py @@ -0,0 +1,93 @@ +def create_s3_source_connector(): + """Create an S3 source connector for PDF documents.""" + try: + if not S3_SOURCE_BUCKET: + raise ValueError("S3_SOURCE_BUCKET is required (bucket name, s3:// URL, or https:// URL)") + value = S3_SOURCE_BUCKET.strip() + + if value.startswith("s3://"): + s3_style = value if value.endswith("/") else value + "/" + elif value.startswith("http://") or value.startswith("https://"): + parsed = urlparse(value) + host = parsed.netloc + path = parsed.path or "/" + bucket = host.split(".s3.")[0] + s3_style = f"s3://{bucket}{path if path.endswith('/') else path + '/'}" + else: + s3_style = f"s3://{value if value.endswith('/') else value + '/'}" + + with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client: + response = client.sources.create_source( + request=CreateSourceRequest( + create_source_connector=CreateSourceConnector( + name="", + type="s3", + config={ + "remote_url": s3_style, + "recursive": True, + "key": AWS_ACCESS_KEY_ID, + "secret": AWS_SECRET_ACCESS_KEY, + 
} + ) + ) + ) + + source_id = response.source_connector_information.id + print(f"✅ Created S3 PDF source connector: {source_id} -> {s3_style}") + return source_id + + except Exception as e: + print(f"❌ Error creating S3 source connector: {e}") + return None + +def create_elasticsearch_source_connector(): + """Create an Elasticsearch source connector for sales data.""" + try: + with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client: + response = client.sources.create_source( + request=CreateSourceRequest( + create_source_connector=CreateSourceConnector( + name=f"elasticsearch_sales_source_{int(time.time())}", + type="elasticsearch", + config={ + "hosts": [ELASTICSEARCH_HOST], + "es_api_key": ELASTICSEARCH_API_KEY, + "index_name": ELASTICSEARCH_INDEX + } + ) + ) + ) + + source_id = response.source_connector_information.id + print(f"✅ Created Elasticsearch sales source connector: {source_id}") + return source_id + + except Exception as e: + print(f"❌ Error creating Elasticsearch source connector: {e}") + return None + +def create_elasticsearch_destination_connector(): + """Create an Elasticsearch destination connector for processed results.""" + try: + with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client: + response = client.destinations.create_destination( + request=CreateDestinationRequest( + create_destination_connector=CreateDestinationConnector( + name=f"elasticsearch_customer_support_destination_{int(time.time())}", + type="elasticsearch", + config={ + "hosts": [ELASTICSEARCH_HOST], + "es_api_key": ELASTICSEARCH_API_KEY, + "index_name": "customer-support" + } + ) + ) + ) + + destination_id = response.destination_connector_information.id + print(f"✅ Created Elasticsearch destination connector: {destination_id}") + return destination_id + + except Exception as e: + print(f"❌ Error creating Elasticsearch destination connector: {e}") + return None diff --git a/notebook-processing/code-scripts/data_preparation.py 
b/notebook-processing/code-scripts/data_preparation.py new file mode 100644 index 0000000..b4d49a4 --- /dev/null +++ b/notebook-processing/code-scripts/data_preparation.py @@ -0,0 +1,254 @@ +# Data preparation functions + +def download_file(url: str, local_path: str) -> bool: + """Download a file from URL to local path.""" + try: + print(f"📥 Downloading {url}...") + response = requests.get(url, stream=True) + response.raise_for_status() + + Path(local_path).parent.mkdir(parents=True, exist_ok=True) + + with open(local_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + print(f"✅ Downloaded to {local_path}") + return True + + except Exception as e: + print(f"❌ Error downloading {url}: {e}") + return False + +def setup_elasticsearch_data(): + """Download and load sales data into Elasticsearch index.""" + print("🔧 Setting up Elasticsearch sales data...") + + try: + es = Elasticsearch( + ELASTICSEARCH_HOST, + api_key=ELASTICSEARCH_API_KEY, + request_timeout=60, + max_retries=3, + retry_on_timeout=True + ) + + index_name = "sales-records-consolidated" + + sales_data_url = "https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/sales_records_consolidated.zip" + + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp_file: + if not download_file(sales_data_url, tmp_file.name): + return False + + with zipfile.ZipFile(tmp_file.name, 'r') as zipf: + # Load mapping + with zipf.open('mapping.json') as f: + mapping_data = json.loads(f.read().decode('utf-8')) + + # Load documents + with zipf.open('documents.json') as f: + documents = json.loads(f.read().decode('utf-8')) + + # Always delete existing index if present and reload from zip + if es.indices.exists(index=index_name): + print(f"🗑️ Deleting existing index '{index_name}' to reload fresh data...") + es.indices.delete(index=index_name) + + # Create index with mapping + index_mapping = mapping_data[index_name] if 
index_name in mapping_data else mapping_data[list(mapping_data.keys())[0]] + es.indices.create(index=index_name, body=index_mapping) + print(f"🔧 Created index '{index_name}' with mapping") + + # Prepare documents for bulk insert + def generate_docs(): + for doc in documents: + yield { + "_index": index_name, + "_id": doc["_id"], + "_source": doc["_source"] + } + + # Bulk insert documents + success_count, failed_items = bulk(es, generate_docs(), chunk_size=100) + print(f"📝 Inserted {success_count} documents") + + # Refresh index and verify + es.indices.refresh(index=index_name) + count_response = es.count(index=index_name) + count_data = count_response.body if hasattr(count_response, 'body') else count_response + doc_count = count_data['count'] + + if doc_count > 0: + print(f"✅ Successfully loaded {doc_count} documents into '{index_name}' index") + return True + else: + print(f"❌ Index '{index_name}' is empty after loading") + return False + + except Exception as e: + print(f"❌ Error setting up Elasticsearch data: {e}") + return False + + finally: + # Clean up temp file + try: + os.unlink(tmp_file.name) + except: + pass + +def setup_s3_data(): + """Download and load PDF files into S3 bucket.""" + print("🔧 Setting up S3 PDF data...") + + try: + # Initialize S3 client + s3 = boto3.client( + 's3', + aws_access_key_id=AWS_ACCESS_KEY_ID, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + region_name=AWS_REGION + ) + + bucket_name = S3_SOURCE_BUCKET + if not bucket_name: + print("❌ S3_SOURCE_BUCKET not configured") + return False + + # Check if bucket exists and has data + try: + response = s3.list_objects_v2(Bucket=bucket_name, MaxKeys=1) + if response.get('KeyCount', 0) > 0: + # Count total objects + response = s3.list_objects_v2(Bucket=bucket_name) + object_count = len(response.get('Contents', [])) + print(f"✅ Bucket '{bucket_name}' already exists with {object_count} files") + return True + except ClientError as e: + if e.response['Error']['Code'] != '404': + raise e + + 
# Download S3 PDFs zip file + s3_data_url = "https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/s3_pdfs.zip" + + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp_file: + if not download_file(s3_data_url, tmp_file.name): + return False + + # Create bucket if it doesn't exist + try: + s3.head_bucket(Bucket=bucket_name) + print(f"📦 Using existing bucket '{bucket_name}'") + except ClientError as e: + if e.response['Error']['Code'] == '404': + print(f"🔧 Creating bucket '{bucket_name}'...") + try: + if AWS_REGION == "us-east-1": + s3.create_bucket(Bucket=bucket_name) + else: + s3.create_bucket( + Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': AWS_REGION} + ) + print(f"✅ Created bucket '{bucket_name}'") + except ClientError as create_error: + if 'BucketAlreadyOwnedByYou' in str(create_error): + print(f"📦 Bucket '{bucket_name}' already exists and is owned by you") + else: + raise create_error + else: + raise e + + # Clear existing files in bucket + print(f"🗑️ Clearing existing files from bucket '{bucket_name}'...") + try: + response = s3.list_objects_v2(Bucket=bucket_name) + if 'Contents' in response: + objects_to_delete = [{'Key': obj['Key']} for obj in response['Contents']] + if objects_to_delete: + s3.delete_objects( + Bucket=bucket_name, + Delete={'Objects': objects_to_delete} + ) + print(f"🗑️ Deleted {len(objects_to_delete)} existing files") + else: + print("📁 Bucket was already empty") + else: + print("📁 Bucket was already empty") + except ClientError as e: + print(f"⚠️ Could not clear bucket (continuing anyway): {e}") + + # Extract and upload files from zip + uploaded_count = 0 + with zipfile.ZipFile(tmp_file.name, 'r') as zipf: + file_list = zipf.namelist() + pdf_files = [f for f in file_list if f.lower().endswith('.pdf')] + + print(f"📊 Found {len(pdf_files)} PDF files in zip") + + for file_name in pdf_files: + try: + # Extract file data + file_data = 
zipf.read(file_name) + + # Upload to S3 + s3.put_object( + Bucket=bucket_name, + Key=file_name, + Body=file_data, + ContentType='application/pdf' + ) + + print(f" 📤 Uploaded: {file_name}") + uploaded_count += 1 + + except Exception as e: + print(f" ❌ Failed to upload {file_name}: {e}") + + # Verify upload + response = s3.list_objects_v2(Bucket=bucket_name) + actual_count = len(response.get('Contents', [])) + + if actual_count > 0: + print(f"✅ Successfully uploaded {uploaded_count} PDFs to bucket '{bucket_name}'") + print(f"📊 Bucket now contains {actual_count} files") + return True + else: + print(f"❌ Bucket '{bucket_name}' is empty after upload") + return False + + except NoCredentialsError: + print("❌ AWS credentials not found. Please check your .env file.") + return False + except Exception as e: + print(f"❌ Error setting up S3 data: {e}") + return False + + finally: + # Clean up temp file + try: + os.unlink(tmp_file.name) + except: + pass + +def prepare_data_sources(): + """Prepare both Elasticsearch and S3 data sources.""" + print("🚀 Preparing data sources...") + print("=" * 50) + + # Setup Elasticsearch data + if not setup_elasticsearch_data(): + print("❌ Failed to setup Elasticsearch data") + return False + + print() # Add spacing + + # Setup S3 data + if not setup_s3_data(): + print("❌ Failed to setup S3 data") + return False + + print() + print("✅ All data sources prepared successfully!") + print("=" * 50) + return True \ No newline at end of file diff --git a/notebook-processing/code-scripts/dependencies.py b/notebook-processing/code-scripts/dependencies.py new file mode 100644 index 0000000..05e807a --- /dev/null +++ b/notebook-processing/code-scripts/dependencies.py @@ -0,0 +1,92 @@ +import sys, subprocess + +def ensure_notebook_deps() -> None: + packages = [ + "jupytext", + "python-dotenv", + "unstructured-client", + "elasticsearch", + "boto3", + "PyYAML", + "langchain", + "langchain-elasticsearch", + "langchain-openai" + ] + try: + 
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages]) + except Exception: + # If install fails, continue; imports below will surface actionable errors + pass + +# Install notebook dependencies (safe no-op if present) +ensure_notebook_deps() + +import os +import time +import json +import zipfile +import tempfile +import requests +from pathlib import Path +from dotenv import load_dotenv +from urllib.parse import urlparse + +import boto3 +from botocore.exceptions import ClientError, NoCredentialsError +from elasticsearch import Elasticsearch +from elasticsearch.helpers import bulk + +from unstructured_client import UnstructuredClient +from unstructured_client.models.operations import ( + CreateSourceRequest, + CreateDestinationRequest, + CreateWorkflowRequest +) +from unstructured_client.models.shared import ( + CreateSourceConnector, + CreateDestinationConnector, + WorkflowNode, + WorkflowType, + CreateWorkflow +) + +# ============================================================================= +# ENVIRONMENT CONFIGURATION +# ============================================================================= +# Load from .env file if it exists +load_dotenv() + +# Configuration constants +SKIPPED = "SKIPPED" +UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL", "https://platform.unstructuredapp.io/api/v1") + +# Get environment variables +UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") +AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") +AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") +AWS_REGION = os.getenv("AWS_REGION", "us-east-1") +S3_SOURCE_BUCKET = os.getenv("S3_SOURCE_BUCKET") +S3_DESTINATION_BUCKET = os.getenv("S3_DESTINATION_BUCKET") +S3_OUTPUT_PREFIX = os.getenv("S3_OUTPUT_PREFIX", "") +ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST") +ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY") +ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX", "sales-records-consolidated") +OPENAI_API_KEY = 
os.getenv("OPENAI_API_KEY") + +# Validation +REQUIRED_VARS = { + "UNSTRUCTURED_API_KEY": UNSTRUCTURED_API_KEY, + "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID, + "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY, + "ELASTICSEARCH_HOST": ELASTICSEARCH_HOST, + "ELASTICSEARCH_API_KEY": ELASTICSEARCH_API_KEY, + "S3_SOURCE_BUCKET": S3_SOURCE_BUCKET, +} + +missing_vars = [key for key, value in REQUIRED_VARS.items() if not value] +if missing_vars: + print(f"❌ Missing required environment variables: {', '.join(missing_vars)}") + print("Please set these environment variables or create a .env file with your credentials.") + raise ValueError(f"Missing required environment variables: {missing_vars}") + +print("✅ Configuration loaded successfully") diff --git a/notebook-processing/code-scripts/dotenv_creation.py b/notebook-processing/code-scripts/dotenv_creation.py new file mode 100644 index 0000000..024bf90 --- /dev/null +++ b/notebook-processing/code-scripts/dotenv_creation.py @@ -0,0 +1,48 @@ +def create_dotenv_file(): + """Create a .env file with placeholder values for the user to fill in.""" + env_content = """# Hybrid RAG Pipeline Environment Configuration +# Fill in your actual values below +# Configuration - Set these explicitly + +# =================================================================== +# AWS CONFIGURATION +# =================================================================== +AWS_ACCESS_KEY_ID="your-aws-access-key-id" +AWS_SECRET_ACCESS_KEY="your-aws-secret-access-key" +AWS_REGION="us-east-1" + +# =================================================================== +# UNSTRUCTURED API CONFIGURATION +# =================================================================== +UNSTRUCTURED_API_KEY="your-unstructured-api-key" +UNSTRUCTURED_API_URL="https://platform.unstructuredapp.io/api/v1" + +# =================================================================== +# ELASTICSEARCH CONFIGURATION +# =================================================================== 
+ELASTICSEARCH_HOST="https://your-cluster.es.io:443" +ELASTICSEARCH_API_KEY="your-elasticsearch-api-key" + +# =================================================================== +# PIPELINE DATA SOURCES +# =================================================================== +S3_SOURCE_BUCKET="your-s3-source-bucket" +S3_DESTINATION_BUCKET="your-s3-destination-bucket" +S3_OUTPUT_PREFIX="" +ELASTICSEARCH_INDEX="sales-records-consolidated" + +# =================================================================== +# OPENAI API CONFIGURATION +# =================================================================== +OPENAI_API_KEY="your-openai-api-key" +""" + + with open('.env', 'w') as f: + f.write(env_content) + + print("✅ Created .env file with placeholder values") + print("📝 Please edit the .env file and replace the placeholder values with your actual credentials") + print("🔒 The .env file will be loaded automatically by the pipeline") + +# Create the .env file +create_dotenv_file() diff --git a/notebook-processing/code-scripts/execution.py b/notebook-processing/code-scripts/execution.py new file mode 100644 index 0000000..f9b351b --- /dev/null +++ b/notebook-processing/code-scripts/execution.py @@ -0,0 +1,46 @@ +def run_workflow(workflow_id, workflow_name): + """Run a workflow and return job information.""" + try: + with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client: + response = client.workflows.run_workflow( + request={"workflow_id": workflow_id} + ) + + job_id = response.job_information.id + print(f"✅ Started {workflow_name} job: {job_id}") + return job_id + + except Exception as e: + print(f"❌ Error running {workflow_name} workflow: {e}") + return None + +def poll_job_status(job_id, job_name, wait_time=30): + """Poll job status until completion.""" + print(f"⏳ Monitoring {job_name} job status...") + + while True: + try: + with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client: + response = client.jobs.get_job( + request={"job_id": 
job_id} + ) + + job = response.job_information + status = job.status + + if status in ["SCHEDULED", "IN_PROGRESS"]: + print(f"⏳ {job_name} job status: {status}") + time.sleep(wait_time) + elif status == "COMPLETED": + print(f"✅ {job_name} job completed successfully!") + return job + elif status == "FAILED": + print(f"❌ {job_name} job failed!") + return job + else: + print(f"❓ Unknown {job_name} job status: {status}") + return job + + except Exception as e: + print(f"❌ Error polling {job_name} job status: {e}") + time.sleep(wait_time) diff --git a/notebook-processing/code-scripts/execution_runner.py b/notebook-processing/code-scripts/execution_runner.py new file mode 100644 index 0000000..73871e1 --- /dev/null +++ b/notebook-processing/code-scripts/execution_runner.py @@ -0,0 +1,7 @@ +s3_job_id, es_job_id = main() + +es_job_info = poll_job_status(es_job_id, "Elasticsearch Ingest") +s3_job_info = poll_job_status(s3_job_id, "S3 Ingest") +print("\n🔍 Verifying processed results") +print("-" * 50) +verify_customer_support_results() diff --git a/notebook-processing/code-scripts/main.py b/notebook-processing/code-scripts/main.py new file mode 100644 index 0000000..6443f6c --- /dev/null +++ b/notebook-processing/code-scripts/main.py @@ -0,0 +1,74 @@ +def main(): + """Main pipeline execution""" + print("🚀 Starting Hybrid RAG Pipeline") + + print("\n📦 Step 0: Data source preparation") + print("-" * 50) + + if not prepare_data_sources(): + print("❌ Failed to prepare data sources") + return + + print("\n🔧 Step 1: Elasticsearch preprocessing") + print("-" * 50) + + if not run_elasticsearch_preprocessing(): + print("❌ Failed to complete Elasticsearch preprocessing") + return + + print("\n🔗 Step 2: Creating source connectors") + print("-" * 50) + + s3_source_id = create_s3_source_connector() + if not s3_source_id: + print("❌ Failed to create S3 source connector") + return + + elasticsearch_source_id = create_elasticsearch_source_connector() + if not elasticsearch_source_id: + 
def run_elasticsearch_preprocessing():
    """Validate the sales index and recreate the ``customer-support`` index.

    Steps, in order:
      1. Connect to Elasticsearch using ``ELASTICSEARCH_HOST`` and
         ``ELASTICSEARCH_API_KEY``.
      2. Require that ``sales-records-consolidated`` exists and is non-empty.
      3. Delete ``customer-support`` if it exists, then create it again with a
         fixed mapping and refresh it.

    Returns:
        ``True`` when every step succeeded, ``False`` otherwise. Failures are
        printed, never raised.
    """
    print("🔧 Running Elasticsearch preprocessing...")

    try:
        es = Elasticsearch(
            ELASTICSEARCH_HOST,
            api_key=ELASTICSEARCH_API_KEY,
            request_timeout=60,
            max_retries=3,
            retry_on_timeout=True
        )

        sales_index = "sales-records-consolidated"
        print(f"🔍 Checking {sales_index} index...")

        # ValueError carries the user-facing message to the handler below.
        if not es.indices.exists(index=sales_index):
            raise ValueError(f"❌ Index '{sales_index}' does not exist. There is no data to use.")

        count_response = es.count(index=sales_index)
        doc_count = count_response['count']

        if doc_count == 0:
            raise ValueError(f"❌ Index '{sales_index}' is empty. There is no data to use.")

        print(f"✅ Found {doc_count} records in {sales_index}")

        # Handle customer-support index: always start from an empty index.
        support_index = "customer-support"
        print(f"🔍 Checking {support_index} index...")

        if es.indices.exists(index=support_index):
            print(f"🗑️ Deleting existing {support_index} index...")
            es.indices.delete(index=support_index)

        # Create fresh customer-support index
        print(f"🔧 Creating fresh {support_index} index...")
        # NOTE(review): no vector field is mapped here although the RAG demo
        # queries an "embeddings" field -- confirm the destination connector
        # or dynamic mapping provides a suitable type.
        mapping = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 1
            },
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "timestamp": {"type": "date"},
                    "text": {"type": "text", "analyzer": "standard"},
                    "metadata": {"type": "object"}
                }
            }
        }

        es.indices.create(index=support_index, body=mapping)
        es.indices.refresh(index=support_index)

        print(f"✅ Successfully created fresh {support_index} index")
        print("✅ Elasticsearch preprocessing completed successfully")
        return True

    except ValueError as e:
        # Precondition failures raised above already carry a formatted message.
        print(str(e))
        return False
    except Exception as e:
        print(f"❌ Error during Elasticsearch preprocessing: {e}")
        return False
def setup_rag_system():
    """Build the retriever, LLM and prompt used by the RAG demonstration.

    Returns:
        A dict with keys ``"retriever"``, ``"llm"`` and ``"prompt"``, or
        ``None`` when the OpenAI key is missing / still a ``"your-"``
        placeholder or when any component fails to initialise (the reason is
        printed).
    """
    if not OPENAI_API_KEY or OPENAI_API_KEY.startswith("your-"):
        print("❌ OpenAI API key is required for RAG functionality")
        print("Please set OPENAI_API_KEY in your .env file")
        return None

    # Set OpenAI API key for LangChain
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    try:
        print("🔧 Setting up RAG components...")

        # Initialize embeddings (same model used in processing)
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=OPENAI_API_KEY
        )

        # Connect to the Elasticsearch index the pipeline writes to.
        vector_store = ElasticsearchStore(
            index_name="customer-support",
            embedding=embeddings,
            es_url=ELASTICSEARCH_HOST,
            es_api_key=ELASTICSEARCH_API_KEY,
            vector_query_field="embeddings",
            query_field="text",
        )

        # Create retriever returning the 5 best matches per query
        retriever = vector_store.as_retriever(search_kwargs={"k": 5})

        # Initialize LLM
        llm = ChatOpenAI(
            model="gpt-3.5-turbo",
            temperature=0,
            openai_api_key=OPENAI_API_KEY
        )

        # Enhanced prompt template that leverages NER metadata
        prompt = ChatPromptTemplate.from_template("""
Use the following context to answer the question. Pay attention to any entity information (people, organizations, products, locations, dates) and relationships mentioned in the context.

Context:
{context}

Question:
{question}
""")

        print("✅ RAG system ready!")
        return {"retriever": retriever, "llm": llm, "prompt": prompt}

    except ImportError as e:
        print(f"❌ Missing RAG dependencies: {e}")
        print("💡 Install with: pip install langchain langchain-elasticsearch langchain-openai")
        return None
    except Exception as e:
        print(f"❌ Error setting up RAG system: {e}")
        return None


def extract_ner_entities(docs):
    """Collect named entities from the ``entities-items`` metadata of *docs*.

    Each document's ``metadata["entities-items"]`` may be a list of dicts or a
    JSON string encoding such a list. Every dict is expected to carry a
    ``"type"`` (matched case-insensitively) and an ``"entity"`` name.

    Malformed input is skipped at the smallest possible granularity: an
    unparsable JSON string skips that document, a non-dict item or an item
    with a non-string / empty name skips only that item.

    Returns:
        Dict mapping ``"people"``, ``"organizations"``, ``"products"``,
        ``"locations"`` and ``"dates"`` to sets of entity names.
    """
    import json  # function-scope so the helper does not depend on file-level imports

    bucket_for_type = {
        "PERSON": "people",
        "ORGANIZATION": "organizations",
        "PRODUCT": "products",
        "LOCATION": "locations",
        "DATE": "dates",
    }
    entities = {bucket: set() for bucket in bucket_for_type.values()}

    for doc in docs:
        metadata = doc.metadata
        if "entities-items" not in metadata:
            continue

        entity_items = metadata["entities-items"]
        if isinstance(entity_items, str):
            try:
                entity_items = json.loads(entity_items)
            except json.JSONDecodeError:
                continue  # unparsable payload: nothing usable in this document
        if not isinstance(entity_items, (list, tuple)):
            continue

        for item in entity_items:
            if not isinstance(item, dict):
                continue
            entity_type = item.get("type", "")
            entity_name = item.get("entity", "")
            if not isinstance(entity_type, str) or not isinstance(entity_name, str):
                continue
            bucket = bucket_for_type.get(entity_type.upper())
            if bucket and entity_name:
                entities[bucket].add(entity_name)

    return entities


def analyze_sources(docs):
    """Split retrieved documents by the source connector that produced them.

    A document counts as Elasticsearch-sourced when its metadata has the key
    ``data_source-record_locator-index_name``, as S3-sourced when its
    ``data_source-url`` contains ``s3://``, and as unknown otherwise.

    Returns:
        Tuple ``(s3_docs, es_docs, unknown_docs)`` of lists, each preserving
        the input order.
    """
    s3_docs, es_docs, unknown_docs = [], [], []

    for doc in docs:
        metadata = doc.metadata
        url = metadata.get("data_source-url") or ""
        if "data_source-record_locator-index_name" in metadata:
            es_docs.append(doc)
        elif isinstance(url, str) and "s3://" in url:
            s3_docs.append(doc)
        else:
            unknown_docs.append(doc)

    return s3_docs, es_docs, unknown_docs
rag_components["retriever"] + llm = rag_components["llm"] + prompt = rag_components["prompt"] + + # Build RAG chain using your working pattern + rag_chain = ( + {"context": retriever, "question": RunnablePassthrough()} + | prompt + | llm + | StrOutputParser() + ) + + # Hybrid NER demonstration queries targeting different sources + hybrid_queries = [ + { + "query": "How do I troubleshoot Bose headphone connectivity issues?", + "description": "Product support query targeting S3 PDFs (product manuals)", + "expected_source": "S3 (Product Docs)" + }, + { + "query": "Tell me about Daniel Hahn and his purchases", + "description": "Customer analysis query targeting Elasticsearch (sales data)", + "expected_source": "Elasticsearch (Sales)" + }, + { + "query": "What are the technical specifications for SoundSport Wireless headphones?", + "description": "Product specification query targeting S3 PDFs", + "expected_source": "S3 (Product Docs)" + }, + { + "query": "Show me customers in San Antonio, TX", + "description": "Geographic customer query targeting Elasticsearch", + "expected_source": "Elasticsearch (Sales)" + }, + { + "query": "How do I reset wireless headphones to factory settings?", + "description": "Technical support query targeting S3 PDFs", + "expected_source": "S3 (Product Docs)" + }, + { + "query": "What products does Newegg sell and what are their features?", + "description": "Hybrid query targeting BOTH sources (sales + product specs)", + "expected_source": "Both S3 and Elasticsearch" + }, + { + "query": "I have a customer who bought Bose headphones and is having connectivity issues. 
What should I tell them?", + "description": "Customer support query requiring BOTH customer data AND product manuals", + "expected_source": "Both S3 and Elasticsearch" + } + ] + + print("\n🧠 Hybrid NER-Enhanced RAG Demonstration") + print("=" * 60) + + for i, query_info in enumerate(hybrid_queries, 1): + query = query_info["query"] + description = query_info["description"] + expected_source = query_info["expected_source"] + + print(f"\n{'='*70}") + print(f"Query {i}: {description}") + print(f"📝 Query: {query}") + print(f"🎯 Expected Source: {expected_source}") + print("=" * 70) + + try: + # Retrieve documents + docs = retriever.invoke(query) + + if not docs: + print("❌ No documents retrieved") + continue + + # Analyze sources (keeping your preferred format) + s3_docs, es_docs, unknown_docs = analyze_sources(docs) + print(f"📊 Retrieved {len(docs)} documents:") + print(f" 📄 S3 (Product Docs): {len(s3_docs)}") + print(f" 📊 Elasticsearch (Sales): {len(es_docs)}") + print(f" ❓ Unknown: {len(unknown_docs)}") + + # Check if we hit the expected source + if expected_source == "S3 (Product Docs)" and len(s3_docs) > 0: + print("✅ SUCCESS: Retrieved from expected S3 source!") + elif expected_source == "Elasticsearch (Sales)" and len(es_docs) > 0: + print("✅ SUCCESS: Retrieved from expected Elasticsearch source!") + elif expected_source == "Both S3 and Elasticsearch" and len(s3_docs) > 0 and len(es_docs) > 0: + print("✅ SUCCESS: Retrieved from BOTH sources as expected!") + elif expected_source.startswith("Both") and (len(s3_docs) > 0 or len(es_docs) > 0): + print("✅ PARTIAL: Retrieved from at least one expected source") + else: + print("⚠️ UNEXPECTED: Did not retrieve from expected source") + + # Extract and show NER entities + entities = extract_ner_entities(docs) + print(f"\n🏷️ NER Entities Found:") + if entities["people"]: + print(f" 👤 People: {', '.join(list(entities['people'])[:3])}") + if entities["organizations"]: + print(f" 🏢 Organizations: {', 
def print_pipeline_summary(s3_workflow_id, es_workflow_id, s3_job_id, es_job_id):
    """Print the sources, destination, workflow IDs and job IDs of a run.

    Args:
        s3_workflow_id: ID of the S3 PDF workflow; falsy when it was skipped.
        es_workflow_id: ID of the Elasticsearch sales workflow.
        s3_job_id: ID of the S3 job; falsy when no job was started.
        es_job_id: ID of the Elasticsearch job.
    """
    # The original interpolated a bare name SKIPPED, which is not defined in
    # the visible code, so a skipped S3 workflow raised NameError here.
    skipped = "SKIPPED"
    rule = "=" * 80

    print("\n" + rule)
    print("📊 HYBRID RAG PIPELINE SUMMARY")
    print(rule)
    print(f"📁 S3 Source (PDFs): {S3_SOURCE_BUCKET if s3_workflow_id else skipped}")
    print(f"🔍 Elasticsearch Source: {ELASTICSEARCH_HOST}/{ELASTICSEARCH_INDEX}")
    print(f"📤 Elasticsearch Destination: {ELASTICSEARCH_HOST}/customer-support")
    print("")
    print(f"⚙️ S3 PDFs Workflow ID: {s3_workflow_id or skipped}")
    print(f"⚙️ Elasticsearch Sales Workflow ID: {es_workflow_id}")
    print("")
    print(f"🚀 S3 PDFs Job ID: {s3_job_id or skipped}")
    print(f"🚀 Elasticsearch Sales Job ID: {es_job_id}")
+ source_connector_map = {} + unknown_docs = [] + + for hit in sample_response['hits']['hits']: + source = hit['_source'] + metadata = source.get('metadata', {}) + + # Determine source connector type based on metadata patterns + if "data_source-record_locator-index_name" in metadata: + # Elasticsearch source connector + key = f"elasticsearch:{metadata['data_source-record_locator-index_name']}" + elif "data_source-url" in metadata: + # S3 source connector - group all S3 URLs by bucket + url = metadata['data_source-url'] + if url.startswith('s3://'): + # Extract bucket name from S3 URL + bucket = url.split('/')[2] if '/' in url else url.replace('s3://', '') + key = f"s3:{bucket}" + else: + key = f"s3:unknown" + elif "filename" in metadata and metadata.get('filetype') == 'pdf': + # PDF files from S3 (fallback detection) + key = "s3:pdfs" + else: + key = "unknown" + + if key == "unknown": + unknown_docs.append(hit) + else: + if key not in source_connector_map: + source_connector_map[key] = hit # Only keep the first doc for each source connector + + print(f"🔍 Unique source connectors found: {len(source_connector_map)}") + for i, (key, doc) in enumerate(source_connector_map.items(), 1): + print(f"\n--- Source Connector {i} ({key}) ---") + pprint.pprint(doc['_source'], depth=6, compact=False, sort_dicts=False) + + if unknown_docs: + print(f"\n❓ Example Unknown Source Document:") + print("-" * 35) + unknown_example = unknown_docs[0]['_source'] + metadata = unknown_example.get('metadata', {}) + text = unknown_example.get('text', '') + print(f" Element ID: {unknown_example.get('element_id', 'N/A')}") + print(f" Metadata: {metadata}") + print(f" Text Preview: {text[:200]}..." 
def create_workflow_nodes():
    """Create the processing nodes shared by both workflows.

    Returns:
        Tuple ``(vlm_partition_node, chunk_node, embedder_node,
        ner_enrichment_node)`` in pipeline order.
    """
    vlm_partition_node = WorkflowNode(
        name="VLM_Partitioner",
        subtype="vlm",
        type="partition",
        settings={
            "provider": "openai",
            "model": "gpt-4o",
        }
    )

    chunk_node = WorkflowNode(
        name="Chunker_Node",
        subtype="chunk_by_title",
        type="chunk",
        settings={
            "new_after_n_chars": 1500,
            "max_characters": 2048,
            "overlap": 0
        }
    )

    embedder_node = WorkflowNode(
        name="Embedder_Node",
        subtype="openai",
        type="embed",
        settings={
            "model_name": "text-embedding-3-small"
        }
    )

    ner_enrichment_node = WorkflowNode(
        name="NER_Enrichment",
        type="prompter",
        subtype="openai_ner",
        settings={}
    )

    return vlm_partition_node, chunk_node, embedder_node, ner_enrichment_node


def _create_workflow(name_prefix, source_id, destination_id, workflow_nodes):
    """Create one custom workflow named ``<name_prefix>_<unix time>``; return its ID."""
    with UnstructuredClient(api_key_auth=UNSTRUCTURED_API_KEY) as client:
        workflow = CreateWorkflow(
            name=f"{name_prefix}_{int(time.time())}",
            source_id=source_id,
            destination_id=destination_id,
            workflow_type=WorkflowType.CUSTOM,
            workflow_nodes=workflow_nodes
        )

        response = client.workflows.create_workflow(
            request=CreateWorkflowRequest(
                create_workflow=workflow
            )
        )

    return response.workflow_information.id


def create_parallel_workflows(s3_source_id, elasticsearch_source_id, destination_id):
    """Create one workflow per source, both writing to the same destination.

    The S3 workflow is created only when ``s3_source_id`` is truthy; the
    Elasticsearch workflow is always attempted. Both use the same four nodes
    (partition, chunk, embed, NER enrichment).

    Returns:
        Tuple ``(s3_workflow_id, es_workflow_id)``. ``s3_workflow_id`` is
        ``None`` when the S3 source was not supplied. On any error the
        message is printed and ``(None, None)`` is returned.
    """
    try:
        nodes = create_workflow_nodes()

        # Create workflow for S3 PDFs
        s3_workflow_id = None
        if s3_source_id:
            s3_workflow_id = _create_workflow(
                "S3-PDFs-Parallel-Workflow", s3_source_id, destination_id, list(nodes)
            )
            print(f"✅ Created S3 PDF workflow: {s3_workflow_id}")

        # Create workflow for Elasticsearch sales data
        es_workflow_id = _create_workflow(
            "Elasticsearch-Sales-Parallel-Workflow",
            elasticsearch_source_id,
            destination_id,
            list(nodes),
        )
        print(f"✅ Created Elasticsearch sales workflow: {es_workflow_id}")

        return s3_workflow_id, es_workflow_id

    except Exception as e:
        print(f"❌ Error creating parallel workflows: {e}")
        return None, None
b/notebook-processing/code_blocks.yaml new file mode 100644 index 0000000..e1d5491 --- /dev/null +++ b/notebook-processing/code_blocks.yaml @@ -0,0 +1,14 @@ +# Code blocks for enriching hybrid_rag_pipeline_modular.py +# Maps CODE handles to their corresponding script files in code-scripts/ + +DEPENDENCIES: code-scripts/dependencies.py +DOTENV_CREATION: code-scripts/dotenv_creation.py +DATA_PREPARATION: code-scripts/data_preparation.py +CONNECTORS: code-scripts/connectors.py +WORKFLOWS: code-scripts/workflows.py +EXECUTION: code-scripts/execution.py +PREPROCESSING: code-scripts/preprocessing.py +VERIFICATION: code-scripts/verification.py +MAIN: code-scripts/main.py +EXECUTION_RUNNER: code-scripts/execution_runner.py +RAG_DEMO: code-scripts/rag_demo.py diff --git a/notebook-processing/code_mappings.yaml b/notebook-processing/code_mappings.yaml new file mode 100644 index 0000000..2007080 --- /dev/null +++ b/notebook-processing/code_mappings.yaml @@ -0,0 +1,11 @@ +# Code blocks for enriching hybrid_rag_pipeline.py +# Maps code handles to their corresponding script files + +DEPENDENCIES: code-scripts/dependencies.py +DATA_PREPARATION: code-scripts/data_preparation.py +CONNECTORS: code-scripts/connectors.py +WORKFLOWS: code-scripts/workflows.py +EXECUTION: code-scripts/execution.py +PREPROCESSING: code-scripts/preprocessing.py +VERIFICATION: code-scripts/verification.py +MAIN: code-scripts/main.py \ No newline at end of file diff --git a/notebook-processing/enrich_and_convert.py b/notebook-processing/enrich_and_convert.py new file mode 100644 index 0000000..4c304c3 --- /dev/null +++ b/notebook-processing/enrich_and_convert.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +import re +import sys +import argparse +import base64 +from pathlib import Path +import yaml +import subprocess +from PIL import Image +import io + +ROOT = Path(__file__).resolve().parents[1] +MD_MAP = Path(__file__).resolve().parent / 'markdown_blocks.yaml' +CODE_MAP = Path(__file__).resolve().parent / 
'code_blocks.yaml' +IMG_MAP = Path(__file__).resolve().parent / 'image_blocks.yaml' +CODE_SCRIPTS_DIR = Path(__file__).resolve().parent / 'code-scripts' +IMAGES_DIR = Path(__file__).resolve().parent / 'images' + +MD_HANDLE_RE = re.compile(r"# \[\[MD:([A-Z0-9_]+)\]\]") +CODE_HANDLE_RE = re.compile(r"# \[\[CODE:([A-Z0-9_]+)\]\]") +IMG_HANDLE_RE = re.compile(r"# \[\[IMG:([A-Z0-9_]+)\]\]") + +def load_markdown_blocks(): + with open(MD_MAP, 'r') as f: + return yaml.safe_load(f) + +def load_code_blocks(): + if CODE_MAP.exists(): + with open(CODE_MAP, 'r') as f: + return yaml.safe_load(f) + return {} + +def load_image_blocks(): + if IMG_MAP.exists(): + with open(IMG_MAP, 'r') as f: + return yaml.safe_load(f) + return {} + +def load_code_script(script_path: str) -> str: + """Load code from a script file.""" + script_file = Path(__file__).resolve().parent / script_path + if script_file.exists(): + return script_file.read_text() + else: + print(f"[WARN] Code script not found: {script_path}") + return f"# Code script not found: {script_path}" + +def resize_image_if_needed(image_path: Path, max_width: int = 800) -> bytes: + """Resize image if it's wider than max_width, maintaining aspect ratio.""" + try: + with Image.open(image_path) as img: + # Convert to RGB if necessary (handles RGBA, P mode images) + if img.mode in ('RGBA', 'P'): + # Create a white background for transparent images + background = Image.new('RGB', img.size, (255, 255, 255)) + if img.mode == 'P': + img = img.convert('RGBA') + background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None) + img = background + elif img.mode != 'RGB': + img = img.convert('RGB') + + # Resize if needed + if img.width > max_width: + # Calculate new height maintaining aspect ratio + aspect_ratio = img.height / img.width + new_height = int(max_width * aspect_ratio) + img = img.resize((max_width, new_height), Image.Resampling.LANCZOS) + print(f"[INFO] Resized {image_path.name} from {img.width}x{img.height} to 
{max_width}x{new_height}") + + # Save to bytes + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG', optimize=True) + return img_bytes.getvalue() + + except Exception as e: + print(f"[WARN] Error resizing image {image_path}: {e}") + # Fallback to original file + return image_path.read_bytes() + +def load_image_as_base64(image_path: str) -> str: + """Load image file, resize if needed, and convert to base64 for embedding.""" + image_file = Path(__file__).resolve().parent / image_path + if not image_file.exists(): + print(f"[WARN] Image file not found: {image_path}") + return f"# Image not found: {image_path}" + + try: + # Resize image if needed (max 800px wide) + image_data = resize_image_if_needed(image_file, max_width=800) + + # Determine MIME type based on file extension + ext = image_file.suffix.lower() + mime_types = { + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.svg': 'image/svg+xml', + '.webp': 'image/webp' + } + mime_type = mime_types.get(ext, 'image/png') + + # Encode as base64 + base64_data = base64.b64encode(image_data).decode('utf-8') + + # Get filename for alt text + filename = image_file.stem + + return f"![{filename}](data:{mime_type};base64,{base64_data})" + + except Exception as e: + print(f"[WARN] Error loading image {image_path}: {e}") + return f"# Error loading image: {image_path}" + +def _normalize_block(block: str) -> str: + # Remove surrounding triple quotes if present + lines = block.strip('\n').splitlines() + if lines and lines[0].strip() == '"""' and lines[-1].strip() == '"""': + lines = lines[1:-1] + return '\n'.join(lines) + +def _to_percent_markdown_cell(block: str) -> str: + content = _normalize_block(block) + md_lines = ["# %% [markdown]"] + for ln in content.splitlines(): + md_lines.append(f"# {ln}") + return '\n'.join(md_lines) + +def _to_percent_code_cell(code: str) -> str: + """Convert code to a percent code cell format.""" + lines = ["# %%"] + 
lines.extend(code.splitlines()) + return '\n'.join(lines) + +def enrich_file(source_file: Path, output_file: Path, strict: bool = True, include_images: bool = False): + md_blocks = load_markdown_blocks() + code_blocks = load_code_blocks() + img_blocks = load_image_blocks() if include_images else {} + + lines = source_file.read_text().splitlines() + enriched_lines = [] + + for line in lines: + # Check for markdown handles + md_match = MD_HANDLE_RE.search(line) + if md_match: + key = md_match.group(1) + block = md_blocks.get(key) + if block is None: + print(f"[WARN] No markdown block found for handle: {key}") + enriched_lines.append(line) + else: + print(f"[INFO] Replacing markdown handle: {key}") + enriched_lines.append(_to_percent_markdown_cell(block)) + continue + + # Check for code handles + code_match = CODE_HANDLE_RE.search(line) + if code_match: + key = code_match.group(1) + script_path = code_blocks.get(key) + if script_path is None: + print(f"[WARN] No code block found for handle: {key}") + enriched_lines.append(line) + else: + print(f"[INFO] Replacing code handle: {key} with {script_path}") + code_content = load_code_script(script_path) + enriched_lines.append(_to_percent_code_cell(code_content)) + continue + + # Check for image handles + img_match = IMG_HANDLE_RE.search(line) + if img_match: + key = img_match.group(1) + if not include_images: + print(f"[INFO] Skipping image handle: {key} (images disabled)") + enriched_lines.append(f"# [[IMG:{key}]] # Image disabled - use --include-images to enable") + continue + + image_path = img_blocks.get(key) + if image_path is None: + print(f"[WARN] No image block found for handle: {key}") + enriched_lines.append(line) + else: + print(f"[INFO] Replacing image handle: {key} with {image_path}") + image_content = load_image_as_base64(image_path) + enriched_lines.append(_to_percent_markdown_cell(image_content)) + continue + + # Regular line - keep as is + enriched_lines.append(line) + + enriched_text = 
'\n'.join(enriched_lines) + '\n' + + if strict: + # Check for any remaining handles + remaining_md = [ln for ln in enriched_text.splitlines() if '[[MD:' in ln] + remaining_code = [ln for ln in enriched_text.splitlines() if '[[CODE:' in ln] + remaining_img = [ln for ln in enriched_text.splitlines() if '[[IMG:' in ln and 'Image disabled' not in ln] + + if remaining_md: + raise SystemExit(f"[ERROR] Unreplaced MD handles remain: {remaining_md[:5]}") + if remaining_code: + raise SystemExit(f"[ERROR] Unreplaced CODE handles remain: {remaining_code[:5]}") + if remaining_img: + raise SystemExit(f"[ERROR] Unreplaced IMG handles remain: {remaining_img[:5]}") + + output_file.write_text(enriched_text) + return output_file + +def convert_with_jupytext(py_path: Path): + # Convert enriched python to notebook using jupytext + nb_path = py_path.with_suffix('.ipynb') + cmd = [sys.executable, '-m', 'jupytext', '--to', 'ipynb', str(py_path)] + subprocess.check_call(cmd) + return nb_path + +def main(): + parser = argparse.ArgumentParser(description='Enrich and convert pipeline to notebook') + parser.add_argument('--source', choices=['original', 'modular'], default='modular', + help='Source pipeline to use (default: modular)') + parser.add_argument('--output-suffix', default='enriched', + help='Suffix for output files (default: enriched)') + parser.add_argument('--include-images', action='store_true', + help='Include images in the output (default: False)') + + args = parser.parse_args() + + # Determine source and output files + if args.source == 'modular': + source_file = ROOT / 'hybrid_rag_pipeline_modular.py' + else: + source_file = ROOT / 'hybrid_rag_pipeline.py' + + output_file = ROOT / f'hybrid_rag_pipeline_{args.output_suffix}.py' + + # Check if source file exists + if not source_file.exists(): + print(f"❌ Source file not found: {source_file}") + sys.exit(1) + + print(f"📄 Source: {source_file.name}") + print(f"📄 Output: {output_file.name}") + print(f"🔧 Mode: {args.source}") + 
print(f"🖼️ Images: {'Enabled' if args.include_images else 'Disabled'}") + print() + + enriched = enrich_file(source_file, output_file, strict=True, include_images=args.include_images) + nb = convert_with_jupytext(enriched) + print(f"✅ Enriched: {enriched}") + print(f"✅ Notebook: {nb}") + +if __name__ == '__main__': + main() diff --git a/notebook-processing/image_blocks.yaml b/notebook-processing/image_blocks.yaml new file mode 100644 index 0000000..c7dc3b3 --- /dev/null +++ b/notebook-processing/image_blocks.yaml @@ -0,0 +1,18 @@ +# Image Blocks Configuration +# Maps image handles to image files in the images/ folder +# +# Usage: Add # [[IMG:HANDLE_NAME]] to your pipeline file +# Then add HANDLE_NAME: images/filename.ext to this file + +# Elasticsearch Index Images +SALES_RECORDS_CONSOLIDATED_INDEX: images/sales-records-consolidated.png +PRODUCT_MANUAL_EXAMPLE: images/product-manual-example.png +CUSTOMER_SUPPORT_INDEX: images/customer-support.png + +# Example entries (uncomment and modify as needed): +# ARCHITECTURE_DIAGRAM: images/architecture-diagram.png +# WORKFLOW_FLOWCHART: images/workflow-flowchart.png +# DATA_FLOW: images/data-flow.png +# RAG_DEMO_SCREENSHOT: images/rag-demo-screenshot.png +# SYSTEM_OVERVIEW: images/system-overview.png +# PROCESSING_PIPELINE: images/processing-pipeline.png diff --git a/notebook-processing/images/customer-support.png b/notebook-processing/images/customer-support.png new file mode 100644 index 0000000..88099fc Binary files /dev/null and b/notebook-processing/images/customer-support.png differ diff --git a/notebook-processing/images/product-manual-example.png b/notebook-processing/images/product-manual-example.png new file mode 100644 index 0000000..b413b1f Binary files /dev/null and b/notebook-processing/images/product-manual-example.png differ diff --git a/notebook-processing/images/sales-records-consolidated.png b/notebook-processing/images/sales-records-consolidated.png new file mode 100644 index 0000000..5562f8e Binary 
files /dev/null and b/notebook-processing/images/sales-records-consolidated.png differ diff --git a/notebook-processing/images/sales-records.png b/notebook-processing/images/sales-records.png new file mode 100644 index 0000000..0dd9dad Binary files /dev/null and b/notebook-processing/images/sales-records.png differ diff --git a/notebook-processing/markdown_blocks.yaml b/notebook-processing/markdown_blocks.yaml new file mode 100644 index 0000000..ccd425a --- /dev/null +++ b/notebook-processing/markdown_blocks.yaml @@ -0,0 +1,332 @@ +# Markdown blocks for enriching hybrid_rag_pipeline.py + +INTRO: | + """ + # Building a Hybrid RAG System: From Fragmented Data to Unified Intelligence + + Picture this: You're a customer support agent, and a customer calls about a product issue. To help them effectively, you need to pull information from multiple sources - product manuals stored as PDFs in cloud storage, their purchase history in your sales database, and previous support interactions scattered across different systems. Each piece of information lives in a different format, in a different system, with different access methods. + + This is the reality for most enterprises today, and it's exactly the challenge we're going to solve together. + + ## The Enterprise Data Challenge + + Enterprise data rarely lives in one place or format. Critical information is fragmented across unstructured documents like PDFs and manuals in cloud storage, structured records like sales data in databases, and different formats requiring different processing approaches. Traditional RAG systems work well with homogeneous data but struggle when you need to query across diverse data sources simultaneously. + + ## Why This Matters + + When data is scattered, customer support becomes inefficient, decision-making lacks context, and valuable insights remain hidden. 
A customer asking about a product issue shouldn't require you to manually search through multiple systems to piece together a complete picture. + + ## The Solution: Unstructured's Complete Gen AI Data Layer + + Unstructured isn't just another data processing tool—it's a complete Gen AI data layer solution that transforms how organizations handle unstructured data at scale. Unlike building custom solutions or using fragmented tools, Unstructured provides a unified platform that connects to 30+ data sources, processes 65+ file types with intelligent partitioning and chunking, automatically enriches content with metadata and context, and delivers to 30+ destinations—all while maintaining enterprise-grade security and compliance. + + The platform eliminates the complexity of managing multiple tools, custom integrations, and manual data preparation, allowing teams to focus on building AI applications rather than wrestling with data infrastructure. With flexible deployment options from SaaS to bare metal, Unstructured adapts to any infrastructure while providing the observability, automation, and reliability that enterprise AI projects demand. + + ## What We'll Build Together + + In this tutorial, we'll create a hybrid RAG system that processes two different data sources simultaneously: product documentation from S3 and sales records from Elasticsearch. Both will flow through the same intelligent processing pipeline and land in a unified, searchable knowledge base. 
+ + ``` + ┌─────────────────┐ ┌─────────────────────────┐ + │ S3 PDFs │──── WORKFLOW 1 ──────────▶│ │ + │ (Product Docs) │ │ Unstructured API │ + └─────────────────┘ │ │ + │ Partition → Chunk → │ + ┌─────────────────┐ │ Embed → NER → Store │ + │ Elasticsearch │──── WORKFLOW 2 ──────────▶│ │ + │ (Sales Records) │ │ │ + └─────────────────┘ └────────────┬────────────┘ + │ + ┌────────────▼────────────┐ + │ customer-support │ + │ (Unified Index) │ + └─────────────────────────┘ + ``` + + By the end of this tutorial, you'll have a working system that can answer complex questions by pulling information from both your product documentation and customer data simultaneously. + """ + +API_KEY_SETUP: | + """ + ## Getting Started: Your Unstructured API Key + + To follow along with this tutorial, you'll need an Unstructured API key. This gives you access to the complete Gen AI data layer that will process your documents and create your unified knowledge base. + + ### Sign Up and Get Your API Key + + Visit https://platform.unstructured.io to sign up for a free account, navigate to API Keys in the sidebar, generate your API key, and save it for the configuration step below. For Team or Enterprise accounts, make sure you've selected the correct organizational workspace before creating your API key. + + **Need help?** Contact Unstructured Support at support@unstructured.io + """ + +CONFIG: | + """ + ## Configuration: Setting Up Your Environment + + Now we'll configure your environment with the necessary API keys and credentials. This step ensures your system can connect to all the data sources and services we'll be using. + + """ + +COLAB_DOTENV_CREATION: | + """ + ### Creating a .env File in Google Colab + + For better security and organization, we'll create a `.env` file directly in your Colab environment. Run the code cell below to create the file with placeholder values, then edit it with your actual credentials. 
+ + After running the code cell, you'll need to replace each placeholder value (like `your-unstructured-api-key`) with your actual API keys and credentials. + """ + +DEPENDENCIES_EXPLANATION: | + """ + ### Installing Required Dependencies + + The following code installs the Python packages needed for this tutorial: the Unstructured client, Elasticsearch connector, AWS SDK, and other dependencies. + """ + +AWS_S3_SETUP: | + """ + ## AWS S3: Your Document Storage + + Now that we have our environment configured, let's set up the data sources for our hybrid RAG system. First up: your unstructured documents. These PDFs, manuals, and reports need to be accessible via S3, where your product documentation and other unstructured content live, waiting to be processed into searchable knowledge. + + ### What You Need + + **An existing S3 bucket** containing the documents you want to process. For this tutorial, we'll use sample product manuals, but in production, these would be your actual business documents. + + > **Note**: This tutorial assumes you have an existing S3 bucket with documents. For detailed S3 setup instructions, see the [Unstructured S3 source connector documentation](https://docs.unstructured.io/api-reference/api-services/source-connectors/s3). + + You'll need an AWS account with S3 access, an IAM user with S3 read permissions for your bucket, and access keys (Access Key ID and Secret Access Key). + """ + +ELASTICSEARCH_SETUP: | + """ + ## Elasticsearch: Your Business Data Hub + + While S3 holds your unstructured documents, Elasticsearch serves a dual purpose in our pipeline. It's both a source of structured business data (your sales records, customer information) and the destination where our unified, processed results will be stored for RAG queries. + + ### What You Need + + **An Elasticsearch cluster** with API key authentication from Elastic Cloud (managed service). This gives you the reliability and scalability needed for enterprise applications.
+ + The pipeline uses two indices: `sales-records-consolidated` as the source containing your business data, and `customer-support` as the destination for your unified knowledge base. Both are created automatically by the pipeline. + + ### Why Consolidated Data Format Matters + + Traditional databases store information in separate fields (customer_name, product_id, purchase_date). For RAG applications, we consolidate these fields into a single long-form text field that provides full context in each search result. This approach ensures that when someone searches for "John's headphone purchase," they get the complete story in one result. + + Example transformation: + ``` + Before: {customer: "John Doe", product: "BH-001", date: "2024-01-15"} + After: "customer: John Doe\nproduct: BH-001\ndate: 2024-01-15" + ``` + + ### API Key Permissions + + Your Elasticsearch API key needs these permissions: + + ```json + { + "sales-records-full-access": { + "cluster": [], + "indices": [ + { + "names": [ + "sales-records", + "sales-records-consolidated", + "customer-support" + ], + "privileges": [ + "create_index", + "delete_index", + "manage", + "write", + "read", + "view_index_metadata", + "monitor" + ], + "allow_restricted_indices": false + } + ], + "applications": [], + "run_as": [], + "metadata": {}, + "transient_metadata": { + "enabled": true + } + } + } + ``` + + **Don't have Elasticsearch data yet?** The pipeline includes automatic data setup that creates sample sales records for demonstration. It does this by downloading .zip archives from GitHub and extracting them. + """ + +DATA_PREPARATION: | + """ + ## Data Preparation: Setting Up Your Demo Environment + + With our infrastructure configured, let's prepare the actual data that will flow through our hybrid RAG system. For this demonstration, we've created realistic sample data that represents a typical enterprise scenario, giving you a working example without requiring you to set up your own data sources first.
+ + **Elasticsearch Sales Data**: 100 synthetic sales records with customer information, with consolidated fields optimized for vector search. This represents the kind of structured business data you'd find in any enterprise system. + + **S3 Product Documentation**: 9 product manuals downloaded from manufacturer websites and stored in your S3 bucket. These represent the unstructured documents that contain critical product information. + + This combination mimics real enterprise scenarios where structured data (sales records) and unstructured documents (manuals) need to be searchable together for effective customer support. The magic happens when we can answer questions like "What issues have customers reported with the BH-900 headphones?" by pulling from both the sales records and the product manual simultaneously. + """ + +S3_SOURCE_CONNECTOR: | + """ + ## S3 Source Connector + + Now we'll create the connections that link our data sources to Unstructured's processing pipeline. First, let's establish the connection to your S3 bucket containing PDF documents for processing. + """ + +ES_SOURCE_CONNECTOR: | + """ + ## Elasticsearch Source Connector + + Next, we'll connect to your Elasticsearch index containing structured sales data, completing our dual-source setup. + """ + +ES_DESTINATION_CONNECTOR: | + """ + ## Elasticsearch Destination Connector + + Finally, we'll create the destination where both data streams will converge: the unified `customer-support` index where all processed data will be stored. + """ + +CREATE_WORKFLOWS: | + """ + ## Creating Parallel Processing Workflows + + Now we'll assemble everything into the two parallel workflows shown in our architecture diagram above, connecting each data source to the processing pipeline and unified destination. + """ + +WORKFLOW_NODES: | + """ + ## Processing Pipeline Configuration + + With our connectors in place, we can now configure the intelligent processing pipeline that will transform both data sources. 
This four-stage pipeline (VLM → Chunker → Embedder → NER) will be applied to both workflows, ensuring consistent processing regardless of data source. + """ + +RUN_WORKFLOW: | + """ + ## Starting Your Processing Jobs + + With our workflows configured, it's time to put them into action. This step submits both workflows to the Unstructured API and returns job IDs for monitoring. + """ + +JOB_MONITORING: | + """ + ## Monitoring Your Processing Progress + + Jobs progress through scheduled, in-progress, completed, or failed states. The `poll_job_status` function checks status every 30 seconds and blocks execution until jobs complete, so you can see exactly what's happening with your data processing. + """ + +ES_PREPROCESSING: | + """ + ## Preparing Your Elasticsearch Environment + + Before processing begins, we validate that the `sales-records-consolidated` index exists and contains data, then recreate the `customer-support` index fresh for each run. This preparation step ensures a clean environment and prevents any issues from previous runs. + + ### Index Mapping + + The destination index uses this structure optimized for RAG applications: + ```json + { + "id": "keyword", // Unique document identifier + "timestamp": "date", // Processing timestamp + "text": "text", // Searchable content + "metadata": "object" // Source info and entities + } + ``` + """ + +SUMMARY: | + """ + ## Pipeline Execution Summary + + The following summary displays all resources created during pipeline setup: data source paths, connector IDs, workflow IDs, job IDs, and processing status. + """ + +VERIFICATION: | + """ + ## Verifying Your Hybrid RAG System + + After processing completes, we'll verify that both data sources have been successfully integrated into a unified knowledge base. The verification includes checking document counts from each source, testing search functionality, and confirming the data is ready for RAG queries. 
+ """ + +MAIN: | + """ + ## Orchestrating Your Complete Pipeline + + The main function coordinates all pipeline steps in logical sequence: data preparation, environment validation, connector setup, workflow creation, execution, and summary reporting. + """ + +EXECUTION_FLOW: | + """ + ## Running Your Complete Pipeline + + We'll execute the complete pipeline by calling the main function to create all resources and start processing, then monitor the jobs until they complete successfully. + """ + +RAG_DEMO_SETUP: | + """ + ## RAG Query Demonstration + + Now that your hybrid knowledge base is ready, we'll demonstrate how to query it using RAG (Retrieval-Augmented Generation). This is where you'll see how the system can answer complex questions by pulling relevant information from both your S3 documents and Elasticsearch records. + + ### OpenAI API Key Required + + For the RAG demonstration, you'll need an OpenAI API key to power the language model that generates answers based on your retrieved documents. Visit https://platform.openai.com/api-keys to sign in or create an account and generate a new API key. + + The demonstration will show cross-source querying, source attribution, and semantic understanding as your hybrid RAG system answers questions by combining information from multiple data sources. + """ + +RAG_DEMO_CONFIG: | + """ + ### RAG Configuration + + **Instructions**: Paste your OpenAI API key below to enable RAG demonstrations. This key will be used to power the language model that generates answers based on your retrieved documents. + """ + +CONCLUSION: | + """ + ## What You've Accomplished + + **Enterprise Data Integration**: You've learned how to process multiple data formats (PDFs, structured records) in parallel, why consistent processing pipelines matter for unified search, and the value of creating a single searchable knowledge base that spans all your data sources. 
+ + **Unstructured API Capabilities**: You've experienced VLM-powered document partitioning for complex layouts, intelligent chunking that preserves document structure, named entity recognition for enhanced search precision, and unified processing across diverse data sources. + + **RAG System Architecture**: You've built a system that combines a parallel workflow design for scalability and reliability, vector embeddings for semantic similarity search, source attribution in mixed-data query results, and NER-enhanced query understanding and response generation. + + ### Ready to Scale? + + Deploy customer support chatbots with comprehensive knowledge access, build internal search tools that surface information from any source, or create automated content recommendation systems. Add more data sources using additional workflows, implement real-time data synchronization, or scale up for production data volumes with monitoring and alerting. + + ### Try Unstructured Today + + Ready to build your own hybrid RAG system? [Sign up for a free trial](https://unstructured.io/?modal=try-for-free) and start transforming your enterprise data into intelligent, searchable knowledge. + + **Need help getting started?** Contact our team to schedule a demo and see how Unstructured can solve your specific data challenges. + """ + +PRODUCT_MANUAL_EXAMPLE_CONTEXT: | + """ + ### Example Product Manual Content + + The following image shows a sample page from one of the product manuals stored in your S3 bucket. This demonstrates the type of unstructured content that will be processed and made searchable through our RAG system. + """ + +SALES_RECORDS_CONSOLIDATED_CONTEXT: | + """ + ### Sales Records Data Structure + + The image below shows the structure of the consolidated sales records in your Elasticsearch index. This data represents customer transactions and will be processed alongside the product manuals to create a unified knowledge base.
+ """ + +CUSTOMER_SUPPORT_OUTPUT_CONTEXT: | + """ + ### Unified Knowledge Base Results + + After processing both data sources, the pipeline creates a unified `customer-support` index containing processed documents from both S3 PDFs and Elasticsearch sales records. The image below shows the structure of this consolidated knowledge base, ready for RAG queries. + """ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fad5c03 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,46 @@ +aiofiles==24.1.0 +annotated-types==0.7.0 +anyio==4.10.0 +attrs==25.3.0 +boto3==1.35.76 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +cryptography==45.0.7 +elastic-transport==9.1.0 +elasticsearch==9.1.0 +Faker==37.6.0 +fastjsonschema==2.21.2 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +idna==3.10 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +jupyter_core==5.8.1 +jupytext==1.17.3 +markdown-it-py==4.0.0 +mdit-py-plugins==0.5.0 +mdurl==0.1.2 +nbformat==5.10.4 +packaging==25.0 +platformdirs==4.4.0 +pycparser==2.23 +pydantic==2.11.7 +pydantic_core==2.33.2 +pypdf==6.0.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.1 +PyYAML==6.0.2 +referencing==0.36.2 +requests==2.32.5 +requests-toolbelt==1.0.0 +rpds-py==0.27.1 +six==1.17.0 +sniffio==1.3.1 +traitlets==5.14.3 +typing-inspection==0.4.1 +typing_extensions==4.15.0 +tzdata==2025.2 +unstructured-client==0.42.3 +urllib3==2.5.0 diff --git a/run_pipeline.py b/run_pipeline.py new file mode 100644 index 0000000..969f869 --- /dev/null +++ b/run_pipeline.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +""" +Runner script for the Hybrid RAG Pipeline. 
+This script imports and executes the pipeline from hybrid_rag_pipeline.py +""" + +import sys +import os + +# Add the current directory to the Python path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +try: + from hybrid_rag_pipeline import main + + if __name__ == "__main__": + print("🚀 Starting Hybrid RAG Pipeline Runner") + print("=" * 50) + main() + +except ImportError as e: + print(f"❌ Error importing pipeline: {e}") + sys.exit(1) +except Exception as e: + print(f"❌ Error running pipeline: {e}") + sys.exit(1) diff --git a/source_data/README.md b/source_data/README.md new file mode 100644 index 0000000..617ff8e --- /dev/null +++ b/source_data/README.md @@ -0,0 +1,140 @@ +# Source Data Management + +This directory contains scripts and data files for managing the source data used in the hybrid RAG pipeline. + +## Directory Structure + +``` +source_data/ +├── README.md # This file +├── s3_pdfs/ # Directory containing PDF source files +├── sales_records_consolidated.zip # sales-records-consolidated index (100 documents, 18KB) +├── sales_records.zip # sales-records index (100 documents, 16KB) +└── s3_pdfs.zip # S3 PDF files (9 Bose headphone manuals, 6.5MB) +``` + +## Data Files + +### Elasticsearch Sales Data + +**`sales_records_consolidated.zip`** - **Used by hybrid RAG pipeline** +- Contains the `sales-records-consolidated` index +- 100 synthetic sales records with consolidated fields +- Used by the automated data preparation in `hybrid_rag_pipeline.py` +- Downloaded from: `https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/sales_records_consolidated.zip` + +**`sales_records.zip`** - **Reference data** +- Contains the `sales-records` index +- 100 synthetic sales records with separate fields +- Available for comparison or alternative workflows + +### S3 PDF Data + +**`s3_pdfs.zip`** +- Contains 9 Bose headphone manuals and customer support documents +- Used by the automated data 
preparation in `hybrid_rag_pipeline.py` +- Downloaded from: `https://github.com/Unstructured-IO/rag-over-hybrid-data-sources/raw/feature/hybrid-rag-pipeline/source_data/s3_pdfs.zip` + +## Scripts + +### Elasticsearch Sales Index Loader (`load_es_sales_index.py`) + +Manages Elasticsearch index data for both sales indices. Run the commands below from the repository root. + +#### Download index data to zip file: +```bash +# Download sales-records-consolidated (used by pipeline) - creates sales_records_consolidated.zip +python source_data/load_es_sales_index.py download --index sales-records-consolidated + +# Download sales-records (reference data) - creates sales_records.zip +python source_data/load_es_sales_index.py download --index sales-records + +# Or specify custom output path +python source_data/load_es_sales_index.py download --output source_data/custom_name.zip --index sales-records-consolidated +``` + +#### Load data from zip file to index: +```bash +# Load consolidated data (pipeline default) +python source_data/load_es_sales_index.py load --input source_data/sales_records_consolidated.zip + +# Load non-consolidated data +python source_data/load_es_sales_index.py load --input source_data/sales_records.zip --index sales-records +``` + +### S3 PDFs Loader (`load_s3_pdfs.py`) + +Manages PDF files for the S3 source connector. Run the commands below from the repository root. + +#### Zip PDF files from local directory: +```bash +# Creates s3_pdfs.zip automatically based on directory name +python source_data/load_s3_pdfs.py zip --input source_data/s3_pdfs + +# Or specify custom output path +python source_data/load_s3_pdfs.py zip --input source_data/s3_pdfs --output source_data/custom_name.zip +``` + +#### Load PDFs from zip file to S3 bucket: +```bash +python source_data/load_s3_pdfs.py load --input source_data/s3_pdfs.zip +``` + +## Pipeline Usage + +The `hybrid_rag_pipeline.py` script automatically downloads and sets up data from: + +1. **Elasticsearch Source**: `sales_records_consolidated.zip` → `sales-records-consolidated` index +2.
**S3 Source**: `s3_pdfs.zip` → S3 bucket (from `S3_SOURCE_BUCKET` env var) + +The pipeline is configured to use the **consolidated** sales data (`sales_records_consolidated.zip`) because: +- Multiple fields are consolidated into single long-form text fields +- Provides maximum context for vector search operations +- Optimized for RAG applications where comprehensive searchability is preferred + +## Environment Variables Required + +Both scripts require the following environment variables to be set in your `.env` file: + +### For Elasticsearch operations: +- `ELASTICSEARCH_HOST` - Your Elasticsearch cluster URL +- `ELASTICSEARCH_API_KEY` - API key for authentication + +### For S3 operations: +- `AWS_ACCESS_KEY_ID` - AWS access key +- `AWS_SECRET_ACCESS_KEY` - AWS secret key +- `AWS_REGION` - AWS region (defaults to us-east-1) +- `S3_SOURCE_BUCKET` - S3 bucket name (used as default for load operations) + +## File Details + +### Sales Data Comparison + +| File | Index | Documents | Size | Field Structure | Usage | +|------|-------|-----------|------|-----------------|-------| +| `sales_records_consolidated.zip` | `sales-records-consolidated` | 100 | 18KB | Consolidated fields | **Pipeline default** | +| `sales_records.zip` | `sales-records` | 100 | 16KB | Separate fields | Reference/comparison | + +### PDF Files Included + +The `s3_pdfs.zip` contains these Bose headphone manuals: +- `bose-OpenAudio-manual.pdf` +- `bose-OpenAudio-troubleshooting.pdf` +- `bose-OpenAudio-msds.pdf` +- `bose-OpenAudio-instructions.pdf` +- `bose-SoundSport-userguide.pdf` +- `bose-SoundSport-safety.pdf` +- `bose-SoundSport-manual.pdf` +- `bose-QUIETCOMFORT-manual.pdf` +- `bose-QUIETCOMFORT-troubleshooting-guide.pdf` + +## Error Handling + +Both scripts include comprehensive error handling: +- Missing environment variables +- Network connectivity issues +- Authentication failures +- File/index not found scenarios +- Partial upload/download failures + +All errors are reported with clear 
messages and appropriate exit codes. \ No newline at end of file diff --git a/source_data/load_es_sales_index.py b/source_data/load_es_sales_index.py new file mode 100755 index 0000000..a9ddf9a --- /dev/null +++ b/source_data/load_es_sales_index.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +""" +Elasticsearch Sales Index Data Loader + +This script provides functionality to: +1. Download the sales-records-consolidated index from Elasticsearch and save as a zip file +2. Load data from a zip file into the sales-records-consolidated index + +Usage: + # Download index to zip file + python load_es_sales_index.py download --output source_data/sales_data.zip + + # Load data from zip file to index + python load_es_sales_index.py load --input source_data/sales_data.zip +""" + +import os +import sys +import json +import zipfile +import argparse +from pathlib import Path +from dotenv import load_dotenv +from elasticsearch import Elasticsearch +from elasticsearch.helpers import scan, bulk + +# Load environment variables +load_dotenv() + +def get_elasticsearch_client(): + """Initialize and return Elasticsearch client.""" + host = os.getenv("ELASTICSEARCH_HOST") + api_key = os.getenv("ELASTICSEARCH_API_KEY") + + if not host or not api_key: + raise ValueError("ELASTICSEARCH_HOST and ELASTICSEARCH_API_KEY must be set in .env file") + + return Elasticsearch( + host, + api_key=api_key, + request_timeout=60, + max_retries=3, + retry_on_timeout=True + ) + +def download_index(output_path: str = None, index_name: str = "sales-records-consolidated"): + """Download Elasticsearch index data and save as zip file.""" + print(f"🔄 Downloading index '{index_name}'...") + + # Generate output path based on index name if not provided + if not output_path: + # Convert index name to zip file name (replace hyphens with underscores for consistency) + zip_name = index_name.replace('-', '_') + '.zip' + output_path = f"source_data/{zip_name}" + + try: + es = get_elasticsearch_client() + + # Check if index 
def load_index(input_path: str, index_name: str = "sales-records-consolidated"):
    """Load data from zip file into Elasticsearch index.

    The archive must contain ``mapping.json`` (a get-mapping response keyed by
    index name) and ``documents.json`` (a list of objects with ``_id`` and
    ``_source`` keys).

    The archive is read and validated *before* the existing index is deleted,
    so a missing, corrupt or incomplete zip file can no longer destroy data
    that is already in the cluster.

    Args:
        input_path: Path of the zip file to load.
        index_name: Index to (re)create and populate.

    The process exits with status 1 on any error, or when the index is empty
    after loading.
    """
    print(f"🔄 Loading data into index '{index_name}'...")

    try:
        input_path = Path(input_path)
        if not input_path.exists():
            raise ValueError(f"❌ Input file '{input_path}' does not exist")

        # Read everything from the archive first; nothing destructive has
        # happened yet if this fails.
        with zipfile.ZipFile(input_path, 'r') as zipf:
            mapping_data = json.loads(zipf.read('mapping.json').decode('utf-8'))
            documents = json.loads(zipf.read('documents.json').decode('utf-8'))

        print(f"📊 Loaded {len(documents)} documents from zip file")

        if not mapping_data:
            raise ValueError("❌ mapping.json does not contain an index mapping")

        # Prefer the mapping stored under the target name; otherwise fall back
        # to the first mapping in the file (the data may have been exported
        # from an index with a different name).
        if index_name in mapping_data:
            index_mapping = mapping_data[index_name]
        else:
            index_mapping = next(iter(mapping_data.values()))

        es = get_elasticsearch_client()

        # Delete existing index if it exists
        if es.indices.exists(index=index_name):
            print(f"🗑️ Deleting existing index '{index_name}'...")
            es.indices.delete(index=index_name)

        # Create index with mapping
        es.indices.create(index=index_name, body=index_mapping)
        print(f"🔧 Created index '{index_name}' with mapping")

        # Lazily build the bulk actions so the helper can chunk them
        def generate_docs():
            for doc in documents:
                yield {
                    "_index": index_name,
                    "_id": doc["_id"],
                    "_source": doc["_source"]
                }

        # Bulk insert documents.
        # NOTE(review): with the helper's default error handling, indexing
        # failures are expected to raise and land in the handler below, so the
        # failed_items branch is only a defensive report - confirm against the
        # installed client version.
        success_count, failed_items = bulk(es, generate_docs(), chunk_size=100)
        print(f"📝 Inserted {success_count} documents")

        if failed_items:
            print(f"⚠️ Failed to insert {len(failed_items)} documents")

        # Refresh so the count below sees the documents just written
        es.indices.refresh(index=index_name)

        # Verify data was loaded
        count_response = es.count(index=index_name)
        count_data = count_response.body if hasattr(count_response, 'body') else count_response
        doc_count = count_data['count']

        if doc_count > 0:
            print(f"✅ Successfully loaded {doc_count} documents into '{index_name}' index")
        else:
            print(f"❌ Index '{index_name}' is empty after loading")
            sys.exit(1)

    except Exception as e:
        print(f"❌ Error loading index: {e}")
        sys.exit(1)
def get_s3_client():
    """Initialize and return S3 client.

    Configuration is read from environment variables:

    * ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` - required.
    * ``AWS_REGION`` - optional, defaults to ``us-east-1``.
    * ``AWS_SESSION_TOKEN`` - optional, needed for temporary credentials.
    * ``S3_ENDPOINT_URL`` - optional, for S3-compatible services.

    The two optional advanced settings are the ones listed in the project's
    ``.env.example``; they are only passed to boto3 when actually set.

    Returns:
        A boto3 S3 client.

    Raises:
        ValueError: If the access key id or secret access key is missing.
    """
    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    if not aws_access_key_id or not aws_secret_access_key:
        raise ValueError("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must be set in .env file")

    client_kwargs = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "region_name": os.getenv("AWS_REGION", "us-east-1"),
    }

    # Temporary (STS) credentials are rejected without their session token.
    session_token = os.getenv("AWS_SESSION_TOKEN")
    if session_token:
        client_kwargs["aws_session_token"] = session_token

    # Custom endpoint for S3-compatible services.
    endpoint_url = os.getenv("S3_ENDPOINT_URL")
    if endpoint_url:
        client_kwargs["endpoint_url"] = endpoint_url

    return boto3.client('s3', **client_kwargs)
def _list_bucket_keys(s3, bucket_name):
    """Return every object key in the bucket, following result pagination."""
    paginator = s3.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket_name):
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys


def _ensure_bucket_exists(s3, bucket_name):
    """Create the bucket when a HEAD request reports that it does not exist."""
    try:
        s3.head_bucket(Bucket=bucket_name)
        print(f"📦 Using existing bucket '{bucket_name}'")
        return
    except ClientError as e:
        # Anything other than "not found" (e.g. access denied) is a real error.
        if e.response['Error']['Code'] != '404':
            raise

    print(f"🔧 Creating bucket '{bucket_name}'...")
    try:
        # Get region for bucket creation; us-east-1 is created without a
        # LocationConstraint, every other region with one.
        aws_region = os.getenv("AWS_REGION", "us-east-1")
        if aws_region == "us-east-1":
            s3.create_bucket(Bucket=bucket_name)
        else:
            s3.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': aws_region}
            )
        print(f"✅ Created bucket '{bucket_name}'")
    except ClientError as create_error:
        if 'BucketAlreadyOwnedByYou' in str(create_error):
            print(f"📦 Bucket '{bucket_name}' already exists and is owned by you")
        else:
            raise


def _clear_bucket(s3, bucket_name):
    """Best-effort removal of every object currently in the bucket."""
    print(f"🗑️ Clearing existing files from bucket '{bucket_name}'...")
    try:
        keys = _list_bucket_keys(s3, bucket_name)
        if not keys:
            print("📁 Bucket was already empty")
            return

        # A single delete request is sent for at most 1000 keys at a time.
        for start in range(0, len(keys), 1000):
            batch = keys[start:start + 1000]
            s3.delete_objects(
                Bucket=bucket_name,
                Delete={'Objects': [{'Key': key} for key in batch]}
            )
        print(f"🗑️ Deleted {len(keys)} existing files")
    except ClientError as e:
        # Deliberately non-fatal: the upload can still proceed.
        print(f"⚠️ Could not clear bucket (continuing anyway): {e}")


def load_pdfs_to_s3(input_path: str, bucket_name: str = None):
    """Load PDFs from zip file and upload to S3 bucket.

    The bucket is created when missing, emptied, and then filled with every
    ``.pdf`` entry of the archive (keys are the entry names inside the zip).
    Listing the bucket follows pagination, so buckets holding more objects
    than one listing page are cleared and counted completely.

    Args:
        input_path: Path of the zip file containing the PDFs.
        bucket_name: Target bucket; defaults to the ``S3_SOURCE_BUCKET``
            environment variable.

    Raises:
        ValueError: If no bucket name is given and ``S3_SOURCE_BUCKET`` is
            not set.

    Any other failure is printed and the process exits with status 1, as it
    does when the bucket is empty after the upload.
    """
    if not bucket_name:
        bucket_name = os.getenv("S3_SOURCE_BUCKET")

    if not bucket_name:
        raise ValueError("S3_SOURCE_BUCKET must be set in .env file or provided as argument")

    print(f"🔄 Loading PDFs to S3 bucket '{bucket_name}'...")

    try:
        input_path = Path(input_path)
        if not input_path.exists():
            raise ValueError(f"❌ Input file '{input_path}' does not exist")

        s3 = get_s3_client()

        _ensure_bucket_exists(s3, bucket_name)
        _clear_bucket(s3, bucket_name)

        # Extract and upload files from zip
        uploaded_count = 0
        failed_files = []
        with zipfile.ZipFile(input_path, 'r') as zipf:
            pdf_files = [f for f in zipf.namelist() if f.lower().endswith('.pdf')]

            print(f"📊 Found {len(pdf_files)} PDF files in zip")

            for file_name in pdf_files:
                try:
                    s3.put_object(
                        Bucket=bucket_name,
                        Key=file_name,
                        Body=zipf.read(file_name),
                        ContentType='application/pdf'
                    )
                    print(f" 📤 Uploaded: {file_name}")
                    uploaded_count += 1
                except Exception as e:
                    # One bad file must not abort the remaining uploads.
                    print(f" ❌ Failed to upload {file_name}: {e}")
                    failed_files.append(file_name)

        # Surface partial failures instead of reporting plain success.
        if failed_files:
            print(f"⚠️ {len(failed_files)} of {len(pdf_files)} PDFs could not be uploaded")

        # Verify upload
        actual_count = len(_list_bucket_keys(s3, bucket_name))

        if actual_count > 0:
            print(f"✅ Successfully uploaded {uploaded_count} PDFs to bucket '{bucket_name}'")
            print(f"📊 Bucket now contains {actual_count} files")
        else:
            print(f"❌ Bucket '{bucket_name}' is empty after upload")
            sys.exit(1)

    except NoCredentialsError:
        print("❌ AWS credentials not found. Please check your .env file.")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error loading PDFs to S3: {e}")
        sys.exit(1)
def test_dependencies():
    """Check that every module the pipeline needs can be imported.

    Prints one status line per module (with a pip hint for missing ones).

    Returns:
        bool: True when all imports succeeded, False otherwise.
    """
    print("🔍 Testing Python Dependencies...")

    # import name -> label shown to the user (pip package or "built-in")
    required_modules = {
        "unstructured_client": "unstructured-client",
        "dotenv": "python-dotenv",
        "os": "built-in",
        "sys": "built-in",
        "time": "built-in",
        "json": "built-in",
        "pathlib": "built-in",
    }

    missing = 0
    for module_name, package_label in required_modules.items():
        try:
            __import__(module_name)
        except ImportError:
            print(f" ❌ {package_label} - Run: pip install {package_label}")
            missing += 1
        else:
            print(f" ✅ {package_label}")

    return missing == 0
def test_script_syntax():
    """Test that the main script has valid syntax.

    Reads ``hybrid_rag_pipeline.py`` from the current working directory and
    parses it with ``ast`` (the script is never executed).

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding; a file that cannot be decoded is reported
    as a failed check instead of crashing the verification run.

    Returns:
        bool: True if the file exists and parses, False otherwise.
    """
    print("\n🔍 Testing Script Syntax...")

    import ast

    try:
        with open("hybrid_rag_pipeline.py", "r", encoding="utf-8") as f:
            source = f.read()
        ast.parse(source)
        print(" ✅ hybrid_rag_pipeline.py syntax is valid")
        return True
    except SyntaxError as e:
        print(f" ❌ Syntax error in hybrid_rag_pipeline.py: {e}")
        return False
    except FileNotFoundError:
        print(" ❌ hybrid_rag_pipeline.py not found")
        return False
    except UnicodeDecodeError as e:
        print(f" ❌ hybrid_rag_pipeline.py is not valid UTF-8: {e}")
        return False
Fix syntax errors in hybrid_rag_pipeline.py") + + print("\n Then run this test again: python test_setup.py") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/verify_customer_support_index.py b/verify_customer_support_index.py new file mode 100644 index 0000000..952def9 --- /dev/null +++ b/verify_customer_support_index.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +""" +Customer Support Index Verification Script + +This script analyzes the metadata field of all documents in the customer-support index +to verify that both S3 and Elasticsearch sources are represented in the processed data. + +It will: +1. Connect to the Elasticsearch customer-support index +2. Retrieve all documents and their metadata +3. Analyze the data_source-url field to identify source types +4. Provide a summary of source distribution +5. Verify both S3 and Elasticsearch sources are present +""" + +import os +import sys +from collections import defaultdict, Counter +from dotenv import load_dotenv +from elasticsearch import Elasticsearch +import json + +# Load environment variables +load_dotenv() + +# Configuration +print("🔧 Loading configuration...") + +# Elasticsearch Configuration +ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST", "your-elasticsearch-host") +ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY", "your-elasticsearch-api-key") +S3_SOURCE_BUCKET = os.getenv("S3_SOURCE_BUCKET", "example-data-bose-headphones") + +# Validation +REQUIRED_VARS = { + "ELASTICSEARCH_HOST": ELASTICSEARCH_HOST, + "ELASTICSEARCH_API_KEY": ELASTICSEARCH_API_KEY +} + +missing_vars = [key for key, value in REQUIRED_VARS.items() if not value or value.startswith("your-")] +if missing_vars: + print(f"❌ Missing required configuration values: {', '.join(missing_vars)}") + print("Please update your .env file with the required values.") + sys.exit(1) + +print("✅ All required configuration values loaded successfully") + +# Initialize Elasticsearch client +print("🔧 Initializing 
def _classify_source(data_source_url, s3_bucket):
    """Map a document's data-source URL to the key used in the result tables."""
    lowered = data_source_url.lower()
    if 's3' in lowered or s3_bucket in data_source_url:
        return 'S3_PDFs'
    if 'elasticsearch' in lowered or 'sales-records' in data_source_url:
        return 'Elasticsearch_Sales'
    return 'Unknown'


def analyze_customer_support_index():
    """
    Analyze all documents in the customer-support index and categorize by source.

    Uses the module-level ``es`` client and ``S3_SOURCE_BUCKET`` setting.
    Documents are read with the scroll API in batches of 100; the scroll
    context is released in a ``finally`` block so it is cleared even when
    processing a batch fails.

    Returns:
        dict: Analysis results with the keys ``total_docs``,
        ``processed_docs``, ``source_counts``, ``source_analysis``,
        ``metadata_samples``, ``file_types`` and ``languages``; or None when
        the index is empty or could not be analyzed.
    """
    print("\n🔍 Analyzing customer-support index...")
    print("-" * 60)

    try:
        # Check if index exists and get total count
        try:
            count_response = es.count(index="customer-support")
            total_docs = count_response['count']
            print(f"📊 Total documents in customer-support index: {total_docs}")
        except Exception as e:
            print(f"❌ Could not get document count: {e}")
            return None

        if total_docs == 0:
            print("⚠️ No documents found in customer-support index")
            return None

        # Scroll through all documents
        print("🔄 Retrieving all documents...")

        # Analysis containers
        source_analysis = defaultdict(list)
        source_counts = Counter()
        metadata_samples = defaultdict(list)
        file_types = Counter()
        languages = Counter()

        processed_docs = 0
        scroll_id = None

        try:
            # Initialize scroll
            scroll_response = es.search(
                index="customer-support",
                scroll='2m',
                size=100,
                body={
                    "query": {"match_all": {}},
                    "_source": ["metadata", "record_id", "text", "type"]
                }
            )

            scroll_id = scroll_response['_scroll_id']
            hits = scroll_response['hits']['hits']

            while hits:
                for hit in hits:
                    processed_docs += 1
                    source = hit['_source']

                    # `or` fallbacks also cover fields stored as explicit null
                    metadata = source.get('metadata') or {}
                    data_source_url = metadata.get('data_source-url') or 'unknown'
                    filetype = metadata.get('filetype', 'unknown')

                    source_key = _classify_source(data_source_url, S3_SOURCE_BUCKET)

                    # Store analysis data
                    source_analysis[source_key].append({
                        'doc_id': hit['_id'],
                        'record_id': source.get('record_id', 'unknown'),
                        'data_source_url': data_source_url,
                        'filename': metadata.get('filename', 'unknown'),
                        'filetype': filetype,
                        'type': source.get('type', 'unknown')
                    })

                    source_counts[source_key] += 1

                    # Store metadata samples (limit to 3 per source type)
                    if len(metadata_samples[source_key]) < 3:
                        metadata_samples[source_key].append(metadata)

                    # Count file types and languages
                    file_types[filetype] += 1
                    languages.update(metadata.get('languages') or [])

                    if processed_docs % 50 == 0:
                        print(f" 📄 Processed {processed_docs}/{total_docs} documents...")

                # Get next batch
                try:
                    scroll_response = es.scroll(scroll_id=scroll_id, scroll='2m')
                    scroll_id = scroll_response['_scroll_id']
                    hits = scroll_response['hits']['hits']
                except Exception as e:
                    print(f" ⚠️ Scroll error (likely reached end): {e}")
                    break
        finally:
            # Release the server-side scroll context on success and on failure
            if scroll_id is not None:
                try:
                    es.clear_scroll(scroll_id=scroll_id)
                except Exception:
                    pass  # best-effort cleanup; the context expires on its own

        print(f"✅ Successfully analyzed {processed_docs} documents")

        return {
            'total_docs': total_docs,
            'processed_docs': processed_docs,
            'source_counts': dict(source_counts),
            'source_analysis': dict(source_analysis),
            'metadata_samples': dict(metadata_samples),
            'file_types': dict(file_types),
            'languages': dict(languages)
        }

    except Exception as e:
        print(f"❌ Error analyzing customer-support index: {e}")
        return None
def save_detailed_report(results, filename="customer_support_analysis_report.json"):
    """Write the analysis results to ``filename`` as indented JSON.

    Does nothing when ``results`` is empty/None. Problems while building or
    writing the report are printed as a warning rather than raised.
    """
    if not results:
        return

    try:
        from datetime import datetime

        counts = results['source_counts']
        s3_present = 'S3_PDFs' in counts
        sales_present = 'Elasticsearch_Sales' in counts

        # Key order below is the order of the keys in the written file
        report = {}
        report['analysis_timestamp'] = datetime.now().isoformat()
        report['total_documents'] = results['total_docs']
        report['processed_documents'] = results['processed_docs']
        report['source_distribution'] = counts
        report['file_type_distribution'] = results['file_types']
        report['language_distribution'] = results['languages']
        report['source_verification'] = {
            's3_pdfs_present': s3_present,
            'elasticsearch_sales_present': sales_present,
            'both_sources_present': s3_present and sales_present,
        }
        report['detailed_analysis'] = results['source_analysis']
        report['metadata_samples'] = results['metadata_samples']

        with open(filename, 'w') as report_file:
            # default=str stringifies any value json cannot encode natively
            json.dump(report, report_file, indent=2, default=str)

        print(f"\n💾 Detailed report saved to: {filename}")

    except Exception as e:
        print(f"⚠️ Could not save detailed report: {e}")
save_detailed_report(results) + + # Final verification + print(f"\n🎯 FINAL VERIFICATION:") + has_s3 = 'S3_PDFs' in results['source_counts'] + has_elasticsearch = 'Elasticsearch_Sales' in results['source_counts'] + + if has_s3 and has_elasticsearch: + print("✅ VERIFICATION PASSED: Both S3 and Elasticsearch sources are present in customer-support index") + return True + else: + print("❌ VERIFICATION FAILED: Not all expected sources are present") + return False + else: + print("❌ VERIFICATION FAILED: Could not analyze customer-support index") + return False + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1)