meta-llama · anushadudi · May 27, 2025 · May 27, 2025 · May 27, 2025
diff --git a/3p-integrations/databricks/databricks_rag_example.ipynb b/3p-integrations/databricks/databricks_rag_example.ipynb
diff --git a/...egrations/databricks/llama-prompt-optimization-examples/hotpotqa/hotpotqa_databricks.yaml b/...egrations/databricks/llama-prompt-optimization-examples/hotpotqa/hotpotqa_databricks.yaml
@@ -0,0 +1,44 @@
+# Prompt-optimization run configuration for the HotpotQA example, targeting a
+# model served from a Databricks serving endpoint.
+model:
+  name: "databricks/databricks-llama-4-maverick"
+  # Placeholder: replace <YOUR_DATABRICKS_URL> with your workspace host before running.
+  api_base: "https://<YOUR_DATABRICKS_URL>.cloud.databricks.com/serving-endpoints"
+  temperature: 0.0
+  max_tokens: 2048
+
+dataset:
+  adapter_class: "llama_prompt_ops.datasets.hotpotqa.HotpotQAAdapter"
+  # NOTE(review): relative path; presumably resolved against the working
+  # directory or this file's location - confirm.
+  path: "hotpotqa_sample_subset.json"
+  # NOTE(review): the three splits below sum to 0.21, not 1.0. Presumably they
+  # are fractions of the sample file and the remainder is unused - confirm.
+  train_size: 0.07
+  validation_size: 0.07
+  test_size: 0.07
+  adapter_params:
+    passages_per_hop: 3
+    max_hops: 2
+    retriever_url: null  # Set to a valid ColBERT URL if available
+    input_field: "question"  # Primary input field is the question
+    context_field: "context"  # Field containing context passages
+    supporting_facts_field: "supporting_facts"  # Field containing supporting facts
+    golden_output_field: "answer"  # Field to use as ground truth/reference output
+
+# Starting prompt handed to the optimizer, with the fields it reads and produces.
+system_prompt:
+  text: |
+    You are an expert at answering complex questions that require multi-hop reasoning. Give a short factoid answer.
+  inputs: ["question", "context"]
+  outputs: ["answer"]
+
+metric:
+  class: "llama_prompt_ops.datasets.hotpotqa.HotpotQAMetric"
+  output_field: "answer"
+  passage_weight: 0.5  # Weight for passage retrieval in the combined score
+  strict_json: false
+
+optimization:
+  strategy: "llama"
+  max_rounds: 3
+  max_examples_per_round: 5
+  max_prompt_length: 4096
+  num_candidates: 5
+  bootstrap_examples: 4
+  # NOTE(review): 36 threads is far above the 8 used by the HTS example config;
+  # confirm the serving endpoint's rate limits allow this much concurrency.
+  num_threads: 36
+
+output:
+  # Result files are named with this prefix (see results/hotpotqa_databricks_*).
+  prefix: "hotpotqa_databricks"
diff --git a/...grations/databricks/llama-prompt-optimization-examples/hotpotqa/hotpotqa_prompt_ops.ipynb b/...grations/databricks/llama-prompt-optimization-examples/hotpotqa/hotpotqa_prompt_ops.ipynb
diff --git a/...ations/databricks/llama-prompt-optimization-examples/hotpotqa/hotpotqa_sample_subset.json b/...ations/databricks/llama-prompt-optimization-examples/hotpotqa/hotpotqa_sample_subset.json
diff --git a/...ma-prompt-optimization-examples/hotpotqa/results/hotpotqa_databricks_20250526_202100.json b/...ma-prompt-optimization-examples/hotpotqa/results/hotpotqa_databricks_20250526_202100.json
diff --git a/...ma-prompt-optimization-examples/hotpotqa/results/hotpotqa_databricks_20250526_202100.yaml b/...ma-prompt-optimization-examples/hotpotqa/results/hotpotqa_databricks_20250526_202100.yaml
diff --git a/.../databricks/llama-prompt-optimization-examples/hts-classification/hts_classification.yaml b/.../databricks/llama-prompt-optimization-examples/hts-classification/hts_classification.yaml
@@ -0,0 +1,40 @@
+# Prompt-optimization run configuration for the HTS code classification
+# example, targeting a model served from a Databricks serving endpoint.
+model:
+  # NOTE(review): "databricks/databricks-llama-4-maverick" was the previous
+  # value here (it is what the HotpotQA example config uses); confirm which
+  # endpoint name is the intended one for this example.
+  name: "databricks/maverick"
+  # Placeholder: replace <YOUR_DATABRICKS_URL> with your workspace host before running.
+  api_base: "https://<YOUR_DATABRICKS_URL>.cloud.databricks.com/serving-endpoints"
+  temperature: 0.0
+  # Short cap: the system prompt below asks for the HTS code only.
+  max_tokens: 100
+
+dataset:
+  adapter_class: "llama_prompt_ops.core.datasets.ConfigurableJSONAdapter"
+  # NOTE(review): relative path; presumably resolved against the working
+  # directory or this file's location - confirm.
+  path: "hts_classification_sample.json"
+  train_size: 0.7
+  validation_size: 0.15
+  test_size: 0.15
+  input_field: "description"
+  golden_output_field: "hts_code"
+
+# Starting prompt handed to the optimizer, with the fields it reads and produces.
+system_prompt:
+  text: |
+    You are an expert at classifying products into Harmonized Tariff Schedule (HTS) codes.
+    Given a product description, assign the most appropriate HTS code.
+    Provide only the HTS code as output, without any additional explanation.
+  inputs: ["description"]
+  outputs: ["hts_code"]
+
+metric:
+  # Custom metric from hts_metric.py in this example directory. It scores only
+  # whether the predicted code's format matches the gold code's format.
+  # NOTE(review): the hts_metric module must be importable when the run
+  # starts - confirm how the loader resolves it.
+  class: "hts_metric.HTSCodeMetric"
+  output_field: "hts_code"
+
+
+optimization:
+  strategy: "llama"
+  max_rounds: 2
+  max_examples_per_round: 5
+  max_prompt_length: 1000
+  num_candidates: 5
+  bootstrap_examples: 4
+  num_threads: 8
+
+output:
+  # Result files are named with this prefix (see results/hts_classification_*).
+  prefix: "hts_classification"
diff --git a/...llama-prompt-optimization-examples/hts-classification/hts_classification_prompt_ops.ipynb b/...llama-prompt-optimization-examples/hts-classification/hts_classification_prompt_ops.ipynb
diff --git a/...icks/llama-prompt-optimization-examples/hts-classification/hts_classification_sample.json b/...icks/llama-prompt-optimization-examples/hts-classification/hts_classification_sample.json
diff --git a/...tegrations/databricks/llama-prompt-optimization-examples/hts-classification/hts_metric.py b/...tegrations/databricks/llama-prompt-optimization-examples/hts-classification/hts_metric.py
@@ -0,0 +1,144 @@
+"""
+HTS Code format validation metric.
+
+This module provides a metric for evaluating if HTS code predictions
+exactly match the format of the ground truth (must use same format - numeric or dotted).
+If gold uses numeric format (XXXXXXXXXX), prediction must also use numeric format.
+If gold uses dotted format (XX.XX.XX.XXXX), prediction must also use dotted format.
+"""
+
+import re
+from typing import Any, Dict, Union
+from llama_prompt_ops.core.metrics import MetricBase
+
+class HTSCodeMetric(MetricBase):
+    """
+    Metric that scores whether a predicted HTS code uses the gold code's format.
+
+    A call scores 1.0 only when both the gold and the predicted value, after
+    cleaning, are well-formed 10-digit HTS codes written in the same format:
+       - If gold is XXXXXXXXXX, prediction must be XXXXXXXXXX
+       - If gold is XX.XX.XX.XXXX, prediction must be XX.XX.XX.XXXX
+
+    Only the format is compared. The digits of the two codes are never
+    checked against each other, so a wrong code in the right format still
+    scores 1.0.
+    """
+
+    # Patterns are compiled once for the class. They use explicit ASCII digit
+    # classes and are applied with ``fullmatch`` so that neither a trailing
+    # newline (accepted by ``$``) nor non-ASCII digits (accepted by ``\d``)
+    # can be classified as a valid code.
+    _DISALLOWED_RE = re.compile(r'[^0-9.]')
+    _NUMERIC_RE = re.compile(r'[0-9]{10}')
+    _DOTTED_RE = re.compile(r'[0-9]{2}\.[0-9]{2}\.[0-9]{2}\.[0-9]{4}')
+
+    def __init__(self, output_field: str = "hts_code"):
+        """
+        Initialize the HTS code metric.
+
+        Args:
+            output_field: Field name containing the HTS code in the prediction
+        """
+        self.output_field = output_field
+
+    def __call__(
+        self,
+        gold: Any,
+        pred: Any,
+        trace: bool = False,
+        **kwargs
+    ) -> Union[Dict[str, Any], float]:
+        """
+        Score whether the prediction's format matches the gold HTS code format.
+
+        Args:
+            gold: Ground truth example (string, dict, or object carrying
+                ``output_field``)
+            pred: Predicted example (same accepted shapes as ``gold``)
+            trace: When truthy, print the raw values and return a detailed
+                results dict instead of a bare score
+            **kwargs: Accepted for call compatibility and ignored
+
+        Returns:
+            1.0 when both values are valid and share the same format, else
+            0.0. With ``trace`` truthy, a dict holding that score under
+            ``"score"`` plus the detected formats, cleaned values and a
+            human-readable explanation.
+        """
+        # NOTE(review): any truthy ``trace`` switches the return type to a
+        # dict; confirm that callers passing a non-bool trace object expect it.
+        gold_value = self._extract_value(gold)
+        pred_value = self._extract_value(pred)
+
+        if trace:
+            print(f"Gold value: {gold_value}")
+            print(f"Predicted value: {pred_value}")
+
+        gold_clean = self._clean_hts_code(gold_value)
+        pred_clean = self._clean_hts_code(pred_value)
+
+        gold_format = self._get_format_pattern(gold_clean)
+        pred_format = self._get_format_pattern(pred_clean)
+
+        # Two 'invalid' formats compare equal, so validity is checked
+        # explicitly before the formats are compared.
+        is_valid = (
+            gold_format != 'invalid'
+            and pred_format != 'invalid'
+            and gold_format == pred_format
+        )
+        score = 1.0 if is_valid else 0.0
+
+        if trace:
+            return {
+                "score": score,
+                "gold_format": gold_format,
+                "pred_format": pred_format,
+                # Raw equality: also True when both sides are 'invalid'.
+                "formats_match": gold_format == pred_format,
+                "cleaned_gold": gold_clean,
+                "cleaned_pred": pred_clean,
+                "explanation": self._get_explanation(gold_format, pred_format)
+            }
+
+        return score
+
+    def _extract_value(self, data: Any) -> str:
+        """
+        Extract the HTS code as a string from the supported input shapes.
+
+        Lookup order: a plain string is returned as is; a dict is read by
+        ``output_field``; then an attribute named ``output_field``; then a
+        dict-valued ``outputs`` attribute; otherwise ``str(data)``.
+        """
+        if isinstance(data, str):
+            return data
+
+        if isinstance(data, dict):
+            return str(data.get(self.output_field, ""))
+
+        if hasattr(data, self.output_field):
+            return str(getattr(data, self.output_field))
+
+        if hasattr(data, "outputs") and isinstance(data.outputs, dict):
+            return str(data.outputs.get(self.output_field, ""))
+
+        return str(data)
+
+    def _clean_hts_code(self, code: str) -> str:
+        """
+        Strip every character that is not an ASCII digit or a dot.
+
+        Whitespace, letters and other punctuation are removed in one pass.
+        """
+        return self._DISALLOWED_RE.sub('', code)
+
+    def _get_format_pattern(self, code: str) -> str:
+        """
+        Classify the format of an HTS code string.
+
+        Returns:
+        - 'numeric' for XXXXXXXXXX format
+        - 'dotted' for XX.XX.XX.XXXX format
+        - 'invalid' for any other format
+        """
+        if self._NUMERIC_RE.fullmatch(code):
+            return 'numeric'
+        if self._DOTTED_RE.fullmatch(code):
+            return 'dotted'
+        return 'invalid'
+
+    def _get_explanation(self, gold_format: str, pred_format: str) -> str:
+        """
+        Get a human-readable explanation of the format comparison.
+
+        An invalid gold format is reported before an invalid prediction.
+        """
+        if gold_format == 'invalid':
+            return "Gold standard format is invalid"
+        if pred_format == 'invalid':
+            return "Prediction format is invalid"
+        if gold_format != pred_format:
+            return f"Format mismatch: Gold uses {gold_format} format but prediction uses {pred_format} format"
+        return "Formats match correctly"
diff --git a/...-optimization-examples/hts-classification/results/hts_classification_20250526_213325.json b/...-optimization-examples/hts-classification/results/hts_classification_20250526_213325.json
@@ -0,0 +1,30 @@
+{
+  "prompt": "You are a customs classification expert specializing in the Harmonized Tariff Schedule (HTS) codes for international trade. Your task is to accurately predict the HTS code for a given product description. Analyze the product description carefully and provide the most appropriate HTS code. Output only the HTS code without any additional explanation or context.",
+  "few_shots": [
+    {
+      "question": "Products of the United States when returned after having been exported, or any other products when returned within 3 years after having been exported, without having been advanced in value or improved in condition by any process of manufacture or other means while abroad Other: Articles provided for in chapter 87: Other",
+      "context": "",
+      "answer": "9801001074"
+    },
+    {
+      "question": "Combs, hair-slides and the like; hairpins, curling pins, curling grips, hair-curlers and the like, other than those of heading 8516, and parts thereof: Combs, hair-slides and the like: Other: Combs: Valued not over $4.50 per gross",
+      "context": "",
+      "answer": "9615192000"
+    },
+    {
+      "question": "Fishing rods, fish hooks and other line fishing tackle; fish landing nets, butterfly nets and similar nets; decoy \"birds\" (other than those of heading 9208 or 9705) and similar hunting or shooting equipment; parts and accessories thereof: Other: Other, including parts and accessories: Artificial baits and flies",
+      "context": "",
+      "answer": "9507907000"
+    },
+    {
+      "question": "Knives with cutting blades, serrated or not (including pruning knives), other than knives of heading 8208, and blades and other base metal parts thereof: Other: Handles of base metal: Other",
+      "context": "",
+      "answer": "8211959000"
+    },
+    {
+      "question": "Men's or boys' shirts: Of cotton: Other Dress shirts: With two or more colors in the warp and/or the filling: Boys' (340)",
+      "context": "",
+      "answer": "6205202021"
+    }
+  ]
+}
diff --git a/...-optimization-examples/hts-classification/results/hts_classification_20250526_213325.yaml b/...-optimization-examples/hts-classification/results/hts_classification_20250526_213325.yaml
@@ -0,0 +1,32 @@
+system: |8-
+            You are a customs classification expert specializing in the Harmonized Tariff Schedule (HTS) codes for international trade. Your task is to accurately predict the HTS code for a given product description. Analyze the product description carefully and provide the most appropriate HTS code. Output only the HTS code without any additional explanation or context.
+
+        Few-shot examples:
+
+                Example 1:
+                    Question: Products of the United States when returned after having been exported, or any other products when returned within 3 years after having been exported, without having been advanced in value or improved in condition by any process of manufacture or other means while abroad Other: Articles provided for in chapter 87: Other
+                    Answer: 9801001074
+
+                Example 2:
+                    Question: Combs, hair-slides and the like; hairpins, curling pins, curling grips, hair-curlers and the like, other than those of heading 8516, and parts thereof: Combs, hair-slides and the like: Other: Combs: Valued not over $4.50 per gross
+                    Answer: 9615192000
+
+                Example 3:
+                    Question: Fishing rods, fish hooks and other line fishing tackle; fish landing nets, butterfly nets and similar nets; decoy "birds" (other than those of heading 9208 or 9705) and similar hunting or shooting equipment; parts and accessories thereof: Other: Other, including parts and accessories: Artificial baits and flies
+                    Answer: 9507907000
+
+                Example 4:
+                    Question: Knives with cutting blades, serrated or not (including pruning knives), other than knives of heading 8208, and blades and other base metal parts thereof: Other: Handles of base metal: Other
+                    Answer: 8211959000
+
+                Example 5:
+                    Question: Men's or boys' shirts: Of cotton: Other Dress shirts: With two or more colors in the warp and/or the filling: Boys' (340)
+                    Answer: 6205202021
+
+
+config:
+  task_model: <llama_prompt_ops.core.model.DSPyModelAdapter object at 0x7fe2f5d0f890>
+  model_family: llama
+  optimization:
+    name: BasicOptimizationStrategy
+    model_name: maverick