From 1d792a37821456f98781694f695d36072023a27f Mon Sep 17 00:00:00 2001
From: Ryan Liu <78334320+lamld203844@users.noreply.github.com>
Date: Mon, 13 May 2024 10:35:18 +0000
Subject: [PATCH 1/3] Create embeddings with Gemini API part 1

---
 .../embeddings/google/00_embedding_example.py | 22 +++++++
 .../embeddings/google/01_create_embeddings.py | 58 +++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 examples/embeddings/google/00_embedding_example.py
 create mode 100644 examples/embeddings/google/01_create_embeddings.py

diff --git a/examples/embeddings/google/00_embedding_example.py b/examples/embeddings/google/00_embedding_example.py
new file mode 100644
index 0000000..485a91f
--- /dev/null
+++ b/examples/embeddings/google/00_embedding_example.py
@@ -0,0 +1,22 @@
+import numpy as np
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+# Import Gemini module
+import google.generativeai as genai
+
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")  # note: unused here; embeddings use genai.embed_content
+
+# Create embedding
+result = genai.embed_content(
+    model="models/embedding-001",
+    content="cat",
+)
+
+# Print the embedding and its shape
+embedding = result['embedding']
+print(embedding)
+print(f"Shape: {np.array(embedding).shape}")
\ No newline at end of file
diff --git a/examples/embeddings/google/01_create_embeddings.py b/examples/embeddings/google/01_create_embeddings.py
new file mode 100644
index 0000000..b27bfee
--- /dev/null
+++ b/examples/embeddings/google/01_create_embeddings.py
@@ -0,0 +1,58 @@
+import json
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+# Import Gemini module
+import google.generativeai as genai
+
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")  # note: unused here; embeddings use genai.embed_content
+
+# ==========================================
+# Embed a given file, i.e. the whole ai.txt file
+# ==========================================
+
+# helper function
+def embed_chunk(chunk, model="models/embedding-001"):
+    '''
+    Embed the input text into a vector representation.
+    args:
+        chunk (str): raw input data string
+        model (str): gemini embedding model name
+    return:
+        embedding (list): vector representation
+    '''
+    # Create embedding
+    result = genai.embed_content(
+        model=model,
+        content=chunk,
+    )
+
+    return result['embedding']
+
+# Open file
+FILE_PATH = "../../../data/transcripts/ai.txt"
+with open(FILE_PATH, 'r') as f:
+
+    raw_data = f.read()  # read the whole file
+
+    # Split the data into chunks of 500 characters
+    chunks = [raw_data[i:i+500] for i in range(0, len(raw_data), 500)]
+
+    embeddings = {}  # dictionary mapping each raw chunk to its embedding
+
+    # Embed each chunk
+    print("Creating embeddings...")
+    for chunk in chunks:
+        embedding = embed_chunk(chunk)
+        embeddings[chunk] = embedding
+
+    # Save: write the embeddings to a file named 'embeddings.jsonl'
+    print("Writing embeddings to file...")
+    with open('embeddings.jsonl', 'w') as f:
+        for chunk, embedding in embeddings.items():
+            f.write(json.dumps({"text": chunk, "embedding": embedding}) + '\n')
+
+    print('Written to file.')
\ No newline at end of file

From 4bd76b9bd7d66d6eaecec6772cfa1262e8b785be Mon Sep 17 00:00:00 2001
From: Ryan Liu <78334320+lamld203844@users.noreply.github.com>
Date: Thu, 16 May 2024 14:20:18 +0000
Subject: [PATCH 2/3] Complete embeddings with Gemini API

---
 .env.example                                       | 5 -----
 .gitignore                                         | 3 ++-
 examples/embeddings/google/01_create_embeddings.py | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)
 delete mode 100644 .env.example

diff --git a/.env.example b/.env.example
deleted file mode 100644
index 807ab3c..0000000
--- a/.env.example
+++ /dev/null
@@ -1,5 +0,0 @@
-CLAUDE_API_KEY=TODO # https://console.anthropic.com/settings/keys
-OPENAI_API_KEY=TODO # https://platform.openai.com/api-keys
-GEMINI_API_KEY=TODO # https://aistudio.google.com/app/apikey
-
-# After you've added your API keys to this file, rename .env.example to .env
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 315b4b2..f585fca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.env
 *.jsonl
 *.vsix
-demo/
\ No newline at end of file
+demo/
+*/playground.ipynb
\ No newline at end of file
diff --git a/examples/embeddings/google/01_create_embeddings.py b/examples/embeddings/google/01_create_embeddings.py
index b27bfee..1b2cf49 100644
--- a/examples/embeddings/google/01_create_embeddings.py
+++ b/examples/embeddings/google/01_create_embeddings.py
@@ -15,9 +15,9 @@
 # ==========================================
 
 # helper function
-def embed_chunk(chunk, model="models/embedding-001"):
+def get_embedding(chunk, model="models/embedding-001"):
     '''
     Embed the input text into a vector representation.
     args:
         chunk (str): raw input data string
         model (str): gemini embedding model name
@@ -46,7 +46,7 @@ def embed_chunk(chunk, model="models/embedding-001"):
     # Embed each chunk
     print("Creating embeddings...")
     for chunk in chunks:
-        embedding = embed_chunk(chunk)
+        embedding = get_embedding(chunk)
         embeddings[chunk] = embedding
 
     # Save: write the embeddings to a file named 'embeddings.jsonl'

From 6ed9ff49f341c1d8cc055b678a2868a6ed92e153 Mon Sep 17 00:00:00 2001
From: Ryan Liu <78334320+lamld203844@users.noreply.github.com>
Date: Thu, 16 May 2024 14:20:40 +0000
Subject: [PATCH 3/3] Complete embeddings with Gemini API

---
 .../embeddings/google/02_search_embeddings.py | 76 ++++++++++++++
 .../03_qa_with_embeddings_based_search.py     | 99 +++++++++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 examples/embeddings/google/02_search_embeddings.py
 create mode 100644 examples/embeddings/google/03_qa_with_embeddings_based_search.py

diff --git a/examples/embeddings/google/02_search_embeddings.py b/examples/embeddings/google/02_search_embeddings.py
new file mode 100644
index 0000000..914c588
--- /dev/null
+++ b/examples/embeddings/google/02_search_embeddings.py
@@ -0,0 +1,76 @@
+import json
+import numpy as np
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+import google.generativeai as genai
+
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")  # note: unused here; embeddings use genai.embed_content
+
+def get_embedding(chunk, model="models/embedding-001"):
+    '''
+    Embed the input text into a vector representation.
+    args:
+        chunk (str): raw input data string
+        model (str): gemini embedding model name
+    return:
+        embedding (list): vector representation
+    '''
+    # Create embedding
+    result = genai.embed_content(
+        model=model,
+        content=chunk,
+    )
+
+    return result['embedding']
+
+# Open the 'embeddings.jsonl' file in read mode
+with open('embeddings.jsonl', 'r') as f:
+
+    # Read all lines from the file
+    lines = f.readlines()
+
+    # Initialize a dictionary to hold the embeddings
+    embeddings = {}
+
+    # Loop through each line in the file
+    for line in lines:
+
+        # Parse the JSON object in the line
+        line = json.loads(line)
+
+        # Map the text chunk to its corresponding embedding in the embeddings dictionary
+        embeddings[line['text']] = line['embedding']
+
+# Prompt the user to enter a query
+query = input("Enter a query: ")
+
+# Get the embedding for the query
+query_embedding = get_embedding(query)
+
+# Initialize variables to track the best matching chunk and its score
+best_chunk = None
+best_score = float("-inf")
+
+# Loop through each chunk and its embedding in the embeddings dictionary
+for chunk, embedding in embeddings.items():
+
+    # Compute the similarity score as the dot product of the embedding vectors
+    score = np.dot(embedding, query_embedding)
+
+    # If this score is better than the best score found so far,
+    # update best_chunk and best_score with the current chunk and score
+    if score > best_score:
+        best_chunk = chunk
+        best_score = score
+
+    # Note: if the embedding vectors are normalized to unit length, then:
+    # cosine similarity can be computed slightly faster using just a dot product,
+    # and cosine similarity and Euclidean distance will produce identical rankings.
+    # See e.g. https://help.openai.com/en/articles/6824809-embeddings-frequently-asked-questions
+
+# Print the chunk that is most similar to the query
+print(best_chunk)
diff --git a/examples/embeddings/google/03_qa_with_embeddings_based_search.py b/examples/embeddings/google/03_qa_with_embeddings_based_search.py
new file mode 100644
index 0000000..78d1d47
--- /dev/null
+++ b/examples/embeddings/google/03_qa_with_embeddings_based_search.py
@@ -0,0 +1,99 @@
+import json
+import numpy as np
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+import google.generativeai as genai
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+
+# ============ Prompt query + add system prompt ===========================
+
+# System prompt that sets the context for the chat completion API call
+system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat."
+
+# Prompt the user for their query
+user_query = input("User: ")
+
+# ================ Search embeddings (Retrieve) =====================================
+# Helper: embed a piece of text (used below for the user's query)
+def get_embedding(chunk, model="models/embedding-001"):
+    '''
+    Embed the input text into a vector representation.
+    args:
+        chunk (str): raw input data string
+        model (str): gemini embedding model name
+    return:
+        embedding (list): vector representation
+    '''
+    # Create embedding
+    result = genai.embed_content(
+        model=model,
+        content=chunk,
+    )
+
+    return result['embedding']
+
+def load_embeddings():
+    # Open the 'embeddings.jsonl' file in read mode to load pre-computed embeddings
+    with open('embeddings.jsonl', 'r') as f:
+
+        # Read all lines from the file
+        lines = f.readlines()
+
+        # Initialize a dictionary to store the embeddings
+        embeddings = {}
+
+        # Loop through each line in the file, assuming each line is a JSON object
+        for line in lines:
+
+            # Parse the JSON object from the line
+            line = json.loads(line)
+
+            # Store the text chunk and its corresponding embedding in the dictionary
+            embeddings[line['text']] = line['embedding']
+
+    return embeddings
+
+
+query_embedding = get_embedding(user_query)
+
+# Initialize variables to track the best matching chunk and its similarity score
+best_chunk = None
+best_score = float("-inf")
+
+# Loop through each chunk and its embedding in the embeddings dictionary
+embeddings = load_embeddings()
+for chunk, embedding in embeddings.items():
+
+    # Compute the similarity score as the dot product of the query embedding and the chunk's embedding
+    score = np.dot(embedding, query_embedding)
+
+    # Update best_chunk and best_score if the current score is higher
+    if score > best_score:
+        best_chunk = chunk
+        best_score = score
+
+# ========= Augment prompt ===========================================================================
+# Prepare the prompt for the chat completion by including the best matching chunk and the user's query
+prompt = "Answer the question using the following information delimited by triple backticks:\n\n"
+prompt += f"```\n{best_chunk}\n```"
+prompt += "\nQuestion: " + user_query
+prompt += "\nDo not say 'based on the information provided' or anything similar."
+
+
+# ========================= Response ================================================
+# Generate a response using the Gemini API with the prepared prompt and system context
+model = genai.GenerativeModel(
+    model_name="gemini-1.5-pro-latest",
+    system_instruction=system_prompt
+)
+
+messages = [
+    {'role': 'user', 'parts': [prompt]},
+]
+generated_content = model.generate_content(messages)
+response_text = generated_content.candidates[0].content.parts[0].text.strip()
+
+print(f"Assistant: {response_text}")
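
Note on 01_create_embeddings.py: the loop embeds the chunks one request at a time. If the installed
google-generativeai version supports batching (recent releases accept a list of strings as the
content argument), the chunks can be embedded in a single call. A sketch under that assumption,
reusing the same GEMINI_API_KEY setup as the scripts above:

import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

chunks = ["first chunk of text", "second chunk of text"]

# Batch request: with a list input, result['embedding'] is a list of vectors,
# one per input chunk (assumes SDK support for batched embed_content).
result = genai.embed_content(model="models/embedding-001", content=chunks)
for chunk, vector in zip(chunks, result["embedding"]):
    print(chunk[:20], len(vector))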
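Note on the similarity scoring in 02_search_embeddings.py and 03_qa_with_embeddings_based_search.py:
both scripts rank chunks by a raw dot product, which matches cosine similarity only when the
embedding vectors are unit-length. A minimal sketch of an explicitly normalized comparison
(cosine_similarity here is an illustrative helper, not part of these patches):

import numpy as np

def cosine_similarity(a, b):
    # Explicit cosine similarity: correct even if the vectors are not unit-length.
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Toy example: rank two chunk embeddings against a query embedding.
query = np.array([0.6, 0.8])
chunks = {"cat": np.array([0.59, 0.81]), "dog": np.array([0.9, 0.1])}
best = max(chunks, key=lambda k: cosine_similarity(chunks[k], query))
print(best)  # -> cat

If the model's embeddings are already normalized, np.dot alone yields the same ranking slightly faster,
which is what the patched scripts rely on.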