
Add embeddings with Gemini API #1

Open · wants to merge 4 commits into main
5 changes: 0 additions & 5 deletions .env.example

This file was deleted.

3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
*.env
*.jsonl
*.vsix
demo/
demo/
*/playground.ipynb
22 changes: 22 additions & 0 deletions examples/embeddings/google/00_embedding_example.py
@@ -0,0 +1,22 @@
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

# Import Gemini module
import google.generativeai as genai

# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create embedding
result = genai.embed_content(
    model="models/embedding-001",
    content="cat",
)

# Print the embedding and its dimensionality
embedding = result['embedding']
print(embedding)
print(f"Shape: {np.array(embedding).shape}")
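A hedged suggestion on this example: embed_content also accepts a task_type hint in recent versions of google-generativeai, and tagging documents and queries differently can improve retrieval quality in the later scripts. A minimal sketch, assuming that argument is available and reusing the client configured above:

# Sketch, not part of the PR: assumes this google-generativeai version
# supports embed_content's task_type argument.
doc_result = genai.embed_content(
    model="models/embedding-001",
    content="cat",
    task_type="retrieval_document",  # "retrieval_query" is the search-time counterpart
)
print(f"Shape: {np.array(doc_result['embedding']).shape}")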
58 changes: 58 additions & 0 deletions examples/embeddings/google/01_create_embeddings.py
@@ -0,0 +1,58 @@
import json
import os
from dotenv import load_dotenv
load_dotenv()

# Import Gemini module
import google.generativeai as genai

# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# ==========================================
# Embed a given file, i.e. the whole ai.txt file
# ==========================================

# Helper function
def get_embedding(chunk, model="models/embedding-001"):
    '''
    Embed an input string into a vector representation.
    args:
        chunk (str): raw input data string
        model (str): Gemini embedding model name
    return:
        embedding (list): vector representation
    '''
    # Create embedding
    result = genai.embed_content(
        model=model,
        content=chunk,
    )

    return result['embedding']

# Open the file and read its whole contents
FILE_PATH = "../../../data/transcripts/ai.txt"
with open(FILE_PATH, 'r') as f:
    raw_data = f.read()

# Split the data into chunks of 500 chars
chunks = [raw_data[i:i+500] for i in range(0, len(raw_data), 500)]

embeddings = {}  # dictionary mapping each raw chunk to its corresponding embedding

# Embed each chunk in chunks
print("Creating embeddings...")
for chunk in chunks:
    embedding = get_embedding(chunk)
    embeddings[chunk] = embedding

# Save: write one JSON object per line to 'embeddings.jsonl'
print("Writing embeddings to file...")
with open('embeddings.jsonl', 'w') as f:
    for chunk, embedding in embeddings.items():
        f.write(json.dumps({"text": chunk, "embedding": embedding}) + '\n')

print('Written to file.')
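A note on the embedding loop above: each chunk costs one API round trip. If the installed SDK accepts a list of strings for content (in which case result['embedding'] is a list of vectors, one per input), the chunks could be embedded in batches instead; a minimal sketch under that assumption:

# Sketch, not part of the PR: batch variant of the per-chunk loop above.
# Assumes embed_content accepts a list of strings and returns one vector per input.
batch_result = genai.embed_content(
    model="models/embedding-001",
    content=chunks,
)
embeddings = dict(zip(chunks, batch_result['embedding']))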
76 changes: 76 additions & 0 deletions examples/embeddings/google/02_search_embeddings.py
@@ -0,0 +1,76 @@
import json
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

import google.generativeai as genai

# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

def get_embedding(chunk, model="models/embedding-001"):
    '''
    Embed an input string into a vector representation.
    args:
        chunk (str): raw input data string
        model (str): Gemini embedding model name
    return:
        embedding (list): vector representation
    '''
    # Create embedding
    result = genai.embed_content(
        model=model,
        content=chunk,
    )

    return result['embedding']

# Open the 'embeddings.jsonl' file in read mode
with open('embeddings.jsonl', 'r') as f:
    # Read all lines from the file
    lines = f.readlines()

# Initialize a dictionary to load the embeddings into
embeddings = {}

# Loop through each line in the file
for line in lines:
    # Parse the JSON object in the line
    line = json.loads(line)

    # Map the text chunk to its corresponding embedding in the embeddings dictionary
    embeddings[line['text']] = line['embedding']

# Prompt the user to enter a query
query = input("Enter a query: ")

# Get the embedding for the query
query_embedding = get_embedding(query)

# Initialize variables to track the best matching chunk and its score
best_chunk = None
best_score = float("-inf")

# Loop through each chunk and its embedding in the embeddings dictionary
for chunk, embedding in embeddings.items():
    # Compute the similarity score as the dot product of the embedding vectors
    score = np.dot(embedding, query_embedding)

    # If this score is better than the best score found so far,
    # update the best_chunk and best_score with the current chunk and score
    if score > best_score:
        best_chunk = chunk
        best_score = score

# Note: a dot product equals cosine similarity only for vectors normalized to
# length 1. OpenAI embeddings are normalized that way, which is why for them the
# dot product is a faster cosine similarity with identical rankings; if the
# Gemini embeddings used here are not unit length, normalize first (see the
# cosine_similarity sketch after this file).
# https://help.openai.com/en/articles/6824809-embeddings-frequently-asked-questions

# Print the chunk that is most similar to the query
print(best_chunk)
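If the Gemini vectors turn out not to be unit length, an explicit cosine similarity keeps the ranking independent of vector magnitude. A minimal numpy sketch, not part of the PR, that could replace the raw dot product above:

def cosine_similarity(a, b):
    # Cosine similarity without assuming unit-length vectors
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Usage in the loop above:
# score = cosine_similarity(embedding, query_embedding)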
99 changes: 99 additions & 0 deletions examples/embeddings/google/03_qa_with_embeddings_based_search.py
@@ -0,0 +1,99 @@
import json
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

import google.generativeai as genai
# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# ============ Prompt query + add system prompt ===========================

# System prompt that sets the context for the chat completion API call
system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat."

# Prompt the user for their query
user_query = input("User: ")

# ================ Search embeddings (Retrieve) =====================================
# Helper to embed an input string (same as in the previous scripts)
def get_embedding(chunk, model="models/embedding-001"):
    '''
    Embed an input string into a vector representation.
    args:
        chunk (str): raw input data string
        model (str): Gemini embedding model name
    return:
        embedding (list): vector representation
    '''
    # Create embedding
    result = genai.embed_content(
        model=model,
        content=chunk,
    )

    return result['embedding']

def load_embeddings():
    # Open the 'embeddings.jsonl' file in read mode to load pre-computed embeddings
    with open('embeddings.jsonl', 'r') as f:
        # Read all lines from the file
        lines = f.readlines()

    # Initialize a dictionary to store the embeddings
    embeddings = {}

    # Loop through each line in the file, assuming each line is a JSON object
    for line in lines:
        # Parse the JSON object from the line
        line = json.loads(line)

        # Store the text chunk and its corresponding embedding in the dictionary
        embeddings[line['text']] = line['embedding']

    return embeddings


# Get the embedding for the user's query
query_embedding = get_embedding(user_query)

# Initialize variables to track the best matching chunk and its similarity score
best_chunk = None
best_score = float("-inf")

# Loop through each chunk and its embedding in the embeddings dictionary
embeddings = load_embeddings()
for chunk, embedding in embeddings.items():
    # Compute the similarity score as the dot product of the query embedding and the chunk's embedding
    score = np.dot(embedding, query_embedding)

    # Update the best_chunk and best_score if the current score is higher
    if score > best_score:
        best_chunk = chunk
        best_score = score

# ========= Augment prompt ==========================================================
# Prepare the prompt for the chat completion by including the best matching chunk and the user's query
prompt = "Answer the question using the following information delimited by triple backticks:\n\n"
prompt += f"```\n{best_chunk}\n```"
prompt += "\nQuestion: " + user_query
prompt += "\nDo not mention that your answer is based on the provided information."


# ========================= Response ================================================
# Generate a response using the Gemini API with the prepared prompt and system context
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    system_instruction=system_prompt,
)

messages = [
    {'role': 'user', 'parts': [prompt]},
]
generated_content = model.generate_content(messages)
response_text = generated_content.candidates[0].content.parts[0].text.strip()

print(f"Assistant: {response_text}")
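Since an answer can span chunk boundaries, retrieving only the single best chunk may drop useful context. A hypothetical top-k variant (retrieve_top_k and top_k are illustrative names, not part of this PR) that could feed several chunks into the prompt instead:

def retrieve_top_k(embeddings, query_embedding, top_k=3):
    # Rank all chunks by dot-product similarity and keep the best top_k
    scored = [(np.dot(embedding, query_embedding), chunk)
              for chunk, embedding in embeddings.items()]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in scored[:top_k]]

# Usage: join the top chunks into the context passed to the prompt
# context = "\n\n".join(retrieve_top_k(embeddings, query_embedding))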