From 1d792a37821456f98781694f695d36072023a27f Mon Sep 17 00:00:00 2001
From: Ryan Liu <78334320+lamld203844@users.noreply.github.com>
Date: Mon, 13 May 2024 10:35:18 +0000
Subject: [PATCH 1/3] Create embeddings with Gemini API part 1

---
 .../embeddings/google/00_embedding_example.py | 22 +++++++
 .../embeddings/google/01_create_embeddings.py | 58 +++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 examples/embeddings/google/00_embedding_example.py
 create mode 100644 examples/embeddings/google/01_create_embeddings.py

diff --git a/examples/embeddings/google/00_embedding_example.py b/examples/embeddings/google/00_embedding_example.py
new file mode 100644
index 0000000..485a91f
--- /dev/null
+++ b/examples/embeddings/google/00_embedding_example.py
@@ -0,0 +1,22 @@
+import numpy as np
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+# Import Gemini module
+import google.generativeai as genai
+
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")  # note: unused here; embeddings use genai.embed_content
+
+# Create embedding
+result = genai.embed_content(
+    model="models/embedding-001",
+    content="cat",
+)
+
+# Print the embedding and its shape
+embedding = result['embedding']
+print(embedding)
+print(f"Shape: {np.array(embedding).shape}")
\ No newline at end of file
diff --git a/examples/embeddings/google/01_create_embeddings.py b/examples/embeddings/google/01_create_embeddings.py
new file mode 100644
index 0000000..b27bfee
--- /dev/null
+++ b/examples/embeddings/google/01_create_embeddings.py
@@ -0,0 +1,58 @@
+import json
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+# Import Gemini module
+import google.generativeai as genai
+
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")  # note: unused here; embeddings use genai.embed_content
+
+# ==========================================
+# Embed a given file, i.e. the whole ai.txt file
+# ==========================================
+
+# helper function
+def embed_chunk(chunk, model="models/embedding-001"):
+    '''
+    Embed the input text into a vector representation.
+    args:
+        chunk (str): raw input data string
+        model (str): gemini embedding model name
+    return:
+        embedding (list): vector representation
+    '''
+    # Create embedding
+    result = genai.embed_content(
+        model=model,
+        content=chunk,
+    )
+
+    return result['embedding']
+
+# Open file
+FILE_PATH = "../../../data/transcripts/ai.txt"
+with open(FILE_PATH, 'r') as f:
+
+    raw_data = f.read()  # read the whole file
+
+    # Split the data into chunks of 500 characters
+    chunks = [raw_data[i:i+500] for i in range(0, len(raw_data), 500)]
+
+    embeddings = {}  # dictionary mapping each raw chunk to its embedding
+
+    # Embed each chunk
+    print("Creating embeddings...")
+    for chunk in chunks:
+        embedding = embed_chunk(chunk)
+        embeddings[chunk] = embedding
+
+    # Save: write the embeddings to a file named 'embeddings.jsonl'
+    print("Writing embeddings to file...")
+    with open('embeddings.jsonl', 'w') as f:
+        for chunk, embedding in embeddings.items():
+            f.write(json.dumps({"text": chunk, "embedding": embedding}) + '\n')
+
+    print('Written to file.')
\ No newline at end of file

From 4bd76b9bd7d66d6eaecec6772cfa1262e8b785be Mon Sep 17 00:00:00 2001
From: Ryan Liu <78334320+lamld203844@users.noreply.github.com>
Date: Thu, 16 May 2024 14:20:18 +0000
Subject: [PATCH 2/3] Complete embeddings with Gemini API

---
 .env.example                                       | 5 -----
 .gitignore                                         | 3 ++-
 examples/embeddings/google/01_create_embeddings.py | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)
 delete mode 100644 .env.example

diff --git a/.env.example b/.env.example
deleted file mode 100644
index 807ab3c..0000000
--- a/.env.example
+++ /dev/null
@@ -1,5 +0,0 @@
-CLAUDE_API_KEY=TODO # https://console.anthropic.com/settings/keys
-OPENAI_API_KEY=TODO # https://platform.openai.com/api-keys
-GEMINI_API_KEY=TODO # https://aistudio.google.com/app/apikey
-
-# After you've added your API keys to this file, rename .env.example to .env
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 315b4b2..f585fca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *.env
 *.jsonl
 *.vsix
-demo/
\ No newline at end of file
+demo/
+*/playground.ipynb
\ No newline at end of file
diff --git a/examples/embeddings/google/01_create_embeddings.py b/examples/embeddings/google/01_create_embeddings.py
index b27bfee..1b2cf49 100644
--- a/examples/embeddings/google/01_create_embeddings.py
+++ b/examples/embeddings/google/01_create_embeddings.py
@@ -15,9 +15,9 @@
 # ==========================================
 
 # helper function
-def embed_chunk(chunk, model="models/embedding-001"):
+def get_embedding(chunk, model="models/embedding-001"):
     '''
     Embed the input text into a vector representation.
     args:
         chunk (str): raw input data string
         model (str): gemini embedding model name
@@ -46,7 +46,7 @@ def embed_chunk(chunk, model="models/embedding-001"):
     # Embed each chunk
     print("Creating embeddings...")
     for chunk in chunks:
-        embedding = embed_chunk(chunk)
+        embedding = get_embedding(chunk)
         embeddings[chunk] = embedding
 
     # Save: write the embeddings to a file named 'embeddings.jsonl'

From 6ed9ff49f341c1d8cc055b678a2868a6ed92e153 Mon Sep 17 00:00:00 2001
From: Ryan Liu <78334320+lamld203844@users.noreply.github.com>
Date: Thu, 16 May 2024 14:20:40 +0000
Subject: [PATCH 3/3] Complete embeddings with Gemini API

---
 .../embeddings/google/02_search_embeddings.py | 76 ++++++++++++++
 .../03_qa_with_embeddings_based_search.py     | 99 +++++++++++++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 examples/embeddings/google/02_search_embeddings.py
 create mode 100644 examples/embeddings/google/03_qa_with_embeddings_based_search.py

diff --git a/examples/embeddings/google/02_search_embeddings.py b/examples/embeddings/google/02_search_embeddings.py
new file mode 100644
index 0000000..914c588
--- /dev/null
+++ b/examples/embeddings/google/02_search_embeddings.py
@@ -0,0 +1,76 @@
+import json
+import numpy as np
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+import google.generativeai as genai
+
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")  # note: unused here; embeddings use genai.embed_content
+
+def get_embedding(chunk, model="models/embedding-001"):
+    '''
+    Embed the input text into a vector representation.
+    args:
+        chunk (str): raw input data string
+        model (str): gemini embedding model name
+    return:
+        embedding (list): vector representation
+    '''
+    # Create embedding
+    result = genai.embed_content(
+        model=model,
+        content=chunk,
+    )
+
+    return result['embedding']
+
+# Open the 'embeddings.jsonl' file in read mode
+with open('embeddings.jsonl', 'r') as f:
+
+    # Read all lines from the file
+    lines = f.readlines()
+
+    # Initialize a dictionary to hold the embeddings
+    embeddings = {}
+
+    # Loop through each line in the file
+    for line in lines:
+
+        # Parse the JSON object in the line
+        line = json.loads(line)
+
+        # Map the text chunk to its corresponding embedding in the embeddings dictionary
+        embeddings[line['text']] = line['embedding']
+
+# Prompt the user to enter a query
+query = input("Enter a query: ")
+
+# Get the embedding for the query
+query_embedding = get_embedding(query)
+
+# Initialize variables to track the best matching chunk and its score
+best_chunk = None
+best_score = float("-inf")
+
+# Loop through each chunk and its embedding in the embeddings dictionary
+for chunk, embedding in embeddings.items():
+
+    # Compute the similarity score as the dot product of the embedding vectors
+    score = np.dot(embedding, query_embedding)
+
+    # If this score is better than the best score found so far,
+    # update best_chunk and best_score with the current chunk and score
+    if score > best_score:
+        best_chunk = chunk
+        best_score = score
+
+    # Note: if the embedding vectors are normalized to unit length, then:
+    # cosine similarity can be computed slightly faster using just a dot product,
+    # and cosine similarity and Euclidean distance will produce identical rankings.
+    # See e.g. https://help.openai.com/en/articles/6824809-embeddings-frequently-asked-questions
+
+# Print the chunk that is most similar to the query
+print(best_chunk)
diff --git a/examples/embeddings/google/03_qa_with_embeddings_based_search.py b/examples/embeddings/google/03_qa_with_embeddings_based_search.py
new file mode 100644
index 0000000..78d1d47
--- /dev/null
+++ b/examples/embeddings/google/03_qa_with_embeddings_based_search.py
@@ -0,0 +1,99 @@
+import json
+import numpy as np
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+import google.generativeai as genai
+# Initialize Gemini client
+genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+
+# ============ Prompt query + add system prompt ===========================
+
+# System prompt that sets the context for the chat completion API call
+system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat."
+
+# Prompt the user for their query
+user_query = input("User: ")
+
+# ================ Search embeddings (Retrieve) =====================================
+# Helper: embed a piece of text (used below for the user's query)
+def get_embedding(chunk, model="models/embedding-001"):
+    '''
+    Embed the input text into a vector representation.
+    args:
+        chunk (str): raw input data string
+        model (str): gemini embedding model name
+    return:
+        embedding (list): vector representation
+    '''
+    # Create embedding
+    result = genai.embed_content(
+        model=model,
+        content=chunk,
+    )
+
+    return result['embedding']
+
+def load_embeddings():
+    # Open the 'embeddings.jsonl' file in read mode to load pre-computed embeddings
+    with open('embeddings.jsonl', 'r') as f:
+
+        # Read all lines from the file
+        lines = f.readlines()
+
+        # Initialize a dictionary to store the embeddings
+        embeddings = {}
+
+        # Loop through each line in the file, assuming each line is a JSON object
+        for line in lines:
+
+            # Parse the JSON object from the line
+            line = json.loads(line)
+
+            # Store the text chunk and its corresponding embedding in the dictionary
+            embeddings[line['text']] = line['embedding']
+
+    return embeddings
+
+
+query_embedding = get_embedding(user_query)
+
+# Initialize variables to track the best matching chunk and its similarity score
+best_chunk = None
+best_score = float("-inf")
+
+# Loop through each chunk and its embedding in the embeddings dictionary
+embeddings = load_embeddings()
+for chunk, embedding in embeddings.items():
+
+    # Compute the similarity score as the dot product of the query embedding and the chunk's embedding
+    score = np.dot(embedding, query_embedding)
+
+    # Update best_chunk and best_score if the current score is higher
+    if score > best_score:
+        best_chunk = chunk
+        best_score = score
+
+# ========= Augment prompt ===========================================================================
+# Prepare the prompt for the chat completion by including the best matching chunk and the user's query
+prompt = "Answer the question using the following information delimited by triple backticks:\n\n"
+prompt += f"```\n{best_chunk}\n```"
+prompt += "\nQuestion: " + user_query
+prompt += "\nDo not say 'based on the information provided' or anything similar."
+
+
+# ========================= Response ================================================
+# Generate a response using the Gemini API with the prepared prompt and system context
+model = genai.GenerativeModel(
+    model_name="gemini-1.5-pro-latest",
+    system_instruction=system_prompt
+)
+
+messages = [
+    {'role': 'user', 'parts': [prompt]},
+]
+generated_content = model.generate_content(messages)
+response_text = generated_content.candidates[0].content.parts[0].text.strip()
+
+print(f"Assistant: {response_text}")
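
Note on 01_create_embeddings.py: the loop embeds the chunks one request at a time. If the installed
google-generativeai version supports batching (recent releases accept a list of strings as the
content argument), the chunks can be embedded in a single call. A sketch under that assumption,
reusing the same GEMINI_API_KEY setup as the scripts above:

import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

chunks = ["first chunk of text", "second chunk of text"]

# Batch request: with a list input, result['embedding'] is a list of vectors,
# one per input chunk (assumes SDK support for batched embed_content).
result = genai.embed_content(model="models/embedding-001", content=chunks)
for chunk, vector in zip(chunks, result["embedding"]):
    print(chunk[:20], len(vector))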
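Note on the similarity scoring in 02_search_embeddings.py and 03_qa_with_embeddings_based_search.py:
both scripts rank chunks by a raw dot product, which matches cosine similarity only when the
embedding vectors are unit-length. A minimal sketch of an explicitly normalized comparison
(cosine_similarity here is an illustrative helper, not part of these patches):

import numpy as np

def cosine_similarity(a, b):
    # Explicit cosine similarity: correct even if the vectors are not unit-length.
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Toy example: rank two chunk embeddings against a query embedding.
query = np.array([0.6, 0.8])
chunks = {"cat": np.array([0.59, 0.81]), "dog": np.array([0.9, 0.1])}
best = max(chunks, key=lambda k: cosine_similarity(chunks[k], query))
print(best)  # -> cat

If the model's embeddings are already normalized, np.dot alone yields the same ranking slightly faster,
which is what the patched scripts rely on.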