
Add embeddings with Gemini API #1

Open · wants to merge 4 commits into main
5 changes: 0 additions & 5 deletions .env.example

This file was deleted.

3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
*.env
*.jsonl
*.vsix
demo/
demo/
*/playground.ipynb
22 changes: 22 additions & 0 deletions examples/embeddings/google/00_embedding_example.py
@@ -0,0 +1,22 @@
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

# Import Gemini module
import google.generativeai as genai

# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create embedding
result = genai.embed_content(
    model="models/embedding-001",
    content="cat",
)

# Print the embedding and its dimensionality
embedding = result['embedding']
print(embedding)
print(f"Shape: {np.array(embedding).shape}")
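A hedged suggestion on this example: embed_content also accepts a task_type hint in recent versions of google-generativeai, and tagging documents and queries differently can improve retrieval quality in the later scripts. A minimal sketch, assuming that argument is available and reusing the client configured above:

# Sketch, not part of the PR: assumes this google-generativeai version
# supports embed_content's task_type argument.
doc_result = genai.embed_content(
    model="models/embedding-001",
    content="cat",
    task_type="retrieval_document",  # "retrieval_query" is the search-time counterpart
)
print(f"Shape: {np.array(doc_result['embedding']).shape}")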
58 changes: 58 additions & 0 deletions examples/embeddings/google/01_create_embeddings.py
@@ -0,0 +1,58 @@
import json
import os
from dotenv import load_dotenv
load_dotenv()

# Import Gemini module
import google.generativeai as genai

# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# ==========================================
# Embed a given file, i.e. the whole ai.txt file
# ==========================================

# Helper function
def get_embedding(chunk, model="models/embedding-001"):
    '''
    Embed an input string into a vector representation.
    args:
        chunk (str): raw input data string
        model (str): Gemini embedding model name
    return:
        embedding (list): vector representation
    '''
    # Create embedding
    result = genai.embed_content(
        model=model,
        content=chunk,
    )

    return result['embedding']

# Open the file and read its whole contents
FILE_PATH = "../../../data/transcripts/ai.txt"
with open(FILE_PATH, 'r') as f:
    raw_data = f.read()

# Split the data into chunks of 500 chars
chunks = [raw_data[i:i+500] for i in range(0, len(raw_data), 500)]

embeddings = {}  # dictionary mapping each raw chunk to its corresponding embedding

# Embed each chunk in chunks
print("Creating embeddings...")
for chunk in chunks:
    embedding = get_embedding(chunk)
    embeddings[chunk] = embedding

# Save: write one JSON object per line to 'embeddings.jsonl'
print("Writing embeddings to file...")
with open('embeddings.jsonl', 'w') as f:
    for chunk, embedding in embeddings.items():
        f.write(json.dumps({"text": chunk, "embedding": embedding}) + '\n')

print('Written to file.')
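A note on the embedding loop above: each chunk costs one API round trip. If the installed SDK accepts a list of strings for content (in which case result['embedding'] is a list of vectors, one per input), the chunks could be embedded in batches instead; a minimal sketch under that assumption:

# Sketch, not part of the PR: batch variant of the per-chunk loop above.
# Assumes embed_content accepts a list of strings and returns one vector per input.
batch_result = genai.embed_content(
    model="models/embedding-001",
    content=chunks,
)
embeddings = dict(zip(chunks, batch_result['embedding']))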
76 changes: 76 additions & 0 deletions examples/embeddings/google/02_search_embeddings.py
@@ -0,0 +1,76 @@
import json
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

import google.generativeai as genai

# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

def get_embedding(chunk, model="models/embedding-001"):
    '''
    Embed an input string into a vector representation.
    args:
        chunk (str): raw input data string
        model (str): Gemini embedding model name
    return:
        embedding (list): vector representation
    '''
    # Create embedding
    result = genai.embed_content(
        model=model,
        content=chunk,
    )

    return result['embedding']

# Open the 'embeddings.jsonl' file in read mode
with open('embeddings.jsonl', 'r') as f:
    # Read all lines from the file
    lines = f.readlines()

# Initialize a dictionary to load the embeddings into
embeddings = {}

# Loop through each line in the file
for line in lines:
    # Parse the JSON object in the line
    line = json.loads(line)

    # Map the text chunk to its corresponding embedding in the embeddings dictionary
    embeddings[line['text']] = line['embedding']

# Prompt the user to enter a query
query = input("Enter a query: ")

# Get the embedding for the query
query_embedding = get_embedding(query)

# Initialize variables to track the best matching chunk and its score
best_chunk = None
best_score = float("-inf")

# Loop through each chunk and its embedding in the embeddings dictionary
for chunk, embedding in embeddings.items():
    # Compute the similarity score as the dot product of the embedding vectors
    score = np.dot(embedding, query_embedding)

    # If this score is better than the best score found so far,
    # update the best_chunk and best_score with the current chunk and score
    if score > best_score:
        best_chunk = chunk
        best_score = score

# Note: a dot product equals cosine similarity only for vectors normalized to
# length 1. OpenAI embeddings are normalized that way, which is why for them the
# dot product is a faster cosine similarity with identical rankings; if the
# Gemini embeddings used here are not unit length, normalize first (see the
# cosine_similarity sketch after this file).
# https://help.openai.com/en/articles/6824809-embeddings-frequently-asked-questions

# Print the chunk that is most similar to the query
print(best_chunk)
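If the Gemini vectors turn out not to be unit length, an explicit cosine similarity keeps the ranking independent of vector magnitude. A minimal numpy sketch, not part of the PR, that could replace the raw dot product above:

def cosine_similarity(a, b):
    # Cosine similarity without assuming unit-length vectors
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Usage in the loop above:
# score = cosine_similarity(embedding, query_embedding)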
99 changes: 99 additions & 0 deletions examples/embeddings/google/03_qa_with_embeddings_based_search.py
@@ -0,0 +1,99 @@
import json
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

import google.generativeai as genai
# Initialize Gemini client
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# ============ Prompt query + add system prompt ===========================

# System prompt that sets the context for the chat completion API call
system_prompt = "You are a friendly and supportive teaching assistant for CS50. You are also a cat."

# Prompt the user for their query
user_query = input("User: ")

# ================ Search embeddings (Retrieve) =====================================
# Helper to embed an input string (same as in the previous scripts)
def get_embedding(chunk, model="models/embedding-001"):
    '''
    Embed an input string into a vector representation.
    args:
        chunk (str): raw input data string
        model (str): Gemini embedding model name
    return:
        embedding (list): vector representation
    '''
    # Create embedding
    result = genai.embed_content(
        model=model,
        content=chunk,
    )

    return result['embedding']

def load_embeddings():
    # Open the 'embeddings.jsonl' file in read mode to load pre-computed embeddings
    with open('embeddings.jsonl', 'r') as f:
        # Read all lines from the file
        lines = f.readlines()

    # Initialize a dictionary to store the embeddings
    embeddings = {}

    # Loop through each line in the file, assuming each line is a JSON object
    for line in lines:
        # Parse the JSON object from the line
        line = json.loads(line)

        # Store the text chunk and its corresponding embedding in the dictionary
        embeddings[line['text']] = line['embedding']

    return embeddings


# Get the embedding for the user's query
query_embedding = get_embedding(user_query)

# Initialize variables to track the best matching chunk and its similarity score
best_chunk = None
best_score = float("-inf")

# Loop through each chunk and its embedding in the embeddings dictionary
embeddings = load_embeddings()
for chunk, embedding in embeddings.items():
    # Compute the similarity score as the dot product of the query embedding and the chunk's embedding
    score = np.dot(embedding, query_embedding)

    # Update the best_chunk and best_score if the current score is higher
    if score > best_score:
        best_chunk = chunk
        best_score = score

# ========= Augment prompt ==========================================================
# Prepare the prompt for the chat completion by including the best matching chunk and the user's query
prompt = "Answer the question using the following information delimited by triple backticks:\n\n"
prompt += f"```\n{best_chunk}\n```"
prompt += "\nQuestion: " + user_query
prompt += "\nDo not mention that your answer is based on the provided information."


# ========================= Response ================================================
# Generate a response using the Gemini API with the prepared prompt and system context
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    system_instruction=system_prompt,
)

messages = [
    {'role': 'user', 'parts': [prompt]},
]
generated_content = model.generate_content(messages)
response_text = generated_content.candidates[0].content.parts[0].text.strip()

print(f"Assistant: {response_text}")
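Since an answer can span chunk boundaries, retrieving only the single best chunk may drop useful context. A hypothetical top-k variant (retrieve_top_k and top_k are illustrative names, not part of this PR) that could feed several chunks into the prompt instead:

def retrieve_top_k(embeddings, query_embedding, top_k=3):
    # Rank all chunks by dot-product similarity and keep the best top_k
    scored = [(np.dot(embedding, query_embedding), chunk)
              for chunk, embedding in embeddings.items()]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in scored[:top_k]]

# Usage: join the top chunks into the context passed to the prompt
# context = "\n\n".join(retrieve_top_k(embeddings, query_embedding))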