Skip to content

Commit

Permalink
Updated OpenAI embeddings.
Browse files Browse the repository at this point in the history
  • Loading branch information
Amruth-Vamshi authored Nov 7, 2023
1 parent 167e0c6 commit 4684dd8
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 43 deletions.
53 changes: 14 additions & 39 deletions src/embeddings/openai/remote/model.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,27 @@
import os
import openai
from openai.embeddings_utils import get_embedding
from cache import AsyncTTL
from request import ModelRequest
import numpy as np
import pandas as pd
import tiktoken
import ast
from sklearn.metrics.pairwise import cosine_similarity

openai.api_key = os.getenv("OPENAI_API_KEY")
from openai import OpenAI


class Model:
# NOTE(review): this span looks like a commit diff rendered without +/- markers
# or indentation, so lines from before and after the change appear interleaved.
# Confirm against the repository before treating it as one runnable class.
embedding_df = None
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191

# Stores `context` on the class and builds the shared instance only when the
# class has no `instance` attribute yet; every call returns `cls.instance`.
def __new__(cls, context):
cls.context = context
if not hasattr(cls, 'instance'):
# Load the CSV of stored embeddings and turn each 'embedding' cell from its
# string form back into a Python literal.
cls.embedding_df = pd.read_csv('src/embeddings/openai/remote/akai.csv')
cls.embedding_df['embedding'] = cls.embedding_df['embedding'].apply(ast.literal_eval)
# Build an OpenAI client using the OPENAI_API_KEY environment variable.
# NOTE(review): `cls.instance` is bound to the client here and then rebound to
# the Model object two statements later, and `client` is only a local name of
# __new__ -- verify how later code is expected to reach the client.
cls.instance = client = OpenAI(
api_key=os.getenv("OPENAI_API_KEY"),
)
cls.instance = super(Model, cls).__new__(cls)
return cls.instance

# Async entry point taking a ModelRequest; wrapped by the AsyncTTL decorator
# with time_to_live=600000 and maxsize=1024.
@AsyncTTL(time_to_live=600000, maxsize=1024)
async def inference(self, request: ModelRequest):
print("request.prompt", request.prompt)
# Embed the incoming prompt with the configured embedding model.
new_prompt_embedding = get_embedding(request.prompt, engine=self.embedding_model)
# Cosine similarity of the prompt embedding against every stored embedding.
similarity_scores = cosine_similarity(
[new_prompt_embedding], np.stack(self.embedding_df['embedding'], axis=0))[0]
# Row positions ordered from highest to lowest similarity.
most_similar_indices = np.argsort(similarity_scores)[::-1]
most_similar_prompts = self.embedding_df.loc[most_similar_indices, ['combined_prompt', 'combined_content']]
# Scores sorted descending, attached as a column next to the reordered rows.
most_similar_prompts['similarity_score'] = np.sort(similarity_scores)[::-1]
# Keep the first 20 rows, then only those whose score is within
# request.similarity_score_range of the best score.
similar_content = most_similar_prompts.iloc[0:20]
sim_cutoff_range = np.max(similar_content['similarity_score']) - request.similarity_score_range
similar_content_df = similar_content.loc[similar_content['similarity_score'] >= sim_cutoff_range, :]
# Drop the score column and return the remaining rows as a list of dicts.
similar_content_df1 = similar_content_df.drop(columns='similarity_score')
similar_content_dict = similar_content_df1.to_dict('records')
# modified_content_dict = remove_content_tags_from_dic(similar_content_dict)
print("similar_content_dict", similar_content_dict)
return (similar_content_dict)
# NOTE(review): the statements after the return above presumably belong to the
# other side of the diff (a different body for `inference`) -- confirm.
# Modify this function according to model requirements so that its inputs and outputs remain the same
query = request.query

# Adds an "n_tokens" column (token count of each combined_prompt under the
# configured tiktoken encoding) and an "embedding" column (one get_embedding
# call per combined_prompt) to the passed DataFrame, then returns it.
async def create_embeddings(self, embedding_df):
encoding = tiktoken.get_encoding(self.embedding_encoding)
embedding_df["n_tokens"] = embedding_df.combined_prompt.apply(lambda x: len(encoding.encode(x)))
embedding_df["embedding"] = embedding_df.combined_prompt.apply(
lambda x: get_embedding(x, engine=self.embedding_model))
return embedding_df
# NOTE(review): the lines below use `query`, so they presumably continue the
# `query = request.query` body rather than create_embeddings -- confirm.
# When a query is present, request an embedding for it and return the vector
# of the first result; otherwise fall through to the "Invalid input" string.
# NOTE(review): `client` is only bound inside __new__ in the visible code, so
# this call looks like it would raise NameError -- verify. Also, `is not None`
# is the idiomatic form of the `!= None` comparison.
if(query != None):
embedding = client.embeddings.create(
input=query,
model=self.embedding_model,
).data[0].embedding
return embedding

return "Invalid input"
11 changes: 7 additions & 4 deletions src/embeddings/openai/remote/request.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import json
import pandas as pd


class ModelRequest():
# NOTE(review): this span looks like a commit diff rendered without +/- markers
# or indentation; two __init__ signatures and a repeated json.dumps continuation
# line appear back to back. Confirm which lines exist in the actual file.
# First visible constructor: stores a prompt and a similarity score range
# (default 0) on the instance.
def __init__(self, prompt, similarity_score_range=0):
self.prompt = prompt
self.similarity_score_range = similarity_score_range
# Second visible constructor: stores query, query_type and df on the instance.
# NOTE(review): the default `pd.DataFrame()` is evaluated once when the function
# is defined, so every call that omits `df` receives the same DataFrame object;
# a `None` default replaced inside the body would avoid sharing it.
def __init__(self, query=None, df = pd.DataFrame(), query_type = None):
# NOTE(review): no URL attribute is assigned in the visible code, so it is
# unclear what the comment on the next line refers to -- confirm or remove.
# Url to download csv file
self.query = query # String
self.query_type = query_type
self.df = df

# Serializes the instance to a JSON string with sorted keys and 4-space
# indentation; objects json cannot encode natively are replaced by their
# __dict__ via the `default` hook.
def to_json(self):
return json.dumps(self, default=lambda o: o.__dict__,
sort_keys=True, indent=4)
sort_keys=True, indent=4)

0 comments on commit 4684dd8

Please sign in to comment.