37 lines
1.2 KiB
Python
37 lines
1.2 KiB
Python
import numpy as np
|
|
from openai import OpenAI
|
|
from dotenv import load_dotenv
|
|
from typing import List
|
|
|
|
|
|
class EmbeddingFunctions:
|
|
def __init__(self):
|
|
load_dotenv()
|
|
self.client = OpenAI()
|
|
|
|
def create_embeddings_from_sentences(
|
|
self, sentences: List[str], chunk_size: int = 2000
|
|
) -> List[np.ndarray]:
|
|
file_embeddings = []
|
|
for chunk_index in range(0, len(sentences), chunk_size):
|
|
chunk_embeddings = self.client.embeddings.create(
|
|
model="text-embedding-3-small",
|
|
input=sentences[chunk_index : chunk_index + chunk_size],
|
|
)
|
|
chunk_array = np.array(
|
|
[x.embedding for x in chunk_embeddings.data], dtype=np.float16
|
|
)
|
|
file_embeddings.append(
|
|
chunk_array / np.linalg.norm(chunk_array, axis=1)[:, np.newaxis]
|
|
)
|
|
|
|
return np.vstack(file_embeddings)
|
|
|
|
def create_embedding_from_sentence(self, sentence: list) -> np.ndarray:
|
|
query_embedding = self.client.embeddings.create(
|
|
model="text-embedding-3-small", input=sentence
|
|
)
|
|
return np.array(query_embedding.data[0].embedding, dtype=np.float16).reshape(
|
|
1, -1
|
|
)
|