Initial commit: intelaide backend and frontend
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
0
doclink/app/__init__.py
Normal file
0
doclink/app/api/__init__.py
Normal file
474
doclink/app/api/core.py
Normal file
@@ -0,0 +1,474 @@
from typing import List
import numpy as np
import bcrypt
import re
import base64
import os
from dotenv import load_dotenv
from cryptography.hazmat.primitives.ciphers.aead import AESGCM

from ..functions.reading_functions import ReadingFunctions
from ..functions.embedding_functions import EmbeddingFunctions
from ..functions.indexing_functions import IndexingFunctions
from ..functions.chatbot_functions import ChatbotFunctions
from ..functions.scraping_functions import Webscraper
from ..functions.export_functions import Exporter


class Authenticator:
    def __init__(self):
        pass

    def verify_password(self, plain_password: str, hashed_password: str) -> bool:
        return bcrypt.checkpw(
            plain_password.encode("utf-8"), hashed_password.encode("utf-8")
        )

    def hash_password(self, password: str) -> str:
        salt = bcrypt.gensalt()
        return bcrypt.hashpw(password.encode("utf-8"), salt).decode("utf-8")


class Encryptor:
    def __init__(self):
        load_dotenv()
        self.key = os.getenv("ENCRYPTION_KEY")
        self.email_auth = "EMAIL_AUTH_DATA_2025"
        self.email_nonce = self.email_auth.encode("utf-8")[:12].ljust(12, b"\0")
        self._key_bytes = base64.b64decode(self.key)
        self.aesgcm = AESGCM(self._key_bytes)

    def encrypt(self, text: str, auth_data) -> str:
        try:
            nonce = os.urandom(12)
            encrypted_data = self.aesgcm.encrypt(
                nonce, text.encode("utf-8"), auth_data.encode("utf-8")
            )
            combined_encrypt = nonce + encrypted_data
            encrypted_sentence = base64.b64encode(combined_encrypt).decode("utf-8")
            return encrypted_sentence
        except Exception as e:
            raise e

    def decrypt(self, encrypted_data: str, auth_data) -> str:
        try:
            decoded_text = base64.b64decode(encrypted_data.encode("utf-8"))
            nonce = decoded_text[:12]
            encrypted_text = decoded_text[12:]
            decrypted_data = self.aesgcm.decrypt(
                nonce, encrypted_text, auth_data.encode("utf-8")
            )
            return decrypted_data.decode("utf-8")
        except Exception as e:
            raise e


class Processor:
    def __init__(
        self,
    ):
        self.ef = EmbeddingFunctions()
        self.rf = ReadingFunctions()
        self.indf = IndexingFunctions()
        self.cf = ChatbotFunctions()
        self.en = Encryptor()
        self.ws = Webscraper()
        self.ex = Exporter()

    def create_index(self, embeddings: np.ndarray, index_type: str = "flat"):
        if index_type == "flat":
            index = self.indf.create_flat_index(embeddings=embeddings)
        return index

    def filter_search(
        self, domain_content: dict, domain_embeddings: np.ndarray, file_ids: list
    ):
        filtered_indexes = []
        filtered_content = []

        for i, content in enumerate(domain_content):
            if content[4] in file_ids:
                filtered_indexes.append(i)
                filtered_content.append(content)

        filtered_embeddings = domain_embeddings[filtered_indexes]

        index = self.create_index(embeddings=filtered_embeddings)
        boost_info = self.extract_boost_info(
            domain_content=filtered_content, embeddings=filtered_embeddings
        )

        try:
            index_header = self.create_index(embeddings=boost_info["header_embeddings"])
        except IndexError:
            index_header = None

        return index, filtered_content, boost_info, index_header

    def search_index(
        self,
        user_query: str,
        domain_content: dict,
        boost_info: dict,
        index,
        index_header,
    ):
        file_lang = self.file_lang_detection(domain_content=domain_content)
        queries, lang = self.query_preprocessing(
            user_query=user_query, file_lang=file_lang
        )
        if not queries:
            if lang == "tr":
                return (
                    "Sorunu anlayamadım",
                    None,
                    None,
                )
            else:
                return (
                    f"I didn't understand {user_query}",
                    None,
                    None,
                )

        query_embeddings = self.ef.create_embeddings_from_sentences(
            sentences=queries[:-1]
        )

        boost_array = self._create_boost_array(
            header_indexes=boost_info["header_indexes"],
            sentence_amount=index.ntotal,
            query_vector=query_embeddings[0],
            index_header=index_header,
        )

        # Get search distances with occurrences
        dict_resource = {}
        for i, query_embedding in enumerate(query_embeddings):
            D, I = index.search(query_embedding.reshape(1, -1), len(domain_content))  # noqa: E741
            if i == 0:
                convergence_vector, distance_vector = I[0], D[0]
            for i, match_index in enumerate(I[0]):
                if match_index in dict_resource:
                    dict_resource[match_index].append(D[0][i])
                else:
                    dict_resource[match_index] = [D[0][i]]

        file_boost_array = self._create_file_boost_array(
            domain_content=domain_content,
            distance_vector=distance_vector,
            convergence_vector=convergence_vector,
        )

        # Combine boost arrays
        combined_boost_array = 0.25 * file_boost_array + 0.75 * boost_array

        # Get average occurrences
        dict_resource = self._avg_resources(dict_resource)

        for key in dict_resource:
            dict_resource[key] *= combined_boost_array[key]

        sorted_dict = dict(
            sorted(dict_resource.items(), key=lambda item: item[1], reverse=True)
        )

        filtered_indexes = [
            sentence_index
            for sentence_index in sorted_dict.keys()
            if sorted_dict[sentence_index] >= 0.35
        ]
        sorted_sentence_indexes = filtered_indexes[:10]

        # Early return with message
        if not sorted_sentence_indexes:
            if lang == "tr":
                return (
                    "Seçtiğin dokümanlarda bu sorunun cevabını bulamadım",
                    None,
                    None,
                )
            else:
                return (
                    "I couldn't find the answer to the question within the selected files",
                    None,
                    None,
                )

        # Sentences to context creation
        context, context_windows, resources = self.context_creator(
            sentence_index_list=sorted_sentence_indexes,
            domain_content=domain_content,
            header_indexes=boost_info["header_indexes"],
            table_indexes=boost_info["table_indexes"],
        )

        answer = self.cf.response_generation(
            query=user_query, context=context, intention=queries[-1]
        )

        return answer, resources, context_windows

    def query_preprocessing(self, user_query, file_lang):
        generated_queries, lang = self.cf.query_generation(
            query=user_query, file_lang=file_lang
        )
        splitted_queries = generated_queries.split("\n")

        if len(splitted_queries) > 1:
            return splitted_queries, lang
        return None, lang

    def _create_boost_array(
        self,
        header_indexes: list,
        sentence_amount: int,
        query_vector: np.ndarray,
        index_header,
    ):
        boost_array = np.ones(sentence_amount)

        if not index_header:
            return boost_array

        D, I = index_header.search(query_vector.reshape(1, -1), 10)  # noqa: E741
        filtered_header_indexes = [
            header_index
            for index, header_index in enumerate(I[0])
            if D[0][index] > 0.30
        ]

        if not filtered_header_indexes:
            return boost_array
        else:
            for i, filtered_index in enumerate(filtered_header_indexes):
                try:
                    start = header_indexes[filtered_index] + 1
                    end = header_indexes[filtered_index + 1]
                    if i > 2:
                        boost_array[start:end] *= 1.1
                    elif i > 0:
                        boost_array[start:end] *= 1.2
                    else:
                        boost_array[start:end] *= 1.3
                except IndexError as e:
                    print(f"List is out of range {e}")
                    continue
            return boost_array

    # File boost function
    def _create_file_boost_array(
        self,
        domain_content: list,
        distance_vector: np.ndarray,
        convergence_vector: np.ndarray,
    ):
        boost_array = np.ones(len(domain_content))
        sort_order = np.argsort(convergence_vector)
        sorted_scores = distance_vector[sort_order]
        file_counts = {}

        if not domain_content:
            return boost_array
        else:
            for _, _, _, _, _, filename in domain_content:
                file_counts[filename] = file_counts.get(filename, 0) + 1

            file_sentence_counts = np.cumsum([0] + list(file_counts.values()))

            for i in range(len(file_sentence_counts) - 1):
                start, end = file_sentence_counts[i], file_sentence_counts[i + 1]
                if np.mean(sorted_scores[start:end]) > 0.30:
                    boost_array[start:end] *= 1.1

            return boost_array

    def context_creator(
        self,
        sentence_index_list: list,
        domain_content: List[tuple],
        header_indexes: list,
        table_indexes: list,
    ):
        context = ""
        context_windows = []
        widened_indexes = []
        original_matches = set(sentence_index_list)

        for i, sentence_index in enumerate(sentence_index_list):
            window_size = 4 if i < 3 else 2
            start = max(0, sentence_index - window_size)
            end = min(len(domain_content) - 1, sentence_index + window_size)

            if table_indexes:
                for table_index in table_indexes:
                    if sentence_index == table_index:
                        widened_indexes.append((table_index, table_index))
                        table_indexes.remove(table_index)
                        break

            if not header_indexes:
                widened_indexes.append((start, end))
            else:
                for i, current_header in enumerate(header_indexes):
                    if sentence_index == current_header:
                        start = max(0, sentence_index)
                        if (
                            i + 1 < len(header_indexes)
                            and abs(sentence_index - header_indexes[i + 1]) <= 20
                        ):
                            end = min(
                                len(domain_content) - 1, header_indexes[i + 1] - 1
                            )
                        else:
                            end = min(
                                len(domain_content) - 1, sentence_index + window_size
                            )
                        break
                    elif (
                        i + 1 < len(header_indexes)
                        and current_header < sentence_index < header_indexes[i + 1]
                    ):
                        start = (
                            current_header
                            if abs(sentence_index - current_header) <= 20
                            else max(0, sentence_index - window_size)
                        )
                        end = (
                            header_indexes[i + 1] - 1
                            if abs(header_indexes[i + 1] - sentence_index) <= 20
                            else min(
                                len(domain_content) - 1, sentence_index + window_size
                            )
                        )
                        break
                    elif (
                        i == len(header_indexes) - 1
                        and current_header >= sentence_index
                    ):
                        start = (
                            max(0, sentence_index)
                            if abs(current_header - sentence_index) <= 20
                            else max(0, sentence_index - window_size)
                        )
                        end = min(len(domain_content) - 1, sentence_index + window_size)
                        break
                if (start, end) not in widened_indexes:
                    widened_indexes.append((start, end))

        merged_tuples = self.merge_tuples(widen_sentences=widened_indexes)

        used_indexes = [
            min(index for index in sentence_index_list if tuple[0] <= index <= tuple[1])
            for tuple in merged_tuples
        ]
        resources = self._extract_resources(
            sentence_indexes=used_indexes, domain_content=domain_content
        )

        for i, tuple in enumerate(merged_tuples):
            if tuple[0] == tuple[1]:
                widened_sentence = " ".join(
                    self.en.decrypt(
                        domain_content[tuple[0]][0], domain_content[tuple[0]][4]
                    )
                )
                context += f"Context{i + 1}: File:{resources['file_names'][i]}, Confidence:{(len(sentence_index_list) - i) / len(sentence_index_list)}, Table\n{widened_sentence}\n"
                context_windows.append(widened_sentence)
            else:
                highlighted_sentences = []

                for index in range(tuple[0], tuple[1] + 1):
                    sentence_text = self.en.decrypt(
                        domain_content[index][0], domain_content[index][4]
                    )

                    # Highlight original matches
                    if index in original_matches:
                        highlighted_sentences.append(f"<mark>{sentence_text}</mark>")
                    else:
                        highlighted_sentences.append(sentence_text)

                widened_sentence = " ".join(highlighted_sentences)
                context += f"Context{i + 1}: File:{resources['file_names'][i]}, Confidence:{(len(sentence_index_list) - i) / len(sentence_index_list)}, {widened_sentence}\n\n"
                context_windows.append(widened_sentence)

        return context, context_windows, resources

    def _avg_resources(self, resources_dict):
        for key, value in resources_dict.items():
            value_mean = sum(value) / len(value)
            value_coefficient = value_mean + len(value) * 0.0025
            resources_dict[key] = value_coefficient
        return resources_dict

    def _extract_resources(self, sentence_indexes: list, domain_content: List[tuple]):
        resources = {"file_names": [], "page_numbers": []}
        for index in sentence_indexes:
            resources["file_names"].append(domain_content[index][5])
            resources["page_numbers"].append(domain_content[index][3])
        return resources

    def _create_dynamic_context(self, sentences):
        context = ""
        for i, sentence in enumerate(sentences):
            context += f"{i + 1}: {sentence}\n"
        return context

    def extract_boost_info(self, domain_content: List[tuple], embeddings: np.ndarray):
        boost_info = {
            "header_indexes": [],
            "headers": [],
            "header_embeddings": [],
            "table_indexes": [],
        }
        for index in range(len(domain_content)):
            if domain_content[index][1]:
                boost_info["header_indexes"].append(index)
                boost_info["headers"].append(domain_content[index][0])

            if domain_content[index][2]:
                boost_info["table_indexes"].append(index)
        boost_info["header_embeddings"] = embeddings[boost_info["header_indexes"]]
        return boost_info

    def merge_tuples(self, widen_sentences):
        sorted_dict = {0: widen_sentences[0]}

        for sentence_tuple in widen_sentences[1:]:
            tuple_range = range(sentence_tuple[0], sentence_tuple[1])
            is_in = 0
            for index, value in sorted_dict.items():
                current_range = range(value[0], value[1])
                if set(tuple_range) & set(current_range):
                    interval = (
                        min(sorted_dict[index][0], sentence_tuple[0]),
                        max(sorted_dict[index][1], sentence_tuple[1]),
                    )
                    sorted_dict[index] = interval
                    is_in = 1

            if not is_in:
                sorted_dict[index + 1] = sentence_tuple

        return list(dict.fromkeys(sorted_dict.values()))

    def file_lang_detection(self, domain_content: List[tuple]):
        file_lang = {}
        detected_sentence_amount = (
            25 if len(domain_content) > 25 else len(domain_content)
        )

        for i in range(0, detected_sentence_amount):
            decrypted_content = self.en.decrypt(
                domain_content[i][0], domain_content[i][4]
            )
            if re.match(r"\b[a-zA-Z]{" + str(4) + r",}\b", decrypted_content) or (
                decrypted_content[0] == "|" and decrypted_content[-1] == "|"
            ):
                lang = self.cf.detect_language(decrypted_content)
                file_lang[lang] = file_lang.get(lang, 0) + 1
        try:
            return max(file_lang, key=file_lang.get)
        except ValueError:
            return "en"
846
doclink/app/api/endpoints.py
Normal file
@@ -0,0 +1,846 @@
from fastapi import APIRouter, UploadFile, HTTPException, Request, Query, File, Form
from fastapi.responses import JSONResponse, StreamingResponse
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from datetime import datetime

import os
import logging
import uuid
import base64
import psycopg2
import io
import hmac
import hashlib

from .core import Processor
from .core import Authenticator
from .core import Encryptor
from ..db.database import Database
from ..redis_manager import RedisManager, RedisConnectionError

# services
router = APIRouter()
processor = Processor()
authenticator = Authenticator()
redis_manager = RedisManager()
encryptor = Encryptor()

# logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# environment variables
GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
GOOGLE_CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
GOOGLE_REDIRECT_URI = os.getenv("GOOGLE_REDIRECT_URI")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


# request functions
@router.post("/db/get_user_info")
async def get_user_info(request: Request):
    try:
        data = await request.json()
        user_id = data.get("user_id")
        with Database() as db:
            user_info, domain_info = db.get_user_info_w_id(user_id)

        return JSONResponse(
            content={
                "user_info": user_info,
                "domain_info": domain_info,
            },
            status_code=200,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/db/rename_domain")
async def rename_domain(request: Request):
    try:
        data = await request.json()
        selected_domain_id = data.get("domain_id")
        new_name = data.get("new_name")
        with Database() as db:
            success = db.rename_domain(domain_id=selected_domain_id, new_name=new_name)

        if not success:
            return JSONResponse(
                content={"message": "error while renaming domain"},
                status_code=400,
            )

        return JSONResponse(
            content={"message": "success"},
            status_code=200,
        )
    except Exception as e:
        logger.error(f"Error renaming domain: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/db/create_domain")
async def create_domain(
    request: Request,
    userID: str = Query(...),
):
    try:
        data = await request.json()
        domain_name = data.get("domain_name")
        domain_id = str(uuid.uuid4())
        with Database() as db:
            result = db.create_domain(
                user_id=userID,
                domain_id=domain_id,
                domain_name=domain_name,
                domain_type=1,
            )

        if not result["success"]:
            return JSONResponse(
                content={"message": result["message"]},
                status_code=400,
            )

        return JSONResponse(
            content={"message": "success", "domain_id": domain_id},
            status_code=200,
        )
    except Exception as e:
        logger.error(f"Error creating domain: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/db/delete_domain")
async def delete_domain(request: Request):
    try:
        data = await request.json()
        domain_id = data.get("domain_id")
        with Database() as db:
            success = db.delete_domain(domain_id=domain_id)

            if success < 0:
                return JSONResponse(
                    content={
                        "message": "This is your default domain. You cannot delete it completely, instead you can delete the unnecessary files inside!"
                    },
                    status_code=400,
                )
            elif success == 0:
                return JSONResponse(
                    content={
                        "message": "Error while deleting domain. Please report this to us, using feedback on the bottom left."
                    },
                    status_code=400,
                )

            db.conn.commit()

        return JSONResponse(
            content={"message": "success"},
            status_code=200,
        )
    except Exception as e:
        logger.error(f"Error while deleting domain: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/db/insert_feedback")
async def insert_feedback(
    userID: str = Query(...),
    feedback_type: str = Form(...),
    feedback_description: str = Form(...),
    feedback_screenshot: UploadFile = File(None),
):
    try:
        feedback_id = str(uuid.uuid4())
        screenshot_data = None

        if feedback_screenshot:
            contents = await feedback_screenshot.read()
            if len(contents) > 2 * 1024 * 1024:  # 2MB limit
                raise HTTPException(
                    status_code=400, detail="Screenshot size should be less than 2MB"
                )
            screenshot_data = base64.b64encode(contents).decode("utf-8")

        with Database() as db:
            db.insert_user_feedback(
                feedback_id=feedback_id,
                user_id=userID,
                feedback_type=feedback_type,
                description=feedback_description[:5000],
                screenshot=screenshot_data,
            )
            db.conn.commit()

        return JSONResponse(
            content={"message": "Thanks for the feedback!"}, status_code=200
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/db/insert_rating")
async def insert_rating(
    userID: str = Query(...),
    rating: int = Form(...),
    user_note: str = Form(""),
):
    try:
        rating_id = str(uuid.uuid4())
        with Database() as db:
            db.insert_user_rating(
                rating_id=rating_id,
                user_id=userID,
                rating=rating,
                user_note=user_note if user_note else None,
            )
            db.conn.commit()

        return JSONResponse(
            content={"message": "Thank you for the rating!"}, status_code=200
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/qa/select_domain")
async def select_domain(
    request: Request,
    userID: str = Query(...),
):
    try:
        data = await request.json()
        selected_domain_id = data.get("domain_id")
        _, _, success = update_selected_domain(
            user_id=userID, domain_id=selected_domain_id
        )

        if not success:
            return JSONResponse(
                content={"message": "error while updating selected domain"},
                status_code=400,
            )

        redis_manager.refresh_user_ttl(userID)
        return JSONResponse(
            content={"message": "success"},
            status_code=200,
        )
    except RedisConnectionError as e:
        logger.error(f"Redis connection error: {str(e)}")
        raise HTTPException(status_code=503, detail="Service temporarily unavailable")
    except Exception as e:
        logger.error(f"Error in select_domain: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/qa/generate_answer")
async def generate_answer(
    request: Request,
    userID: str = Query(...),
    sessionID: str = Query(...),
):
    try:
        data = await request.json()
        user_message = data.get("user_message")
        file_ids = data.get("file_ids")

        # Check if domain is selected
        selected_domain_id = redis_manager.get_data(f"user:{userID}:selected_domain")
        if not selected_domain_id:
            return JSONResponse(
                content={"message": "Please select a domain first..."},
                status_code=400,
            )

        if not file_ids:
            return JSONResponse(
                content={"message": "You didn't select any files..."},
                status_code=400,
            )

        with Database() as db:
            update_result = db.upsert_session_info(user_id=userID, session_id=sessionID)

        if not update_result["success"]:
            return JSONResponse(
                content={"message": update_result["message"]},
                status_code=400,
            )

        # Get required data from Redis
        index, filtered_content, boost_info, index_header = processor.filter_search(
            domain_content=redis_manager.get_data(f"user:{userID}:domain_content"),
            domain_embeddings=redis_manager.get_data(
                f"user:{userID}:domain_embeddings"
            ),
            file_ids=file_ids,
        )

        if not index or not filtered_content:
            return JSONResponse(
                content={"message": "Nothing in here..."},
                status_code=400,
            )

        # Process search
        answer, resources, resource_sentences = processor.search_index(
            user_query=user_message,
            domain_content=filtered_content,
            boost_info=boost_info,
            index=index,
            index_header=index_header,
        )

        if not resources or not resource_sentences:
            return JSONResponse(
                content={
                    "message": answer,
                    "daily_count": update_result["daily_count"],
                },
                status_code=200,
            )

        redis_manager.refresh_user_ttl(userID)

        return JSONResponse(
            content={
                "answer": answer,
                "resources": resources,
                "resource_sentences": resource_sentences,
                "question_count": update_result["question_count"],
                "daily_count": update_result["daily_count"],
            },
            status_code=200,
        )

    except RedisConnectionError as e:
        logger.error(f"Redis connection error: {str(e)}")
        raise HTTPException(status_code=503, detail="Service temporarily unavailable")
    except Exception as e:
        logger.error(f"Error in generate_answer: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/io/store_file")
async def store_file(
    userID: str = Query(...),
    file: UploadFile = File(...),
    lastModified: str = Form(...),
):
    try:
        file_bytes = await file.read()
        if not file_bytes:
            return JSONResponse(
                content={
                    "message": f"Empty file {file.filename}. If you think not, please report this to us!"
                },
                status_code=400,
            )

        file_data = processor.rf.read_file(
            file_bytes=file_bytes, file_name=file.filename
        )

        if not file_data["sentences"]:
            return JSONResponse(
                content={
                    "message": f"No content to extract in {file.filename}. If there is please report this to us!"
                },
                status_code=400,
            )

        # Create embeddings
        file_embeddings = processor.ef.create_embeddings_from_sentences(
            sentences=file_data["sentences"]
        )

        # Store in Redis
        redis_key = f"user:{userID}:upload:{file.filename}"
        upload_data = {
            "file_name": file.filename,
            "last_modified": datetime.fromtimestamp(int(lastModified) / 1000).strftime(
                "%Y-%m-%d"
            )[:20],
            "sentences": file_data["sentences"],
            "page_numbers": file_data["page_number"],
            "is_headers": file_data["is_header"],
            "is_tables": file_data["is_table"],
            "embeddings": file_embeddings,
        }

        redis_manager.set_data(redis_key, upload_data, expiry=3600)

        return JSONResponse(
            content={"message": "success", "file_name": file.filename},
            status_code=200,
        )

    except Exception as e:
        logging.error(f"Error storing file {file.filename}: {str(e)}")
        return JSONResponse(
            content={"message": f"Error storing file: {str(e)}"}, status_code=500
        )


@router.post("/io/store_drive_file")
async def store_drive_file(
    userID: str = Query(...),
    lastModified: str = Form(...),
    driveFileId: str = Form(...),
    driveFileName: str = Form(...),
    accessToken: str = Form(...),
):
    try:
        credentials = Credentials(
            token=accessToken,
            client_id=GOOGLE_CLIENT_ID,
            client_secret=GOOGLE_CLIENT_SECRET,
            token_uri="https://oauth2.googleapis.com/token",
        )

        drive_service = build("drive", "v3", credentials=credentials)

        google_mime_types = {
            "application/vnd.google-apps.document": ("application/pdf", ".pdf"),
            "application/vnd.google-apps.spreadsheet": (
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                ".xlsx",
            ),
            "application/vnd.google-apps.presentation": (
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                ".pptx",
            ),
            "application/vnd.google-apps.script": ("text/plain", ".txt"),
        }

        file_metadata = (
            drive_service.files().get(fileId=driveFileId, fields="mimeType").execute()
        )
        mime_type = file_metadata["mimeType"]

        if mime_type in google_mime_types:
            export_mime_type, extension = google_mime_types[mime_type]
            request = drive_service.files().export_media(
                fileId=driveFileId, mimeType=export_mime_type
            )

            if not driveFileName.endswith(extension):
                driveFileName += extension
        else:
            request = drive_service.files().get_media(fileId=driveFileId)

        file_stream = io.BytesIO()
        downloader = MediaIoBaseDownload(file_stream, request)

        done = False
        while not done:
            _, done = downloader.next_chunk()

        file_stream.seek(0)
        file_bytes = file_stream.read()

        if not file_bytes:
            return JSONResponse(
                content={
                    "message": f"Empty file {driveFileName}. If you think not, please report this to us!"
                },
                status_code=400,
            )

        file_data = processor.rf.read_file(
            file_bytes=file_bytes, file_name=driveFileName
        )

        if not file_data["sentences"]:
            return JSONResponse(
                content={
                    "message": f"No content to extract in {driveFileName}. If there is please report this to us!"
                },
                status_code=400,
            )

        file_embeddings = processor.ef.create_embeddings_from_sentences(
            sentences=file_data["sentences"]
        )

        redis_key = f"user:{userID}:upload:{driveFileName}"
        upload_data = {
            "file_name": driveFileName,
            "last_modified": datetime.fromtimestamp(int(lastModified) / 1000).strftime(
                "%Y-%m-%d"
            )[:20],
            "sentences": file_data["sentences"],
            "page_numbers": file_data["page_number"],
            "is_headers": file_data["is_header"],
            "is_tables": file_data["is_table"],
            "embeddings": file_embeddings,
        }

        redis_manager.set_data(redis_key, upload_data, expiry=3600)

        return JSONResponse(
            content={"message": "success", "file_name": driveFileName}, status_code=200
        )

    except Exception as e:
        logging.error(f"Error storing Drive file {driveFileName}: {str(e)}")
        return JSONResponse(
            content={"message": f"Error storing file: {str(e)}"}, status_code=500
        )


@router.post("/io/store_url")
async def store_url(userID: str = Query(...), url: str = Form(...)):
    try:
        if not processor.ws.url_validator(url):
            return JSONResponse(
                content={"message": "Invalid URL. Please enter a valid URL."},
                status_code=400,
            )

        html = processor.ws.request_creator(url)
        if not html:
            return JSONResponse(
                content={"message": "Error fetching the URL. Please try again later."},
                status_code=400,
            )

        file_data = processor.rf.read_url(html_content=html)

        if not file_data["sentences"]:
            return JSONResponse(
                content={
                    "message": f"No content to extract in {url}. If there is please report this to us!"
                },
                status_code=400,
            )

        file_embeddings = processor.ef.create_embeddings_from_sentences(
            sentences=file_data["sentences"]
        )

        redis_key = f"user:{userID}:upload:{url}"
        upload_data = {
            "file_name": url,
            "last_modified": datetime.now().strftime("%Y-%m-%d"),
            "sentences": file_data["sentences"],
            "page_numbers": file_data["page_number"],
            "is_headers": file_data["is_header"],
            "is_tables": file_data["is_table"],
            "embeddings": file_embeddings,
        }

        redis_manager.set_data(redis_key, upload_data, expiry=3600)

        return JSONResponse(
            content={"message": "success", "file_name": url}, status_code=200
        )

    except Exception as e:
        logging.error(f"Error storing URL {url}: {str(e)}")
        return JSONResponse(
            content={"message": f"Error storing URL: {str(e)}"}, status_code=500
        )


@router.post("/io/upload_files")
async def upload_files(userID: str = Query(...)):
    try:
        # Get domain info
        selected_domain_id = redis_manager.get_data(f"user:{userID}:selected_domain")

        with Database() as db:
            domain_info = db.get_domain_info(
                user_id=userID, domain_id=selected_domain_id
            )

            if not domain_info:
                return JSONResponse(
                    content={"message": "Invalid domain selected"}, status_code=400
                )

            # Get all stored files from Redis
            stored_files = redis_manager.get_keys_by_pattern(f"user:{userID}:upload:*")
            if not stored_files:
                return JSONResponse(
                    content={"message": "No files to process"}, status_code=400
                )

            file_info_batch = []
            file_content_batch = []

            # Process stored files
            for redis_key in stored_files:
                upload_data = redis_manager.get_data(redis_key)
                if not upload_data:
                    continue

                file_id = str(uuid.uuid4())

                # Prepare batches
                file_info_batch.append(
                    (
                        userID,
                        file_id,
                        selected_domain_id,
                        upload_data["file_name"],
                        upload_data["last_modified"],
                    )
                )

                for i in range(len(upload_data["sentences"])):
                    file_content_batch.append(
                        (
                            file_id,
                            encryptor.encrypt(
                                text=upload_data["sentences"][i], auth_data=file_id
                            ),
                            upload_data["page_numbers"][i],
                            upload_data["is_headers"][i],
                            upload_data["is_tables"][i],
                            psycopg2.Binary(upload_data["embeddings"][i]),
                        )
                    )

                # Clean up Redis
                redis_manager.delete_data(redis_key)

            # Bulk insert with limit check
            result = db.insert_file_batches(file_info_batch, file_content_batch)
            if not result["success"]:
                return JSONResponse(
                    content={"message": result["message"]}, status_code=400
                )
            db.conn.commit()

        # Update domain info
        file_names, file_ids, success = update_selected_domain(
            user_id=userID, domain_id=selected_domain_id
        )
        if not success:
            return JSONResponse(
                content={
                    "message": "Files uploaded, but domain could not be updated",
                    "file_names": None,
                    "file_ids": None,
                },
                status_code=400,
            )

        return JSONResponse(
            content={
                "message": "success",
                "file_names": file_names,
                "file_ids": file_ids,
            },
            status_code=200,
        )

    except Exception as e:
        logging.error(f"Error processing uploads: {str(e)}")
        return JSONResponse(
            content={"message": f"Error processing uploads: {str(e)}"}, status_code=500
        )


@router.post("/db/remove_file_upload")
async def remove_file_upload(
    request: Request,
    userID: str = Query(...),
):
    try:
        data = await request.json()
        file_id = data.get("file_id")
        domain_id = data.get("domain_id")

        with Database() as db:
            success = db.clear_file_content(file_id=file_id)
            if not success:
                return JSONResponse(
                    content={
                        "message": "Error deleting files",
                    },
                    status_code=400,
                )
            db.conn.commit()

        _, _, success = update_selected_domain(user_id=userID, domain_id=domain_id)
        if not success:
            return JSONResponse(
                content={"message": "error"},
                status_code=200,
            )

        return JSONResponse(
            content={
                "message": "success",
            },
            status_code=200,
        )
    except KeyError:
        return JSONResponse(
            content={"message": "Please select the domain number first"},
            status_code=200,
        )
    except Exception as e:
        db.conn.rollback()
        logging.error(f"Error during file deletion: {str(e)}")
        raise HTTPException(
            status_code=500, detail=f"Failed deleting, error: {e}"
        )


@router.post("/io/export_response")
async def export_response(request: Request):
    try:
        data = await request.json()
        text = data.get("contents", [])

        if not text:
            raise ValueError("No content selected for export")

        formatted_text = "\n\n------------------\n\n".join(text)

        response = processor.ex.export_pdf(data=formatted_text)

        return StreamingResponse(
            io.BytesIO(response.getvalue()),
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=DoclinkExport.pdf",
                "Content-Type": "application/pdf",
                "Content-Length": str(len(response.getvalue())),
            },
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed. Error: {e}")


@router.post("/auth/logout")
async def logout(request: Request):
    try:
        data = await request.json()
        user_id = data.get("user_id")
        session_id = data.get("session_id")

        response = JSONResponse(content={"message": "Logged out successfully"})

        # Clear FastAPI session cookie
        response.delete_cookie(
            key="session_id",
            path="/",
            domain=None,  # This will use the current domain
            secure=True,
            httponly=True,
            samesite="lax",
        )

        # Delete user redis session
        redis_key = f"user:{user_id}:session:{session_id}"
        session_exists = redis_manager.client.exists(redis_key)
        if session_exists:
            redis_manager.client.delete(redis_key)

        return response
    except Exception as e:
        logging.error(f"Error during logout: {str(e)}")
        raise HTTPException(
            status_code=500, detail=f"Failed logout, error: {e}"
        )


@router.post("/webhooks/lemon-squeezy")
async def handle_webhook(request: Request):
    try:
        # Get the raw request body
        body = await request.body()
        payload = await request.json()

        # Get the signature from the header
        signature = request.headers.get("X-Signature")

        # Signature verification
        webhook_secret = os.getenv("LEMON_SQUEEZY_WEBHOOK_SECRET")
        expected_signature = hmac.new(
            webhook_secret.encode(), body, hashlib.sha256
        ).hexdigest()

        if not hmac.compare_digest(signature, expected_signature):
            raise HTTPException(status_code=401, detail="Invalid signature")

        event_name = payload.get("meta", {}).get("event_name")
        if not event_name == "order_created":
            return JSONResponse(
                status_code=400, content={"message": "Unexpected event received!"}
            )

        # Upgrade user to the premium limits
        data = payload.get("data", {}).get("attributes", {})
        customer_id = data.get("customer_id")
        customer_email = data.get("user_email")
        receipt_url = data.get("urls").get("receipt")

        with Database() as db:
            db.update_user_subscription(
                user_email=customer_email,
                lemon_squeezy_customer_id=customer_id,
                receipt_url=receipt_url,
            )
            db.conn.commit()
        return JSONResponse(status_code=200, content={"message": "Webhook received"})

    except Exception as e:
        logger.error(f"Webhook error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


# local functions
def update_selected_domain(user_id: str, domain_id: str):
    try:
        redis_manager.set_data(f"user:{user_id}:selected_domain", domain_id)

        with Database() as db:
            file_info = db.get_file_info_with_domain(user_id, domain_id)

            if not file_info:
                # Clear any existing domain data
                redis_manager.delete_data(f"user:{user_id}:domain_content")
                redis_manager.delete_data(f"user:{user_id}:index")
                redis_manager.delete_data(f"user:{user_id}:index_header")
                redis_manager.delete_data(f"user:{user_id}:boost_info")
                return None, None, 1

            content, embeddings = db.get_file_content(
                file_ids=[info["file_id"] for info in file_info]
            )

            if not content or not len(embeddings):
                # Clear any existing domain data
                redis_manager.delete_data(f"user:{user_id}:domain_content")
                redis_manager.delete_data(f"user:{user_id}:index")
                redis_manager.delete_data(f"user:{user_id}:index_header")
                redis_manager.delete_data(f"user:{user_id}:boost_info")
                return None, None, 0

            # Store domain content in Redis
            redis_manager.set_data(f"user:{user_id}:domain_content", content)
            redis_manager.set_data(f"user:{user_id}:domain_embeddings", embeddings)

            file_names = [info["file_name"] for info in file_info]
            file_ids = [info["file_id"] for info in file_info]

            return file_names, file_ids, 1

    except Exception as e:
        logger.error(f"Error in update_selected_domain: {str(e)}")
        raise RedisConnectionError(f"Failed to update domain: {str(e)}")
0
doclink/app/db/__init__.py
Normal file
18
doclink/app/db/config.py
Normal file
@@ -0,0 +1,18 @@
from configparser import ConfigParser


class GenerateConfig:
    def __init__(self) -> None:
        pass

    def config(filename="app/db/database.ini", section="postgresql"):
        parser = ConfigParser()
        parser.read(filename)
        db_config = {}
        if parser.has_section(section):
            params = parser.items(section)
            for param in params:
                db_config[param[0]] = param[1]
        else:
            raise Exception(f"Section {section} is not found in {filename} file.")
        return db_config
740
doclink/app/db/database.py
Normal file
@@ -0,0 +1,740 @@
|
||||
from psycopg2 import extras
|
||||
from psycopg2 import DatabaseError
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
import logging
|
||||
import numpy as np
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from .config import GenerateConfig
|
||||
from ..api.core import Encryptor
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
encryptor = Encryptor()
|
||||
|
||||
|
||||
class Database:
|
||||
_instance = None
|
||||
|
||||
def __new__(cls):
|
||||
if cls._instance is None:
|
||||
cls._instance = super(Database, cls).__new__(cls)
|
||||
cls._instance.db_config = GenerateConfig.config()
|
||||
return cls._instance
|
||||
|
||||
def __enter__(self):
|
||||
self.conn = psycopg2.connect(**self.db_config)
|
||||
self.cursor = self.conn.cursor()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
if self.cursor:
|
||||
self.cursor.close()
|
||||
if self.conn:
|
||||
if exc_type is None:
|
||||
self.conn.commit()
|
||||
else:
|
||||
self.conn.rollback()
|
||||
self.conn.close()
|
||||
|
||||
def initialize_tables(self):
|
||||
sql_path = Path(__file__).resolve().parent / "sql" / "table_initialize.sql"
|
||||
with sql_path.open("r") as file:
|
||||
query = file.read()
|
||||
try:
|
||||
self.cursor.execute(query)
|
||||
self.conn.commit()
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def reset_database(self):
|
||||
sql_path = Path(__file__).resolve().parent / "sql" / "database_reset.sql"
|
||||
with sql_path.open("r") as file:
|
||||
query = file.read()
|
||||
try:
|
||||
self.cursor.execute(query)
|
||||
self.conn.commit()
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def _bytes_to_embeddings(self, byte_array):
|
||||
return np.frombuffer(byte_array.tobytes(), dtype=np.float16).reshape(
|
||||
byte_array.shape[0], -1
|
||||
)
|
||||
|
||||
def get_user_info_w_id(self, user_id: str):
|
||||
query_get_user_info = """
|
||||
SELECT DISTINCT user_name, user_surname, user_email, user_type, user_created_at, picture_url
|
||||
FROM user_info
|
||||
WHERE user_id = %s
|
||||
"""
|
||||
query_get_domain_ids = """
|
||||
SELECT DISTINCT domain_id
|
||||
FROM domain_info
|
||||
WHERE user_id = %s
|
||||
"""
|
||||
query_get_domain_info = """
|
||||
SELECT t1.domain_name, t1.domain_id, t2.file_name, t2.file_id
|
||||
FROM domain_info t1
|
||||
LEFT JOIN file_info t2 ON t1.domain_id = t2.domain_id
|
||||
WHERE t1.domain_id IN %s
|
||||
"""
|
||||
query_get_daily_count = """
|
||||
SELECT sum(question_count)
|
||||
FROM session_info s
|
||||
WHERE s.user_id = %s
|
||||
AND s.created_at >= CURRENT_TIMESTAMP - INTERVAL '24 hours' AND s.created_at <= CURRENT_TIMESTAMP;
|
||||
"""
|
||||
|
||||
try:
|
||||
self.cursor.execute(query_get_user_info, (user_id,))
|
||||
user_info_data = self.cursor.fetchone()
|
||||
|
||||
self.cursor.execute(query_get_daily_count, (user_id,))
|
||||
user_daily_count = self.cursor.fetchone()
|
||||
|
||||
if not user_info_data:
|
||||
return None, None
|
||||
|
||||
user_info = {
|
||||
"user_name": user_info_data[0],
|
||||
"user_surname": user_info_data[1],
|
||||
"user_email": user_info_data[2],
|
||||
"user_type": user_info_data[3],
|
||||
"user_created_at": str(user_info_data[4]),
|
||||
"user_daily_count": user_daily_count[0] if user_daily_count[0] else 0,
|
||||
"user_picture_url": user_info_data[5],
|
||||
}
|
||||
|
||||
self.cursor.execute(query_get_domain_ids, (user_id,))
|
||||
domain_id_data = self.cursor.fetchall()
|
||||
|
||||
if not domain_id_data:
|
||||
return user_info, None
|
||||
|
||||
domain_ids = [data[0] for data in domain_id_data]
|
||||
self.cursor.execute(query_get_domain_info, (tuple(domain_ids),))
|
||||
domain_info_data = self.cursor.fetchall()
|
||||
domain_info = {}
|
||||
for data in domain_info_data:
|
||||
if data[1] not in domain_info.keys():
|
||||
domain_info[data[1]] = {
|
||||
"domain_name": data[0],
|
||||
"file_names": [data[2]] if data[2] else [],
|
||||
"file_ids": [data[3]] if data[3] else [],
|
||||
}
|
||||
else:
|
||||
domain_info[data[1]]["file_names"].append(data[2])
|
||||
domain_info[data[1]]["file_ids"].append(data[3])
|
||||
|
||||
return user_info, domain_info
|
||||
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def get_file_info_with_domain(self, user_id: str, domain_id: str):
|
||||
query_get_file_info = """
|
||||
SELECT DISTINCT file_id, file_name, file_modified_date, file_upload_date
|
||||
FROM file_info
|
||||
WHERE user_id = %s AND domain_id = %s
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(
|
||||
query_get_file_info,
|
||||
(
|
||||
user_id,
|
||||
domain_id,
|
||||
),
|
||||
)
|
||||
data = self.cursor.fetchall()
|
||||
return (
|
||||
[
|
||||
{
|
||||
"file_id": row[0],
|
||||
"file_name": row[1],
|
||||
"file_modified_date": row[2],
|
||||
"file_upload_date": row[3],
|
||||
}
|
||||
for row in data
|
||||
]
|
||||
if data
|
||||
else None
|
||||
)
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def get_domain_info(self, user_id: str, domain_id: int):
|
||||
query = """
|
||||
SELECT DISTINCT domain_name
|
||||
FROM domain_info
|
||||
WHERE user_id = %s AND domain_id = %s
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(
|
||||
query,
|
||||
(
|
||||
user_id,
|
||||
domain_id,
|
||||
),
|
||||
)
|
||||
data = self.cursor.fetchone()
|
||||
return {"domain_name": data[0]} if data else None
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def get_file_content(self, file_ids: list):
|
||||
query_get_content = """
|
||||
SELECT t1.sentence AS sentence, t1.is_header AS is_header, t1.is_table AS is_table, t1.page_number AS page_number, t1.file_id AS file_id, t2.file_name AS file_name
|
||||
FROM file_content t1
|
||||
LEFT JOIN file_info t2 ON t1.file_id = t2.file_id
|
||||
WHERE t1.file_id IN %s
|
||||
"""
|
||||
query_get_embeddings = """
|
||||
SELECT array_agg(embedding) AS embeddings
|
||||
FROM file_content
|
||||
WHERE file_id IN %s
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(query_get_content, (tuple(file_ids),))
|
||||
content = self.cursor.fetchall()
|
||||
self.cursor.execute(query_get_embeddings, (tuple(file_ids),))
|
||||
byte_embeddings = self.cursor.fetchone()
|
||||
if content and byte_embeddings and byte_embeddings[0]:
|
||||
embeddings = self._bytes_to_embeddings(np.array(byte_embeddings[0]))
|
||||
return content, embeddings
|
||||
else:
|
||||
return None, None
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
print(f"Database error occurred: {e}")
|
||||
return None, None
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred: {e}")
|
||||
return None, None
|
||||
|
||||
def get_session_info(self, session_id: str):
|
||||
query_get_session = """
|
||||
SELECT user_id, created_at
|
||||
FROM session_info
|
||||
WHERE session_id = %s
|
||||
"""
|
||||
self.cursor.execute(query_get_session, (session_id,))
|
||||
data = self.cursor.fetchone()
|
||||
return {"user_id": data[0], "created_at": data[1]} if data else None
|
||||
|
||||
def rename_domain(self, domain_id: str, new_name: str):
|
||||
query = """
|
||||
UPDATE domain_info
|
||||
SET domain_name = %s
|
||||
WHERE domain_id = %s
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(query, (new_name, domain_id))
|
||||
rows_affected = self.cursor.rowcount
|
||||
return rows_affected > 0
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def insert_user_guide(self, user_id: str, domain_id: str):
|
||||
"""
|
||||
Insert default user guide content into user's default domain
|
||||
using the file_id already present in default_content
|
||||
"""
|
||||
current_date = datetime.now().date()
|
||||
file_id = str(uuid.uuid4())
|
||||
|
||||
try:
|
||||
# Insert file info with the new file_id
|
||||
query_insert_file_info = """
|
||||
INSERT INTO file_info
|
||||
(user_id, domain_id, file_id, file_name, file_modified_date, file_upload_date)
|
||||
VALUES
|
||||
(%s, %s, %s, %s, %s, %s)
|
||||
"""
|
||||
|
||||
self.cursor.execute(
|
||||
query_insert_file_info,
|
||||
(
|
||||
user_id,
|
||||
domain_id,
|
||||
file_id,
|
||||
"User Guide.pdf",
|
||||
current_date,
|
||||
current_date,
|
||||
),
|
||||
)
|
||||
|
||||
query_get_guide_content = """
|
||||
SELECT sentence, is_header, is_table, page_number, embedding
|
||||
FROM default_content
|
||||
"""
|
||||
self.cursor.execute(query_get_guide_content)
|
||||
default_content = self.cursor.fetchall()
|
||||
|
||||
for row in default_content:
|
||||
sentence, is_header, is_table, page_number, embedding = row
|
||||
encrypted_sentence = encryptor.encrypt(sentence, file_id)
|
||||
|
||||
self.cursor.execute(
|
||||
"INSERT INTO file_content (file_id, sentence, is_header, is_table, page_number, embedding) VALUES (%s, %s, %s, %s, %s, %s)",
|
||||
(
|
||||
file_id,
|
||||
encrypted_sentence,
|
||||
is_header,
|
||||
is_table,
|
||||
page_number,
|
||||
embedding,
|
||||
),
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
logger.error(f"Error inserting user guide: {str(e)}")
|
||||
raise e
|
||||
except Exception as e:
|
||||
self.conn.rollback()
|
||||
logger.error(f"Unexpected error inserting user guide: {str(e)}")
|
||||
raise e
|
||||
|
||||
def delete_domain(self, domain_id: str):
|
||||
get_domain_type_query = """
|
||||
SELECT domain_type
|
||||
FROM domain_info
|
||||
WHERE domain_id = %s
|
||||
"""
|
||||
get_files_query = """
|
||||
SELECT file_id
|
||||
FROM file_info
|
||||
WHERE domain_id = %s
|
||||
"""
|
||||
|
||||
delete_content_query = """
|
||||
DELETE FROM file_content
|
||||
WHERE file_id IN %s
|
||||
"""
|
||||
|
||||
delete_files_query = """
|
||||
DELETE FROM file_info
|
||||
WHERE domain_id = %s
|
||||
"""
|
||||
|
||||
delete_domain_query = """
|
||||
DELETE FROM domain_info
|
||||
WHERE domain_id = %s
|
||||
"""
|
||||
|
||||
try:
|
||||
self.cursor.execute(get_domain_type_query, (domain_id,))
|
||||
domain_type = self.cursor.fetchone()
|
||||
if not domain_type[0]:
|
||||
return -1
|
||||
|
||||
self.cursor.execute(get_files_query, (domain_id,))
|
||||
file_data = self.cursor.fetchall()
|
||||
file_ids = [data[0] for data in file_data]
|
||||
|
||||
# content -> files -> domain
|
||||
if file_ids:
|
||||
self.cursor.execute(delete_content_query, (tuple(file_ids),))
|
||||
self.cursor.execute(delete_files_query, (domain_id,))
|
||||
self.cursor.execute(delete_domain_query, (domain_id,))
|
||||
|
||||
rows_affected = self.cursor.rowcount
|
||||
|
||||
return 1 if rows_affected else 0
|
||||
|
||||
except DatabaseError as e:
|
||||
# Rollback in case of error
|
||||
self.cursor.execute("ROLLBACK")
|
||||
logger.error(f"Error deleting domain {domain_id}: {str(e)}")
|
||||
raise e
|
||||
|
||||
def insert_user_feedback(
|
||||
self,
|
||||
feedback_id: str,
|
||||
user_id: str,
|
||||
feedback_type: str,
|
||||
description: str,
|
||||
screenshot: str = None,
|
||||
):
|
||||
query = """
|
||||
INSERT INTO user_feedback (feedback_id, user_id, feedback_type, description, screenshot)
|
||||
VALUES (%s, %s, %s, %s, %s)
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(
|
||||
query,
|
||||
(
|
||||
feedback_id,
|
||||
user_id,
|
||||
feedback_type,
|
||||
description,
|
||||
screenshot,
|
||||
),
|
||||
)
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def insert_domain_info(
|
||||
self, user_id: str, domain_id: str, domain_name: str, domain_type: int
|
||||
):
|
||||
query_insert_domain_info = """
|
||||
INSERT INTO domain_info (user_id, domain_id, domain_name, domain_type)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(
|
||||
query_insert_domain_info,
|
||||
(
|
||||
user_id,
|
||||
domain_id,
|
||||
domain_name,
|
||||
domain_type,
|
||||
),
|
||||
)
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def create_domain(
|
||||
self, user_id: str, domain_name: str, domain_id: str, domain_type: int
|
||||
):
|
||||
query_count_domains = """
|
||||
SELECT COUNT(*), user_type
|
||||
FROM domain_info d
|
||||
JOIN user_info u ON d.user_id = u.user_id
|
||||
WHERE u.user_id = %s
|
||||
GROUP BY user_type
|
||||
"""
|
||||
|
||||
try:
|
||||
self.cursor.execute(query_count_domains, (user_id,))
|
||||
result = self.cursor.fetchall()
|
||||
|
||||
domain_count, user_type = result[0][0], result[0][1]
|
||||
|
||||
if user_type == "free" and domain_count >= 3:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "Free users can only create up to 3 domains. Upgrade account to create more domains!",
|
||||
}
|
||||
|
||||
elif user_type == "premium" and domain_count >= 10:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "Premium users can only create up to 20 domains. Upgrade account to create more domains!",
|
||||
}
|
||||
|
||||
query_insert = """
|
||||
INSERT INTO domain_info (user_id, domain_id, domain_name, domain_type)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
RETURNING domain_id
|
||||
"""
|
||||
|
||||
self.cursor.execute(
|
||||
query_insert, (user_id, domain_id, domain_name, domain_type)
|
||||
)
|
||||
created_domain_id = self.cursor.fetchone()[0]
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"domain_id": created_domain_id,
|
||||
"message": "success",
|
||||
}
|
||||
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def get_user_total_file_count(self, user_id: str):
|
||||
user_type_query = """
|
||||
SELECT user_type
|
||||
FROM user_info
|
||||
WHERE user_id = %s
|
||||
"""
|
||||
|
||||
file_count_query = """
|
||||
SELECT COUNT(file_id)
|
||||
FROM file_info
|
||||
WHERE user_id = %s
|
||||
"""
|
||||
|
||||
try:
|
||||
# Get user type first
|
||||
self.cursor.execute(user_type_query, (user_id,))
|
||||
user_type_result = self.cursor.fetchone()
|
||||
|
||||
if not user_type_result:
|
||||
logger.error(f"User {user_id} not found in database")
|
||||
return False
|
||||
|
||||
user_type = user_type_result[0]
|
||||
|
||||
# Get file count
|
||||
self.cursor.execute(file_count_query, (user_id,))
|
||||
file_count_result = self.cursor.fetchone()
|
||||
file_count = file_count_result[0] if file_count_result else 0
|
||||
|
||||
return file_count, user_type
|
||||
except Exception as e:
|
||||
self.conn.rollback()
|
||||
logger.error(f"Error in user total file processing: {str(e)}")
|
||||
return False
|
||||
|
||||
def insert_file_batches(
|
||||
self, file_info_batch: list, file_content_batch: list
|
||||
) -> bool:
|
||||
"""Process both file info and content in a single transaction."""
|
||||
try:
|
||||
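# Every row in a batch belongs to the same user, so the first row supplies the user_id for the quota check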
user_id = file_info_batch[0][0]
|
||||
file_count, user_type = self.get_user_total_file_count(user_id)
|
||||
|
||||
if user_type == "free" and file_count + len(file_info_batch) > 10:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Free users can only have 10 total files. You currently have {file_count} files across all folders. Upgrade to add more!",
|
||||
}
|
||||
elif user_type == "premium" and file_count + len(file_info_batch) > 100:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Premium users can only have 100 total files. You currently have {file_count} files across all folders",
|
||||
}
|
||||
|
||||
self._insert_file_info_batch(file_info_batch)
|
||||
self._insert_file_content_batch(file_content_batch)
|
||||
|
||||
return {"success": True, "message": "Files uploaded successfully"}
|
||||
except Exception as e:
|
||||
self.conn.rollback()
|
||||
logger.error(f"Error in batch processing: {str(e)}")
|
||||
return False
|
||||
|
||||
def _insert_file_info_batch(self, file_info_batch: list):
|
||||
"""Internal method for file info insertion."""
|
||||
query = """
|
||||
INSERT INTO file_info (user_id, file_id, domain_id, file_name, file_modified_date)
|
||||
VALUES %s
|
||||
"""
|
||||
try:
|
||||
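# execute_values expands the "VALUES %s" placeholder into batched multi-row INSERTs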
extras.execute_values(self.cursor, query, file_info_batch)
|
||||
logger.info(
|
||||
f"Successfully inserted {len(file_info_batch)} file info records"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error while inserting file info: {str(e)}")
|
||||
raise
|
||||
|
||||
def _insert_file_content_batch(self, file_content_batch: list):
|
||||
"""Internal method for file content insertion."""
|
||||
query = """
|
||||
INSERT INTO file_content (file_id, sentence, page_number, is_header, is_table, embedding)
|
||||
VALUES %s
|
||||
"""
|
||||
try:
|
||||
extras.execute_values(self.cursor, query, file_content_batch)
|
||||
logger.info(
|
||||
f"Successfully inserted {len(file_content_batch)} content rows "
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error while inserting file content: {str(e)}")
|
||||
raise
|
||||
|
||||
def upsert_session_info(self, user_id: str, session_id: str):
|
||||
# First check if the session exists
|
||||
check_session_query = """
|
||||
SELECT id FROM session_info
|
||||
WHERE user_id = %s AND session_id = %s
|
||||
"""
|
||||
|
||||
# Query to get daily question count and user type
|
||||
query_get_daily_count = """
|
||||
SELECT sum(question_count), u.user_type
|
||||
FROM session_info s
|
||||
JOIN user_info u ON s.user_id = u.user_id
|
||||
WHERE s.user_id = %s
|
||||
AND s.created_at >= CURRENT_TIMESTAMP - INTERVAL '24 hours' AND s.created_at <= CURRENT_TIMESTAMP
|
||||
GROUP BY u.user_type;
|
||||
"""
|
||||
|
||||
# Query to insert new session
|
||||
insert_session_query = """
|
||||
INSERT INTO session_info
|
||||
(user_id, session_id, question_count, total_enterance, last_enterance)
|
||||
VALUES (%s, %s, 0, 1, CURRENT_TIMESTAMP)
|
||||
RETURNING id
|
||||
"""
|
||||
|
||||
# Query to update existing session
|
||||
update_question_query = """
|
||||
UPDATE session_info
|
||||
SET question_count = question_count + 1,
|
||||
last_enterance = CURRENT_TIMESTAMP
|
||||
WHERE user_id = %s AND session_id = %s
|
||||
RETURNING question_count
|
||||
"""
|
||||
|
||||
try:
|
||||
# Check if session exists
|
||||
self.cursor.execute(check_session_query, (user_id, session_id))
|
||||
session_exists = self.cursor.fetchone()
|
||||
|
||||
# If session doesn't exist, create it
|
||||
if not session_exists:
|
||||
self.cursor.execute(insert_session_query, (user_id, session_id))
|
||||
self.conn.commit()
|
||||
|
||||
# Get daily count and user type
|
||||
self.cursor.execute(query_get_daily_count, (user_id,))
|
||||
result = self.cursor.fetchall()
|
||||
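# daily_count sums question_count across all of the user's sessions from the last 24 hours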
daily_count, user_type = result[0][0], result[0][1]
|
||||
|
||||
# Check free user limits
|
||||
if user_type == "free" and daily_count >= 25:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "Daily question limit reached for free user. Please try again tomorrow or upgrade your plan!",
|
||||
"question_count": daily_count,
|
||||
}
|
||||
|
||||
# Increment question count
|
||||
self.cursor.execute(update_question_query, (user_id, session_id))
|
||||
question_count = self.cursor.fetchone()[0]
|
||||
self.conn.commit()
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"message": "success",
|
||||
"question_count": question_count,
|
||||
"daily_count": daily_count,
|
||||
}
|
||||
except Exception as e:
|
||||
self.conn.rollback()
|
||||
print(f"Error updating session info: {str(e)}")
|
||||
raise e
|
||||
|
||||
def insert_user_rating(
|
||||
self, rating_id: str, user_id: str, rating: int, user_note: str
|
||||
):
|
||||
query = """
|
||||
INSERT INTO user_rating (rating_id, user_id, rating, user_note)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(query, (rating_id, user_id, rating, user_note))
|
||||
except Exception as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def clear_file_info(self, user_id: str, file_ids: list):
|
||||
query = """
|
||||
DELETE FROM file_info
|
||||
WHERE user_id = %s AND file_id IN %s
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(
|
||||
query,
|
||||
(
|
||||
user_id,
|
||||
tuple(
|
||||
file_ids,
|
||||
),
|
||||
),
|
||||
)
|
||||
return 1
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def clear_file_content(self, file_id: str):
|
||||
clear_content_query = """
|
||||
DELETE FROM file_content
|
||||
WHERE file_id = %s
|
||||
"""
|
||||
clear_file_info_query = """
|
||||
DELETE FROM file_info
|
||||
WHERE file_id = %s
|
||||
"""
|
||||
try:
|
||||
self.cursor.execute(
|
||||
clear_content_query,
|
||||
(file_id,),
|
||||
)
|
||||
|
||||
self.cursor.execute(
|
||||
clear_file_info_query,
|
||||
(file_id,),
|
||||
)
|
||||
|
||||
rows_affected = self.cursor.rowcount
|
||||
|
||||
return 1 if rows_affected else 0
|
||||
|
||||
except DatabaseError as e:
|
||||
self.conn.rollback()
|
||||
raise e
|
||||
|
||||
def update_user_subscription(
|
||||
self,
|
||||
user_email: str,
|
||||
lemon_squeezy_customer_id: str,
|
||||
receipt_url: str,
|
||||
):
|
||||
try:
|
||||
query_get_user = """
|
||||
SELECT user_id FROM user_info
|
||||
WHERE user_email = %s
|
||||
LIMIT 1
|
||||
"""
|
||||
self.cursor.execute(query_get_user, (user_email,))
|
||||
result = self.cursor.fetchone()
|
||||
|
||||
if result:
|
||||
# Insert user into the premium table
|
||||
user_id = result[0]
|
||||
query_insert_premium_user = """
|
||||
INSERT INTO premium_user_info (lemon_squeezy_customer_id, user_id, receipt_url)
|
||||
VALUES (%s, %s, %s)
|
||||
"""
|
||||
self.cursor.execute(
|
||||
query_insert_premium_user,
|
||||
(lemon_squeezy_customer_id, user_id, receipt_url),
|
||||
)
|
||||
|
||||
# Update user info within the user_info table
|
||||
query_update_user_info = """
|
||||
UPDATE user_info
|
||||
SET user_type = %s
|
||||
WHERE user_id = %s
|
||||
RETURNING user_id
|
||||
"""
|
||||
self.cursor.execute(query_update_user_info, ("premium", user_id))
|
||||
return True
|
||||
else:
|
||||
# This is for handling webhooks before we've updated the user record
|
||||
logger.warning(
|
||||
f"Received webhook for unknown customer: {lemon_squeezy_customer_id}"
|
||||
)
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating subscription: {str(e)}")
|
||||
self.conn.rollback() # Added rollback to prevent transaction errors
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
with Database() as db:
|
||||
db.reset_database()
|
||||
db.initialize_tables()
|
||||
38
doclink/app/db/sql/database_reset.sql
Normal file
38
doclink/app/db/sql/database_reset.sql
Normal file
@@ -0,0 +1,38 @@
|
||||
-- drop_all_tables.sql
|
||||
|
||||
-- Disable foreign key checks to avoid dependency issues
|
||||
SET session_replication_role = 'replica';
|
||||
|
||||
-- Drop all tables in the public schema
|
||||
DO $$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
BEGIN
|
||||
FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
|
||||
EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(r.tablename) || ' CASCADE';
|
||||
END LOOP;
|
||||
END $$;
|
||||
|
||||
-- Re-enable foreign key checks
|
||||
SET session_replication_role = 'origin';
|
||||
|
||||
-- Optionally, you can also drop sequences if you have any
|
||||
DO $$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
BEGIN
|
||||
FOR r IN (SELECT sequencename FROM pg_sequences WHERE schemaname = 'public') LOOP
|
||||
EXECUTE 'DROP SEQUENCE IF EXISTS ' || quote_ident(r.sequencename) || ' CASCADE';
|
||||
END LOOP;
|
||||
END $$;
|
||||
|
||||
-- Optionally, reset primary key sequences. This only applies to identity columns named "id";
-- the DROP statements above already remove serial-backed sequences, so it is usually unnecessary.
|
||||
DO $$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
BEGIN
|
||||
FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
|
||||
EXECUTE 'ALTER TABLE ' || quote_ident(r.tablename) || ' ALTER COLUMN id RESTART WITH 1;';
|
||||
END LOOP;
|
||||
END $$;
|
||||
88
doclink/app/db/sql/table_initialize.sql
Normal file
88
doclink/app/db/sql/table_initialize.sql
Normal file
@@ -0,0 +1,88 @@
|
||||
CREATE TABLE IF NOT EXISTS user_info (
|
||||
user_id UUID PRIMARY KEY,
|
||||
google_id VARCHAR(255) NOT NULL,
|
||||
user_name VARCHAR(50) NOT NULL,
|
||||
user_surname VARCHAR(50) NOT NULL,
|
||||
user_email VARCHAR(100) UNIQUE NOT NULL,
|
||||
user_type VARCHAR(20) DEFAULT 'free',
|
||||
picture_url VARCHAR(255),
|
||||
user_created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS premium_user_info (
|
||||
lemon_squeezy_customer_id VARCHAR(255) NOT NULL,
|
||||
user_id UUID PRIMARY KEY,
|
||||
receipt_url VARCHAR,
|
||||
payment_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS user_feedback (
|
||||
feedback_id UUID PRIMARY KEY,
|
||||
user_id UUID NOT NULL,
|
||||
feedback_type VARCHAR(20) NOT NULL,
|
||||
description TEXT NOT NULL,
|
||||
screenshot TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS domain_info (
|
||||
user_id UUID NOT NULL,
|
||||
domain_id UUID PRIMARY KEY,
|
||||
domain_name VARCHAR(30) NOT NULL,
|
||||
domain_type INTEGER,
|
||||
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS file_info (
|
||||
user_id UUID NOT NULL,
|
||||
domain_id UUID NOT NULL,
|
||||
file_id UUID PRIMARY KEY,
|
||||
file_name VARCHAR(255) NOT NULL,
|
||||
file_modified_date DATE,
|
||||
file_upload_date DATE DEFAULT CURRENT_DATE,
|
||||
FOREIGN KEY (user_id) REFERENCES user_info(user_id),
|
||||
FOREIGN KEY (domain_id) REFERENCES domain_info(domain_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS file_content (
|
||||
content_id SERIAL PRIMARY KEY,
|
||||
file_id UUID NOT NULL,
|
||||
sentence TEXT NOT NULL,
|
||||
is_header BOOLEAN DEFAULT FALSE,
|
||||
is_table BOOLEAN DEFAULT FALSE,
|
||||
page_number INTEGER,
|
||||
embedding BYTEA,
|
||||
FOREIGN KEY (file_id) REFERENCES file_info(file_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS session_info (
|
||||
id SERIAL PRIMARY KEY,
|
||||
user_id UUID NOT NULL,
|
||||
session_id UUID NOT NULL,
|
||||
question_count INTEGER DEFAULT 0,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
total_enterance INTEGER DEFAULT 0,
|
||||
last_enterance TIMESTAMP,
|
||||
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS default_content (
|
||||
content_id SERIAL PRIMARY KEY,
|
||||
file_id UUID NOT NULL,
|
||||
sentence TEXT NOT NULL,
|
||||
is_header BOOLEAN DEFAULT FALSE,
|
||||
is_table BOOLEAN DEFAULT FALSE,
|
||||
page_number INTEGER,
|
||||
embedding BYTEA
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS user_rating (
|
||||
rating_id UUID PRIMARY KEY,
|
||||
user_id UUID NOT NULL,
|
||||
rating INTEGER NOT NULL,
|
||||
user_note TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
|
||||
);
|
||||
0
doclink/app/functions/__init__.py
Normal file
0
doclink/app/functions/__init__.py
Normal file
83
doclink/app/functions/chatbot_functions.py
Normal file
83
doclink/app/functions/chatbot_functions.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from openai import OpenAI
|
||||
from dotenv import load_dotenv
|
||||
from langdetect import detect
|
||||
import textwrap
|
||||
import yaml
|
||||
import re
|
||||
from typing import Dict, Any, Match
|
||||
|
||||
|
||||
class ChatbotFunctions:
|
||||
def __init__(self):
|
||||
load_dotenv()
|
||||
self.client = OpenAI()
|
||||
|
||||
with open("app/utils/prompts.yaml", "r", encoding="utf-8") as file:
|
||||
self.prompt_data = yaml.safe_load(file)
|
||||
|
||||
def _prompt_query_generation(self, query, file_lang):
|
||||
return textwrap.dedent(
|
||||
self.get_prompt(category="queries", query=query, file_lang=file_lang)
|
||||
)
|
||||
|
||||
def _prompt_answer_generation(self, query, context, lang, intention):
|
||||
return textwrap.dedent(
|
||||
self.get_prompt(category=intention, query=query, context=context, lang=lang)
|
||||
)
|
||||
|
||||
def response_generation(self, query, context, intention):
|
||||
lang = self.detect_language(query=query)
|
||||
prompt = self._prompt_answer_generation(
|
||||
query=query, context=context, lang=lang, intention=intention
|
||||
)
|
||||
response = self.client.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[
|
||||
{"role": "system", "content": prompt},
|
||||
{"role": "user", "content": query},
|
||||
],
|
||||
temperature=0,
|
||||
)
|
||||
answer = response.choices[0].message.content.strip()
|
||||
return answer
|
||||
|
||||
def query_generation(self, query, file_lang):
|
||||
lang = self.detect_language(query=query)
|
||||
prompt = self._prompt_query_generation(query, file_lang=file_lang)
|
||||
response = self.client.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[
|
||||
{"role": "system", "content": prompt},
|
||||
{"role": "user", "content": query},
|
||||
],
|
||||
temperature=0,
|
||||
)
|
||||
new_queries = response.choices[0].message.content.strip()
|
||||
return new_queries, lang
|
||||
|
||||
def detect_language(self, query):
# Run detection whenever the query contains alphabetic characters
if any(ch.isalpha() for ch in query):
lang = detect(text=query)
return "tr" if lang == "tr" else "en"
return None
|
||||
|
||||
def replace_variables(self, match: Match, kwargs: Dict[str, Any]):
|
||||
variables = match.group(1) or match.group(2)
|
||||
value = kwargs.get(variables)
|
||||
return str(value)
|
||||
|
||||
def get_prompt(self, category, **kwargs):
|
||||
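# Matches $var, ${var} and {var} placeholders in the YAML prompt templates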
variable_pattern = r"\${?(\w+)}?|\{(\w+)\}"
|
||||
try:
|
||||
prompt = self.prompt_data["prompts"]["languages"]["en"][category.strip()][
|
||||
0
|
||||
]["text"]
|
||||
|
||||
def replace_wrapper(match):
|
||||
return self.replace_variables(match, kwargs)
|
||||
|
||||
full_prompt = re.sub(variable_pattern, replace_wrapper, prompt)
|
||||
return full_prompt
|
||||
except KeyError:
|
||||
print(f"No template found for {category}")
|
||||
return None
|
||||
36
doclink/app/functions/embedding_functions.py
Normal file
36
doclink/app/functions/embedding_functions.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import numpy as np
|
||||
from openai import OpenAI
|
||||
from dotenv import load_dotenv
|
||||
from typing import List
|
||||
|
||||
|
||||
class EmbeddingFunctions:
|
||||
def __init__(self):
|
||||
load_dotenv()
|
||||
self.client = OpenAI()
|
||||
|
||||
def create_embeddings_from_sentences(
|
||||
self, sentences: List[str], chunk_size: int = 2000
|
||||
) -> List[np.ndarray]:
|
||||
file_embeddings = []
|
||||
for chunk_index in range(0, len(sentences), chunk_size):
|
||||
chunk_embeddings = self.client.embeddings.create(
|
||||
model="text-embedding-3-small",
|
||||
input=sentences[chunk_index : chunk_index + chunk_size],
|
||||
)
|
||||
chunk_array = np.array(
|
||||
[x.embedding for x in chunk_embeddings.data], dtype=np.float16
|
||||
)
|
||||
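# L2-normalize each row so that inner-product search ranks documents by cosine similarity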
file_embeddings.append(
|
||||
chunk_array / np.linalg.norm(chunk_array, axis=1)[:, np.newaxis]
|
||||
)
|
||||
|
||||
return np.vstack(file_embeddings)
|
||||
|
||||
def create_embedding_from_sentence(self, sentence: list) -> np.ndarray:
|
||||
query_embedding = self.client.embeddings.create(
|
||||
model="text-embedding-3-small", input=sentence
|
||||
)
|
||||
return np.array(query_embedding.data[0].embedding, dtype=np.float16).reshape(
|
||||
1, -1
|
||||
)
|
||||
126
doclink/app/functions/export_functions.py
Normal file
126
doclink/app/functions/export_functions.py
Normal file
@@ -0,0 +1,126 @@
|
||||
from reportlab.lib import colors
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
from io import BytesIO
|
||||
import re
|
||||
|
||||
|
||||
class Exporter:
|
||||
def __init__(self):
|
||||
self.styles = getSampleStyleSheet()
|
||||
self.setup_styles()
|
||||
|
||||
def _register_fonts(self):
|
||||
pdfmetrics.registerFont(TTFont("Helvetica", "Helvetica"))
|
||||
pdfmetrics.registerFont(TTFont("Helvetica-Bold", "Helvetica-Bold"))
|
||||
|
||||
def setup_styles(self):
|
||||
self.styles.add(
|
||||
ParagraphStyle(
|
||||
name="Header",
|
||||
fontSize=14,
|
||||
textColor=colors.HexColor("#10B981"),
|
||||
spaceAfter=12,
|
||||
fontName="Helvetica-Bold",
|
||||
encoding="utf-8",
|
||||
)
|
||||
)
|
||||
|
||||
self.styles.add(
|
||||
ParagraphStyle(
|
||||
name="Content",
|
||||
fontSize=11,
|
||||
textColor=colors.black,
|
||||
spaceAfter=8,
|
||||
fontName="Helvetica",
|
||||
encoding="utf-8",
|
||||
)
|
||||
)
|
||||
|
||||
self.styles.add(
|
||||
ParagraphStyle(
|
||||
name="Bullet-Point",
|
||||
fontSize=11,
|
||||
leftIndent=20,
|
||||
bulletIndent=10,
|
||||
spaceAfter=5,
|
||||
fontName="Helvetica",
|
||||
encoding="utf-8",
|
||||
)
|
||||
)
|
||||
|
||||
def clean_text(self, text: str) -> str:
|
||||
if not isinstance(text, str):
|
||||
text = text.decode("utf-8")
|
||||
|
||||
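# Transliterate Turkish characters that the built-in PDF fonts cannot render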
text = text.replace("ı", "i").replace("İ", "I")
|
||||
text = text.replace("ğ", "g").replace("Ğ", "G")
|
||||
text = text.replace("ü", "u").replace("Ü", "U")
|
||||
text = text.replace("ş", "s").replace("Ş", "S")
|
||||
text = text.replace("ö", "o").replace("Ö", "O")
|
||||
text = text.replace("ç", "c").replace("Ç", "C")
|
||||
|
||||
text = re.sub(
|
||||
r"\[header\](.*?)\[/header\]", r'<para style="Header">\1</para>', text
|
||||
)
|
||||
text = re.sub(r"\[bold\](.*?)\[/bold\]", r"<b>\1</b>", text)
|
||||
return text
|
||||
|
||||
def create_watermark(self, canvas, doc):
|
||||
canvas.saveState()
|
||||
canvas.setFillColor(colors.HexColor("#10B981"))
|
||||
canvas.setFont("Helvetica", 8)
|
||||
canvas.drawString(30, 20, "Generated by docklink.io")
|
||||
canvas.restoreState()
|
||||
|
||||
def export_pdf(self, data: str) -> BytesIO:
|
||||
buffer = BytesIO()
|
||||
content = []
|
||||
cleaned_text = self.clean_text(data)
|
||||
|
||||
doc = SimpleDocTemplate(
|
||||
buffer,
|
||||
pagesize=A4,
|
||||
rightMargin=30,
|
||||
leftMargin=30,
|
||||
topMargin=30,
|
||||
bottomMargin=30,
|
||||
)
|
||||
|
||||
lines = cleaned_text.split("\n")
|
||||
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
if (
|
||||
line.startswith("<h1>")
|
||||
or line.startswith('<para style="Header">')
|
||||
or "header" in line
|
||||
):
|
||||
# Header section
|
||||
text = line.replace("<h1>", "").replace("</h1>", "")
|
||||
content.append(Paragraph(text, self.styles["Header"]))
|
||||
elif line.startswith("-"):
|
||||
# Bullet point
|
||||
text = line.strip()
|
||||
content.append(Paragraph(f"- {text}", self.styles["Bullet-Point"]))
|
||||
else:
|
||||
# Normal text
|
||||
content.append(Paragraph(line, self.styles["Content"]))
|
||||
|
||||
content.append(Spacer(1, 2))
|
||||
|
||||
try:
|
||||
doc.build(
|
||||
content,
|
||||
onFirstPage=self.create_watermark,
|
||||
onLaterPages=self.create_watermark,
|
||||
)
|
||||
buffer.seek(0)
|
||||
return buffer
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Error: {e} Content too large or complex to export to PDF"
|
||||
)
|
||||
12
doclink/app/functions/indexing_functions.py
Normal file
12
doclink/app/functions/indexing_functions.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import faiss
|
||||
|
||||
|
||||
class IndexingFunctions:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def create_flat_index(self, embeddings):
|
||||
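# IndexFlatIP performs exact inner-product search; with normalized embeddings this ranks by cosine similarity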
dimension = len(embeddings[0])
|
||||
index = faiss.IndexFlatIP(dimension)
|
||||
index.add(embeddings)
|
||||
return index
|
||||
536
doclink/app/functions/reading_functions.py
Normal file
536
doclink/app/functions/reading_functions.py
Normal file
@@ -0,0 +1,536 @@
|
||||
import fitz
|
||||
import tempfile
|
||||
import io
|
||||
import re
|
||||
import spacy
|
||||
import pymupdf4llm
|
||||
import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
WordFormatOption,
|
||||
PowerpointFormatOption,
|
||||
HTMLFormatOption,
|
||||
)
|
||||
|
||||
|
||||
class ReadingFunctions:
|
||||
def __init__(self):
|
||||
self.nlp = spacy.load(
|
||||
"en_core_web_sm",
|
||||
disable=[
|
||||
"tagger",
|
||||
"attribute_ruler",
|
||||
"lemmatizer",
|
||||
"ner",
|
||||
"textcat",
|
||||
"custom",
|
||||
],
|
||||
)
|
||||
self.max_file_size_mb = 50
|
||||
self.headers_to_split_on = [
|
||||
("#", "Header 1"),
|
||||
("##", "Header 2"),
|
||||
("###", "Header 3"),
|
||||
("####", "Header 4"),
|
||||
]
|
||||
self.markdown_splitter = MarkdownHeaderTextSplitter(
|
||||
self.headers_to_split_on, strip_headers=False, return_each_line=True
|
||||
)
|
||||
self.converter = DocumentConverter(
|
||||
allowed_formats=[
|
||||
InputFormat.DOCX,
|
||||
InputFormat.PPTX,
|
||||
InputFormat.XLSX,
|
||||
InputFormat.PDF,
|
||||
InputFormat.HTML,
|
||||
],
|
||||
format_options={
|
||||
InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
|
||||
InputFormat.PPTX: PowerpointFormatOption(pipeline_cls=SimplePipeline),
|
||||
InputFormat.HTML: HTMLFormatOption(pipeline_cls=SimplePipeline),
|
||||
},
|
||||
)
|
||||
|
||||
def read_file(self, file_bytes: bytes, file_name: str):
|
||||
"""Read and process file content from bytes"""
|
||||
file_size_mb = self._get_file_size(file_bytes=file_bytes)
|
||||
file_type = file_name.split(".")[-1].lower()
|
||||
|
||||
if file_size_mb > self.max_file_size_mb:
|
||||
raise ValueError(f"File size exceeds {self.max_file_size_mb}MB limit")
|
||||
|
||||
try:
|
||||
if file_type == "pdf":
|
||||
return self._process_pdf(file_bytes=file_bytes)
|
||||
elif file_type == "docx":
|
||||
return self._process_docx(file_bytes=file_bytes)
|
||||
elif file_type == "pptx":
|
||||
return self._process_pptx(file_bytes=file_bytes)
|
||||
elif file_type == "xlsx":
|
||||
return self._process_xlsx(file_bytes=file_bytes)
|
||||
elif file_type == "udf":
|
||||
return self._process_udf(file_bytes=file_bytes)
|
||||
elif file_type in ["txt", "rtf"]:
|
||||
return self._process_txt(file_bytes=file_bytes)
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {file_type}")
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error processing {file_name}: {str(e)}")
|
||||
|
||||
def read_url(self, html_content: str):
|
||||
html_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(delete=True, suffix=".html") as temp_file:
|
||||
temp_file.write(html_content.encode("utf-8"))
|
||||
temp_file.flush()
|
||||
html_path = Path(temp_file.name)
|
||||
md_text = self.converter.convert(
|
||||
html_path
|
||||
).document.export_to_markdown()
|
||||
splits = self.markdown_splitter.split_text(md_text)
|
||||
|
||||
for split in splits:
|
||||
if (
|
||||
not len(split.page_content) > 5
|
||||
or re.match(r"^[^\w]*$", split.page_content)
|
||||
or split.page_content[:4] == "<!--"
|
||||
):
|
||||
continue
|
||||
elif split.metadata and split.page_content[0] == "#":
|
||||
html_data["sentences"].append(split.page_content)
|
||||
html_data["is_header"].append(True)
|
||||
html_data["is_table"].append(False)
|
||||
html_data["page_number"].append(1)
|
||||
elif split.page_content[0] == "|" and split.page_content[-1] == "|":
|
||||
html_data["sentences"].append(split.page_content)
|
||||
html_data["is_header"].append(False)
|
||||
html_data["is_table"].append(True)
|
||||
html_data["page_number"].append(1)
|
||||
else:
|
||||
html_data["sentences"].append(split.page_content)
|
||||
html_data["is_header"].append(False)
|
||||
html_data["is_table"].append(False)
|
||||
html_data["page_number"].append(1)
|
||||
return self._chunk_html(html_data)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Error processing HTML content: {str(e)}")
|
||||
|
||||
def _process_pdf(self, file_bytes: bytes):
|
||||
pdf_data = {"sentences": [], "page_number": [], "is_header": [], "is_table": []}
|
||||
pdf_file = io.BytesIO(file_bytes)
|
||||
with fitz.open(stream=pdf_file, filetype="pdf") as pdf:
|
||||
# Process each page
|
||||
markdown_pages = pymupdf4llm.to_markdown(
|
||||
pdf, page_chunks=True, show_progress=False, margins=0
|
||||
)
|
||||
for i, page in enumerate(markdown_pages):
|
||||
splits = self.markdown_splitter.split_text(page["text"])
|
||||
for split in splits:
|
||||
if not len(split.page_content) > 5 or re.match(
|
||||
r"^[^\w]*$", split.page_content
|
||||
):
|
||||
continue
|
||||
elif (
|
||||
split.metadata and split.page_content[0] == "#"
|
||||
): # Header detection
|
||||
pdf_data["sentences"].append(split.page_content)
|
||||
pdf_data["is_header"].append(True)
|
||||
pdf_data["is_table"].append(False)
|
||||
pdf_data["page_number"].append(i + 1)
|
||||
elif (
|
||||
split.page_content[0] == "*"
|
||||
and split.page_content[-1] == "*"
|
||||
and (
|
||||
re.match(
|
||||
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
|
||||
split.page_content,
|
||||
)
|
||||
or re.match(
|
||||
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
|
||||
split.page_content,
|
||||
)
|
||||
)
|
||||
): # Sub-Header and Header variant detection
|
||||
pdf_data["sentences"].append(split.page_content)
|
||||
pdf_data["is_header"].append(True)
|
||||
pdf_data["is_table"].append(False)
|
||||
pdf_data["page_number"].append(i + 1)
|
||||
elif (
|
||||
split.page_content[0] == "|" and split.page_content[-1] == "|"
|
||||
): # Table detection
|
||||
pdf_data["sentences"].append(split.page_content)
|
||||
pdf_data["is_header"].append(False)
|
||||
pdf_data["is_table"].append(True)
|
||||
pdf_data["page_number"].append(i + 1)
|
||||
else:
|
||||
pdf_data["sentences"].append(split.page_content)
|
||||
pdf_data["is_header"].append(False)
|
||||
pdf_data["is_table"].append(False)
|
||||
pdf_data["page_number"].append(i + 1)
|
||||
return pdf_data
|
||||
|
||||
def _process_docx(self, file_bytes: bytes):
|
||||
docx_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
current_length = 0
|
||||
chars_per_page = 2000
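# DOCX has no fixed pagination, so page numbers are approximated every ~2000 characters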
|
||||
current_page = 1
|
||||
|
||||
docx_file = io.BytesIO(file_bytes)
|
||||
with tempfile.NamedTemporaryFile(delete=True, suffix=".docx") as temp_file:
|
||||
temp_file.write(docx_file.getvalue())
|
||||
docx_path = Path(temp_file.name)
|
||||
md_text = self.converter.convert(docx_path).document.export_to_markdown()
|
||||
splits = self.markdown_splitter.split_text(md_text)
|
||||
for split in splits:
|
||||
if current_length + len(split.page_content) > chars_per_page:
|
||||
current_page += 1
|
||||
current_length = 0
|
||||
|
||||
if (
|
||||
not len(split.page_content) > 5
|
||||
or re.match(r"^[^\w]*$", split.page_content)
|
||||
or split.page_content[:4] == "<!--"
|
||||
):
|
||||
continue
|
||||
elif (
|
||||
split.metadata and split.page_content[0] == "#"
|
||||
): # Header detection
|
||||
docx_data["sentences"].append(split.page_content)
|
||||
docx_data["is_header"].append(True)
|
||||
docx_data["is_table"].append(False)
|
||||
docx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "*"
|
||||
and split.page_content[-1] == "*"
|
||||
and (
|
||||
re.match(
|
||||
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
|
||||
split.page_content,
|
||||
)
|
||||
or re.match(
|
||||
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
|
||||
split.page_content,
|
||||
)
|
||||
)
|
||||
): # Sub-Header and Header variant detection
|
||||
docx_data["sentences"].append(split.page_content)
|
||||
docx_data["is_header"].append(True)
|
||||
docx_data["is_table"].append(False)
|
||||
docx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "|" and split.page_content[-1] == "|"
|
||||
): # Table detection
|
||||
docx_data["sentences"].append(split.page_content)
|
||||
docx_data["is_header"].append(False)
|
||||
docx_data["is_table"].append(True)
|
||||
docx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
else:
|
||||
docx_data["sentences"].append(split.page_content)
|
||||
docx_data["is_header"].append(False)
|
||||
docx_data["is_table"].append(False)
|
||||
docx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
return docx_data
|
||||
|
||||
def _process_pptx(self, file_bytes: bytes):
|
||||
pptx_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
current_length = 0
|
||||
chars_per_page = 500
|
||||
current_page = 1
|
||||
pptx_file = io.BytesIO(file_bytes)
|
||||
with tempfile.NamedTemporaryFile(delete=True, suffix=".pptx") as temp_file:
|
||||
temp_file.write(pptx_file.getvalue())
|
||||
pptx_path = Path(temp_file.name)
|
||||
md_text = self.converter.convert(pptx_path).document.export_to_markdown()
|
||||
splits = self.markdown_splitter.split_text(md_text)
|
||||
for split in splits:
|
||||
if current_length + len(split.page_content) > chars_per_page:
|
||||
current_page += 1
|
||||
current_length = 0
|
||||
if (
|
||||
not len(split.page_content) > 5
|
||||
or re.match(r"^[^\w]*$", split.page_content)
|
||||
or split.page_content[:4] == "<!--"
|
||||
):
|
||||
continue
|
||||
elif (
|
||||
split.metadata and split.page_content[0] == "#"
|
||||
): # Header detection
|
||||
pptx_data["sentences"].append(split.page_content)
|
||||
pptx_data["is_header"].append(True)
|
||||
pptx_data["is_table"].append(False)
|
||||
pptx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "*"
|
||||
and split.page_content[-1] == "*"
|
||||
and (
|
||||
re.match(
|
||||
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
|
||||
split.page_content,
|
||||
)
|
||||
or re.match(
|
||||
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
|
||||
split.page_content,
|
||||
)
|
||||
)
|
||||
): # Sub-Header and Header variant detection
|
||||
pptx_data["sentences"].append(split.page_content)
|
||||
pptx_data["is_header"].append(True)
|
||||
pptx_data["is_table"].append(False)
|
||||
pptx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "|" and split.page_content[-1] == "|"
|
||||
): # Table detection
|
||||
pptx_data["sentences"].append(split.page_content)
|
||||
pptx_data["is_header"].append(False)
|
||||
pptx_data["is_table"].append(True)
|
||||
pptx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
else:
|
||||
pptx_data["sentences"].append(split.page_content)
|
||||
pptx_data["is_header"].append(False)
|
||||
pptx_data["is_table"].append(False)
|
||||
pptx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
return pptx_data
|
||||
|
||||
def _process_xlsx(self, file_bytes: bytes):
|
||||
xlsx_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
current_length = 0
|
||||
chars_per_page = 2000
|
||||
current_page = 1
|
||||
xlsx_file = io.BytesIO(file_bytes)
|
||||
with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as temp_file:
|
||||
temp_file.write(xlsx_file.getvalue())
|
||||
xlsx_path = Path(temp_file.name)
|
||||
md_text = self.converter.convert(xlsx_path).document.export_to_markdown()
|
||||
splits = self.markdown_splitter.split_text(md_text)
|
||||
for split in splits:
|
||||
if current_length + len(split.page_content) > chars_per_page:
|
||||
current_page += 1
|
||||
current_length = 0
|
||||
if (
|
||||
not len(split.page_content) > 5
|
||||
or re.match(r"^[^\w]*$", split.page_content)
|
||||
or split.page_content[:4] == "<!--"
|
||||
):
|
||||
continue
|
||||
elif (
|
||||
split.metadata and split.page_content[0] == "#"
|
||||
): # Header detection
|
||||
xlsx_data["sentences"].append(split.page_content)
|
||||
xlsx_data["is_header"].append(True)
|
||||
xlsx_data["is_table"].append(False)
|
||||
xlsx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "*"
|
||||
and split.page_content[-1] == "*"
|
||||
and (
|
||||
re.match(
|
||||
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
|
||||
split.page_content,
|
||||
)
|
||||
or re.match(
|
||||
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
|
||||
split.page_content,
|
||||
)
|
||||
)
|
||||
): # Sub-Header and Header variant detection
|
||||
xlsx_data["sentences"].append(split.page_content)
|
||||
xlsx_data["is_header"].append(True)
|
||||
xlsx_data["is_table"].append(False)
|
||||
xlsx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "|" and split.page_content[-1] == "|"
|
||||
): # Table detection
|
||||
xlsx_data["sentences"].append(split.page_content)
|
||||
xlsx_data["is_header"].append(False)
|
||||
xlsx_data["is_table"].append(True)
|
||||
xlsx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
else:
|
||||
xlsx_data["sentences"].append(split.page_content)
|
||||
xlsx_data["is_header"].append(False)
|
||||
xlsx_data["is_table"].append(False)
|
||||
xlsx_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
return xlsx_data
|
||||
|
||||
def _process_udf(self, file_bytes: bytes):
|
||||
udf_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
current_length = 0
|
||||
chars_per_page = 2000
|
||||
current_page = 1
|
||||
|
||||
udf_file = io.BytesIO(file_bytes)
|
||||
with zipfile.ZipFile(udf_file, "r") as zip_ref:
|
||||
xml_content = zip_ref.read("content.xml")
|
||||
dataTree = ET.parse(io.BytesIO(xml_content))
|
||||
splits = self.markdown_splitter.split_text(
|
||||
dataTree.find(".//content").text.strip()
|
||||
)
|
||||
for split in splits:
|
||||
if current_length + len(split.page_content) > chars_per_page:
|
||||
current_page += 1
|
||||
current_length = 0
|
||||
|
||||
if (
|
||||
not len(split.page_content) > 5
|
||||
or re.match(r"^[^\w]*$", split.page_content)
|
||||
or split.page_content[:4] == "<!--"
|
||||
):
|
||||
continue
|
||||
elif (
|
||||
split.metadata and split.page_content[0] == "#"
|
||||
): # Header detection
|
||||
udf_data["sentences"].append(split.page_content)
|
||||
udf_data["is_header"].append(True)
|
||||
udf_data["is_table"].append(False)
|
||||
udf_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "*"
|
||||
and split.page_content[-1] == "*"
|
||||
and (
|
||||
re.match(
|
||||
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
|
||||
split.page_content,
|
||||
)
|
||||
or re.match(
|
||||
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
|
||||
split.page_content,
|
||||
)
|
||||
)
|
||||
): # Sub-Header and Header variant detection
|
||||
udf_data["sentences"].append(split.page_content)
|
||||
udf_data["is_header"].append(True)
|
||||
udf_data["is_table"].append(False)
|
||||
udf_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
elif (
|
||||
split.page_content[0] == "|" and split.page_content[-1] == "|"
|
||||
): # Table detection
|
||||
udf_data["sentences"].append(split.page_content)
|
||||
udf_data["is_header"].append(False)
|
||||
udf_data["is_table"].append(True)
|
||||
udf_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
else:
|
||||
udf_data["sentences"].append(split.page_content)
|
||||
udf_data["is_header"].append(False)
|
||||
udf_data["is_table"].append(False)
|
||||
udf_data["page_number"].append(current_page)
|
||||
current_length += len(split.page_content)
|
||||
return udf_data
|
||||
|
||||
def _process_txt(self, file_bytes: bytes):
|
||||
text_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
text = file_bytes.decode("utf-8", errors="ignore")
|
||||
valid_sentences = self._process_text(text=text)
|
||||
text_data["sentences"].extend(valid_sentences)
|
||||
text_data["page_number"].extend([1] * len(valid_sentences))
|
||||
text_data["is_header"].extend([False] * len(valid_sentences))
|
||||
|
||||
text_data["is_table"] = [False] * len(text_data["sentences"])
|
||||
return text_data
|
||||
|
||||
def _process_text(self, text):
|
||||
docs = self.nlp(text)
|
||||
sentences = [sent.text.replace("\n", " ").strip() for sent in docs.sents]
|
||||
return [sentence for sentence in sentences if len(sentence) > 15]
|
||||
|
||||
def _chunk_html(self, html_text: dict, max_tokens: int = 2000):
|
||||
chunked_data = {
|
||||
"sentences": [],
|
||||
"page_number": [],
|
||||
"is_header": [],
|
||||
"is_table": [],
|
||||
}
|
||||
|
||||
current_length = 0
|
||||
|
||||
for i, sentence in enumerate(html_text["sentences"]):
|
||||
estimated_tokens = len(sentence.split())
|
||||
|
||||
if estimated_tokens > max_tokens:
|
||||
words = sentence.split()
|
||||
for j in range(0, len(words), max_tokens):
|
||||
chunk = " ".join(words[j : j + max_tokens])
|
||||
chunked_data["sentences"].append(chunk)
|
||||
chunked_data["page_number"].append(html_text["page_number"][i])
|
||||
chunked_data["is_header"].append(html_text["is_header"][i])
|
||||
chunked_data["is_table"].append(html_text["is_table"][i])
|
||||
else:
|
||||
if current_length + estimated_tokens > max_tokens:
|
||||
chunked_data["sentences"].append(sentence)
|
||||
chunked_data["page_number"].append(html_text["page_number"][i])
|
||||
chunked_data["is_header"].append(html_text["is_header"][i])
|
||||
chunked_data["is_table"].append(html_text["is_table"][i])
|
||||
current_length = 0
|
||||
else:
|
||||
chunked_data["sentences"].append(sentence)
|
||||
chunked_data["page_number"].append(html_text["page_number"][i])
|
||||
chunked_data["is_header"].append(html_text["is_header"][i])
|
||||
chunked_data["is_table"].append(html_text["is_table"][i])
|
||||
current_length += estimated_tokens
|
||||
|
||||
return chunked_data
|
||||
|
||||
def _get_file_size(self, file_bytes: bytes) -> float:
"""Return the file size in megabytes."""
return len(file_bytes) / (1024 * 1024)
|
||||
|
||||
def _clean_text(self, text: str) -> str:
|
||||
text = re.sub(r"(\b\w+)\s*\n\s*(\w+\b)", r"\1 \2", text)
|
||||
text = re.sub(r"(\w+)-\s+(\w+)", r"\1\2", text)
|
||||
text = re.sub(r"[,()]\s*\n\s*(\w+)", r" \1", text)
|
||||
text = re.sub(r"(\b\w+)\s*-\s*(\w+\b)", r"\1 \2", text)
|
||||
text = re.sub(r"(\w+)\s*[-–]\s*(\w+)", r"\1\2", text)
|
||||
text = re.sub(
|
||||
r"(?:[\s!\"#$%&\'()*+,\-.:;<=>?@\[\\\]^_`{|}~]+)(?!\w)", r" ", text
|
||||
)
|
||||
text = text.replace("\n", " ").strip()
|
||||
return " ".join(text.split())
|
||||
114
doclink/app/functions/scraping_functions.py
Normal file
114
doclink/app/functions/scraping_functions.py
Normal file
@@ -0,0 +1,114 @@
|
||||
import logging
|
||||
import validators
|
||||
|
||||
from requests_html import HTMLSession
|
||||
from urllib.parse import urlparse
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
class Webscraper:
|
||||
def __init__(self):
|
||||
self.session = HTMLSession()
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.unwanted_tags = [
|
||||
"nav",
|
||||
"header",
|
||||
"footer",
|
||||
"aside",
|
||||
"script",
|
||||
"style",
|
||||
"noscript",
|
||||
"iframe",
|
||||
"advertisement",
|
||||
"banner",
|
||||
"cookie-banner",
|
||||
"social-media",
|
||||
"comments",
|
||||
'[class*="ad-"]',
|
||||
'[class*="advertisement"]',
|
||||
'[class*="banner"]',
|
||||
'[class*="social"]',
|
||||
'[class*="footer"]',
|
||||
'[class*="header-nav"]',
|
||||
'[class*="cookie"]',
|
||||
'[class*="popup"]',
|
||||
'[class*="modal"]',
|
||||
'[class*="newsletter"]',
|
||||
]
|
||||
|
||||
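# Throttle scraping to 30 requests per minute; sleep_and_retry blocks until the rate window allows another call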
@sleep_and_retry
|
||||
@limits(calls=30, period=60)
|
||||
def request_creator(self, url: str) -> Optional[str]:
|
||||
try:
|
||||
response = self.session.get(url, timeout=30)
|
||||
response.raise_for_status()
|
||||
return response.html.html
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error making request to {url}: {e}")
|
||||
return None
|
||||
|
||||
def url_validator(self, url: str) -> bool:
|
||||
try:
|
||||
if not validators.url(url):
|
||||
return False
|
||||
|
||||
parsed = urlparse(url)
|
||||
|
||||
return parsed.scheme in ["https", "http"]
|
||||
except Exception as e:
|
||||
self.logger.error(f"URL validation error: {str(e)}")
|
||||
return False
|
||||
|
||||
def html_parser(self, html: str) -> str:
|
||||
try:
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
for selector in self.unwanted_tags:
|
||||
for element in soup.select(selector):
|
||||
element.decompose()
|
||||
|
||||
main_content = None
|
||||
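# Prefer semantic containers first; fall back to divs whose class names hint at main content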
main_tags = ["article", "main", "div"]
|
||||
|
||||
for tag in main_tags:
|
||||
if tag == "div":
|
||||
for element in soup.find_all(tag, class_=True):
|
||||
class_name = str(element.get("class", ""))
|
||||
if any(
|
||||
pattern in class_name.lower()
|
||||
for pattern in ["content", "article", "post", "entry"]
|
||||
):
|
||||
main_content = element
|
||||
break
|
||||
else:
|
||||
main_content = soup.find(tag)
|
||||
|
||||
if main_content:
|
||||
break
|
||||
if not main_content:
|
||||
main_content = soup.body
|
||||
|
||||
return str(main_content) if main_content else str(soup)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error cleaning HTML: {str(e)}")
|
||||
return html
|
||||
|
||||
def scraper(self, url: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
if not self.url_validator(url):
|
||||
return None, "Invalid Format"
|
||||
|
||||
html = self.request_creator(url)
|
||||
if not html:
|
||||
return None, "Failed to fetch URL"
|
||||
|
||||
try:
|
||||
parsed_html = self.html_parser(html=html)
|
||||
return parsed_html, None
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error processing URL {url}: {str(e)}")
|
||||
return None, f"Error processing URL {str(e)}"
|
||||
214
doclink/app/main_dev.py
Normal file
214
doclink/app/main_dev.py
Normal file
@@ -0,0 +1,214 @@
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import RedirectResponse
|
||||
|
||||
import requests as http_requests
|
||||
import os
|
||||
import jwt
|
||||
import uuid
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
from .api import endpoints
|
||||
from .db.database import Database
|
||||
|
||||
# Load configurations
|
||||
load_dotenv()
|
||||
|
||||
# Constants
|
||||
FRONTEND_URL = os.getenv("FRONTEND_URL_DEV", "http://localhost:3000")
|
||||
GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
|
||||
SECRET_KEY = os.getenv("MIDDLEWARE_SECRET_KEY")
|
||||
|
||||
# App initialization
|
||||
app = FastAPI(title="Doclink")
|
||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||
templates = Jinja2Templates(directory="templates")
|
||||
|
||||
# CORS Configuration
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=[FRONTEND_URL],
|
||||
allow_credentials=True,
|
||||
allow_methods=["GET", "POST"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
async def verify_google_token(token: str) -> dict:
|
||||
"""Verify Google OAuth token and get user info"""
|
||||
try:
|
||||
# Use the access token to get user info from Google
|
||||
userinfo_response = http_requests.get(
|
||||
"https://www.googleapis.com/oauth2/v3/userinfo",
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
)
|
||||
|
||||
if not userinfo_response.ok:
|
||||
raise ValueError("Failed to get user info")
|
||||
|
||||
userinfo = userinfo_response.json()
|
||||
|
||||
# Verify basic user info exists
|
||||
if not userinfo.get("sub"): # 'sub' is the Google user ID
|
||||
raise ValueError("Invalid user info")
|
||||
|
||||
return userinfo
|
||||
except Exception as e:
|
||||
print(f"Token verification error: {str(e)}")
|
||||
raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
|
||||
|
||||
|
||||
def create_session_token(user_data: dict) -> str:
|
||||
"""Create an encrypted session token"""
|
||||
payload = {
|
||||
"user_id": user_data["user_id"],
|
||||
"email": user_data["email"],
|
||||
"exp": datetime.utcnow() + timedelta(days=1), # 1 day expiration
|
||||
}
|
||||
return jwt.encode(payload, SECRET_KEY, algorithm="HS256")
|
||||
|
||||
|
||||
def verify_session_token(session_token: str) -> dict:
|
||||
"""Verify and decode session token"""
|
||||
try:
|
||||
payload = jwt.decode(session_token, SECRET_KEY, algorithms=["HS256"])
|
||||
return payload
|
||||
except jwt.ExpiredSignatureError:
|
||||
raise HTTPException(status_code=401, detail="Session expired")
|
||||
except jwt.InvalidTokenError:
|
||||
raise HTTPException(status_code=401, detail="Invalid session")
|
||||
|
||||
|
||||
@app.middleware("http")
|
||||
async def auth_middleware(request: Request, call_next):
|
||||
"""Middleware to check authentication for protected routes"""
|
||||
# Public routes that don't need authentication
|
||||
public_paths = {"/api/version", "/docs", "/redoc", "/openapi.json"}
|
||||
|
||||
if request.url.path in public_paths:
|
||||
return await call_next(request)
|
||||
|
||||
# Check if it's a chat route
|
||||
if request.url.path.startswith("/chat/"):
|
||||
# Get either query parameters (from Next.js redirect) or session cookie
|
||||
token = request.query_params.get("token")
|
||||
session_cookie = request.cookies.get("session_token")
|
||||
|
||||
if not token and not session_cookie:
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
try:
|
||||
# If we have both token and session, prioritize session
|
||||
if session_cookie:
|
||||
try:
|
||||
user_data = verify_session_token(session_cookie)
|
||||
request.state.user_data = user_data
|
||||
return await call_next(request)
|
||||
except Exception as e:
|
||||
print(f"Error {e}")
|
||||
if not token:
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
# Token-based auth as fallback
|
||||
if token:
|
||||
print("Using token authentication")
|
||||
request.state.token = token
|
||||
request.state.user_id = request.query_params.get("userId")
|
||||
request.state.is_new_user = (
|
||||
request.query_params.get("isNewUser", "false").lower() == "true"
|
||||
)
|
||||
return await call_next(request)
|
||||
|
||||
# No valid auth method
|
||||
print("No valid authentication method found")
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Auth middleware error: {str(e)}", exc_info=True)
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
return await call_next(request)
|
||||
|
||||
|
||||
@app.get("/chat/{session_id}")
|
||||
async def chat_page(request: Request, session_id: str):
|
||||
"""Handle both initial and subsequent visits to chat page"""
|
||||
try:
|
||||
# If we have a token in query params, this is an initial visit
|
||||
if hasattr(request.state, "token"):
|
||||
# Verify Google token and get user info
|
||||
google_user = await verify_google_token(request.state.token)
|
||||
|
||||
# Create user data
|
||||
user_data = {
|
||||
"user_id": request.state.user_id,
|
||||
"email": google_user.get("email"),
|
||||
"name": google_user.get("name"),
|
||||
"picture": google_user.get("picture"),
|
||||
}
|
||||
|
||||
# Create session token
|
||||
session_token = create_session_token(user_data)
|
||||
|
||||
# Create domain if first time
|
||||
if request.state.is_new_user:
|
||||
with Database() as db:
|
||||
domain_id = str(uuid.uuid4())
|
||||
db.insert_domain_info(
|
||||
user_id=request.state.user_id,
|
||||
domain_id=domain_id,
|
||||
domain_name="My First Folder",
|
||||
domain_type=0,
|
||||
)
|
||||
db.insert_user_guide(
|
||||
user_id=request.state.user_id, domain_id=domain_id
|
||||
)
|
||||
|
||||
# Create response with template
|
||||
response = templates.TemplateResponse(
|
||||
"app.html",
|
||||
{
|
||||
"request": request,
|
||||
"session_id": session_id,
|
||||
"user_id": user_data["user_id"],
|
||||
"is_first_time": request.state.is_new_user,
|
||||
"environment": "dev",
|
||||
},
|
||||
)
|
||||
|
||||
# Set session cookie
|
||||
response.set_cookie(
|
||||
key="session_token",
|
||||
value=session_token,
|
||||
httponly=True,
|
||||
secure=False,
|
||||
max_age=259200, # 3 days
|
||||
samesite="lax",
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
# If we have user_data from cookie, this is a subsequent visit
|
||||
else:
|
||||
user_data = request.state.user_data
|
||||
return templates.TemplateResponse(
|
||||
"app.html",
|
||||
{
|
||||
"request": request,
|
||||
"session_id": session_id,
|
||||
"user_id": user_data["user_id"],
|
||||
"is_first_time": False,
|
||||
"environment": "dev",
|
||||
},
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in chat page: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail="Error rendering application")
|
||||
|
||||
|
||||
# Include other routes
|
||||
app.include_router(endpoints.router, prefix="/api/v1")
|
||||
279
doclink/app/main_prod.py
Normal file
279
doclink/app/main_prod.py
Normal file
@@ -0,0 +1,279 @@
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import RedirectResponse
|
||||
|
||||
import requests as http_requests
|
||||
import os
|
||||
import jwt
|
||||
import uuid
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from .api import endpoints
|
||||
from .db.database import Database
|
||||
|
||||
# Load configurations
|
||||
load_dotenv()
|
||||
|
||||
# Constants
|
||||
FRONTEND_URL = os.getenv("FRONTEND_URL_PROD", "http://localhost:3000")
|
||||
GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
|
||||
SECRET_KEY = os.getenv("MIDDLEWARE_SECRET_KEY")
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
handlers=[
|
||||
RotatingFileHandler(
|
||||
"/var/log/doclink/doclink.log",
|
||||
maxBytes=10000000, # 10MB
|
||||
backupCount=5,
|
||||
),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# App initialization
|
||||
app = FastAPI(title="Doclink")
|
||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||
templates = Jinja2Templates(directory="templates")
|
||||
|
||||
|
||||
# Middleware headers
|
||||
@app.middleware("http")
|
||||
async def add_security_headers(request: Request, call_next):
|
||||
response = await call_next(request)
|
||||
|
||||
response.headers["Content-Security-Policy"] = (
|
||||
"default-src 'self';"
|
||||
"script-src 'self' 'unsafe-inline' 'unsafe-eval' "
|
||||
"https://cdnjs.cloudflare.com "
|
||||
"https://www.googletagmanager.com "
|
||||
"https://www.google-analytics.com "
|
||||
"https://cdn.jsdelivr.net;"
|
||||
"style-src 'self' 'unsafe-inline' "
|
||||
"https://fonts.googleapis.com "
|
||||
"https://cdn.jsdelivr.net "
|
||||
"https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/ "
|
||||
"https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.2/;"
|
||||
"style-src-elem 'self' 'unsafe-inline' "
|
||||
"https://fonts.googleapis.com "
|
||||
"https://cdn.jsdelivr.net "
|
||||
"https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/ "
|
||||
"https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.2/;"
|
||||
"font-src 'self' https://fonts.gstatic.com "
|
||||
"https://cdn.jsdelivr.net data:;"
|
||||
"img-src 'self' data: https://www.google-analytics.com https://*.googleusercontent.com;"
|
||||
"connect-src 'self' https://www.google-analytics.com;"
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
# CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=[FRONTEND_URL],
|
||||
allow_credentials=True,
|
||||
allow_methods=["GET", "POST"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
async def verify_google_token(token: str) -> dict:
|
||||
"""Verify Google OAuth token and get user info"""
|
||||
try:
|
||||
# Use the access token to get user info from Google
|
||||
userinfo_response = http_requests.get(
|
||||
"https://www.googleapis.com/oauth2/v3/userinfo",
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
)
|
||||
|
||||
if not userinfo_response.ok:
|
||||
raise ValueError("Failed to get user info")
|
||||
|
||||
userinfo = userinfo_response.json()
|
||||
|
||||
# Verify basic user info exists
|
||||
if not userinfo.get("sub"): # 'sub' is the Google user ID
|
||||
raise ValueError("Invalid user info")
|
||||
|
||||
return userinfo
|
||||
except Exception as e:
|
||||
logger.info(f"Token verification error: {str(e)}")
|
||||
raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
|
||||
|
||||
|
||||
def create_session_token(user_data: dict) -> str:
|
||||
"""Create an encrypted session token"""
|
||||
payload = {
|
||||
"user_id": user_data["user_id"],
|
||||
"email": user_data["email"],
|
||||
"exp": datetime.utcnow() + timedelta(days=1), # 1 day expiration
|
||||
}
|
||||
return jwt.encode(payload, SECRET_KEY, algorithm="HS256")
|
||||
|
||||
|
||||
def verify_session_token(session_token: str) -> dict:
|
||||
"""Verify and decode session token"""
|
||||
try:
|
||||
payload = jwt.decode(session_token, SECRET_KEY, algorithms=["HS256"])
|
||||
return payload
|
||||
except jwt.ExpiredSignatureError:
|
||||
raise HTTPException(status_code=401, detail="Session expired")
|
||||
except jwt.InvalidTokenError:
|
||||
raise HTTPException(status_code=401, detail="Invalid session")
|
||||
|
||||
|
||||
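A quick, hedged round-trip check of the two session-token helpers above (illustrative values; assumes MIDDLEWARE_SECRET_KEY is set in the environment and PyJWT is installed):

# Sketch only, not part of the application code.
example_user = {"user_id": "demo-user-id", "email": "demo@example.com"}
demo_token = create_session_token(example_user)
decoded = verify_session_token(demo_token)
assert decoded["user_id"] == "demo-user-id"  # "exp" is added automatically by create_session_token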
@app.middleware("http")
|
||||
async def auth_middleware(request: Request, call_next):
|
||||
"""Middleware to check authentication for protected routes"""
|
||||
# Public routes that don't need authentication
|
||||
public_paths = {"/api/version", "/docs", "/redoc", "/openapi.json"}
|
||||
|
||||
if request.url.path in public_paths:
|
||||
return await call_next(request)
|
||||
|
||||
# Check if it's a chat route
|
||||
if request.url.path.startswith("/chat/"):
|
||||
# Get either query parameters (from Next.js redirect) or session cookie
|
||||
token = request.query_params.get("token")
|
||||
session_cookie = request.cookies.get("session_token")
|
||||
|
||||
if not token and not session_cookie:
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
try:
|
||||
# If we have both token and session, prioritize session
|
||||
if session_cookie:
|
||||
try:
|
||||
user_data = verify_session_token(session_cookie)
|
||||
request.state.user_data = user_data
|
||||
return await call_next(request)
|
||||
except Exception as e:
|
||||
logger.info(f"Error validation of session cookie {e}")
|
||||
if not token:
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
# Token-based auth as fallback
|
||||
if token:
|
||||
logger.info("Using token authentication")
|
||||
request.state.token = token
|
||||
request.state.user_id = request.query_params.get("userId")
|
||||
request.state.is_new_user = (
|
||||
request.query_params.get("isNewUser", "false").lower() == "true"
|
||||
)
|
||||
return await call_next(request)
|
||||
|
||||
# No valid auth method
|
||||
logger.info("No valid authentication method found")
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
except Exception as e:
|
||||
logger.info(f"Auth middleware error: {str(e)}", exc_info=True)
|
||||
return RedirectResponse(url=FRONTEND_URL)
|
||||
|
||||
return await call_next(request)
|
||||
|
||||
|
||||
@app.get("/chat/{session_id}")
|
||||
async def chat_page(request: Request, session_id: str):
|
||||
"""Handle both initial and subsequent visits to chat page"""
|
||||
logger.info(f"******** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ********")
|
||||
try:
|
||||
logger.info(f"Processing chat page request for session {session_id}")
|
||||
logger.info(f"Request state: {vars(request.state)}")
|
||||
|
||||
# If we have a token in query params, this is an initial visit
|
||||
if hasattr(request.state, "token"):
|
||||
logger.info("Processing initial visit with token")
|
||||
# Verify Google token and get user info
|
||||
try:
|
||||
google_user = await verify_google_token(request.state.token)
|
||||
logger.info(f"Google user verified: {google_user.get('email')}")
|
||||
|
||||
# Create user data
|
||||
user_data = {
|
||||
"user_id": request.state.user_id,
|
||||
"email": google_user.get("email"),
|
||||
"name": google_user.get("name"),
|
||||
"picture": google_user.get("picture"),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing token: {str(e)}", exc_info=True)
|
||||
raise
|
||||
|
||||
# Create session token
|
||||
session_token = create_session_token(user_data)
|
||||
|
||||
# Create domain if first time
|
||||
if request.state.is_new_user:
|
||||
with Database() as db:
|
||||
domain_id = str(uuid.uuid4())
|
||||
db.insert_domain_info(
|
||||
user_id=request.state.user_id,
|
||||
domain_id=domain_id,
|
||||
domain_name="My First Folder",
|
||||
domain_type=0,
|
||||
)
|
||||
db.insert_user_guide(
|
||||
user_id=request.state.user_id, domain_id=domain_id
|
||||
)
|
||||
|
||||
# Create response with template
|
||||
response = templates.TemplateResponse(
|
||||
"app.html",
|
||||
{
|
||||
"request": request,
|
||||
"session_id": session_id,
|
||||
"user_id": user_data["user_id"],
|
||||
"is_first_time": request.state.is_new_user,
|
||||
"environment": "prod",
|
||||
},
|
||||
)
|
||||
|
||||
# Set session cookie
|
||||
response.set_cookie(
|
||||
key="session_token",
|
||||
value=session_token,
|
||||
httponly=True,
|
||||
secure=False,
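# NOTE: assumption - set secure=True when the app is served over HTTPS in production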
|
||||
max_age=86400, # 1 day
|
||||
samesite="lax",
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
# If we have user_data from cookie, this is a subsequent visit
|
||||
else:
|
||||
logger.info("Processing subsequent visit with session cookie")
|
||||
|
||||
user_data = request.state.user_data
|
||||
return templates.TemplateResponse(
|
||||
"app.html",
|
||||
{
|
||||
"request": request,
|
||||
"session_id": session_id,
|
||||
"user_id": user_data["user_id"],
|
||||
"is_first_time": False,
|
||||
"environment": "prod",
|
||||
},
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.info(f"Error processing subsequent visit with session cookie {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Error rendering application {e}")
|
||||
|
||||
|
||||
# Include other routes
|
||||
app.include_router(endpoints.router, prefix="/api/v1")
|
||||
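For manual testing of the flow above, a minimal client-side sketch of the initial /chat visit (all values are placeholders; assumes the requests package and the backend running locally on port 8000):

# Sketch only, not part of the application code.
import requests

resp = requests.get(
    "http://localhost:8000/chat/<session-id>",
    params={"token": "<google-access-token>", "userId": "<user-uuid>", "isNewUser": "true"},
    allow_redirects=False,
    timeout=10,
)
print(resp.status_code)                    # app.html on success, redirect/error otherwise
print(resp.cookies.get("session_token"))   # set by the handler on a successful initial visit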
190
doclink/app/redis_manager.py
Normal file
@@ -0,0 +1,190 @@
|
||||
from redis import Redis
|
||||
from typing import Optional, Any
|
||||
import pickle
|
||||
import logging
|
||||
from functools import wraps
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RedisConnectionError(Exception):
|
||||
"""Custom exception for Redis connection issues"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class RedisManager:
|
||||
_instance = None
|
||||
|
||||
def __new__(cls):
|
||||
if cls._instance is None:
|
||||
cls._instance = super(RedisManager, cls).__new__(cls)
|
||||
cls._instance._initialized = False
|
||||
return cls._instance
|
||||
|
||||
def __init__(self):
|
||||
if not self._initialized:
|
||||
try:
|
||||
self.client = Redis(
|
||||
host="localhost",
|
||||
port=6380,
|
||||
db=0,
|
||||
decode_responses=False,
|
||||
socket_timeout=5,
|
||||
)
|
||||
self._initialized = True
|
||||
logger.info("Redis connection established")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to Redis: {str(e)}")
|
||||
raise RedisConnectionError(f"Redis connection failed: {str(e)}")
|
||||
|
||||
self.default_ttl = 1800
|
||||
|
||||
def _handle_connection(func):
|
||||
"""Decorator to handle Redis connection errors"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(self, *args, **kwargs):
|
||||
try:
|
||||
return func(self, *args, **kwargs)
|
||||
except Exception as e:
|
||||
logger.error(f"Redis operation failed: {str(e)}")
|
||||
raise RedisConnectionError(f"Redis operation failed: {str(e)}")
|
||||
|
||||
return wrapper
|
||||
|
||||
@_handle_connection
|
||||
def set_data(self, key: str, value: Any, expiry: int = 1800) -> bool:
|
||||
"""Store data in Redis with expiry time"""
|
||||
try:
|
||||
pickled_value = pickle.dumps(value)
|
||||
return self.client.set(key, pickled_value, ex=expiry)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set data for key {key}: {str(e)}")
|
||||
return False
|
||||
|
||||
@_handle_connection
|
||||
def get_data(self, key: str) -> Optional[Any]:
|
||||
"""Retrieve data from Redis"""
|
||||
try:
|
||||
data = self.client.get(key)
|
||||
return pickle.loads(data) if data else None
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get data for key {key}: {str(e)}")
|
||||
return None
|
||||
|
||||
@_handle_connection
|
||||
def delete_data(self, key: str) -> bool:
|
||||
"""Delete data from Redis"""
|
||||
try:
|
||||
return bool(self.client.delete(key))
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete key {key}: {str(e)}")
|
||||
return False
|
||||
|
||||
@_handle_connection
|
||||
def clear_user_data(self, user_id: str) -> bool:
|
||||
"""Clear all data for a specific user"""
|
||||
try:
|
||||
pattern = f"user:{user_id}:*"
|
||||
keys = self.client.keys(pattern)
|
||||
if keys:
|
||||
return bool(self.client.delete(*keys))
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to clear data for user {user_id}: {str(e)}")
|
||||
return False
|
||||
|
||||
@_handle_connection
|
||||
def get_memory_usage(self) -> dict:
|
||||
"""Get Redis memory statistics"""
|
||||
try:
|
||||
info = self.client.info(section="memory")
|
||||
return {
|
||||
"used_memory": info["used_memory_human"],
|
||||
"peak_memory": info["used_memory_peak_human"],
|
||||
"fragmentation": info["mem_fragmentation_ratio"],
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get memory usage: {str(e)}")
|
||||
return {}
|
||||
|
||||
@_handle_connection
|
||||
def refresh_user_ttl(self, user_id: str) -> bool:
|
||||
"""Refresh TTL for all keys belonging to a user"""
|
||||
try:
|
||||
# Get all keys for this user
|
||||
pattern = f"user:{user_id}:*"
|
||||
user_keys = self.client.keys(pattern)
|
||||
|
||||
if not user_keys:
|
||||
return False
|
||||
|
||||
# Update TTL for all user's keys
|
||||
pipeline = self.client.pipeline()
|
||||
for key in user_keys:
|
||||
pipeline.expire(key, self.default_ttl)
|
||||
|
||||
# Execute all EXPIRE commands atomically
|
||||
results = pipeline.execute()
|
||||
|
||||
# Check if all operations succeeded
|
||||
success = all(results)
|
||||
if not success:
|
||||
logger.warning(f"Some TTL updates failed for user {user_id}")
|
||||
|
||||
return success
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to refresh TTL for user {user_id}: {str(e)}")
|
||||
return False
|
||||
|
||||
@_handle_connection
|
||||
def refresh_key_ttl(self, key: str, ttl: int = None) -> bool:
|
||||
"""Refresh TTL for a specific key"""
|
||||
try:
|
||||
return self.client.expire(key, ttl or self.default_ttl)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to refresh TTL for key {key}: {str(e)}")
|
||||
return False
|
||||
|
||||
def is_connected(self) -> bool:
|
||||
"""Check if Redis connection is alive"""
|
||||
try:
|
||||
return self.client.ping()
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def get_keys_by_pattern(self, pattern: str = "*") -> list:
|
||||
"""Get all keys matching pattern"""
|
||||
try:
|
||||
return [key.decode("utf-8") for key in self.client.keys(pattern)]
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting keys: {e}")
|
||||
return []
|
||||
|
||||
def get_key_info(self, key: str) -> dict:
|
||||
"""Get detailed information about a key"""
|
||||
try:
|
||||
return {
|
||||
"type": self.client.type(key).decode("utf-8"),
|
||||
"ttl": self.client.ttl(key),
|
||||
"memory": self.client.memory_usage(key),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting key info: {e}")
|
||||
return {}
|
||||
|
||||
def monitor_user_data(self, user_id: str) -> dict:
|
||||
"""Monitor all data for a specific user"""
|
||||
try:
|
||||
user_keys = self.get_keys_by_pattern(f"user:{user_id}:*")
|
||||
return {
|
||||
"total_keys": len(user_keys),
|
||||
"keys": {key: self.get_key_info(key) for key in user_keys},
|
||||
"memory_usage": self.get_memory_usage(),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error monitoring user data: {e}")
|
||||
return {}
|
||||
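A minimal, hedged usage sketch of RedisManager (assumes a Redis server listening on localhost:6380 as configured above; the key and values are illustrative):

# Sketch only, not part of the application code.
cache = RedisManager()                       # singleton; repeated calls return the same instance
cache.set_data("user:demo:index", {"files": ["a.pdf"]}, expiry=600)
print(cache.get_data("user:demo:index"))     # -> {'files': ['a.pdf']}
cache.refresh_user_ttl("demo")               # re-applies default_ttl to every user:demo:* key
cache.clear_user_data("demo")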
3592
doclink/app/static/css/app.css
Normal file
File diff suppressed because it is too large
BIN
doclink/app/static/favicon/apple-touch-icon.png
Normal file
Binary file not shown. (25 KiB)
BIN
doclink/app/static/favicon/favicon-16x16.png
Normal file
Binary file not shown. (543 B)
BIN
doclink/app/static/favicon/favicon-32x32.png
Normal file
Binary file not shown. (1.3 KiB)
1
doclink/app/static/favicon/site.webmanifest
Normal file
@@ -0,0 +1 @@
|
||||
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
|
||||
553
doclink/app/static/js/app-api.js
Normal file
@@ -0,0 +1,553 @@
|
||||
window.fetchUserInfo = async function(userID) {
|
||||
try {
|
||||
const response = await fetch('/api/v1/db/get_user_info', {
|
||||
method: 'POST',
|
||||
body: JSON.stringify({ user_id: userID }),
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to fetch initial user data');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (!data) {
|
||||
console.error('User could not be found!');
|
||||
return null;
|
||||
}
|
||||
|
||||
return data;
|
||||
} catch (error) {
|
||||
console.error('Error fetching initial user data:', error);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
window.handleLogoutRequest = async function handleLogoutRequest(userId, sessionId) {
|
||||
try {
|
||||
const response = await fetch('/api/v1/auth/logout', {
|
||||
method: 'POST',
|
||||
body: JSON.stringify({
|
||||
user_id: userId,
|
||||
session_id: sessionId
|
||||
}),
|
||||
credentials: 'include',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Logout failed');
|
||||
}
|
||||
|
||||
return {
|
||||
success: true
|
||||
};
|
||||
} catch (error) {
|
||||
console.error('Logout request failed:', error);
|
||||
return {
|
||||
success: false,
|
||||
error: error.message
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
window.selectDomain = async function selectDomain(domainId, userID) {
|
||||
try {
|
||||
const url = `/api/v1/qa/select_domain?userID=${encodeURIComponent(userID)}`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
domain_id: domainId
|
||||
})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data["message"] !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error selecting domain', error);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
window.renameDomain = async function renameDomain(domainId, newName) {
|
||||
try {
|
||||
const response = await fetch('/api/v1/db/rename_domain', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
domain_id: domainId,
|
||||
new_name: newName
|
||||
})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error renaming domain:', error);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
window.createDomain = async function createDomain(userId, domainName) {
|
||||
try {
|
||||
const url = `/api/v1/db/create_domain?userID=${encodeURIComponent(userId)}`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
domain_name: domainName
|
||||
})
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (!response.ok) {
|
||||
return { success: 0, message: data.message || 'Failed to create domain' };
|
||||
}
|
||||
|
||||
if (data.message !== "success") {
|
||||
return { success: 0, message: data.message };
|
||||
}
|
||||
|
||||
return { success: 1, id: data.domain_id };
|
||||
} catch (error) {
|
||||
console.error('Error creating domain:', error);
|
||||
return { success: 0, id: null };
|
||||
}
|
||||
};
|
||||
|
||||
window.deleteDomain = async function deleteDomain(domainId) {
|
||||
try {
|
||||
const response = await fetch('/api/v1/db/delete_domain', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
domain_id: domainId
|
||||
})
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (!response.ok) {
|
||||
return {
|
||||
success: false,
|
||||
message: data.message
|
||||
};
|
||||
}
|
||||
|
||||
if (data.message !== "success") {
|
||||
return {
|
||||
success: false,
|
||||
message: data.message
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: "Folder deleted"
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error deleting domain:', error);
|
||||
return {
|
||||
success: false,
|
||||
message: "An unexpected error occurred"
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
window.storeFile = async function(userID, formData) {
|
||||
try {
|
||||
const response = await fetch(`/api/v1/io/store_file?userID=${encodeURIComponent(userID)}`, {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to store file');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error storing file:', error);
|
||||
return 0;  // keep the 0/1 contract used by the success path
|
||||
}
|
||||
};
|
||||
|
||||
window.storedriveFile = async function(userID, formData) {
|
||||
try {
|
||||
const response = await fetch(`/api/v1/io/store_drive_file?userID=${encodeURIComponent(userID)}`, {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to store drive file');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error storing file:', error);
|
||||
return 0;  // keep the 0/1 contract used by the success path
|
||||
}
|
||||
};
|
||||
|
||||
window.storeURL = async function(userID, url) {
|
||||
try {
|
||||
const formData = new FormData();
|
||||
formData.append('url', url);
|
||||
|
||||
const response = await fetch(`/api/v1/io/store_url?userID=${encodeURIComponent(userID)}`, {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to store url');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error storing URL:', error);
|
||||
return 0;  // keep the 0/1 contract used by the success path
|
||||
}
|
||||
};
|
||||
|
||||
window.uploadFiles = async function(userID) {
|
||||
try {
|
||||
const response = await fetch(`/api/v1/io/upload_files?userID=${userID}`, {
|
||||
method: 'POST'
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message && data.message.includes("can only have 20 total files")) {
|
||||
return {
|
||||
success: false,
|
||||
error: data.message || 'Upload process failed'
|
||||
};
|
||||
} else if (data.message !== "success") {
|
||||
return {
|
||||
success: false,
|
||||
error: data.message
|
||||
};
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to process uploads');
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: {
|
||||
file_names: data.file_names,
|
||||
file_ids: data.file_ids,
|
||||
message: data.message
|
||||
}
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error uploading files:', error);
|
||||
return {
|
||||
success: false,
|
||||
error: error.message
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
window.removeFile = async function(fileId, domainId, userId) {
|
||||
try {
|
||||
const url = `/api/v1/db/remove_file_upload?userID=${encodeURIComponent(userId)}`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
file_id: fileId,
|
||||
domain_id: domainId
|
||||
})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to remove files');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error removing files:', error);
|
||||
return 0;  // keep the 0/1 contract used by the success path
|
||||
}
|
||||
};
|
||||
|
||||
window.exportResponse = async function(contents) {
|
||||
try {
|
||||
const response = await fetch('/api/v1/io/export_response', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({contents})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errorData = await response.json();
|
||||
throw new Error(errorData.detail || 'Failed to generate PDF');
|
||||
}
|
||||
|
||||
const blob = await response.blob();
|
||||
|
||||
if (blob.size === 0) {
|
||||
throw new Error('Received empty PDF');
|
||||
}
|
||||
|
||||
const url = window.URL.createObjectURL(
|
||||
new Blob([blob], { type: 'application/pdf' })
|
||||
);
|
||||
const link = document.createElement('a');
|
||||
link.href = url;
|
||||
link.download = 'DoclinkExport.pdf';
|
||||
|
||||
document.body.appendChild(link);
|
||||
link.click();
|
||||
|
||||
document.body.removeChild(link);
|
||||
|
||||
setTimeout(() => {
|
||||
window.URL.revokeObjectURL(url);
|
||||
}, 100);
|
||||
|
||||
return {
|
||||
success: true
|
||||
};
|
||||
}
|
||||
catch (error) {
|
||||
console.error('Error exporting response:', error);
|
||||
return {
|
||||
success: false,
|
||||
error: error.message
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
window.sendMessage = async function(message, userId, sessionId, fileIds) {
|
||||
if (!message) {
|
||||
return {
|
||||
message: "Please enter your sentence!",
|
||||
status: 400
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
const url = `/api/v1/qa/generate_answer?userID=${encodeURIComponent(userId)}&sessionID=${encodeURIComponent(sessionId)}`;
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({
|
||||
user_message: message,
|
||||
file_ids: fileIds
|
||||
})
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.message && data.message.includes("Daily question limit reached")) {
|
||||
return {
|
||||
message: data.message || 'Daily question limit reached!',
|
||||
status: 400
|
||||
};
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
return {
|
||||
message: data.message || 'Server error!',
|
||||
status: response.status
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
...data,
|
||||
status: 200
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error:', error);
|
||||
return {
|
||||
message: 'Error generating message!',
|
||||
status: 500
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
window.sendFeedback = async function(formData, userId) {
|
||||
try {
|
||||
const url = `/api/v1/db/insert_feedback?userID=${encodeURIComponent(userId)}`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to submit feedback');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: data.message || 'Thank you for your feedback!'
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error submitting feedback:', error);
|
||||
return {
|
||||
success: false,
|
||||
message: 'Failed to submit feedback. Please try again.'
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
window.sendRating = async function(ratingData, userNote, userId) {
|
||||
try {
|
||||
const url = `/api/v1/db/insert_rating?userID=${encodeURIComponent(userId)}`;
|
||||
const formData = new FormData();
|
||||
formData.append('rating', ratingData);
|
||||
|
||||
if (userNote){
|
||||
formData.append('user_note', userNote);
|
||||
}
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error('Failed to submit rating');
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: data.message || 'Thank you for your feedback!'
|
||||
};
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error submitting rating:', error);
|
||||
return {
|
||||
success: false,
|
||||
message: 'Failed to submit rating. Please try again.'
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
window.googleSignIn = async function googleSignIn() {
|
||||
try {
|
||||
const url = `/api/v1/qa/select_domain?userID=${encodeURIComponent(userID)}`;
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
domain_id: domainId
|
||||
})
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data["message"] !== "success") {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error selecting domain', error);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
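For exercising these endpoints outside the browser, a hedged Python equivalent of fetchUserInfo (placeholder values; assumes the requests package and a locally running backend):

# Sketch only, not part of the application code.
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/db/get_user_info",
    json={"user_id": "<user-uuid>"},
    timeout=10,
)
print(resp.status_code, resp.json())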
3659
doclink/app/static/js/app.js
Normal file
File diff suppressed because it is too large
636
doclink/app/utils/prompts.yaml
Normal file
@@ -0,0 +1,636 @@
|
||||
prompts:
|
||||
languages:
|
||||
en:
|
||||
general_purpose:
|
||||
- id: gp_001
|
||||
text: "
|
||||
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
|
||||
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language and context.\n
|
||||
|
||||
Instructions:\n
|
||||
You will be provided with context windows, each containing several sentences along with the two following metadata: \n
|
||||
File: Specifies source of each context.\n
|
||||
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
|
||||
|
||||
Extracting Relevant Information:\n
|
||||
Carefully analyze the user's query to determine the specific information being requested.\n
|
||||
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
|
||||
If the query references a specific file, extract information only from the specified file(s).\n
|
||||
If the query does not specify a file, aggregate information from all available files.\n
|
||||
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
|
||||
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
|
||||
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
|
||||
Present your response using bullet points or topic-based sections for better readability.\n
|
||||
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
|
||||
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
|
||||
Do not specify the confidence coefficient in response.\n
|
||||
Do not mention the 'context windows'; refer to them as 'resources' instead.\n
|
||||
|
||||
Respond *strictly* in the following format:\n
|
||||
|
||||
[header]Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
[header]Another Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
Rules:\n
|
||||
1. Each major section must start with [header]...[/header]\n
|
||||
2. Use [bold]...[/bold] for important terms or emphasis within content\n
|
||||
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
|
||||
4. Use single dash (-) for all list items\n
|
||||
5. Indent nested list items with exactly 2 spaces per level\n
|
||||
6. Place one empty line between major sections\n
|
||||
7. Do not use any other list markers (bullets, dots, numbers)\n
|
||||
8. Keep indentation consistent throughout the response\n
|
||||
|
||||
Context Windows:\n
|
||||
{context}\n
|
||||
|
||||
User Query:\n
|
||||
{query}\n
|
||||
|
||||
User Query language:\n
|
||||
{lang}\n
|
||||
"
|
||||
|
||||
Informational:
|
||||
- id: info_001
|
||||
text: "
|
||||
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
|
||||
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language and context.\n
|
||||
|
||||
Instructions:\n
|
||||
You will be provided with context windows, each containing several sentences along with the two following metadata:\n
|
||||
File: Specifies source of each context.\n
|
||||
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
|
||||
|
||||
1. Identify factual knowledge, definitions, or explanations requested in the query.\n
|
||||
2. Focus on delivering concise, clear, and specific information.\n
|
||||
3. Include [bold]key terms[/bold] and definitions for clarity and emphasize relevant details.\n
|
||||
4. Avoid generalizations; prioritize extracting exact matches or relevant information from the context.\n
|
||||
5. The answer must be as short, on-point, and clear as possible.\n
|
||||
6. Always prioritize contexts with higher confidence coefficients for accuracy, but cross-check lower-confidence contexts for supplementary or missing details to ensure completeness.\n
|
||||
7. Where appropriate, attribute information to its source file or section implicitly. For example: 'As described in the regulations...' or 'According to the provided report...' without directly mentioning the context window or file name unless explicitly required by the query.\n
|
||||
8. If contradictory information is found: Explicitly state the contradiction and its source(s). Suggest possible resolutions, clarifications, or factors that may explain the discrepancy (e.g., differing data sources, updates, or interpretations).\n
|
||||
9. If the query requests a more detailed response, expand your answer with additional explanations.\n
|
||||
|
||||
Extracting Relevant Information:\n
|
||||
Carefully analyze the user's query to determine the specific information being requested.\n
|
||||
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
|
||||
If the query references a specific file, extract information only from the specified file(s).\n
|
||||
If the query does not specify a file, aggregate information from all available files.\n
|
||||
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
|
||||
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
|
||||
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
|
||||
Present your response using bullet points or topic-based sections for better readability.\n
|
||||
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
|
||||
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
|
||||
Do not specify the confidence coefficient in response.\n
|
||||
Do not mention the 'context windows'; refer to them as 'resources' instead.\n
|
||||
|
||||
Respond *strictly* in the following format:\n
|
||||
|
||||
[header]Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
[header]Another Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
Rules:\n
|
||||
1. Each major section must start with [header]...[/header]\n
|
||||
2. Use [bold]...[/bold] for important terms or emphasis within content\n
|
||||
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
|
||||
4. Use single dash (-) for all list items\n
|
||||
5. Indent nested list items with exactly 2 spaces per level\n
|
||||
6. Place one empty line between major sections\n
|
||||
7. Do not use any other list markers (bullets, dots, numbers)\n
|
||||
8. Keep indentation consistent throughout the response \n
|
||||
|
||||
Context Windows:\n
|
||||
{context}\n
|
||||
|
||||
User Query:\n
|
||||
{query}\n
|
||||
|
||||
User Query language:\n
|
||||
{lang}\n
|
||||
"
|
||||
|
||||
Comparison:
|
||||
- id: comp_001
|
||||
text: "
|
||||
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
|
||||
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language.\n
|
||||
|
||||
Instructions:\n
|
||||
You will be provided with context windows, each containing several sentences along with the two following metadata:\n
|
||||
File: Specifies source of each context.\n
|
||||
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
|
||||
|
||||
1. Extract and compare relevant details from the context to highlight similarities and differences.\n
|
||||
2. If contradictory information is found, specify the contradictions and explain their sources.\n
|
||||
3. Present distinctions or parallels in a structured format, using headers like [header]Similarities[/header] and [header]Differences[/header].\n
|
||||
4. Provide a clear explanation of how the extracted information relates to the user's query.\n
|
||||
5. If consistent information appears across contexts, summarize it in the [header]Similarities[/header] section. For contradictory information: Specify conflicting points under [header]Differences[/header]. Attribute contradictions to their respective sources and explain their impact.\n
|
||||
6. For comparisons involving multiple attributes, organize data using a [bold]tabular format[/bold] or structured lists. Each row or bullet point should represent one attribute.\n
|
||||
7. If the required comparison data is missing, clearly state this under [header]Missing Information[/header]. Offer suggestions for refining the query or point out gaps in the context.\n
|
||||
8. For queries involving detailed or hierarchical comparisons: Use a primary section for high-level differences or similarities. Include nested sections for more granular points.\n
|
||||
|
||||
Extracting Relevant Information:\n
|
||||
Carefully analyze the user's query to determine the specific information being requested.\n
|
||||
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
|
||||
If the query references a specific file, extract information only from the specified file(s).\n
|
||||
If the query does not specify a file, aggregate information from all available files.\n
|
||||
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
|
||||
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
|
||||
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
|
||||
Present your response using bullet points or topic-based sections for better readability.\n
|
||||
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
|
||||
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
|
||||
Do not specify the confidence coefficient in response.\n
|
||||
Do not mention the 'context windows'; refer to them as 'resources' instead.\n
|
||||
|
||||
Respond *strictly* in the following format:\n
|
||||
|
||||
[header]Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
[header]Another Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
Rules:\n
|
||||
1. Each major section must start with [header]...[/header]\n
|
||||
2. Use [bold]...[/bold] for important terms or emphasis within content\n
|
||||
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
|
||||
4. Use single dash (-) for all list items\n
|
||||
5. Indent nested list items with exactly 2 spaces per level\n
|
||||
6. Place one empty line between major sections\n
|
||||
7. Do not use any other list markers (bullets, dots, numbers)\n
|
||||
8. Keep indentation consistent throughout the response\n
|
||||
|
||||
Context Windows:\n
|
||||
{context}\n
|
||||
|
||||
User Query:\n
|
||||
{query}\n
|
||||
|
||||
User Query language:\n
|
||||
{lang}\n
|
||||
"
|
||||
|
||||
Summarization:
|
||||
- id: sum_001
|
||||
text: "
|
||||
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
|
||||
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language.\n
|
||||
|
||||
Instructions:\n
|
||||
You will be provided with context windows, each containing several sentences along with the two following metadata:\n
|
||||
File: Specifies source of each context.\n
|
||||
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
|
||||
|
||||
1. Identify and extract key points or main ideas from the context relevant to the query.\n
|
||||
2. Create a concise and well-structured summary, using bullet points or categories for clarity.\n
|
||||
3. Highlight overarching themes and provide an overview without including excessive details.\n
|
||||
4. Consolidate consistent information across contexts to avoid redundancy.\n
|
||||
5. If the query specifies a focus area (e.g., a section, file, or theme), prioritize summarizing content strictly relevant to that focus. Where no focus is specified, highlight the most critical and recurring themes or points.\n
|
||||
6. Where appropriate, illustrate key ideas with short examples or specific details from the context. Keep examples concise and relevant.\n
|
||||
7. If the context contains contradictions: Summarize both perspectives succinctly. Highlight the contradiction explicitly, and explain how it relates to the query.\n
|
||||
8. The summary should not exceed 200 tokens unless explicitly requested by the query. If required details exceed this limit, provide a prioritized or hierarchical overview.\n
|
||||
|
||||
Extracting Relevant Information:\n
|
||||
Carefully analyze the user's query to determine the specific information being requested.\n
|
||||
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
|
||||
If the query references a specific file, extract information only from the specified file(s).\n
|
||||
If the query does not specify a file, aggregate information from all available files.\n
|
||||
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
|
||||
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
|
||||
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
|
||||
Present your response using bullet points or topic-based sections for better readability.\n
|
||||
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
|
||||
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
|
||||
Do not specify the confidence coefficient in response.\n
|
||||
Do not mention the 'context windows'; refer to them as 'resources' instead.\n
|
||||
|
||||
Respond *strictly* in the following format:\n
|
||||
|
||||
[header]Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
[header]Another Section Name[/header]\n
|
||||
Content with [bold]bold terms[/bold] when needed\n
|
||||
- Main point\n
|
||||
- Sub-point\n
|
||||
- Further nested point\n
|
||||
|
||||
Rules:\n
|
||||
1. Each major section must start with [header]...[/header]\n
|
||||
2. Use [bold]...[/bold] for important terms or emphasis within content\n
|
||||
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
|
||||
4. Use single dash (-) for all list items\n
|
||||
5. Indent nested list items with exactly 2 spaces per level\n
|
||||
6. Place one empty line between major sections\n
|
||||
7. Do not use any other list markers (bullets, dots, numbers)\n
|
||||
8. Keep indentation consistent throughout the response\n
|
||||
|
||||
Context Windows:\n
|
||||
{context}\n
|
||||
|
||||
User Query:\n
|
||||
{query}\n
|
||||
|
||||
User Query language:\n
|
||||
{lang}\n
|
||||
"
|
||||
|
||||
queries:
|
||||
- id: query_001
|
||||
text: "
|
||||
Task: Analyze, Correct, and Generate Related Questions & Answers\n
|
||||
Instructions:\n
|
||||
You are given a user query.\n
|
||||
|
||||
First, check the user question. If it has no meaning, return an empty string. If it is meaningful, do the following:\n
|
||||
Correct any spelling or grammatical errors and return the corrected question as the first line of the output.\n
|
||||
Generate 3 semantically similar queries that retain the same meaning as the corrected query.\n
|
||||
Create 3 different questions that approach the original query from different angles but stay related.\n
|
||||
Answer the last 3 questions with concise responses, 1-2 sentences max each.\n
|
||||
Then, analyze the corrected user query and determine its intent; the list of possible intents, their keywords, and examples are given below. If the intent can't be determined, return an empty '' string.\n
|
||||
Please respond in the file language, specified by the {file_lang} variable (e.g., 'en' for English, 'tr' for Turkish), regardless of the user query's language, ensuring the tone and style align with the file's language.\n
|
||||
If the file language is different from English, look for the intention keywords provided for intent detection below in the file language.\n
|
||||
|
||||
The possible intents are:\n
|
||||
1. Informational: Seeking factual knowledge, definitions, or explanations.\n
|
||||
Intention Keywords: What, define, explain, details, specify, who, why, how.\n
|
||||
Intention Examples: What is the penalty for breaking this rule? → Informational\n
|
||||
2. Summarization: Requesting a concise overview of complex information.\n
|
||||
Intention Keywords: Summarize, overview, main points, key ideas, brief, concise, simplify.\n
|
||||
Intention Examples: Can you summarize the key points of this document? → Summarization\n
|
||||
3. Comparison: Evaluating options, methods, or technologies.\n
|
||||
Intention Keywords: Compare, difference, similarity, versus, contrast, better, alternative, pros and cons.\n
|
||||
Intention Examples: Compare the benefits of these two methods. → Comparison\n
|
||||
|
||||
Return the output **strictly** in the following format:\n
|
||||
[corrected query]\n
|
||||
[first semantically similar query]\n
|
||||
[second semantically similar query]\n
|
||||
[third semantically similar query]\n
|
||||
[first different-angle question]\n
|
||||
[second different-angle question]\n
|
||||
[third different-angle question]\n
|
||||
[first different-angle answer]\n
|
||||
[second different-angle answer]\n
|
||||
[third different-angle answer]\n
|
||||
[user intention]\n
|
||||
|
||||
User query: {query}\n
|
||||
|
||||
File language:\n
|
||||
{file_lang}\n
|
||||
|
||||
Example:\n
|
||||
User query: How does retrieval-augmented generation work in AI systems?\n
|
||||
|
||||
File language: en\n
|
||||
|
||||
Output:\n
|
||||
How does retrieval-augmented generation work in AI systems?\n
|
||||
What is the process of retrieval-augmented generation in AI?\n
|
||||
How does RAG help AI systems retrieve and generate information?\n
|
||||
Can you explain how retrieval-augmented generation functions in AI applications?\n
|
||||
What are the key advantages of using RAG in AI?\n
|
||||
How does RAG differ from traditional machine learning models?\n
|
||||
What challenges does RAG face in implementation?\n
|
||||
RAG enhances AI by providing more accurate responses by retrieving relevant external data.\n
|
||||
Unlike traditional models, RAG integrates search capabilities to access external knowledge during inference.\n
|
||||
Major challenges include latency in retrieval, ensuring relevance of fetched data, and maintaining up-to-date information.\n
|
||||
Informational\n
|
||||
"
|
||||
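The templates above are plain YAML strings with {context}, {query}, and {lang} placeholders; a hedged sketch of how one of them could be loaded and filled for a quick look (assumes PyYAML; the real substitution happens in the backend's chatbot functions):

# Sketch only, not part of the application code.
import yaml

with open("doclink/app/utils/prompts.yaml", encoding="utf-8") as fh:
    data = yaml.safe_load(fh)

template = data["prompts"]["languages"]["en"]["general_purpose"][0]["text"]
filled = template.format(context="File: a.pdf, Confidence coefficient: 0.9 ...", query="What is X?", lang="en")
print(filled[:300])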
tr:
|
||||
general_purpose:
|
||||
- id: gp_tr_001
|
||||
text: "
|
||||
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n
|
||||
|
||||
Talimatlar:\n
|
||||
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
|
||||
Dosya: Her bağlamın kaynağını belirtir.\n
|
||||
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n
|
||||
|
||||
İlgili Bilgilerin Çıkarılması:\n
|
||||
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
|
||||
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
|
||||
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
|
||||
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
|
||||
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
|
||||
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
|
||||
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
|
||||
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
|
||||
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
|
||||
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
|
||||
Yanıtta güven katsayısını belirtmeyin.\n
|
||||
|
||||
Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n
|
||||
|
||||
[header]Bölüm Adı[/header]\n
|
||||
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
|
||||
- Ana madde\n
|
||||
- Alt madde\n
|
||||
- Daha alt madde\n
|
||||
|
||||
[header]Diğer Bölüm Adı[/header]\n
|
||||
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
|
||||
- Ana madde\n
|
||||
- Alt madde\n
|
||||
- Daha alt madde\n
|
||||
|
||||
Kurallar:\n
|
||||
1. Her ana bölüm [header]...[/header] ile başlamalı\n
|
||||
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
|
||||
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
|
||||
4. Tüm liste maddeleri için tek tire (-) kullanın\n
|
||||
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
|
||||
6. Ana bölümler arasında bir boş satır bırakın\n
|
||||
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
|
||||
8. Yanıt boyunca tutarlı girintileme kullanın\n
|
||||
|
||||
Bağlam Pencereleri:\n
|
||||
{context}\n
|
||||
|
||||
Kullanıcı Sorgusu:\n
|
||||
{query}\n
|
||||
"
|
||||
|
||||
Bilgi Edinme:
|
||||
- id: info_tr_001
|
||||
text: "
|
||||
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n
|
||||
|
||||
Talimatlar:\n
|
||||
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
|
||||
Dosya: Her bağlamın kaynağını belirtir.\n
|
||||
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n
|
||||
|
||||
1. Sorguda talep edilen gerçek bilgilere, tanımlara veya açıklamalara odaklanın.\n
|
||||
2. Kısa, net ve spesifik bilgiler sunmaya odaklanın.\n
|
||||
3. Açıklık için [bold]önemli terimler[/bold] ve tanımları ekleyin ve ilgili ayrıntıları vurgulayın.\n
|
||||
4. Genellemelerden kaçının; bağlamdan tam eşleşmeleri veya ilgili bilgileri çıkarmayı önceliklendirin.\n
|
||||
5. Cevap mümkün olduğunca kısa, net ve doğrudan olmalı; 150 ile 200 token arasında olmalıdır.\n
|
||||
6. Doğruluk için her zaman daha yüksek güven katsayısına sahip bağlamlara öncelik verin, ancak eksiksizliği sağlamak için ek veya eksik ayrıntılar için daha düşük güven katsayısına sahip bağlamları çapraz kontrol edin.\n
|
||||
7. Uygun olduğunda, bilgiyi kaynak dosya veya bölüme dolaylı olarak atfedin. Örneğin: Yönetmeliklerde belirtildiği gibi... veya Sağlanan rapora göre... ifadelerini kullanın, ancak sorguda açıkça istenmediği sürece bağlam penceresi veya dosya adını doğrudan belirtmeyin.\n
|
||||
8. Çelişkili bilgiler bulunursa: Çelişkiyi ve kaynağını açıkça belirtin. Olası çözüm yollarını, açıklamaları veya farklılıkları açıklayabilecek faktörleri (örneğin, farklı veri kaynakları, güncellemeler veya yorumlar) önerin.\n
|
||||
|
||||
İlgili Bilgilerin Çıkarılması:\n
|
||||
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
|
||||
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
|
||||
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
|
||||
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
|
||||
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
|
||||
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
|
||||
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
|
||||
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
|
||||
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
|
||||
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
|
||||
Yanıtta güven katsayısını belirtmeyin.\n
|
||||
|
||||
Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n
|
||||
|
||||
[header]Bölüm Adı[/header]\n
|
||||
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
|
||||
- Ana madde\n
|
||||
- Alt madde\n
|
||||
- Daha alt madde\n
|
||||
|
||||
[header]Diğer Bölüm Adı[/header]\n
|
||||
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
|
||||
- Ana madde\n
|
||||
- Alt madde\n
|
||||
- Daha alt madde\n
|
||||
|
||||
Kurallar:\n
|
||||
1. Her ana bölüm [header]...[/header] ile başlamalı\n
|
||||
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
|
||||
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
|
||||
4. Tüm liste maddeleri için tek tire (-) kullanın\n
|
||||
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
|
||||
6. Ana bölümler arasında bir boş satır bırakın\n
|
||||
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
|
||||
8. Yanıt boyunca tutarlı girintileme kullanın\n
|
||||
|
||||
Bağlam Pencereleri:\n
|
||||
{context}\n
|
||||
|
||||
Kullanıcı Sorgusu:\n
|
||||
{query}\n
|
||||
"
|
||||
|
||||
Karşılaştırma:
- id: comp_tr_001
text: "
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n

Talimatlar:\n
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
Dosya: Her bağlamın kaynağını belirtir.\n
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n

1. Benzerlikleri ve farklılıkları vurgulamak için bağlamdan ilgili detayları çıkarın ve karşılaştırın.\n
2. Çelişkili bilgiler bulunursa, bu çelişkileri belirtin ve kaynaklarını açıklayın.\n
3. Ayrımları veya paralellikleri, [header]Benzerlikler[/header] ve [header]Farklılıklar[/header] gibi başlıklar kullanarak yapılandırılmış bir formatta sunun.\n
4. Çıkarılan bilgilerin kullanıcının sorgusuyla nasıl ilişkili olduğunu net bir şekilde açıklayın.\n
5. Eğer bağlamlar arasında tutarlı bilgiler bulunuyorsa, bunları [header]Benzerlikler[/header] bölümünde özetleyin. Çelişkili bilgiler için: Çelişen noktaları [header]Farklılıklar[/header] başlığı altında belirtin. Çelişkileri ilgili kaynaklarına atfedin ve bunların etkisini açıklayın.\n
6. Birden fazla özelliği kapsayan karşılaştırmalar için, verileri [bold]tablo formatında[/bold] veya yapılandırılmış listeler halinde düzenleyin. Her bir satır veya madde işareti bir özelliği temsil etmelidir.\n
7. Gerekli karşılaştırma verileri eksikse, bunu [header]Eksik Bilgiler[/header] başlığı altında açıkça belirtin. Sorgunun nasıl iyileştirilebileceğine dair önerilerde bulunun veya bağlamdaki eksikliklere işaret edin.\n
8. Ayrıntılı veya hiyerarşik karşılaştırmaları içeren sorgular için: Genel farklılıklar veya benzerlikler için bir ana bölüm kullanın. Daha ayrıntılı noktalar için iç içe geçmiş bölümler ekleyin.\n

İlgili Bilgilerin Çıkarılması:\n
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
Yanıtta güven katsayısını belirtmeyin.\n

Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n

[header]Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n

[header]Diğer Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n

Kurallar:\n
1. Her ana bölüm [header]...[/header] ile başlamalı\n
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
3. Bölüm başlıkları şunlardan biri olmalı: Benzerlikler, Farklılıklar, Eksik Bilgiler, Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
4. Tüm liste maddeleri için tek tire (-) kullanın\n
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
6. Ana bölümler arasında bir boş satır bırakın\n
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
8. Yanıt boyunca tutarlı girintileme kullanın\n

Bağlam Pencereleri:\n
{context}\n

Kullanıcı Sorgusu:\n
{query}\n
"
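The templates above require answers in [header]...[/header] and [bold]...[/bold] markup. A small, assumed renderer that converts those markers to HTML could look like the sketch below; the project's actual frontend rendering may differ.

# Hypothetical renderer for the [header]/[bold] markers required by the prompts;
# only an illustration, not the repository's real display logic.
import re


def render_answer(answer: str) -> str:
    # Replace [header]...[/header] with <h3> and [bold]...[/bold] with <strong>.
    html = re.sub(r"\[header\](.*?)\[/header\]", r"<h3>\1</h3>", answer, flags=re.DOTALL)
    html = re.sub(r"\[bold\](.*?)\[/bold\]", r"<strong>\1</strong>", html, flags=re.DOTALL)
    return html


# render_answer("[header]Tanım[/header]\n[bold]RAG[/bold] ...")
# -> "<h3>Tanım</h3>\n<strong>RAG</strong> ..."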
Özetleme:
- id: sum_tr_001
text: "
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n

Talimatlar:\n
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
Dosya: Her bağlamın kaynağını belirtir.\n
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n

1. Sorgu ile ilgili bağlamdan anahtar noktaları veya temel fikirleri belirleyin ve çıkarın.\n
2. Netlik için madde işaretleri veya kategoriler kullanarak kısa ve iyi yapılandırılmış bir özet oluşturun.\n
3. Genel temaları vurgulayın ve gereksiz ayrıntılara yer vermeden genel bir bakış sağlayın.\n
4. Tekrarlamaları önlemek için bağlamlar arasındaki tutarlı bilgileri birleştirin.\n
5. Eğer sorgu belirli bir odak alanı (örneğin, bir bölüm, dosya veya tema) belirtiyorsa, yalnızca bu odakla ilgili içeriği özetlemeye öncelik verin. Herhangi bir odak belirtilmemişse, en kritik ve tekrar eden temaları veya noktaları vurgulayın.\n
6. Uygun olduğunda, bağlamdan kısa örnekler veya belirli detaylarla ana fikirleri açıklayın. Örnekleri kısa ve ilgili tutun.\n
7. Bağlamda çelişkiler varsa: Her iki bakış açısını da kısaca özetleyin. Çelişkiyi açıkça belirtin ve bunun sorguyla nasıl ilişkili olduğunu açıklayın.\n
8. Özet, sorgu tarafından açıkça talep edilmedikçe 200 kelimeyi aşmamalıdır. Eğer gerekli detaylar bu sınırı aşarsa, öncelikli veya hiyerarşik bir genel bakış sağlayın.\n

İlgili Bilgilerin Çıkarılması:\n
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
Yanıtta güven katsayısını belirtmeyin.\n

Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n

[header]Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n

[header]Diğer Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n

Kurallar:\n
1. Her ana bölüm [header]...[/header] ile başlamalı\n
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
4. Tüm liste maddeleri için tek tire (-) kullanın\n
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
6. Ana bölümler arasında bir boş satır bırakın\n
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
8. Yanıt boyunca tutarlı girintileme kullanın\n

Bağlam Pencereleri:\n
{context}\n

Kullanıcı Sorgusu:\n
{query}\n
"
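A minimal loading sketch for picking one of the templates above out of the YAML store by category and id. The exact top-level layout of the file (language groups versus flat categories) and the PyYAML dependency are assumptions, not taken from the repository.

# Hypothetical loader: return a prompt template by category and id.
import yaml  # assumes PyYAML is installed


def load_prompt(path: str, category: str, prompt_id: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        store = yaml.safe_load(f)
    # Assumes each category maps to a list of {id, text} entries.
    for entry in store.get(category, []):
        if entry.get("id") == prompt_id:
            return entry["text"]
    raise KeyError(f"{prompt_id} not found under {category}")


# Example: load_prompt("prompts.yaml", "Özetleme", "sum_tr_001")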
queries:
- id: query_tr_001
text: "
Görev: Analiz Et, Düzelt ve İlgili Sorular & Cevaplar Oluştur.\n

Talimatlar:\n
Kullanıcı sorgusu size verilmiştir.\n
Öncelikle kullanıcı sorusunu kontrol edin. Eğer anlamsızsa, boş bir string '' döndürün. Anlamlıysa, şu işlemleri yapın:\n
Herhangi bir yazım veya dilbilgisi hatası olup olmadığını kontrol edin ve düzeltilmiş soruyu çıktıdaki ilk soru olarak döndürün.\n
Ardından, düzeltilmiş soruyla aynı anlamı koruyan 3 semantik olarak benzer sorgu oluşturun.\n
Orijinal soruyu farklı açılardan ele alan, ancak yine de ilgili kalan 3 farklı soru oluşturun.\n
Son 3 soruya, her biri 1-2 cümlelik kısa cevaplarla yanıt verin.\n
Ardından düzeltilmiş kullanıcı sorgusunu analiz edin ve niyetini belirleyin. Niyet listesi, anahtar kelimeler ve örnekler aşağıda verilmiştir. Eğer niyet tam olarak anlaşılamazsa, boş bir string '' döndürün.\n

Olası niyetler:\n
1. Bilgi Edinme: Gerçek bilgileri, tanımları veya açıklamaları öğrenme talebi.\n
Niyet Anahtar Kelimeleri: Ne, tanımla, açıkla, detaylar, belirt, kim, neden, nasıl.\n
Niyet Örnekleri: Bu kuralı ihlal etmenin cezası nedir? → Bilgi Edinme\n
2. Özetleme: Karmaşık bilgilerin kısa bir özetini isteme.\n
Niyet Anahtar Kelimeleri: Özetle, genel bakış, ana noktalar, temel fikirler, kısa, öz, basitleştir.\n
Niyet Örnekleri: Bu belgenin ana noktalarını özetleyebilir misiniz? → Özetleme\n
3. Karşılaştırma: Seçenekleri, yöntemleri veya teknolojileri değerlendirme.\n
Niyet Anahtar Kelimeleri: Karşılaştır, fark, benzerlik, karşılaştırma, daha iyi, alternatif, artılar ve eksiler.\n
Niyet Örnekleri: Bu iki yöntemin faydalarını karşılaştırın. → Karşılaştırma\n

Çıktıyı **kesinlikle** şu formatta döndürün:\n
[düzeltilmiş sorgu]\n
[birinci semantik olarak benzer sorgu]\n
[ikinci semantik olarak benzer sorgu]\n
[üçüncü semantik olarak benzer sorgu]\n
[birinci farklı-açıdan soru]\n
[ikinci farklı-açıdan soru]\n
[üçüncü farklı-açıdan soru]\n
[birinci farklı-açıdan cevap]\n
[ikinci farklı-açıdan cevap]\n
[üçüncü farklı-açıdan cevap]\n
[kullanıcı niyeti]\n

Kullanıcı Sorgusu: {query}\n

Örnek:\n
Kullanıcı sorgusu: Retrieval-augmented generation yapay zeka sistemlerinde nasıl çalışır?\n

Çıktı:\n
Retrieval-augmented generation yapay zeka sistemlerinde nasıl çalışır?\n
Retrieval-augmented generation süreci yapay zekada nasıl işler?\n
RAG, yapay zeka sistemlerine bilgi getirme ve oluşturma konusunda nasıl yardımcı olur?\n
Retrieval-augmented generation yapay zeka uygulamalarında nasıl işlev görür?\n
RAG kullanmanın yapay zeka için temel avantajları nelerdir?\n
RAG, geleneksel makine öğrenimi modellerinden nasıl farklıdır?\n
RAG’in uygulanmasında karşılaşılan zorluklar nelerdir?\n
RAG, yapay zekayı dış verileri getirerek daha doğru yanıtlar sağlamada geliştirir.\n
RAG, geleneksel modellerden farklı olarak çıkarım sırasında harici bilgilere erişim sağlar.\n
Başlıca zorluklar arasında getirme gecikmesi, getirilen verilerin uygunluğu ve bilgilerin güncel tutulması yer alır.\n
Bilgi Edinme\n

Kullanıcı Sorgusu: {query}\n
"
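The query_tr_001 template above demands a strict line-based output: one corrected query, three semantically similar queries, three alternative questions, three short answers, and the detected intent. A small parsing sketch for that contract follows; the class and field names are assumptions, not the repository's API.

# Hypothetical parser for the line-based output required by query_tr_001.
from dataclasses import dataclass, field
from typing import List


@dataclass
class ExpandedQuery:
    corrected: str = ""
    similar: List[str] = field(default_factory=list)
    alternatives: List[str] = field(default_factory=list)
    answers: List[str] = field(default_factory=list)
    intent: str = ""


def parse_expansion(raw: str) -> ExpandedQuery:
    # Drop blank lines; an empty or malformed response is treated as "meaningless".
    lines = [line.strip() for line in raw.splitlines() if line.strip()]
    if len(lines) < 11:
        return ExpandedQuery()
    return ExpandedQuery(
        corrected=lines[0],
        similar=lines[1:4],
        alternatives=lines[4:7],
        answers=lines[7:10],
        intent=lines[10],
    )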
metadata:
version: "1.0"
description: "Prompt type storage with language groups"
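As a complement to the intent list defined in query_tr_001, a plain keyword fallback could map a Turkish query onto the three intents when the model returns an empty intent. The keyword lists below mirror the prompt; everything else in this sketch is an assumption.

# Hypothetical fallback intent matcher; checks the generic "Bilgi Edinme"
# keywords last because they match many queries.
INTENT_KEYWORDS = {
    "Özetleme": ["özetle", "genel bakış", "ana nokta", "temel fikir", "kısa", "öz", "basitleştir"],
    "Karşılaştırma": ["karşılaştır", "fark", "benzerlik", "daha iyi", "alternatif", "artı", "eksi"],
    "Bilgi Edinme": ["ne", "tanımla", "açıkla", "detay", "belirt", "kim", "neden", "nasıl"],
}


def guess_intent(query: str) -> str:
    lowered = query.lower()
    for intent, keywords in INTENT_KEYWORDS.items():
        if any(keyword in lowered for keyword in keywords):
            return intent
    return ""  # unknown intent -> empty string, matching the prompt's contract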