Initial commit: intelaide backend and frontend

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
root
2026-01-20 04:54:10 +00:00
commit 566576637d
297 changed files with 74281 additions and 0 deletions

0
doclink/app/__init__.py Normal file

474
doclink/app/api/core.py Normal file

@@ -0,0 +1,474 @@
from typing import List
import numpy as np
import bcrypt
import re
import base64
import os
from dotenv import load_dotenv
from cryptography.hazmat.primitives.ciphers.aead import AESGCM
from ..functions.reading_functions import ReadingFunctions
from ..functions.embedding_functions import EmbeddingFunctions
from ..functions.indexing_functions import IndexingFunctions
from ..functions.chatbot_functions import ChatbotFunctions
from ..functions.scraping_functions import Webscraper
from ..functions.export_functions import Exporter
class Authenticator:
def __init__(self):
pass
def verify_password(self, plain_password: str, hashed_password: str) -> bool:
return bcrypt.checkpw(
plain_password.encode("utf-8"), hashed_password.encode("utf-8")
)
def hash_password(self, password: str) -> str:
salt = bcrypt.gensalt()
return bcrypt.hashpw(password.encode("utf-8"), salt).decode("utf-8")
class Encryptor:
def __init__(self):
load_dotenv()
self.key = os.getenv("ENCRYPTION_KEY")
self.email_auth = "EMAIL_AUTH_DATA_2025"
self.email_nonce = self.email_auth.encode("utf-8")[:12].ljust(12, b"\0")
self._key_bytes = base64.b64decode(self.key)
self.aesgcm = AESGCM(self._key_bytes)
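# Note: encrypt() prefixes each ciphertext with a fresh 12-byte random nonce and
# base64-encodes the combined blob; decrypt() splits the same layout. The auth_data
# argument (e.g. a file_id) is bound to the ciphertext as AES-GCM associated data.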
def encrypt(self, text: str, auth_data) -> str:
try:
nonce = os.urandom(12)
encrypted_data = self.aesgcm.encrypt(
nonce, text.encode("utf-8"), auth_data.encode("utf-8")
)
combined_encrypt = nonce + encrypted_data
encrypted_sentence = base64.b64encode(combined_encrypt).decode("utf-8")
return encrypted_sentence
except Exception as e:
raise e
def decrypt(self, encrypted_data: str, auth_data) -> str:
try:
decoded_text = base64.b64decode(encrypted_data.encode("utf-8"))
nonce = decoded_text[:12]
encrypted_text = decoded_text[12:]
decrypted_data = self.aesgcm.decrypt(
nonce, encrypted_text, auth_data.encode("utf-8")
)
return decrypted_data.decode("utf-8")
except Exception as e:
raise e
class Processor:
def __init__(
self,
):
self.ef = EmbeddingFunctions()
self.rf = ReadingFunctions()
self.indf = IndexingFunctions()
self.cf = ChatbotFunctions()
self.en = Encryptor()
self.ws = Webscraper()
self.ex = Exporter()
def create_index(self, embeddings: np.ndarray, index_type: str = "flat"):
if index_type == "flat":
index = self.indf.create_flat_index(embeddings=embeddings)
return index
def filter_search(
self, domain_content: dict, domain_embeddings: np.ndarray, file_ids: list
):
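# Restrict the cached domain content and embeddings to the selected file_ids
# (content[4] is the file_id), then build fresh vector indexes for that subset:
# one over all sentences and one over header sentences only.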
filtered_indexes = []
filtered_content = []
for i, content in enumerate(domain_content):
if content[4] in file_ids:
filtered_indexes.append(i)
filtered_content.append(content)
filtered_embeddings = domain_embeddings[filtered_indexes]
index = self.create_index(embeddings=filtered_embeddings)
boost_info = self.extract_boost_info(
domain_content=filtered_content, embeddings=filtered_embeddings
)
try:
index_header = self.create_index(embeddings=boost_info["header_embeddings"])
except IndexError:
index_header = None
return index, filtered_content, boost_info, index_header
def search_index(
self,
user_query: str,
domain_content: dict,
boost_info: dict,
index,
index_header,
):
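# End-to-end retrieval: rewrite the user query, score sentences against the
# filtered index, apply header/file boosts, keep matches scoring >= 0.35, and
# generate the final answer from the decrypted context windows.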
file_lang = self.file_lang_detection(domain_content=domain_content)
queries, lang = self.query_preprocessing(
user_query=user_query, file_lang=file_lang
)
if not queries:
if lang == "tr":
return (
"Sorunu anlayamadım",
None,
None,
)
else:
return (
f"I didn't understand {user_query}",
None,
None,
)
query_embeddings = self.ef.create_embeddings_from_sentences(
sentences=queries[:-1]
)
boost_array = self._create_boost_array(
header_indexes=boost_info["header_indexes"],
sentence_amount=index.ntotal,
query_vector=query_embeddings[0],
index_header=index_header,
)
# Get search distances with occurrences
dict_resource = {}
for i, query_embedding in enumerate(query_embeddings):
D, I = index.search(query_embedding.reshape(1, -1), len(domain_content)) # noqa: E741
if i == 0:
convergence_vector, distance_vector = I[0], D[0]
for i, match_index in enumerate(I[0]):
if match_index in dict_resource:
dict_resource[match_index].append(D[0][i])
else:
dict_resource[match_index] = [D[0][i]]
file_boost_array = self._create_file_boost_array(
domain_content=domain_content,
distance_vector=distance_vector,
convergence_vector=convergence_vector,
)
# Combine boost arrays
combined_boost_array = 0.25 * file_boost_array + 0.75 * boost_array
# Get average occurrences
dict_resource = self._avg_resources(dict_resource)
for key in dict_resource:
dict_resource[key] *= combined_boost_array[key]
sorted_dict = dict(
sorted(dict_resource.items(), key=lambda item: item[1], reverse=True)
)
filtered_indexes = [
sentence_index
for sentence_index in sorted_dict.keys()
if sorted_dict[sentence_index] >= 0.35
]
sorted_sentence_indexes = filtered_indexes[:10]
# Early return with message
if not sorted_sentence_indexes:
if lang == "tr":
return (
"Seçtiğin dokümanlarda bu sorunun cevabını bulamadım",
None,
None,
)
else:
return (
"I couldn't find the answer of the question within the selected files",
None,
None,
)
# Sentences to context creation
context, context_windows, resources = self.context_creator(
sentence_index_list=sorted_sentence_indexes,
domain_content=domain_content,
header_indexes=boost_info["header_indexes"],
table_indexes=boost_info["table_indexes"],
)
answer = self.cf.response_generation(
query=user_query, context=context, intention=queries[-1]
)
return answer, resources, context_windows
def query_preprocessing(self, user_query, file_lang):
generated_queries, lang = self.cf.query_generation(
query=user_query, file_lang=file_lang
)
splitted_queries = generated_queries.split("\n")
if len(splitted_queries) > 1:
return splitted_queries, lang
return None, lang
def _create_boost_array(
self,
header_indexes: list,
sentence_amount: int,
query_vector: np.ndarray,
index_header,
):
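# Start from a neutral multiplier per sentence, then boost sentences that fall
# under headers semantically close to the query (1.3x for the best-matching
# header, 1.2x for the next two, 1.1x afterwards).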
boost_array = np.ones(sentence_amount)
if not index_header:
return boost_array
D, I = index_header.search(query_vector.reshape(1, -1), 10) # noqa: E741
filtered_header_indexes = [
header_index
for index, header_index in enumerate(I[0])
if D[0][index] > 0.30
]
if not filtered_header_indexes:
return boost_array
else:
for i, filtered_index in enumerate(filtered_header_indexes):
try:
start = header_indexes[filtered_index] + 1
end = header_indexes[filtered_index + 1]
if i > 2:
boost_array[start:end] *= 1.1
elif i > 0:
boost_array[start:end] *= 1.2
else:
boost_array[start:end] *= 1.3
except IndexError as e:
print(f"List is out of range {e}")
continue
return boost_array
# File boost function
def _create_file_boost_array(
self,
domain_content: list,
distance_vector: np.ndarray,
convergence_vector: np.ndarray,
):
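# Boost whole files: if the average match score of a file's sentences exceeds
# 0.30, every sentence in that file gets a 1.1x multiplier.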
boost_array = np.ones(len(domain_content))
sort_order = np.argsort(convergence_vector)
sorted_scores = distance_vector[sort_order]
file_counts = {}
if not domain_content:
return boost_array
else:
for _, _, _, _, _, filename in domain_content:
file_counts[filename] = file_counts.get(filename, 0) + 1
file_sentence_counts = np.cumsum([0] + list(file_counts.values()))
for i in range(len(file_sentence_counts) - 1):
start, end = file_sentence_counts[i], file_sentence_counts[i + 1]
if np.mean(sorted_scores[start:end]) > 0.30:
boost_array[start:end] *= 1.1
return boost_array
def context_creator(
self,
sentence_index_list: list,
domain_content: List[tuple],
header_indexes: list,
table_indexes: list,
):
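# Widen each matched sentence into a context window (larger windows for the top
# matches), snap window edges to header and table boundaries where possible,
# then merge overlapping windows before decrypting them into the prompt context.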
context = ""
context_windows = []
widened_indexes = []
original_matches = set(sentence_index_list)
for i, sentence_index in enumerate(sentence_index_list):
window_size = 4 if i < 3 else 2
start = max(0, sentence_index - window_size)
end = min(len(domain_content) - 1, sentence_index + window_size)
if table_indexes:
for table_index in table_indexes:
if sentence_index == table_index:
widened_indexes.append((table_index, table_index))
table_indexes.remove(table_index)
break
if not header_indexes:
widened_indexes.append((start, end))
else:
for i, current_header in enumerate(header_indexes):
if sentence_index == current_header:
start = max(0, sentence_index)
if (
i + 1 < len(header_indexes)
and abs(sentence_index - header_indexes[i + 1]) <= 20
):
end = min(
len(domain_content) - 1, header_indexes[i + 1] - 1
)
else:
end = min(
len(domain_content) - 1, sentence_index + window_size
)
break
elif (
i + 1 < len(header_indexes)
and current_header < sentence_index < header_indexes[i + 1]
):
start = (
current_header
if abs(sentence_index - current_header) <= 20
else max(0, sentence_index - window_size)
)
end = (
header_indexes[i + 1] - 1
if abs(header_indexes[i + 1] - sentence_index) <= 20
else min(
len(domain_content) - 1, sentence_index + window_size
)
)
break
elif (
i == len(header_indexes) - 1
and current_header >= sentence_index
):
start = (
max(0, sentence_index)
if abs(current_header - sentence_index) <= 20
else max(0, sentence_index - window_size)
)
end = min(len(domain_content) - 1, sentence_index + window_size)
break
if (start, end) not in widened_indexes:
widened_indexes.append((start, end))
merged_truples = self.merge_tuples(widen_sentences=widened_indexes)
used_indexes = [
min(index for index in sentence_index_list if tuple[0] <= index <= tuple[1])
for tuple in merged_truples
]
resources = self._extract_resources(
sentence_indexes=used_indexes, domain_content=domain_content
)
for i, tuple in enumerate(merged_truples):
if tuple[0] == tuple[1]:
windened_sentence = " ".join(
self.en.decrypt(
domain_content[tuple[0]][0], domain_content[tuple[0]][4]
)
)
context += f"Context{i + 1}: File:{resources['file_names'][i]}, Confidence:{(len(sentence_index_list) - i) / len(sentence_index_list)}, Table\n{windened_sentence}\n"
context_windows.append(windened_sentence)
else:
highlighted_sentences = []
for index in range(tuple[0], tuple[1] + 1):
sentence_text = self.en.decrypt(
domain_content[index][0], domain_content[index][4]
)
# Highlight original matches
if index in original_matches:
highlighted_sentences.append(f"<mark>{sentence_text}</mark>")
else:
highlighted_sentences.append(sentence_text)
windened_sentence = " ".join(highlighted_sentences)
context += f"Context{i + 1}: File:{resources['file_names'][i]}, Confidence:{(len(sentence_index_list) - i) / len(sentence_index_list)}, {windened_sentence}\n\n"
context_windows.append(windened_sentence)
return context, context_windows, resources
def _avg_resources(self, resources_dict):
for key, value in resources_dict.items():
value_mean = sum(value) / len(value)
value_coefficient = value_mean + len(value) * 0.0025
resources_dict[key] = value_coefficient
return resources_dict
def _extract_resources(self, sentence_indexes: list, domain_content: List[tuple]):
resources = {"file_names": [], "page_numbers": []}
for index in sentence_indexes:
resources["file_names"].append(domain_content[index][5])
resources["page_numbers"].append(domain_content[index][3])
return resources
def _create_dynamic_context(self, sentences):
context = ""
for i, sentence in enumerate(sentences):
context += f"{i + 1}: {sentence}\n"
return context
def extract_boost_info(self, domain_content: List[tuple], embeddings: np.ndarray):
boost_info = {
"header_indexes": [],
"headers": [],
"header_embeddings": [],
"table_indexes": [],
}
for index in range(len(domain_content)):
if domain_content[index][1]:
boost_info["header_indexes"].append(index)
boost_info["headers"].append(domain_content[index][0])
if domain_content[index][2]:
boost_info["table_indexes"].append(index)
boost_info["header_embeddings"] = embeddings[boost_info["header_indexes"]]
return boost_info
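# Merge overlapping (start, end) windows so the same sentences are not repeated
# in the final context.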
def merge_tuples(self, widen_sentences):
sorted_dict = {0: widen_sentences[0]}
for sentence_tuple in widen_sentences[1:]:
tuple_range = range(sentence_tuple[0], sentence_tuple[1])
is_in = 0
for index, value in sorted_dict.items():
current_range = range(value[0], value[1])
if set(tuple_range) & set(current_range):
interval = (
min(sorted_dict[index][0], sentence_tuple[0]),
max(sorted_dict[index][1], sentence_tuple[1]),
)
sorted_dict[index] = interval
is_in = 1
if not is_in:
sorted_dict[index + 1] = sentence_tuple
return list(dict.fromkeys(sorted_dict.values()))
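# Guess the dominant language of the selected files from a sample of up to 25
# decrypted sentences; falls back to English when nothing can be detected.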
def file_lang_detection(self, domain_content: List[tuple]):
file_lang = {}
detected_sentence_amount = min(25, len(domain_content))
for i in range(0, detected_sentence_amount):
decrypted_content = self.en.decrypt(
domain_content[i][0], domain_content[i][4]
)
if re.match(r"\b[a-zA-Z]{" + str(4) + r",}\b", decrypted_content) or (
decrypted_content[0] == "|" and decrypted_content[-1] == "|"
):
lang = self.cf.detect_language(decrypted_content)
file_lang[lang] = file_lang.get(lang, 0) + 1
try:
return max(file_lang, key=file_lang.get)
except ValueError:
return "en"


@@ -0,0 +1,846 @@
from fastapi import APIRouter, UploadFile, HTTPException, Request, Query, File, Form
from fastapi.responses import JSONResponse, StreamingResponse
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from datetime import datetime
import os
import logging
import uuid
import base64
import psycopg2
import io
import hmac
import hashlib
from .core import Processor
from .core import Authenticator
from .core import Encryptor
from ..db.database import Database
from ..redis_manager import RedisManager, RedisConnectionError
# services
router = APIRouter()
processor = Processor()
authenticator = Authenticator()
redis_manager = RedisManager()
encryptor = Encryptor()
# logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# environment variables
GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
GOOGLE_CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
GOOGLE_REDIRECT_URI = os.getenv("GOOGLE_REDIRECT_URI")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# request functions
@router.post("/db/get_user_info")
async def get_user_info(request: Request):
try:
data = await request.json()
user_id = data.get("user_id")
with Database() as db:
user_info, domain_info = db.get_user_info_w_id(user_id)
return JSONResponse(
content={
"user_info": user_info,
"domain_info": domain_info,
},
status_code=200,
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/db/rename_domain")
async def rename_domain(request: Request):
try:
data = await request.json()
selected_domain_id = data.get("domain_id")
new_name = data.get("new_name")
with Database() as db:
success = db.rename_domain(domain_id=selected_domain_id, new_name=new_name)
if not success:
return JSONResponse(
content={"message": "error while renaming domain"},
status_code=400,
)
return JSONResponse(
content={"message": "success"},
status_code=200,
)
except Exception as e:
logger.error(f"Error renaming domain: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/db/create_domain")
async def create_domain(
request: Request,
userID: str = Query(...),
):
try:
data = await request.json()
domain_name = data.get("domain_name")
domain_id = str(uuid.uuid4())
with Database() as db:
result = db.create_domain(
user_id=userID,
domain_id=domain_id,
domain_name=domain_name,
domain_type=1,
)
if not result["success"]:
return JSONResponse(
content={"message": result["message"]},
status_code=400,
)
return JSONResponse(
content={"message": "success", "domain_id": domain_id},
status_code=200,
)
except Exception as e:
logger.error(f"Error renaming domain: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/db/delete_domain")
async def delete_domain(request: Request):
try:
data = await request.json()
domain_id = data.get("domain_id")
with Database() as db:
success = db.delete_domain(domain_id=domain_id)
if success < 0:
return JSONResponse(
content={
"message": "This is your default domain. You cannot delete it completely, instead you can delete the unnucessary files inside!"
},
status_code=400,
)
elif success == 0:
return JSONResponse(
content={
"message": "Error while deleting domain. Please report this to us, using feedback on the bottom left."
},
status_code=400,
)
db.conn.commit()
return JSONResponse(
content={"message": "success"},
status_code=200,
)
except Exception as e:
logger.error(f"Error while deleting domain: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/db/insert_feedback")
async def insert_feedback(
userID: str = Query(...),
feedback_type: str = Form(...),
feedback_description: str = Form(...),
feedback_screenshot: UploadFile = File(None),
):
try:
feedback_id = str(uuid.uuid4())
screenshot_data = None
if feedback_screenshot:
contents = await feedback_screenshot.read()
if len(contents) > 2 * 1024 * 1024: # 2MB limit
raise HTTPException(
status_code=400, detail="Screenshot size should be less than 2MB"
)
screenshot_data = base64.b64encode(contents).decode("utf-8")
with Database() as db:
db.insert_user_feedback(
feedback_id=feedback_id,
user_id=userID,
feedback_type=feedback_type,
description=feedback_description[:5000],
screenshot=screenshot_data,
)
db.conn.commit()
return JSONResponse(
content={"message": "Thanks for the feedback!"}, status_code=200
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/db/insert_rating")
async def insert_rating(
userID: str = Query(...),
rating: int = Form(...),
user_note: str = Form(""),
):
try:
rating_id = str(uuid.uuid4())
with Database() as db:
db.insert_user_rating(
rating_id=rating_id,
user_id=userID,
rating=rating,
user_note=user_note if user_note else None,
)
db.conn.commit()
return JSONResponse(
content={"message": "Thank you for the rating!"}, status_code=200
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/qa/select_domain")
async def select_domain(
request: Request,
userID: str = Query(...),
):
try:
data = await request.json()
selected_domain_id = data.get("domain_id")
_, _, success = update_selected_domain(
user_id=userID, domain_id=selected_domain_id
)
if not success:
return JSONResponse(
content={"message": "error while updating selected domain"},
status_code=400,
)
redis_manager.refresh_user_ttl(userID)
return JSONResponse(
content={"message": "success"},
status_code=200,
)
except RedisConnectionError as e:
logger.error(f"Redis connection error: {str(e)}")
raise HTTPException(status_code=503, detail="Service temporarily unavailable")
except Exception as e:
logger.error(f"Error in select_domain: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/qa/generate_answer")
async def generate_answer(
request: Request,
userID: str = Query(...),
sessionID: str = Query(...),
):
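# Main QA flow: validate the selection, enforce per-user daily limits, filter the
# cached domain content down to the chosen files, then run the vector search and
# answer generation.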
try:
data = await request.json()
user_message = data.get("user_message")
file_ids = data.get("file_ids")
# Check if domain is selected
selected_domain_id = redis_manager.get_data(f"user:{userID}:selected_domain")
if not selected_domain_id:
return JSONResponse(
content={"message": "Please select a domain first..."},
status_code=400,
)
if not file_ids:
return JSONResponse(
content={"message": "You didn't select any files..."},
status_code=400,
)
with Database() as db:
update_result = db.upsert_session_info(user_id=userID, session_id=sessionID)
if not update_result["success"]:
return JSONResponse(
content={"message": update_result["message"]},
status_code=400,
)
# Get required data from Redis
index, filtered_content, boost_info, index_header = processor.filter_search(
domain_content=redis_manager.get_data(f"user:{userID}:domain_content"),
domain_embeddings=redis_manager.get_data(
f"user:{userID}:domain_embeddings"
),
file_ids=file_ids,
)
if not index or not filtered_content:
return JSONResponse(
content={"message": "Nothing in here..."},
status_code=400,
)
# Process search
answer, resources, resource_sentences = processor.search_index(
user_query=user_message,
domain_content=filtered_content,
boost_info=boost_info,
index=index,
index_header=index_header,
)
if not resources or not resource_sentences:
return JSONResponse(
content={
"message": answer,
"daily_count": update_result["daily_count"],
},
status_code=200,
)
redis_manager.refresh_user_ttl(userID)
return JSONResponse(
content={
"answer": answer,
"resources": resources,
"resource_sentences": resource_sentences,
"question_count": update_result["question_count"],
"daily_count": update_result["daily_count"],
},
status_code=200,
)
except RedisConnectionError as e:
logger.error(f"Redis connection error: {str(e)}")
raise HTTPException(status_code=503, detail="Service temporarily unavailable")
except Exception as e:
logger.error(f"Error in generate_answer: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
@router.post("/io/store_file")
async def store_file(
userID: str = Query(...),
file: UploadFile = File(...),
lastModified: str = Form(...),
):
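# Parse the upload, embed its sentences, and stage everything in Redis for one
# hour; nothing is written to Postgres until /io/upload_files commits the batch.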
try:
file_bytes = await file.read()
if not file_bytes:
return JSONResponse(
content={
"message": f"Empty file {file.filename}. If you think not, please report this to us!"
},
status_code=400,
)
file_data = processor.rf.read_file(
file_bytes=file_bytes, file_name=file.filename
)
if not file_data["sentences"]:
return JSONResponse(
content={
"message": f"No content to extract in {file.filename}. If there is please report this to us!"
},
status_code=400,
)
# Create embeddings
file_embeddings = processor.ef.create_embeddings_from_sentences(
sentences=file_data["sentences"]
)
# Store in Redis
redis_key = f"user:{userID}:upload:{file.filename}"
upload_data = {
"file_name": file.filename,
"last_modified": datetime.fromtimestamp(int(lastModified) / 1000).strftime(
"%Y-%m-%d"
)[:20],
"sentences": file_data["sentences"],
"page_numbers": file_data["page_number"],
"is_headers": file_data["is_header"],
"is_tables": file_data["is_table"],
"embeddings": file_embeddings,
}
redis_manager.set_data(redis_key, upload_data, expiry=3600)
return JSONResponse(
content={"message": "success", "file_name": file.filename},
status_code=200,
)
except Exception as e:
logging.error(f"Error storing file {file.filename}: {str(e)}")
return JSONResponse(
content={"message": f"Error storing file: {str(e)}"}, status_code=500
)
@router.post("/io/store_drive_file")
async def store_drive_file(
userID: str = Query(...),
lastModified: str = Form(...),
driveFileId: str = Form(...),
driveFileName: str = Form(...),
accessToken: str = Form(...),
):
try:
credentials = Credentials(
token=accessToken,
client_id=GOOGLE_CLIENT_ID,
client_secret=GOOGLE_CLIENT_SECRET,
token_uri="https://oauth2.googleapis.com/token",
)
drive_service = build("drive", "v3", credentials=credentials)
google_mime_types = {
"application/vnd.google-apps.document": ("application/pdf", ".pdf"),
"application/vnd.google-apps.spreadsheet": (
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
".xlsx",
),
"application/vnd.google-apps.presentation": (
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
".pptx",
),
"application/vnd.google-apps.script": ("text/plain", ".txt"),
}
file_metadata = (
drive_service.files().get(fileId=driveFileId, fields="mimeType").execute()
)
mime_type = file_metadata["mimeType"]
if mime_type in google_mime_types:
export_mime_type, extension = google_mime_types[mime_type]
request = drive_service.files().export_media(
fileId=driveFileId, mimeType=export_mime_type
)
if not driveFileName.endswith(extension):
driveFileName += extension
else:
request = drive_service.files().get_media(fileId=driveFileId)
file_stream = io.BytesIO()
downloader = MediaIoBaseDownload(file_stream, request)
done = False
while not done:
_, done = downloader.next_chunk()
file_stream.seek(0)
file_bytes = file_stream.read()
if not file_bytes:
return JSONResponse(
content={
"message": f"Empty file {driveFileName}. If you think not, please report this to us!"
},
status_code=400,
)
file_data = processor.rf.read_file(
file_bytes=file_bytes, file_name=driveFileName
)
if not file_data["sentences"]:
return JSONResponse(
content={
"message": f"No content to extract in {driveFileName}. If there is please report this to us!"
},
status_code=400,
)
file_embeddings = processor.ef.create_embeddings_from_sentences(
sentences=file_data["sentences"]
)
redis_key = f"user:{userID}:upload:{driveFileName}"
upload_data = {
"file_name": driveFileName,
"last_modified": datetime.fromtimestamp(int(lastModified) / 1000).strftime(
"%Y-%m-%d"
)[:20],
"sentences": file_data["sentences"],
"page_numbers": file_data["page_number"],
"is_headers": file_data["is_header"],
"is_tables": file_data["is_table"],
"embeddings": file_embeddings,
}
redis_manager.set_data(redis_key, upload_data, expiry=3600)
return JSONResponse(
content={"message": "success", "file_name": driveFileName}, status_code=200
)
except Exception as e:
logging.error(f"Error storing Drive file {driveFileName}: {str(e)}")
return JSONResponse(
content={"message": f"Error storing file: {str(e)}"}, status_code=500
)
@router.post("/io/store_url")
async def store_url(userID: str = Query(...), url: str = Form(...)):
try:
if not processor.ws.url_validator(url):
return JSONResponse(
content={"message": "Invalid URL. Please enter a valid URL."},
status_code=400,
)
html = processor.ws.request_creator(url)
if not html:
return JSONResponse(
content={"message": "Error fetching the URL. Please try again later."},
status_code=400,
)
file_data = processor.rf.read_url(html_content=html)
if not file_data["sentences"]:
return JSONResponse(
content={
"message": f"No content to extract in {url}. If there is please report this to us!"
},
status_code=400,
)
file_embeddings = processor.ef.create_embeddings_from_sentences(
sentences=file_data["sentences"]
)
redis_key = f"user:{userID}:upload:{url}"
upload_data = {
"file_name": url,
"last_modified": datetime.now().strftime("%Y-%m-%d"),
"sentences": file_data["sentences"],
"page_numbers": file_data["page_number"],
"is_headers": file_data["is_header"],
"is_tables": file_data["is_table"],
"embeddings": file_embeddings,
}
redis_manager.set_data(redis_key, upload_data, expiry=3600)
return JSONResponse(
content={"message": "success", "file_name": url}, status_code=200
)
except Exception as e:
logging.error(f"Error storing URL {url}: {str(e)}")
return JSONResponse(
content={"message": f"Error storing URL: {str(e)}"}, status_code=500
)
@router.post("/io/upload_files")
async def upload_files(userID: str = Query(...)):
try:
# Get domain info
selected_domain_id = redis_manager.get_data(f"user:{userID}:selected_domain")
with Database() as db:
domain_info = db.get_domain_info(
user_id=userID, domain_id=selected_domain_id
)
if not domain_info:
return JSONResponse(
content={"message": "Invalid domain selected"}, status_code=400
)
# Get all stored files from Redis
stored_files = redis_manager.get_keys_by_pattern(f"user:{userID}:upload:*")
if not stored_files:
return JSONResponse(
content={"message": "No files to process"}, status_code=400
)
file_info_batch = []
file_content_batch = []
# Process stored files
for redis_key in stored_files:
upload_data = redis_manager.get_data(redis_key)
if not upload_data:
continue
file_id = str(uuid.uuid4())
# Prepare batches
file_info_batch.append(
(
userID,
file_id,
selected_domain_id,
upload_data["file_name"],
upload_data["last_modified"],
)
)
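# Each sentence is encrypted with the new file_id as AES-GCM associated data, so a
# content row can only be decrypted together with its owning file_id.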
for i in range(len(upload_data["sentences"])):
file_content_batch.append(
(
file_id,
encryptor.encrypt(
text=upload_data["sentences"][i], auth_data=file_id
),
upload_data["page_numbers"][i],
upload_data["is_headers"][i],
upload_data["is_tables"][i],
psycopg2.Binary(upload_data["embeddings"][i]),
)
)
# Clean up Redis
redis_manager.delete_data(redis_key)
# Bulk insert with limit check
result = db.insert_file_batches(file_info_batch, file_content_batch)
if not result["success"]:
return JSONResponse(
content={"message": result["message"]}, status_code=400
)
db.conn.commit()
# Update domain info
file_names, file_ids, success = update_selected_domain(
user_id=userID, domain_id=selected_domain_id
)
if not success:
return JSONResponse(
content={
"message": "Files uploaded but, domain could not be updated",
"file_names": None,
"file_ids": None,
},
status_code=400,
)
return JSONResponse(
content={
"message": "success",
"file_names": file_names,
"file_ids": file_ids,
},
status_code=200,
)
except Exception as e:
logging.error(f"Error processing uploads: {str(e)}")
return JSONResponse(
content={"message": f"Error processing uploads: {str(e)}"}, status_code=500
)
@router.post("/db/remove_file_upload")
async def remove_file_upload(
request: Request,
userID: str = Query(...),
):
try:
data = await request.json()
file_id = data.get("file_id")
domain_id = data.get("domain_id")
with Database() as db:
success = db.clear_file_content(file_id=file_id)
if not success:
return JSONResponse(
content={
"message": "Error deleting files",
},
status_code=400,
)
db.conn.commit()
_, _, success = update_selected_domain(user_id=userID, domain_id=domain_id)
if not success:
return JSONResponse(
content={"message": "error"},
status_code=200,
)
return JSONResponse(
content={
"message": "success",
},
status_code=200,
)
except KeyError:
return JSONResponse(
content={"message": "Please select the domain number first"},
status_code=200,
)
except Exception as e:
db.conn.rollback()
logging.error(f"Error during file deletion: {str(e)}")
raise HTTPException(
status_code=500, detail=f"Failed deleting, error: {e}"
)
@router.post("/io/export_response")
async def export_response(request: Request):
try:
data = await request.json()
text = data.get("contents", [])
if not text:
raise ValueError("No content selected for export")
formatted_text = "\n\n------------------\n\n".join(text)
response = processor.ex.export_pdf(data=formatted_text)
return StreamingResponse(
io.BytesIO(response.getvalue()),
media_type="application/pdf",
headers={
"Content-Disposition": "attachment; filename=DoclinkExport.pdf",
"Content-Type": "application/pdf",
"Content-Length": str(len(response.getvalue())),
},
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=f"PDF generation failed Error: {e}")
@router.post("/auth/logout")
async def logout(request: Request):
try:
data = await request.json()
user_id = data.get("user_id")
session_id = data.get("session_id")
response = JSONResponse(content={"message": "Logged out successfully"})
# Clear FastAPI session cookie
response.delete_cookie(
key="session_id",
path="/",
domain=None, # This will use the current domain
secure=True,
httponly=True,
samesite="lax",
)
# Delete user redis session
redis_key = f"user:{user_id}:session:{session_id}"
session_exists = redis_manager.client.exists(redis_key)
if session_exists:
redis_manager.client.delete(redis_key)
return response
except Exception as e:
logging.error(f"Error during logout: {str(e)}")
raise HTTPException(
status_code=500, detail=f"Failed logout, error: {e}"
)
@router.post("/webhooks/lemon-squeezy")
async def handle_webhook(request: Request):
try:
# Get the raw request body
body = await request.body()
payload = await request.json()
# Get the signature from the header
signature = request.headers.get("X-Signature")
# Signature verification
webhook_secret = os.getenv("LEMON_SQUEEZY_WEBHOOK_SECRET")
expected_signature = hmac.new(
webhook_secret.encode(), body, hashlib.sha256
).hexdigest()
if not hmac.compare_digest(signature, expected_signature):
raise HTTPException(status_code=401, detail="Invalid signature")
event_name = payload.get("meta", {}).get("event_name")
if event_name != "order_created":
return JSONResponse(
status_code=400, content={"message": "Unexpected webhook event"}
)
# Upgrade user to the premium limits
data = payload.get("data", {}).get("attributes", {})
customer_id = data.get("customer_id")
customer_email = data.get("user_email")
receipt_url = data.get("urls").get("receipt")
with Database() as db:
db.update_user_subscription(
user_email=customer_email,
lemon_squeezy_customer_id=customer_id,
receipt_url=receipt_url,
)
db.conn.commit()
return JSONResponse(status_code=200, content={"message": "Webhook received"})
except Exception as e:
logger.error(f"Webhook error: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
# local functions
def update_selected_domain(user_id: str, domain_id: str):
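# Caches the selected domain's content and embeddings in Redis and returns
# (file_names, file_ids, status): status 1 means success (possibly with no files),
# 0 means the file content could not be loaded.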
try:
redis_manager.set_data(f"user:{user_id}:selected_domain", domain_id)
with Database() as db:
file_info = db.get_file_info_with_domain(user_id, domain_id)
if not file_info:
# Clear any existing domain data
redis_manager.delete_data(f"user:{user_id}:domain_content")
redis_manager.delete_data(f"user:{user_id}:index")
redis_manager.delete_data(f"user:{user_id}:index_header")
redis_manager.delete_data(f"user:{user_id}:boost_info")
return None, None, 1
content, embeddings = db.get_file_content(
file_ids=[info["file_id"] for info in file_info]
)
if not content or not len(embeddings):
# Clear any existing domain data
redis_manager.delete_data(f"user:{user_id}:domain_content")
redis_manager.delete_data(f"user:{user_id}:index")
redis_manager.delete_data(f"user:{user_id}:index_header")
redis_manager.delete_data(f"user:{user_id}:boost_info")
return None, None, 0
# Store domain content in Redis
redis_manager.set_data(f"user:{user_id}:domain_content", content)
redis_manager.set_data(f"user:{user_id}:domain_embeddings", embeddings)
file_names = [info["file_name"] for info in file_info]
file_ids = [info["file_id"] for info in file_info]
return file_names, file_ids, 1
except Exception as e:
logger.error(f"Error in update_selected_domain: {str(e)}")
raise RedisConnectionError(f"Failed to update domain: {str(e)}")


18
doclink/app/db/config.py Normal file

@@ -0,0 +1,18 @@
from configparser import ConfigParser
class GenerateConfig:
def __init__(self) -> None:
pass
@staticmethod
def config(filename="app/db/database.ini", section="postgresql"):
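# Reads the given section of database.ini into a dict of psycopg2 connection
# kwargs (e.g. host, dbname, user, password); raises if the section is missing.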
parser = ConfigParser()
parser.read(filename)
db_config = {}
if parser.has_section(section):
params = parser.items(section)
for param in params:
db_config[param[0]] = param[1]
else:
raise Exception(f"Section {section} is not found in {filename} file.")
return db_config

740
doclink/app/db/database.py Normal file

@@ -0,0 +1,740 @@
from psycopg2 import extras
from psycopg2 import DatabaseError
from pathlib import Path
import psycopg2
import logging
import numpy as np
import uuid
from datetime import datetime
from .config import GenerateConfig
from ..api.core import Encryptor
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
encryptor = Encryptor()
class Database:
_instance = None
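# Singleton wrapper around a psycopg2 connection, used as a context manager:
# __exit__ commits on success and rolls back if an exception escaped the block.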
def __new__(cls):
if cls._instance is None:
cls._instance = super(Database, cls).__new__(cls)
cls._instance.db_config = GenerateConfig.config()
return cls._instance
def __enter__(self):
self.conn = psycopg2.connect(**self.db_config)
self.cursor = self.conn.cursor()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if self.cursor:
self.cursor.close()
if self.conn:
if exc_type is None:
self.conn.commit()
else:
self.conn.rollback()
self.conn.close()
def initialize_tables(self):
sql_path = Path(__file__).resolve().parent / "sql" / "table_initialize.sql"
with sql_path.open("r") as file:
query = file.read()
try:
self.cursor.execute(query)
self.conn.commit()
except DatabaseError as e:
self.conn.rollback()
raise e
def reset_database(self):
sql_path = Path(__file__).resolve().parent / "sql" / "database_reset.sql"
with sql_path.open("r") as file:
query = file.read()
try:
self.cursor.execute(query)
self.conn.commit()
except DatabaseError as e:
self.conn.rollback()
raise e
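# Embeddings are stored as raw BYTEA; concatenate the returned byte rows and
# reinterpret them as an (n_sentences, dim) float16 matrix.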
def _bytes_to_embeddings(self, byte_array):
return np.frombuffer(byte_array.tobytes(), dtype=np.float16).reshape(
byte_array.shape[0], -1
)
def get_user_info_w_id(self, user_id: str):
query_get_user_info = """
SELECT DISTINCT user_name, user_surname, user_email, user_type, user_created_at, picture_url
FROM user_info
WHERE user_id = %s
"""
query_get_domain_ids = """
SELECT DISTINCT domain_id
FROM domain_info
WHERE user_id = %s
"""
query_get_domain_info = """
SELECT t1.domain_name, t1.domain_id, t2.file_name, t2.file_id
FROM domain_info t1
LEFT JOIN file_info t2 ON t1.domain_id = t2.domain_id
WHERE t1.domain_id IN %s
"""
query_get_daily_count = """
SELECT sum(question_count)
FROM session_info s
WHERE s.user_id = %s
AND s.created_at >= CURRENT_TIMESTAMP - INTERVAL '24 hours' AND s.created_at <= CURRENT_TIMESTAMP;
"""
try:
self.cursor.execute(query_get_user_info, (user_id,))
user_info_data = self.cursor.fetchone()
self.cursor.execute(query_get_daily_count, (user_id,))
user_daily_count = self.cursor.fetchone()
if not user_info_data:
return None, None
user_info = {
"user_name": user_info_data[0],
"user_surname": user_info_data[1],
"user_email": user_info_data[2],
"user_type": user_info_data[3],
"user_created_at": str(user_info_data[4]),
"user_daily_count": user_daily_count[0] if user_daily_count[0] else 0,
"user_picture_url": user_info_data[5],
}
self.cursor.execute(query_get_domain_ids, (user_id,))
domain_id_data = self.cursor.fetchall()
if not domain_id_data:
return user_info, None
domain_ids = [data[0] for data in domain_id_data]
self.cursor.execute(query_get_domain_info, (tuple(domain_ids),))
domain_info_data = self.cursor.fetchall()
domain_info = {}
for data in domain_info_data:
if data[1] not in domain_info.keys():
domain_info[data[1]] = {
"domain_name": data[0],
"file_names": [data[2]] if data[2] else [],
"file_ids": [data[3]] if data[3] else [],
}
else:
domain_info[data[1]]["file_names"].append(data[2])
domain_info[data[1]]["file_ids"].append(data[3])
return user_info, domain_info
except DatabaseError as e:
self.conn.rollback()
raise e
def get_file_info_with_domain(self, user_id: str, domain_id: str):
query_get_file_info = """
SELECT DISTINCT file_id, file_name, file_modified_date, file_upload_date
FROM file_info
WHERE user_id = %s AND domain_id = %s
"""
try:
self.cursor.execute(
query_get_file_info,
(
user_id,
domain_id,
),
)
data = self.cursor.fetchall()
return (
[
{
"file_id": row[0],
"file_name": row[1],
"file_modified_date": row[2],
"file_upload_date": row[3],
}
for row in data
]
if data
else None
)
except DatabaseError as e:
self.conn.rollback()
raise e
def get_domain_info(self, user_id: str, domain_id: int):
query = """
SELECT DISTINCT domain_name
FROM domain_info
WHERE user_id = %s AND domain_id = %s
"""
try:
self.cursor.execute(
query,
(
user_id,
domain_id,
),
)
data = self.cursor.fetchone()
return {"domain_name": data[0]} if data else None
except DatabaseError as e:
self.conn.rollback()
raise e
def get_file_content(self, file_ids: list):
query_get_content = """
SELECT t1.sentence AS sentence, t1.is_header AS is_header, t1.is_table AS is_table, t1.page_number AS page_number, t1.file_id AS file_id, t2.file_name AS file_name
FROM file_content t1
LEFT JOIN file_info t2 ON t1.file_id = t2.file_id
WHERE t1.file_id IN %s
"""
query_get_embeddings = """
SELECT array_agg(embedding) AS embeddings
FROM file_content
WHERE file_id IN %s
"""
try:
self.cursor.execute(query_get_content, (tuple(file_ids),))
content = self.cursor.fetchall()
self.cursor.execute(query_get_embeddings, (tuple(file_ids),))
byte_embeddings = self.cursor.fetchone()
if content and byte_embeddings and byte_embeddings[0]:
embeddings = self._bytes_to_embeddings(np.array(byte_embeddings[0]))
return content, embeddings
else:
return None, None
except DatabaseError as e:
self.conn.rollback()
print(f"Database error occurred: {e}")
return None, None
except Exception as e:
print(f"An unexpected error occurred: {e}")
return None, None
def get_session_info(self, session_id: str):
query_get_session = """
SELECT user_id, created_at
FROM session_info
WHERE session_id = %s
"""
self.cursor.execute(query_get_session, (session_id,))
data = self.cursor.fetchone()
return {"user_id": data[0], "created_at": data[1]} if data else None
def rename_domain(self, domain_id: str, new_name: str):
query = """
UPDATE domain_info
SET domain_name = %s
WHERE domain_id = %s
"""
try:
self.cursor.execute(query, (new_name, domain_id))
rows_affected = self.cursor.rowcount
return rows_affected > 0
except DatabaseError as e:
self.conn.rollback()
raise e
def insert_user_guide(self, user_id: str, domain_id: str):
"""
Insert the default user-guide content into the user's default domain,
copying the rows in default_content under a newly generated file_id.
"""
current_date = datetime.now().date()
file_id = str(uuid.uuid4())
try:
# Insert file info with the new file_id
query_insert_file_info = """
INSERT INTO file_info
(user_id, domain_id, file_id, file_name, file_modified_date, file_upload_date)
VALUES
(%s, %s, %s, %s, %s, %s)
"""
self.cursor.execute(
query_insert_file_info,
(
user_id,
domain_id,
file_id,
"User Guide.pdf",
current_date,
current_date,
),
)
query_get_guide_content = """
SELECT sentence, is_header, is_table, page_number, embedding
FROM default_content
"""
self.cursor.execute(query_get_guide_content)
default_content = self.cursor.fetchall()
for row in default_content:
sentence, is_header, is_table, page_number, embedding = row
encrypted_sentence = encryptor.encrypt(sentence, file_id)
self.cursor.execute(
"INSERT INTO file_content (file_id, sentence, is_header, is_table, page_number, embedding) VALUES (%s, %s, %s, %s, %s, %s)",
(
file_id,
encrypted_sentence,
is_header,
is_table,
page_number,
embedding,
),
)
return True
except DatabaseError as e:
self.conn.rollback()
logger.error(f"Error inserting user guide: {str(e)}")
raise e
except Exception as e:
self.conn.rollback()
logger.error(f"Unexpected error inserting user guide: {str(e)}")
raise e
def delete_domain(self, domain_id: str):
get_domain_type_query = """
SELECT domain_type
FROM domain_info
WHERE domain_id = %s
"""
get_files_query = """
SELECT file_id
FROM file_info
WHERE domain_id = %s
"""
delete_content_query = """
DELETE FROM file_content
WHERE file_id IN %s
"""
delete_files_query = """
DELETE FROM file_info
WHERE domain_id = %s
"""
delete_domain_query = """
DELETE FROM domain_info
WHERE domain_id = %s
"""
try:
self.cursor.execute(get_domain_type_query, (domain_id,))
domain_type = self.cursor.fetchone()
if not domain_type[0]:
return -1
self.cursor.execute(get_files_query, (domain_id,))
file_data = self.cursor.fetchall()
file_ids = [data[0] for data in file_data]
# content -> files -> domain
if file_ids:
self.cursor.execute(delete_content_query, (tuple(file_ids),))
self.cursor.execute(delete_files_query, (domain_id,))
self.cursor.execute(delete_domain_query, (domain_id,))
rows_affected = self.cursor.rowcount
return 1 if rows_affected else 0
except DatabaseError as e:
# Rollback in case of error
self.cursor.execute("ROLLBACK")
logger.error(f"Error deleting domain {domain_id}: {str(e)}")
raise e
def insert_user_feedback(
self,
feedback_id: str,
user_id: str,
feedback_type: str,
description: str,
screenshot: str = None,
):
query = """
INSERT INTO user_feedback (feedback_id, user_id, feedback_type, description, screenshot)
VALUES (%s, %s, %s, %s, %s)
"""
try:
self.cursor.execute(
query,
(
feedback_id,
user_id,
feedback_type,
description,
screenshot,
),
)
except DatabaseError as e:
self.conn.rollback()
raise e
def insert_domain_info(
self, user_id: str, domain_id: str, domain_name: str, domain_type: int
):
query_insert_domain_info = """
INSERT INTO domain_info (user_id, domain_id, domain_name, domain_type)
VALUES (%s, %s, %s, %s)
"""
try:
self.cursor.execute(
query_insert_domain_info,
(
user_id,
domain_id,
domain_name,
domain_type,
),
)
except DatabaseError as e:
self.conn.rollback()
raise e
def create_domain(
self, user_id: str, domain_name: str, domain_id: str, domain_type: int
):
query_count_domains = """
SELECT COUNT(*), user_type
FROM domain_info d
JOIN user_info u ON d.user_id = u.user_id
WHERE u.user_id = %s
GROUP BY user_type
"""
try:
self.cursor.execute(query_count_domains, (user_id,))
result = self.cursor.fetchall()
domain_count, user_type = result[0][0], result[0][1]
if user_type == "free" and domain_count >= 3:
return {
"success": False,
"message": "Free users can only create up to 3 domains. Upgrade account to create more domains!",
}
elif user_type == "premium" and domain_count >= 10:
return {
"success": False,
"message": "Premium users can only create up to 20 domains. Upgrade account to create more domains!",
}
query_insert = """
INSERT INTO domain_info (user_id, domain_id, domain_name, domain_type)
VALUES (%s, %s, %s, %s)
RETURNING domain_id
"""
self.cursor.execute(
query_insert, (user_id, domain_id, domain_name, domain_type)
)
created_domain_id = self.cursor.fetchone()[0]
return {
"success": True,
"domain_id": created_domain_id,
"message": "success",
}
except DatabaseError as e:
self.conn.rollback()
raise e
def get_user_total_file_count(self, user_id: str):
user_type_query = """
SELECT user_type
FROM user_info
WHERE user_id = %s
"""
file_count_query = """
SELECT COUNT(file_id)
FROM file_info
WHERE user_id = %s
"""
try:
# Get user type first
self.cursor.execute(user_type_query, (user_id,))
user_type_result = self.cursor.fetchone()
if not user_type_result:
logger.error(f"User {user_id} not found in database")
return False
user_type = user_type_result[0]
# Get file count
self.cursor.execute(file_count_query, (user_id,))
file_count_result = self.cursor.fetchone()
file_count = file_count_result[0] if file_count_result else 0
return file_count, user_type
except Exception as e:
self.conn.rollback()
logger.error(f"Error in user total file processing: {str(e)}")
return False
def insert_file_batches(
self, file_info_batch: list, file_content_batch: list
) -> dict:
"""Process both file info and content in a single transaction."""
try:
user_id = file_info_batch[0][0]
file_count, user_type = self.get_user_total_file_count(user_id)
if user_type == "free" and file_count + len(file_info_batch) > 10:
return {
"success": False,
"message": f"Free users can only have 10 total files. You currently have {file_count} files across all folders. Upgrade to add more!",
}
elif user_type == "premium" and file_count + len(file_info_batch) > 100:
return {
"success": False,
"message": f"Premium users can only have 100 total files. You currently have {file_count} files across all folders",
}
self._insert_file_info_batch(file_info_batch)
self._insert_file_content_batch(file_content_batch)
return {"success": True, "message": "Files uploaded successfully"}
except Exception as e:
self.conn.rollback()
logger.error(f"Error in batch processing: {str(e)}")
return {"success": False, "message": f"Error in batch processing: {str(e)}"}
def _insert_file_info_batch(self, file_info_batch: list):
"""Internal method for file info insertion."""
query = """
INSERT INTO file_info (user_id, file_id, domain_id, file_name, file_modified_date)
VALUES %s
"""
try:
extras.execute_values(self.cursor, query, file_info_batch)
logger.info(
f"Successfully inserted {len(file_info_batch)} file info records"
)
except Exception as e:
logger.error(f"Error while inserting file info: {str(e)}")
raise
def _insert_file_content_batch(self, file_content_batch: list):
"""Internal method for file content insertion."""
query = """
INSERT INTO file_content (file_id, sentence, page_number, is_header, is_table, embedding)
VALUES %s
"""
try:
extras.execute_values(self.cursor, query, file_content_batch)
logger.info(
f"Successfully inserted {len(file_content_batch)} content rows "
)
except Exception as e:
logger.error(f"Error while inserting file content: {str(e)}")
raise
def upsert_session_info(self, user_id: str, session_id: str):
# First check if the session exists
check_session_query = """
SELECT id FROM session_info
WHERE user_id = %s AND session_id = %s
"""
# Query to get daily question count and user type
query_get_daily_count = """
SELECT sum(question_count), u.user_type
FROM session_info s
JOIN user_info u ON s.user_id = u.user_id
WHERE s.user_id = %s
AND s.created_at >= CURRENT_TIMESTAMP - INTERVAL '24 hours' AND s.created_at <= CURRENT_TIMESTAMP
GROUP BY u.user_type;
"""
# Query to insert new session
insert_session_query = """
INSERT INTO session_info
(user_id, session_id, question_count, total_enterance, last_enterance)
VALUES (%s, %s, 0, 1, CURRENT_TIMESTAMP)
RETURNING id
"""
# Query to update existing session
update_question_query = """
UPDATE session_info
SET question_count = question_count + 1,
last_enterance = CURRENT_TIMESTAMP
WHERE user_id = %s AND session_id = %s
RETURNING question_count
"""
try:
# Check if session exists
self.cursor.execute(check_session_query, (user_id, session_id))
session_exists = self.cursor.fetchone()
# If session doesn't exist, create it
if not session_exists:
self.cursor.execute(insert_session_query, (user_id, session_id))
self.conn.commit()
# Get daily count and user type
self.cursor.execute(query_get_daily_count, (user_id,))
result = self.cursor.fetchall()
daily_count, user_type = result[0][0], result[0][1]
# Check free user limits
if user_type == "free" and daily_count >= 25:
return {
"success": False,
"message": "Daily question limit reached for free user. Please try again tomorrow or upgrade your plan!",
"question_count": daily_count,
}
# Increment question count
self.cursor.execute(update_question_query, (user_id, session_id))
question_count = self.cursor.fetchone()[0]
self.conn.commit()
return {
"success": True,
"message": "success",
"question_count": question_count,
"daily_count": daily_count,
}
except Exception as e:
self.conn.rollback()
print(f"Error updating session info: {str(e)}")
raise e
def insert_user_rating(
self, rating_id: str, user_id: str, rating: int, user_note: str
):
query = """
INSERT INTO user_rating (rating_id, user_id, rating, user_note)
VALUES (%s, %s, %s, %s)
"""
try:
self.cursor.execute(query, (rating_id, user_id, rating, user_note))
except Exception as e:
self.conn.rollback()
raise e
def clear_file_info(self, user_id: str, file_ids: list):
query = """
DELETE FROM file_info
WHERE user_id = %s AND file_id IN %s
"""
try:
self.cursor.execute(
query,
(
user_id,
tuple(
file_ids,
),
),
)
return 1
except DatabaseError as e:
self.conn.rollback()
raise e
def clear_file_content(self, file_id: list):
clear_content_query = """
DELETE FROM file_content
WHERE file_id = %s
"""
clear_file_info_query = """
DELETE FROM file_info
WHERE file_id = %s
"""
try:
self.cursor.execute(
clear_content_query,
(file_id,),
)
self.cursor.execute(
clear_file_info_query,
(file_id,),
)
rows_affected = self.cursor.rowcount
return 1 if rows_affected else 0
except DatabaseError as e:
self.conn.rollback()
raise e
def update_user_subscription(
self,
user_email: str,
lemon_squeezy_customer_id: str,
receipt_url: str,
):
try:
query_get_user = """
SELECT user_id FROM user_info
WHERE user_email = %s
LIMIT 1
"""
self.cursor.execute(query_get_user, (user_email,))
result = self.cursor.fetchone()
if result:
# Insert user into the premium table
user_id = result[0]
query_insert_premium_user = """
INSERT INTO premium_user_info (lemon_squeezy_customer_id, user_id, receipt_url)
VALUES (%s, %s, %s)
"""
self.cursor.execute(
query_insert_premium_user,
(lemon_squeezy_customer_id, user_id, receipt_url),
)
# Update user info within the user_info table
query_update_user_info = """
UPDATE user_info
SET user_type = %s
WHERE user_id = %s
RETURNING user_id
"""
self.cursor.execute(query_update_user_info, ("premium", user_id))
return
else:
# This is for handling webhooks before we've updated the user record
logger.warning(
f"Received webhook for unknown customer: {lemon_squeezy_customer_id}"
)
return False
except Exception as e:
logger.error(f"Error updating subscription: {str(e)}")
self.conn.rollback() # Added rollback to prevent transaction errors
return False
if __name__ == "__main__":
with Database() as db:
db.reset_database()
db.initialize_tables()


@@ -0,0 +1,38 @@
-- drop_all_tables.sql
-- Disable foreign key checks to avoid dependency issues
SET session_replication_role = 'replica';
-- Drop all tables in the public schema
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(r.tablename) || ' CASCADE';
END LOOP;
END $$;
-- Re-enable foreign key checks
SET session_replication_role = 'origin';
-- Optionally, you can also drop sequences if you have any
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN (SELECT sequencename FROM pg_sequences WHERE schemaname = 'public') LOOP
EXECUTE 'DROP SEQUENCE IF EXISTS ' || quote_ident(r.sequencename) || ' CASCADE';
END LOOP;
END $$;
-- If you want to reset the primary key sequences for all tables, you can add this:
-- (Note: Only necessary if you've inserted data and want to reset auto-incrementing ids)
DO $$
DECLARE
r RECORD;
BEGIN
FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
EXECUTE 'ALTER TABLE ' || quote_ident(r.tablename) || ' ALTER COLUMN id RESTART WITH 1;';
END LOOP;
END $$;


@@ -0,0 +1,88 @@
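-- Core schema: users, premium subscriptions, feedback and ratings, domains
-- (folders), files, per-sentence file content with embeddings stored as BYTEA,
-- and question sessions.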
CREATE TABLE IF NOT EXISTS user_info (
user_id UUID PRIMARY KEY,
google_id VARCHAR(255) NOT NULL,
user_name VARCHAR(50) NOT NULL,
user_surname VARCHAR(50) NOT NULL,
user_email VARCHAR(100) UNIQUE NOT NULL,
user_type VARCHAR(20) DEFAULT 'free',
picture_url VARCHAR(255),
user_created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS premium_user_info (
lemon_squeezy_customer_id VARCHAR(255) NOT NULL,
user_id UUID PRIMARY KEY,
receipt_url VARCHAR,
payment_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
);
CREATE TABLE IF NOT EXISTS user_feedback (
feedback_id UUID PRIMARY KEY,
user_id UUID NOT NULL,
feedback_type VARCHAR(20) NOT NULL,
description TEXT NOT NULL,
screenshot TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
);
CREATE TABLE IF NOT EXISTS domain_info (
user_id UUID NOT NULL,
domain_id UUID PRIMARY KEY,
domain_name VARCHAR(30) NOT NULL,
domain_type INTEGER,
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
);
CREATE TABLE IF NOT EXISTS file_info (
user_id UUID NOT NULL,
domain_id UUID NOT NULL,
file_id UUID PRIMARY KEY,
file_name VARCHAR(255) NOT NULL,
file_modified_date DATE,
file_upload_date DATE DEFAULT CURRENT_DATE,
FOREIGN KEY (user_id) REFERENCES user_info(user_id),
FOREIGN KEY (domain_id) REFERENCES domain_info(domain_id)
);
CREATE TABLE IF NOT EXISTS file_content (
content_id SERIAL PRIMARY KEY,
file_id UUID NOT NULL,
sentence TEXT NOT NULL,
is_header BOOLEAN DEFAULT FALSE,
is_table BOOLEAN DEFAULT FALSE,
page_number INTEGER,
embedding BYTEA,
FOREIGN KEY (file_id) REFERENCES file_info(file_id)
);
CREATE TABLE IF NOT EXISTS session_info (
id SERIAL PRIMARY KEY,
user_id UUID NOT NULL,
session_id UUID NOT NULL,
question_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
total_enterance INTEGER DEFAULT 0,
last_enterance TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
);
CREATE TABLE IF NOT EXISTS default_content (
content_id SERIAL PRIMARY KEY,
file_id UUID NOT NULL,
sentence TEXT NOT NULL,
is_header BOOLEAN DEFAULT FALSE,
is_table BOOLEAN DEFAULT FALSE,
page_number INTEGER,
embedding BYTEA
);
CREATE TABLE IF NOT EXISTS user_rating (
rating_id UUID PRIMARY KEY,
user_id UUID NOT NULL,
rating INTEGER NOT NULL,
user_note TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES user_info(user_id)
);


@@ -0,0 +1,83 @@
from openai import OpenAI
from dotenv import load_dotenv
from langdetect import detect
import textwrap
import yaml
import re
from typing import Dict, Any, Match
class ChatbotFunctions:
def __init__(self):
load_dotenv()
self.client = OpenAI()
with open("app/utils/prompts.yaml", "r", encoding="utf-8") as file:
self.prompt_data = yaml.safe_load(file)
def _prompt_query_generation(self, query, file_lang):
return textwrap.dedent(
self.get_prompt(category="queries", query=query, file_lang=file_lang)
)
def _prompt_answer_generation(self, query, context, lang, intention):
return textwrap.dedent(
self.get_prompt(category=intention, query=query, context=context, lang=lang)
)
def response_generation(self, query, context, intention):
lang = self.detect_language(query=query)
prompt = self._prompt_answer_generation(
query=query, context=context, lang=lang, intention=intention
)
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": prompt},
{"role": "user", "content": query},
],
temperature=0,
)
answer = response.choices[0].message.content.strip()
return answer
def query_generation(self, query, file_lang):
lang = self.detect_language(query=query)
prompt = self._prompt_query_generation(query, file_lang=file_lang)
response = self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": prompt},
{"role": "user", "content": query},
],
temperature=0,
)
new_queries = response.choices[0].message.content.strip()
return new_queries, lang
def detect_language(self, query):
if query.isalpha():
lang = detect(text=query)
return "tr" if lang == "tr" else "en"
return None
def replace_variables(self, match: Match, kwargs: Dict[str, Any]):
variables = match.group(1) or match.group(2)
value = kwargs.get(variables)
return str(value)
def get_prompt(self, category, **kwargs):
variable_pattern = r"\${?(\w+)}?|\{(\w+)\}"
try:
prompt = self.prompt_data["prompts"]["languages"]["en"][category.strip()][
0
]["text"]
def replace_wrapper(match):
return self.replace_variables(match, kwargs)
full_prompt = re.sub(variable_pattern, replace_wrapper, prompt)
return full_prompt
except KeyError:
print(f"No template found for {category}")
return None
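get_prompt accepts both ${name} and {name} placeholders through the regex above. A self-contained sketch of the same substitution, useful for checking a template without loading prompts.yaml or an API key:

import re

def fill(template: str, **kwargs) -> str:
    # Same placeholder pattern as get_prompt: ${name} or {name}.
    pattern = r"\${?(\w+)}?|\{(\w+)\}"
    return re.sub(pattern, lambda m: str(kwargs.get(m.group(1) or m.group(2))), template)

print(fill("Answer {query} in {lang}", query="What is Doclink?", lang="en"))
# -> Answer What is Doclink? in en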

View File

@@ -0,0 +1,36 @@
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from typing import List
class EmbeddingFunctions:
def __init__(self):
load_dotenv()
self.client = OpenAI()
def create_embeddings_from_sentences(
self, sentences: List[str], chunk_size: int = 2000
) -> List[np.ndarray]:
file_embeddings = []
for chunk_index in range(0, len(sentences), chunk_size):
chunk_embeddings = self.client.embeddings.create(
model="text-embedding-3-small",
input=sentences[chunk_index : chunk_index + chunk_size],
)
chunk_array = np.array(
[x.embedding for x in chunk_embeddings.data], dtype=np.float16
)
file_embeddings.append(
chunk_array / np.linalg.norm(chunk_array, axis=1)[:, np.newaxis]
)
return np.vstack(file_embeddings)
def create_embedding_from_sentence(self, sentence: str) -> np.ndarray:
query_embedding = self.client.embeddings.create(
model="text-embedding-3-small", input=sentence
)
return np.array(query_embedding.data[0].embedding, dtype=np.float16).reshape(
1, -1
)
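A minimal usage sketch, assuming OPENAI_API_KEY is available via the environment or .env. Rows from the batch call are L2-normalized, so a plain inner product against them behaves like cosine similarity; text-embedding-3-small returns 1536-dimensional vectors.

ef = EmbeddingFunctions()
docs = ["Doclink indexes your documents.", "Ask questions about your files."]
doc_vecs = ef.create_embeddings_from_sentences(docs)               # (2, 1536) float16, rows L2-normalized
query_vec = ef.create_embedding_from_sentence("How do I search?")  # (1, 1536) float16
scores = doc_vecs @ query_vec.T                                    # higher score = closer match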

View File

@@ -0,0 +1,126 @@
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from io import BytesIO
import re
class Exporter:
def __init__(self):
self.styles = getSampleStyleSheet()
self.setup_styles()
def _register_fonts(self):
pdfmetrics.registerFont(TTFont("Helvetica", "Helvetica"))
pdfmetrics.registerFont(TTFont("Helvetica-Bold", "Helvetica-Bold"))
def setup_styles(self):
self.styles.add(
ParagraphStyle(
name="Header",
fontSize=14,
textColor=colors.HexColor("#10B981"),
spaceAfter=12,
fontName="Helvetica-Bold",
encoding="utf-8",
)
)
self.styles.add(
ParagraphStyle(
name="Content",
fontSize=11,
textColor=colors.black,
spaceAfter=8,
fontName="Helvetica",
encoding="utf-8",
)
)
self.styles.add(
ParagraphStyle(
name="Bullet-Point",
fontSize=11,
leftIndent=20,
bulletIndent=10,
spaceAfter=5,
fontName="Helvetica",
encoding="utf-8",
)
)
def clean_text(self, text: str) -> str:
if not isinstance(text, str):
text = text.decode("utf-8")
text = text.replace("ı", "i").replace("İ", "I")
text = text.replace("ğ", "g").replace("Ğ", "G")
text = text.replace("ü", "u").replace("Ü", "U")
text = text.replace("ş", "s").replace("Ş", "S")
text = text.replace("ö", "o").replace("Ö", "O")
text = text.replace("ç", "c").replace("Ç", "C")
text = re.sub(
r"\[header\](.*?)\[/header\]", r'<para style="Header">\1</para>', text
)
text = re.sub(r"\[bold\](.*?)\[/bold\]", r"<b>\1</b>", text)
return text
def create_watermark(self, canvas, doc):
canvas.saveState()
canvas.setFillColor(colors.HexColor("#10B981"))
canvas.setFont("Helvetica", 8)
canvas.drawString(30, 20, "Generated by doclink.io")
canvas.restoreState()
def export_pdf(self, data: str) -> BytesIO:
buffer = BytesIO()
content = []
cleaned_text = self.clean_text(data)
doc = SimpleDocTemplate(
buffer,
pagesize=A4,
rightMargin=30,
leftMargin=30,
topMargin=30,
bottomMargin=30,
)
lines = cleaned_text.split("\n")
for line in lines:
if line.strip():
if (
line.startswith("<h1>")
or line.startswith('<para style="Header">')
or "header" in line
):
# Header section
text = line.replace("<h1>", "").replace("</h1>", "")
content.append(Paragraph(text, self.styles["Header"]))
elif line.startswith("-"):
# Bullet point (strip the leading dash so it is not duplicated below)
text = line.strip().lstrip("-").strip()
content.append(Paragraph(f"- {text}", self.styles["Bullet-Point"]))
else:
# Normal text
content.append(Paragraph(line, self.styles["Content"]))
content.append(Spacer(1, 2))
try:
doc.build(
content,
onFirstPage=self.create_watermark,
onLaterPages=self.create_watermark,
)
buffer.seek(0)
return buffer
except Exception as e:
raise ValueError(
f"Error: {e} Content too large or complex to export to PDF"
)
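A minimal usage sketch of the exporter, limited to the [bold] and bullet markup handled above (reportlab must be installed); the returned BytesIO can be written to disk or streamed as a download. The file name mirrors the one used by the frontend export handler.

exporter = Exporter()
answer = "Doclink turned this answer into a [bold]PDF[/bold].\n- first point\n- second point"
buffer = exporter.export_pdf(answer)
with open("DoclinkExport.pdf", "wb") as f:
    f.write(buffer.getvalue())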

View File

@@ -0,0 +1,12 @@
import faiss
class IndexingFunctions:
def __init__(self):
pass
def create_flat_index(self, embeddings):
dimension = len(embeddings[0])
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)
return index
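IndexFlatIP ranks by raw inner product, so it behaves as cosine similarity only when the stored vectors are L2-normalized. FAISS's Python bindings work with float32 input, so depending on the installed version the float16 arrays produced by EmbeddingFunctions may need an explicit cast before add or search. A small, self-contained sketch with random stand-in vectors:

import numpy as np

rng = np.random.default_rng(0)
vecs = rng.normal(size=(100, 1536)).astype(np.float32)   # stand-ins for real embeddings
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)      # L2-normalize so IP == cosine

index = IndexingFunctions().create_flat_index(vecs)
query = vecs[:1]                                         # any (1, d) float32 query
scores, ids = index.search(query, 5)                     # top-5 most similar rows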

View File

@@ -0,0 +1,536 @@
import fitz
import tempfile
import io
import re
import spacy
import pymupdf4llm
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter
from docling.datamodel.base_models import InputFormat
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.document_converter import (
DocumentConverter,
WordFormatOption,
PowerpointFormatOption,
HTMLFormatOption,
)
class ReadingFunctions:
def __init__(self):
self.nlp = spacy.load(
"en_core_web_sm",
disable=[
"tagger",
"attribute_ruler",
"lemmatizer",
"ner",
"textcat",
"custom",
],
)
self.max_file_size_mb = 50
self.headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
("###", "Header 3"),
("####", "Header 4"),
]
self.markdown_splitter = MarkdownHeaderTextSplitter(
self.headers_to_split_on, strip_headers=False, return_each_line=True
)
self.converter = DocumentConverter(
allowed_formats=[
InputFormat.DOCX,
InputFormat.PPTX,
InputFormat.XLSX,
InputFormat.PDF,
InputFormat.HTML,
],
format_options={
InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
InputFormat.PPTX: PowerpointFormatOption(pipeline_cls=SimplePipeline),
InputFormat.HTML: HTMLFormatOption(pipeline_cls=SimplePipeline),
},
)
def read_file(self, file_bytes: bytes, file_name: str):
"""Read and process file content from bytes"""
file_size_mb = self._get_file_size(file_bytes=file_bytes)
file_type = file_name.split(".")[-1].lower()
if file_size_mb > self.max_file_size_mb:
raise ValueError(f"File size exceeds {self.max_file_size_mb}MB limit")
try:
if file_type == "pdf":
return self._process_pdf(file_bytes=file_bytes)
elif file_type == "docx":
return self._process_docx(file_bytes=file_bytes)
elif file_type == "pptx":
return self._process_pptx(file_bytes=file_bytes)
elif file_type == "xlsx":
return self._process_xlsx(file_bytes=file_bytes)
elif file_type == "udf":
return self._process_udf(file_bytes=file_bytes)
elif file_type in ["txt", "rtf"]:
return self._process_txt(file_bytes=file_bytes)
else:
raise ValueError(f"Unsupported file type: {file_type}")
except Exception as e:
raise ValueError(f"Error processing {file_name}: {str(e)}")
def read_url(self, html_content: str):
html_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
try:
with tempfile.NamedTemporaryFile(delete=True, suffix=".html") as temp_file:
temp_file.write(html_content.encode("utf-8"))
temp_file.flush()
html_path = Path(temp_file.name)
md_text = self.converter.convert(
html_path
).document.export_to_markdown()
splits = self.markdown_splitter.split_text(md_text)
for split in splits:
if (
not len(split.page_content) > 5
or re.match(r"^[^\w]*$", split.page_content)
or split.page_content[:4] == "<!--"
):
continue
elif split.metadata and split.page_content[0] == "#":
html_data["sentences"].append(split.page_content)
html_data["is_header"].append(True)
html_data["is_table"].append(False)
html_data["page_number"].append(1)
elif split.page_content[0] == "|" and split.page_content[-1] == "|":
html_data["sentences"].append(split.page_content)
html_data["is_header"].append(False)
html_data["is_table"].append(True)
html_data["page_number"].append(1)
else:
html_data["sentences"].append(split.page_content)
html_data["is_header"].append(False)
html_data["is_table"].append(False)
html_data["page_number"].append(1)
return self._chunk_html(html_data)
except Exception as e:
raise ValueError(f"Error processing HTML content: {str(e)}")
def _process_pdf(self, file_bytes: bytes):
pdf_data = {"sentences": [], "page_number": [], "is_header": [], "is_table": []}
pdf_file = io.BytesIO(file_bytes)
with fitz.open(stream=pdf_file, filetype="pdf") as pdf:
# Process each page
markdown_pages = pymupdf4llm.to_markdown(
pdf, page_chunks=True, show_progress=False, margins=0
)
for i, page in enumerate(markdown_pages):
splits = self.markdown_splitter.split_text(page["text"])
for split in splits:
if not len(split.page_content) > 5 or re.match(
r"^[^\w]*$", split.page_content
):
continue
elif (
split.metadata and split.page_content[0] == "#"
): # Header detection
pdf_data["sentences"].append(split.page_content)
pdf_data["is_header"].append(True)
pdf_data["is_table"].append(False)
pdf_data["page_number"].append(i + 1)
elif (
split.page_content[0] == "*"
and split.page_content[-1] == "*"
and (
re.match(
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
split.page_content,
)
or re.match(
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
split.page_content,
)
)
): # Sub-Header and Header variant detection
pdf_data["sentences"].append(split.page_content)
pdf_data["is_header"].append(True)
pdf_data["is_table"].append(False)
pdf_data["page_number"].append(i + 1)
elif (
split.page_content[0] == "|" and split.page_content[-1] == "|"
): # Table detection
pdf_data["sentences"].append(split.page_content)
pdf_data["is_header"].append(False)
pdf_data["is_table"].append(True)
pdf_data["page_number"].append(i + 1)
else:
pdf_data["sentences"].append(split.page_content)
pdf_data["is_header"].append(False)
pdf_data["is_table"].append(False)
pdf_data["page_number"].append(i + 1)
return pdf_data
def _process_docx(self, file_bytes: bytes):
docx_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
current_length = 0
chars_per_page = 2000
current_page = 1
docx_file = io.BytesIO(file_bytes)
with tempfile.NamedTemporaryFile(delete=True, suffix=".docx") as temp_file:
temp_file.write(docx_file.getvalue())
docx_path = Path(temp_file.name)
md_text = self.converter.convert(docx_path).document.export_to_markdown()
splits = self.markdown_splitter.split_text(md_text)
for split in splits:
if current_length + len(split.page_content) > chars_per_page:
current_page += 1
current_length = 0
if (
not len(split.page_content) > 5
or re.match(r"^[^\w]*$", split.page_content)
or split.page_content[:4] == "<!--"
):
continue
elif (
split.metadata and split.page_content[0] == "#"
): # Header detection
docx_data["sentences"].append(split.page_content)
docx_data["is_header"].append(True)
docx_data["is_table"].append(False)
docx_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "*"
and split.page_content[-1] == "*"
and (
re.match(
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
split.page_content,
)
or re.match(
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
split.page_content,
)
)
): # Sub-Header and Header variant detection
docx_data["sentences"].append(split.page_content)
docx_data["is_header"].append(True)
docx_data["is_table"].append(False)
docx_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "|" and split.page_content[-1] == "|"
): # Table detection
docx_data["sentences"].append(split.page_content)
docx_data["is_header"].append(False)
docx_data["is_table"].append(True)
docx_data["page_number"].append(current_page)
current_length += len(split.page_content)
else:
docx_data["sentences"].append(split.page_content)
docx_data["is_header"].append(False)
docx_data["is_table"].append(False)
docx_data["page_number"].append(current_page)
current_length += len(split.page_content)
return docx_data
def _process_pptx(self, file_bytes: bytes):
pptx_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
current_length = 0
chars_per_page = 500
current_page = 1
pptx_file = io.BytesIO(file_bytes)
with tempfile.NamedTemporaryFile(delete=True, suffix=".pptx") as temp_file:
temp_file.write(pptx_file.getvalue())
pptx_path = Path(temp_file.name)
md_text = self.converter.convert(pptx_path).document.export_to_markdown()
splits = self.markdown_splitter.split_text(md_text)
for split in splits:
if current_length + len(split.page_content) > chars_per_page:
current_page += 1
current_length = 0
if (
not len(split.page_content) > 5
or re.match(r"^[^\w]*$", split.page_content)
or split.page_content[:4] == "<!--"
):
continue
elif (
split.metadata and split.page_content[0] == "#"
): # Header detection
pptx_data["sentences"].append(split.page_content)
pptx_data["is_header"].append(True)
pptx_data["is_table"].append(False)
pptx_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "*"
and split.page_content[-1] == "*"
and (
re.match(
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
split.page_content,
)
or re.match(
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
split.page_content,
)
)
): # Sub-Header and Header variant detection
pptx_data["sentences"].append(split.page_content)
pptx_data["is_header"].append(True)
pptx_data["is_table"].append(False)
pptx_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "|" and split.page_content[-1] == "|"
): # Table detection
pptx_data["sentences"].append(split.page_content)
pptx_data["is_header"].append(False)
pptx_data["is_table"].append(True)
pptx_data["page_number"].append(current_page)
current_length += len(split.page_content)
else:
pptx_data["sentences"].append(split.page_content)
pptx_data["is_header"].append(False)
pptx_data["is_table"].append(False)
pptx_data["page_number"].append(current_page)
current_length += len(split.page_content)
return pptx_data
def _process_xlsx(self, file_bytes: bytes):
xlsx_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
current_length = 0
chars_per_page = 2000
current_page = 1
xlsx_file = io.BytesIO(file_bytes)
with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as temp_file:
temp_file.write(xlsx_file.getvalue())
xlsx_path = Path(temp_file.name)
md_text = self.converter.convert(xlsx_path).document.export_to_markdown()
splits = self.markdown_splitter.split_text(md_text)
for split in splits:
if current_length + len(split.page_content) > chars_per_page:
current_page += 1
current_length = 0
if (
not len(split.page_content) > 5
or re.match(r"^[^\w]*$", split.page_content)
or split.page_content[:4] == "<!--"
):
continue
elif (
split.metadata and split.page_content[0] == "#"
): # Header detection
xlsx_data["sentences"].append(split.page_content)
xlsx_data["is_header"].append(True)
xlsx_data["is_table"].append(False)
xlsx_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "*"
and split.page_content[-1] == "*"
and (
re.match(
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
split.page_content,
)
or re.match(
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
split.page_content,
)
)
): # Sub-Header and Header variant detection
xlsx_data["sentences"].append(split.page_content)
xlsx_data["is_header"].append(True)
xlsx_data["is_table"].append(False)
xlsx_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "|" and split.page_content[-1] == "|"
): # Table detection
xlsx_data["sentences"].append(split.page_content)
xlsx_data["is_header"].append(False)
xlsx_data["is_table"].append(True)
xlsx_data["page_number"].append(current_page)
current_length += len(split.page_content)
else:
xlsx_data["sentences"].append(split.page_content)
xlsx_data["is_header"].append(False)
xlsx_data["is_table"].append(False)
xlsx_data["page_number"].append(current_page)
current_length += len(split.page_content)
return xlsx_data
def _process_udf(self, file_bytes: bytes):
udf_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
current_length = 0
chars_per_page = 2000
current_page = 1
udf_file = io.BytesIO(file_bytes)
with zipfile.ZipFile(udf_file, "r") as zip_ref:
xml_content = zip_ref.read("content.xml")
dataTree = ET.parse(io.BytesIO(xml_content))
splits = self.markdown_splitter.split_text(
dataTree.find(".//content").text.strip()
)
for split in splits:
if current_length + len(split.page_content) > chars_per_page:
current_page += 1
current_length = 0
if (
not len(split.page_content) > 5
or re.match(r"^[^\w]*$", split.page_content)
or split.page_content[:4] == "<!--"
):
continue
elif (
split.metadata and split.page_content[0] == "#"
): # Header detection
udf_data["sentences"].append(split.page_content)
udf_data["is_header"].append(True)
udf_data["is_table"].append(False)
udf_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "*"
and split.page_content[-1] == "*"
and (
re.match(
r"(\*{2,})(\d+(?:\.\d+)*)\s*(\*{2,})?(.*)$",
split.page_content,
)
or re.match(
r"(\*{1,3})?([A-Z][a-zA-Z\s\-]+)(\*{1,3})?$",
split.page_content,
)
)
): # Sub-Header and Header variant detection
udf_data["sentences"].append(split.page_content)
udf_data["is_header"].append(True)
udf_data["is_table"].append(False)
udf_data["page_number"].append(current_page)
current_length += len(split.page_content)
elif (
split.page_content[0] == "|" and split.page_content[-1] == "|"
): # Table detection
udf_data["sentences"].append(split.page_content)
udf_data["is_header"].append(False)
udf_data["is_table"].append(True)
udf_data["page_number"].append(current_page)
current_length += len(split.page_content)
else:
udf_data["sentences"].append(split.page_content)
udf_data["is_header"].append(False)
udf_data["is_table"].append(False)
udf_data["page_number"].append(current_page)
current_length += len(split.page_content)
return udf_data
def _process_txt(self, file_bytes: bytes):
text_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
text = file_bytes.decode("utf-8", errors="ignore")
valid_sentences = self._process_text(text=text)
text_data["sentences"].extend(valid_sentences)
text_data["page_number"].extend([1] * len(valid_sentences))
text_data["is_header"].extend([False] * len(valid_sentences))
text_data["is_table"] = [False] * len(text_data["sentences"])
return text_data
def _process_text(self, text):
docs = self.nlp(text)
sentences = [sent.text.replace("\n", " ").strip() for sent in docs.sents]
return [sentence for sentence in sentences if len(sentence) > 15]
def _chunk_html(self, html_text: dict, max_tokens: int = 2000):
chunked_data = {
"sentences": [],
"page_number": [],
"is_header": [],
"is_table": [],
}
current_length = 0
for i, sentence in enumerate(html_text["sentences"]):
estimated_tokens = len(sentence.split())
if estimated_tokens > max_tokens:
words = sentence.split()
for j in range(0, len(words), max_tokens):
chunk = " ".join(words[j : j + max_tokens])
chunked_data["sentences"].append(chunk)
chunked_data["page_number"].append(html_text["page_number"][i])
chunked_data["is_header"].append(html_text["is_header"][i])
chunked_data["is_table"].append(html_text["is_table"][i])
else:
if current_length + estimated_tokens > max_tokens:
chunked_data["sentences"].append(sentence)
chunked_data["page_number"].append(html_text["page_number"][i])
chunked_data["is_header"].append(html_text["is_header"][i])
chunked_data["is_table"].append(html_text["is_table"][i])
current_length = 0
else:
chunked_data["sentences"].append(sentence)
chunked_data["page_number"].append(html_text["page_number"][i])
chunked_data["is_header"].append(html_text["is_header"][i])
chunked_data["is_table"].append(html_text["is_table"][i])
current_length += estimated_tokens
return chunked_data
def _get_file_size(self, file_bytes: bytes) -> float:
return len(file_bytes) / (1024 * 1024)
def _clean_text(self, text: str) -> str:
text = re.sub(r"(\b\w+)\s*\n\s*(\w+\b)", r"\1 \2", text)
text = re.sub(r"(\w+)-\s+(\w+)", r"\1\2", text)
text = re.sub(r"[,()]\s*\n\s*(\w+)", r" \1", text)
text = re.sub(r"(\b\w+)\s*-\s*(\w+\b)", r"\1 \2", text)
text = re.sub(r"(\w+)\s*[-]\s*(\w+)", r"\1\2", text)
text = re.sub(
r"(?:[\s!\"#$%&\'()*+,\-.:;<=>?@\[\\\]^_`{|}~]+)(?!\w)", r" ", text
)
text = text.replace("\n", " ").strip()
return " ".join(text.split())

View File

@@ -0,0 +1,114 @@
import logging
import validators
from requests_html import HTMLSession
from urllib.parse import urlparse
from ratelimit import limits, sleep_and_retry
from bs4 import BeautifulSoup
from typing import Optional, Tuple
class Webscraper:
def __init__(self):
self.session = HTMLSession()
logging.basicConfig(level=logging.INFO)
self.logger = logging.getLogger(__name__)
self.unwanted_tags = [
"nav",
"header",
"footer",
"aside",
"script",
"style",
"noscript",
"iframe",
"advertisement",
"banner",
"cookie-banner",
"social-media",
"comments",
'[class*="ad-"]',
'[class*="advertisement"]',
'[class*="banner"]',
'[class*="social"]',
'[class*="footer"]',
'[class*="header-nav"]',
'[class*="cookie"]',
'[class*="popup"]',
'[class*="modal"]',
'[class*="newsletter"]',
]
@sleep_and_retry
@limits(calls=30, period=60)
def request_creator(self, url: str) -> Optional[str]:
try:
response = self.session.get(url, timeout=30)
response.raise_for_status()
return response.html.html
except Exception as e:
self.logger.error(f"Error making request to {url}: {e}")
return None
def url_validator(self, url: str) -> bool:
try:
if not validators.url(url):
return False
parsed = urlparse(url)
return parsed.scheme in ["https", "http"]
except Exception as e:
self.logger.error(f"URL validation error: {str(e)}")
return False
def html_parser(self, html: str) -> str:
try:
soup = BeautifulSoup(html, "html.parser")
for selector in self.unwanted_tags:
for element in soup.select(selector):
element.decompose()
main_content = None
main_tags = ["article", "main", "div"]
for tag in main_tags:
if tag == "div":
for element in soup.find_all(tag, class_=True):
class_name = str(element.get("class", ""))
if any(
pattern in class_name.lower()
for pattern in ["content", "article", "post", "entry"]
):
main_content = element
break
else:
main_content = soup.find(tag)
if main_content:
break
if not main_content:
main_content = soup.body
return str(main_content) if main_content else str(soup)
except Exception as e:
self.logger.error(f"Error cleaning HTML: {str(e)}")
return html
def scraper(self, url: str) -> Tuple[Optional[str], Optional[str]]:
if not self.url_validator(url):
return None, "Invalid Format"
html = self.request_creator(url)
if not html:
return None, "Failed to fetch URL"
try:
parsed_html = self.html_parser(html=html)
return parsed_html, None
except Exception as e:
self.logger.error(f"Error processing URL {url}: {str(e)}")
return None, f"Error processing URL {str(e)}"

214
doclink/app/main_dev.py Normal file
View File

@@ -0,0 +1,214 @@
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.responses import RedirectResponse
import requests as http_requests
import os
import jwt
import uuid
from datetime import datetime, timedelta
from dotenv import load_dotenv
from .api import endpoints
from .db.database import Database
# Load configurations
load_dotenv()
# Constants
FRONTEND_URL = os.getenv("FRONTEND_URL_DEV", "http://localhost:3000")
GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
SECRET_KEY = os.getenv("MIDDLEWARE_SECRET_KEY")
# App initialization
app = FastAPI(title="Doclink")
app.mount("/static", StaticFiles(directory="app/static"), name="static")
templates = Jinja2Templates(directory="templates")
# CORS Configuration
app.add_middleware(
CORSMiddleware,
allow_origins=[FRONTEND_URL],
allow_credentials=True,
allow_methods=["GET", "POST"],
allow_headers=["*"],
)
async def verify_google_token(token: str) -> dict:
"""Verify Google OAuth token and get user info"""
try:
# Use the access token to get user info from Google
userinfo_response = http_requests.get(
"https://www.googleapis.com/oauth2/v3/userinfo",
headers={"Authorization": f"Bearer {token}"},
)
if not userinfo_response.ok:
raise ValueError("Failed to get user info")
userinfo = userinfo_response.json()
# Verify basic user info exists
if not userinfo.get("sub"): # 'sub' is the Google user ID
raise ValueError("Invalid user info")
return userinfo
except Exception as e:
print(f"Token verification error: {str(e)}")
raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
def create_session_token(user_data: dict) -> str:
"""Create an encrypted session token"""
payload = {
"user_id": user_data["user_id"],
"email": user_data["email"],
"exp": datetime.utcnow() + timedelta(days=1), # 1 day expiration
}
return jwt.encode(payload, SECRET_KEY, algorithm="HS256")
def verify_session_token(session_token: str) -> dict:
"""Verify and decode session token"""
try:
payload = jwt.decode(session_token, SECRET_KEY, algorithms=["HS256"])
return payload
except jwt.ExpiredSignatureError:
raise HTTPException(status_code=401, detail="Session expired")
except jwt.InvalidTokenError:
raise HTTPException(status_code=401, detail="Invalid session")
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
"""Middleware to check authentication for protected routes"""
# Public routes that don't need authentication
public_paths = {"/api/version", "/docs", "/redoc", "/openapi.json"}
if request.url.path in public_paths:
return await call_next(request)
# Check if it's a chat route
if request.url.path.startswith("/chat/"):
# Get either query parameters (from Next.js redirect) or session cookie
token = request.query_params.get("token")
session_cookie = request.cookies.get("session_token")
if not token and not session_cookie:
return RedirectResponse(url=FRONTEND_URL)
try:
# If we have both token and session, prioritize session
if session_cookie:
try:
user_data = verify_session_token(session_cookie)
request.state.user_data = user_data
return await call_next(request)
except Exception as e:
print(f"Error {e}")
if not token:
return RedirectResponse(url=FRONTEND_URL)
# Token-based auth as fallback
if token:
print("Using token authentication")
request.state.token = token
request.state.user_id = request.query_params.get("userId")
request.state.is_new_user = (
request.query_params.get("isNewUser", "false").lower() == "true"
)
return await call_next(request)
# No valid auth method
print("No valid authentication method found")
return RedirectResponse(url=FRONTEND_URL)
except Exception as e:
print(f"Auth middleware error: {str(e)}", exc_info=True)
return RedirectResponse(url=FRONTEND_URL)
return await call_next(request)
@app.get("/chat/{session_id}")
async def chat_page(request: Request, session_id: str):
"""Handle both initial and subsequent visits to chat page"""
try:
# If we have a token in query params, this is an initial visit
if hasattr(request.state, "token"):
# Verify Google token and get user info
google_user = await verify_google_token(request.state.token)
# Create user data
user_data = {
"user_id": request.state.user_id,
"email": google_user.get("email"),
"name": google_user.get("name"),
"picture": google_user.get("picture"),
}
# Create session token
session_token = create_session_token(user_data)
# Create domain if first time
if request.state.is_new_user:
with Database() as db:
domain_id = str(uuid.uuid4())
db.insert_domain_info(
user_id=request.state.user_id,
domain_id=domain_id,
domain_name="My First Folder",
domain_type=0,
)
db.insert_user_guide(
user_id=request.state.user_id, domain_id=domain_id
)
# Create response with template
response = templates.TemplateResponse(
"app.html",
{
"request": request,
"session_id": session_id,
"user_id": user_data["user_id"],
"is_first_time": request.state.is_new_user,
"environment": "dev",
},
)
# Set session cookie
response.set_cookie(
key="session_token",
value=session_token,
httponly=True,
secure=False,
max_age=259200, # 3 days
samesite="lax",
)
return response
# If we have user_data from cookie, this is a subsequent visit
else:
user_data = request.state.user_data
return templates.TemplateResponse(
"app.html",
{
"request": request,
"session_id": session_id,
"user_id": user_data["user_id"],
"is_first_time": False,
"environment": "dev",
},
)
except Exception as e:
print(f"Error in chat page: {str(e)}")
raise HTTPException(status_code=500, detail="Error rendering application")
# Include other routes
app.include_router(endpoints.router, prefix="/api/v1")
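A quick round-trip check of the session-token helpers above; MIDDLEWARE_SECRET_KEY must be set, and the IDs are illustrative.

user = {"user_id": "11111111-1111-1111-1111-111111111111", "email": "user@example.com"}
token = create_session_token(user)
claims = verify_session_token(token)
assert claims["user_id"] == user["user_id"] and claims["email"] == user["email"]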

279
doclink/app/main_prod.py Normal file
View File

@@ -0,0 +1,279 @@
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.responses import RedirectResponse
import requests as http_requests
import os
import jwt
import uuid
import logging
from logging.handlers import RotatingFileHandler
from datetime import datetime, timedelta
from dotenv import load_dotenv
from .api import endpoints
from .db.database import Database
# Load configurations
load_dotenv()
# Constants
FRONTEND_URL = os.getenv("FRONTEND_URL_PROD", "http://localhost:3000")
GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
SECRET_KEY = os.getenv("MIDDLEWARE_SECRET_KEY")
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[
RotatingFileHandler(
"/var/log/doclink/doclink.log",
maxBytes=10000000, # 10MB
backupCount=5,
),
logging.StreamHandler(),
],
)
logger = logging.getLogger(__name__)
# App initialization
app = FastAPI(title="Doclink")
app.mount("/static", StaticFiles(directory="app/static"), name="static")
templates = Jinja2Templates(directory="templates")
# Middleware headers
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
response = await call_next(request)
response.headers["Content-Security-Policy"] = (
"default-src 'self';"
"script-src 'self' 'unsafe-inline' 'unsafe-eval' "
"https://cdnjs.cloudflare.com "
"https://www.googletagmanager.com "
"https://www.google-analytics.com "
"https://cdn.jsdelivr.net;"
"style-src 'self' 'unsafe-inline' "
"https://fonts.googleapis.com "
"https://cdn.jsdelivr.net "
"https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/ "
"https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.2/;"
"style-src-elem 'self' 'unsafe-inline' "
"https://fonts.googleapis.com "
"https://cdn.jsdelivr.net "
"https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/ "
"https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.2/;"
"font-src 'self' https://fonts.gstatic.com "
"https://cdn.jsdelivr.net data:;"
"img-src 'self' data: https://www.google-analytics.com https://*.googleusercontent.com;"
"connect-src 'self' https://www.google-analytics.com;"
)
return response
# CORS
app.add_middleware(
CORSMiddleware,
allow_origins=[FRONTEND_URL],
allow_credentials=True,
allow_methods=["GET", "POST"],
allow_headers=["*"],
)
async def verify_google_token(token: str) -> dict:
"""Verify Google OAuth token and get user info"""
try:
# Use the access token to get user info from Google
userinfo_response = http_requests.get(
"https://www.googleapis.com/oauth2/v3/userinfo",
headers={"Authorization": f"Bearer {token}"},
)
if not userinfo_response.ok:
raise ValueError("Failed to get user info")
userinfo = userinfo_response.json()
# Verify basic user info exists
if not userinfo.get("sub"): # 'sub' is the Google user ID
raise ValueError("Invalid user info")
return userinfo
except Exception as e:
logger.info(f"Token verification error: {str(e)}")
raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}")
def create_session_token(user_data: dict) -> str:
"""Create an encrypted session token"""
payload = {
"user_id": user_data["user_id"],
"email": user_data["email"],
"exp": datetime.utcnow() + timedelta(days=1), # 1 day expiration
}
return jwt.encode(payload, SECRET_KEY, algorithm="HS256")
def verify_session_token(session_token: str) -> dict:
"""Verify and decode session token"""
try:
payload = jwt.decode(session_token, SECRET_KEY, algorithms=["HS256"])
return payload
except jwt.ExpiredSignatureError:
raise HTTPException(status_code=401, detail="Session expired")
except jwt.InvalidTokenError:
raise HTTPException(status_code=401, detail="Invalid session")
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
"""Middleware to check authentication for protected routes"""
# Public routes that don't need authentication
public_paths = {"/api/version", "/docs", "/redoc", "/openapi.json"}
if request.url.path in public_paths:
return await call_next(request)
# Check if it's a chat route
if request.url.path.startswith("/chat/"):
# Get either query parameters (from Next.js redirect) or session cookie
token = request.query_params.get("token")
session_cookie = request.cookies.get("session_token")
if not token and not session_cookie:
return RedirectResponse(url=FRONTEND_URL)
try:
# If we have both token and session, prioritize session
if session_cookie:
try:
user_data = verify_session_token(session_cookie)
request.state.user_data = user_data
return await call_next(request)
except Exception as e:
logger.info(f"Error validation of session cookie {e}")
if not token:
return RedirectResponse(url=FRONTEND_URL)
# Token-based auth as fallback
if token:
logger.info("Using token authentication")
request.state.token = token
request.state.user_id = request.query_params.get("userId")
request.state.is_new_user = (
request.query_params.get("isNewUser", "false").lower() == "true"
)
return await call_next(request)
# No valid auth method
logger.info("No valid authentication method found")
return RedirectResponse(url=FRONTEND_URL)
except Exception as e:
logger.info(f"Auth middleware error: {str(e)}", exc_info=True)
return RedirectResponse(url=FRONTEND_URL)
return await call_next(request)
@app.get("/chat/{session_id}")
async def chat_page(request: Request, session_id: str):
"""Handle both initial and subsequent visits to chat page"""
logger.info(f"******** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ********")
try:
logger.info(f"Processing chat page request for session {session_id}")
logger.info(f"Request state: {vars(request.state)}")
# If we have a token in query params, this is an initial visit
if hasattr(request.state, "token"):
logger.info("Processing initial visit with token")
# Verify Google token and get user info
try:
# Verify Google token and get user info
google_user = await verify_google_token(request.state.token)
logger.info(f"Google user verified: {google_user.get('email')}")
# Create user data
user_data = {
"user_id": request.state.user_id,
"email": google_user.get("email"),
"name": google_user.get("name"),
"picture": google_user.get("picture"),
}
except Exception as e:
logger.error(f"Error processing token: {str(e)}", exc_info=True)
raise
# Create session token
session_token = create_session_token(user_data)
# Create domain if first time
if request.state.is_new_user:
with Database() as db:
domain_id = str(uuid.uuid4())
db.insert_domain_info(
user_id=request.state.user_id,
domain_id=domain_id,
domain_name="My First Folder",
domain_type=0,
)
db.insert_user_guide(
user_id=request.state.user_id, domain_id=domain_id
)
# Create response with template
response = templates.TemplateResponse(
"app.html",
{
"request": request,
"session_id": session_id,
"user_id": user_data["user_id"],
"is_first_time": request.state.is_new_user,
"environment": "prod",
},
)
# Set session cookie
response.set_cookie(
key="session_token",
value=session_token,
httponly=True,
secure=False,
max_age=86400, # 1 day
samesite="lax",
)
return response
# If we have user_data from cookie, this is a subsequent visit
else:
logger.info("Processing subsequent visit with session cookie")
user_data = request.state.user_data
return templates.TemplateResponse(
"app.html",
{
"request": request,
"session_id": session_id,
"user_id": user_data["user_id"],
"is_first_time": False,
"environment": "prod",
},
)
except Exception as e:
logger.info(f"Error processing subsequent visit with session cookie {e}")
raise HTTPException(status_code=500, detail=f"Error rendering application {e}")
# Include other routes
app.include_router(endpoints.router, prefix="/api/v1")

View File

@@ -0,0 +1,190 @@
from redis import Redis
from typing import Optional, Any
import pickle
import logging
from functools import wraps
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RedisConnectionError(Exception):
"""Custom exception for Redis connection issues"""
pass
class RedisManager:
_instance = None
def __new__(cls):
if cls._instance is None:
cls._instance = super(RedisManager, cls).__new__(cls)
cls._instance._initialized = False
return cls._instance
def __init__(self):
if not self._initialized:
try:
self.client = Redis(
host="localhost",
port=6380,
db=0,
decode_responses=False,
socket_timeout=5,
)
self._initialized = True
logger.info("Redis connection established")
except Exception as e:
logger.error(f"Failed to connect to Redis: {str(e)}")
raise RedisConnectionError(f"Redis connection failed: {str(e)}")
self.default_ttl = 1800
def _handle_connection(func):
"""Decorator to handle Redis connection errors"""
@wraps(func)
def wrapper(self, *args, **kwargs):
try:
return func(self, *args, **kwargs)
except Exception as e:
logger.error(f"Redis operation failed: {str(e)}")
raise RedisConnectionError(f"Redis operation failed: {str(e)}")
return wrapper
@_handle_connection
def set_data(self, key: str, value: Any, expiry: int = 1800) -> bool:
"""Store data in Redis with expiry time"""
try:
pickled_value = pickle.dumps(value)
return self.client.set(key, pickled_value, ex=expiry)
except Exception as e:
logger.error(f"Failed to set data for key {key}: {str(e)}")
return False
@_handle_connection
def get_data(self, key: str) -> Optional[Any]:
"""Retrieve data from Redis"""
try:
data = self.client.get(key)
return pickle.loads(data) if data else None
except Exception as e:
logger.error(f"Failed to get data for key {key}: {str(e)}")
return None
@_handle_connection
def delete_data(self, key: str) -> bool:
"""Delete data from Redis"""
try:
return bool(self.client.delete(key))
except Exception as e:
logger.error(f"Failed to delete key {key}: {str(e)}")
return False
@_handle_connection
def clear_user_data(self, user_id: str) -> bool:
"""Clear all data for a specific user"""
try:
pattern = f"user:{user_id}:*"
keys = self.client.keys(pattern)
if keys:
return bool(self.client.delete(*keys))
return True
except Exception as e:
logger.error(f"Failed to clear data for user {user_id}: {str(e)}")
return False
@_handle_connection
def get_memory_usage(self) -> dict:
"""Get Redis memory statistics"""
try:
info = self.client.info(section="memory")
return {
"used_memory": info["used_memory_human"],
"peak_memory": info["used_memory_peak_human"],
"fragmentation": info["mem_fragmentation_ratio"],
}
except Exception as e:
logger.error(f"Failed to get memory usage: {str(e)}")
return {}
@_handle_connection
def refresh_user_ttl(self, user_id: str) -> bool:
"""Refresh TTL for all keys belonging to a user"""
try:
# Get all keys for this user
pattern = f"user:{user_id}:*"
user_keys = self.client.keys(pattern)
if not user_keys:
return False
# Update TTL for all user's keys
pipeline = self.client.pipeline()
for key in user_keys:
pipeline.expire(key, self.default_ttl)
# Execute all EXPIRE commands atomically
results = pipeline.execute()
# Check if all operations succeeded
success = all(results)
if not success:
logger.warning(f"Some TTL updates failed for user {user_id}")
return success
except Exception as e:
logger.error(f"Failed to refresh TTL for user {user_id}: {str(e)}")
return False
@_handle_connection
def refresh_key_ttl(self, key: str, ttl: int = None) -> bool:
"""Refresh TTL for a specific key"""
try:
return self.client.expire(key, ttl or self.default_ttl)
except Exception as e:
logger.error(f"Failed to refresh TTL for key {key}: {str(e)}")
return False
def is_connected(self) -> bool:
"""Check if Redis connection is alive"""
try:
return self.client.ping()
except Exception:
return False
def get_keys_by_pattern(self, pattern: str = "*") -> list:
"""Get all keys matching pattern"""
try:
return [key.decode("utf-8") for key in self.client.keys(pattern)]
except Exception as e:
logger.error(f"Error getting keys: {e}")
return []
def get_key_info(self, key: str) -> dict:
"""Get detailed information about a key"""
try:
return {
"type": self.client.type(key).decode("utf-8"),
"ttl": self.client.ttl(key),
"memory": self.client.memory_usage(key),
}
except Exception as e:
logger.error(f"Error getting key info: {e}")
return {}
def monitor_user_data(self, user_id: str) -> dict:
"""Monitor all data for a specific user"""
try:
user_keys = self.get_keys_by_pattern(f"user:{user_id}:*")
return {
"total_keys": len(user_keys),
"keys": {key: self.get_key_info(key) for key in user_keys},
"memory_usage": self.get_memory_usage(),
}
except Exception as e:
logger.error(f"Error monitoring user data: {e}")
return {}
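A minimal usage sketch, assuming a Redis server is listening on localhost:6380 as configured above. The key suffix is an assumption, but the user:<id>: prefix matches what clear_user_data and refresh_user_ttl expect.

cache = RedisManager()
user_id = "123e4567-e89b-12d3-a456-426614174000"          # illustrative user id
key = f"user:{user_id}:domain_content"                    # suffix is an assumption
cache.set_data(key, {"sentences": ["hello"]}, expiry=1800)
print(cache.get_data(key))                                # -> {'sentences': ['hello']}
cache.refresh_user_ttl(user_id)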

File diff suppressed because it is too large.

Binary file not shown. (25 KiB)

Binary file not shown. (543 B)

Binary file not shown. (1.3 KiB)

View File

@@ -0,0 +1 @@
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}

View File

@@ -0,0 +1,553 @@
window.fetchUserInfo = async function(userID) {
try {
const response = await fetch('/api/v1/db/get_user_info', {
method: 'POST',
body: JSON.stringify({ user_id: userID }),
headers: {
'Content-Type': 'application/json'
},
});
if (!response.ok) {
throw new Error('Failed to fetch initial user data');
}
const data = await response.json();
if (!data) {
console.error('User could not be found!');
return null;
}
return data;
} catch (error) {
console.error('Error fetching initial user data:', error);
return null;
}
};
window.handleLogoutRequest = async function handleLogoutRequest(userId, sessionId) {
try {
const response = await fetch('/api/v1/auth/logout', {
method: 'POST',
body: JSON.stringify({
user_id: userId,
session_id: sessionId
}),
credentials: 'include',
headers: {
'Content-Type': 'application/json'
}
});
if (!response.ok) {
throw new Error('Logout failed');
}
return {
success: true
};
} catch (error) {
console.error('Logout request failed:', error);
return {
success: false,
error: error.message
};
}
};
window.selectDomain = async function selectDomain(domainId, userID) {
try {
const url = `/api/v1/qa/select_domain?userID=${encodeURIComponent(userID)}`;
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
domain_id: domainId
})
});
if (!response.ok) {
return 0;
}
const data = await response.json();
if (data["message"] !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error selecting domain', error);
return 0;
}
}
window.renameDomain = async function renameDomain(domainId, newName) {
try {
const response = await fetch('/api/v1/db/rename_domain', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
domain_id: domainId,
new_name: newName
})
});
if (!response.ok) {
return 0;
}
const data = await response.json();
if (data.message !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error renaming domain:', error);
return 0;
}
};
window.createDomain = async function createDomain(userId, domainName) {
try {
const url = `/api/v1/db/create_domain?userID=${encodeURIComponent(userId)}`;
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
domain_name: domainName
})
});
const data = await response.json();
if (!response.ok) {
return { success: 0, message: data.message || 'Failed to create domain' };
}
if (data.message !== "success") {
return { success: 0, message: data.message };
}
return { success: 1, id: data.domain_id };
} catch (error) {
console.error('Error creating domain:', error);
return { success: 0, id: null };
}
};
window.deleteDomain = async function deleteDomain(domainId) {
try {
const response = await fetch('/api/v1/db/delete_domain', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
domain_id: domainId
})
});
const data = await response.json();
if (!response.ok) {
return {
success: false,
message: data.message
};
}
if (data.message !== "success") {
return {
success: false,
message: data.message
};
}
return {
success: true,
message: "Folder deleted"
};
} catch (error) {
console.error('Error deleting domain:', error);
return {
success: false,
message: "An unexpected error occurred"
};
}
};
window.storeFile = async function(userID, formData) {
try {
const response = await fetch(`/api/v1/io/store_file?userID=${encodeURIComponent(userID)}`, {
method: 'POST',
body: formData
});
if (!response.ok) {
throw new Error('Failed to store file');
}
const data = await response.json();
if (data.message !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error storing file:', error);
return {
success: false,
error: error.message
};
}
};
window.storedriveFile = async function(userID, formData) {
try {
const response = await fetch(`/api/v1/io/store_drive_file?userID=${encodeURIComponent(userID)}`, {
method: 'POST',
body: formData
});
if (!response.ok) {
throw new Error('Failed to store drive file');
}
const data = await response.json();
if (data.message !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error storing file:', error);
return {
success: false,
error: error.message
};
}
};
window.storeURL = async function(userID, url) {
try {
const formData = new FormData();
formData.append('url', url);
const response = await fetch(`/api/v1/io/store_url?userID=${encodeURIComponent(userID)}`, {
method: 'POST',
body: formData
});
if (!response.ok) {
throw new Error('Failed to store url');
}
const data = await response.json();
if (data.message !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error storing URL:', error);
return {
success: false,
error: error.message
};
}
};
window.uploadFiles = async function(userID) {
try {
const response = await fetch(`/api/v1/io/upload_files?userID=${userID}`, {
method: 'POST'
});
const data = await response.json();
if (data.message.includes("can only have 20 total files")) {
return {
success: false,
error: data.message || 'Upload process failed'
};
} else if (data.message !== "success") {
return {
success: false,
error: data.message
};
}
if (!response.ok) {
throw new Error('Failed to process uploads');
}
return {
success: true,
data: {
file_names: data.file_names,
file_ids: data.file_ids,
message: data.message
}
};
} catch (error) {
console.error('Error uploading files:', error);
return {
success: false,
error: error.message
};
}
};
window.removeFile = async function(fileId, domainId, userId) {
try {
const url = `/api/v1/db/remove_file_upload?userID=${encodeURIComponent(userId)}`;
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
file_id: fileId,
domain_id: domainId
})
});
if (!response.ok) {
throw new Error('Failed to remove files');
}
const data = await response.json();
if (data.message !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error removing files:', error);
return {
success: false,
error: error.message
};
}
};
window.exportResponse = async function(contents) {
try {
const response = await fetch('/api/v1/io/export_response', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({contents})
});
if (!response.ok) {
const errorData = await response.json();
throw new Error(errorData.detail || 'Failed to generate PDF');
}
const blob = await response.blob();
if (blob.size === 0) {
throw new Error('Received empty PDF');
}
const url = window.URL.createObjectURL(
new Blob([blob], { type: 'application/pdf' })
);
const link = document.createElement('a');
link.href = url;
link.download = 'DoclinkExport.pdf';
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
setTimeout(() => {
window.URL.revokeObjectURL(url);
}, 100);
return {
success: true
};
}
catch (error) {
console.error('Error exporting response:', error);
return {
success: false,
error: error.message
};
}
};
window.sendMessage = async function(message, userId, sessionId, fileIds) {
if (!message) {
return {
message: "Please enter your sentence!",
status: 400
};
}
try {
const url = `/api/v1/qa/generate_answer?userID=${encodeURIComponent(userId)}&sessionID=${encodeURIComponent(sessionId)}`;
const response = await fetch(url, {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({
user_message: message,
file_ids: fileIds
})
});
const data = await response.json();
if (data.message && data.message.includes("Daily question limit reached")) {
return {
message: data.message || 'Daily question limit reached!',
status: 400
};
}
if (!response.ok) {
return {
message: data.message || 'Server error!',
status: response.status
};
}
return {
...data,
status: 200
};
} catch (error) {
console.error('Error:', error);
return {
message: 'Error generating message!',
status: 500
};
}
};
window.sendFeedback = async function(formData, userId) {
try {
const url = `/api/v1/db/insert_feedback?userID=${encodeURIComponent(userId)}`;
const response = await fetch(url, {
method: 'POST',
body: formData
});
if (!response.ok) {
throw new Error('Failed to submit feedback');
}
const data = await response.json();
return {
success: true,
message: data.message || 'Thank you for your feedback!'
};
} catch (error) {
console.error('Error submitting feedback:', error);
return {
success: false,
message: 'Failed to submit feedback. Please try again.'
};
}
}
window.sendRating = async function(ratingData, userNote, userId) {
try {
const url = `/api/v1/db/insert_rating?userID=${encodeURIComponent(userId)}`;
const formData = new FormData();
formData.append('rating', ratingData);
if (userNote){
formData.append('user_note', userNote);
}
const response = await fetch(url, {
method: 'POST',
body: formData
});
if (!response.ok) {
throw new Error('Failed to submit rating');
}
const data = await response.json();
return {
success: true,
message: data.message || 'Thank you for your feedback!'
};
} catch (error) {
console.error('Error submitting rating:', error);
return {
success: false,
message: 'Failed to submit feedback. Please try again.'
};
}
}
window.googleSignIn = async function googleSignIn() {
try {
const url = `/api/v1/qa/select_domain?userID=${encodeURIComponent(userID)}`;
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({
domain_id: domainId
})
});
if (!response.ok) {
return 0;
}
const data = await response.json();
if (data["message"] !== "success") {
return 0;
}
return 1;
} catch (error) {
console.error('Error selecting domain', error);
return 0;
}
}

3659
doclink/app/static/js/app.js Normal file

File diff suppressed because it is too large.

View File

@@ -0,0 +1,636 @@
prompts:
languages:
en:
general_purpose:
- id: gp_001
text: "
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language and context.\n
Instructions:\n
You will be provided with context windows, each containing several sentences along with the two following metadata: \n
File: Specifies source of each context.\n
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
Extracting Relevant Information:\n
Carefully analyze the user's query to determine the specific information being requested.\n
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
If the query references a specific file, extract information only from the specified file(s).\n
If the query does not specify a file, aggregate information from all available files.\n
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
Present your response using bullet points or topic-based sections for better readability.\n
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
Do not specify the confidence coefficient in response.\n
Do not mention the 'context windows'; refer to them as 'the resources' instead.\n
Respond *strictly* in the following format:\n
[header]Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
- Main point\n
- Sub-point\n
- Further nested point\n
[header]Another Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed\n
- Main point\n
- Sub-point\n
- Further nested point\n
Rules:\n
1. Each major section must start with [header]...[/header]\n
2. Use [bold]...[/bold] for important terms or emphasis within content\n
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
4. Use single dash (-) for all list items\n
5. Indent nested list items with exactly 2 spaces per level\n
6. Place one empty line between major sections\n
7. Do not use any other list markers (bullets, dots, numbers)\n
8. Keep indentation consistent throughout the response\n
Context Windows:\n
{context}\n
User Query:\n
{query}\n
User Query language:\n
{lang}\n
"
Informational:
- id: info_001
text: "
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language and context.\n
Instructions:\n
You will be provided with context windows, each containing several sentences along with the two following metadata:\n
File: Specifies source of each context.\n
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
1. Identify factual knowledge, definitions, or explanations requested in the query.\n
2. Focus on delivering concise, clear, and specific information.\n
3. Include [bold]key terms[/bold] and definitions for clarity and emphasize relevant details.\n
4. Avoid generalizations; prioritize extracting exact matches or relevant information from the context.\n
5. Answers must be as short, on-point, and clear as possible.\n
6. Always prioritize contexts with higher confidence coefficients for accuracy, but cross-check lower-confidence contexts for supplementary or missing details to ensure completeness.\n
7. Where appropriate, attribute information to its source file or section implicitly. For example: 'As described in the regulations...' or 'According to the provided report...' without directly mentioning the context window or file name unless explicitly required by the query.\n
8. If contradictory information is found: Explicitly state the contradiction and its source(s). Suggest possible resolutions, clarifications, or factors that may explain the discrepancy (e.g., differing data sources, updates, or interpretations).\n
9. If the query requests a more detailed response, expand your answer with additional explanations\n
Extracting Relevant Information:\n
Carefully analyze the user's query to determine the specific information being requested.\n
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
If the query references a specific file, extract information only from the specified file(s).\n
If the query does not specify a file, aggregate information from all available files.\n
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
Present your response using bullet points or topic-based sections for better readability.\n
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
Do not specify the confidence coefficient in the response.\n
Do not mention the 'context windows'; use 'according to the resources' instead.\n
Respond *strictly* in the following format:\n
[header]Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
- Main point\n
- Sub-point\n
- Further nested point\n
[header]Another Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed\n
- Main point\n
- Sub-point\n
- Further nested point\n
Rules:\n
1. Each major section must start with [header]...[/header]\n
2. Use [bold]...[/bold] for important terms or emphasis within content\n
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
4. Use single dash (-) for all list items\n
5. Indent nested list items with exactly 2 spaces per level\n
6. Place one empty line between major sections\n
7. Do not use any other list markers (bullets, dots, numbers)\n
8. Keep indentation consistent throughout the response\n
Context Windows:\n
{context}\n
User Query:\n
{query}\n
User Query language:\n
{lang}\n
"
Comparison:
- id: comp_001
text: "
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language.\n
Instructions:\n
You will be provided with context windows, each containing several sentences along with the following two metadata fields:\n
File: Specifies the source of each context.\n
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
1. Extract and compare relevant details from the context to highlight similarities and differences.\n
2. If contradictory information is found, specify the contradictions and explain their sources.\n
3. Present distinctions or parallels in a structured format, using headers like [header]Similarities[/header] and [header]Differences[/header].\n
4. Provide a clear explanation of how the extracted information relates to the user's query.\n
5. If consistent information appears across contexts, summarize it in the [header]Similarities[/header] section. For contradictory information: Specify conflicting points under [header]Differences[/header]. Attribute contradictions to their respective sources and explain their impact.\n
6. For comparisons involving multiple attributes, organize data using a [bold]tabular format[/bold] or structured lists. Each row or bullet point should represent one attribute.\n
7. If the required comparison data is missing, clearly state this under [header]Missing Information[/header]. Offer suggestions for refining the query or point out gaps in the context.\n
8. For queries involving detailed or hierarchical comparisons: Use a primary section for high-level differences or similarities. Include nested sections for more granular points.\n
Extracting Relevant Information:\n
Carefully analyze the user's query to determine the specific information being requested.\n
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
If the query references a specific file, extract information only from the specified file(s).\n
If the query does not specify a file, aggregate information from all available files.\n
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
Present your response using bullet points or topic-based sections for better readability.\n
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
Do not specify the confidence coefficient in the response.\n
Do not mention the 'context windows'; use 'according to the resources' instead.\n
Respond *strictly* in the following format:\n
[header]Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
- Main point\n
- Sub-point\n
- Further nested point\n
[header]Another Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed\n
- Main point\n
- Sub-point\n
- Further nested point\n
Rules:\n
1. Each major section must start with [header]...[/header]\n
2. Use [bold]...[/bold] for important terms or emphasis within content\n
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
4. Use single dash (-) for all list items\n
5. Indent nested list items with exactly 2 spaces per level\n
6. Place one empty line between major sections\n
7. Do not use any other list markers (bullets, dots, numbers)\n
8. Keep indentation consistent throughout the response\n
Context Windows:\n
{context}\n
User Query:\n
{query}\n
User Query language:\n
{lang}\n
"
Summarization:
- id: sum_001
text: "
Your task is to analyze the given context windows, extract relevant data based on the user's query, and use file information to enhance your response. Your primary goal is to provide a comprehensive, structured, and user-friendly answer using solely the information provided in the context window.\n
Please respond in the language of the user's query, specified by the {lang} variable (e.g., 'en' for English, 'tr' for Turkish), ensuring the tone and style align with the query's language.\n
Instructions:\n
You will be provided with context windows, each containing several sentences along with the following two metadata fields:\n
File: Specifies the source of each context.\n
Confidence coefficient: A number between 0 and 1, indicating the priority of the context (higher numbers mean higher priority).\n
1. Identify and extract key points or main ideas from the context relevant to the query.\n
2. Create a concise and well-structured summary, using bullet points or categories for clarity.\n
3. Highlight overarching themes and provide an overview without including excessive details.\n
4. Consolidate consistent information across contexts to avoid redundancy.\n
5. If the query specifies a focus area (e.g., a section, file, or theme), prioritize summarizing content strictly relevant to that focus. Where no focus is specified, highlight the most critical and recurring themes or points.\n
6. Where appropriate, illustrate key ideas with short examples or specific details from the context. Keep examples concise and relevant.\n
7. If the context contains contradictions: Summarize both perspectives succinctly. Highlight the contradiction explicitly, and explain how it relates to the query.\n
8. The summary should not exceed 200 tokens unless explicitly requested by the query. If required details exceed this limit, provide a prioritized or hierarchical overview.\n
Extracting Relevant Information:\n
Carefully analyze the user's query to determine the specific information being requested.\n
Use all relevant context windows, prioritizing those with higher confidence levels for accuracy.\n
If the query references a specific file, extract information only from the specified file(s).\n
If the query does not specify a file, aggregate information from all available files.\n
If the context contains consistent information across multiple files, consolidate the data and indicate consistency.\n
If the context contains contradictory information: Highlight the contradictions, specify their sources, and explain how they differ.\n
If the context contains similar or different information, summarize the distinctions or similarities and relate them to the query.\n
Present your response using bullet points or topic-based sections for better readability.\n
Prioritize clarity and conciseness. Use subheadings or categories for complex queries.\n
If the required information is not found in the context, state this clearly and offer suggestions or clarifications if possible.\n
Do not specify the confidence coefficient in the response.\n
Do not mention the 'context windows'; use 'according to the resources' instead.\n
Respond *strictly* in the following format:\n
[header]Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed. Use the following list format for any points:\n
- Main point\n
- Sub-point\n
- Further nested point\n
[header]Another Section Name[/header]\n
Content with [bold]bold terms[/bold] when needed\n
- Main point\n
- Sub-point\n
- Further nested point\n
Rules:\n
1. Each major section must start with [header]...[/header]\n
2. Use [bold]...[/bold] for important terms or emphasis within content\n
3. Headers should be one of: Definition, Purpose, Key Features, Operation, Context\n
4. Use single dash (-) for all list items\n
5. Indent nested list items with exactly 2 spaces per level\n
6. Place one empty line between major sections\n
7. Do not use any other list markers (bullets, dots, numbers)\n
8. Keep indentation consistent throughout the response\n
Context Windows:\n
{context}\n
User Query:\n
{query}\n
User Query language:\n
{lang}\n
"
queries:
- id: query_001
text: "
Task: Analyze, Correct, and Generate Related Questions & Answers\n
Instructions:\n
You are given a user query.\n
First, check the user question. If it has no meaning, return an empty string. If it is meaningful, do the following:\n
Correct any spelling or grammatical errors and return the corrected question as the first line of the output.\n
Generate 3 semantically similar queries that retain the same meaning as the corrected query.\n
Create 3 different questions that approach the original query from different angles but stay related.\n
Answer the last 3 questions with concise responses, 1-2 sentences each at most.\n
Then, analyze the corrected user query and determine its intent; the list of possible intents, their keywords, and examples are given below. If the intent cannot be determined, return an empty string ''.\n
Please respond in the file language, specified by the {file_lang} variable (e.g., 'en' for English, 'tr' for Turkish), regardless of the user query's language, ensuring the tone and style align with the file's language.\n
If the file language is not English, look for the intent keywords provided below for intent detection in the file language.\n
The possible intents are:\n
1. Informational: Seeking factual knowledge, definitions, or explanations.\n
Intention Keywords: What, define, explain, details, specify, who, why, how.\n
Intention Examples: What is the penalty for breaking this rule? → Informational\n
2. Summarization: Requesting a concise overview of complex information.\n
Intention Keywords: Summarize, overview, main points, key ideas, brief, concise, simplify.\n
Intention Examples: Can you summarize the key points of this document? → Summarization\n
3. Comparison: Evaluating options, methods, or technologies.\n
Intention Keywords: Compare, difference, similarity, versus, contrast, better, alternative, pros and cons.\n
Intention Examples: Compare the benefits of these two methods. → Comparison\n
Return the output **strictly** in the following format:\n
[corrected query]\n
[first semantically similar query]\n
[second semantically similar query]\n
[third semantically similar query]\n
[first different-angle question]\n
[second different-angle question]\n
[third different-angle question]\n
[first different-angle answer]\n
[second different-angle answer]\n
[third different-angle answer]\n
[user intention]\n
User query: {query}\n
File language:\n
{file_lang}\n
Example:\n
User query: How does retrieval-augmented generation work in AI systems?\n
File language: en\n
Output:\n
How does retrieval-augmented generation work in AI systems?\n
What is the process of retrieval-augmented generation in AI?\n
How does RAG help AI systems retrieve and generate information?\n
Can you explain how retrieval-augmented generation functions in AI applications?\n
What are the key advantages of using RAG in AI?\n
How does RAG differ from traditional machine learning models?\n
What challenges does RAG face in implementation?\n
RAG enhances AI by retrieving relevant external data to provide more accurate responses.\n
Unlike traditional models, RAG integrates search capabilities to access external knowledge during inference.\n
Major challenges include latency in retrieval, ensuring relevance of fetched data, and maintaining up-to-date information.\n
Informational\n
"
tr:
general_purpose:
- id: gp_tr_001
text: "
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n
Talimatlar:\n
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
Dosya: Her bağlamın kaynağını belirtir.\n
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n
İlgili Bilgilerin Çıkarılması:\n
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
Yanıtta güven katsayısını belirtmeyin.\n
Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n
[header]Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
[header]Diğer Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
Kurallar:\n
1. Her ana bölüm [header]...[/header] ile başlamalı\n
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
4. Tüm liste maddeleri için tek tire (-) kullanın\n
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
6. Ana bölümler arasında bir boş satır bırakın\n
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
8. Yanıt boyunca tutarlı girintileme kullanın\n
Bağlam Pencereleri:\n
{context}\n
Kullanıcı Sorgusu:\n
{query}\n
"
Bilgi Edinme:
- id: info_tr_001
text: "
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n
Talimatlar:\n
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
Dosya: Her bağlamın kaynağını belirtir.\n
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n
1. Sorguda talep edilen gerçek bilgilere, tanımlara veya açıklamalara odaklanın.\n
2. Kısa, net ve spesifik bilgiler sunmaya odaklanın.\n
3. Açıklık için [bold]önemli terimler[/bold] ve tanımları ekleyin ve ilgili ayrıntıları vurgulayın.\n
4. Genellemelerden kaçının; bağlamdan tam eşleşmeleri veya ilgili bilgileri çıkarmayı önceliklendirin.\n
5. Cevap mümkün olduğunca kısa, net ve doğrudan olmalı; 150 ile 200 token arasında olmalıdır.\n
6. Doğruluk için her zaman daha yüksek güven katsayısına sahip bağlamlara öncelik verin, ancak eksiksizliği sağlamak için ek veya eksik ayrıntılar için daha düşük güven katsayısına sahip bağlamları çapraz kontrol edin.\n
7. Uygun olduğunda, bilgiyi kaynak dosya veya bölüme dolaylı olarak atfedin. Örneğin: Yönetmeliklerde belirtildiği gibi... veya Sağlanan rapora göre... ifadelerini kullanın, ancak sorguda açıkça istenmediği sürece bağlam penceresi veya dosya adını doğrudan belirtmeyin.\n
8. Çelişkili bilgiler bulunursa: Çelişkiyi ve kaynağını açıkça belirtin. Olası çözüm yollarını, açıklamaları veya farklılıkları açıklayabilecek faktörleri (örneğin, farklı veri kaynakları, güncellemeler veya yorumlar) önerin.\n
İlgili Bilgilerin Çıkarılması:\n
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
Yanıtta güven katsayısını belirtmeyin.\n
Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n
[header]Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
[header]Diğer Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
Kurallar:\n
1. Her ana bölüm [header]...[/header] ile başlamalı\n
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
4. Tüm liste maddeleri için tek tire (-) kullanın\n
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
6. Ana bölümler arasında bir boş satır bırakın\n
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
8. Yanıt boyunca tutarlı girintileme kullanın\n
Bağlam Pencereleri:\n
{context}\n
Kullanıcı Sorgusu:\n
{query}\n
"
Karşılaştırma:
- id: comp_tr_001
text: "
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n
Talimatlar:\n
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
Dosya: Her bağlamın kaynağını belirtir.\n
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n
1. Benzerlikleri ve farklılıkları vurgulamak için bağlamdan ilgili detayları çıkarın ve karşılaştırın.\n
2. Çelişkili bilgiler bulunursa, bu çelişkileri belirtin ve kaynaklarını açıklayın.\n
3. Ayrımları veya paralellikleri, [header]Benzerlikler[/header] ve [header]Farklılıklar[/header] gibi başlıklar kullanarak yapılandırılmış bir formatta sunun.\n
4. Çıkarılan bilgilerin kullanıcının sorgusuyla nasıl ilişkili olduğunu net bir şekilde açıklayın.\n
5. Eğer bağlamlar arasında tutarlı bilgiler bulunuyorsa, bunları [header]Benzerlikler[/header] bölümünde özetleyin. Çelişkili bilgiler için: Çelişen noktaları [header]Farklılıklar[/header] başlığı altında belirtin. Çelişkileri ilgili kaynaklarına atfedin ve bunların etkisini açıklayın.\n
6. Birden fazla özelliği kapsayan karşılaştırmalar için, verileri [bold]tablo formatında[/bold] veya yapılandırılmış listeler halinde düzenleyin. Her bir satır veya madde işareti bir özelliği temsil etmelidir.\n
7. Gerekli karşılaştırma verileri eksikse, bunu [header]Eksik Bilgiler[/header] başlığı altında açıkça belirtin. Sorgunun nasıl iyileştirilebileceğine dair önerilerde bulunun veya bağlamdaki eksikliklere işaret edin.\n
8. Ayrıntılı veya hiyerarşik karşılaştırmaları içeren sorgular için: Genel farklılıklar veya benzerlikler için bir ana bölüm kullanın. Daha ayrıntılı noktalar için iç içe geçmiş bölümler ekleyin.\n
İlgili Bilgilerin Çıkarılması:\n
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
Yanıtta güven katsayısını belirtmeyin.\n
Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n
[header]Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
[header]Diğer Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
Kurallar:\n
1. Her ana bölüm [header]...[/header] ile başlamalı\n
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
4. Tüm liste maddeleri için tek tire (-) kullanın\n
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
6. Ana bölümler arasında bir boş satır bırakın\n
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
8. Yanıt boyunca tutarlı girintileme kullanın\n
Bağlam Pencereleri:\n
{context}\n
Kullanıcı Sorgusu:\n
{query}\n
"
Özetleme:
- id: sum_tr_001
text: "
Göreviniz verilen bağlam pencerelerini analiz etmek, kullanıcının sorgusuna göre ilgili verileri çıkarmak ve yanıtınızı geliştirmek için dosya bilgilerini kullanmaktır. Birincil amacınız, yalnızca bağlam penceresinde sağlanan bilgileri kullanarak kapsamlı, yapılandırılmış ve kullanıcı dostu bir yanıt sunmaktır.\n
Talimatlar:\n
Size, her biri birkaç cümle ve şu iki meta veriyi içeren bağlam pencereleri sağlanacaktır:\n
Dosya: Her bağlamın kaynağını belirtir.\n
Güven katsayısı: 0 ile 1 arasında bir sayı olup, bağlamın öncelik seviyesini ifade eder (daha yüksek sayılar daha yüksek öncelik anlamına gelir).\n
1. Sorgu ile ilgili bağlamdan anahtar noktaları veya temel fikirleri belirleyin ve çıkarın.\n
2. Netlik için madde işaretleri veya kategoriler kullanarak kısa ve iyi yapılandırılmış bir özet oluşturun.\n
3. Genel temaları vurgulayın ve gereksiz ayrıntılara yer vermeden genel bir bakış sağlayın.\n
4. Tekrarlamaları önlemek için bağlamlar arasındaki tutarlı bilgileri birleştirin.\n
5. Eğer sorgu belirli bir odak alanı (örneğin, bir bölüm, dosya veya tema) belirtiyorsa, yalnızca bu odakla ilgili içeriği özetlemeye öncelik verin. Herhangi bir odak belirtilmemişse, en kritik ve tekrar eden temaları veya noktaları vurgulayın.\n
6. Uygun olduğunda, bağlamdan kısa örnekler veya belirli detaylarla ana fikirleri açıklayın. Örnekleri kısa ve ilgili tutun.\n
7. Bağlamda çelişkiler varsa: Her iki bakış açısını da kısaca özetleyin. Çelişkiyi açıkça belirtin ve bunun sorguyla nasıl ilişkili olduğunu açıklayın.\n
8. Özet, sorgu tarafından açıkça talep edilmedikçe 200 kelimeyi aşmamalıdır. Eğer gerekli detaylar bu sınırı aşarsa, öncelikli veya hiyerarşik bir genel bakış sağlayın.\n
İlgili Bilgilerin Çıkarılması:\n
Kullanıcının sorgusunda istenen belirli bilgileri belirlemek için dikkatlice analiz yapın.\n
Doğruluk için daha yüksek güven seviyelerine sahip bağlamlara öncelik vererek tüm ilgili bağlam pencerelerini kullanın.\n
Sorgu belirli bir dosyayı referans alıyorsa, yalnızca belirtilen dosya(lar)dan bilgi çıkarın.\n
Sorgu herhangi bir dosya belirtmiyorsa, mevcut tüm dosyalardan bilgileri birleştirin.\n
Bağlam birden fazla dosyada tutarlı bilgiler içeriyorsa, verileri birleştirin ve tutarlılığı belirtin.\n
Bağlam çelişkili bilgiler içeriyorsa: Çelişkileri vurgulayın, kaynaklarını belirtin ve nasıl farklılık gösterdiklerini açıklayın.\n
Bağlam benzer veya farklı bilgiler içeriyorsa, farklılıkları veya benzerlikleri özetleyin ve bunları sorguyla ilişkilendirin.\n
Yanıtınızı daha iyi okunabilirlik için madde işaretleri veya konuya dayalı bölümler kullanarak sunun.\n
Netlik ve özlülüğe öncelik verin. Karmaşık sorgular için alt başlıklar veya kategoriler kullanın.\n
Gerekli bilgi bağlamda bulunmuyorsa, bunu açıkça belirtin ve mümkünse öneriler veya açıklamalar sunun.\n
Yanıtta güven katsayısını belirtmeyin.\n
Aşağıdaki formata *kesinlikle* uygun şekilde yanıt verin:\n
[header]Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik. Maddeler için şu format kullanılmalı:\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
[header]Diğer Bölüm Adı[/header]\n
Gerektiğinde [bold]kalın terimler[/bold] ile içerik\n
- Ana madde\n
- Alt madde\n
- Daha alt madde\n
Kurallar:\n
1. Her ana bölüm [header]...[/header] ile başlamalı\n
2. Önemli terimler veya vurgulamalar için [bold]...[/bold] kullanın\n
3. Bölüm başlıkları şunlardan biri olmalı: Tanım, Amaç, Temel Özellikler, İşleyiş, Bağlam\n
4. Tüm liste maddeleri için tek tire (-) kullanın\n
5. Alt maddelerde tam olarak 2 boşluk ile girintileme yapın\n
6. Ana bölümler arasında bir boş satır bırakın\n
7. Başka liste işaretleri kullanmayın (nokta, sayı vb.)\n
8. Yanıt boyunca tutarlı girintileme kullanın\n
Bağlam Pencereleri:\n
{context}\n
Kullanıcı Sorgusu:\n
{query}\n
"
queries:
- id: query_tr_001
text: "
Görev: Analiz Et, Düzelt ve İlgili Sorular & Cevaplar Oluştur.\n
Talimatlar:\n
Kullanıcı sorgusu size verilmiştir.\n
Öncelikle kullanıcı sorusunu kontrol edin. Eğer anlamsızsa, boş bir string '' döndürün. Anlamlıysa, şu işlemleri yapın:\n
Herhangi bir yazım veya dilbilgisi hatası olup olmadığını kontrol edin ve düzeltilmiş soruyu çıktıdaki ilk soru olarak döndürün.\n
Ardından, düzeltilmiş soruyla aynı anlamı koruyan 3 semantik olarak benzer sorgu oluşturun.\n
Orijinal soruyu farklı açılardan ele alan, ancak yine de ilgili kalan 3 farklı soru oluşturun.\n
Son 3 soruya, her biri 1-2 cümlelik kısa cevaplarla yanıt verin.\n
Ardından düzeltilmiş kullanıcı sorgusunu analiz edin ve niyetini belirleyin. Niyet listesi, anahtar kelimeler ve örnekler aşağıda verilmiştir. Eğer niyet tam olarak anlaşılamıyorsa boş bir string '' döndürün.\n
Olası niyetler:\n
1. Bilgi Edinme: Gerçek bilgileri, tanımları veya açıklamaları öğrenme talebi.\n
Niyet Anahtar Kelimeleri: Ne, tanımla, açıkla, detaylar, belirt, kim, neden, nasıl.\n
Niyet Örnekleri: Bu kuralı ihlal etmenin cezası nedir? → Bilgi Edinme\n
2. Özetleme: Karmaşık bilgilerin kısa bir özetini isteme.\n
Niyet Anahtar Kelimeleri: Özetle, genel bakış, ana noktalar, temel fikirler, kısa, öz, basitleştir.\n
Niyet Örnekleri: Bu belgenin ana noktalarını özetleyebilir misiniz? → Özetleme\n
3. Karşılaştırma: Seçenekleri, yöntemleri veya teknolojileri değerlendirme.\n
Niyet Anahtar Kelimeleri: Karşılaştır, fark, benzerlik, karşılaştırma, daha iyi, alternatif, artılar ve eksiler.\n
Niyet Örnekleri: Bu iki yöntemin faydalarını karşılaştırın. → Karşılaştırma\n
Çıktıyı **kesinlikle** şu formatta döndürün:\n
[düzeltilmiş sorgu]\n
[birinci semantik olarak benzer sorgu]\n
[ikinci semantik olarak benzer sorgu]\n
[üçüncü semantik olarak benzer sorgu]\n
[birinci farklı-açıdan soru]\n
[ikinci farklı-açıdan soru]\n
[üçüncü farklı-açıdan soru]\n
[birinci farklı-açıdan cevap]\n
[ikinci farklı-açıdan cevap]\n
[üçüncü farklı-açıdan cevap]\n
[kullanıcı niyeti]\n
Kullanıcı Sorgusu: {query}\n
Örnek:\n
Kullanıcı sorgusu: Retrieval-augmented generation yapay zeka sistemlerinde nasıl çalışır?\n
Çıktı:\n
Retrieval-augmented generation yapay zeka sistemlerinde nasıl çalışır?\n
Retrieval-augmented generation süreci yapay zekada nasıl işler?\n
RAG, yapay zeka sistemlerine bilgi getirme ve oluşturma konusunda nasıl yardımcı olur?\n
Retrieval-augmented generation yapay zeka uygulamalarında nasıl işlev görür?\n
RAG kullanmanın yapay zeka için temel avantajları nelerdir?\n
RAG, geleneksel makine öğrenimi modellerinden nasıl farklıdır?\n
RAG'in uygulanmasında karşılaşılan zorluklar nelerdir?\n
RAG, yapay zekayı dış verileri getirerek daha doğru yanıtlar sağlamada geliştirir.\n
RAG, geleneksel modellerden farklı olarak çıkarım sırasında harici bilgilere erişim sağlar.\n
Başlıca zorluklar arasında getirme gecikmesi, getirilen verilerin uygunluğu ve bilgilerin güncel tutulması yer alır.\n
Bilgi Edinme\n
Kullanıcı sorusu: {query}\n
"
metadata:
version: "1.0"
description: "Prompt type storage with language groups"
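# Editorial note (hedged sketch): the templates in this file are grouped first by
# language ('en', 'tr') and then by intent, each entry carrying an id and a text
# with {context}, {query}, {lang} or {file_lang} placeholders. The commented
# Python below only illustrates one way such a file could be loaded; the file
# name prompts.yaml, the function names, and the exact nesting are assumptions,
# not the application's actual loading code.
#
#   import yaml
#
#   def load_prompt(path: str, lang: str, intent: str) -> str:
#       with open(path, "r", encoding="utf-8") as f:
#           data = yaml.safe_load(f)
#       # Each intent holds a list of {id, text} entries; take the first template.
#       return data[lang][intent][0]["text"]
#
#   # Usage (hypothetical):
#   # template = load_prompt("prompts.yaml", "en", "Informational")
#   # prompt = template.format(context=context_windows, query=user_query, lang="en")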