import fitz import tempfile import io import re import spacy import pymupdf4llm import xml.etree.ElementTree as ET import zipfile from pathlib import Path from langchain_text_splitters import MarkdownHeaderTextSplitter from docling.datamodel.base_models import InputFormat from docling.pipeline.simple_pipeline import SimplePipeline from docling.document_converter import ( DocumentConverter, WordFormatOption, PowerpointFormatOption, HTMLFormatOption, ) class ReadingFunctions: def __init__(self): self.nlp = spacy.load( "en_core_web_sm", disable=[ "tagger", "attribute_ruler", "lemmatizer", "ner", "textcat", "custom", ], ) self.max_file_size_mb = 50 self.headers_to_split_on = [ ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"), ("####", "Header 4"), ] self.markdown_splitter = MarkdownHeaderTextSplitter( self.headers_to_split_on, strip_headers=False, return_each_line=True ) self.converter = DocumentConverter( allowed_formats=[ InputFormat.DOCX, InputFormat.PPTX, InputFormat.XLSX, InputFormat.PDF, InputFormat.HTML, ], format_options={ InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline), InputFormat.PPTX: PowerpointFormatOption(pipeline_cls=SimplePipeline), InputFormat.HTML: HTMLFormatOption(pipeline_cls=SimplePipeline), }, ) def read_file(self, file_bytes: bytes, file_name: str): """Read and process file content from bytes""" file_size_mb = self._get_file_size(file_bytes=file_bytes) file_type = file_name.split(".")[-1].lower() if file_size_mb > self.max_file_size_mb: raise ValueError(f"File size exceeds {self.max_file_size_mb}MB limit") try: if file_type == "pdf": return self._process_pdf(file_bytes=file_bytes) elif file_type == "docx": return self._process_docx(file_bytes=file_bytes) elif file_type == "pptx": return self._process_pptx(file_bytes=file_bytes) elif file_type == "xlsx": return self._process_xlsx(file_bytes=file_bytes) elif file_type == "udf": return self._process_udf(file_bytes=file_bytes) elif file_type in ["txt", "rtf"]: return self._process_txt(file_bytes=file_bytes) else: raise ValueError(f"Unsupported file type: {file_type}") except Exception as e: raise ValueError(f"Error processing {file_name}: {str(e)}") def read_url(self, html_content: tuple): html_data = { "sentences": [], "page_number": [], "is_header": [], "is_table": [], } try: with tempfile.NamedTemporaryFile(delete=True, suffix=".html") as temp_file: temp_file.write(html_content.encode("utf-8")) temp_file.flush() html_path = Path(temp_file.name) md_text = self.converter.convert( html_path ).document.export_to_markdown() splits = self.markdown_splitter.split_text(md_text) for split in splits: if ( not len(split.page_content) > 5 or re.match(r"^[^\w]*$", split.page_content) or split.page_content[:4] == "