import logging
from typing import Optional, Tuple
from urllib.parse import urlparse

import validators
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry
from requests_html import HTMLSession


class Webscraper:
    def __init__(self):
        self.session = HTMLSession()
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        # CSS selectors for elements that rarely contain main content:
        # structural boilerplate tags plus class-substring matches for ads,
        # banners, popups, and similar chrome.
        self.unwanted_tags = [
            "nav",
            "header",
            "footer",
            "aside",
            "script",
            "style",
            "noscript",
            "iframe",
            "advertisement",
            "banner",
            "cookie-banner",
            "social-media",
            "comments",
            '[class*="ad-"]',
            '[class*="advertisement"]',
            '[class*="banner"]',
            '[class*="social"]',
            '[class*="footer"]',
            '[class*="header-nav"]',
            '[class*="cookie"]',
            '[class*="popup"]',
            '[class*="modal"]',
            '[class*="newsletter"]',
        ]

    @sleep_and_retry
    @limits(calls=30, period=60)
    def request_creator(self, url: str) -> Optional[str]:
        """Fetch a URL, rate-limited to 30 calls/minute; return raw HTML or None."""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return response.html.html
        except Exception as e:
            self.logger.error(f"Error making request to {url}: {e}")
            return None

    def url_validator(self, url: str) -> bool:
        """Return True only for well-formed http(s) URLs."""
        try:
            if not validators.url(url):
                return False
            parsed = urlparse(url)
            return parsed.scheme in ["https", "http"]
        except Exception as e:
            self.logger.error(f"URL validation error: {e}")
            return False

    def html_parser(self, html: str) -> str:
        """Strip boilerplate elements and return the main-content subtree as a string."""
        try:
            soup = BeautifulSoup(html, "html.parser")
            # Remove navigation, ads, and other boilerplate before content detection.
            for selector in self.unwanted_tags:
                for element in soup.select(selector):
                    element.decompose()

            # Prefer semantic containers (<article>, <main>); fall back to divs
            # whose class names suggest they hold the main content.
            main_content = None
            main_tags = ["article", "main", "div"]
            for tag in main_tags:
                if tag == "div":
                    for element in soup.find_all(tag, class_=True):
                        # get("class") returns a list of class names; join them
                        # for substring matching instead of stringifying the list.
                        class_name = " ".join(element.get("class", [])).lower()
                        if any(
                            pattern in class_name
                            for pattern in ["content", "article", "post", "entry"]
                        ):
                            main_content = element
                            break
                else:
                    main_content = soup.find(tag)
                if main_content:
                    break

            # Last resort: fall back to <body>, then to the whole document.
            if not main_content:
                main_content = soup.body
            return str(main_content) if main_content else str(soup)
        except Exception as e:
            self.logger.error(f"Error cleaning HTML: {e}")
            return html

    def scraper(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """Validate, fetch, and clean a URL; return (content, error), exactly one of which is set."""
        if not self.url_validator(url):
            return None, "Invalid URL format"
        html = self.request_creator(url)
        if not html:
            return None, "Failed to fetch URL"
        try:
            parsed_html = self.html_parser(html=html)
            return parsed_html, None
        except Exception as e:
            self.logger.error(f"Error processing URL {url}: {e}")
            return None, f"Error processing URL: {e}"
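

# Minimal usage sketch (illustrative only; the URL below is a placeholder,
# not part of the class above): scrape a page and print either the cleaned
# content or the error string returned by Webscraper.scraper().
if __name__ == "__main__":
    scraper = Webscraper()
    content, error = scraper.scraper("https://example.com/some-article")
    if error:
        print(f"Scrape failed: {error}")
    else:
        # Preview the first 500 characters of the cleaned HTML.
        print(content[:500])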