#!/usr/bin/env python3
import os
import re
import json
from urllib.parse import urljoin, urlencode
from datetime import datetime, timedelta

import requests
import pytz
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from flask import (
    Flask, request, jsonify, render_template, abort, redirect,
    url_for, make_response, session
)
from flask_cors import CORS
from readability import Document
from werkzeug.middleware.proxy_fix import ProxyFix
from werkzeug.security import check_password_hash

from db import get_db

load_dotenv()

APP_ROOT = os.environ.get("APPLICATION_ROOT", "/readitlater")
API_TOKEN = os.environ.get("API_TOKEN", "")
SECRET_KEY = os.environ.get("SECRET_KEY", "dev")

# Login creds (single-user)
LOGIN_USERNAME = os.environ.get("LOGIN_USERNAME", "admin")
LOGIN_PASSWORD_HASH = os.environ.get("LOGIN_PASSWORD_HASH", "")  # werkzeug hash

NY_TZ = pytz.timezone("America/New_York")


def create_app():
    app = Flask(__name__, static_url_path=f"{APP_ROOT}/static", static_folder="static")
    app.config.update(
        SECRET_KEY=SECRET_KEY,
        SESSION_COOKIE_SECURE=True,
        SESSION_COOKIE_HTTPONLY=True,
        SESSION_COOKIE_SAMESITE="Lax",
        PERMANENT_SESSION_LIFETIME=timedelta(days=7),
    )
    app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_prefix=1)

    # CORS for API (tighten as desired)
    CORS(app, resources={rf"{APP_ROOT}/api/*": {"origins": [
        "https://www.jaredlog.com",
        "https://jaredlog.com"
    ]}})

    # -------------------------------------------------------------------------
    # NOTE (extension patch): In your Chrome extension, prefer:
    #   document.documentElement.getHTML({ includeShadowRoots: true })
    # and fall back to outerHTML. That captures Shadow DOM content IBM/others use.
    # -------------------------------------------------------------------------

    # --- Sanitization policy for captured HTML ---
    ALLOWED_TAGS = {
        "article", "section", "header", "footer", "h1", "h2", "h3", "h4", "p",
        "blockquote", "pre", "code", "ul", "ol", "li", "a", "em", "strong",
        "b", "i", "img", "figure", "figcaption", "hr", "br", "picture", "source"
    }
    ALLOWED_ATTRS = {
        "href", "src", "alt", "title", "target", "rel", "loading",
        "srcset", "sizes", "referrerpolicy"
    }

    TAG_RE = re.compile(r"\s+")

    # ----------------------------- Auth gate -----------------------------
    def is_ui_path(path: str) -> bool:
        # Protect everything under APP_ROOT except allowlisted paths
        if path == APP_ROOT or path.startswith(APP_ROOT + "/"):
            if (path.startswith(f"{APP_ROOT}/static/")
                    or path.startswith(f"{APP_ROOT}/api/")
                    or path == f"{APP_ROOT}/healthz"
                    or path == f"{APP_ROOT}/login"
                    or path == f"{APP_ROOT}/logout"):
                return False
            return True
        return False

    @app.before_request
    def _gate_ui():
        if not is_ui_path(request.path):
            return
        if session.get("auth_ok") is True:
            return
        next_qs = urlencode({"next": request.full_path if request.query_string else request.path})
        return redirect(f"{APP_ROOT}/login?{next_qs}", code=302)

    # ----------------------------- Utils -----------------------------
    def require_token() -> bool:
        return bool(API_TOKEN) and request.headers.get("Authorization", "") == f"Bearer {API_TOKEN}"

    def normalize_tag(s: str) -> str:
        s = TAG_RE.sub(" ", (s or "").strip().lower())
        return s

    # --------- Helpers for images / shadow DOM / JSON-LD ---------
    def is_image_url(u: str) -> bool:
        u = (u or "").lower()
        return any(u.split("?")[0].endswith(ext)
                   for ext in (".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg"))

    def absolutize_srcset(value: str, base_url: str) -> str:
        parts = []
        for part in (value or "").split(","):
            bits = part.strip().split()
            if bits:
                bits[0] = urljoin(base_url, bits[0])
                parts.append(" ".join(bits))
        return ", ".join(parts)
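
    # Usage sketch for absolutize_srcset (illustrative only; the URLs below are
    # hypothetical, not taken from the app):
    #   absolutize_srcset("/img/a.jpg 1x, /img/a@2x.jpg 2x", "https://example.com/post")
    #   -> "https://example.com/img/a.jpg 1x, https://example.com/img/a@2x.jpg 2x"
    # Only the candidate URLs are rewritten; width/density descriptors are preserved.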

    def inline_declarative_shadow_dom(raw_html: str) -> str:
        try:
            soup = BeautifulSoup(raw_html, "lxml")
            for tpl in soup.find_all("template"):
                if tpl.has_attr("shadowrootmode"):
                    frag = BeautifulSoup(tpl.decode_contents(), "lxml")
                    tpl.replace_with(frag)
            return str(soup)
        except Exception:
            return raw_html

    def extract_jsonld_article_body(raw_html: str) -> tuple[str | None, str | None]:
        try:
            doc = BeautifulSoup(raw_html, "lxml")
            for s in doc.find_all("script", type="application/ld+json"):
                text = s.string
                if not text:
                    continue
                try:
                    data = json.loads(text)
                except ValueError:
                    # Skip malformed JSON-LD blocks instead of aborting the whole scan.
                    continue
                items = data if isinstance(data, list) else [data]
                for it in items:
                    t = (it.get("@type") or it.get("type") or "")
                    t = t.lower() if isinstance(t, str) else str(t).lower()
                    if "article" in t or "newsarticle" in t or "blogposting" in t:
                        body = it.get("articleBody") or it.get("articlebody")
                        title = it.get("headline") or it.get("name")
                        if body and isinstance(body, str) and len(body.strip()) > 400:
                            # Wrap each non-empty line of articleBody in a <p> element.
                            html = "".join(
                                f"<p>{p.strip()}</p>"
                                for p in body.split("\n") if p.strip()
                            )
                            return title, html
        except Exception:
            pass
        return None, None
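
    # Usage sketch for extract_jsonld_article_body (hypothetical markup, not from the app):
    # for a page that embeds
    #   <script type="application/ld+json">
    #     {"@type": "NewsArticle", "headline": "Title", "articleBody": "First para\nSecond para"}
    #   </script>
    # it would return ("Title", "<p>First para</p><p>Second para</p>"), provided
    # articleBody is longer than 400 characters; otherwise it returns (None, None).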

" for p in body.split("\n") if p.strip() ) return title, html except Exception: pass return None, None def pick_thumbnail(raw_html: str, cleaned_soup: BeautifulSoup, base_url: str) -> str | None: img = cleaned_soup.find("img", src=True) if img and img.get("src"): return urljoin(base_url, img["src"]) for tag in cleaned_soup.find_all("img"): lazy = ( tag.get("data-src") or tag.get("data-lazy-src") or tag.get("data-original") or tag.get("data-srcset") ) if not tag.get("src") and lazy: return urljoin(base_url, lazy) if not tag.get("src"): ss = tag.get("srcset") if ss: first = ss.split(",")[0].strip().split(" ")[0] if first: return urljoin(base_url, first) for pic in cleaned_soup.find_all("picture"): for source in pic.find_all("source"): ss = source.get("srcset") if ss: first = ss.split(",")[0].strip().split(" ")[0] if first: return urljoin(base_url, first) try: full = BeautifulSoup(raw_html, "lxml") meta = ( full.find("meta", property="og:image") or full.find("meta", attrs={"name": "og:image"}) or full.find("meta", attrs={"name": "twitter:image"}) or full.find("meta", property="twitter:image") ) if meta and meta.get("content"): return urljoin(base_url, meta["content"]) except Exception: pass return None def extract_and_clean(raw_html: str, base_url: str) -> tuple[str, str, str, str | None]: # 1) Prepass: inline Declarative Shadow DOM pre_html = inline_declarative_shadow_dom(raw_html) # 2) Readability extraction doc = Document(pre_html) title = (doc.short_title() or doc.title() or "").strip() summary_html = doc.summary(html_partial=True) soup = BeautifulSoup(summary_html, "lxml") # Keep