#!/usr/bin/env python3
import os
import re
import json
from urllib.parse import urljoin, urlencode
from datetime import datetime, timedelta
import requests
import pytz
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from flask import (
Flask, request, jsonify, render_template, abort,
redirect, url_for, make_response, session
)
from flask_cors import CORS
from readability import Document
from werkzeug.middleware.proxy_fix import ProxyFix
from werkzeug.security import check_password_hash
from db import get_db
load_dotenv()
APP_ROOT = os.environ.get("APPLICATION_ROOT", "/readitlater")
API_TOKEN = os.environ.get("API_TOKEN", "")
SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
# Login creds (single-user)
LOGIN_USERNAME = os.environ.get("LOGIN_USERNAME", "admin")
LOGIN_PASSWORD_HASH = os.environ.get("LOGIN_PASSWORD_HASH", "") # werkzeug hash
NY_TZ = pytz.timezone("America/New_York")
def create_app():
app = Flask(__name__, static_url_path=f"{APP_ROOT}/static", static_folder="static")
app.config.update(
SECRET_KEY=SECRET_KEY,
SESSION_COOKIE_SECURE=True,
SESSION_COOKIE_HTTPONLY=True,
SESSION_COOKIE_SAMESITE="Lax",
PERMANENT_SESSION_LIFETIME=timedelta(days=7),
)
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_prefix=1)
# CORS for API (tighten as desired)
CORS(app, resources={rf"{APP_ROOT}/api/*": {"origins": [
"https://www.jaredlog.com", "https://jaredlog.com"
]}})
# -------------------------------------------------------------------------
# NOTE (extension patch): In your Chrome extension, prefer:
# document.documentElement.getHTML({ includeShadowRoots: true })
# and fall back to outerHTML. That captures Shadow DOM content IBM/others use.
# -------------------------------------------------------------------------
# --- Sanitization policy for captured HTML ---
ALLOWED_TAGS = {
"article", "section", "header", "footer",
"h1", "h2", "h3", "h4", "p", "blockquote", "pre", "code",
"ul", "ol", "li", "a", "em", "strong", "b", "i",
"img", "figure", "figcaption", "hr", "br",
"picture", "source"
}
ALLOWED_ATTRS = {
"href", "src", "alt", "title", "target", "rel", "loading",
"srcset", "sizes", "referrerpolicy"
}
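    # Collapses runs of whitespace into a single space (used when normalizing tag names).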
TAG_RE = re.compile(r"\s+")
# ----------------------------- Auth gate -----------------------------
def is_ui_path(path: str) -> bool:
# Protect everything under APP_ROOT except allowlisted paths
if path == APP_ROOT or path.startswith(APP_ROOT + "/"):
if (path.startswith(f"{APP_ROOT}/static/")
or path.startswith(f"{APP_ROOT}/api/")
or path == f"{APP_ROOT}/healthz"
or path == f"{APP_ROOT}/login"
or path == f"{APP_ROOT}/logout"):
return False
return True
return False
@app.before_request
def _gate_ui():
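        # Gate browser-facing pages behind the login session; static assets, the API,
        # healthz, and login/logout are exempted by is_ui_path().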
if not is_ui_path(request.path):
return
if session.get("auth_ok") is True:
return
next_qs = urlencode({"next": request.full_path if request.query_string else request.path})
return redirect(f"{APP_ROOT}/login?{next_qs}", code=302)
# ----------------------------- Utils -----------------------------
def require_token() -> bool:
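        # API callers must send "Authorization: Bearer <API_TOKEN>"; an unset token denies all requests.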
return bool(API_TOKEN) and request.headers.get("Authorization", "") == f"Bearer {API_TOKEN}"
def normalize_tag(s: str) -> str:
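        # Lowercase, trim, and collapse internal whitespace so tags compare consistently.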
s = TAG_RE.sub(" ", (s or "").strip().lower())
return s
# --------- Helpers for images / shadow DOM / JSON-LD ---------
def is_image_url(u: str) -> bool:
u = (u or "").lower()
return any(u.split("?")[0].endswith(ext) for ext in (".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg"))
def absolutize_srcset(value: str, base_url: str) -> str:
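        # srcset entries look like "url [descriptor]"; rewrite each URL against base_url.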
parts = []
for part in (value or "").split(","):
bits = part.strip().split()
if bits:
bits[0] = urljoin(base_url, bits[0])
parts.append(" ".join(bits))
return ", ".join(parts)
def inline_declarative_shadow_dom(raw_html: str) -> str:
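        # Inline <template shadowrootmode="..."> contents so Declarative Shadow DOM markup
        # is visible to Readability; on any parse error, return the input unchanged.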
try:
soup = BeautifulSoup(raw_html, "lxml")
for tpl in soup.find_all("template"):
if tpl.has_attr("shadowrootmode"):
frag = BeautifulSoup(tpl.decode_contents(), "lxml")
tpl.replace_with(frag)
return str(soup)
except Exception:
return raw_html
def extract_jsonld_article_body(raw_html: str) -> tuple[str | None, str | None]:
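        # Scan <script type="application/ld+json"> blocks for an Article/NewsArticle/BlogPosting
        # with a substantial articleBody and rebuild it as simple <p> paragraphs.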
try:
doc = BeautifulSoup(raw_html, "lxml")
for s in doc.find_all("script", type="application/ld+json"):
text = s.string
if not text:
continue
data = json.loads(text)
items = data if isinstance(data, list) else [data]
for it in items:
t = (it.get("@type") or it.get("type") or "")
t = t.lower() if isinstance(t, str) else str(t).lower()
if "article" in t or "newsarticle" in t or "blogposting" in t:
body = it.get("articleBody") or it.get("articlebody")
title = it.get("headline") or it.get("name")
if body and isinstance(body, str) and len(body.strip()) > 400:
html = "".join(
f"
{p.strip()}
"
for p in body.split("\n")
if p.strip()
)
return title, html
except Exception:
pass
return None, None
def pick_thumbnail(raw_html: str, cleaned_soup: BeautifulSoup, base_url: str) -> str | None:
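        # Thumbnail fallback chain: first <img src>, then common lazy-load attributes,
        # then srcset / <picture><source>, finally og:image / twitter:image metadata.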
img = cleaned_soup.find("img", src=True)
if img and img.get("src"):
return urljoin(base_url, img["src"])
for tag in cleaned_soup.find_all("img"):
lazy = (
tag.get("data-src")
or tag.get("data-lazy-src")
or tag.get("data-original")
or tag.get("data-srcset")
)
if not tag.get("src") and lazy:
return urljoin(base_url, lazy)
if not tag.get("src"):
ss = tag.get("srcset")
if ss:
first = ss.split(",")[0].strip().split(" ")[0]
if first:
return urljoin(base_url, first)
for pic in cleaned_soup.find_all("picture"):
for source in pic.find_all("source"):
ss = source.get("srcset")
if ss:
first = ss.split(",")[0].strip().split(" ")[0]
if first:
return urljoin(base_url, first)
try:
full = BeautifulSoup(raw_html, "lxml")
meta = (
full.find("meta", property="og:image") or
full.find("meta", attrs={"name": "og:image"}) or
full.find("meta", attrs={"name": "twitter:image"}) or
full.find("meta", property="twitter:image")
)
if meta and meta.get("content"):
return urljoin(base_url, meta["content"])
except Exception:
pass
return None
def extract_and_clean(raw_html: str, base_url: str) -> tuple[str, str, str, str | None]:
# 1) Prepass: inline Declarative Shadow DOM
pre_html = inline_declarative_shadow_dom(raw_html)
# 2) Readability extraction
doc = Document(pre_html)
title = (doc.short_title() or doc.title() or "").strip()
summary_html = doc.summary(html_partial=True)
soup = BeautifulSoup(summary_html, "lxml")
# Keep