From 1c4aaf18b2f58dbc8184ef6df9b55b55e19a9d2e Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 20 Jan 2026 03:53:49 +0000
Subject: [PATCH] Initial commit

Co-Authored-By: Claude Opus 4.5
---
 .gitignore                       |  28 ++
 app.py                           | 664 +++++++++++++++++++++++++++++++
 cat_ai_files.sh                  |  55 +++
 chrome_extension_readitlater.txt | 151 +++++++
 database_schema_display.sh       |  65 +++
 db.py                            |  19 +
 env.txt                          |   6 +
 generate_password_hash.py        |   2 +
 gunicorn_conf.py                 |   7 +
 nginx-config.txt                 |  86 ++++
 recreate_ai_files.sh             |  10 +
 templates/base.html              |  64 +++
 templates/bkup_base.txt          |  64 +++
 templates/detail.html            |  80 ++++
 templates/index.html             | 100 +++++
 templates/login.html             |  24 ++
 templates/tags.html              |  16 +
 17 files changed, 1441 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 app.py
 create mode 100755 cat_ai_files.sh
 create mode 100644 chrome_extension_readitlater.txt
 create mode 100755 database_schema_display.sh
 create mode 100644 db.py
 create mode 100644 env.txt
 create mode 100644 generate_password_hash.py
 create mode 100644 gunicorn_conf.py
 create mode 100644 nginx-config.txt
 create mode 100755 recreate_ai_files.sh
 create mode 100644 templates/base.html
 create mode 100644 templates/bkup_base.txt
 create mode 100644 templates/detail.html
 create mode 100644 templates/index.html
 create mode 100644 templates/login.html
 create mode 100644 templates/tags.html

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..226f6aa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,28 @@
+# Virtual environment
+.venv/
+venv/
+env/
+
+# Python bytecode
+__pycache__/
+*.py[cod]
+*$py.class
+*.pyo
+
+# Environment variables (contains secrets)
+.env
+
+# Database
+*.db
+*.sqlite3
+
+# IDE/Editor
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+
+# OS files
+.DS_Store
+Thumbs.db
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..1cbbbce
--- /dev/null
+++ b/app.py
@@ -0,0 +1,664 @@
+#!/usr/bin/env python3
+import os
+import re
+import json
+from urllib.parse import urljoin, urlencode
+from datetime import datetime, timedelta
+
+import requests
+import pytz
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from flask import (
+    Flask, request, jsonify, render_template, abort,
+    redirect, url_for, make_response, session
+)
+from flask_cors import CORS
+from readability import Document
+from werkzeug.middleware.proxy_fix import ProxyFix
+from werkzeug.security import check_password_hash
+
+from db import get_db
+
+load_dotenv()
+
+APP_ROOT = os.environ.get("APPLICATION_ROOT", "/readitlater")
+API_TOKEN = os.environ.get("API_TOKEN", "")
+SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
+
+# Login creds (single-user)
+LOGIN_USERNAME = os.environ.get("LOGIN_USERNAME", "admin")
+LOGIN_PASSWORD_HASH = os.environ.get("LOGIN_PASSWORD_HASH", "")  # werkzeug hash
+
+NY_TZ = pytz.timezone("America/New_York")
+
+
+def create_app():
+    app = Flask(__name__, static_url_path=f"{APP_ROOT}/static", static_folder="static")
+    app.config.update(
+        SECRET_KEY=SECRET_KEY,
+        SESSION_COOKIE_SECURE=True,
+        SESSION_COOKIE_HTTPONLY=True,
+        SESSION_COOKIE_SAMESITE="Lax",
+        PERMANENT_SESSION_LIFETIME=timedelta(days=7),
+    )
+    app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_prefix=1)
+
+    # CORS for API (tighten as desired)
+    CORS(app, resources={rf"{APP_ROOT}/api/*": {"origins": [
+        "https://www.jaredlog.com", "https://jaredlog.com"
+    ]}})
+
+    # -------------------------------------------------------------------------
+    # NOTE (extension patch): In your Chrome extension, prefer:
+    #   document.documentElement.getHTML({ includeShadowRoots: true })
+    #   and fall back to outerHTML. That captures Shadow DOM content IBM/others use.
+    # -------------------------------------------------------------------------
+
+    # --- Sanitization policy for captured HTML ---
+    ALLOWED_TAGS = {
+        "article", "section", "header", "footer",
+        "h1", "h2", "h3", "h4", "p", "blockquote", "pre", "code",
+        "ul", "ol", "li", "a", "em", "strong", "b", "i",
+        "img", "figure", "figcaption", "hr", "br",
+        "picture", "source"
+    }
+    ALLOWED_ATTRS = {
+        "href", "src", "alt", "title", "target", "rel", "loading",
+        "srcset", "sizes", "referrerpolicy"
+    }
+
+    TAG_RE = re.compile(r"\s+")
+
+    # ----------------------------- Auth gate -----------------------------
+    def is_ui_path(path: str) -> bool:
+        # Protect everything under APP_ROOT except allowlisted paths
+        if path == APP_ROOT or path.startswith(APP_ROOT + "/"):
+            if (path.startswith(f"{APP_ROOT}/static/")
+                    or path.startswith(f"{APP_ROOT}/api/")
+                    or path == f"{APP_ROOT}/healthz"
+                    or path == f"{APP_ROOT}/login"
+                    or path == f"{APP_ROOT}/logout"):
+                return False
+            return True
+        return False
+
+    @app.before_request
+    def _gate_ui():
+        if not is_ui_path(request.path):
+            return
+        if session.get("auth_ok") is True:
+            return
+        next_qs = urlencode({"next": request.full_path if request.query_string else request.path})
+        return redirect(f"{APP_ROOT}/login?{next_qs}", code=302)
+
+    # ----------------------------- Utils -----------------------------
+    def require_token() -> bool:
+        return bool(API_TOKEN) and request.headers.get("Authorization", "") == f"Bearer {API_TOKEN}"
+
+    def normalize_tag(s: str) -> str:
+        s = TAG_RE.sub(" ", (s or "").strip().lower())
+        return s
+
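# [Editorial sketch -- not part of the patch.] require_token() accepts any request whose
# Authorization header is exactly "Bearer <API_TOKEN>", so the Chrome extension (or curl, or a
# one-off script) authenticates every /api/ call by sending that header. A minimal client-side
# sketch follows; the /api/save path, the payload fields, and the public base URL are assumptions
# here -- the real API routes are defined further down in app.py.
import requests

API_TOKEN = "value-from-.env"                   # same token the server reads via load_dotenv()
BASE = "https://www.jaredlog.com/readitlater"   # assumes nginx serves the app under APP_ROOT

resp = requests.post(
    f"{BASE}/api/save",                         # hypothetical endpoint name
    headers={"Authorization": f"Bearer {API_TOKEN}"},
    json={"url": "https://example.com/article", "html": "<html>...</html>"},  # assumed payload
    timeout=30,
)
resp.raise_for_status()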
+    # --------- Helpers for images / shadow DOM / JSON-LD ---------
+    def is_image_url(u: str) -> bool:
+        u = (u or "").lower()
+        return any(u.split("?")[0].endswith(ext) for ext in (".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg"))
+
+    def absolutize_srcset(value: str, base_url: str) -> str:
+        parts = []
+        for part in (value or "").split(","):
+            bits = part.strip().split()
+            if bits:
+                bits[0] = urljoin(base_url, bits[0])
+            parts.append(" ".join(bits))
+        return ", ".join(parts)
+
+    def inline_declarative_shadow_dom(raw_html: str) -> str:
+        try:
+            soup = BeautifulSoup(raw_html, "lxml")
+            for tpl in soup.find_all("template"):
+                if tpl.has_attr("shadowrootmode"):
+                    frag = BeautifulSoup(tpl.decode_contents(), "lxml")
+                    tpl.replace_with(frag)
+            return str(soup)
+        except Exception:
+            return raw_html
+
+    def extract_jsonld_article_body(raw_html: str) -> tuple[str | None, str | None]:
+        try:
+            doc = BeautifulSoup(raw_html, "lxml")
+            for s in doc.find_all("script", type="application/ld+json"):
+                text = s.string
+                if not text:
+                    continue
+                data = json.loads(text)
+                items = data if isinstance(data, list) else [data]
+                for it in items:
+                    t = (it.get("@type") or it.get("type") or "")
+                    t = t.lower() if isinstance(t, str) else str(t).lower()
+                    if "article" in t or "newsarticle" in t or "blogposting" in t:
+                        body = it.get("articleBody") or it.get("articlebody")
+                        title = it.get("headline") or it.get("name")
+                        if body and isinstance(body, str) and len(body.strip()) > 400:
+                            html = "".join(
+                                f"<p>{p.strip()}</p>"
+                                for p in body.split("\n")
+                                if p.strip()
+                            )
+                            return title, html
+        except Exception:
+            pass
+        return None, None
+
+    def pick_thumbnail(raw_html: str, cleaned_soup: BeautifulSoup, base_url: str) -> str | None:
+        img = cleaned_soup.find("img", src=True)
+        if img and img.get("src"):
+            return urljoin(base_url, img["src"])
+
+        for tag in cleaned_soup.find_all("img"):
+            lazy = (
+                tag.get("data-src")
+                or tag.get("data-lazy-src")
+                or tag.get("data-original")
+                or tag.get("data-srcset")
+            )
+            if not tag.get("src") and lazy:
+                return urljoin(base_url, lazy)
+            if not tag.get("src"):
+                ss = tag.get("srcset")
+                if ss:
+                    first = ss.split(",")[0].strip().split(" ")[0]
+                    if first:
+                        return urljoin(base_url, first)
+
+        for pic in cleaned_soup.find_all("picture"):
+            for source in pic.find_all("source"):
+                ss = source.get("srcset")
+                if ss:
+                    first = ss.split(",")[0].strip().split(" ")[0]
+                    if first:
+                        return urljoin(base_url, first)
+
+        try:
+            full = BeautifulSoup(raw_html, "lxml")
+            meta = (
+                full.find("meta", property="og:image") or
+                full.find("meta", attrs={"name": "og:image"}) or
+                full.find("meta", attrs={"name": "twitter:image"}) or
+                full.find("meta", property="twitter:image")
+            )
+            if meta and meta.get("content"):
+                return urljoin(base_url, meta["content"])
+        except Exception:
+            pass
+        return None
+
+    def extract_and_clean(raw_html: str, base_url: str) -> tuple[str, str, str, str | None]:
+        # 1) Prepass: inline Declarative Shadow DOM
+        pre_html = inline_declarative_shadow_dom(raw_html)
+
+        # 2) Readability extraction
+        doc = Document(pre_html)
+        title = (doc.short_title() or doc.title() or "").strip()
+        summary_html = doc.summary(html_partial=True)
+        soup = BeautifulSoup(summary_html, "lxml")
+
+        # Keep