import 'dotenv/config'
import express from 'express'
import cors from 'cors'
import Parser from 'rss-parser'
import OpenAI from 'openai'
import Database from 'better-sqlite3'
const app = express()
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })
const parser = new Parser()
const PORT = 5555
// Initialize SQLite database for caching
const db = new Database('cache.db')
db.exec(`
CREATE TABLE IF NOT EXISTS grouped_news_cache (
id INTEGER PRIMARY KEY,
data TEXT NOT NULL,
created_at INTEGER NOT NULL
)
`)
const CACHE_DURATION_MS = 3 * 60 * 60 * 1000 // 3 hours
function getCachedGroupedNews() {
const row = db.prepare('SELECT data, created_at FROM grouped_news_cache WHERE id = 1').get()
if (!row) return null
const age = Date.now() - row.created_at
if (age > CACHE_DURATION_MS) return null
return { data: JSON.parse(row.data), age }
}
function setCachedGroupedNews(data) {
const stmt = db.prepare('INSERT OR REPLACE INTO grouped_news_cache (id, data, created_at) VALUES (1, ?, ?)')
stmt.run(JSON.stringify(data), Date.now())
}
function clearCache() {
db.prepare('DELETE FROM grouped_news_cache').run()
}
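// Illustrative note: the cache is a single-row snapshot (id = 1), so
// setCachedGroupedNews() always overwrites the previous entry and
// getCachedGroupedNews() treats anything older than CACHE_DURATION_MS
// as a miss. A quick sketch of the round trip:
//   setCachedGroupedNews([{ title: 'Example' }])
//   getCachedGroupedNews() // => { data: [{ title: 'Example' }], age: <ms since write> }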
// Source names to filter out from AI summaries
const SOURCE_NAMES = [
'ABC News', 'ABC', 'NPR', 'CNN', 'Reuters', 'NBC News', 'NBC',
'CBS News', 'CBS', 'NY Times', 'New York Times', 'NYT', 'AP News',
'Associated Press', 'AP', 'BBC', 'Guardian', 'The Guardian'
]
function replaceSourceNames(text) {
if (!text) return text
let result = text
// Sort by length descending to replace longer names first (e.g., "New York Times" before "NYT")
const sortedNames = [...SOURCE_NAMES].sort((a, b) => b.length - a.length)
for (const name of sortedNames) {
// Use word boundaries to only match whole words, not parts of other words
const regex = new RegExp(`\\b${name}\\b`, 'gi')
result = result.replace(regex, '[news]')
}
return result
}
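// Example (illustrative input/output):
//   replaceSourceNames('CNN and The Guardian covered the story')
//   // => '[news] and [news] covered the story'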
function sanitizeGroups(groups) {
return groups.map(group => {
const newTitle = replaceSourceNames(group.title)
const newSummary = replaceSourceNames(group.summary)
if (newTitle !== group.title || newSummary !== group.summary) {
console.log(`Replaced source names in group: "${group.title}"`)
}
return {
...group,
title: newTitle,
summary: newSummary,
}
})
}
app.use(cors())
const RSS_FEEDS = {
abc: 'https://abcnews.go.com/abcnews/topstories',
npr: 'https://feeds.npr.org/1001/rss.xml',
cnn: 'http://rss.cnn.com/rss/cnn_topstories.rss',
nbc: 'https://feeds.nbcnews.com/nbcnews/public/news',
cbs: 'https://www.cbsnews.com/latest/rss/main',
nytimes: 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
}
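// Adding a source (sketch, with a hypothetical feed URL):
//   guardian: 'https://www.theguardian.com/world/rss',
// ...and add its display name(s) to SOURCE_NAMES above so the AI
// summaries stay source-neutral.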
app.get('/api/news', async (req, res) => {
try {
const results = await Promise.allSettled(
Object.entries(RSS_FEEDS).map(async ([source, url]) => {
const feed = await parser.parseURL(url)
return {
source,
title: feed.title,
items: feed.items.map((item) => ({
title: item.title,
link: item.link,
pubDate: item.pubDate,
content: item.contentSnippet || item.content || '',
source,
image: extractImage(item),
})),
}
})
)
const feeds = results
.filter((r) => r.status === 'fulfilled')
.map((r) => r.value)
// Pair each result with its source before filtering, so rejected feeds
// are attributed to the correct source (filtering first would misalign the indexes)
const errors = results
.map((r, i) => ({ source: Object.keys(RSS_FEEDS)[i], result: r }))
.filter(({ result }) => result.status === 'rejected')
.map(({ source, result }) => ({ source, error: result.reason.message }))
res.json({ feeds, errors })
} catch (error) {
res.status(500).json({ error: error.message })
}
})
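// Example response shape (illustrative, trimmed):
// {
//   "feeds": [{ "source": "npr", "title": "NPR Topics: News", "items": [...] }],
//   "errors": [{ "source": "cnn", "error": "Status code 404" }]
// }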
// Endpoint to clear the cache
app.post('/api/clear-cache', (req, res) => {
clearCache()
console.log('Cache cleared by user')
res.json({ success: true, message: 'Cache cleared' })
})
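// Example (assuming the server is running on the default port):
//   curl -X POST http://localhost:5555/api/clear-cache
//   // => {"success":true,"message":"Cache cleared"}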
app.get('/api/grouped-news', async (req, res) => {
try {
// Check if user wants to force refresh
const forceRefresh = req.query.refresh === 'true'
if (forceRefresh) {
clearCache()
console.log('Force refresh requested - cache cleared')
}
// Check cache first
const cached = getCachedGroupedNews()
if (cached) {
const remainingMs = CACHE_DURATION_MS - cached.age
const remainingMins = Math.round(remainingMs / 60000)
console.log(`Serving cached grouped news (${remainingMins} minutes until refresh)`)
return res.json({ groups: cached.data, cached: true, cacheExpiresIn: remainingMins })
}
console.log('Cache miss - fetching RSS feeds...')
// Fetch all news first
const results = await Promise.allSettled(
Object.entries(RSS_FEEDS).map(async ([source, url]) => {
console.log(` Fetching ${source}...`)
try {
const feed = await parser.parseURL(url)
console.log(`${source}: ${feed.items.length} articles`)
return {
source,
items: feed.items.map((item) => ({
title: item.title,
link: item.link,
pubDate: item.pubDate,
content: item.contentSnippet || item.content || '',
source,
image: extractImage(item),
})),
}
} catch (err) {
console.log(`${source}: ${err.message}`)
throw err
}
})
)
const feedResults = results
.filter((r) => r.status === 'fulfilled')
.map((r) => r.value)
// Take up to MIN_PER_SOURCE newest articles from each source, then fill the rest by date
const MIN_PER_SOURCE = 5
const TOTAL_LIMIT = 50
let selectedArticles = []
const usedIds = new Set()
// First pass: take up to MIN_PER_SOURCE from each source (sorted by date)
for (const feed of feedResults) {
const sorted = [...feed.items].sort((a, b) => new Date(b.pubDate) - new Date(a.pubDate))
const toTake = sorted.slice(0, MIN_PER_SOURCE)
for (const article of toTake) {
const id = article.link
if (!usedIds.has(id)) {
usedIds.add(id)
selectedArticles.push(article)
}
}
}
// Second pass: fill remaining slots with newest articles across all sources
const allRemaining = feedResults
.flatMap((f) => f.items)
.filter((a) => !usedIds.has(a.link))
.sort((a, b) => new Date(b.pubDate) - new Date(a.pubDate))
const remaining = TOTAL_LIMIT - selectedArticles.length
selectedArticles.push(...allRemaining.slice(0, remaining))
// Final sort by date
const allArticles = selectedArticles.sort((a, b) => new Date(b.pubDate) - new Date(a.pubDate))
console.log(`Selected ${allArticles.length} articles (min ${MIN_PER_SOURCE}/source, then by date)`)
if (allArticles.length === 0) {
return res.json({ groups: [] })
}
// Send to OpenAI for grouping
const articlesForAI = allArticles.map((a, i) => ({
id: i,
title: a.title,
content: a.content?.slice(0, 200) || '',
source: a.source,
}))
console.log(`Sending ${articlesForAI.length} articles to OpenAI gpt-5-mini...`)
const openaiStartTime = Date.now()
const completion = await openai.chat.completions.create({
model: 'gpt-5-mini',
// Ask for a JSON-object response so the JSON.parse below can't trip on
// markdown fences (assumption: the model honors response_format)
response_format: { type: 'json_object' },
messages: [
{
role: 'system',
content: `You are a news analyst. Group articles that cover THE SAME SPECIFIC NEWS STORY together.
IMPORTANT RULES:
- Each group must contain articles about ONE specific news event or story
- Do NOT combine unrelated topics into a single group
- Do NOT create broad category groups (e.g., "Various Political News")
- Articles about different events should be in SEPARATE groups, even if they share a category
- It's better to have more specific groups than fewer broad ones
- If an article doesn't match any group, put it in its own single-article group
WRITING RULES:
- Write ORIGINAL titles - do NOT copy or closely paraphrase headlines from the source articles
- Write ORIGINAL summaries in your own words - do NOT copy sentences from the articles
- Synthesize information from multiple sources into a fresh, unique narrative
- Use different phrasing and sentence structure than the originals
- Never mention the news source names (ABC, CNN, NPR, etc.) in titles or summaries
Return JSON in this exact format:
{
"groups": [
{
"title": "Your original headline for this story (max 80 chars)",
"summary": "Your original summary synthesizing the story in your own words (max 500 chars)",
"articleIds": [0, 1, 2],
"category": "politics|business|technology|sports|entertainment|health|science|world|other"
}
]
}
Only return valid JSON.`
},
{
role: 'user',
content: JSON.stringify(articlesForAI)
}
],
})
const openaiDuration = ((Date.now() - openaiStartTime) / 1000).toFixed(2)
const aiResponse = JSON.parse(completion.choices[0].message.content)
const usage = completion.usage || {}
console.log(`✓ OpenAI response received in ${openaiDuration}s`)
console.log(` - Groups returned: ${aiResponse.groups.length}`)
console.log(` - Tokens: ${usage.prompt_tokens || 'N/A'} prompt, ${usage.completion_tokens || 'N/A'} completion, ${usage.total_tokens || 'N/A'} total`)
// Enrich groups with source articles and images
const enrichedGroups = aiResponse.groups.map((group) => {
const groupArticles = group.articleIds
.map((id) => allArticles[id])
.filter(Boolean)
const images = groupArticles
.map((a) => a.image)
.filter(Boolean)
const sources = [...new Set(groupArticles.map((a) => a.source))]
const links = groupArticles.map((a) => ({ title: a.title, link: a.link, source: a.source }))
return {
title: group.title,
summary: group.summary,
category: group.category,
image: images[0] || null,
sources,
articles: links,
articleCount: groupArticles.length,
}
})
// Replace any source name mentions with [news]
const sanitizedGroups = sanitizeGroups(enrichedGroups)
// Cache the results
setCachedGroupedNews(sanitizedGroups)
console.log(`Cached ${sanitizedGroups.length} news groups for 3 hours`)
res.json({ groups: sanitizedGroups, cached: false })
} catch (error) {
console.error('Grouped news error:', error)
res.status(500).json({ error: error.message })
}
})
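// Example calls (illustrative):
//   curl http://localhost:5555/api/grouped-news                 // served from cache if fresh
//   curl 'http://localhost:5555/api/grouped-news?refresh=true'  // bypass and rebuild the cache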
function extractImage(item) {
if (item.enclosure?.url) return item.enclosure.url
// Optional-chain through .$ as well: some feeds omit the attribute object
if (item['media:content']?.$?.url) return item['media:content'].$.url
if (item['media:thumbnail']?.$?.url) return item['media:thumbnail'].$.url
const contentMatch = (item.content || '').match(/<img[^>]+src="([^"]+)"/)
if (contentMatch) return contentMatch[1]
return null
}
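// Feed-item shapes this handles, in priority order (illustrative):
//   { enclosure: { url: 'https://example.com/a.jpg' } }
//   { 'media:content': { $: { url: 'https://example.com/b.jpg' } } }
//   { content: '<p><img src="https://example.com/c.jpg"></p>' }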
app.listen(PORT, () => {
console.log(`API server running on http://localhost:${PORT}`)
})
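// To run (assuming this file is saved as server.js and OPENAI_API_KEY
// is set in a .env file alongside it):
//   node server.js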