// File listing metadata: 325 lines, 10 KiB, JavaScript
import 'dotenv/config'
|
|
import express from 'express'
|
|
import cors from 'cors'
|
|
import Parser from 'rss-parser'
|
|
import OpenAI from 'openai'
|
|
import Database from 'better-sqlite3'
|
|
|
|
// Process-wide singletons shared by every route handler below.

// Express application that all routes and middleware are registered on.
const app = express()

// RSS parser instance used to download and parse each feed URL.
const parser = new Parser()

// OpenAI client; the key is read from the OPENAI_API_KEY environment variable
// (`dotenv/config` is imported at the top of this file).
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })

// Port passed to app.listen() at the bottom of the file.
const PORT = 3001
|
|
|
|
// How long a cached grouped-news payload is considered fresh: 3 hours, in milliseconds.
const CACHE_DURATION_MS = 3 * 60 * 60 * 1000

// SQLite database used to cache the grouped-news response. The cache table is
// created on startup if it does not exist; each row stores the serialized
// payload (`data`) and the time it was written (`created_at`).
const db = new Database('cache.db')
db.exec(`
  CREATE TABLE IF NOT EXISTS grouped_news_cache (
    id INTEGER PRIMARY KEY,
    data TEXT NOT NULL,
    created_at INTEGER NOT NULL
  )
`)
|
|
|
|
/**
 * Read the grouped-news payload from the SQLite cache (row id 1).
 *
 * @returns {{ data: any, age: number } | null} The parsed payload together with
 *   its age in milliseconds, or `null` when there is no row, the row is older
 *   than CACHE_DURATION_MS, or the stored JSON cannot be parsed.
 */
function getCachedGroupedNews() {
  const row = db
    .prepare('SELECT data, created_at FROM grouped_news_cache WHERE id = 1')
    .get()
  if (!row) return null

  const age = Date.now() - row.created_at
  if (age > CACHE_DURATION_MS) return null

  try {
    return { data: JSON.parse(row.data), age }
  } catch (err) {
    // A corrupt payload would otherwise make every request throw until the row
    // expires. Report it as a miss so the caller rebuilds and overwrites it.
    console.error('Ignoring unreadable grouped news cache entry:', err.message)
    return null
  }
}
|
|
|
|
/**
 * Store the grouped-news payload in the cache as row id 1, replacing whatever
 * was there before. The payload is serialized to JSON and stamped with the
 * current time in epoch milliseconds.
 *
 * @param {*} data - JSON-serializable payload to cache.
 */
function setCachedGroupedNews(data) {
  const upsert = db.prepare(
    'INSERT OR REPLACE INTO grouped_news_cache (id, data, created_at) VALUES (1, ?, ?)'
  )
  const payload = JSON.stringify(data)
  upsert.run(payload, Date.now())
}
|
|
|
|
/** Delete every row from the grouped-news cache table. */
function clearCache() {
  const wipeAll = db.prepare('DELETE FROM grouped_news_cache')
  wipeAll.run()
}
|
|
|
|
// Source names to filter out from AI summaries
const SOURCE_NAMES = [
  'ABC News', 'ABC', 'NPR', 'CNN', 'Reuters', 'NBC News', 'NBC',
  'CBS News', 'CBS', 'NY Times', 'New York Times', 'NYT', 'AP News',
  'Associated Press', 'AP', 'BBC', 'Guardian', 'The Guardian'
]

// One whole-word, case-insensitive pattern per source name, compiled once at
// load time instead of on every call. Longest names come first so that
// "ABC News" is replaced before the shorter "ABC" can match part of it.
// Regex metacharacters are escaped so a future name such as "U.S. News"
// matches literally.
// NOTE(review): the `i` flag also matches ordinary lowercase words such as
// "guardian" or "ap" — confirm that is intended.
const SOURCE_NAME_PATTERNS = [...SOURCE_NAMES]
  .sort((a, b) => b.length - a.length)
  .map((name) => {
    const escaped = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
    return new RegExp(`\\b${escaped}\\b`, 'gi')
  })

/**
 * Replace every whole-word mention of a known news source with "[news]".
 *
 * @param {*} text - Text to sanitize. Empty strings and non-string values
 *   (null, undefined, numbers, ...) are returned unchanged.
 * @returns {*} The sanitized string, or the input itself when it is not a
 *   non-empty string.
 */
function replaceSourceNames(text) {
  if (typeof text !== 'string' || text === '') return text

  // String.prototype.replace with a global regex starts from index 0 on every
  // call, so sharing the precompiled patterns between calls is safe.
  return SOURCE_NAME_PATTERNS.reduce(
    (result, pattern) => result.replace(pattern, '[news]'),
    text
  )
}
|
|
|
|
/**
 * Build a new list of groups whose `title` and `summary` have been passed
 * through replaceSourceNames(). All other fields are copied as-is. A log line
 * is written for each group where either field changed.
 *
 * @param {Array<object>} groups - Groups with `title` and `summary` fields.
 * @returns {Array<object>} New group objects; the inputs are not mutated.
 */
function sanitizeGroups(groups) {
  const cleaned = []

  for (const group of groups) {
    const title = replaceSourceNames(group.title)
    const summary = replaceSourceNames(group.summary)

    const wasChanged = title !== group.title || summary !== group.summary
    if (wasChanged) {
      console.log(`Replaced source names in group: "${group.title}"`)
    }

    cleaned.push({ ...group, title, summary })
  }

  return cleaned
}
|
|
|
|
// RSS feed URL for each supported source. The key is the short source id that
// the route handlers attach to every article they return.
// NOTE(review): the CNN feed is the only one served over plain http — confirm
// whether an https endpoint is available.
const RSS_FEEDS = {
  abc: 'https://abcnews.go.com/abcnews/topstories',
  npr: 'https://feeds.npr.org/1001/rss.xml',
  cnn: 'http://rss.cnn.com/rss/cnn_topstories.rss',
  nbc: 'https://feeds.nbcnews.com/nbcnews/public/news',
  cbs: 'https://www.cbsnews.com/latest/rss/main',
  nytimes: 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
}

// Register the cors middleware (default options) ahead of the routes.
app.use(cors())
|
|
|
|
// GET /api/news
// Fetches every feed in RSS_FEEDS in parallel and responds with
// `{ feeds, errors }`: `feeds` holds one entry per feed that parsed
// successfully (source id, feed title, normalized items) and `errors` holds
// `{ source, error }` for each feed that failed. Responds 500 with
// `{ error }` if anything unexpected throws.
app.get('/api/news', async (req, res) => {
  try {
    const feedEntries = Object.entries(RSS_FEEDS)

    const results = await Promise.allSettled(
      feedEntries.map(async ([source, url]) => {
        const feed = await parser.parseURL(url)
        return {
          source,
          title: feed.title,
          items: feed.items.map((item) => ({
            title: item.title,
            link: item.link,
            pubDate: item.pubDate,
            content: item.contentSnippet || item.content || '',
            source,
            image: extractImage(item),
          })),
        }
      })
    )

    const feeds = []
    const errors = []

    // Promise.allSettled preserves input order, so results[i] belongs to
    // feedEntries[i]. The source must be looked up with this unfiltered index:
    // indexing after filtering out the fulfilled results would attribute a
    // failure to the wrong feed.
    results.forEach((result, i) => {
      if (result.status === 'fulfilled') {
        feeds.push(result.value)
        return
      }
      const [source] = feedEntries[i]
      // A rejection reason is not guaranteed to be an Error instance.
      const error = result.reason?.message ?? String(result.reason)
      errors.push({ source, error })
    })

    res.json({ feeds, errors })
  } catch (error) {
    res.status(500).json({ error: error.message })
  }
})
|
|
|
|
// POST /api/clear-cache
// Empties the grouped-news cache, logs that it happened, and acknowledges
// with a small JSON body.
app.post('/api/clear-cache', (_req, res) => {
  clearCache()
  console.log('Cache cleared by user')

  const acknowledgement = { success: true, message: 'Cache cleared' }
  res.json(acknowledgement)
})
|
|
|
|
// System prompt for the grouping request: how to cluster the articles, how to
// write titles and summaries, and the JSON shape the model must return.
const GROUPING_SYSTEM_PROMPT = `You are a news analyst. Group articles that cover THE SAME SPECIFIC NEWS STORY together.

IMPORTANT RULES:
- Each group must contain articles about ONE specific news event or story
- Do NOT combine unrelated topics into a single group
- Do NOT create broad category groups (e.g., "Various Political News")
- Articles about different events should be in SEPARATE groups, even if they share a category
- It's better to have more specific groups than fewer broad ones
- If an article doesn't match any group, put it in its own single-article group

WRITING RULES:
- Write ORIGINAL titles - do NOT copy or closely paraphrase headlines from the source articles
- Write ORIGINAL summaries in your own words - do NOT copy sentences from the articles
- Synthesize information from multiple sources into a fresh, unique narrative
- Use different phrasing and sentence structure than the originals
- Never mention the news source names (ABC, CNN, NPR, etc.) in titles or summaries

Return JSON in this exact format:
{
  "groups": [
    {
      "title": "Your original headline for this story (max 80 chars)",
      "summary": "Your original summary synthesizing the story in your own words (max 500 chars)",
      "articleIds": [0, 1, 2],
      "category": "politics|business|technology|sports|entertainment|health|science|world|other"
    }
  ]
}
Only return valid JSON.`

// Publication time in epoch milliseconds. A missing or unparseable pubDate
// becomes 0 (oldest) so sort comparators never return NaN.
function groupedNewsPublishedAt(article) {
  const ms = new Date(article.pubDate).getTime()
  return Number.isNaN(ms) ? 0 : ms
}

// Comparator that orders articles newest first.
function groupedNewsByNewest(a, b) {
  return groupedNewsPublishedAt(b) - groupedNewsPublishedAt(a)
}

// De-duplication key for an article: its link, or source + title when the
// feed item has no link (so link-less items are not all treated as one).
function groupedNewsArticleKey(article) {
  return article.link || `${article.source}:${article.title}`
}

// Pick the articles to send for grouping: up to `minPerSource` newest articles
// from every feed, then the newest remaining articles across all feeds until
// `totalLimit` is reached. The result is sorted newest first.
function selectArticlesForGrouping(feedResults, minPerSource, totalLimit) {
  const selected = []
  const usedKeys = new Set()

  // First pass: guarantee each source its share, regardless of the total limit.
  for (const feed of feedResults) {
    const newest = [...feed.items].sort(groupedNewsByNewest).slice(0, minPerSource)
    for (const article of newest) {
      const key = groupedNewsArticleKey(article)
      if (usedKeys.has(key)) continue
      usedKeys.add(key)
      selected.push(article)
    }
  }

  // Second pass: fill the remaining slots by date. Looping (rather than
  // slicing by a computed count) stays correct when the first pass already
  // met or exceeded the limit, and skips duplicates shared between feeds.
  const byDate = feedResults.flatMap((feed) => feed.items).sort(groupedNewsByNewest)
  for (const article of byDate) {
    if (selected.length >= totalLimit) break
    const key = groupedNewsArticleKey(article)
    if (usedKeys.has(key)) continue
    usedKeys.add(key)
    selected.push(article)
  }

  return selected.sort(groupedNewsByNewest)
}

// Extract and validate the `groups` array from a chat completion. Throws an
// Error with a descriptive message when the content is empty, is not JSON, or
// has no `groups` array.
function parseGroupingResponse(completion) {
  const content = completion?.choices?.[0]?.message?.content
  if (typeof content !== 'string' || content.trim() === '') {
    throw new Error('OpenAI returned an empty response')
  }

  let parsed
  try {
    parsed = JSON.parse(content)
  } catch (err) {
    throw new Error(`OpenAI response was not valid JSON: ${err.message}`)
  }

  if (!Array.isArray(parsed?.groups)) {
    throw new Error('OpenAI response is missing the "groups" array')
  }
  return parsed.groups
}

// Turn the model's groups into API groups by resolving `articleIds` (indexes
// into `allArticles`) to the real articles. Malformed groups, ids that are not
// valid indexes, and groups that end up with no articles are dropped.
function buildGroupsFromAiResponse(aiGroups, allArticles) {
  return aiGroups
    .filter((group) => group !== null && typeof group === 'object')
    .map((group) => {
      const ids = Array.isArray(group.articleIds) ? [...new Set(group.articleIds)] : []
      const groupArticles = ids
        .map((id) => {
          const index = typeof id === 'string' ? Number.parseInt(id, 10) : id
          return Number.isInteger(index) ? allArticles[index] : undefined
        })
        .filter(Boolean)

      const firstImage = groupArticles.map((a) => a.image).find(Boolean)

      return {
        title: group.title,
        summary: group.summary,
        category: group.category,
        image: firstImage || null,
        sources: [...new Set(groupArticles.map((a) => a.source))],
        articles: groupArticles.map((a) => ({ title: a.title, link: a.link, source: a.source })),
        articleCount: groupArticles.length,
      }
    })
    .filter((group) => group.articleCount > 0)
}

// GET /api/grouped-news[?refresh=true]
// Serves AI-grouped news. A fresh cache entry is returned as
// `{ groups, cached: true, cacheExpiresIn }`; otherwise the feeds are fetched,
// a subset of articles is grouped by the model, the result is sanitized and
// cached, and `{ groups, cached: false }` is returned. `refresh=true` clears
// the cache first. Any failure responds 500 with `{ error }`.
app.get('/api/grouped-news', async (req, res) => {
  try {
    // Check if user wants to force refresh
    const forceRefresh = req.query.refresh === 'true'
    if (forceRefresh) {
      clearCache()
      console.log('Force refresh requested - cache cleared')
    }

    // Check cache first
    const cached = getCachedGroupedNews()
    if (cached) {
      const remainingMs = CACHE_DURATION_MS - cached.age
      const remainingMins = Math.round(remainingMs / 60000)
      console.log(`Serving cached grouped news (${remainingMins} minutes until refresh)`)
      return res.json({ groups: cached.data, cached: true, cacheExpiresIn: remainingMins })
    }

    console.log('Cache miss - fetching RSS feeds...')

    // Fetch all feeds in parallel; a failing feed is logged and skipped.
    const results = await Promise.allSettled(
      Object.entries(RSS_FEEDS).map(async ([source, url]) => {
        console.log(` Fetching ${source}...`)
        try {
          const feed = await parser.parseURL(url)
          console.log(` ✓ ${source}: ${feed.items.length} articles`)
          return {
            source,
            items: feed.items.map((item) => ({
              title: item.title,
              link: item.link,
              pubDate: item.pubDate,
              content: item.contentSnippet || item.content || '',
              source,
              image: extractImage(item),
            })),
          }
        } catch (err) {
          console.log(` ✗ ${source}: ${err.message}`)
          throw err
        }
      })
    )

    const feedResults = results
      .filter((r) => r.status === 'fulfilled')
      .map((r) => r.value)

    // Ensure at least 5 articles from each source, then fill rest by date
    const MIN_PER_SOURCE = 5
    const TOTAL_LIMIT = 50
    const allArticles = selectArticlesForGrouping(feedResults, MIN_PER_SOURCE, TOTAL_LIMIT)

    console.log(`Selected ${allArticles.length} articles (min ${MIN_PER_SOURCE}/source, then by date)`)

    if (allArticles.length === 0) {
      // Nothing to group; not cached, so the next request retries the feeds.
      return res.json({ groups: [], cached: false })
    }

    // The model only sees a compact view of each article; `id` is the index
    // into allArticles that it must echo back in `articleIds`.
    const articlesForAI = allArticles.map((a, i) => ({
      id: i,
      title: a.title,
      content: a.content?.slice(0, 200) || '',
      source: a.source,
    }))

    console.log(`Sending ${articlesForAI.length} articles to OpenAI gpt-5-mini...`)
    const completion = await openai.chat.completions.create({
      model: 'gpt-5-mini',
      messages: [
        { role: 'system', content: GROUPING_SYSTEM_PROMPT },
        { role: 'user', content: JSON.stringify(articlesForAI) },
      ],
    })

    const aiGroups = parseGroupingResponse(completion)
    console.log(`✓ OpenAI returned ${aiGroups.length} groups`)

    // Enrich groups with source articles and images
    const enrichedGroups = buildGroupsFromAiResponse(aiGroups, allArticles)

    // Replace any source name mentions with [news]
    const sanitizedGroups = sanitizeGroups(enrichedGroups)

    // Cache the results
    setCachedGroupedNews(sanitizedGroups)
    console.log(`Cached ${sanitizedGroups.length} grouped news for 3 hours`)

    res.json({ groups: sanitizedGroups, cached: false })
  } catch (error) {
    console.error('Grouped news error:', error)
    res.status(500).json({ error: error.message })
  }
})
|
|
|
|
/**
 * Find an image URL for a parsed feed item.
 *
 * Checks, in order: `enclosure.url`, `media:content.$.url`,
 * `media:thumbnail.$.url`, then the `src` of the first `<img>` tag found in
 * the item's HTML `content`.
 *
 * @param {object} item - Feed item as produced by the RSS parser.
 * @returns {string|null} The first image URL found, or null when there is none.
 */
function extractImage(item) {
  if (!item) return null

  // `?.$?.url` rather than `?.$.url`: a media element without an attribute
  // bag (`$`) must fall through to the next candidate instead of throwing.
  const direct = [
    item.enclosure?.url,
    item['media:content']?.$?.url,
    item['media:thumbnail']?.$?.url,
  ].find(Boolean)
  if (direct) return direct

  // Accept both single- and double-quoted src attributes; the backreference
  // makes the closing quote match the opening one.
  const html = typeof item.content === 'string' ? item.content : ''
  const imgMatch = html.match(/<img[^>]+src=(["'])(.+?)\1/i)
  return imgMatch ? imgMatch[2] : null
}
|
|
|
|
// Start the HTTP server on PORT and log the local URL from the listen callback.
app.listen(PORT, () => {
  const localUrl = `http://localhost:${PORT}`
  console.log(`API server running on ${localUrl}`)
})
|