// pre-repos/news-feeds/news-app/server.js

import 'dotenv/config'
import express from 'express'
import cors from 'cors'
import Parser from 'rss-parser'
import OpenAI from 'openai'
import Database from 'better-sqlite3'

// Port passed to app.listen() at the bottom of this file.
const PORT = 3001

// Express application that every /api route in this file is registered on.
const app = express()

// Feed parser used by the routes below to download and parse each RSS URL.
const parser = new Parser()

// OpenAI client; the API key is read from the environment (see the
// `dotenv/config` import at the top of the file).
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })

// How long a cached grouped-news payload stays valid: 3 hours, in milliseconds.
const CACHE_DURATION_MS = 1000 * 60 * 60 * 3

// Schema for the cache table. The cache helpers below only ever read and
// write the row with id = 1; `data` holds JSON text and `created_at` holds a
// Date.now() timestamp.
const CREATE_CACHE_TABLE_SQL = `
  CREATE TABLE IF NOT EXISTS grouped_news_cache (
    id INTEGER PRIMARY KEY,
    data TEXT NOT NULL,
    created_at INTEGER NOT NULL
  )
`

// Initialize SQLite database for caching
const db = new Database('cache.db')
db.exec(CREATE_CACHE_TABLE_SQL)

/**
 * Reads the cached grouped-news payload (row id = 1) from SQLite.
 *
 * @returns {{ data: any, age: number } | null} The parsed payload together
 *   with its age in milliseconds, or `null` when there is no row, the row is
 *   older than CACHE_DURATION_MS, or the stored JSON cannot be parsed.
 */
function getCachedGroupedNews() {
  const row = db.prepare('SELECT data, created_at FROM grouped_news_cache WHERE id = 1').get()
  if (!row) return null

  const age = Date.now() - row.created_at
  if (age > CACHE_DURATION_MS) return null

  try {
    return { data: JSON.parse(row.data), age }
  } catch (err) {
    // A corrupted row must not fail every request until it expires; report a
    // cache miss so the caller regenerates (and overwrites) the entry.
    console.warn(`Ignoring unreadable grouped news cache entry: ${err.message}`)
    return null
  }
}

/**
 * Stores `data` as the single cached grouped-news entry (row id = 1),
 * replacing any previous entry and stamping it with the current time.
 *
 * @param {any} data - JSON-serializable payload to cache.
 */
function setCachedGroupedNews(data) {
  const upsert = db.prepare('INSERT OR REPLACE INTO grouped_news_cache (id, data, created_at) VALUES (1, ?, ?)')
  const serialized = JSON.stringify(data)
  const createdAt = Date.now()
  upsert.run(serialized, createdAt)
}

/**
 * Deletes every row from the grouped-news cache table.
 */
function clearCache() {
  const deleteAllRows = db.prepare('DELETE FROM grouped_news_cache')
  deleteAllRows.run()
}

// Source names to filter out from AI summaries
const SOURCE_NAMES = [
  'ABC News', 'ABC', 'NPR', 'CNN', 'Reuters', 'NBC News', 'NBC',
  'CBS News', 'CBS', 'NY Times', 'New York Times', 'NYT', 'AP News',
  'Associated Press', 'AP', 'BBC', 'Guardian', 'The Guardian'
]

/**
 * Escapes regular-expression metacharacters so `value` matches literally.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeRegExp(value) {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

// One whole-word, case-insensitive pattern per source name, compiled once.
// Longest names come first so that e.g. "ABC News" is replaced as a unit
// before the shorter "ABC" gets a chance to match part of it.
const SOURCE_NAME_PATTERNS = [...SOURCE_NAMES]
  .sort((a, b) => b.length - a.length)
  .map((name) => new RegExp(`\\b${escapeRegExp(name)}\\b`, 'gi'))

/**
 * Replaces every whole-word, case-insensitive mention of a known news source
 * with the placeholder "[news]".
 *
 * @param {*} text - Text to sanitize. Non-string values (including null and
 *   undefined) are returned unchanged.
 * @returns {*} The sanitized string, or the input itself if it is not a string.
 */
function replaceSourceNames(text) {
  // Model output is not guaranteed to be a string; don't call .replace on it.
  if (typeof text !== 'string') return text

  return SOURCE_NAME_PATTERNS.reduce(
    (result, pattern) => result.replace(pattern, '[news]'),
    text
  )
}

/**
 * Returns copies of the given groups whose `title` and `summary` have been
 * passed through replaceSourceNames. All other fields are carried over
 * unchanged, and the input objects are not mutated. Logs a line for each
 * group whose title or summary was altered.
 *
 * @param {Array<object>} groups - Groups with `title` and `summary` fields.
 * @returns {Array<object>} New array of sanitized group objects.
 */
function sanitizeGroups(groups) {
  const sanitized = []

  for (const group of groups) {
    const title = replaceSourceNames(group.title)
    const summary = replaceSourceNames(group.summary)

    const untouched = title === group.title && summary === group.summary
    if (!untouched) {
      console.log(`Replaced source names in group: "${group.title}"`)
    }

    sanitized.push({ ...group, title, summary })
  }

  return sanitized
}

// RSS feed URL for each news source, keyed by the short source id that is
// attached to every article taken from that feed.
const RSS_FEEDS = {
  abc: 'https://abcnews.go.com/abcnews/topstories',
  npr: 'https://feeds.npr.org/1001/rss.xml',
  cnn: 'http://rss.cnn.com/rss/cnn_topstories.rss',
  nbc: 'https://feeds.nbcnews.com/nbcnews/public/news',
  cbs: 'https://www.cbsnews.com/latest/rss/main',
  nytimes: 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
}

// Install the cors middleware (default options) ahead of every route below.
app.use(cors())

// GET /api/news
// Fetches every feed in RSS_FEEDS in parallel and responds with
//   { feeds:  [{ source, title, items: [...] }],   // feeds that loaded
//     errors: [{ source, error }] }                // feeds that failed
// One failing feed does not fail the request.
app.get('/api/news', async (req, res) => {
  try {
    const feedEntries = Object.entries(RSS_FEEDS)

    const results = await Promise.allSettled(
      feedEntries.map(async ([source, url]) => {
        const feed = await parser.parseURL(url)
        return {
          source,
          title: feed.title,
          items: feed.items.map((item) => ({
            title: item.title,
            link: item.link,
            pubDate: item.pubDate,
            content: item.contentSnippet || item.content || '',
            source,
            image: extractImage(item),
          })),
        }
      })
    )

    const feeds = []
    const errors = []

    // Promise.allSettled keeps input order, so results[index] belongs to
    // feedEntries[index]. Pairing them up here (rather than after filtering)
    // is what attributes each failure to the feed that actually failed.
    results.forEach((result, index) => {
      if (result.status === 'fulfilled') {
        feeds.push(result.value)
        return
      }
      const [source] = feedEntries[index]
      // The rejection reason is not guaranteed to be an Error instance.
      errors.push({ source, error: result.reason?.message ?? String(result.reason) })
    })

    res.json({ feeds, errors })
  } catch (error) {
    res.status(500).json({ error: error.message })
  }
})

// POST /api/clear-cache
// Drops the cached grouped-news payload and acknowledges with a JSON body.
app.post('/api/clear-cache', (_req, res) => {
  clearCache()
  console.log('Cache cleared by user')

  const acknowledgement = { success: true, message: 'Cache cleared' }
  res.json(acknowledgement)
})

/**
 * Converts an article's `pubDate` into epoch milliseconds for sorting.
 * Missing or unparsable dates become 0 so such articles sort as the oldest
 * instead of producing NaN inside the sort comparator.
 *
 * @param {{ pubDate?: string }} article
 * @returns {number}
 */
function articleTimestamp(article) {
  const time = new Date(article.pubDate).getTime()
  return Number.isNaN(time) ? 0 : time
}

/**
 * Sort comparator that orders articles newest-first.
 */
function byNewestFirst(a, b) {
  return articleTimestamp(b) - articleTimestamp(a)
}

// GET /api/grouped-news[?refresh=true]
// Responds with AI-generated story groups built from all RSS feeds:
//   { groups, cached, cacheExpiresIn? }
// Results are served from the SQLite cache while it is fresh; `refresh=true`
// clears the cache first. Any failure is reported as HTTP 500 { error }.
app.get('/api/grouped-news', async (req, res) => {
  try {
    // Check if user wants to force refresh
    const forceRefresh = req.query.refresh === 'true'

    if (forceRefresh) {
      clearCache()
      console.log('Force refresh requested - cache cleared')
    }

    // Check cache first
    const cached = getCachedGroupedNews()
    if (cached) {
      const remainingMs = CACHE_DURATION_MS - cached.age
      const remainingMins = Math.round(remainingMs / 60000)
      console.log(`Serving cached grouped news (${remainingMins} minutes until refresh)`)
      return res.json({ groups: cached.data, cached: true, cacheExpiresIn: remainingMins })
    }

    console.log('Cache miss - fetching RSS feeds...')

    // Fetch all news first. Feeds that fail are logged and then dropped.
    const results = await Promise.allSettled(
      Object.entries(RSS_FEEDS).map(async ([source, url]) => {
        console.log(`  Fetching ${source}...`)
        try {
          const feed = await parser.parseURL(url)
          console.log(`  ✓ ${source}: ${feed.items.length} articles`)
          return {
            source,
            items: feed.items.map((item) => ({
              title: item.title,
              link: item.link,
              pubDate: item.pubDate,
              content: item.contentSnippet || item.content || '',
              source,
              image: extractImage(item),
            })),
          }
        } catch (err) {
          console.log(`  ✗ ${source}: ${err.message}`)
          throw err
        }
      })
    )

    const feedResults = results
      .filter((r) => r.status === 'fulfilled')
      .map((r) => r.value)

    // Ensure at least 5 articles from each source, then fill rest by date
    const MIN_PER_SOURCE = 5
    const TOTAL_LIMIT = 50

    const selectedArticles = []
    const usedIds = new Set()

    // First pass: take up to MIN_PER_SOURCE from each source (sorted by date)
    for (const feed of feedResults) {
      const newestOfSource = [...feed.items].sort(byNewestFirst).slice(0, MIN_PER_SOURCE)
      for (const article of newestOfSource) {
        // The article link doubles as its identity for de-duplication.
        const id = article.link
        if (!usedIds.has(id)) {
          usedIds.add(id)
          selectedArticles.push(article)
        }
      }
    }

    // Second pass: fill remaining slots with newest articles across all sources
    const allRemaining = feedResults
      .flatMap((f) => f.items)
      .filter((a) => !usedIds.has(a.link))
      .sort(byNewestFirst)

    // Clamp at 0: a negative end index would make slice() drop items from the
    // end of the list instead of selecting none.
    const remaining = Math.max(0, TOTAL_LIMIT - selectedArticles.length)
    selectedArticles.push(...allRemaining.slice(0, remaining))

    // Final sort by date. The position in this array is the article id that
    // is sent to the model and later used to resolve `articleIds`.
    const allArticles = selectedArticles.sort(byNewestFirst)

    console.log(`Selected ${allArticles.length} articles (min ${MIN_PER_SOURCE}/source, then by date)`)

    if (allArticles.length === 0) {
      return res.json({ groups: [], cached: false })
    }

    // Send to OpenAI for grouping
    const articlesForAI = allArticles.map((a, i) => ({
      id: i,
      title: a.title,
      content: a.content?.slice(0, 200) || '',
      source: a.source,
    }))

    console.log(`Sending ${articlesForAI.length} articles to OpenAI gpt-5-mini...`)
    const completion = await openai.chat.completions.create({
      model: 'gpt-5-mini',
      messages: [
        {
          role: 'system',
          content: `You are a news analyst. Group articles that cover THE SAME SPECIFIC NEWS STORY together.

IMPORTANT RULES:
- Each group must contain articles about ONE specific news event or story
- Do NOT combine unrelated topics into a single group
- Do NOT create broad category groups (e.g., "Various Political News")
- Articles about different events should be in SEPARATE groups, even if they share a category
- It's better to have more specific groups than fewer broad ones
- If an article doesn't match any group, put it in its own single-article group

WRITING RULES:
- Write ORIGINAL titles - do NOT copy or closely paraphrase headlines from the source articles
- Write ORIGINAL summaries in your own words - do NOT copy sentences from the articles
- Synthesize information from multiple sources into a fresh, unique narrative
- Use different phrasing and sentence structure than the originals
- Never mention the news source names (ABC, CNN, NPR, etc.) in titles or summaries

Return JSON in this exact format:
{
  "groups": [
    {
      "title": "Your original headline for this story (max 80 chars)",
      "summary": "Your original summary synthesizing the story in your own words (max 500 chars)",
      "articleIds": [0, 1, 2],
      "category": "politics|business|technology|sports|entertainment|health|science|world|other"
    }
  ]
}
Only return valid JSON.`
        },
        {
          role: 'user',
          content: JSON.stringify(articlesForAI)
        }
      ],
    })

    const aiResponse = JSON.parse(completion.choices[0].message.content)
    // The model's output shape is not guaranteed; fail with a clear message
    // rather than an opaque TypeError further down.
    if (!Array.isArray(aiResponse?.groups)) {
      throw new Error('OpenAI response did not contain a "groups" array')
    }
    console.log(`✓ OpenAI returned ${aiResponse.groups.length} groups`)

    // Enrich groups with source articles and images
    const enrichedGroups = aiResponse.groups.map((group) => {
      // Tolerate a group without a usable id list: it simply gets no articles.
      const articleIds = Array.isArray(group.articleIds) ? group.articleIds : []

      // Ids that do not resolve to a selected article are dropped.
      const groupArticles = articleIds
        .map((id) => allArticles[id])
        .filter(Boolean)

      const images = groupArticles
        .map((a) => a.image)
        .filter(Boolean)

      const sources = [...new Set(groupArticles.map((a) => a.source))]
      const links = groupArticles.map((a) => ({ title: a.title, link: a.link, source: a.source }))

      return {
        title: group.title,
        summary: group.summary,
        category: group.category,
        image: images[0] || null,
        sources,
        articles: links,
        articleCount: groupArticles.length,
      }
    })

    // Replace any source name mentions with [news]
    const sanitizedGroups = sanitizeGroups(enrichedGroups)

    // Cache the results
    setCachedGroupedNews(sanitizedGroups)
    console.log(`Cached ${sanitizedGroups.length} grouped news for 3 hours`)

    res.json({ groups: sanitizedGroups, cached: false })
  } catch (error) {
    console.error('Grouped news error:', error)
    res.status(500).json({ error: error.message })
  }
})

/**
 * Picks an image URL for a parsed feed item, trying in order:
 *   1. `item.enclosure.url`
 *   2. `item['media:content'].$.url`
 *   3. `item['media:thumbnail'].$.url`
 *   4. the `src` of the first matching `<img>` tag in `item.content`
 *
 * @param {object} item - A feed item as produced by the parser.
 * @returns {string | null} The first URL found, or `null` if there is none.
 */
function extractImage(item) {
  if (item.enclosure?.url) return item.enclosure.url

  // `?.` on `$` as well: a media element without an attribute bag must fall
  // through to the next strategy instead of throwing.
  const mediaContentUrl = item['media:content']?.$?.url
  if (mediaContentUrl) return mediaContentUrl

  const mediaThumbnailUrl = item['media:thumbnail']?.$?.url
  if (mediaThumbnailUrl) return mediaThumbnailUrl

  // Only scan string content; accept single- or double-quoted src attributes
  // and any tag/attribute letter case.
  const html = typeof item.content === 'string' ? item.content : ''
  const imgMatch = html.match(/<img[^>]+src=(?:"([^"]+)"|'([^']+)')/i)
  if (imgMatch) return imgMatch[1] ?? imgMatch[2]

  return null
}

// Start the HTTP server and log its address once it is listening.
app.listen(PORT, function onListening() {
  console.log(`API server running on http://localhost:${PORT}`)
})