Files
aihot/backend/app/crawler/rss_fetcher.py
2026-05-24 01:16:07 +08:00

74 lines
2.5 KiB
Python

import logging
from datetime import datetime
from email.utils import parsedate_to_datetime
from typing import Optional
import feedparser
import httpx
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# HTTP request headers sent with every feed download.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PharmaIntelBot/1.0)"}
def _parse_date(raw: str) -> Optional[datetime]:
    """Parse a feed timestamp into a naive ``datetime`` expressed in UTC.

    Two formats are tried in order: RFC 2822 (via ``email.utils``), then
    ISO 8601 via ``datetime.fromisoformat`` (a trailing ``Z`` is accepted).

    A timestamp that carries a UTC offset is converted to UTC *before* the
    offset is dropped, so values coming from feeds in different time zones
    remain comparable. A timestamp without any offset is returned unchanged.

    Args:
        raw: Timestamp text taken from a feed entry; may be empty.

    Returns:
        A naive ``datetime``, or ``None`` when ``raw`` is empty, is not a
        string, or matches neither format.
    """
    from datetime import timezone  # local import: only this function needs it

    if not raw or not isinstance(raw, str):
        return None
    text = raw.strip()

    try:
        parsed = parsedate_to_datetime(text)
    except (TypeError, ValueError, IndexError, OverflowError):
        # Not RFC 2822 (older Pythons signal this with TypeError, newer
        # ones with ValueError); fall back to ISO 8601.
        try:
            parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            return None

    if parsed.tzinfo is not None:
        try:
            # Normalise to UTC so that dropping tzinfo below does not
            # silently discard the offset.
            parsed = parsed.astimezone(timezone.utc)
        except (OverflowError, ValueError):
            # The offset pushes the value outside datetime's valid range.
            return None
    return parsed.replace(tzinfo=None)
async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
    """Download an RSS/Atom feed and return its entries as plain dicts.

    The feed is fetched with ``httpx`` (30 s timeout, redirects followed,
    module-level ``HEADERS`` sent) and parsed with ``feedparser``. Any
    failure while downloading is logged as a warning and yields an empty
    list rather than an exception.

    Args:
        url: Address of the feed to download.
        max_items: Upper bound on the number of feed entries inspected.
            Entries lacking a title or a link are skipped but still count
            towards this limit. Values below zero are treated as zero.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``content``
        (truncated to 3000 characters) and ``published_at`` (the result of
        ``_parse_date``, possibly ``None``).
    """
    import io  # local import: keeps this function self-contained

    try:
        async with httpx.AsyncClient(headers=HEADERS, timeout=30, follow_redirects=True) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            # Keep the undecoded bytes: feedparser does its own encoding
            # detection (XML prolog + HTTP charset), which is lost if httpx
            # has already decoded the body with a guessed charset.
            payload = resp.content
            content_type = resp.headers.get("content-type")
    except Exception as e:
        # Best-effort boundary: one broken source must not abort a crawl.
        logger.warning("RSS fetch failed %s: %s", url, e)
        return []

    response_headers = {"content-type": content_type} if content_type else None
    # Wrapping the body in a stream guarantees feedparser treats it as feed
    # data and never as a URL or file path to open.
    feed = feedparser.parse(io.BytesIO(payload), response_headers=response_headers)

    items = []
    for entry in feed.entries[:max(max_items, 0)]:
        title = (entry.get("title") or "").strip()
        link = (entry.get("link") or "").strip()
        if not title or not link:
            continue

        # Prefer the summary; otherwise use the first content part, if any.
        content = entry.get("summary") or ""
        if not content:
            parts = entry.get("content") or []
            if parts:
                content = parts[0].get("value") or ""
        content = content.strip()

        published_raw = entry.get("published") or entry.get("updated") or ""
        items.append({
            "title": title,
            "url": link,
            "content": content[:3000],
            "published_at": _parse_date(published_raw),
        })

    logger.info("RSS %s: got %d items", url, len(items))
    return items
# Default news sources (entries can be added or removed from the admin page).
DEFAULT_SOURCES = [
    # Chinese-language feeds
    {
        "name": "国家药监局",
        "url": "https://www.nmpa.gov.cn/rss/yaopinxinxi.xml",
        "language": "zh",
        "category": "药品监管",
    },
    {
        "name": "丁香园",
        "url": "https://www.dxy.cn/bbs/feed.xml",
        "language": "zh",
        "category": "临床研究",
    },
    {
        "name": "医学界",
        "url": "https://www.yxj.org.cn/rss.xml",
        "language": "zh",
        "category": "行业动态",
    },
    # English-language feeds
    {
        "name": "STAT News",
        "url": "https://www.statnews.com/feed/",
        "language": "en",
        "category": "临床研究",
    },
    {
        "name": "FiercePharma",
        "url": "https://www.fiercepharma.com/rss/xml",
        "language": "en",
        "category": "行业动态",
    },
    {
        "name": "FDA News",
        "url": "https://www.fda.gov/about-fda/contact-fda/stay-informed/rss-feeds/fda-news-feed/rss.xml",
        "language": "en",
        "category": "药品监管",
    },
]