"""RSS feed fetching helpers and the default news-source list."""

import logging
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import Optional

import feedparser
import httpx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default HTTP headers sent with every feed request made by this module.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; PharmaIntelBot/1.0)",
}

def _parse_date(raw: str) -> Optional[datetime]:
|
|
if not raw:
|
|
return None
|
|
try:
|
|
return parsedate_to_datetime(raw).replace(tzinfo=None)
|
|
except Exception:
|
|
pass
|
|
try:
|
|
return datetime.fromisoformat(raw.replace("Z", "+00:00")).replace(tzinfo=None)
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
    """Download a feed and return its entries as plain dicts.

    The feed is fetched with ``httpx`` (module ``HEADERS``, 30 s timeout,
    redirects followed) and parsed with ``feedparser``. Entries without a
    title or a link are skipped.

    Args:
        url: Address of the feed.
        max_items: Maximum number of feed entries to examine; values below
            zero are treated as zero.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``content`` (at most
        3000 characters) and ``published_at`` (result of ``_parse_date``).
        An empty list is returned when the download fails.
    """
    try:
        async with httpx.AsyncClient(
            headers=HEADERS, timeout=30, follow_redirects=True
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            text = resp.text
    except Exception as e:
        # Best-effort boundary: one broken source must not abort the caller.
        logger.warning("RSS fetch failed %s: %s", url, e)
        return []

    # NOTE(review): feedparser.parse() also accepts URLs and file paths as its
    # argument - confirm that a response body consisting of a bare URL/path
    # cannot trigger a second fetch; wrapping the bytes in io.BytesIO would
    # rule that out.
    feed = feedparser.parse(text)

    items = []
    # Clamp so a negative limit yields nothing instead of "all but the last N".
    for entry in feed.entries[:max(max_items, 0)]:
        # ``or ""`` guards against keys that are present but hold None.
        title = (entry.get("title") or "").strip()
        link = (entry.get("link") or "").strip()
        if not title or not link:
            continue

        content = entry.get("summary") or ""
        if not content:
            # Fall back to the first ``content`` block; the list may be
            # missing or empty, so do not index it blindly.
            blocks = entry.get("content") or []
            if blocks:
                content = blocks[0].get("value") or ""
        content = content.strip()

        published_raw = entry.get("published") or entry.get("updated") or ""
        items.append({
            "title": title,
            "url": link,
            "content": content[:3000],
            "published_at": _parse_date(published_raw),
        })

    logger.info("RSS %s: got %d items", url, len(items))
    return items

# Default news sources (entries can be added or removed from the admin page).
DEFAULT_SOURCES = [
    # Chinese-language sources
    {
        "name": "国家药监局",
        "url": "https://www.nmpa.gov.cn/rss/yaopinxinxi.xml",
        "language": "zh",
        "category": "药品监管",
    },
    {
        "name": "丁香园",
        "url": "https://www.dxy.cn/bbs/feed.xml",
        "language": "zh",
        "category": "临床研究",
    },
    {
        "name": "医学界",
        "url": "https://www.yxj.org.cn/rss.xml",
        "language": "zh",
        "category": "行业动态",
    },
    # English-language sources
    {
        "name": "STAT News",
        "url": "https://www.statnews.com/feed/",
        "language": "en",
        "category": "临床研究",
    },
    {
        "name": "FiercePharma",
        "url": "https://www.fiercepharma.com/rss/xml",
        "language": "en",
        "category": "行业动态",
    },
    {
        "name": "FDA News",
        "url": "https://www.fda.gov/about-fda/contact-fda/stay-informed/rss-feeds/fda-news-feed/rss.xml",
        "language": "en",
        "category": "药品监管",
    },
]