initial
This commit is contained in:
73
backend/app/crawler/rss_fetcher.py
Normal file
73
backend/app/crawler/rss_fetcher.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import Optional
|
||||
import feedparser
|
||||
import httpx
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Default HTTP headers handed to the httpx client that downloads feeds.
# The User-Agent identifies the crawler as a bot instead of using the
# HTTP library's default value.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PharmaIntelBot/1.0)"}
|
||||
|
||||
|
||||
def _parse_date(raw: str) -> Optional[datetime]:
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
return parsedate_to_datetime(raw).replace(tzinfo=None)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
return datetime.fromisoformat(raw.replace("Z", "+00:00")).replace(tzinfo=None)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
    """Download an RSS/Atom feed and return its entries as plain dicts.

    Args:
        url: Address of the feed to download.
        max_items: Number of leading feed entries to examine. Entries that
            are skipped (see below) still count against this limit.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``content``
        (at most 3000 characters) and ``published_at`` (the result of
        ``_parse_date``, possibly ``None``). Entries without a title or a
        link are skipped. An empty list is returned when the download
        fails; the failure is logged as a warning rather than raised.
    """
    try:
        async with httpx.AsyncClient(headers=HEADERS, timeout=30, follow_redirects=True) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            # Keep the undecoded body: decoding here would ignore the
            # encoding declared inside the XML document itself.
            body = resp.content
            content_type = resp.headers.get("content-type", "")
    except Exception as e:
        # Deliberately broad: one unreachable source must not abort a crawl.
        logger.warning("RSS fetch failed %s: %s", url, e)
        return []

    # Hand feedparser the bytes plus the HTTP Content-Type so that it can
    # apply its own character-encoding detection.
    feed = feedparser.parse(body, response_headers={"content-type": content_type})

    items = []
    for entry in feed.entries[:max_items]:
        # "or ''" also covers keys that are present but hold None.
        title = (entry.get("title") or "").strip()
        link = (entry.get("link") or "").strip()
        if not title or not link:
            continue

        text = entry.get("summary") or ""
        if not text:
            # "content" is indexed as a list of dicts; fall back to a dummy
            # element so that a missing or empty list does not raise.
            blocks = entry.get("content") or [{}]
            text = blocks[0].get("value") or ""
        content = text.strip()

        published_raw = entry.get("published") or entry.get("updated") or ""
        items.append({
            "title": title,
            "url": link,
            "content": content[:3000],
            "published_at": _parse_date(published_raw),
        })

    logger.info("RSS %s: got %d items", url, len(items))
    return items
|
||||
|
||||
|
||||
# Default news sources (entries can be added or removed from the admin page).
DEFAULT_SOURCES = [
    # Chinese-language sources
    {
        "name": "国家药监局",
        "url": "https://www.nmpa.gov.cn/rss/yaopinxinxi.xml",
        "language": "zh",
        "category": "药品监管",
    },
    {
        "name": "丁香园",
        "url": "https://www.dxy.cn/bbs/feed.xml",
        "language": "zh",
        "category": "临床研究",
    },
    {
        "name": "医学界",
        "url": "https://www.yxj.org.cn/rss.xml",
        "language": "zh",
        "category": "行业动态",
    },
    # English-language sources
    {
        "name": "STAT News",
        "url": "https://www.statnews.com/feed/",
        "language": "en",
        "category": "临床研究",
    },
    {
        "name": "FiercePharma",
        "url": "https://www.fiercepharma.com/rss/xml",
        "language": "en",
        "category": "行业动态",
    },
    {
        "name": "FDA News",
        "url": "https://www.fda.gov/about-fda/contact-fda/stay-informed/rss-feeds/fda-news-feed/rss.xml",
        "language": "en",
        "category": "药品监管",
    },
]
|
||||
Reference in New Issue
Block a user