initial
This commit is contained in:
0
backend/app/__init__.py
Normal file
0
backend/app/__init__.py
Normal file
0
backend/app/ai/__init__.py
Normal file
0
backend/app/ai/__init__.py
Normal file
54
backend/app/ai/llm_client.py
Normal file
54
backend/app/ai/llm_client.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import json
|
||||
import httpx
|
||||
|
||||
|
||||
class LLMClient:
    """Unified LLM client for OpenAI-compatible and Anthropic-native HTTP APIs.

    Any provider other than ``"anthropic"`` is routed to the OpenAI-style
    ``/v1/chat/completions`` endpoint (DeepSeek, Qwen, OpenAI, ...).
    """

    def __init__(self, provider: str, api_key: str, base_url: str, model: str, timeout: float = 90):
        """Store connection settings.

        Args:
            provider: Provider name; compared case-insensitively.
            api_key: Secret sent in the provider's auth header.
            base_url: API root, with or without a trailing slash or ``/v1``.
            model: Model identifier passed through to the API.
            timeout: Per-request timeout in seconds handed to httpx.
        """
        self.provider = provider.strip().lower()
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout = timeout

    def _endpoint(self, path: str) -> str:
        """Join ``base_url`` with a ``/v1/...`` path without doubling ``/v1``.

        Many OpenAI-compatible gateways document their base URL as ``.../v1``;
        appending ``/v1/chat/completions`` to that verbatim would yield a
        broken ``/v1/v1/...`` URL.
        """
        root = self.base_url
        if root.endswith("/v1"):
            root = root[: -len("/v1")]
        return f"{root}{path}"

    async def complete(self, system_prompt: str, user_prompt: str) -> str:
        """Send one system+user exchange and return the model's text reply.

        Raises:
            httpx.HTTPStatusError: when the provider answers with a 4xx/5xx.
        """
        if self.provider == "anthropic":
            return await self._call_anthropic(system_prompt, user_prompt)
        return await self._call_openai_compat(system_prompt, user_prompt)

    async def _call_openai_compat(self, system_prompt: str, user_prompt: str) -> str:
        """Call a /v1/chat/completions-compatible API (DeepSeek / Qwen / OpenAI ...)."""
        payload = {
            "model": self.model,
            "temperature": 0.2,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                self._endpoint("/v1/chat/completions"),
                headers={"Authorization": f"Bearer {self.api_key}"},
                json=payload,
            )
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]

    async def _call_anthropic(self, system_prompt: str, user_prompt: str) -> str:
        """Call the Anthropic-native /v1/messages API and return the first text block."""
        payload = {
            "model": self.model,
            "max_tokens": 2048,
            "system": system_prompt,
            "messages": [{"role": "user", "content": user_prompt}],
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                self._endpoint("/v1/messages"),
                headers={
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                    "content-type": "application/json",
                },
                json=payload,
            )
            resp.raise_for_status()
            return resp.json()["content"][0]["text"]
|
||||
189
backend/app/ai/processor.py
Normal file
189
backend/app/ai/processor.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, date
|
||||
from sqlalchemy import select, func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..models.news import RawNews, ProcessedNews, LLMConfig, NewsSource, SystemLog
|
||||
from ..crawler.rss_fetcher import fetch_rss
|
||||
from .llm_client import LLMClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SYSTEM_PROMPT = "你是医药行业资深分析师,擅长解读全球医药政策、临床研究、行业动态。"
|
||||
|
||||
ANALYSIS_PROMPT = """分析以下新闻,返回严格的 JSON 格式结果,不要包含任何其他文字。
|
||||
|
||||
新闻标题:{title}
|
||||
新闻内容:{content}
|
||||
新闻语言:{language}
|
||||
|
||||
返回格式:
|
||||
{{
|
||||
"is_medical_related": true,
|
||||
"title_zh": "中文标题(英文原文请翻译成简洁中文)",
|
||||
"summary": "中文摘要(100-150字,客观陈述核心内容)",
|
||||
"opinion": "核心观点或行业影响(50-100字,分析性语言,点明实际意义)",
|
||||
"keywords": ["关键词1", "关键词2", "关键词3", "关键词4", "关键词5"],
|
||||
"importance_score": 8.5,
|
||||
"importance_reason": "评分理由(30字内)",
|
||||
"category": "药品监管"
|
||||
}}
|
||||
|
||||
category 只能是以下四个之一:药品监管 / 临床研究 / 行业动态 / 政策法规
|
||||
|
||||
importance_score 评分标准(1-10):
|
||||
9-10:重大监管决定 / 突破性研究 / 影响整个行业的政策
|
||||
7-8 :行业重要动态,有明显商业或学术价值
|
||||
5-6 :常规行业新闻,有一定参考价值
|
||||
1-4 :普通资讯,信息价值有限
|
||||
"""
|
||||
|
||||
|
||||
async def _log(db: AsyncSession, level: str, event_type: str, message: str):
    """Persist a single SystemLog row and commit it right away."""
    entry = SystemLog(level=level, event_type=event_type, message=message)
    db.add(entry)
    await db.commit()
|
||||
|
||||
|
||||
async def _get_active_llm(db: AsyncSession) -> LLMConfig | None:
    """Return one LLMConfig row flagged active, or None when there is none."""
    active_query = select(LLMConfig).where(LLMConfig.is_active == True).limit(1)  # noqa: E712
    rows = await db.execute(active_query)
    return rows.scalar_one_or_none()
|
||||
|
||||
|
||||
async def _analyze_article(client: LLMClient, title: str, content: str, language: str) -> dict | None:
    """Ask the LLM to analyse one article and return the parsed JSON object.

    Args:
        client: LLM client used for the completion call.
        title: Article title inserted into the prompt.
        content: Article body; only the first 2000 characters are sent.
        language: Source language code; anything other than "zh" is
            described to the model as English.

    Returns:
        The decoded JSON object, or None when the call fails or the reply
        does not contain a JSON object (best-effort: errors are logged,
        never raised).
    """
    prompt = ANALYSIS_PROMPT.format(
        title=title,
        content=content[:2000] if content else "(无正文)",
        language="中文" if language == "zh" else "英文",
    )
    try:
        reply = (await client.complete(SYSTEM_PROMPT, prompt)).strip()
        # Models often wrap the JSON in ``` fences or add a sentence around it;
        # slicing from the first "{" to the last "}" tolerates all of those.
        start, end = reply.find("{"), reply.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in LLM reply")
        parsed = json.loads(reply[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("LLM reply is not a JSON object")
        return parsed
    except Exception as e:  # best-effort boundary: network, HTTP and parse errors alike
        logger.warning(f"LLM parse error: {e}")
        return None
|
||||
|
||||
|
||||
async def _select_top_10(db: AsyncSession, target: date):
    """Clear the featured flags for one day and pick up to ten featured items.

    The day's items are loaded by descending importance score. The best item
    of each known category is picked first (in fixed category order), then
    the list is filled up to ten with the remaining highest-scored items.
    Ranks follow pick order. Returns the number of featured items.
    """
    day_query = (
        select(ProcessedNews)
        .where(func.date(ProcessedNews.processed_at) == target)
        .order_by(ProcessedNews.importance_score.desc())
    )
    ranked = (await db.execute(day_query)).scalars().all()

    # Start from a clean slate so re-running the selection is safe.
    for item in ranked:
        item.is_featured = False
        item.featured_rank = None

    picks: list[ProcessedNews] = []

    # Pass 1: the top-scored item of every category gets a guaranteed slot.
    for cat in ("药品监管", "临床研究", "行业动态", "政策法规"):
        best = next((item for item in ranked if item.category == cat), None)
        if best is not None:
            picks.append(best)

    # Pass 2: fill the remaining slots purely by score.
    for item in ranked:
        if len(picks) >= 10:
            break
        if item not in picks:
            picks.append(item)

    for position, item in enumerate(picks, start=1):
        item.is_featured = True
        item.featured_rank = position

    await db.commit()
    return len(picks)
|
||||
|
||||
|
||||
# Categories the analysis prompt allows; anything else falls back to the default.
_VALID_CATEGORIES = ("药品监管", "临床研究", "行业动态", "政策法规")


def _coerce_score(value) -> float:
    """Turn the LLM-provided importance score into a float within 1-10 (default 5.0)."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 5.0
    if score != score:  # NaN never compares equal to itself
        return 5.0
    return min(10.0, max(1.0, score))


async def _crawl_sources(db: AsyncSession) -> int:
    """Fetch every active source and store articles not seen before; return how many were added."""
    sources_result = await db.execute(select(NewsSource).where(NewsSource.is_active == True))  # noqa: E712
    added = 0
    for src in sources_result.scalars().all():
        for item in await fetch_rss(src.url):
            if len(item["url"]) > 1000:
                # Would overflow RawNews.url (String(1000)) and fail the whole commit.
                continue
            known = await db.execute(select(RawNews.id).where(RawNews.url == item["url"]))
            if known.scalar_one_or_none() is not None:
                continue
            db.add(RawNews(
                source_id=src.id,
                title=item["title"][:500],  # RawNews.title is String(500)
                url=item["url"],
                raw_content=item["content"],
                published_at=item["published_at"],
            ))
            added += 1
        await db.commit()
    return added


async def _process_pending(db: AsyncSession, client: LLMClient) -> tuple[int, int]:
    """Run LLM analysis over up to 120 pending articles; return (processed, skipped)."""
    from sqlalchemy.orm import contains_eager

    pending_result = await db.execute(
        select(RawNews)
        .join(RawNews.source)
        # Populate RawNews.source from the join: a lazy load on attribute
        # access is not available under AsyncSession.
        .options(contains_eager(RawNews.source))
        .where(RawNews.status == "pending")
        .limit(120)
    )
    processed_count = 0
    skipped_count = 0

    for raw in pending_result.scalars().all():
        language = raw.source.language if raw.source else "zh"
        analysis = await _analyze_article(client, raw.title, raw.raw_content or "", language)

        if not analysis or not analysis.get("is_medical_related"):
            raw.status = "skipped"
            skipped_count += 1
            continue

        # The model's output is untrusted: coerce every field to what the
        # ProcessedNews columns accept instead of failing the whole batch.
        category = analysis.get("category")
        if category not in _VALID_CATEGORIES:
            category = "行业动态"
        keywords = analysis.get("keywords")
        if not isinstance(keywords, list):
            keywords = []

        db.add(ProcessedNews(
            raw_news_id=raw.id,
            title_zh=str(analysis.get("title_zh") or raw.title)[:500],
            summary=str(analysis.get("summary") or ""),
            opinion=analysis.get("opinion"),
            keywords=[str(k) for k in keywords],
            importance_score=_coerce_score(analysis.get("importance_score", 5.0)),
            importance_reason=analysis.get("importance_reason"),
            category=category,
            source_name=raw.source.name if raw.source else "",
            source_url=raw.url,
            published_at=raw.published_at,
        ))
        raw.status = "processed"
        processed_count += 1

    await db.commit()
    return processed_count, skipped_count


async def run_daily_pipeline(db: AsyncSession):
    """Run the full daily job: crawl sources, analyse pending articles, pick the TOP 10.

    Progress and failures are written to the system log table. The function
    returns early (after logging an error) when no active LLM config exists.
    """
    await _log(db, "INFO", "pipeline_start", "每日流水线启动")

    llm_cfg = await _get_active_llm(db)
    if not llm_cfg:
        await _log(db, "ERROR", "pipeline_error", "未找到激活的 LLM 配置,请在管理后台配置")
        return

    client = LLMClient(
        provider=llm_cfg.provider,
        api_key=llm_cfg.api_key,
        base_url=llm_cfg.base_url,
        model=llm_cfg.model_name,
    )

    # 1. Crawl
    raw_added = await _crawl_sources(db)
    await _log(db, "INFO", "crawl_done", f"抓取完成,新增 {raw_added} 条原始新闻")

    # 2. AI processing
    processed_count, skipped_count = await _process_pending(db, client)
    await _log(db, "INFO", "process_done", f"AI 处理完成:{processed_count} 条入库,{skipped_count} 条跳过")

    # 3. Featured TOP 10. processed_at is stamped with datetime.utcnow, so the
    # day to select must be the UTC date; the local date can be a different
    # day and would then match none of the rows just written.
    featured = await _select_top_10(db, datetime.utcnow().date())
    await _log(db, "INFO", "pipeline_done", f"流水线完成,精选 {featured} 条入今日 TOP 10")
|
||||
0
backend/app/api/__init__.py
Normal file
0
backend/app/api/__init__.py
Normal file
182
backend/app/api/admin.py
Normal file
182
backend/app/api/admin.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException, Header
|
||||
from sqlalchemy import select, func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
import asyncio
|
||||
|
||||
from ..database import get_db
|
||||
from ..models.news import LLMConfig, NewsSource, SystemLog, RawNews, ProcessedNews
|
||||
from ..config import settings
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def verify_admin(authorization: str = Header(...)):
    """FastAPI dependency that rejects requests without the admin bearer token.

    Raises:
        HTTPException: 401 when the supplied token differs from
            ``settings.admin_token``.
    """
    import hmac  # local import: hmac is not among this module's top-level imports

    token = authorization.removeprefix("Bearer ").strip()
    # Constant-time comparison so response timing does not leak how much of
    # the token matched. Compare bytes: compare_digest rejects non-ASCII str.
    expected = settings.admin_token.encode("utf-8")
    if not hmac.compare_digest(token.encode("utf-8"), expected):
        raise HTTPException(status_code=401, detail="Invalid admin token")
|
||||
|
||||
|
||||
# ── LLM Config ────────────────────────────────────────────────────────────────
|
||||
|
||||
class LLMConfigIn(BaseModel):
    """Request body for saving or testing an LLM configuration."""

    # `model_name` starts with pydantic v2's protected "model_" prefix; opt out
    # of the namespace check so declaring the field does not emit a warning.
    model_config = {"protected_namespaces": ()}

    name: str
    provider: str
    api_key: str
    base_url: str
    model_name: str
|
||||
|
||||
|
||||
@router.get("/llm-config", dependencies=[Depends(verify_admin)])
async def get_llm_config(db: AsyncSession = Depends(get_db)):
    """Return the active LLM config with its API key masked, or None if there is none."""
    query = select(LLMConfig).where(LLMConfig.is_active == True).limit(1)  # noqa: E712
    cfg = (await db.execute(query)).scalar_one_or_none()
    if not cfg:
        return None

    # Only the last four characters of the key are ever exposed.
    if len(cfg.api_key) > 4:
        masked_key = "***" + cfg.api_key[-4:]
    else:
        masked_key = "****"

    return {
        "id": cfg.id,
        "name": cfg.name,
        "provider": cfg.provider,
        "api_key": masked_key,
        "base_url": cfg.base_url,
        "model_name": cfg.model_name,
    }
|
||||
|
||||
|
||||
@router.post("/llm-config", dependencies=[Depends(verify_admin)])
async def save_llm_config(body: LLMConfigIn, db: AsyncSession = Depends(get_db)):
    """Deactivate every stored LLM config and insert the submitted one as active."""
    deactivate_all = LLMConfig.__table__.update().values(is_active=False)
    await db.execute(deactivate_all)

    new_cfg = LLMConfig(**body.model_dump(), is_active=True)
    db.add(new_cfg)
    await db.commit()
    return {"ok": True, "id": new_cfg.id}
|
||||
|
||||
|
||||
@router.post("/llm-config/test", dependencies=[Depends(verify_admin)])
async def test_llm_config(body: LLMConfigIn):
    """Send a fixed probe prompt with the submitted settings and report the outcome.

    Nothing is persisted. Any exception from the call is returned as
    ``{"ok": False, "error": ...}`` rather than raised.
    """
    from ..ai.llm_client import LLMClient

    probe = LLMClient(
        provider=body.provider,
        api_key=body.api_key,
        base_url=body.base_url,
        model=body.model_name,
    )
    try:
        reply = await probe.complete(
            system_prompt="你是一个助手。",
            user_prompt="请回复'连接正常',不要说其他内容。",
        )
    except Exception as e:
        return {"ok": False, "error": str(e)}
    return {"ok": True, "reply": reply}
|
||||
|
||||
|
||||
# ── News Sources ──────────────────────────────────────────────────────────────
|
||||
|
||||
class SourceIn(BaseModel):
    """Request body for creating a news source."""

    name: str
    url: str  # feed URL; not validated here
    source_type: str = "rss"
    language: str = "zh"
    category: Optional[str] = None
|
||||
|
||||
|
||||
@router.get("/sources", dependencies=[Depends(verify_admin)])
async def get_sources(db: AsyncSession = Depends(get_db)):
    """List every news source, ordered by id."""
    rows = await db.execute(select(NewsSource).order_by(NewsSource.id))
    payload = []
    for src in rows.scalars().all():
        payload.append({
            "id": src.id,
            "name": src.name,
            "url": src.url,
            "source_type": src.source_type,
            "language": src.language,
            "category": src.category,
            "is_active": src.is_active,
        })
    return payload
|
||||
|
||||
|
||||
@router.post("/sources", dependencies=[Depends(verify_admin)])
async def add_source(body: SourceIn, db: AsyncSession = Depends(get_db)):
    """Insert a new news source and return its id."""
    fields = body.model_dump()
    new_source = NewsSource(**fields)
    db.add(new_source)
    await db.commit()
    return {"ok": True, "id": new_source.id}
|
||||
|
||||
|
||||
@router.put("/sources/{source_id}", dependencies=[Depends(verify_admin)])
async def toggle_source(source_id: int, body: dict, db: AsyncSession = Depends(get_db)):
    """Enable or disable a news source.

    Only the ``is_active`` key of the body is honoured; other keys are ignored.

    Raises:
        HTTPException: 404 when the source does not exist, 422 when
            ``is_active`` is present but not a JSON boolean.
    """
    result = await db.execute(select(NewsSource).where(NewsSource.id == source_id))
    src = result.scalar_one_or_none()
    if not src:
        raise HTTPException(status_code=404)
    if "is_active" in body:
        flag = body["is_active"]
        # The body is an untyped dict, so reject strings/numbers/null before
        # they reach the Boolean column.
        if not isinstance(flag, bool):
            raise HTTPException(status_code=422, detail="is_active must be a boolean")
        src.is_active = flag
    await db.commit()
    return {"ok": True}
|
||||
|
||||
|
||||
@router.delete("/sources/{source_id}", dependencies=[Depends(verify_admin)])
async def delete_source(source_id: int, db: AsyncSession = Depends(get_db)):
    """Delete a news source if it exists; always answers ``{"ok": True}``."""
    lookup = select(NewsSource).where(NewsSource.id == source_id)
    target = (await db.execute(lookup)).scalar_one_or_none()
    if target is None:
        return {"ok": True}
    await db.delete(target)
    await db.commit()
    return {"ok": True}
|
||||
|
||||
|
||||
# ── Trigger & Stats ───────────────────────────────────────────────────────────
|
||||
|
||||
_pipeline_running = False
# Strong reference to the in-flight pipeline task. The event loop only keeps
# weak references to tasks, so a task nobody holds may be garbage-collected
# before it finishes.
_pipeline_task = None


@router.post("/crawl/trigger", dependencies=[Depends(verify_admin)])
async def trigger_crawl():
    """Start the pipeline in the background unless one is already running.

    Returns immediately; progress can be followed through the stats and
    logs endpoints.
    """
    global _pipeline_running, _pipeline_task
    if _pipeline_running:
        return {"ok": False, "message": "Pipeline already running"}
    _pipeline_running = True
    _pipeline_task = asyncio.create_task(_run_pipeline())
    return {"ok": True, "message": "Pipeline started"}
|
||||
|
||||
|
||||
async def _run_pipeline():
    """Background wrapper: run the pipeline once, then clear the running flag."""
    from ..scheduler import trigger_now

    global _pipeline_running
    try:
        await trigger_now()
    finally:
        # Reset even on failure so a later trigger is not blocked forever.
        _pipeline_running = False
|
||||
|
||||
|
||||
@router.get("/stats", dependencies=[Depends(verify_admin)])
async def get_stats(db: AsyncSession = Depends(get_db)):
    """Return today's crawl/processing counters and the pipeline-running flag."""
    from datetime import datetime

    # crawled_at / processed_at default to datetime.utcnow, so "today" has to
    # be the UTC date for the comparison to line up on non-UTC hosts.
    today = datetime.utcnow().date()

    async def _count(stmt) -> int:
        return (await db.execute(stmt)).scalar_one()

    processed_on_day = func.date(ProcessedNews.processed_at) == today

    raw_today = await _count(
        select(func.count(RawNews.id)).where(func.date(RawNews.crawled_at) == today)
    )
    processed_today = await _count(
        select(func.count(ProcessedNews.id)).where(processed_on_day)
    )
    featured_today = await _count(
        select(func.count(ProcessedNews.id))
        .where(processed_on_day)
        .where(ProcessedNews.is_featured == True)  # noqa: E712
    )
    return {
        "raw_today": raw_today,
        "processed_today": processed_today,
        "featured_today": featured_today,
        "pipeline_running": _pipeline_running,
    }
|
||||
|
||||
|
||||
@router.get("/logs", dependencies=[Depends(verify_admin)])
async def get_logs(limit: int = 100, db: AsyncSession = Depends(get_db)):
    """Return the most recent system log entries, newest first.

    ``limit`` is clamped to 1..1000 so a negative value cannot reach the
    database and a huge value cannot dump the whole table.
    """
    limit = max(1, min(limit, 1000))
    result = await db.execute(
        select(SystemLog).order_by(SystemLog.created_at.desc()).limit(limit)
    )
    return [
        {
            "id": entry.id,
            "level": entry.level,
            "event_type": entry.event_type,
            "message": entry.message,
            "created_at": entry.created_at.isoformat(),
        }
        for entry in result.scalars().all()
    ]
|
||||
96
backend/app/api/news.py
Normal file
96
backend/app/api/news.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from datetime import date, datetime
|
||||
from typing import Optional
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from sqlalchemy import select, func, distinct
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..database import get_db
|
||||
from ..models.news import ProcessedNews, RawNews
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _serialize(n: ProcessedNews) -> dict:
    """Convert a ProcessedNews row into the JSON-ready dict returned by the API.

    ``source_name`` / ``source_url`` come from the denormalized columns and
    fall back to the linked raw article only when a column is empty.
    """
    source_name = n.source_name
    source_url = n.source_url
    # Touch the relationship only when a fallback is actually needed:
    # accessing an unloaded relationship triggers a lazy load, which is not
    # available under AsyncSession.
    if not source_name or not source_url:
        raw = n.raw_news
        if not source_name:
            source_name = raw.source.name if raw and raw.source else ""
        if not source_url:
            source_url = raw.url if raw else ""

    return {
        "id": n.id,
        "title_zh": n.title_zh,
        "summary": n.summary,
        "opinion": n.opinion,
        "keywords": n.keywords or [],
        "importance_score": n.importance_score,
        "importance_reason": n.importance_reason,
        "category": n.category,
        "is_featured": n.is_featured,
        "featured_rank": n.featured_rank,
        "source_name": source_name,
        "source_url": source_url,
        "published_at": n.published_at.isoformat() if n.published_at else None,
        "processed_at": n.processed_at.isoformat() if n.processed_at else None,
    }
|
||||
|
||||
|
||||
@router.get("/featured")
async def get_featured(
    news_date: Optional[str] = Query(default=None, alias="date"),
    db: AsyncSession = Depends(get_db),
):
    """Return the featured items for one day, ordered by featured rank.

    Args:
        news_date: ISO date from the ``date`` query parameter; defaults to
            the current UTC date.

    Raises:
        HTTPException: 400 when ``date`` is not a valid ISO date.
    """
    from fastapi import HTTPException
    from sqlalchemy.orm import selectinload

    if news_date:
        try:
            target = date.fromisoformat(news_date)
        except ValueError:
            raise HTTPException(status_code=400, detail="date must be an ISO date, e.g. 2024-01-31") from None
    else:
        # processed_at is stored via datetime.utcnow, so default to the UTC day.
        target = datetime.utcnow().date()

    stmt = (
        select(ProcessedNews)
        .join(ProcessedNews.raw_news)
        # Eager-load what _serialize may read; lazy loads are not available
        # under AsyncSession.
        .options(selectinload(ProcessedNews.raw_news).selectinload(RawNews.source))
        .where(ProcessedNews.is_featured == True)  # noqa: E712
        .where(func.date(ProcessedNews.processed_at) == target)
        .order_by(ProcessedNews.featured_rank)
    )
    result = await db.execute(stmt)
    items = result.scalars().all()
    return {"date": str(target), "items": [_serialize(n) for n in items]}
|
||||
|
||||
|
||||
@router.get("")
async def get_news(
    news_date: Optional[str] = Query(default=None, alias="date"),
    category: Optional[str] = Query(default=None),
    page: int = Query(default=1, ge=1),
    page_size: int = Query(default=20, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
):
    """Return one page of a day's processed news, highest importance first.

    Args:
        news_date: ISO date from the ``date`` query parameter; defaults to
            the current UTC date.
        category: Optional exact-match category filter.
        page: 1-based page number.
        page_size: Items per page (1-100).

    Raises:
        HTTPException: 400 when ``date`` is not a valid ISO date.
    """
    from fastapi import HTTPException
    from sqlalchemy.orm import selectinload

    if news_date:
        try:
            target = date.fromisoformat(news_date)
        except ValueError:
            raise HTTPException(status_code=400, detail="date must be an ISO date, e.g. 2024-01-31") from None
    else:
        # processed_at is stored via datetime.utcnow, so default to the UTC day.
        target = datetime.utcnow().date()

    stmt = (
        select(ProcessedNews)
        .join(ProcessedNews.raw_news)
        .where(func.date(ProcessedNews.processed_at) == target)
    )
    if category:
        stmt = stmt.where(ProcessedNews.category == category)

    count_stmt = select(func.count()).select_from(stmt.subquery())
    total = (await db.execute(count_stmt)).scalar_one()

    stmt = (
        # Eager-load what _serialize may read; lazy loads are not available
        # under AsyncSession.
        stmt.options(selectinload(ProcessedNews.raw_news).selectinload(RawNews.source))
        # id is a tiebreaker: equal scores would otherwise order arbitrarily
        # and rows could repeat or vanish between pages.
        .order_by(ProcessedNews.importance_score.desc(), ProcessedNews.id.desc())
        .offset((page - 1) * page_size)
        .limit(page_size)
    )
    result = await db.execute(stmt)
    items = result.scalars().all()

    return {"date": str(target), "total": total, "page": page, "items": [_serialize(n) for n in items]}
|
||||
|
||||
|
||||
@router.get("/dates")
async def get_dates(db: AsyncSession = Depends(get_db)):
    """Return the 30 most recent processing dates with their item counts."""
    day = func.date(ProcessedNews.processed_at)
    per_day = (
        select(day.label("d"), func.count(ProcessedNews.id).label("cnt"))
        .group_by("d")
        .order_by(day.desc())
        .limit(30)
    )
    rows = await db.execute(per_day)
    return [{"date": str(row.d), "count": row.cnt} for row in rows]
|
||||
|
||||
|
||||
@router.get("/{news_id}")
async def get_news_detail(news_id: int, db: AsyncSession = Depends(get_db)):
    """Return a single processed news item by id.

    Raises:
        HTTPException: 404 when no item has that id.
    """
    from fastapi import HTTPException
    from sqlalchemy.orm import selectinload

    stmt = (
        select(ProcessedNews)
        .join(ProcessedNews.raw_news)
        # Eager-load what _serialize may read; lazy loads are not available
        # under AsyncSession.
        .options(selectinload(ProcessedNews.raw_news).selectinload(RawNews.source))
        .where(ProcessedNews.id == news_id)
    )
    result = await db.execute(stmt)
    news = result.scalar_one_or_none()
    if not news:
        raise HTTPException(status_code=404, detail="Not found")
    return _serialize(news)
|
||||
17
backend/app/config.py
Normal file
17
backend/app/config.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings; the inner Config points pydantic-settings at `.env`."""

    # NOTE(review): the default URL embeds a username/password — presumably
    # meant for local development only; confirm it is overridden in deployment.
    database_url: str = "postgresql+asyncpg://pharma:pharma123@localhost/pharma_news"
    # Bearer token checked by the admin API. NOTE(review): the placeholder
    # default is accepted as-is if nobody overrides it.
    admin_token: str = "change-me-admin-token"

    # Used to seed the first LLM config row at startup; seeding is skipped
    # while the API key is empty.
    initial_llm_provider: str = "deepseek"
    initial_llm_api_key: str = ""
    initial_llm_base_url: str = "https://api.deepseek.com"
    initial_llm_model: str = "deepseek-chat"

    class Config:
        env_file = ".env"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
0
backend/app/crawler/__init__.py
Normal file
0
backend/app/crawler/__init__.py
Normal file
73
backend/app/crawler/rss_fetcher.py
Normal file
73
backend/app/crawler/rss_fetcher.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import Optional
|
||||
import feedparser
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; PharmaIntelBot/1.0)"
|
||||
}
|
||||
|
||||
|
||||
def _parse_date(raw: str) -> Optional[datetime]:
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
return parsedate_to_datetime(raw).replace(tzinfo=None)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
return datetime.fromisoformat(raw.replace("Z", "+00:00")).replace(tzinfo=None)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
    """Download an RSS/Atom feed and return up to ``max_items`` normalised entries.

    Each returned dict has the keys ``title``, ``url``, ``content``
    (truncated to 3000 characters) and ``published_at`` (datetime or None).
    Entries without a title or link are dropped. Download failures are
    logged and yield an empty list instead of raising.
    """
    try:
        async with httpx.AsyncClient(headers=HEADERS, timeout=30, follow_redirects=True) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            text = resp.text
    except Exception as e:  # best-effort: one broken feed must not stop the crawl
        logger.warning(f"RSS fetch failed {url}: {e}")
        return []

    feed = feedparser.parse(text)
    items = []
    for entry in feed.entries[:max_items]:
        title = (entry.get("title") or "").strip()
        link = (entry.get("link") or "").strip()
        if not title or not link:
            continue

        content = entry.get("summary") or ""
        if not content:
            # "content" may be absent or an empty list; indexing [0] blindly
            # would raise IndexError and abort the whole feed.
            bodies = entry.get("content") or []
            if bodies:
                content = bodies[0].get("value") or ""
        content = content.strip()

        published_raw = entry.get("published") or entry.get("updated") or ""
        items.append({
            "title": title,
            "url": link,
            "content": content[:3000],
            "published_at": _parse_date(published_raw),
        })

    logger.info(f"RSS {url}: got {len(items)} items")
    return items
|
||||
|
||||
|
||||
# Default news sources (sources can be added or removed from the admin page)
DEFAULT_SOURCES = [
    # Chinese-language feeds
    {"name": "国家药监局", "url": "https://www.nmpa.gov.cn/rss/yaopinxinxi.xml", "language": "zh", "category": "药品监管"},
    {"name": "丁香园", "url": "https://www.dxy.cn/bbs/feed.xml", "language": "zh", "category": "临床研究"},
    {"name": "医学界", "url": "https://www.yxj.org.cn/rss.xml", "language": "zh", "category": "行业动态"},
    # English-language feeds
    {"name": "STAT News", "url": "https://www.statnews.com/feed/", "language": "en", "category": "临床研究"},
    {"name": "FiercePharma", "url": "https://www.fiercepharma.com/rss/xml", "language": "en", "category": "行业动态"},
    {"name": "FDA News", "url": "https://www.fda.gov/about-fda/contact-fda/stay-informed/rss-feeds/fda-news-feed/rss.xml", "language": "en", "category": "药品监管"},
]
|
||||
21
backend/app/database.py
Normal file
21
backend/app/database.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
from .config import settings
|
||||
|
||||
engine = create_async_engine(settings.database_url, echo=False)
|
||||
AsyncSessionLocal = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class that all ORM models inherit from."""
|
||||
|
||||
|
||||
async def get_db():
    """Yield a session from AsyncSessionLocal; it is closed when the caller is done."""
    async with AsyncSessionLocal() as db_session:
        yield db_session
|
||||
|
||||
|
||||
async def create_tables():
    """Create every table registered on Base.metadata."""
    # Imported for its side effect: loading the module registers the models
    # on Base.metadata before create_all runs.
    from .models import news  # noqa: F401

    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)
|
||||
57
backend/app/main.py
Normal file
57
backend/app/main.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from .database import create_tables, AsyncSessionLocal
|
||||
from .scheduler import start_scheduler, shutdown_scheduler
|
||||
from .api import news, admin
|
||||
from .config import settings
|
||||
from .models.news import LLMConfig
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan: create tables, seed the LLM config, run the scheduler while serving."""
    await create_tables()
    await seed_initial_llm_config()
    start_scheduler()
    try:
        yield
    finally:
        # Stop the scheduler even when the context exits through an exception
        # or cancellation, not only on a clean shutdown.
        shutdown_scheduler()
|
||||
|
||||
|
||||
async def seed_initial_llm_config():
    """Insert the LLM config from settings when the table is empty and an API key is set."""
    from sqlalchemy import select

    async with AsyncSessionLocal() as db:
        existing = (await db.execute(select(LLMConfig).limit(1))).scalar_one_or_none()
        # Nothing to do when a config already exists or no key was provided.
        if existing or not settings.initial_llm_api_key:
            return

        db.add(LLMConfig(
            name="默认配置",
            provider=settings.initial_llm_provider,
            api_key=settings.initial_llm_api_key,
            base_url=settings.initial_llm_base_url,
            model_name=settings.initial_llm_model,
            is_active=True,
        ))
        await db.commit()
|
||||
|
||||
|
||||
app = FastAPI(title="医药情报 API", version="1.0.0", lifespan=lifespan)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
app.include_router(news.router, prefix="/api/news", tags=["news"])
|
||||
app.include_router(admin.router, prefix="/api/admin", tags=["admin"])
|
||||
|
||||
|
||||
@app.get("/api/health")
async def health():
    """Liveness probe; always reports status ok."""
    payload = {"status": "ok"}
    return payload
|
||||
0
backend/app/models/__init__.py
Normal file
0
backend/app/models/__init__.py
Normal file
81
backend/app/models/news.py
Normal file
81
backend/app/models/news.py
Normal file
@@ -0,0 +1,81 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
from sqlalchemy import String, Text, Integer, Float, Boolean, DateTime, ForeignKey, ARRAY
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
from ..database import Base
|
||||
|
||||
|
||||
class NewsSource(Base):
    """A configured news feed; owns the raw articles crawled from it."""

    __tablename__ = "news_sources"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(100))
    url: Mapped[str] = mapped_column(String(500))
    source_type: Mapped[str] = mapped_column(String(20), default="rss")   # rss | scrape
    language: Mapped[str] = mapped_column(String(5), default="zh")        # zh | en
    category: Mapped[Optional[str]] = mapped_column(String(50))
    is_active: Mapped[bool] = mapped_column(Boolean, default=True)
    # Naive datetime from utcnow (UTC without tzinfo).
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    raw_news: Mapped[List["RawNews"]] = relationship(back_populates="source")
|
||||
|
||||
|
||||
class RawNews(Base):
    """An article as fetched from a source, before analysis."""

    __tablename__ = "raw_news"

    id: Mapped[int] = mapped_column(primary_key=True)
    source_id: Mapped[Optional[int]] = mapped_column(ForeignKey("news_sources.id"))
    title: Mapped[str] = mapped_column(String(500))
    # Unique: the URL is the de-duplication key for crawled articles.
    url: Mapped[str] = mapped_column(String(1000), unique=True)
    raw_content: Mapped[Optional[str]] = mapped_column(Text)
    published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
    # Naive datetime from utcnow (UTC without tzinfo).
    crawled_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    status: Mapped[str] = mapped_column(String(20), default="pending")  # pending|processed|skipped|error

    source: Mapped[Optional["NewsSource"]] = relationship(back_populates="raw_news")
    # uselist=False: at most one ProcessedNews is exposed per raw article.
    processed: Mapped[Optional["ProcessedNews"]] = relationship(back_populates="raw_news", uselist=False)
|
||||
|
||||
|
||||
class ProcessedNews(Base):
    """The analysis result for one raw article, plus featured-selection state."""

    __tablename__ = "processed_news"

    id: Mapped[int] = mapped_column(primary_key=True)
    raw_news_id: Mapped[int] = mapped_column(ForeignKey("raw_news.id"))
    title_zh: Mapped[str] = mapped_column(String(500))
    summary: Mapped[str] = mapped_column(Text)
    opinion: Mapped[Optional[str]] = mapped_column(Text)
    # NOTE(review): ARRAY is not available on every database backend —
    # presumably PostgreSQL is the target; confirm.
    keywords: Mapped[Optional[List[str]]] = mapped_column(ARRAY(String))
    importance_score: Mapped[float] = mapped_column(Float, default=5.0)
    importance_reason: Mapped[Optional[str]] = mapped_column(Text)
    category: Mapped[str] = mapped_column(String(50), default="行业动态")
    is_featured: Mapped[bool] = mapped_column(Boolean, default=False)
    featured_rank: Mapped[Optional[int]] = mapped_column(Integer)
    # Copies of the source name / article URL kept on the row itself.
    source_name: Mapped[Optional[str]] = mapped_column(String(200))
    source_url: Mapped[Optional[str]] = mapped_column(String(1000))
    published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
    # Naive datetime from utcnow (UTC without tzinfo).
    processed_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    raw_news: Mapped["RawNews"] = relationship(back_populates="processed")
|
||||
|
||||
|
||||
class LLMConfig(Base):
    """Stored connection settings for an LLM provider."""

    __tablename__ = "llm_config"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(100))
    provider: Mapped[str] = mapped_column(String(50))   # openai | anthropic | qwen | deepseek | custom
    # NOTE(review): stored as a plain String column — no encryption is visible here.
    api_key: Mapped[str] = mapped_column(String(500))
    base_url: Mapped[str] = mapped_column(String(500))
    model_name: Mapped[str] = mapped_column(String(200))
    is_active: Mapped[bool] = mapped_column(Boolean, default=True)
    # Naive datetime from utcnow (UTC without tzinfo).
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
|
||||
|
||||
class SystemLog(Base):
    """One operational log entry stored in the database."""

    __tablename__ = "system_logs"

    id: Mapped[int] = mapped_column(primary_key=True)
    event_type: Mapped[str] = mapped_column(String(50))
    message: Mapped[str] = mapped_column(Text)
    level: Mapped[str] = mapped_column(String(20), default="INFO")
    # Naive datetime from utcnow (UTC without tzinfo).
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
31
backend/app/scheduler.py
Normal file
31
backend/app/scheduler.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import logging
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
scheduler = AsyncIOScheduler(timezone="Asia/Shanghai")
|
||||
|
||||
|
||||
async def daily_pipeline_job():
    """Run the daily pipeline in its own session; failures are logged, never raised."""
    # Imported lazily inside the job rather than at module import time.
    from .ai.processor import run_daily_pipeline
    from .database import AsyncSessionLocal

    async with AsyncSessionLocal() as session:
        try:
            await run_daily_pipeline(session)
        except Exception as e:
            logger.exception(f"Daily pipeline failed: {e}")
|
||||
|
||||
|
||||
def start_scheduler():
    """Register the 06:00 daily pipeline job and start the scheduler."""
    six_am = CronTrigger(hour=6, minute=0)
    scheduler.add_job(
        daily_pipeline_job,
        six_am,
        id="daily_pipeline",
        replace_existing=True,
    )
    scheduler.start()
    logger.info("Scheduler started — daily pipeline runs at 06:00 Asia/Shanghai")
|
||||
|
||||
|
||||
def shutdown_scheduler():
    """Stop the scheduler without waiting for running jobs; no-op if it never started."""
    # shutdown() raises when the scheduler is not running, which would mask
    # the original error if startup failed before start_scheduler() ran.
    if scheduler.running:
        scheduler.shutdown(wait=False)
|
||||
|
||||
|
||||
async def trigger_now():
    """Run the daily pipeline job immediately (used by the admin API)."""
    result = await daily_pipeline_job()
    return result
|
||||
Reference in New Issue
Block a user