v1.0定版

This commit is contained in:
2026-05-27 17:14:08 +08:00
parent 1b7210de4f
commit 5b19d9fe69
32 changed files with 2074 additions and 2915 deletions

View File

@@ -25,18 +25,20 @@ ANALYSIS_PROMPT = """分析以下新闻,返回严格的 JSON 格式结果,
"summary": "中文摘要100-150字客观陈述核心内容",
"opinion": "核心观点或行业影响50-100字分析性语言点明实际意义",
"keywords": ["关键词1", "关键词2", "关键词3", "关键词4", "关键词5"],
"importance_score": 8.5,
"importance_score": 85,
"importance_reason": "评分理由30字内",
"category": "药品监管"
}}
category 只能是以下四个之一:药品监管 / 临床研究 / 行业动态 / 政策法规
importance_score 评分标准1-10
9-10重大监管决定 / 突破性研究 / 影响整个行业的政策
7-8 :行业重要动态,有明显商业或学术价值
5-6 :常规行业新闻,有一定参考价值
1-4 :普通资讯,信息价值有限
importance_score 评分标准1-100整数
90-100:重大监管决定 / 突破性研究 / 影响整个行业的政策
70-89 :行业重要动态,有明显商业或学术价值
50-69 :常规行业新闻,有一定参考价值
1-49 :普通资讯,信息价值有限
注意:只有 85 分及以上的新闻才有资格进入每日精选,请严格区分。
"""
@@ -70,7 +72,8 @@ async def _analyze_article(client: LLMClient, title: str, content: str, language
async def _select_top_10(db: AsyncSession, target: date):
"""Reset featured flags and elect TOP 10 with category diversity."""
"""Reset featured flags and elect TOP 10 with category diversity.
Only news with importance_score >= 85 is eligible for 精选."""
result = await db.execute(
select(ProcessedNews)
.where(func.date(ProcessedNews.processed_at) == target)
@@ -78,25 +81,28 @@ async def _select_top_10(db: AsyncSession, target: date):
)
all_news = result.scalars().all()
# Reset
# Reset all
for n in all_news:
n.is_featured = False
n.featured_rank = None
# Only candidates with score >= 85
candidates = [n for n in all_news if n.importance_score >= 85]
categories = ["药品监管", "临床研究", "行业动态", "政策法规"]
selected: list[ProcessedNews] = []
seen_cats: set[str] = set()
# First pass: one guaranteed per category
# First pass: one guaranteed per category (from high-score candidates)
for cat in categories:
for n in all_news:
for n in candidates:
if n.category == cat and cat not in seen_cats and n not in selected:
selected.append(n)
seen_cats.add(cat)
break
# Second pass: fill up to 10 by score
for n in all_news:
# Second pass: fill up to 10 by score (still from candidates only)
for n in candidates:
if len(selected) >= 10:
break
if n not in selected:
@@ -141,6 +147,7 @@ async def run_daily_pipeline(db: AsyncSession):
title=item["title"],
url=item["url"],
raw_content=item["content"],
image_url=item.get("image_url"),
published_at=item["published_at"],
))
raw_added += 1
@@ -170,11 +177,12 @@ async def run_daily_pipeline(db: AsyncSession):
summary=analysis.get("summary", ""),
opinion=analysis.get("opinion"),
keywords=analysis.get("keywords", []),
importance_score=float(analysis.get("importance_score", 5.0)),
importance_score=float(analysis.get("importance_score", 50.0)),
importance_reason=analysis.get("importance_reason"),
category=analysis.get("category", "行业动态"),
source_name=raw.source.name if raw.source else "",
source_url=raw.url,
image_url=raw.image_url,
published_at=raw.published_at,
))
raw.status = "processed"

View File

@@ -24,6 +24,7 @@ def _serialize(n: ProcessedNews) -> dict:
"featured_rank": n.featured_rank,
"source_name": n.source_name or "",
"source_url": n.source_url or "",
"image_url": n.image_url or None,
"published_at": n.published_at.isoformat() if n.published_at else None,
"processed_at": n.processed_at.isoformat() if n.processed_at else None,
}

View File

@@ -25,6 +25,31 @@ def _parse_date(raw: str) -> Optional[datetime]:
return None
def _extract_image(entry) -> Optional[str]:
    """Return the first usable image URL found on an RSS entry, or None.

    Sources are tried in priority order:

    1. ``media_thumbnail`` items (``<media:thumbnail>``);
    2. ``media_content`` items whose ``medium`` is ``"image"`` or whose
       ``type`` starts with ``"image/"`` (``<media:content>``);
    3. ``enclosures`` whose ``type`` starts with ``"image/"``, preferring
       ``href`` over ``url``.

    Missing attributes, ``None`` values and blank URLs are skipped rather
    than raising, so a malformed entry simply yields ``None``.

    Args:
        entry: Parsed feed entry exposing the attributes above as lists of
            dict-like objects (presumably a feedparser entry; verify
            against the caller).

    Returns:
        The whitespace-stripped URL, or ``None`` when no image is found.
    """

    def _text(value) -> str:
        # A key can be present with a None (or other non-string) value, in
        # which case dict.get()'s default is NOT applied; normalise here.
        return value.strip() if isinstance(value, str) else ""

    # <media:thumbnail>: scan every thumbnail, not only the first, so a blank
    # leading item does not hide a usable one further down the list.
    for thumb in getattr(entry, "media_thumbnail", None) or []:
        url = _text(thumb.get("url"))
        if url:
            return url

    # <media:content medium="image"> or <media:content type="image/...">
    for media in getattr(entry, "media_content", None) or []:
        is_image = (
            media.get("medium") == "image"
            or _text(media.get("type")).startswith("image/")
        )
        if not is_image:
            continue
        url = _text(media.get("url"))
        if url:
            return url

    # <enclosure type="image/...">
    for enclosure in getattr(entry, "enclosures", None) or []:
        if not _text(enclosure.get("type")).startswith("image/"):
            continue
        # Fall back to "url" when "href" is missing or blank.
        url = _text(enclosure.get("href")) or _text(enclosure.get("url"))
        if url:
            return url

    return None
async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
try:
async with httpx.AsyncClient(headers=HEADERS, timeout=30, follow_redirects=True) as client:
@@ -54,6 +79,7 @@ async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
"url": link,
"content": content[:3000],
"published_at": _parse_date(published_raw),
"image_url": _extract_image(entry),
})
logger.info(f"RSS {url}: got {len(items)} items")

View File

@@ -28,6 +28,7 @@ class RawNews(Base):
title: Mapped[str] = mapped_column(String(500))
url: Mapped[str] = mapped_column(String(1000), unique=True)
raw_content: Mapped[Optional[str]] = mapped_column(Text)
image_url: Mapped[Optional[str]] = mapped_column(String(2000))
published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
crawled_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
status: Mapped[str] = mapped_column(String(20), default="pending") # pending|processed|skipped|error
@@ -52,6 +53,7 @@ class ProcessedNews(Base):
featured_rank: Mapped[Optional[int]] = mapped_column(Integer)
source_name: Mapped[Optional[str]] = mapped_column(String(200))
source_url: Mapped[Optional[str]] = mapped_column(String(1000))
image_url: Mapped[Optional[str]] = mapped_column(String(2000))
published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
processed_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)