v1.0定版
This commit is contained in:
Binary file not shown.
@@ -25,18 +25,20 @@ ANALYSIS_PROMPT = """分析以下新闻,返回严格的 JSON 格式结果,
|
||||
"summary": "中文摘要(100-150字,客观陈述核心内容)",
|
||||
"opinion": "核心观点或行业影响(50-100字,分析性语言,点明实际意义)",
|
||||
"keywords": ["关键词1", "关键词2", "关键词3", "关键词4", "关键词5"],
|
||||
"importance_score": 8.5,
|
||||
"importance_score": 85,
|
||||
"importance_reason": "评分理由(30字内)",
|
||||
"category": "药品监管"
|
||||
}}
|
||||
|
||||
category 只能是以下四个之一:药品监管 / 临床研究 / 行业动态 / 政策法规
|
||||
|
||||
importance_score 评分标准(1-10):
|
||||
9-10:重大监管决定 / 突破性研究 / 影响整个行业的政策
|
||||
7-8 :行业重要动态,有明显商业或学术价值
|
||||
5-6 :常规行业新闻,有一定参考价值
|
||||
1-4 :普通资讯,信息价值有限
|
||||
importance_score 评分标准(1-100整数):
|
||||
90-100:重大监管决定 / 突破性研究 / 影响整个行业的政策
|
||||
70-89 :行业重要动态,有明显商业或学术价值
|
||||
50-69 :常规行业新闻,有一定参考价值
|
||||
1-49 :普通资讯,信息价值有限
|
||||
|
||||
注意:只有 85 分及以上的新闻才有资格进入每日精选,请严格区分。
|
||||
"""
|
||||
|
||||
|
||||
@@ -70,7 +72,8 @@ async def _analyze_article(client: LLMClient, title: str, content: str, language
|
||||
|
||||
|
||||
async def _select_top_10(db: AsyncSession, target: date):
|
||||
"""Reset featured flags and elect TOP 10 with category diversity."""
|
||||
"""Reset featured flags and elect TOP 10 with category diversity.
|
||||
Only news with importance_score >= 85 is eligible for 精选."""
|
||||
result = await db.execute(
|
||||
select(ProcessedNews)
|
||||
.where(func.date(ProcessedNews.processed_at) == target)
|
||||
@@ -78,25 +81,28 @@ async def _select_top_10(db: AsyncSession, target: date):
|
||||
)
|
||||
all_news = result.scalars().all()
|
||||
|
||||
# Reset
|
||||
# Reset all
|
||||
for n in all_news:
|
||||
n.is_featured = False
|
||||
n.featured_rank = None
|
||||
|
||||
# Only candidates with score >= 85
|
||||
candidates = [n for n in all_news if n.importance_score >= 85]
|
||||
|
||||
categories = ["药品监管", "临床研究", "行业动态", "政策法规"]
|
||||
selected: list[ProcessedNews] = []
|
||||
seen_cats: set[str] = set()
|
||||
|
||||
# First pass: one guaranteed per category
|
||||
# First pass: one guaranteed per category (from high-score candidates)
|
||||
for cat in categories:
|
||||
for n in all_news:
|
||||
for n in candidates:
|
||||
if n.category == cat and cat not in seen_cats and n not in selected:
|
||||
selected.append(n)
|
||||
seen_cats.add(cat)
|
||||
break
|
||||
|
||||
# Second pass: fill up to 10 by score
|
||||
for n in all_news:
|
||||
# Second pass: fill up to 10 by score (still from candidates only)
|
||||
for n in candidates:
|
||||
if len(selected) >= 10:
|
||||
break
|
||||
if n not in selected:
|
||||
@@ -141,6 +147,7 @@ async def run_daily_pipeline(db: AsyncSession):
|
||||
title=item["title"],
|
||||
url=item["url"],
|
||||
raw_content=item["content"],
|
||||
image_url=item.get("image_url"),
|
||||
published_at=item["published_at"],
|
||||
))
|
||||
raw_added += 1
|
||||
@@ -170,11 +177,12 @@ async def run_daily_pipeline(db: AsyncSession):
|
||||
summary=analysis.get("summary", ""),
|
||||
opinion=analysis.get("opinion"),
|
||||
keywords=analysis.get("keywords", []),
|
||||
importance_score=float(analysis.get("importance_score", 5.0)),
|
||||
importance_score=float(analysis.get("importance_score", 50.0)),
|
||||
importance_reason=analysis.get("importance_reason"),
|
||||
category=analysis.get("category", "行业动态"),
|
||||
source_name=raw.source.name if raw.source else "",
|
||||
source_url=raw.url,
|
||||
image_url=raw.image_url,
|
||||
published_at=raw.published_at,
|
||||
))
|
||||
raw.status = "processed"
|
||||
|
||||
Binary file not shown.
@@ -24,6 +24,7 @@ def _serialize(n: ProcessedNews) -> dict:
|
||||
"featured_rank": n.featured_rank,
|
||||
"source_name": n.source_name or "",
|
||||
"source_url": n.source_url or "",
|
||||
"image_url": n.image_url or None,
|
||||
"published_at": n.published_at.isoformat() if n.published_at else None,
|
||||
"processed_at": n.processed_at.isoformat() if n.processed_at else None,
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -25,6 +25,31 @@ def _parse_date(raw: str) -> Optional[datetime]:
|
||||
return None
|
||||
|
||||
|
||||
def _extract_image(entry) -> Optional[str]:
    """Return the first usable image URL found on an RSS entry, or None.

    Sources are checked in priority order:

    1. ``media_thumbnail`` (``<media:thumbnail>``) - only the first item.
    2. ``media_content`` (``<media:content>``) items whose ``medium`` is
       ``"image"`` or whose ``type`` starts with ``"image/"``.
    3. ``enclosures`` (``<enclosure>``) whose ``type`` starts with
       ``"image/"``; ``href`` is preferred over ``url``.

    Args:
        entry: A parsed feed entry exposing the attributes above as lists of
            dict-like items (presumably a feedparser entry - confirm against
            the caller). Any attribute may be absent.

    Returns:
        The whitespace-stripped URL, or None when no non-empty image URL
        is found.
    """
    # ``x.get(key) or ""`` rather than ``x.get(key, "")``: the default only
    # applies to a missing key, so a key present with value None would
    # otherwise crash on ``.strip()`` / ``.startswith()``.

    # <media:thumbnail>
    thumbnails = getattr(entry, "media_thumbnail", None) or []
    if thumbnails:
        url = (thumbnails[0].get("url") or "").strip()
        if url:
            return url

    # <media:content medium="image"> or an image/* MIME type
    for mc in getattr(entry, "media_content", None) or []:
        mc_type = mc.get("type") or ""
        mc_medium = mc.get("medium") or ""
        if mc_medium == "image" or mc_type.startswith("image/"):
            url = (mc.get("url") or "").strip()
            if url:
                return url

    # <enclosure type="image/...">
    for enc in getattr(entry, "enclosures", None) or []:
        if (enc.get("type") or "").startswith("image/"):
            url = (enc.get("href") or enc.get("url") or "").strip()
            if url:
                return url

    return None
|
||||
|
||||
|
||||
async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
|
||||
try:
|
||||
async with httpx.AsyncClient(headers=HEADERS, timeout=30, follow_redirects=True) as client:
|
||||
@@ -54,6 +79,7 @@ async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
|
||||
"url": link,
|
||||
"content": content[:3000],
|
||||
"published_at": _parse_date(published_raw),
|
||||
"image_url": _extract_image(entry),
|
||||
})
|
||||
|
||||
logger.info(f"RSS {url}: got {len(items)} items")
|
||||
|
||||
Binary file not shown.
@@ -28,6 +28,7 @@ class RawNews(Base):
|
||||
title: Mapped[str] = mapped_column(String(500))
|
||||
url: Mapped[str] = mapped_column(String(1000), unique=True)
|
||||
raw_content: Mapped[Optional[str]] = mapped_column(Text)
|
||||
image_url: Mapped[Optional[str]] = mapped_column(String(2000))
|
||||
published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
|
||||
crawled_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
status: Mapped[str] = mapped_column(String(20), default="pending") # pending|processed|skipped|error
|
||||
@@ -52,6 +53,7 @@ class ProcessedNews(Base):
|
||||
featured_rank: Mapped[Optional[int]] = mapped_column(Integer)
|
||||
source_name: Mapped[Optional[str]] = mapped_column(String(200))
|
||||
source_url: Mapped[Optional[str]] = mapped_column(String(1000))
|
||||
image_url: Mapped[Optional[str]] = mapped_column(String(2000))
|
||||
published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
|
||||
processed_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user