v1.0定版

2026-05-27 17:14:08 +08:00
parent 1b7210de4f
commit 5b19d9fe69
32 changed files with 2074 additions and 2915 deletions
--- a/backend/app/ai/pycache/processor.cpython-314.pyc
+++ b/backend/app/ai/pycache/processor.cpython-314.pyc
--- a/backend/app/ai/processor.py
+++ b/backend/app/ai/processor.py
@@ -25,18 +25,20 @@ ANALYSIS_PROMPT = """分析以下新闻，返回严格的 JSON 格式结果，
  "summary": "中文摘要（100-150字，客观陈述核心内容）",
  "opinion": "核心观点或行业影响（50-100字，分析性语言，点明实际意义）",
  "keywords": ["关键词1", "关键词2", "关键词3", "关键词4", "关键词5"],
-  "importance_score": 8.5,
+  "importance_score": 85,
  "importance_reason": "评分理由（30字内）",
  "category": "药品监管"
 }}

 category 只能是以下四个之一：药品监管 / 临床研究 / 行业动态 / 政策法规

-importance_score 评分标准（1-10）：
-9-10：重大监管决定 / 突破性研究 / 影响整个行业的政策
-7-8 ：行业重要动态，有明显商业或学术价值
-5-6 ：常规行业新闻，有一定参考价值
-1-4 ：普通资讯，信息价值有限
+importance_score 评分标准（1-100整数）：
+90-100：重大监管决定 / 突破性研究 / 影响整个行业的政策
+70-89 ：行业重要动态，有明显商业或学术价值
+50-69 ：常规行业新闻，有一定参考价值
+1-49  ：普通资讯，信息价值有限
+
+注意：只有 85 分及以上的新闻才有资格进入每日精选，请严格区分。
 """


@@ -70,7 +72,8 @@ async def _analyze_article(client: LLMClient, title: str, content: str, language


 async def _select_top_10(db: AsyncSession, target: date):
-    """Reset featured flags and elect TOP 10 with category diversity."""
+    """Reset featured flags and elect TOP 10 with category diversity.
+    Only news with importance_score >= 85 is eligible for 精选."""
    result = await db.execute(
        select(ProcessedNews)
        .where(func.date(ProcessedNews.processed_at) == target)
@@ -78,25 +81,28 @@ async def _select_top_10(db: AsyncSession, target: date):
    )
    all_news = result.scalars().all()

-    # Reset
+    # Reset all
    for n in all_news:
        n.is_featured = False
        n.featured_rank = None

+    # Only candidates with score >= 85
+    candidates = [n for n in all_news if n.importance_score >= 85]
+
    categories = ["药品监管", "临床研究", "行业动态", "政策法规"]
    selected: list[ProcessedNews] = []
    seen_cats: set[str] = set()

-    # First pass: one guaranteed per category
+    # First pass: one guaranteed per category (from high-score candidates)
    for cat in categories:
-        for n in all_news:
+        for n in candidates:
            if n.category == cat and cat not in seen_cats and n not in selected:
                selected.append(n)
                seen_cats.add(cat)
                break

-    # Second pass: fill up to 10 by score
-    for n in all_news:
+    # Second pass: fill up to 10 by score (still from candidates only)
+    for n in candidates:
        if len(selected) >= 10:
            break
        if n not in selected:
@@ -141,6 +147,7 @@ async def run_daily_pipeline(db: AsyncSession):
                title=item["title"],
                url=item["url"],
                raw_content=item["content"],
+                image_url=item.get("image_url"),
                published_at=item["published_at"],
            ))
            raw_added += 1
@@ -170,11 +177,12 @@ async def run_daily_pipeline(db: AsyncSession):
                summary=analysis.get("summary", ""),
                opinion=analysis.get("opinion"),
                keywords=analysis.get("keywords", []),
-                importance_score=float(analysis.get("importance_score", 5.0)),
+                importance_score=float(analysis.get("importance_score", 50.0)),
                importance_reason=analysis.get("importance_reason"),
                category=analysis.get("category", "行业动态"),
                source_name=raw.source.name if raw.source else "",
                source_url=raw.url,
+                image_url=raw.image_url,
                published_at=raw.published_at,
            ))
            raw.status = "processed"
--- a/backend/app/api/pycache/news.cpython-314.pyc
+++ b/backend/app/api/pycache/news.cpython-314.pyc
--- a/backend/app/api/news.py
+++ b/backend/app/api/news.py
@@ -24,6 +24,7 @@ def _serialize(n: ProcessedNews) -> dict:
        "featured_rank": n.featured_rank,
        "source_name": n.source_name or "",
        "source_url": n.source_url or "",
+        "image_url": n.image_url or None,
        "published_at": n.published_at.isoformat() if n.published_at else None,
        "processed_at": n.processed_at.isoformat() if n.processed_at else None,
    }
--- a/backend/app/crawler/pycache/rss_fetcher.cpython-314.pyc
+++ b/backend/app/crawler/pycache/rss_fetcher.cpython-314.pyc
--- a/backend/app/crawler/rss_fetcher.py
+++ b/backend/app/crawler/rss_fetcher.py
@@ -25,6 +25,31 @@ def _parse_date(raw: str) -> Optional[datetime]:
        return None


+def _extract_image(entry) -> Optional[str]:
+    """Return the first non-empty image URL on a feed entry, trying media_thumbnail, media_content, then enclosures; None if none match."""
+    # <media:thumbnail>: only the first thumbnail is inspected; a blank URL falls through to the next source.
+    thumbnails = getattr(entry, "media_thumbnail", [])
+    if thumbnails:
+        url = thumbnails[0].get("url", "").strip()
+        if url:
+            return url
+    # <media:content>: accept items whose medium is "image" or whose type starts with "image/".
+    for mc in getattr(entry, "media_content", []):
+        mc_type = mc.get("type", "")
+        mc_medium = mc.get("medium", "")
+        if mc_medium == "image" or mc_type.startswith("image/"):
+            url = mc.get("url", "").strip()
+            if url:
+                return url
+    # <enclosure>: only types starting with "image/"; the URL is read from "href", falling back to "url".
+    for enc in getattr(entry, "enclosures", []):
+        if enc.get("type", "").startswith("image/"):
+            url = (enc.get("href") or enc.get("url") or "").strip()
+            if url:
+                return url
+    return None
+
+
 async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
    try:
        async with httpx.AsyncClient(headers=HEADERS, timeout=30, follow_redirects=True) as client:
@@ -54,6 +79,7 @@ async def fetch_rss(url: str, max_items: int = 30) -> list[dict]:
            "url": link,
            "content": content[:3000],
            "published_at": _parse_date(published_raw),
+            "image_url": _extract_image(entry),
        })

    logger.info(f"RSS {url}: got {len(items)} items")
--- a/backend/app/models/pycache/news.cpython-314.pyc
+++ b/backend/app/models/pycache/news.cpython-314.pyc
--- a/backend/app/models/news.py
+++ b/backend/app/models/news.py
@@ -28,6 +28,7 @@ class RawNews(Base):
    title: Mapped[str] = mapped_column(String(500))
    url: Mapped[str] = mapped_column(String(1000), unique=True)
    raw_content: Mapped[Optional[str]] = mapped_column(Text)
+    image_url: Mapped[Optional[str]] = mapped_column(String(2000))
    published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
    crawled_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    status: Mapped[str] = mapped_column(String(20), default="pending")  # pending|processed|skipped|error
@@ -52,6 +53,7 @@ class ProcessedNews(Base):
    featured_rank: Mapped[Optional[int]] = mapped_column(Integer)
    source_name: Mapped[Optional[str]] = mapped_column(String(200))
    source_url: Mapped[Optional[str]] = mapped_column(String(1000))
+    image_url: Mapped[Optional[str]] = mapped_column(String(2000))
    published_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
    processed_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)