From 8a6d1328f88d419b4dae1b10f1ea8ab0743b12cd Mon Sep 17 00:00:00 2001
From: jinye_huang
Date: Wed, 23 Jul 2025 10:25:35 +0800
Subject: [PATCH] Modify the document parsing interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api/models/__pycache__/tweet.cpython-312.pyc | Bin 9875 -> 9875 bytes
 api/models/content_integration.py            | 89 ++++++++++++-------
 api/models/tweet.py                          |  2 +-
 api/routers/content_integration.py           | 43 +++++++--
 4 files changed, 95 insertions(+), 39 deletions(-)

diff --git a/api/models/__pycache__/tweet.cpython-312.pyc b/api/models/__pycache__/tweet.cpython-312.pyc
index e19f132ae9217fb2c77ee7278747cdc5c38cd000..905203c6ba2a46e9586b7902f37742188235cac8 100644
GIT binary patch
delta 27
hcmbR2JK2}}G%qg~0}!k|T)UAwnVC^;b3U_*G5~FY2UP$7

delta 27
hcmbR2JK2}}G%qg~0}zOom2c!uW@hBtoX@PH3;237z7

diff --git a/api/models/content_integration.py b/api/models/content_integration.py
index ab0a35a..1d62920 100644
--- a/api/models/content_integration.py
+++ b/api/models/content_integration.py
@@ -7,53 +7,83 @@
 
 from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, Field, validator
+from typing import Dict
+
+
+class Base64Document(BaseModel):
+    """Base64编码的文档模型"""
+    filename: str = Field(..., description="文件名")
+    content: str = Field(..., description="Base64编码的文件内容")
+    mime_type: str = Field(..., description="文件MIME类型")
 
 
 class ContentIntegrationRequest(BaseModel):
     """内容整合请求模型"""
-    document_paths: Optional[List[str]] = Field(default=None, description="文档文件路径列表(可选,纯搜索模式时可为空)")
-    keywords: List[str] = Field(..., description="搜索关键词列表", min_length=1)
-    cookies: str = Field(..., description="小红书Cookie字符串")
+    documents: Optional[List[Base64Document]] = Field(default=None, description="Base64编码的文档列表")
+    keywords: Optional[List[str]] = Field(default=None, description="搜索关键词列表")
+    cookies: Optional[str] = Field(default=None, description="小红书Cookie字符串")
 
     # 小红书搜索配置
-    sort_type: int = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
-    note_type: int = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
-    note_time: int = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
-    note_range: int = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
-    pos_distance: int = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
-    query_num: int = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")
+    sort_type: Optional[int] = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
+    note_type: Optional[int] = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
+    note_time: Optional[int] = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
+    note_range: Optional[int] = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
+    pos_distance: Optional[int] = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
+    query_num: Optional[int] = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")
 
-    # 输出配置
-    output_path: str = Field(default="data/output", description="输出目录路径")
-
-    @validator('document_paths')
-    def validate_document_paths(cls, v):
+    @validator('documents')
+    def validate_documents(cls, v):
         if v is not None and not v:
-            raise ValueError("如果提供文档路径,列表不能为空")
+            raise ValueError("如果提供文档,列表不能为空")
         return v
 
     @validator('keywords')
     def validate_keywords(cls, v):
-        if not v:
-            raise ValueError("关键词列表不能为空")
-        # 去除空字符串和重复关键词
-        cleaned = list(set(k.strip() for k in v if k.strip()))
-        if not cleaned:
-            raise ValueError("关键词列表不能全为空")
-        return cleaned
+        if v is not None:
+            if not v:
+                raise ValueError("如果提供关键词,列表不能为空")
+            # 去除空字符串和重复关键词
+            cleaned = list(set(k.strip() for k in v if k.strip()))
+            if not cleaned:
+                raise ValueError("关键词列表不能全为空")
+            return cleaned
+        return v
 
     @validator('cookies')
     def validate_cookies(cls, v):
-        if not v or not v.strip():
-            raise ValueError("Cookie不能为空")
-        return v.strip()
+        if v is not None:
+            if not v.strip():
+                raise ValueError("如果提供Cookie,不能为空")
+            return v.strip()
+        return v
+
+    @validator('query_num', always=True)
+    def validate_request(cls, v, values):
+        # query_num is declared last, so documents/keywords/cookies are already in values here
+        has_documents = values.get('documents') is not None
+        has_keywords = values.get('keywords') is not None
+        has_cookies = values.get('cookies') is not None
+
+        if not has_documents and not (has_keywords and has_cookies):
+            raise ValueError("必须提供文档或(关键词和Cookie)中的至少一项")
+
+        if has_keywords and not has_cookies:
+            raise ValueError("提供关键词时必须提供Cookie")
+
+        if has_cookies and not has_keywords:
+            raise ValueError("提供Cookie时必须提供关键词")
+
+        return v
 
     class Config:
         schema_extra = {
             "example": {
-                "document_paths": [
-                    "uploads/travel_guide.pdf",
-                    "uploads/attraction_info.docx"
+                "documents": [
+                    {
+                        "filename": "travel_guide.pdf",
+                        "content": "base64_encoded_content_here",
+                        "mime_type": "application/pdf"
+                    }
                 ],
                 "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
                 "cookies": "a1=your_cookie_value; web_session=your_session_value",
@@ -62,8 +92,7 @@ class ContentIntegrationRequest(BaseModel):
                 "note_time": 0,
                 "note_range": 0,
                 "pos_distance": 0,
-                "query_num": 10,
-                "output_path": "data/output"
+                "query_num": 10
             }
         }
 
diff --git a/api/models/tweet.py b/api/models/tweet.py
index c962043..574dc13 100644
--- a/api/models/tweet.py
+++ b/api/models/tweet.py
@@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
 class TopicRequest(BaseModel):
     """选题生成请求模型"""
     dates: Optional[str] = Field(None, description="日期字符串,可能为单个日期、多个日期用逗号分隔或范围如'2023-01-01 to 2023-01-31'")
-    numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=10)
+    numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=30)
     styleIds: Optional[List[int]] = Field(None, description="风格ID列表")
     audienceIds: Optional[List[int]] = Field(None, description="受众ID列表")
     scenicSpotIds: Optional[List[int]] = Field(None, description="景区ID列表")
diff --git a/api/routers/content_integration.py b/api/routers/content_integration.py
index d248d45..f384433 100644
--- a/api/routers/content_integration.py
+++ b/api/routers/content_integration.py
@@ -6,7 +6,10 @@
 """
 
 import logging
-from fastapi import APIRouter, HTTPException, BackgroundTasks
+import tempfile
+import os
+import base64
+from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 
 from api.models.content_integration import (
@@ -29,8 +32,8 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
     整合文档和小红书笔记内容
 
     该接口将:
-    1. 读取用户上传的文档文件(支持PDF、Word、图片等格式)
-    2. 根据关键词搜索小红书相关笔记
+    1. 处理用户上传的base64编码文档(支持PDF、Word、图片等格式)
+    2. 根据关键词搜索小红书相关笔记(可选)
     3. 使用LLM将两者整合成综合性旅游资料
 
     Args:
@@ -42,17 +45,33 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
     Raises:
         HTTPException: 当请求参数无效或处理失败时
     """
+    temp_files = []
     try:
-        if request.document_paths is None:
-            request.document_paths = []
-        logger.info(f"收到内容整合请求:文档 {len(request.document_paths)} 个,关键词 {len(request.keywords)} 个")
+        # 创建临时文件处理base64文档
+        if request.documents:
+            temp_files = []
+            for doc in request.documents:
+                try:
+                    # 创建临时文件
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(doc.filename)[1]) as temp_file:
+                        # 解码base64内容并写入临时文件
+                        content = base64.b64decode(doc.content)
+                        temp_file.write(content)
+                        temp_files.append(temp_file.name)
+                except Exception as e:
+                    logger.error(f"处理文档 {doc.filename} 失败: {e}")
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"文档 {doc.filename} 处理失败: {str(e)}"
+                    )
+
+        logger.info(f"收到内容整合请求:文档 {len(temp_files) if temp_files else 0} 个,关键词 {len(request.keywords) if request.keywords else 0} 个")
 
         # 调用服务层处理
         result = await integration_service.integrate_content(
-            document_paths=request.document_paths,
+            document_paths=temp_files,
             keywords=request.keywords,
             cookies=request.cookies,
-            output_path=request.output_path,
             sort_type=request.sort_type,
             note_type=request.note_type,
             note_time=request.note_time,
@@ -98,6 +117,14 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
             status_code=500,
             detail=f"内容整合处理失败:{str(e)}"
         )
+    finally:
+        # 清理临时文件
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except Exception as e:
+                logger.error(f"清理临时文件 {temp_file} 失败: {e}")
 
 
 @router.get("/health")
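
Below is a minimal client-side sketch of how the new base64-based request body defined by Base64Document and ContentIntegrationRequest can be built and sent. It is illustrative only: the host, port and route path are assumptions, since the POST route's path is not visible in this diff.

# Illustrative client sketch (not part of the patch): encode a local file as base64
# and POST it to the content-integration endpoint.
import base64
import json
import urllib.request

API_URL = "http://localhost:8000/content-integration/integrate"  # assumed host and route path


def build_payload(file_path: str, keywords: list[str], cookies: str) -> dict:
    # Read and base64-encode the document, matching the Base64Document model.
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return {
        "documents": [
            {
                "filename": file_path.rsplit("/", 1)[-1],
                "content": encoded,
                "mime_type": "application/pdf",
            }
        ],
        # keywords and cookies are optional, but must be supplied together.
        "keywords": keywords,
        "cookies": cookies,
        "query_num": 10,
    }


if __name__ == "__main__":
    payload = build_payload("travel_guide.pdf", ["北京旅游", "故宫攻略"], "a1=...; web_session=...")
    req = urllib.request.Request(
        API_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(resp.read().decode("utf-8"))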
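
A second illustrative sketch exercises ContentIntegrationRequest's cross-field rules directly: a documents-only request is accepted, while keywords supplied without cookies are rejected. It assumes the models are importable as api.models.content_integration and that Pydantic v1-style validators are in use, as in the patch.

# Illustrative check (not part of the patch) of the request model's cross-field rules.
import base64

from pydantic import ValidationError

from api.models.content_integration import Base64Document, ContentIntegrationRequest

# Documents-only request: valid; keywords and cookies may stay unset.
doc = Base64Document(
    filename="guide.pdf",
    content=base64.b64encode(b"%PDF-1.4 example").decode("ascii"),
    mime_type="application/pdf",
)
req = ContentIntegrationRequest(documents=[doc])
assert req.keywords is None and req.cookies is None

# Keywords without cookies: rejected by the cross-field validator.
try:
    ContentIntegrationRequest(keywords=["北京旅游"])
except ValidationError as exc:
    print("rejected as expected:", exc.errors()[0]["msg"])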