修改了文档解析的接口

This commit is contained in:
jinye_huang 2025-07-23 10:25:35 +08:00
parent a1a282548c
commit 8a6d1328f8
4 changed files with 95 additions and 39 deletions

View File

@@ -7,53 +7,83 @@
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field, root_validator, validator
class Base64Document(BaseModel):
    """A single uploaded document, transported as a base64 string.

    Replaces the previous server-side-path upload mechanism: clients now
    embed file bytes directly in the request body.
    """
    # Original filename; its extension is used downstream to pick a parser.
    filename: str = Field(..., description="文件名")
    # Base64-encoded raw file bytes (decoded with base64.b64decode by the route).
    content: str = Field(..., description="Base64编码的文件内容")
    # Declared MIME type, e.g. "application/pdf" — not validated here.
    mime_type: str = Field(..., description="文件MIME类型")
class ContentIntegrationRequest(BaseModel):
    """Content-integration request.

    Three valid modes:
      * documents only (pure document parsing),
      * keywords + cookies only (pure Xiaohongshu search),
      * both combined.
    Cross-mode consistency is enforced by ``validate_request``.
    """

    # Base64-encoded documents (optional; replaces the old document_paths field).
    documents: Optional[List[Base64Document]] = Field(default=None, description="Base64编码的文档列表")
    keywords: Optional[List[str]] = Field(default=None, description="搜索关键词列表")
    cookies: Optional[str] = Field(default=None, description="小红书Cookie字符串")

    # Xiaohongshu search configuration.
    sort_type: Optional[int] = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
    note_type: Optional[int] = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
    note_time: Optional[int] = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
    note_range: Optional[int] = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
    pos_distance: Optional[int] = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
    query_num: Optional[int] = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")

    # Output directory; kept with its default for backward compatibility
    # (no longer present in the schema example).
    output_path: str = Field(default="data/output", description="输出目录路径")

    @validator('documents')
    def validate_documents(cls, v):
        """Reject an explicitly empty document list; None (absent) is allowed."""
        if v is not None and not v:
            raise ValueError("如果提供文档,列表不能为空")
        return v

    @validator('keywords')
    def validate_keywords(cls, v):
        """Strip and deduplicate keywords; reject empty or all-blank lists."""
        if v is not None:
            if not v:
                raise ValueError("如果提供关键词,列表不能为空")
            # Drop blank entries and duplicates (set order is unspecified).
            cleaned = list(set(k.strip() for k in v if k.strip()))
            if not cleaned:
                raise ValueError("关键词列表不能全为空")
            return cleaned
        return v

    @validator('cookies')
    def validate_cookies(cls, v):
        """Trim the cookie string; reject whitespace-only values."""
        if v is not None:
            if not v.strip():
                raise ValueError("如果提供Cookie不能为空")
            return v.strip()
        return v

    @root_validator
    def validate_request(cls, values):
        """Cross-field mode check: documents and/or (keywords AND cookies).

        Was ``@validator('*')`` keyed on ``field.name == 'documents'`` — but
        pydantic v1 validates fields in declaration order, so when the check
        ran for ``documents`` (the first field) ``values`` did not yet contain
        ``keywords``/``cookies``, and valid search-only requests were
        rejected. A root validator sees all fields at once.
        """
        has_documents = values.get('documents') is not None
        has_keywords = values.get('keywords') is not None
        has_cookies = values.get('cookies') is not None
        if not has_documents and not (has_keywords and has_cookies):
            raise ValueError("必须提供文档或(关键词和Cookie)中的至少一项")
        if has_keywords and not has_cookies:
            raise ValueError("提供关键词时必须提供Cookie")
        if has_cookies and not has_keywords:
            raise ValueError("提供Cookie时必须提供关键词")
        return values

    class Config:
        schema_extra = {
            "example": {
                "documents": [
                    {
                        "filename": "travel_guide.pdf",
                        "content": "base64_encoded_content_here",
                        "mime_type": "application/pdf"
                    }
                ],
                "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
                "cookies": "a1=your_cookie_value; web_session=your_session_value",
                # NOTE(review): the next two entries fall between diff hunks
                # and were reconstructed from the field defaults — confirm
                # against the original file.
                "sort_type": 2,
                "note_type": 2,
                "note_time": 0,
                "note_range": 0,
                "pos_distance": 0,
                "query_num": 10
            }
        }

View File

@@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
class TopicRequest(BaseModel):
    """Topic-generation request.

    The diff extraction left both the old and the new ``numTopics``
    declarations in place; only the new upper bound (``le=30``, raised
    from 10 in this commit) is kept.
    """
    # Single date, comma-separated dates, or a range like '2023-01-01 to 2023-01-31'.
    dates: Optional[str] = Field(None, description="日期字符串,可能为单个日期、多个日期用逗号分隔或范围如'2023-01-01 to 2023-01-31'")
    # Number of topics to generate (bound raised 10 -> 30 by this commit).
    numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=30)
    styleIds: Optional[List[int]] = Field(None, description="风格ID列表")
    audienceIds: Optional[List[int]] = Field(None, description="受众ID列表")
    scenicSpotIds: Optional[List[int]] = Field(None, description="景区ID列表")

View File

@@ -6,7 +6,10 @@
"""
import logging
from fastapi import APIRouter, HTTPException, BackgroundTasks
import tempfile
import os
import base64
from fastapi import APIRouter, HTTPException
from typing import Dict, Any
from api.models.content_integration import (
@@ -29,8 +32,8 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
整合文档和小红书笔记内容
该接口将
1. 读取用户上传的文档文件支持PDFWord图片等格式
2. 根据关键词搜索小红书相关笔记
1. 处理用户上传的base64编码文档(支持PDF、Word、图片等格式)
2. 根据关键词搜索小红书相关笔记(可选)
3. 使用LLM将两者整合成综合性旅游资料
Args:
@@ -42,17 +45,33 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
Raises:
HTTPException: 当请求参数无效或处理失败时
"""
temp_files = []
try:
if request.document_paths is None:
request.document_paths = []
logger.info(f"收到内容整合请求:文档 {len(request.document_paths)} 个,关键词 {len(request.keywords)}")
# 创建临时文件处理base64文档
if request.documents:
temp_files = []
for doc in request.documents:
try:
# 创建临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(doc.filename)[1]) as temp_file:
# 解码base64内容并写入临时文件
content = base64.b64decode(doc.content)
temp_file.write(content)
temp_files.append(temp_file.name)
except Exception as e:
logger.error(f"处理文档 {doc.filename} 失败: {e}")
raise HTTPException(
status_code=400,
detail=f"文档 {doc.filename} 处理失败: {str(e)}"
)
logger.info(f"收到内容整合请求:文档 {len(temp_files) if temp_files else 0} 个,关键词 {len(request.keywords) if request.keywords else 0}")
# 调用服务层处理
result = await integration_service.integrate_content(
document_paths=request.document_paths,
document_paths=temp_files,
keywords=request.keywords,
cookies=request.cookies,
output_path=request.output_path,
sort_type=request.sort_type,
note_type=request.note_type,
note_time=request.note_time,
@@ -98,6 +117,14 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
status_code=500,
detail=f"内容整合处理失败:{str(e)}"
)
finally:
# 清理临时文件
for temp_file in temp_files:
try:
if os.path.exists(temp_file):
os.unlink(temp_file)
except Exception as e:
logger.error(f"清理临时文件 {temp_file} 失败: {e}")
@router.get("/health")