From 8a6d1328f88d419b4dae1b10f1ea8ab0743b12cd Mon Sep 17 00:00:00 2001
From: jinye_huang
Date: Wed, 23 Jul 2025 10:25:35 +0800
Subject: [PATCH] Modify the document parsing interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api/models/__pycache__/tweet.cpython-312.pyc | Bin 9875 -> 9875 bytes
 api/models/content_integration.py            | 89 ++++++++++++-------
 api/models/tweet.py                          |  2 +-
 api/routers/content_integration.py           | 43 +++++++--
 4 files changed, 95 insertions(+), 39 deletions(-)

diff --git a/api/models/__pycache__/tweet.cpython-312.pyc b/api/models/__pycache__/tweet.cpython-312.pyc
index e19f132ae9217fb2c77ee7278747cdc5c38cd000..905203c6ba2a46e9586b7902f37742188235cac8 100644
GIT binary patch
delta 27
hcmbR2JK2}}G%qg~0}!k|T)UAwnVC^;b3U_*G5~FY2UP$7

delta 27
hcmbR2JK2}}G%qg~0}zOom2c!uW@hBtoX@PH3;237z7

diff --git a/api/models/content_integration.py b/api/models/content_integration.py
index ab0a35a..1d62920 100644
--- a/api/models/content_integration.py
+++ b/api/models/content_integration.py
@@ -7,53 +7,83 @@
 
 from typing import List, Optional, Dict, Any, Union
 from pydantic import BaseModel, Field, validator
+from typing import Dict
+
+
+class Base64Document(BaseModel):
+    """Base64编码的文档模型"""
+    filename: str = Field(..., description="文件名")
+    content: str = Field(..., description="Base64编码的文件内容")
+    mime_type: str = Field(..., description="文件MIME类型")
 
 
 class ContentIntegrationRequest(BaseModel):
     """内容整合请求模型"""
-    document_paths: Optional[List[str]] = Field(default=None, description="文档文件路径列表(可选,纯搜索模式时可为空)")
-    keywords: List[str] = Field(..., description="搜索关键词列表", min_length=1)
-    cookies: str = Field(..., description="小红书Cookie字符串")
+    documents: Optional[List[Base64Document]] = Field(default=None, description="Base64编码的文档列表")
+    keywords: Optional[List[str]] = Field(default=None, description="搜索关键词列表")
+    cookies: Optional[str] = Field(default=None, description="小红书Cookie字符串")
 
     # 小红书搜索配置
-    sort_type: int = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
-    note_type: int = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
-    note_time: int = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
-    note_range: int = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
-    pos_distance: int = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
-    query_num: int = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")
+    sort_type: Optional[int] = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
+    note_type: Optional[int] = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
+    note_time: Optional[int] = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
+    note_range: Optional[int] = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
+    pos_distance: Optional[int] = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
+    query_num: Optional[int] = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")
 
-    # 输出配置
-    output_path: str = Field(default="data/output", description="输出目录路径")
-
-    @validator('document_paths')
-    def validate_document_paths(cls, v):
+    @validator('documents')
+    def validate_documents(cls, v):
         if v is not None and not v:
-            raise ValueError("如果提供文档路径,列表不能为空")
+            raise ValueError("如果提供文档,列表不能为空")
         return v
 
     @validator('keywords')
     def validate_keywords(cls, v):
-        if not v:
-            raise ValueError("关键词列表不能为空")
-        # 去除空字符串和重复关键词
-        cleaned = list(set(k.strip() for k in v if k.strip()))
-        if not cleaned:
-            raise ValueError("关键词列表不能全为空")
-        return cleaned
+        if v is not None:
+            if not v:
+                raise ValueError("如果提供关键词,列表不能为空")
+            # 去除空字符串和重复关键词
+            cleaned = list(set(k.strip() for k in v if k.strip()))
+            if not cleaned:
+                raise ValueError("关键词列表不能全为空")
+            return cleaned
+        return v
 
     @validator('cookies')
     def validate_cookies(cls, v):
-        if not v or not v.strip():
-            raise ValueError("Cookie不能为空")
-        return v.strip()
+        if v is not None:
+            if not v.strip():
+                raise ValueError("如果提供Cookie,不能为空")
+            return v.strip()
+        return v
+
+    @validator('query_num', always=True)
+    def validate_request(cls, v, values):
+        # query_num is declared last, so documents/keywords/cookies are already in values here
+        has_documents = values.get('documents') is not None
+        has_keywords = values.get('keywords') is not None
+        has_cookies = values.get('cookies') is not None
+
+        if not has_documents and not (has_keywords and has_cookies):
+            raise ValueError("必须提供文档或(关键词和Cookie)中的至少一项")
+
+        if has_keywords and not has_cookies:
+            raise ValueError("提供关键词时必须提供Cookie")
+
+        if has_cookies and not has_keywords:
+            raise ValueError("提供Cookie时必须提供关键词")
+
+        return v
 
     class Config:
         schema_extra = {
             "example": {
-                "document_paths": [
-                    "uploads/travel_guide.pdf",
-                    "uploads/attraction_info.docx"
+                "documents": [
+                    {
+                        "filename": "travel_guide.pdf",
+                        "content": "base64_encoded_content_here",
+                        "mime_type": "application/pdf"
+                    }
                 ],
                 "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
                 "cookies": "a1=your_cookie_value; web_session=your_session_value",
@@ -62,8 +92,7 @@ class ContentIntegrationRequest(BaseModel):
                 "note_time": 0,
                 "note_range": 0,
                 "pos_distance": 0,
-                "query_num": 10,
-                "output_path": "data/output"
+                "query_num": 10
             }
         }
 
diff --git a/api/models/tweet.py b/api/models/tweet.py
index c962043..574dc13 100644
--- a/api/models/tweet.py
+++ b/api/models/tweet.py
@@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
 class TopicRequest(BaseModel):
     """选题生成请求模型"""
     dates: Optional[str] = Field(None, description="日期字符串,可能为单个日期、多个日期用逗号分隔或范围如'2023-01-01 to 2023-01-31'")
-    numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=10)
+    numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=30)
     styleIds: Optional[List[int]] = Field(None, description="风格ID列表")
     audienceIds: Optional[List[int]] = Field(None, description="受众ID列表")
     scenicSpotIds: Optional[List[int]] = Field(None, description="景区ID列表")
diff --git a/api/routers/content_integration.py b/api/routers/content_integration.py
index d248d45..f384433 100644
--- a/api/routers/content_integration.py
+++ b/api/routers/content_integration.py
@@ -6,7 +6,10 @@
 """
 
 import logging
-from fastapi import APIRouter, HTTPException, BackgroundTasks
+import tempfile
+import os
+import base64
+from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 
 from api.models.content_integration import (
@@ -29,8 +32,8 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
     整合文档和小红书笔记内容
 
     该接口将:
-    1. 读取用户上传的文档文件(支持PDF、Word、图片等格式)
-    2. 根据关键词搜索小红书相关笔记
+    1. 处理用户上传的base64编码文档(支持PDF、Word、图片等格式)
+    2. 根据关键词搜索小红书相关笔记(可选)
     3. 使用LLM将两者整合成综合性旅游资料
 
     Args:
@@ -42,17 +45,33 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
     Raises:
         HTTPException: 当请求参数无效或处理失败时
     """
+    temp_files = []
     try:
-        if request.document_paths is None:
-            request.document_paths = []
-        logger.info(f"收到内容整合请求:文档 {len(request.document_paths)} 个,关键词 {len(request.keywords)} 个")
+        # 创建临时文件处理base64文档
+        if request.documents:
+            temp_files = []
+            for doc in request.documents:
+                try:
+                    # 创建临时文件
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(doc.filename)[1]) as temp_file:
+                        # 解码base64内容并写入临时文件
+                        content = base64.b64decode(doc.content)
+                        temp_file.write(content)
+                        temp_files.append(temp_file.name)
+                except Exception as e:
+                    logger.error(f"处理文档 {doc.filename} 失败: {e}")
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"文档 {doc.filename} 处理失败: {str(e)}"
+                    )
+
+        logger.info(f"收到内容整合请求:文档 {len(temp_files) if temp_files else 0} 个,关键词 {len(request.keywords) if request.keywords else 0} 个")
 
         # 调用服务层处理
         result = await integration_service.integrate_content(
-            document_paths=request.document_paths,
+            document_paths=temp_files,
             keywords=request.keywords,
             cookies=request.cookies,
-            output_path=request.output_path,
             sort_type=request.sort_type,
             note_type=request.note_type,
             note_time=request.note_time,
@@ -98,6 +117,14 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
             status_code=500,
             detail=f"内容整合处理失败:{str(e)}"
         )
+    finally:
+        # 清理临时文件
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except Exception as e:
+                logger.error(f"清理临时文件 {temp_file} 失败: {e}")
 
 
 @router.get("/health")
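
Below is a minimal client-side sketch of how the new base64-based request body defined by Base64Document and ContentIntegrationRequest can be built and sent. It is illustrative only: the host, port and route path are assumptions, since the POST route's path is not visible in this diff.

# Illustrative client sketch (not part of the patch): encode a local file as base64
# and POST it to the content-integration endpoint.
import base64
import json
import urllib.request

API_URL = "http://localhost:8000/content-integration/integrate"  # assumed host and route path


def build_payload(file_path: str, keywords: list[str], cookies: str) -> dict:
    # Read and base64-encode the document, matching the Base64Document model.
    with open(file_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return {
        "documents": [
            {
                "filename": file_path.rsplit("/", 1)[-1],
                "content": encoded,
                "mime_type": "application/pdf",
            }
        ],
        # keywords and cookies are optional, but must be supplied together.
        "keywords": keywords,
        "cookies": cookies,
        "query_num": 10,
    }


if __name__ == "__main__":
    payload = build_payload("travel_guide.pdf", ["北京旅游", "故宫攻略"], "a1=...; web_session=...")
    req = urllib.request.Request(
        API_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(resp.read().decode("utf-8"))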
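
A second illustrative sketch exercises ContentIntegrationRequest's cross-field rules directly: a documents-only request is accepted, while keywords supplied without cookies are rejected. It assumes the models are importable as api.models.content_integration and that Pydantic v1-style validators are in use, as in the patch.

# Illustrative check (not part of the patch) of the request model's cross-field rules.
import base64

from pydantic import ValidationError

from api.models.content_integration import Base64Document, ContentIntegrationRequest

# Documents-only request: valid; keywords and cookies may stay unset.
doc = Base64Document(
    filename="guide.pdf",
    content=base64.b64encode(b"%PDF-1.4 example").decode("ascii"),
    mime_type="application/pdf",
)
req = ContentIntegrationRequest(documents=[doc])
assert req.keywords is None and req.cookies is None

# Keywords without cookies: rejected by the cross-field validator.
try:
    ContentIntegrationRequest(keywords=["北京旅游"])
except ValidationError as exc:
    print("rejected as expected:", exc.errors()[0]["msg"])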