修改了文档解析的接口

This commit is contained in:
jinye_huang 2025-07-23 10:25:35 +08:00
parent a1a282548c
commit 8a6d1328f8
4 changed files with 95 additions and 39 deletions

View File

@ -7,53 +7,83 @@
from typing import List, Optional, Dict, Any, Union from typing import List, Optional, Dict, Any, Union
from pydantic import BaseModel, Field, validator from pydantic import BaseModel, Field, validator
from typing import Dict
class Base64Document(BaseModel):
    """A single document carried inline in the request as Base64 text.

    All three fields are required. The model declares no validators, so
    ``content`` is accepted as an opaque string here.
    """

    # Name of the uploaded file.
    filename: str = Field(
        ...,
        description="文件名",
    )
    # Base64-encoded file content.
    content: str = Field(
        ...,
        description="Base64编码的文件内容",
    )
    # MIME type of the file.
    mime_type: str = Field(
        ...,
        description="文件MIME类型",
    )
class ContentIntegrationRequest(BaseModel): class ContentIntegrationRequest(BaseModel):
"""内容整合请求模型""" """内容整合请求模型"""
document_paths: Optional[List[str]] = Field(default=None, description="文档文件路径列表(可选,纯搜索模式时可为空)") documents: Optional[List[Base64Document]] = Field(default=None, description="Base64编码的文档列表")
keywords: List[str] = Field(..., description="搜索关键词列表", min_length=1) keywords: Optional[List[str]] = Field(default=None, description="搜索关键词列表")
cookies: str = Field(..., description="小红书Cookie字符串") cookies: Optional[str] = Field(default=None, description="小红书Cookie字符串")
# 小红书搜索配置 # 小红书搜索配置
sort_type: int = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏") sort_type: Optional[int] = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
note_type: int = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记") note_type: Optional[int] = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
note_time: int = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内") note_time: Optional[int] = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
note_range: int = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注") note_range: Optional[int] = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
pos_distance: int = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近") pos_distance: Optional[int] = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
query_num: int = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量") query_num: Optional[int] = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")
@validator('documents')
def validate_documents(cls, v):
    """Reject a document list that was supplied but is empty.

    ``None`` and any non-empty list are returned unchanged.

    Raises:
        ValueError: if ``v`` is an empty list.
    """
    if v is None or v:
        return v
    raise ValueError("如果提供文档,列表不能为空")
@validator('keywords')
def validate_keywords(cls, v):
    """Normalise the optional keyword list.

    Each keyword is stripped of surrounding whitespace, blank entries are
    dropped and duplicates are removed. The first occurrence of each
    keyword keeps its position, so the result is deterministic (a plain
    ``set`` would yield an arbitrary order that can differ between runs).

    Returns:
        ``None`` when no keywords were supplied, otherwise the cleaned list.

    Raises:
        ValueError: if the list is empty, or contains only blank strings.
    """
    if v is None:
        return v
    if not v:
        raise ValueError("如果提供关键词,列表不能为空")
    stripped = (keyword.strip() for keyword in v)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    cleaned = list(dict.fromkeys(keyword for keyword in stripped if keyword))
    if not cleaned:
        raise ValueError("关键词列表不能全为空")
    return cleaned
@validator('cookies')
def validate_cookies(cls, v):
    """Trim the optional cookie string and reject a blank one.

    Returns:
        ``None`` when no cookie was supplied, otherwise the cookie with
        surrounding whitespace removed.

    Raises:
        ValueError: if the cookie is empty or whitespace only.
    """
    if v is None:
        return v
    trimmed = v.strip()
    if not trimmed:
        raise ValueError("如果提供Cookie不能为空")
    return trimmed
@validator('cookies', always=True)
def validate_request(cls, v, values):
    """Check that the request carries documents and/or a keyword+cookie pair.

    The check runs on ``cookies`` because pydantic validates fields in
    declaration order and ``values`` only holds fields validated so far;
    ``cookies`` is declared after ``documents`` and ``keywords``, so all
    three are available here. ``always=True`` makes the check run even
    when ``cookies`` is omitted from the request.

    Args:
        v: The (already stripped) cookie string, or ``None``.
        values: Previously validated fields of the model.

    Returns:
        ``v`` unchanged.

    Raises:
        ValueError: if neither documents nor a complete keyword+cookie
            pair is present, or if only one of keywords/cookies is given.
    """
    if 'documents' not in values or 'keywords' not in values:
        # An earlier field failed its own validation and is missing from
        # ``values``; that error is already reported, so do not pile a
        # misleading cross-field error on top of it.
        return v

    has_documents = values['documents'] is not None
    has_keywords = values['keywords'] is not None
    has_cookies = v is not None

    if not has_documents and not (has_keywords and has_cookies):
        raise ValueError("必须提供文档或(关键词和Cookie)中的至少一项")
    if has_keywords and not has_cookies:
        raise ValueError("提供关键词时必须提供Cookie")
    if has_cookies and not has_keywords:
        raise ValueError("提供Cookie时必须提供关键词")
    return v
class Config: class Config:
schema_extra = { schema_extra = {
"example": { "example": {
"document_paths": [ "documents": [
"uploads/travel_guide.pdf", {
"uploads/attraction_info.docx" "filename": "travel_guide.pdf",
"content": "base64_encoded_content_here",
"mime_type": "application/pdf"
}
], ],
"keywords": ["北京旅游", "故宫攻略", "长城一日游"], "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
"cookies": "a1=your_cookie_value; web_session=your_session_value", "cookies": "a1=your_cookie_value; web_session=your_session_value",
@ -62,8 +92,7 @@ class ContentIntegrationRequest(BaseModel):
"note_time": 0, "note_time": 0,
"note_range": 0, "note_range": 0,
"pos_distance": 0, "pos_distance": 0,
"query_num": 10, "query_num": 10
"output_path": "data/output"
} }
} }

View File

@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
class TopicRequest(BaseModel): class TopicRequest(BaseModel):
"""选题生成请求模型""" """选题生成请求模型"""
dates: Optional[str] = Field(None, description="日期字符串,可能为单个日期、多个日期用逗号分隔或范围如'2023-01-01 to 2023-01-31'") dates: Optional[str] = Field(None, description="日期字符串,可能为单个日期、多个日期用逗号分隔或范围如'2023-01-01 to 2023-01-31'")
numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=10) numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=30)
styleIds: Optional[List[int]] = Field(None, description="风格ID列表") styleIds: Optional[List[int]] = Field(None, description="风格ID列表")
audienceIds: Optional[List[int]] = Field(None, description="受众ID列表") audienceIds: Optional[List[int]] = Field(None, description="受众ID列表")
scenicSpotIds: Optional[List[int]] = Field(None, description="景区ID列表") scenicSpotIds: Optional[List[int]] = Field(None, description="景区ID列表")

View File

@ -6,7 +6,10 @@
""" """
import logging import logging
from fastapi import APIRouter, HTTPException, BackgroundTasks import tempfile
import os
import base64
from fastapi import APIRouter, HTTPException
from typing import Dict, Any from typing import Dict, Any
from api.models.content_integration import ( from api.models.content_integration import (
@ -29,8 +32,8 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
整合文档和小红书笔记内容 整合文档和小红书笔记内容
该接口将 该接口将
1. 读取用户上传的文档文件支持PDFWord图片等格式 1. 处理用户上传的base64编码文档支持PDFWord图片等格式
2. 根据关键词搜索小红书相关笔记 2. 根据关键词搜索小红书相关笔记可选
3. 使用LLM将两者整合成综合性旅游资料 3. 使用LLM将两者整合成综合性旅游资料
Args: Args:
@ -42,17 +45,33 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
Raises: Raises:
HTTPException: 当请求参数无效或处理失败时 HTTPException: 当请求参数无效或处理失败时
""" """
temp_files = []
try: try:
if request.document_paths is None: # 创建临时文件处理base64文档
request.document_paths = [] if request.documents:
logger.info(f"收到内容整合请求:文档 {len(request.document_paths)} 个,关键词 {len(request.keywords)}") temp_files = []
for doc in request.documents:
try:
# 创建临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(doc.filename)[1]) as temp_file:
# 解码base64内容并写入临时文件
content = base64.b64decode(doc.content)
temp_file.write(content)
temp_files.append(temp_file.name)
except Exception as e:
logger.error(f"处理文档 {doc.filename} 失败: {e}")
raise HTTPException(
status_code=400,
detail=f"文档 {doc.filename} 处理失败: {str(e)}"
)
logger.info(f"收到内容整合请求:文档 {len(temp_files) if temp_files else 0} 个,关键词 {len(request.keywords) if request.keywords else 0}")
# 调用服务层处理 # 调用服务层处理
result = await integration_service.integrate_content( result = await integration_service.integrate_content(
document_paths=request.document_paths, document_paths=temp_files,
keywords=request.keywords, keywords=request.keywords,
cookies=request.cookies, cookies=request.cookies,
output_path=request.output_path,
sort_type=request.sort_type, sort_type=request.sort_type,
note_type=request.note_type, note_type=request.note_type,
note_time=request.note_time, note_time=request.note_time,
@ -98,6 +117,14 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
status_code=500, status_code=500,
detail=f"内容整合处理失败:{str(e)}" detail=f"内容整合处理失败:{str(e)}"
) )
finally:
# 清理临时文件
for temp_file in temp_files:
try:
if os.path.exists(temp_file):
os.unlink(temp_file)
except Exception as e:
logger.error(f"清理临时文件 {temp_file} 失败: {e}")
@router.get("/health") @router.get("/health")