修改了文档解析的接口

This commit is contained in:
jinye_huang 2025-07-23 10:25:35 +08:00
parent a1a282548c
commit 8a6d1328f8
4 changed files with 95 additions and 39 deletions

View File

@@ -7,53 +7,83 @@
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field, root_validator, validator
class Base64Document(BaseModel):
    """A single uploaded document, transported as a base64 string.

    Replaces the previous server-side-path upload mechanism: clients now
    embed file bytes directly in the request body.
    """
    # Original filename; its extension is used downstream to pick a parser.
    filename: str = Field(..., description="文件名")
    # Base64-encoded raw file bytes (decoded with base64.b64decode by the route).
    content: str = Field(..., description="Base64编码的文件内容")
    # Declared MIME type, e.g. "application/pdf" — not validated here.
    mime_type: str = Field(..., description="文件MIME类型")
class ContentIntegrationRequest(BaseModel):
    """Content-integration request.

    Three valid modes:
      * documents only (pure document parsing),
      * keywords + cookies only (pure Xiaohongshu search),
      * both combined.
    Cross-mode consistency is enforced by ``validate_request``.
    """

    # Base64-encoded documents (optional; replaces the old document_paths field).
    documents: Optional[List[Base64Document]] = Field(default=None, description="Base64编码的文档列表")
    keywords: Optional[List[str]] = Field(default=None, description="搜索关键词列表")
    cookies: Optional[str] = Field(default=None, description="小红书Cookie字符串")

    # Xiaohongshu search configuration.
    sort_type: Optional[int] = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏")
    note_type: Optional[int] = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记")
    note_time: Optional[int] = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内")
    note_range: Optional[int] = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注")
    pos_distance: Optional[int] = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近")
    query_num: Optional[int] = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量")

    # Output directory; kept with its default for backward compatibility
    # (no longer present in the schema example).
    output_path: str = Field(default="data/output", description="输出目录路径")

    @validator('documents')
    def validate_documents(cls, v):
        """Reject an explicitly empty document list; None (absent) is allowed."""
        if v is not None and not v:
            raise ValueError("如果提供文档,列表不能为空")
        return v

    @validator('keywords')
    def validate_keywords(cls, v):
        """Strip and deduplicate keywords; reject empty or all-blank lists."""
        if v is not None:
            if not v:
                raise ValueError("如果提供关键词,列表不能为空")
            # Drop blank entries and duplicates (set order is unspecified).
            cleaned = list(set(k.strip() for k in v if k.strip()))
            if not cleaned:
                raise ValueError("关键词列表不能全为空")
            return cleaned
        return v

    @validator('cookies')
    def validate_cookies(cls, v):
        """Trim the cookie string; reject whitespace-only values."""
        if v is not None:
            if not v.strip():
                raise ValueError("如果提供Cookie不能为空")
            return v.strip()
        return v

    @root_validator
    def validate_request(cls, values):
        """Cross-field mode check: documents and/or (keywords AND cookies).

        Was ``@validator('*')`` keyed on ``field.name == 'documents'`` — but
        pydantic v1 validates fields in declaration order, so when the check
        ran for ``documents`` (the first field) ``values`` did not yet contain
        ``keywords``/``cookies``, and valid search-only requests were
        rejected. A root validator sees all fields at once.
        """
        has_documents = values.get('documents') is not None
        has_keywords = values.get('keywords') is not None
        has_cookies = values.get('cookies') is not None
        if not has_documents and not (has_keywords and has_cookies):
            raise ValueError("必须提供文档或(关键词和Cookie)中的至少一项")
        if has_keywords and not has_cookies:
            raise ValueError("提供关键词时必须提供Cookie")
        if has_cookies and not has_keywords:
            raise ValueError("提供Cookie时必须提供关键词")
        return values

    class Config:
        schema_extra = {
            "example": {
                "documents": [
                    {
                        "filename": "travel_guide.pdf",
                        "content": "base64_encoded_content_here",
                        "mime_type": "application/pdf"
                    }
                ],
                "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
                "cookies": "a1=your_cookie_value; web_session=your_session_value",
                # NOTE(review): the next two entries fall between diff hunks
                # and were reconstructed from the field defaults — confirm
                # against the original file.
                "sort_type": 2,
                "note_type": 2,
                "note_time": 0,
                "note_range": 0,
                "pos_distance": 0,
                "query_num": 10
            }
        }

View File

@@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
class TopicRequest(BaseModel):
    """Topic-generation request.

    The diff extraction left both the old and the new ``numTopics``
    declarations in place; only the new upper bound (``le=30``, raised
    from 10 in this commit) is kept.
    """
    # Single date, comma-separated dates, or a range like '2023-01-01 to 2023-01-31'.
    dates: Optional[str] = Field(None, description="日期字符串,可能为单个日期、多个日期用逗号分隔或范围如'2023-01-01 to 2023-01-31'")
    # Number of topics to generate (bound raised 10 -> 30 by this commit).
    numTopics: int = Field(5, description="要生成的选题数量", ge=1, le=30)
    styleIds: Optional[List[int]] = Field(None, description="风格ID列表")
    audienceIds: Optional[List[int]] = Field(None, description="受众ID列表")
    scenicSpotIds: Optional[List[int]] = Field(None, description="景区ID列表")

View File

@@ -6,7 +6,10 @@
"""
import logging
from fastapi import APIRouter, HTTPException, BackgroundTasks
import tempfile
import os
import base64
from fastapi import APIRouter, HTTPException
from typing import Dict, Any
from api.models.content_integration import (
@@ -29,8 +32,8 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
整合文档和小红书笔记内容
该接口将
1. 读取用户上传的文档文件支持PDFWord图片等格式
2. 根据关键词搜索小红书相关笔记
1. 处理用户上传的base64编码文档(支持PDF、Word、图片等格式)
2. 根据关键词搜索小红书相关笔记(可选)
3. 使用LLM将两者整合成综合性旅游资料
Args:
@@ -42,17 +45,33 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
Raises:
HTTPException: 当请求参数无效或处理失败时
"""
temp_files = []
try:
if request.document_paths is None:
request.document_paths = []
logger.info(f"收到内容整合请求:文档 {len(request.document_paths)} 个,关键词 {len(request.keywords)}")
# 创建临时文件处理base64文档
if request.documents:
temp_files = []
for doc in request.documents:
try:
# 创建临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(doc.filename)[1]) as temp_file:
# 解码base64内容并写入临时文件
content = base64.b64decode(doc.content)
temp_file.write(content)
temp_files.append(temp_file.name)
except Exception as e:
logger.error(f"处理文档 {doc.filename} 失败: {e}")
raise HTTPException(
status_code=400,
detail=f"文档 {doc.filename} 处理失败: {str(e)}"
)
logger.info(f"收到内容整合请求:文档 {len(temp_files) if temp_files else 0} 个,关键词 {len(request.keywords) if request.keywords else 0}")
# 调用服务层处理
result = await integration_service.integrate_content(
document_paths=request.document_paths,
document_paths=temp_files,
keywords=request.keywords,
cookies=request.cookies,
output_path=request.output_path,
sort_type=request.sort_type,
note_type=request.note_type,
note_time=request.note_time,
@@ -98,6 +117,14 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
status_code=500,
detail=f"内容整合处理失败:{str(e)}"
)
finally:
# 清理临时文件
for temp_file in temp_files:
try:
if os.path.exists(temp_file):
os.unlink(temp_file)
except Exception as e:
logger.error(f"清理临时文件 {temp_file} 失败: {e}")
@router.get("/health")