Modified the document parsing interface
parent a1a282548c
commit 8a6d1328f8
Binary file not shown.
@@ -7,53 +7,83 @@
 from typing import List, Optional, Dict, Any, Union
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, Field, validator, root_validator
 from typing import Dict


+class Base64Document(BaseModel):
+    """Base64-encoded document model"""
+    filename: str = Field(..., description="File name")
+    content: str = Field(..., description="Base64-encoded file content")
+    mime_type: str = Field(..., description="File MIME type")
+
+
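For reference, a minimal sketch of how a client might build this payload from a local file; the helper name and file handling are illustrative, not part of this commit:

import base64
import mimetypes
import os

def to_base64_document(path: str) -> dict:
    """Encode a local file into the Base64Document shape."""
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    mime, _ = mimetypes.guess_type(path)
    return {
        "filename": os.path.basename(path),
        "content": encoded,
        "mime_type": mime or "application/octet-stream",
    }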
 class ContentIntegrationRequest(BaseModel):
     """Content integration request model"""
-    document_paths: Optional[List[str]] = Field(default=None, description="List of document file paths (optional; may be empty in search-only mode)")
-    keywords: List[str] = Field(..., description="List of search keywords", min_length=1)
-    cookies: str = Field(..., description="Xiaohongshu cookie string")
+    documents: Optional[List[Base64Document]] = Field(default=None, description="List of base64-encoded documents")
+    keywords: Optional[List[str]] = Field(default=None, description="List of search keywords")
+    cookies: Optional[str] = Field(default=None, description="Xiaohongshu cookie string")

     # Xiaohongshu search configuration
-    sort_type: int = Field(default=2, ge=0, le=4, description="Sort order: 0 overall, 1 newest, 2 most likes, 3 most comments, 4 most favorites")
-    note_type: int = Field(default=2, ge=0, le=2, description="Note type: 0 any, 1 video notes, 2 regular notes")
-    note_time: int = Field(default=0, ge=0, le=3, description="Note age: 0 any, 1 within a day, 2 within a week, 3 within half a year")
-    note_range: int = Field(default=0, ge=0, le=3, description="Note scope: 0 any, 1 viewed, 2 not viewed, 3 followed")
-    pos_distance: int = Field(default=0, ge=0, le=2, description="Distance: 0 any, 1 same city, 2 nearby")
-    query_num: int = Field(default=10, ge=1, le=50, description="Number of notes to fetch per keyword")
+    sort_type: Optional[int] = Field(default=2, ge=0, le=4, description="Sort order: 0 overall, 1 newest, 2 most likes, 3 most comments, 4 most favorites")
+    note_type: Optional[int] = Field(default=2, ge=0, le=2, description="Note type: 0 any, 1 video notes, 2 regular notes")
+    note_time: Optional[int] = Field(default=0, ge=0, le=3, description="Note age: 0 any, 1 within a day, 2 within a week, 3 within half a year")
+    note_range: Optional[int] = Field(default=0, ge=0, le=3, description="Note scope: 0 any, 1 viewed, 2 not viewed, 3 followed")
+    pos_distance: Optional[int] = Field(default=0, ge=0, le=2, description="Distance: 0 any, 1 same city, 2 nearby")
+    query_num: Optional[int] = Field(default=10, ge=1, le=50, description="Number of notes to fetch per keyword")

     # Output configuration
     output_path: str = Field(default="data/output", description="Output directory path")

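With every request field now optional, one model covers both a documents-only run and a search-only run. A sketch with illustrative values, assuming the models are imported from api.models.content_integration:

search_only = ContentIntegrationRequest(
    keywords=["北京旅游"],
    cookies="a1=cookie_value; web_session=session_value",
)
documents_only = ContentIntegrationRequest(
    documents=[Base64Document(
        filename="guide.pdf",
        content="JVBERi0xLjQ=",  # base64 of the bytes "%PDF-1.4"
        mime_type="application/pdf",
    )],
)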
-    @validator('document_paths')
-    def validate_document_paths(cls, v):
+    @validator('documents')
+    def validate_documents(cls, v):
         if v is not None and not v:
-            raise ValueError("If document paths are provided, the list cannot be empty")
+            raise ValueError("If documents are provided, the list cannot be empty")
         return v

     @validator('keywords')
     def validate_keywords(cls, v):
-        if not v:
-            raise ValueError("The keyword list cannot be empty")
-        # Drop empty strings and duplicate keywords
-        cleaned = list(set(k.strip() for k in v if k.strip()))
-        if not cleaned:
-            raise ValueError("The keyword list cannot be all empty strings")
-        return cleaned
+        if v is not None:
+            if not v:
+                raise ValueError("If keywords are provided, the list cannot be empty")
+            # Drop empty strings and duplicate keywords
+            cleaned = list(set(k.strip() for k in v if k.strip()))
+            if not cleaned:
+                raise ValueError("The keyword list cannot be all empty strings")
+            return cleaned
+        return v

     @validator('cookies')
     def validate_cookies(cls, v):
-        if not v or not v.strip():
-            raise ValueError("Cookies cannot be empty")
-        return v.strip()
+        if v is not None:
+            if not v.strip():
+                raise ValueError("If cookies are provided, they cannot be empty")
+            return v.strip()
+        return v

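What the keywords validator produces for a messy input (illustrative values):

req = ContentIntegrationRequest(
    keywords=["  故宫攻略 ", "", "故宫攻略", "长城一日游"],
    cookies="a1=cookie_value",
)
# req.keywords == ["故宫攻略", "长城一日游"] in some order: list(set(...))
# drops blanks and duplicates but does not keep input order;
# list(dict.fromkeys(...)) would preserve first-seen order if that matters.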
+    @root_validator
+    def validate_request(cls, values):
+        has_documents = values.get('documents') is not None
+        has_keywords = values.get('keywords') is not None
+        has_cookies = values.get('cookies') is not None
+
+        if not has_documents and not (has_keywords and has_cookies):
+            raise ValueError("At least one of documents or (keywords plus cookies) must be provided")
+
+        if has_keywords and not has_cookies:
+            raise ValueError("Cookies must be provided when keywords are provided")
+
+        if has_cookies and not has_keywords:
+            raise ValueError("Keywords must be provided when cookies are provided")
+
+        return values

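A quick check of the combination rule (illustrative):

from pydantic import ValidationError

try:
    ContentIntegrationRequest(keywords=["北京旅游"])  # keywords without cookies
except ValidationError as exc:
    print(exc)  # rejected by the cross-field check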
     class Config:
         schema_extra = {
             "example": {
-                "document_paths": [
-                    "uploads/travel_guide.pdf",
-                    "uploads/attraction_info.docx"
-                ],
+                "documents": [
+                    {
+                        "filename": "travel_guide.pdf",
+                        "content": "base64_encoded_content_here",
+                        "mime_type": "application/pdf"
+                    }
+                ],
                 "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
                 "cookies": "a1=your_cookie_value; web_session=your_session_value",
@@ -62,8 +92,7 @@ class ContentIntegrationRequest(BaseModel):
                 "note_time": 0,
                 "note_range": 0,
                 "pos_distance": 0,
-                "query_num": 10,
-                "output_path": "data/output"
+                "query_num": 10
             }
         }

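Putting the documented example to work, reusing the to_base64_document sketch from above; the route path and port are assumptions, not taken from this diff:

import httpx

payload = {
    "documents": [to_base64_document("travel_guide.pdf")],
    "keywords": ["北京旅游", "故宫攻略"],
    "cookies": "a1=your_cookie_value; web_session=your_session_value",
    "query_num": 10,
}
resp = httpx.post("http://localhost:8000/integrate", json=payload, timeout=300.0)
resp.raise_for_status()
print(resp.json())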
@@ -12,7 +12,7 @@ from pydantic import BaseModel, Field
 class TopicRequest(BaseModel):
     """Topic generation request model"""
     dates: Optional[str] = Field(None, description="Date string: a single date, several comma-separated dates, or a range such as '2023-01-01 to 2023-01-31'")
-    numTopics: int = Field(5, description="Number of topics to generate", ge=1, le=10)
+    numTopics: int = Field(5, description="Number of topics to generate", ge=1, le=30)
     styleIds: Optional[List[int]] = Field(None, description="List of style IDs")
     audienceIds: Optional[List[int]] = Field(None, description="List of audience IDs")
     scenicSpotIds: Optional[List[int]] = Field(None, description="List of scenic spot IDs")

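The only change here is the upper bound on numTopics; a quick illustrative check, assuming TopicRequest is importable:

TopicRequest(numTopics=30)   # now valid (le=30)
# TopicRequest(numTopics=31)  # still raises ValidationError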
@@ -6,7 +6,10 @@
 """

 import logging
-from fastapi import APIRouter, HTTPException, BackgroundTasks
+import tempfile
+import os
+import base64
+from fastapi import APIRouter, HTTPException
 from typing import Dict, Any

 from api.models.content_integration import (
@@ -29,8 +32,8 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
     Integrate document content with Xiaohongshu notes

     This endpoint will:
-    1. Read the user's uploaded document files (PDF, Word, images, and other formats)
-    2. Search Xiaohongshu for notes matching the keywords
+    1. Process the user's base64-encoded documents (PDF, Word, images, and other formats)
+    2. Search Xiaohongshu for notes matching the keywords (optional)
     3. Use an LLM to integrate both into comprehensive travel material

     Args:
@@ -42,17 +45,33 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
     Raises:
         HTTPException: if the request parameters are invalid or processing fails
     """
+    temp_files = []
     try:
-        if request.document_paths is None:
-            request.document_paths = []
-        logger.info(f"Received content integration request: {len(request.document_paths)} documents, {len(request.keywords)} keywords")
+        # Write the base64 documents to temporary files
+        if request.documents:
+            for doc in request.documents:
+                try:
+                    # Create a temporary file that keeps the original extension
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(doc.filename)[1]) as temp_file:
+                        # Decode the base64 content and write it to the temporary file
+                        content = base64.b64decode(doc.content)
+                        temp_file.write(content)
+                        temp_files.append(temp_file.name)
+                except Exception as e:
+                    logger.error(f"Failed to process document {doc.filename}: {e}")
+                    raise HTTPException(
+                        status_code=400,
+                        detail=f"Failed to process document {doc.filename}: {str(e)}"
+                    )
+
+        logger.info(f"Received content integration request: {len(temp_files)} documents, {len(request.keywords) if request.keywords else 0} keywords")
+
         # Call the service layer
         result = await integration_service.integrate_content(
-            document_paths=request.document_paths,
+            document_paths=temp_files,
             keywords=request.keywords,
             cookies=request.cookies,
             output_path=request.output_path,
             sort_type=request.sort_type,
             note_type=request.note_type,
             note_time=request.note_time,
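The decode-and-materialize step in isolation, as a standalone sketch (the helper name is illustrative):

import base64
import os
import tempfile

def materialize(filename: str, b64_content: str) -> str:
    """Decode base64 content into a temp file, keeping the original extension."""
    suffix = os.path.splitext(filename)[1]  # e.g. ".pdf", so downstream parsers can sniff the type
    data = base64.b64decode(b64_content, validate=True)  # raises binascii.Error on malformed input
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(data)
        return tmp.name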
@@ -98,6 +117,14 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
             status_code=500,
             detail=f"Content integration failed: {str(e)}"
         )
+    finally:
+        # Clean up the temporary files
+        for temp_file in temp_files:
+            try:
+                if os.path.exists(temp_file):
+                    os.unlink(temp_file)
+            except Exception as e:
+                logger.error(f"Failed to remove temporary file {temp_file}: {e}")


 @router.get("/health")
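The finally-block cleanup could equally be packaged as a context manager, which keeps the create/delete pairing in one place; a sketch of that design alternative:

import contextlib
import os

@contextlib.contextmanager
def temp_paths(paths):
    """Yield the paths, then best-effort delete them on exit."""
    try:
        yield paths
    finally:
        for p in paths:
            with contextlib.suppress(OSError):
                if os.path.exists(p):
                    os.unlink(p)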