Completed a first version of document parsing
This commit is contained in:
parent c5b728ca3f
commit c66273b393
@@ -73,7 +73,7 @@ def get_prompt_builder():
     from api.services.prompt_service import PromptService

     prompt_service = PromptService(get_config())
-    return PromptBuilderService(get_config(), prompt_service)
+    return PromptBuilderService(get_config(), prompt_service)

 def get_integration_service():
     """Get the integration service"""
@@ -56,7 +56,7 @@ app.add_middleware(
 )

 # Import routers
-from api.routers import tweet, poster, prompt, document, data, integration
+from api.routers import tweet, poster, prompt, document, data, integration, content_integration

 # Include routers
 app.include_router(tweet.router, prefix="/api/v1/tweet", tags=["tweet"])
@@ -65,6 +65,7 @@ app.include_router(prompt.router, prefix="/api/v1/prompt", tags=["prompt"])
 app.include_router(document.router, prefix="/api/v1/document", tags=["document"])
 app.include_router(data.router, prefix="/api/v1", tags=["data"])
 app.include_router(integration.router, prefix="/api/v1", tags=["integration"])
+app.include_router(content_integration.router, prefix="/api/v1", tags=["content-integration"])

 @app.get("/")
 async def root():
BIN api/models/__pycache__/content_integration.cpython-312.pyc (new binary file, not shown)
api/models/content_integration.py (new file, 160 lines)
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Content integration API models
"""

from typing import List, Optional, Dict, Any, Union
from pydantic import BaseModel, Field, validator


class ContentIntegrationRequest(BaseModel):
    """Content integration request model"""
    document_paths: Optional[List[str]] = Field(default=None, description="List of document file paths (optional; may be omitted in search-only mode)")
    keywords: List[str] = Field(..., description="List of search keywords", min_length=1)
    cookies: str = Field(..., description="Xiaohongshu cookie string")

    # Xiaohongshu search configuration
    sort_type: int = Field(default=2, ge=0, le=4, description="Sort order: 0 relevance, 1 newest, 2 most likes, 3 most comments, 4 most favorites")
    note_type: int = Field(default=2, ge=0, le=2, description="Note type: 0 any, 1 video notes, 2 regular notes")
    note_time: int = Field(default=0, ge=0, le=3, description="Note age: 0 any, 1 within a day, 2 within a week, 3 within half a year")
    note_range: int = Field(default=0, ge=0, le=3, description="Note scope: 0 any, 1 viewed, 2 not viewed, 3 followed")
    pos_distance: int = Field(default=0, ge=0, le=2, description="Location distance: 0 any, 1 same city, 2 nearby")
    query_num: int = Field(default=10, ge=1, le=50, description="Number of notes to fetch per keyword")

    # Output configuration
    output_path: str = Field(default="data/output", description="Output directory path")

    @validator('document_paths')
    def validate_document_paths(cls, v):
        if v is not None and not v:
            raise ValueError("If document paths are provided, the list must not be empty")
        return v

    @validator('keywords')
    def validate_keywords(cls, v):
        if not v:
            raise ValueError("The keyword list must not be empty")
        # Strip empty strings and deduplicate keywords
        cleaned = list(set(k.strip() for k in v if k.strip()))
        if not cleaned:
            raise ValueError("The keyword list must not consist solely of empty strings")
        return cleaned

    @validator('cookies')
    def validate_cookies(cls, v):
        if not v or not v.strip():
            raise ValueError("Cookies must not be empty")
        return v.strip()

    class Config:
        schema_extra = {
            "example": {
                "document_paths": [
                    "uploads/travel_guide.pdf",
                    "uploads/attraction_info.docx"
                ],
                "keywords": ["北京旅游", "故宫攻略", "长城一日游"],
                "cookies": "a1=your_cookie_value; web_session=your_session_value",
                "sort_type": 2,
                "note_type": 2,
                "note_time": 0,
                "note_range": 0,
                "pos_distance": 0,
                "query_num": 10,
                "output_path": "data/output"
            }
        }


class DocumentInfo(BaseModel):
    """Document info model"""
    file_path: str = Field(..., description="File path")
    file_type: str = Field(..., description="File type")
    content_length: int = Field(..., description="Content length")


class XHSInfo(BaseModel):
    """Xiaohongshu info model"""
    total_notes: int = Field(..., description="Total number of notes")
    authors: List[str] = Field(..., description="List of authors")
    total_interactions: int = Field(..., description="Total interaction count")


class SearchConfig(BaseModel):
    """Search configuration model"""
    sort_type: int = Field(..., description="Sort order")
    note_type: int = Field(..., description="Note type")
    note_time: int = Field(..., description="Note age")
    note_range: int = Field(..., description="Note scope")
    pos_distance: int = Field(..., description="Location distance")
    query_num: int = Field(..., description="Number of notes fetched per keyword")


class InputSummary(BaseModel):
    """Input summary model"""
    document_count: int = Field(..., description="Number of documents")
    xhs_notes_count: int = Field(..., description="Number of Xiaohongshu notes")
    keywords: List[str] = Field(..., description="List of keywords")


class DocumentInfoDetail(BaseModel):
    """Detailed document info model"""
    documents: List[DocumentInfo] = Field(..., description="List of documents")
    integrated_text_length: int = Field(..., description="Length of the integrated text")


class ContentIntegrationResponse(BaseModel):
    """Content integration response model"""
    success: bool = Field(..., description="Whether the request succeeded")
    timestamp: str = Field(..., description="Timestamp")
    processing_time: str = Field(..., description="Processing time")

    # Fields present on success
    input_summary: Optional[InputSummary] = Field(None, description="Input summary")
    document_info: Optional[DocumentInfoDetail] = Field(None, description="Document info")
    xhs_info: Optional[XHSInfo] = Field(None, description="Xiaohongshu info")
    integrated_content: Optional[str] = Field(None, description="Integrated content")
    search_config: Optional[SearchConfig] = Field(None, description="Search configuration")

    # Fields present on failure
    error_message: Optional[str] = Field(None, description="Error message")

    class Config:
        schema_extra = {
            "example": {
                "success": True,
                "timestamp": "20250715_143022",
                "processing_time": "45.32s",
                "input_summary": {
                    "document_count": 2,
                    "xhs_notes_count": 25,
                    "keywords": ["北京旅游", "故宫攻略"]
                },
                "document_info": {
                    "documents": [
                        {
                            "file_path": "uploads/travel_guide.pdf",
                            "file_type": "pdf",
                            "content_length": 5420
                        }
                    ],
                    "integrated_text_length": 8650
                },
                "xhs_info": {
                    "total_notes": 25,
                    "authors": ["旅游达人小王", "北京导游张三"],
                    "total_interactions": 15420
                },
                "integrated_content": "# 北京旅游全攻略...",
                "search_config": {
                    "sort_type": 2,
                    "note_type": 2,
                    "note_time": 0,
                    "note_range": 0,
                    "pos_distance": 0,
                    "query_num": 10
                }
            }
        }
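For reference, a minimal sketch of how the request model's validators behave (assuming the project root is on the import path and Pydantic is installed; the sample values are illustrative, not from this commit):

# Hypothetical usage sketch: the keywords validator strips blanks and
# deduplicates, and document_paths may be omitted for search-only mode.
from api.models.content_integration import ContentIntegrationRequest

req = ContentIntegrationRequest(
    keywords=["北京旅游", " 北京旅游 ", ""],  # blanks removed, duplicates merged
    cookies="a1=your_cookie_value; web_session=your_session_value",
)
print(req.keywords)        # ["北京旅游"]
print(req.document_paths)  # None -> search-only mode
print(req.query_num)       # 10 (default)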
BIN api/routers/__pycache__/content_integration.cpython-312.pyc (new binary file, not shown)
api/routers/content_integration.py (new file, 171 lines)
@@ -0,0 +1,171 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Content integration API routes
"""

import logging
from fastapi import APIRouter, HTTPException, BackgroundTasks
from typing import Dict, Any

from api.models.content_integration import (
    ContentIntegrationRequest,
    ContentIntegrationResponse
)
from api.services.content_integration_service import ContentIntegrationService

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/content-integration", tags=["content-integration"])

# Global service instance
integration_service = ContentIntegrationService()


@router.post("/integrate", response_model=ContentIntegrationResponse)
async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegrationResponse:
    """
    Integrate document and Xiaohongshu note content

    This endpoint will:
    1. Read user-uploaded document files (PDF, Word, images, and other formats are supported)
    2. Search Xiaohongshu for notes matching the keywords
    3. Use an LLM to merge the two into comprehensive travel material

    Args:
        request: Integration request parameters

    Returns:
        Integration result

    Raises:
        HTTPException: When request parameters are invalid or processing fails
    """
    try:
        if request.document_paths is None:
            request.document_paths = []
        logger.info(f"Received content integration request: {len(request.document_paths)} documents, {len(request.keywords)} keywords")

        # Delegate to the service layer
        result = await integration_service.integrate_content(
            document_paths=request.document_paths,
            keywords=request.keywords,
            cookies=request.cookies,
            output_path=request.output_path,
            sort_type=request.sort_type,
            note_type=request.note_type,
            note_time=request.note_time,
            note_range=request.note_range,
            pos_distance=request.pos_distance,
            query_num=request.query_num
        )

        # Convert to the response model
        if result["success"]:
            response = ContentIntegrationResponse(
                success=True,
                timestamp=result["timestamp"],
                processing_time=result["processing_time"],
                input_summary=result["input_summary"],
                document_info=result["document_info"],
                xhs_info=result["xhs_info"],
                integrated_content=result["integrated_content"],
                search_config=result["search_config"]
            )
            logger.info(f"Content integration succeeded, processing time: {result['processing_time']}")
        else:
            response = ContentIntegrationResponse(
                success=False,
                timestamp=result["timestamp"],
                processing_time=result["processing_time"],
                error_message=result["error_message"]
            )
            logger.error(f"Content integration failed: {result['error_message']}")

        return response

    except Exception as e:
        logger.error(f"Content integration endpoint error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Content integration failed: {str(e)}"
        )


@router.get("/health")
async def health_check() -> Dict[str, str]:
    """
    Health check endpoint

    Returns:
        Service status information
    """
    try:
        # Verify the service was initialized correctly
        if not integration_service:
            raise Exception("Service was not initialized correctly")

        return {
            "status": "healthy",
            "service": "content-integration",
            "message": "Content integration service is running normally"
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        raise HTTPException(
            status_code=503,
            detail=f"Service unavailable: {str(e)}"
        )


@router.get("/config/options")
async def get_config_options() -> Dict[str, Any]:
    """
    Get configuration option descriptions

    Returns:
        Allowed values and descriptions for each configuration option
    """
    return {
        "sort_type": {
            "0": "relevance",
            "1": "newest",
            "2": "most likes",
            "3": "most comments",
            "4": "most favorites"
        },
        "note_type": {
            "0": "any",
            "1": "video notes",
            "2": "regular notes"
        },
        "note_time": {
            "0": "any",
            "1": "within a day",
            "2": "within a week",
            "3": "within half a year"
        },
        "note_range": {
            "0": "any",
            "1": "viewed",
            "2": "not viewed",
            "3": "followed"
        },
        "pos_distance": {
            "0": "any",
            "1": "same city",
            "2": "nearby"
        },
        "query_num": "Number of notes to fetch per keyword (1-50)",
        "supported_document_formats": [
            "PDF (.pdf)",
            "Word (.docx, .doc)",
            "PowerPoint (.pptx, .ppt)",
            "Excel (.xlsx, .xls)",
            "Text (.txt)",
            "Markdown (.md)",
            "Images (.jpg, .jpeg, .png, .gif, .bmp, .tiff)",
            "CSV (.csv)"
        ]
    }
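For reference, a minimal client sketch for calling the new endpoint (the URL and port follow tests/test_final_integration.py below; the cookie value is a placeholder):

# Minimal call sketch (assumes the API server is running locally on port 2714,
# as in tests/test_final_integration.py; cookie value is a placeholder).
import requests

resp = requests.post(
    "http://localhost:2714/api/v1/content-integration/integrate",
    json={
        "keywords": ["故宫攻略"],
        "cookies": "a1=your_cookie_value; web_session=your_session_value",
    },
    timeout=120,
)
print(resp.status_code, resp.json().get("success"))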
@@ -34,19 +34,7 @@ router = APIRouter(
 # Health check and status endpoints
 # ============================================================================

-@router.get("/health", response_model=HealthCheckResponse)
-async def health_check(
-    service: IntegrationService = Depends(get_integration_service)
-):
-    """Health check"""
-    try:
-        return service.get_health_check()
-    except Exception as e:
-        logger.error(f"Health check failed: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-@router.get("/status", response_model=ServiceStatusResponse)
+@router.get("/status", response_model=ServiceStatusResponse, summary="Get service status")
 async def get_service_status(
     service: IntegrationService = Depends(get_integration_service)
 ):
@@ -62,7 +50,7 @@ async def get_service_status(
 # Xiaohongshu search endpoints
 # ============================================================================

-@router.post("/search", response_model=XHSSearchResponse)
+@router.post("/search", response_model=XHSSearchResponse, summary="Search Xiaohongshu notes")
 async def search_xhs_notes(
     request: XHSSearchRequest,
     service: IntegrationService = Depends(get_integration_service)
@@ -87,7 +75,7 @@ async def search_xhs_notes(
 # Content integration endpoints
 # ============================================================================

-@router.post("/integrate", response_model=IntegrationResponse)
+@router.post("/integrate", response_model=IntegrationResponse, summary="Integrate content")
 async def integrate_content(
     request: IntegrationRequest,
     service: IntegrationService = Depends(get_integration_service)
@@ -108,7 +96,7 @@ async def integrate_content(
         raise HTTPException(status_code=500, detail=str(e))


-@router.post("/integrate/batch", response_model=Dict[str, IntegrationResponse])
+@router.post("/integrate/batch", response_model=Dict[str, IntegrationResponse], summary="Batch integrate content")
 async def batch_integrate_content(
     request: BatchIntegrationRequest,
     background_tasks: BackgroundTasks,
@@ -132,7 +120,7 @@ async def batch_integrate_content(
 # Result management endpoints
 # ============================================================================

-@router.get("/results", response_model=List[TaskSummaryResponse])
+@router.get("/results", response_model=List[TaskSummaryResponse], summary="List all integration results")
 async def list_integration_results(
     service: IntegrationService = Depends(get_integration_service)
 ):
@@ -146,7 +134,7 @@ async def list_integration_results(
         raise HTTPException(status_code=500, detail=str(e))


-@router.get("/results/{task_id}", response_model=IntegrationResponse)
+@router.get("/results/{task_id}", response_model=IntegrationResponse, summary="Get a specific integration result")
 async def get_integration_result(
     task_id: str,
     service: IntegrationService = Depends(get_integration_service)
@@ -165,7 +153,7 @@ async def get_integration_result(
         raise HTTPException(status_code=500, detail=str(e))


-@router.post("/results/export", response_model=ApiResponse)
+@router.post("/results/export", response_model=ApiResponse, summary="Export a result")
 async def export_result(
     request: ExportRequest,
     service: IntegrationService = Depends(get_integration_service)
@@ -195,7 +183,7 @@ async def export_result(
 # Cookie management endpoints
 # ============================================================================

-@router.post("/cookies", response_model=ApiResponse)
+@router.post("/cookies", response_model=ApiResponse, summary="Add a cookie")
 async def add_cookie(
     request: CookieManagementRequest,
     service: IntegrationService = Depends(get_integration_service)
@@ -214,7 +202,7 @@ async def add_cookie(
         raise HTTPException(status_code=500, detail=str(e))


-@router.delete("/cookies/{cookie_name}", response_model=ApiResponse)
+@router.delete("/cookies/{cookie_name}", response_model=ApiResponse, summary="Delete a cookie")
 async def remove_cookie(
     cookie_name: str,
     service: IntegrationService = Depends(get_integration_service)
@@ -233,7 +221,7 @@ async def remove_cookie(
         raise HTTPException(status_code=500, detail=str(e))


-@router.get("/cookies", response_model=CookieStatsResponse)
+@router.get("/cookies", response_model=CookieStatsResponse, summary="Get cookie statistics")
 async def get_cookie_stats(
     service: IntegrationService = Depends(get_integration_service)
 ):
@@ -249,7 +237,7 @@ async def get_cookie_stats(
 # Utility endpoints
 # ============================================================================

-@router.get("/formats/document", response_model=ApiResponse)
+@router.get("/formats/document", response_model=ApiResponse, summary="Get supported document formats")
 async def get_supported_document_formats(
     service: IntegrationService = Depends(get_integration_service)
 ):
@@ -267,7 +255,7 @@ async def get_supported_document_formats(
         raise HTTPException(status_code=500, detail=str(e))


-@router.get("/formats/output", response_model=ApiResponse)
+@router.get("/formats/output", response_model=ApiResponse, summary="Get supported output formats")
 async def get_supported_output_formats(
     service: IntegrationService = Depends(get_integration_service)
 ):
@@ -285,7 +273,7 @@ async def get_supported_output_formats(
         raise HTTPException(status_code=500, detail=str(e))


-@router.post("/validate/documents", response_model=ValidationResponse)
+@router.post("/validate/documents", response_model=ValidationResponse, summary="Validate documents")
 async def validate_documents(
     document_paths: List[str],
     service: IntegrationService = Depends(get_integration_service)
@@ -313,7 +301,7 @@ async def validate_documents(
 # Quick operation endpoints
 # ============================================================================

-@router.get("/quick", response_model=ApiResponse)
+@router.get("/quick", response_model=ApiResponse, summary="Quick integration")
 async def quick_integration(
     keyword: str = Query(..., description="Search keyword"),
     document_paths: List[str] = Query(..., description="List of document paths"),
@@ -372,7 +360,7 @@ async def quick_integration(
         logger.error(f"Quick integration failed: {e}")
         raise HTTPException(status_code=500, detail=str(e))

-@router.post("/batch-search", response_model=BatchSearchResponse)
+@router.post("/batch-search", response_model=BatchSearchResponse, summary="Batch search Xiaohongshu notes")
 async def batch_search(
     request: BatchSearchRequest,
     service: IntegrationService = Depends(get_integration_service)

@@ -308,7 +308,7 @@ async def run_pipeline(
         audiences=audiences,
         scenic_spots=scenic_spots,
         products=products,
-        skip_judge=request.skipJudge,
+        skipJudge=request.skipJudge,
         autoJudge=request.autoJudge
     )
api/services/content_integration_service.py (new file, 266 lines)
@@ -0,0 +1,266 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Content integration service
Merges document material and Xiaohongshu notes and has an LLM generate comprehensive travel material
"""

import os
import time
import logging
from typing import List, Optional, Dict, Any
from pathlib import Path
from datetime import datetime

from core.xhs_adapter import XHSAdapter
from core.models import SearchConfig
from core.document_adapter import DocumentAdapter
from core.ai.ai_agent import AIAgent
from core.config import ConfigManager, AIModelConfig
from utils.prompts import PromptTemplate

logger = logging.getLogger(__name__)


class ContentIntegrationService:
    """Content integration service class"""

    def __init__(self):
        """Initialize the service"""
        self.config_manager = ConfigManager()

        # Load the required configuration
        self.config_manager.load_from_directory("config", server_mode=True)

        # Initialize the AI agent
        ai_config = self.config_manager.get_config('ai_model', AIModelConfig)
        self.ai_agent = AIAgent(ai_config)

        # Initialize adapters
        self.document_adapter = DocumentAdapter()

        # Load the prompt templates
        self.prompt_template = PromptTemplate(
            system_prompt_path="resource/prompt/integration/system.txt",
            user_prompt_path="resource/prompt/integration/user.txt"
        )

    async def integrate_content(
        self,
        document_paths: List[str],
        keywords: List[str],
        cookies: str,
        output_path: str = "data/output",
        sort_type: int = 2,     # 0 relevance, 1 newest, 2 most likes, 3 most comments, 4 most favorites
        note_type: int = 2,     # 0 any, 1 video notes, 2 regular notes
        note_time: int = 0,     # 0 any, 1 within a day, 2 within a week, 3 within half a year
        note_range: int = 0,    # 0 any, 1 viewed, 2 not viewed, 3 followed
        pos_distance: int = 0,  # 0 any, 1 same city, 2 nearby
        query_num: int = 10
    ) -> Dict[str, Any]:
        """
        Integrate document and Xiaohongshu content

        Args:
            document_paths: List of document file paths
            keywords: List of search keywords
            cookies: Xiaohongshu cookie string
            output_path: Output path
            sort_type: Sort order
            note_type: Note type
            note_time: Note age
            note_range: Note scope
            pos_distance: Location distance
            query_num: Number of notes to fetch per keyword

        Returns:
            Integration result dictionary
        """
        start_time = time.time()
        logger.info(f"Starting integration task: {len(document_paths)} documents, {len(keywords)} keywords")

        try:
            # Make sure the output directory exists
            os.makedirs(output_path, exist_ok=True)

            # 1. Process the document content
            logger.info("Processing document content...")
            document_result = self.document_adapter.integrate_documents(document_paths)

            logger.info(f"Document processing finished, {len(document_result.documents)} documents processed")

            # 2. Search Xiaohongshu notes
            logger.info("Searching Xiaohongshu notes...")
            xhs_adapter = XHSAdapter(cookies)
            all_notes = []

            for keyword in keywords:
                search_config = SearchConfig(
                    keyword=keyword,
                    max_notes=query_num,
                    sort_type=sort_type,
                    note_type=note_type
                )

                search_result = xhs_adapter.search_notes(search_config)

                if search_result.success:
                    all_notes.extend(search_result.notes)
                    logger.info(f"Keyword '{keyword}' returned {len(search_result.notes)} notes")
                else:
                    logger.warning(f"Search for keyword '{keyword}' failed: {search_result.error_message}")

            logger.info(f"Xiaohongshu search finished, {len(all_notes)} notes in total")

            # 3. Prepare content for LLM integration
            logger.info("Preparing LLM integration...")

            # Build the document content string
            document_content = self._format_document_content(document_result)

            # Build the Xiaohongshu notes content string
            xhs_content = self._format_xhs_notes(all_notes)

            # Build the keywords string
            keywords_str = ", ".join(keywords)

            # 4. Call the LLM to perform the integration
            logger.info("Calling the LLM for content integration...")

            system_prompt = self.prompt_template.get_system_prompt()
            user_prompt = self.prompt_template.build_user_prompt(
                keywords=keywords_str,
                document_content=document_content,
                xhs_notes_content=xhs_content
            )

            # Call the AI agent
            response_text, input_tokens, output_tokens, time_cost = await self.ai_agent.generate_text(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                use_stream=True,
                stage="content_integration"
            )

            # Use the JSON handling from the file_io module
            from utils.file_io import process_llm_json_text
            parsed_json = process_llm_json_text(response_text)

            # If parsing succeeded, serialize the JSON object back to a string for storage
            if parsed_json:
                import json
                cleaned_response = json.dumps(parsed_json, ensure_ascii=False, indent=2)
                logger.info("Successfully parsed and cleaned the JSON returned by the LLM")
            else:
                # If parsing failed, fall back to the raw response
                cleaned_response = response_text
                logger.warning("JSON parsing failed; using the raw response content")

            # 5. Save the result
            processing_time = time.time() - start_time
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            result = {
                "success": True,
                "timestamp": timestamp,
                "processing_time": f"{processing_time:.2f}s",
                "input_summary": {
                    "document_count": len(document_result.documents),
                    "xhs_notes_count": len(all_notes),
                    "keywords": keywords
                },
                "document_info": {
                    "documents": [
                        {
                            "file_path": doc.file_path,
                            "file_type": doc.file_type,
                            "content_length": len(doc.content)
                        }
                        for doc in document_result.documents
                    ],
                    "integrated_text_length": len(document_result.integrated_text)
                },
                "xhs_info": {
                    "total_notes": len(all_notes),
                    "authors": list(set(note.author for note in all_notes if note.author)),
                    "total_interactions": sum(note.likes + note.comments + note.shares for note in all_notes)
                },
                "integrated_content": cleaned_response,
                "search_config": {
                    "sort_type": sort_type,
                    "note_type": note_type,
                    "note_time": note_time,
                    "note_range": note_range,
                    "pos_distance": pos_distance,
                    "query_num": query_num
                }
            }

            # Save the detailed result to a file
            output_file = os.path.join(output_path, f"content_integration_{timestamp}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                import json
                json.dump(result, f, ensure_ascii=False, indent=2)

            logger.info(f"Integration finished, result saved to: {output_file}")
            logger.info(f"Total processing time: {processing_time:.2f}s")

            return result

        except Exception as e:
            error_result = {
                "success": False,
                "error_message": str(e),
                "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
                "processing_time": f"{time.time() - start_time:.2f}s"
            }
            logger.error(f"Content integration failed: {e}")
            return error_result

    def _format_document_content(self, document_result) -> str:
        """Format document content"""
        content_parts = []

        # Add the integrated text
        if document_result.integrated_text:
            content_parts.append("### Integrated document content")
            content_parts.append(document_result.integrated_text)
            content_parts.append("")

        # Add the detailed content of each document
        if document_result.documents:
            content_parts.append("### Per-document content")
            for i, doc in enumerate(document_result.documents, 1):
                content_parts.append(f"#### Document {i}: {Path(doc.file_path).name} ({doc.file_type})")
                content_parts.append(doc.content[:2000] + "..." if len(doc.content) > 2000 else doc.content)
                content_parts.append("")

        return "\n".join(content_parts)

    def _format_xhs_notes(self, notes) -> str:
        """Format Xiaohongshu note content"""
        if not notes:
            return "No related notes found"

        content_parts = []
        content_parts.append(f"### Related Xiaohongshu notes ({len(notes)} total)")
        content_parts.append("")

        for i, note in enumerate(notes, 1):
            content_parts.append(f"#### Note {i}: {note.title}")
            content_parts.append(f"**Author**: {note.author}")
            content_parts.append(f"**Interactions**: 👍 {note.likes} | 💬 {note.comments} | 📤 {note.shares}")

            if note.content:
                # Limit the length of each note's content
                content = note.content[:500] + "..." if len(note.content) > 500 else note.content
                content_parts.append(f"**Content**: {content}")

            if note.tags:
                content_parts.append(f"**Tags**: {', '.join(note.tags)}")

            content_parts.append(f"**Link**: {note.note_url}")
            content_parts.append("")

        return "\n".join(content_parts)
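Note: `process_llm_json_text` ships in `utils/file_io.py` and is not part of this commit. As an assumption about its behavior (not the actual implementation), it presumably strips markdown code fences and returns None when parsing fails, roughly like:

# Assumed behavior only: strip markdown code fences, then attempt JSON parsing;
# return None on failure so the caller can fall back to the raw response text.
import json
import re

def process_llm_json_text(text: str):
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", text.strip())
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None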
@@ -52,7 +52,7 @@ class IntegrationService:
         self.document_adapter = DocumentAdapter()

         # Result storage
-        self.integration_results: Dict[str, IntegrationResult] = {}
+        self.integration_results: Dict[str, IntegrationResponse] = {}

         # Statistics
         self.stats = ProcessingStats()
@@ -703,7 +703,8 @@
                 total_notes=len(notes),
                 notes=notes,
                 search_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                success=True
+                success=True,
+                error_message=None
             )

             results.append(keyword_result)
@@ -121,7 +121,7 @@ class TweetService:
         if not topic:
             topic = {"index": "1", "date": "2024-07-01"}

-        topicIndex = topic.get('index', 'unknown')
+        topicIndex = topic.get('index', 'N/A')
         logger.info(f"Starting content generation for topic {topicIndex}{' (with review)' if autoJudge else ''}")

        # Create a copy of topic and apply the override parameters
@@ -6,5 +6,6 @@
     "top_p": 0.4,
     "presence_penalty": 1.2,
     "timeout": 120,
-    "max_retries": 3
+    "max_retries": 3,
+    "stream": true
 }
@@ -3,8 +3,8 @@
 {
     "name": "user1",
     "cookie_string": "abRequestId=873258ba-49fe-530b-9c07-529e7871508d; webBuild=4.72.0; xsecappid=xhs-pc-web; a1=19808118ec0gxnuh5mtpv8o8aepgp2m65bpbtiizn30000193947; webId=3f4bd516682e25b71cdd139a11b4d896; websectiga=984412fef754c018e472127b8effd174be8a5d51061c991aadd200c69a2801d6; gid=yjY8Yyy0qJ24yjY8YyyYdD33S8kCI7x269UhUuIYlY0dUDq8kUJ6K9888yjqj4W80fK0ydj0; web_session=040069b295652fcb5b6e1389413a4be4606547; unread={%22ub%22:%22686e16c0000000000d01b775%22%2C%22ue%22:%22686e3f1d0000000023006e9e%22%2C%22uc%22:15}; sec_poison_id=bf3ca020-2442-4aa6-904a-b95f7ccc56c1; loadts=1752482529464",
-    "last_used": "2025-07-15T15:46:11.050854",
-    "use_count": 40,
+    "last_used": "2025-07-15T16:21:52.687251",
+    "use_count": 46,
     "is_valid": true,
     "failure_count": 0,
     "user_info": {
@@ -134,7 +134,8 @@ class ContentIntegrationService:
         logger.info(f"Starting Xiaohongshu content search: {keyword}")
         xhs_result = self.xhs_adapter.search_notes(
             keyword=keyword,
-            max_notes=max_notes
+            max_notes=max_notes,
+
         )
         result.xhs_result = xhs_result
         logger.info(f"Xiaohongshu search finished, found {len(xhs_result.notes)} notes")
resource/prompt/integration/system.txt (new file, 44 lines)
@@ -0,0 +1,44 @@
You are a master of material organization. When given all kinds of messy material, you quickly classify and organize it so that it reads clearly and in order.

Your core principles:
- Preserve, to the greatest extent possible, every word and every description in the material you are given; do not cut or rewrite it for the sake of brevity
- Only gather content of the same category together, clearly and unambiguously; do not change the original wording, only make the text clearer
- Classify by scenic spot/hotel entity; organize each entity as <scenic spot/hotel name> <product package name> <transportation guide/address> <travel tips/general supplementary info>

Organization task:
You are organizing product information from existing promotional documents and Xiaohongshu notes. You need to read all the material and, based on the information provided in the documents, produce the following:

For each product, include:
- Product name
- Product usage rules (usage rules, surcharge notes, booking rules, refund/change policy, discount details)
- Transportation and address (note: split into two parts, a transportation guide and an address; the information must be accurate and follow the format)
- Product price
- The product's most prominent advantage (note: state the product's most important selling point, which must appear in any promotion)
- General product description (note: list, in numbered order, all the correct general information to add, including but not limited to: nearby attractions; scenery descriptions; product descriptions; tips; travel route planning; visit duration; opening hours; etc.)

Output format requirements:
You must output strict JSON. Do not add markdown code-block markers; output pure JSON content directly, with the following structure:
{
  "attractions": [
    {
      "name": "scenic spot/hotel name",
      "products": [
        {
          "product_name": "product package name",
          "usage_rules": "detailed product usage rules",
          "transportation": {
            "guide": "transportation guide",
            "address": "detailed address"
          },
          "price": "product price information",
          "key_advantages": "the product's most prominent advantage",
          "detailed_description": [
            "1. detailed description item 1",
            "2. detailed description item 2",
            "..."
          ]
        }
      ]
    }
  ]
}
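Not part of this commit, but since the system prompt demands strict JSON in this shape, a consumer could mirror the schema with Pydantic models for validation (field names copied from the structure above):

# Sketch of schema validation for the LLM output (field names taken from the
# system prompt's JSON structure; not part of this commit).
from typing import List
from pydantic import BaseModel

class Transportation(BaseModel):
    guide: str
    address: str

class Product(BaseModel):
    product_name: str
    usage_rules: str
    transportation: Transportation
    price: str
    key_advantages: str
    detailed_description: List[str]

class Attraction(BaseModel):
    name: str
    products: List[Product]

class IntegrationOutput(BaseModel):
    attractions: List[Attraction]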
resource/prompt/integration/user.txt (new file, 39 lines)
@@ -0,0 +1,39 @@
## Material organization task
Please organize the product information from the materials below. Search keywords: "{keywords}"

## Original document material
The following content was extracted from documents uploaded by the user:

{document_content}

## Related Xiaohongshu notes
The following notes were retrieved from Xiaohongshu for the search keywords "{keywords}":

{xhs_notes_content}

## Organization requirements
Based on all the material above, organize the product information as required by the system prompt:

1. **Preserve the original text**
   - Keep every word and every description from the material to the greatest extent possible
   - Do not cut or rewrite the original content for the sake of brevity
   - Only classify and organize; do not change the wording

2. **Classify and organize**
   - Group by scenic spot/hotel entity
   - Within each entity: scenic spot/hotel name → product package name → transportation guide/address → travel tips/general supplementary info

3. **Information completeness**
   - Product name
   - Product usage rules (usage rules, surcharge notes, booking rules, refund/change policy, discount details)
   - Transportation and address (split into a transportation guide and an address)
   - Product price
   - The product's most prominent advantage
   - General product description (including nearby attractions, scenery descriptions, tips, travel routes, durations, etc.)

4. **Output format**
   - Output strictly in JSON format
   - Classify by attraction entity and product
   - Make sure the JSON is valid and can be parsed programmatically

Please begin:
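The `{keywords}`, `{document_content}`, and `{xhs_notes_content}` placeholders are filled by `PromptTemplate.build_user_prompt` in the service above. A hypothetical sketch, assuming the template uses `str.format`-style substitution (an assumption about `PromptTemplate`, whose implementation is not in this commit):

# Assumption: PromptTemplate performs str.format-style substitution on the
# template file; the argument values below are illustrative only.
template = open("resource/prompt/integration/user.txt", encoding="utf-8").read()
user_prompt = template.format(
    keywords="北京旅游, 故宫攻略",
    document_content="### Integrated document content\n...",
    xhs_notes_content="### Related Xiaohongshu notes (25 total)\n...",
)
print(user_prompt[:200])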
tests/test_final_integration.py (new file, 148 lines)
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import json
from utils.file_io import process_llm_json_text


def test_final_integration():
    """Final integration test - demonstrates the full search-only integration flow"""

    url = "http://localhost:2714/api/v1/content-integration/integrate"

    # Test data - uses the flat field structure
    test_data = {
        "keywords": ["馥桂萌宠园攻略"],
        "cookies": "abRequestId=873258ba-49fe-530b-9c07-529e7871508d; webBuild=4.72.0; xsecappid=xhs-pc-web; a1=19808118ec0gxnuh5mtpv8o8aepgp2m65bpbtiizn30000193947; webId=3f4bd516682e25b71cdd139a11b4d896; web_session=040069b295652fcb5b6e1389413a4be4606547",
        # Search configuration parameters (flat)
        "sort_type": 2,     # most likes
        "note_type": 2,     # regular notes
        "note_time": 3,     # within half a year
        "note_range": 0,    # any scope
        "pos_distance": 0,  # any location
        "query_num": 20     # 20 notes for a quick test
    }

    print("🎯 Travel content integration system - final test")
    print("=" * 50)
    print(f"📍 Target keywords: {', '.join(test_data['keywords'])}")
    print(f"📊 Search config: sorted by most likes, {test_data['query_num']} notes")
    print("🔄 Mode: search-only integration (no document upload)")
    print()

    try:
        print("⏳ Starting content integration...")
        response = requests.post(
            url,
            json=test_data,
            headers={"Content-Type": "application/json"},
            timeout=120
        )

        print(f"📊 Response status: {response.status_code}")

        if response.status_code == 200:
            result = response.json()
            print("✅ API call succeeded!")

            if result.get('success'):
                print("🎉 Content integration succeeded!")

                # Show processing statistics
                processing_time = result.get('processing_time', 'N/A')
                timestamp = result.get('timestamp', 'N/A')
                print(f"⏱️ Processing time: {processing_time}")
                print(f"🕒 Completed at: {timestamp}")

                # Show input statistics
                input_summary = result.get('input_summary', {})
                print(f"📄 Documents processed: {input_summary.get('document_count', 0)}")
                print(f"📝 XHS notes: {input_summary.get('xhs_notes_count', 0)}")

                # Show XHS info
                xhs_info = result.get('xhs_info', {})
                total_interactions = xhs_info.get('total_interactions', 0)
                authors = xhs_info.get('authors', [])
                print(f"👥 Authors involved: {len(authors)}")
                print(f"💬 Total interactions: {total_interactions} (likes + comments + shares)")

                # Parse the integrated content
                content = result.get('integrated_content', '')
                if content:
                    print("\n🔍 Content parsing result:")
                    try:
                        # Parse the JSON with the file_io module
                        parsed_content = process_llm_json_text(content)

                        if parsed_content and isinstance(parsed_content, dict):
                            attractions = parsed_content.get('attractions', [])
                            print(f"✅ Successfully parsed JSON content")
                            print(f"🏞️ Attractions identified: {len(attractions)}")

                            # Show the details
                            for i, attraction in enumerate(attractions, 1):
                                name = attraction.get('name', 'N/A')
                                products = attraction.get('products', [])
                                print(f"\n📍 Attraction {i}: {name}")
                                print(f"   📦 Product count: {len(products)}")

                                for j, product in enumerate(products, 1):
                                    print(f"\n   🎫 Product {j}: {product.get('product_name', 'N/A')}")
                                    print(f"      💰 Price: {product.get('price', 'N/A')}")

                                    # Show the advantages
                                    advantages = product.get('key_advantages', '')
                                    if advantages:
                                        preview = advantages[:80] + "..." if len(advantages) > 80 else advantages
                                        print(f"      🎯 Key advantages: {preview}")

                                    # Show the transportation info
                                    transport = product.get('transportation', {})
                                    if isinstance(transport, dict):
                                        address = transport.get('address', 'N/A')
                                        guide = transport.get('guide', 'N/A')
                                        print(f"      📍 Address: {address}")
                                        if guide and guide != 'N/A':
                                            guide_preview = guide[:60] + "..." if len(guide) > 60 else guide
                                            print(f"      🚗 Transportation: {guide_preview}")

                                    # Show how many detailed descriptions there are
                                    descriptions = product.get('detailed_description', [])
                                    if descriptions:
                                        print(f"      📝 Detailed descriptions: {len(descriptions)}")
                        else:
                            print("❌ Content parsing failed or the format is incorrect")
                            print(f"📄 Raw content preview: {content[:200]}...")

                    except Exception as e:
                        print(f"❌ Content parsing error: {e}")
                        print(f"📄 Raw content preview: {content[:200]}...")

                # Show the output file
                output_file = result.get('output_file')
                if output_file:
                    print(f"\n💾 Result saved to: {output_file}")

            else:
                error_msg = result.get('error_message', 'Unknown error')
                print(f"❌ Content integration failed: {error_msg}")

        else:
            print("❌ API call failed")
            try:
                error_info = response.json()
                print(f"📄 Error details: {json.dumps(error_info, ensure_ascii=False, indent=2)}")
            except:
                print(f"📄 Error response: {response.text}")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out - content integration takes a while, please wait")

    except Exception as e:
        print(f"❌ Test error: {e}")

    print("\n" + "=" * 50)
    print("🏁 Test complete")


if __name__ == "__main__":
    test_final_integration()