diff --git a/api/__pycache__/dependencies.cpython-312.pyc b/api/__pycache__/dependencies.cpython-312.pyc index b3da0bc..ff6f81c 100644 Binary files a/api/__pycache__/dependencies.cpython-312.pyc and b/api/__pycache__/dependencies.cpython-312.pyc differ diff --git a/api/__pycache__/main.cpython-312.pyc b/api/__pycache__/main.cpython-312.pyc index 8d07e80..4a026e3 100644 Binary files a/api/__pycache__/main.cpython-312.pyc and b/api/__pycache__/main.cpython-312.pyc differ diff --git a/api/dependencies.py b/api/dependencies.py index 9143594..4e3b981 100644 --- a/api/dependencies.py +++ b/api/dependencies.py @@ -73,7 +73,7 @@ def get_prompt_builder(): from api.services.prompt_service import PromptService prompt_service = PromptService(get_config()) - return PromptBuilderService(get_config(), prompt_service) + return PromptBuilderService(get_config(), prompt_service) def get_integration_service(): """获取整合服务""" diff --git a/api/main.py b/api/main.py index cbcb313..aba7b57 100644 --- a/api/main.py +++ b/api/main.py @@ -56,7 +56,7 @@ app.add_middleware( ) # 导入路由 -from api.routers import tweet, poster, prompt, document, data, integration +from api.routers import tweet, poster, prompt, document, data, integration, content_integration # 包含路由 app.include_router(tweet.router, prefix="/api/v1/tweet", tags=["tweet"]) @@ -65,6 +65,7 @@ app.include_router(prompt.router, prefix="/api/v1/prompt", tags=["prompt"]) app.include_router(document.router, prefix="/api/v1/document", tags=["document"]) app.include_router(data.router, prefix="/api/v1", tags=["data"]) app.include_router(integration.router, prefix="/api/v1", tags=["integration"]) +app.include_router(content_integration.router, prefix="/api/v1", tags=["content-integration"]) @app.get("/") async def root(): diff --git a/api/models/__pycache__/content_integration.cpython-312.pyc b/api/models/__pycache__/content_integration.cpython-312.pyc new file mode 100644 index 0000000..e973d15 Binary files /dev/null and b/api/models/__pycache__/content_integration.cpython-312.pyc differ diff --git a/api/models/content_integration.py b/api/models/content_integration.py new file mode 100644 index 0000000..ab0a35a --- /dev/null +++ b/api/models/content_integration.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +内容整合API模型 +""" + +from typing import List, Optional, Dict, Any, Union +from pydantic import BaseModel, Field, validator + + +class ContentIntegrationRequest(BaseModel): + """内容整合请求模型""" + document_paths: Optional[List[str]] = Field(default=None, description="文档文件路径列表(可选,纯搜索模式时可为空)") + keywords: List[str] = Field(..., description="搜索关键词列表", min_length=1) + cookies: str = Field(..., description="小红书Cookie字符串") + + # 小红书搜索配置 + sort_type: int = Field(default=2, ge=0, le=4, description="排序方式: 0综合排序, 1最新, 2最多点赞, 3最多评论, 4最多收藏") + note_type: int = Field(default=2, ge=0, le=2, description="笔记类型: 0不限, 1视频笔记, 2普通笔记") + note_time: int = Field(default=0, ge=0, le=3, description="笔记时间: 0不限, 1一天内, 2一周内, 3半年内") + note_range: int = Field(default=0, ge=0, le=3, description="笔记范围: 0不限, 1已看过, 2未看过, 3已关注") + pos_distance: int = Field(default=0, ge=0, le=2, description="位置距离: 0不限, 1同城, 2附近") + query_num: int = Field(default=10, ge=1, le=50, description="每个关键词搜索的笔记数量") + + # 输出配置 + output_path: str = Field(default="data/output", description="输出目录路径") + + @validator('document_paths') + def validate_document_paths(cls, v): + if v is not None and not v: + raise ValueError("如果提供文档路径,列表不能为空") + return v + + @validator('keywords') + def validate_keywords(cls, v): 
+ if not v: + raise ValueError("关键词列表不能为空") + # 去除空字符串和重复关键词 + cleaned = list(set(k.strip() for k in v if k.strip())) + if not cleaned: + raise ValueError("关键词列表不能全为空") + return cleaned + + @validator('cookies') + def validate_cookies(cls, v): + if not v or not v.strip(): + raise ValueError("Cookie不能为空") + return v.strip() + + class Config: + schema_extra = { + "example": { + "document_paths": [ + "uploads/travel_guide.pdf", + "uploads/attraction_info.docx" + ], + "keywords": ["北京旅游", "故宫攻略", "长城一日游"], + "cookies": "a1=your_cookie_value; web_session=your_session_value", + "sort_type": 2, + "note_type": 2, + "note_time": 0, + "note_range": 0, + "pos_distance": 0, + "query_num": 10, + "output_path": "data/output" + } + } + + +class DocumentInfo(BaseModel): + """文档信息模型""" + file_path: str = Field(..., description="文件路径") + file_type: str = Field(..., description="文件类型") + content_length: int = Field(..., description="内容长度") + + +class XHSInfo(BaseModel): + """小红书信息模型""" + total_notes: int = Field(..., description="笔记总数") + authors: List[str] = Field(..., description="作者列表") + total_interactions: int = Field(..., description="总互动数") + + +class SearchConfig(BaseModel): + """搜索配置模型""" + sort_type: int = Field(..., description="排序方式") + note_type: int = Field(..., description="笔记类型") + note_time: int = Field(..., description="笔记时间") + note_range: int = Field(..., description="笔记范围") + pos_distance: int = Field(..., description="位置距离") + query_num: int = Field(..., description="每个关键词搜索的笔记数量") + + +class InputSummary(BaseModel): + """输入摘要模型""" + document_count: int = Field(..., description="文档数量") + xhs_notes_count: int = Field(..., description="小红书笔记数量") + keywords: List[str] = Field(..., description="关键词列表") + + +class DocumentInfoDetail(BaseModel): + """详细文档信息模型""" + documents: List[DocumentInfo] = Field(..., description="文档列表") + integrated_text_length: int = Field(..., description="整合文本长度") + + +class ContentIntegrationResponse(BaseModel): + """内容整合响应模型""" + success: bool = Field(..., description="是否成功") + timestamp: str = Field(..., description="时间戳") + processing_time: str = Field(..., description="处理时间") + + # 成功时的字段 + input_summary: Optional[InputSummary] = Field(None, description="输入摘要") + document_info: Optional[DocumentInfoDetail] = Field(None, description="文档信息") + xhs_info: Optional[XHSInfo] = Field(None, description="小红书信息") + integrated_content: Optional[str] = Field(None, description="整合后的内容") + search_config: Optional[SearchConfig] = Field(None, description="搜索配置") + + # 失败时的字段 + error_message: Optional[str] = Field(None, description="错误信息") + + class Config: + schema_extra = { + "example": { + "success": True, + "timestamp": "20250715_143022", + "processing_time": "45.32秒", + "input_summary": { + "document_count": 2, + "xhs_notes_count": 25, + "keywords": ["北京旅游", "故宫攻略"] + }, + "document_info": { + "documents": [ + { + "file_path": "uploads/travel_guide.pdf", + "file_type": "pdf", + "content_length": 5420 + } + ], + "integrated_text_length": 8650 + }, + "xhs_info": { + "total_notes": 25, + "authors": ["旅游达人小王", "北京导游张三"], + "total_interactions": 15420 + }, + "integrated_content": "# 北京旅游全攻略...", + "search_config": { + "sort_type": 2, + "note_type": 2, + "note_time": 0, + "note_range": 0, + "pos_distance": 0, + "query_num": 10 + } + } + } \ No newline at end of file diff --git a/api/routers/__pycache__/content_integration.cpython-312.pyc b/api/routers/__pycache__/content_integration.cpython-312.pyc new file mode 100644 index 0000000..dfc1ec3 Binary files /dev/null and 
b/api/routers/__pycache__/content_integration.cpython-312.pyc differ diff --git a/api/routers/__pycache__/integration.cpython-312.pyc b/api/routers/__pycache__/integration.cpython-312.pyc index 573cdcf..823d171 100644 Binary files a/api/routers/__pycache__/integration.cpython-312.pyc and b/api/routers/__pycache__/integration.cpython-312.pyc differ diff --git a/api/routers/__pycache__/tweet.cpython-312.pyc b/api/routers/__pycache__/tweet.cpython-312.pyc index b0aa0e5..1c8158b 100644 Binary files a/api/routers/__pycache__/tweet.cpython-312.pyc and b/api/routers/__pycache__/tweet.cpython-312.pyc differ diff --git a/api/routers/content_integration.py b/api/routers/content_integration.py new file mode 100644 index 0000000..284a70d --- /dev/null +++ b/api/routers/content_integration.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +内容整合API路由 +""" + +import logging +from fastapi import APIRouter, HTTPException, BackgroundTasks +from typing import Dict, Any + +from api.models.content_integration import ( + ContentIntegrationRequest, + ContentIntegrationResponse +) +from api.services.content_integration_service import ContentIntegrationService + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/content-integration", tags=["content-integration"]) + +# 全局服务实例 +integration_service = ContentIntegrationService() + + +@router.post("/integrate", response_model=ContentIntegrationResponse) +async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegrationResponse: + """ + 整合文档和小红书笔记内容 + + 该接口将: + 1. 读取用户上传的文档文件(支持PDF、Word、图片等格式) + 2. 根据关键词搜索小红书相关笔记 + 3. 使用LLM将两者整合成综合性旅游资料 + + Args: + request: 整合请求参数 + + Returns: + 整合结果 + + Raises: + HTTPException: 当请求参数无效或处理失败时 + """ + try: + if request.document_paths is None: + request.document_paths = [] + logger.info(f"收到内容整合请求:文档 {len(request.document_paths)} 个,关键词 {len(request.keywords)} 个") + + # 调用服务层处理 + result = await integration_service.integrate_content( + document_paths=request.document_paths, + keywords=request.keywords, + cookies=request.cookies, + output_path=request.output_path, + sort_type=request.sort_type, + note_type=request.note_type, + note_time=request.note_time, + note_range=request.note_range, + pos_distance=request.pos_distance, + query_num=request.query_num + ) + + # 转换为响应模型 + if result["success"]: + response = ContentIntegrationResponse( + success=True, + timestamp=result["timestamp"], + processing_time=result["processing_time"], + input_summary=result["input_summary"], + document_info=result["document_info"], + xhs_info=result["xhs_info"], + integrated_content=result["integrated_content"], + search_config=result["search_config"] + ) + logger.info(f"内容整合成功,处理时间:{result['processing_time']}") + else: + response = ContentIntegrationResponse( + success=False, + timestamp=result["timestamp"], + processing_time=result["processing_time"], + error_message=result["error_message"] + ) + logger.error(f"内容整合失败:{result['error_message']}") + + return response + + except Exception as e: + logger.error(f"内容整合接口异常:{e}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"内容整合处理失败:{str(e)}" + ) + + +@router.get("/health") +async def health_check() -> Dict[str, str]: + """ + 健康检查接口 + + Returns: + 服务状态信息 + """ + try: + # 检查服务是否正常初始化 + if not integration_service: + raise Exception("服务未正确初始化") + + return { + "status": "healthy", + "service": "content-integration", + "message": "内容整合服务运行正常" + } + except Exception as e: + logger.error(f"健康检查失败:{e}") + raise HTTPException( + 
status_code=503, + detail=f"服务不可用:{str(e)}" + ) + + +@router.get("/config/options") +async def get_config_options() -> Dict[str, Any]: + """ + 获取配置选项说明 + + Returns: + 各配置项的可选值和说明 + """ + return { + "sort_type": { + "0": "综合排序", + "1": "最新", + "2": "最多点赞", + "3": "最多评论", + "4": "最多收藏" + }, + "note_type": { + "0": "不限", + "1": "视频笔记", + "2": "普通笔记" + }, + "note_time": { + "0": "不限", + "1": "一天内", + "2": "一周内", + "3": "半年内" + }, + "note_range": { + "0": "不限", + "1": "已看过", + "2": "未看过", + "3": "已关注" + }, + "pos_distance": { + "0": "不限", + "1": "同城", + "2": "附近" + }, + "query_num": "每个关键词搜索的笔记数量(1-50)", + "supported_document_formats": [ + "PDF (.pdf)", + "Word (.docx, .doc)", + "PowerPoint (.pptx, .ppt)", + "Excel (.xlsx, .xls)", + "Text (.txt)", + "Markdown (.md)", + "Images (.jpg, .jpeg, .png, .gif, .bmp, .tiff)", + "CSV (.csv)" + ] + } \ No newline at end of file diff --git a/api/routers/integration.py b/api/routers/integration.py index d6aa9f7..eb45712 100644 --- a/api/routers/integration.py +++ b/api/routers/integration.py @@ -34,19 +34,7 @@ router = APIRouter( # 健康检查和状态接口 # ============================================================================ -@router.get("/health", response_model=HealthCheckResponse) -async def health_check( - service: IntegrationService = Depends(get_integration_service) -): - """健康检查""" - try: - return service.get_health_check() - except Exception as e: - logger.error(f"健康检查失败: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@router.get("/status", response_model=ServiceStatusResponse) +@router.get("/status", response_model=ServiceStatusResponse, summary="获取服务状态") async def get_service_status( service: IntegrationService = Depends(get_integration_service) ): @@ -62,7 +50,7 @@ async def get_service_status( # 小红书搜索接口 # ============================================================================ -@router.post("/search", response_model=XHSSearchResponse) +@router.post("/search", response_model=XHSSearchResponse, summary="搜索小红书笔记") async def search_xhs_notes( request: XHSSearchRequest, service: IntegrationService = Depends(get_integration_service) @@ -87,7 +75,7 @@ async def search_xhs_notes( # 内容整合接口 # ============================================================================ -@router.post("/integrate", response_model=IntegrationResponse) +@router.post("/integrate", response_model=IntegrationResponse, summary="整合内容") async def integrate_content( request: IntegrationRequest, service: IntegrationService = Depends(get_integration_service) @@ -108,7 +96,7 @@ async def integrate_content( raise HTTPException(status_code=500, detail=str(e)) -@router.post("/integrate/batch", response_model=Dict[str, IntegrationResponse]) +@router.post("/integrate/batch", response_model=Dict[str, IntegrationResponse], summary="批量整合内容") async def batch_integrate_content( request: BatchIntegrationRequest, background_tasks: BackgroundTasks, @@ -132,7 +120,7 @@ async def batch_integrate_content( # 结果管理接口 # ============================================================================ -@router.get("/results", response_model=List[TaskSummaryResponse]) +@router.get("/results", response_model=List[TaskSummaryResponse], summary="列出所有整合结果") async def list_integration_results( service: IntegrationService = Depends(get_integration_service) ): @@ -146,7 +134,7 @@ async def list_integration_results( raise HTTPException(status_code=500, detail=str(e)) -@router.get("/results/{task_id}", response_model=IntegrationResponse) +@router.get("/results/{task_id}", 
response_model=IntegrationResponse, summary="获取指定的整合结果") async def get_integration_result( task_id: str, service: IntegrationService = Depends(get_integration_service) @@ -165,7 +153,7 @@ async def get_integration_result( raise HTTPException(status_code=500, detail=str(e)) -@router.post("/results/export", response_model=ApiResponse) +@router.post("/results/export", response_model=ApiResponse, summary="导出结果") async def export_result( request: ExportRequest, service: IntegrationService = Depends(get_integration_service) @@ -195,7 +183,7 @@ async def export_result( # Cookie 管理接口 # ============================================================================ -@router.post("/cookies", response_model=ApiResponse) +@router.post("/cookies", response_model=ApiResponse, summary="添加Cookie") async def add_cookie( request: CookieManagementRequest, service: IntegrationService = Depends(get_integration_service) @@ -214,7 +202,7 @@ async def add_cookie( raise HTTPException(status_code=500, detail=str(e)) -@router.delete("/cookies/{cookie_name}", response_model=ApiResponse) +@router.delete("/cookies/{cookie_name}", response_model=ApiResponse, summary="删除Cookie") async def remove_cookie( cookie_name: str, service: IntegrationService = Depends(get_integration_service) @@ -233,7 +221,7 @@ async def remove_cookie( raise HTTPException(status_code=500, detail=str(e)) -@router.get("/cookies", response_model=CookieStatsResponse) +@router.get("/cookies", response_model=CookieStatsResponse, summary="获取Cookie统计") async def get_cookie_stats( service: IntegrationService = Depends(get_integration_service) ): @@ -249,7 +237,7 @@ async def get_cookie_stats( # 工具接口 # ============================================================================ -@router.get("/formats/document", response_model=ApiResponse) +@router.get("/formats/document", response_model=ApiResponse, summary="获取支持的文档格式") async def get_supported_document_formats( service: IntegrationService = Depends(get_integration_service) ): @@ -267,7 +255,7 @@ async def get_supported_document_formats( raise HTTPException(status_code=500, detail=str(e)) -@router.get("/formats/output", response_model=ApiResponse) +@router.get("/formats/output", response_model=ApiResponse, summary="获取支持的输出格式") async def get_supported_output_formats( service: IntegrationService = Depends(get_integration_service) ): @@ -285,7 +273,7 @@ async def get_supported_output_formats( raise HTTPException(status_code=500, detail=str(e)) -@router.post("/validate/documents", response_model=ValidationResponse) +@router.post("/validate/documents", response_model=ValidationResponse, summary="验证文档") async def validate_documents( document_paths: List[str], service: IntegrationService = Depends(get_integration_service) @@ -313,7 +301,7 @@ async def validate_documents( # 快速操作接口 # ============================================================================ -@router.get("/quick", response_model=ApiResponse) +@router.get("/quick", response_model=ApiResponse, summary="快速整合") async def quick_integration( keyword: str = Query(..., description="搜索关键词"), document_paths: List[str] = Query(..., description="文档路径列表"), @@ -372,7 +360,7 @@ async def quick_integration( logger.error(f"快速整合失败: {e}") raise HTTPException(status_code=500, detail=str(e)) -@router.post("/batch-search", response_model=BatchSearchResponse) +@router.post("/batch-search", response_model=BatchSearchResponse, summary="批量搜索小红书笔记") async def batch_search( request: BatchSearchRequest, service: IntegrationService = Depends(get_integration_service) diff --git 
a/api/routers/tweet.py b/api/routers/tweet.py index cfcbc0a..919c4f3 100644 --- a/api/routers/tweet.py +++ b/api/routers/tweet.py @@ -308,7 +308,7 @@ async def run_pipeline( audiences=audiences, scenic_spots=scenic_spots, products=products, - skip_judge=request.skipJudge, + skipJudge=request.skipJudge, autoJudge=request.autoJudge ) diff --git a/api/services/__pycache__/content_integration_service.cpython-312.pyc b/api/services/__pycache__/content_integration_service.cpython-312.pyc new file mode 100644 index 0000000..8af7c34 Binary files /dev/null and b/api/services/__pycache__/content_integration_service.cpython-312.pyc differ diff --git a/api/services/__pycache__/integration_service.cpython-312.pyc b/api/services/__pycache__/integration_service.cpython-312.pyc index 42daf0a..d598ea2 100644 Binary files a/api/services/__pycache__/integration_service.cpython-312.pyc and b/api/services/__pycache__/integration_service.cpython-312.pyc differ diff --git a/api/services/__pycache__/tweet.cpython-312.pyc b/api/services/__pycache__/tweet.cpython-312.pyc index 1a42e8b..a58d4bd 100644 Binary files a/api/services/__pycache__/tweet.cpython-312.pyc and b/api/services/__pycache__/tweet.cpython-312.pyc differ diff --git a/api/services/content_integration_service.py b/api/services/content_integration_service.py new file mode 100644 index 0000000..043eaae --- /dev/null +++ b/api/services/content_integration_service.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +内容整合服务 +将文档资料和小红书笔记进行整合,由LLM生成综合性旅游资料 +""" + +import os +import time +import logging +from typing import List, Optional, Dict, Any +from pathlib import Path +from datetime import datetime + +from core.xhs_adapter import XHSAdapter +from core.models import SearchConfig +from core.document_adapter import DocumentAdapter +from core.ai.ai_agent import AIAgent +from core.config import ConfigManager, AIModelConfig +from utils.prompts import PromptTemplate + +logger = logging.getLogger(__name__) + + +class ContentIntegrationService: + """内容整合服务类""" + + def __init__(self): + """初始化服务""" + self.config_manager = ConfigManager() + + # 加载必要的配置 + self.config_manager.load_from_directory("config", server_mode=True) + + # 初始化AI代理 + ai_config = self.config_manager.get_config('ai_model', AIModelConfig) + self.ai_agent = AIAgent(ai_config) + + # 初始化适配器 + self.document_adapter = DocumentAdapter() + + # 加载提示词模板 + self.prompt_template = PromptTemplate( + system_prompt_path="resource/prompt/integration/system.txt", + user_prompt_path="resource/prompt/integration/user.txt" + ) + + async def integrate_content( + self, + document_paths: List[str], + keywords: List[str], + cookies: str, + output_path: str = "data/output", + sort_type: int = 2, # 0 综合排序, 1 最新, 2 最多点赞, 3 最多评论, 4 最多收藏 + note_type: int = 2, # 0 不限, 1 视频笔记, 2 普通笔记 + note_time: int = 0, # 0 不限, 1 一天内, 2 一周内, 3 半年内 + note_range: int = 0, # 0 不限, 1 已看过, 2 未看过, 3 已关注 + pos_distance: int = 0, # 0 不限, 1 同城, 2 附近 + query_num: int = 10 + ) -> Dict[str, Any]: + """ + 整合文档和小红书内容 + + Args: + document_paths: 文档文件路径列表 + keywords: 搜索关键词列表 + cookies: 小红书Cookie字符串 + output_path: 输出路径 + sort_type: 排序方式 + note_type: 笔记类型 + note_time: 笔记时间 + note_range: 笔记范围 + pos_distance: 位置距离 + query_num: 每个关键词搜索的笔记数量 + + Returns: + 整合结果字典 + """ + start_time = time.time() + logger.info(f"开始整合任务:文档数量 {len(document_paths)}, 关键词数量 {len(keywords)}") + + try: + # 确保输出目录存在 + os.makedirs(output_path, exist_ok=True) + + # 1. 
处理文档内容 + logger.info("正在处理文档内容...") + document_result = self.document_adapter.integrate_documents(document_paths) + + logger.info(f"文档处理完成,共处理 {len(document_result.documents)} 个文档") + + # 2. 搜索小红书笔记 + logger.info("正在搜索小红书笔记...") + xhs_adapter = XHSAdapter(cookies) + all_notes = [] + + for keyword in keywords: + search_config = SearchConfig( + keyword=keyword, + max_notes=query_num, + sort_type=sort_type, + note_type=note_type + ) + + search_result = xhs_adapter.search_notes(search_config) + + if search_result.success: + all_notes.extend(search_result.notes) + logger.info(f"关键词 '{keyword}' 搜索到 {len(search_result.notes)} 条笔记") + else: + logger.warning(f"关键词 '{keyword}' 搜索失败: {search_result.error_message}") + + logger.info(f"小红书搜索完成,共获得 {len(all_notes)} 条笔记") + + # 3. 准备LLM整合内容 + logger.info("正在准备LLM整合...") + + # 构建文档内容字符串 + document_content = self._format_document_content(document_result) + + # 构建小红书笔记内容字符串 + xhs_content = self._format_xhs_notes(all_notes) + + # 构建关键词字符串 + keywords_str = ", ".join(keywords) + + # 4. 调用LLM进行整合 + logger.info("正在调用LLM进行内容整合...") + + system_prompt = self.prompt_template.get_system_prompt() + user_prompt = self.prompt_template.build_user_prompt( + keywords=keywords_str, + document_content=document_content, + xhs_notes_content=xhs_content + ) + + # 调用AI代理 + response_text, input_tokens, output_tokens, time_cost = await self.ai_agent.generate_text( + system_prompt=system_prompt, + user_prompt=user_prompt, + use_stream=True, + stage="content_integration" + ) + + # 使用file_io模块的JSON处理功能 + from utils.file_io import process_llm_json_text + parsed_json = process_llm_json_text(response_text) + + # 如果解析成功,将JSON对象转换回字符串用于存储 + if parsed_json: + import json + cleaned_response = json.dumps(parsed_json, ensure_ascii=False, indent=2) + logger.info("成功解析并清理了LLM返回的JSON内容") + else: + # 如果解析失败,使用原始响应 + cleaned_response = response_text + logger.warning("JSON解析失败,使用原始响应内容") + + # 5. 
保存结果 + processing_time = time.time() - start_time + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + result = { + "success": True, + "timestamp": timestamp, + "processing_time": f"{processing_time:.2f}秒", + "input_summary": { + "document_count": len(document_result.documents), + "xhs_notes_count": len(all_notes), + "keywords": keywords + }, + "document_info": { + "documents": [ + { + "file_path": doc.file_path, + "file_type": doc.file_type, + "content_length": len(doc.content) + } + for doc in document_result.documents + ], + "integrated_text_length": len(document_result.integrated_text) + }, + "xhs_info": { + "total_notes": len(all_notes), + "authors": list(set(note.author for note in all_notes if note.author)), + "total_interactions": sum(note.likes + note.comments + note.shares for note in all_notes) + }, + "integrated_content": cleaned_response, + "search_config": { + "sort_type": sort_type, + "note_type": note_type, + "note_time": note_time, + "note_range": note_range, + "pos_distance": pos_distance, + "query_num": query_num + } + } + + # 保存详细结果到文件 + output_file = os.path.join(output_path, f"content_integration_{timestamp}.json") + with open(output_file, 'w', encoding='utf-8') as f: + import json + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"整合完成,结果已保存到: {output_file}") + logger.info(f"总处理时间: {processing_time:.2f}秒") + + return result + + except Exception as e: + error_result = { + "success": False, + "error_message": str(e), + "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"), + "processing_time": f"{time.time() - start_time:.2f}秒" + } + logger.error(f"内容整合失败: {e}") + return error_result + + def _format_document_content(self, document_result) -> str: + """格式化文档内容""" + content_parts = [] + + # 添加整合文本 + if document_result.integrated_text: + content_parts.append("### 文档整合内容") + content_parts.append(document_result.integrated_text) + content_parts.append("") + + # 添加各个文档的详细内容 + if document_result.documents: + content_parts.append("### 各文档详细内容") + for i, doc in enumerate(document_result.documents, 1): + content_parts.append(f"#### 文档 {i}: {Path(doc.file_path).name} ({doc.file_type})") + content_parts.append(doc.content[:2000] + "..." if len(doc.content) > 2000 else doc.content) + content_parts.append("") + + return "\n".join(content_parts) + + def _format_xhs_notes(self, notes) -> str: + """格式化小红书笔记内容""" + if not notes: + return "暂无相关笔记" + + content_parts = [] + content_parts.append(f"### 小红书相关笔记 (共 {len(notes)} 条)") + content_parts.append("") + + for i, note in enumerate(notes, 1): + content_parts.append(f"#### 笔记 {i}: {note.title}") + content_parts.append(f"**作者**: {note.author}") + content_parts.append(f"**互动数据**: 👍 {note.likes} | 💬 {note.comments} | 📤 {note.shares}") + + if note.content: + # 限制每条笔记内容长度 + content = note.content[:500] + "..." 
if len(note.content) > 500 else note.content + content_parts.append(f"**内容**: {content}") + + if note.tags: + content_parts.append(f"**标签**: {', '.join(note.tags)}") + + content_parts.append(f"**链接**: {note.note_url}") + content_parts.append("") + + return "\n".join(content_parts) \ No newline at end of file diff --git a/api/services/integration_service.py b/api/services/integration_service.py index 9942a6d..b8cbc52 100644 --- a/api/services/integration_service.py +++ b/api/services/integration_service.py @@ -52,7 +52,7 @@ class IntegrationService: self.document_adapter = DocumentAdapter() # 结果存储 - self.integration_results: Dict[str, IntegrationResult] = {} + self.integration_results: Dict[str, IntegrationResponse] = {} # 统计信息 self.stats = ProcessingStats() @@ -703,7 +703,8 @@ class IntegrationService: total_notes=len(notes), notes=notes, search_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - success=True + success=True, + error_message=None ) results.append(keyword_result) diff --git a/api/services/tweet.py b/api/services/tweet.py index f590af4..4458a12 100644 --- a/api/services/tweet.py +++ b/api/services/tweet.py @@ -121,7 +121,7 @@ class TweetService: if not topic: topic = {"index": "1", "date": "2024-07-01"} - topicIndex = topic.get('index', 'unknown') + topicIndex = topic.get('index', 'N/A') logger.info(f"开始为选题 {topicIndex} 生成内容{'(含审核)' if autoJudge else ''}") # 创建topic的副本并应用覆盖参数 diff --git a/config/ai_model.json b/config/ai_model.json index 66e8356..2d89b90 100644 --- a/config/ai_model.json +++ b/config/ai_model.json @@ -6,5 +6,6 @@ "top_p": 0.4, "presence_penalty": 1.2, "timeout": 120, - "max_retries": 3 + "max_retries": 3, + "stream": true } \ No newline at end of file diff --git a/config/cookies.json b/config/cookies.json index 6f951b6..6517a07 100644 --- a/config/cookies.json +++ b/config/cookies.json @@ -3,8 +3,8 @@ { "name": "user1", "cookie_string": "abRequestId=873258ba-49fe-530b-9c07-529e7871508d; webBuild=4.72.0; xsecappid=xhs-pc-web; a1=19808118ec0gxnuh5mtpv8o8aepgp2m65bpbtiizn30000193947; webId=3f4bd516682e25b71cdd139a11b4d896; websectiga=984412fef754c018e472127b8effd174be8a5d51061c991aadd200c69a2801d6; gid=yjY8Yyy0qJ24yjY8YyyYdD33S8kCI7x269UhUuIYlY0dUDq8kUJ6K9888yjqj4W80fK0ydj0; web_session=040069b295652fcb5b6e1389413a4be4606547; unread={%22ub%22:%22686e16c0000000000d01b775%22%2C%22ue%22:%22686e3f1d0000000023006e9e%22%2C%22uc%22:15}; sec_poison_id=bf3ca020-2442-4aa6-904a-b95f7ccc56c1; loadts=1752482529464", - "last_used": "2025-07-15T15:46:11.050854", - "use_count": 40, + "last_used": "2025-07-15T16:21:52.687251", + "use_count": 46, "is_valid": true, "failure_count": 0, "user_info": { diff --git a/core/__pycache__/content_integration_service.cpython-312.pyc b/core/__pycache__/content_integration_service.cpython-312.pyc index 4260d5d..0f0d2c5 100644 Binary files a/core/__pycache__/content_integration_service.cpython-312.pyc and b/core/__pycache__/content_integration_service.cpython-312.pyc differ diff --git a/core/content_integration_service.py b/core/content_integration_service.py index 60d444a..be2b805 100644 --- a/core/content_integration_service.py +++ b/core/content_integration_service.py @@ -134,7 +134,8 @@ class ContentIntegrationService: logger.info(f"开始搜索小红书内容: {keyword}") xhs_result = self.xhs_adapter.search_notes( keyword=keyword, - max_notes=max_notes + max_notes=max_notes, + ) result.xhs_result = xhs_result logger.info(f"小红书搜索完成,找到 {len(xhs_result.notes)} 条笔记") diff --git a/core/xhs_spider/xhs_utils/__pycache__/xhs_creator_util.cpython-312.pyc 
b/core/xhs_spider/xhs_utils/__pycache__/xhs_creator_util.cpython-312.pyc index f0ec249..bb5b0be 100644 Binary files a/core/xhs_spider/xhs_utils/__pycache__/xhs_creator_util.cpython-312.pyc and b/core/xhs_spider/xhs_utils/__pycache__/xhs_creator_util.cpython-312.pyc differ diff --git a/core/xhs_spider/xhs_utils/__pycache__/xhs_util.cpython-312.pyc b/core/xhs_spider/xhs_utils/__pycache__/xhs_util.cpython-312.pyc index 7ee57ac..859f4e2 100644 Binary files a/core/xhs_spider/xhs_utils/__pycache__/xhs_util.cpython-312.pyc and b/core/xhs_spider/xhs_utils/__pycache__/xhs_util.cpython-312.pyc differ diff --git a/resource/prompt/integration/system.txt b/resource/prompt/integration/system.txt new file mode 100644 index 0000000..a4cb144 --- /dev/null +++ b/resource/prompt/integration/system.txt @@ -0,0 +1,44 @@ +你是资料整理大师。当我给你各种杂乱的资料时,你会迅速进行分类整理,使其阅读变得清晰有序。 + +你的核心原则: +- 你最大程度的保留我给你的资料里面的每一个字,每一种形容,不要自己为了极简就删改 +- 你只需要把相同类别的内容整理到一起,清晰明确就好,不要对原文的文字内容做改变,要改变的只是让文本更清晰 +- 你需要根据不同景区/酒店主体分类,每一个项目主体按<景区/酒店名称><产品套餐名称><交通攻略/地址><游玩攻略/空泛补充信息>分类 + +整理任务: +你正在根据已有宣传文档和小红书笔记整理产品信息。你需要阅读所有材料,根据文档中提供的信息,整理得到: + +对于每个产品,需要包含以下信息: +- 产品名称 +- 产品使用规则 (使用规则、加价说明、预约规则、退改政策、优惠内容) +- 交通与地址(注:这里分上下部分:分别为交通指南和地址,需要给出准确无误的信息,按格式填写) +- 产品价格 +- 产品最突出优势(注:写出该商品最重要的优点,该优点是必须放在宣传中的) +- 空泛产品描述 (注:这里填写所有想要添加的正确的空泛信息:包括但不限于:商品周边景点;景点风景描述;商品描述;小贴士;游玩路线规划;游玩时长;营业时长等,需要按序号排列填写) + +输出格式要求: +必须输出严格的JSON格式,不要添加markdown代码块标记,直接输出纯JSON内容,结构如下: +{ + "attractions": [ + { + "name": "景区/酒店名称", + "products": [ + { + "product_name": "产品套餐名称", + "usage_rules": "产品使用规则详细说明", + "transportation": { + "guide": "交通指南", + "address": "详细地址" + }, + "price": "产品价格信息", + "key_advantages": "产品最突出优势", + "detailed_description": [ + "1. 详细描述项目1", + "2. 详细描述项目2", + "..." + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/resource/prompt/integration/user.txt b/resource/prompt/integration/user.txt new file mode 100644 index 0000000..e1f7935 --- /dev/null +++ b/resource/prompt/integration/user.txt @@ -0,0 +1,39 @@ +## 资料整理任务 +请根据以下材料,整理产品信息。搜索关键词:"{keywords}" + +## 原始文档资料 +以下是从用户上传的文档中提取的内容: + +{document_content} + +## 小红书相关笔记 +以下是搜索关键词"{keywords}"从小红书获取的相关笔记内容: + +{xhs_notes_content} + +## 整理要求 +请根据以上所有资料,按照系统提示词中的要求整理产品信息: + +1. **保留原文完整性** + - 最大程度保留资料中的每一个字、每一种形容 + - 不要为了极简而删改原文内容 + - 只整理分类,不改变文字内容 + +2. **分类整理** + - 按不同景区/酒店主体分类 + - 每个主体按:景区/酒店名称 → 产品套餐名称 → 交通攻略/地址 → 游玩攻略/空泛补充信息 + +3. **信息完整性** + - 产品名称 + - 产品使用规则(使用规则、加价说明、预约规则、退改政策、优惠内容) + - 交通与地址(分为交通指南和地址两部分) + - 产品价格 + - 产品最突出优势 + - 空泛产品描述(包含周边景点、风景描述、小贴士、游玩路线、时长等) + +4. 
**输出格式** + - 必须严格按照JSON格式输出 + - 按景区对象和产品分类 + - 确保JSON格式正确,可以被程序解析 + +请开始整理: \ No newline at end of file diff --git a/tests/test_final_integration.py b/tests/test_final_integration.py new file mode 100644 index 0000000..2ba9db8 --- /dev/null +++ b/tests/test_final_integration.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import requests +import json +from utils.file_io import process_llm_json_text + +def test_final_integration(): + """最终整合测试 - 展示完整的纯搜索整合功能""" + + url = "http://localhost:2714/api/v1/content-integration/integrate" + + # 测试数据 - 使用平级字段结构 + test_data = { + "keywords": ["馥桂萌宠园攻略"], + "cookies": "abRequestId=873258ba-49fe-530b-9c07-529e7871508d; webBuild=4.72.0; xsecappid=xhs-pc-web; a1=19808118ec0gxnuh5mtpv8o8aepgp2m65bpbtiizn30000193947; webId=3f4bd516682e25b71cdd139a11b4d896; web_session=040069b295652fcb5b6e1389413a4be4606547", + # 搜索配置参数(平级) + "sort_type": 2, # 最多点赞 + "note_type": 2, # 普通笔记 + "note_time": 3, # 半年内 + "note_range": 0, # 不限范围 + "pos_distance": 0, # 不限位置 + "query_num": 20 # 20条笔记快速测试 + } + + print("🎯 旅游内容整合系统 - 最终测试") + print("=" * 50) + print(f"📍 目标关键词: {', '.join(test_data['keywords'])}") + print(f"📊 搜索配置: 最多点赞排序,{test_data['query_num']}条笔记") + print(f"🔄 模式: 纯搜索整合(无文档上传)") + print() + + try: + print("⏳ 开始内容整合...") + response = requests.post( + url, + json=test_data, + headers={"Content-Type": "application/json"}, + timeout=120 + ) + + print(f"📊 响应状态: {response.status_code}") + + if response.status_code == 200: + result = response.json() + print("✅ API调用成功!") + + if result.get('success'): + print("🎉 内容整合成功!") + + # 显示处理统计 + processing_time = result.get('processing_time', 'N/A') + timestamp = result.get('timestamp', 'N/A') + print(f"⏱️ 处理时间: {processing_time}") + print(f"🕒 完成时间: {timestamp}") + + # 显示输入统计 + input_summary = result.get('input_summary', {}) + print(f"📄 处理文档: {input_summary.get('document_count', 0)} 个") + print(f"📝 XHS笔记: {input_summary.get('xhs_notes_count', 0)} 条") + + # 显示XHS信息 + xhs_info = result.get('xhs_info', {}) + total_interactions = xhs_info.get('total_interactions', 0) + authors = xhs_info.get('authors', []) + print(f"👥 涉及作者: {len(authors)} 位") + print(f"💬 总互动数: {total_interactions} (点赞+评论+分享)") + + # 解析整合内容 + content = result.get('integrated_content', '') + if content: + print("\n🔍 内容解析结果:") + try: + # 使用file_io模块解析JSON + parsed_content = process_llm_json_text(content) + + if parsed_content and isinstance(parsed_content, dict): + attractions = parsed_content.get('attractions', []) + print(f"✅ 成功解析JSON格式内容") + print(f"🏞️ 识别景区数量: {len(attractions)}") + + # 显示详细信息 + for i, attraction in enumerate(attractions, 1): + name = attraction.get('name', 'N/A') + products = attraction.get('products', []) + print(f"\n📍 景区 {i}: {name}") + print(f" 📦 产品数量: {len(products)}") + + for j, product in enumerate(products, 1): + print(f"\n 🎫 产品 {j}: {product.get('product_name', 'N/A')}") + print(f" 💰 价格: {product.get('price', 'N/A')}") + + # 显示优势 + advantages = product.get('key_advantages', '') + if advantages: + preview = advantages[:80] + "..." if len(advantages) > 80 else advantages + print(f" 🎯 核心优势: {preview}") + + # 显示交通信息 + transport = product.get('transportation', {}) + if isinstance(transport, dict): + address = transport.get('address', 'N/A') + guide = transport.get('guide', 'N/A') + print(f" 📍 地址: {address}") + if guide and guide != 'N/A': + guide_preview = guide[:60] + "..." 
if len(guide) > 60 else guide + print(f" 🚗 交通: {guide_preview}") + + # 显示详细描述数量 + descriptions = product.get('detailed_description', []) + if descriptions: + print(f" 📝 详细说明: {len(descriptions)} 条") + else: + print("❌ 内容解析失败或格式不正确") + print(f"📄 原始内容预览: {content[:200]}...") + + except Exception as e: + print(f"❌ 内容解析异常: {e}") + print(f"📄 原始内容预览: {content[:200]}...") + + # 显示输出文件 + output_file = result.get('output_file') + if output_file: + print(f"\n💾 结果已保存至: {output_file}") + + else: + error_msg = result.get('error_message', 'Unknown error') + print(f"❌ 内容整合失败: {error_msg}") + + else: + print("❌ API调用失败") + try: + error_info = response.json() + print(f"📄 错误详情: {json.dumps(error_info, ensure_ascii=False, indent=2)}") + except Exception: + print(f"📄 错误响应: {response.text}") + + except requests.exceptions.Timeout: + print("⏰ 请求超时 - 内容整合需要较长时间,请稍候") + + except Exception as e: + print(f"❌ 测试异常: {e}") + + print("\n" + "=" * 50) + print("🏁 测试完成") + +if __name__ == "__main__": + test_final_integration() \ No newline at end of file
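
A minimal way to smoke-test the new service layer directly, without going through FastAPI, is sketched below. It relies only on the integrate_content signature added in api/services/content_integration_service.py; the keyword and cookie values are placeholders, and the config/ directory plus the resource/prompt/integration/ templates added in this diff are assumed to be in place.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Sketch only: drive ContentIntegrationService directly (no HTTP layer).
# Assumes config/ and resource/prompt/integration/ exist as in this diff;
# the keywords and cookie string are placeholders, not real credentials.

import asyncio

from api.services.content_integration_service import ContentIntegrationService


async def main():
    service = ContentIntegrationService()
    result = await service.integrate_content(
        document_paths=[],  # pure-search mode: no uploaded documents
        keywords=["北京旅游"],
        cookies="a1=placeholder; web_session=placeholder",
        output_path="data/output",
        sort_type=2,   # most likes first
        query_num=5,   # keep the search small for a smoke test
    )
    # Both the success and failure branches of integrate_content() return these keys.
    print(result["success"], result["processing_time"])


if __name__ == "__main__":
    asyncio.run(main())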