#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Content Integration Service (内容整合服务).

Main service of the core module. It talks to the ``xhs_spider`` and
``document`` modules exclusively through adapters, so this module never
depends on their internals directly.
"""

import asyncio
import logging
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

from .cookie_manager import CookieManager
from .document_adapter import DocumentAdapter, IntegratedContent
from .media_manager import ImageStorageManager
from .xhs_adapter import XHSAdapter, XHSSearchResult

logger = logging.getLogger(__name__)


@dataclass
class ProcessingConfig:
    """Configuration for a single content-integration task."""

    keyword: str                                 # XHS search keyword ("" disables search)
    document_paths: Optional[List[str]] = None   # local documents to integrate
    max_notes: int = 20                          # cap on notes fetched from XHS
    output_format: str = "summary"               # summary | blog_post | travel_guide | raw
    download_media: bool = True                  # download note images/videos after integration
    custom_prompt: Optional[str] = None          # optional prompt forwarded to integration


@dataclass
class ProcessingResult:
    """Outcome of one task, keyed by ``task_id`` in the service's result store."""

    task_id: str
    config: ProcessingConfig
    success: bool
    xhs_result: Optional[XHSSearchResult] = None
    document_result: Optional[IntegratedContent] = None
    final_content: str = ""
    error_message: str = ""
    processing_time: float = 0.0        # wall-clock seconds, set even on failure
    created_time: datetime = field(default_factory=datetime.now)


@dataclass
class ProcessingStats:
    """Aggregate counters maintained across all tasks handled by the service."""

    total_tasks: int = 0
    successful_tasks: int = 0
    failed_tasks: int = 0
    total_processing_time: float = 0.0
    average_processing_time: float = 0.0
    total_notes_processed: int = 0
    total_documents_processed: int = 0
    start_time: datetime = field(default_factory=datetime.now)
    last_updated: datetime = field(default_factory=datetime.now)


class ContentIntegrationService:
    """Orchestrates XHS search, document integration, formatting and media download."""

    def __init__(self,
                 cookie_config_path: str = "cookies.json",
                 media_storage_path: str = "media",
                 enable_logging: bool = True):
        """
        Initialize the service.

        Args:
            cookie_config_path: Path to the cookie configuration file.
            media_storage_path: Root directory for downloaded media.
            enable_logging: Whether logging is enabled (stored for callers;
                not consulted directly by this class).
        """
        self.cookie_config_path = cookie_config_path
        self.media_storage_path = media_storage_path
        self.enable_logging = enable_logging

        # Adapters isolate this service from xhs_spider / document internals.
        self.xhs_adapter = XHSAdapter(cookie_config_path)
        self.document_adapter = DocumentAdapter()

        # Support managers.
        self.cookie_manager = CookieManager(cookie_config_path)
        self.media_manager = ImageStorageManager(media_storage_path)

        # Per-task results and running statistics.
        self.results: Dict[str, ProcessingResult] = {}
        self.stats = ProcessingStats()

        logger.info("内容整合服务初始化完成")

    async def process_content(self,
                              keyword: str,
                              document_paths: Optional[List[str]] = None,
                              output_format: str = "summary",
                              max_notes: int = 20,
                              custom_prompt: Optional[str] = None,
                              download_media: bool = True) -> ProcessingResult:
        """
        Run one end-to-end integration task.

        Args:
            keyword: XHS search keyword; an empty string skips the search step.
            document_paths: Optional list of document paths to integrate.
            output_format: Desired output format for the final content.
            max_notes: Maximum number of XHS notes to fetch.
            custom_prompt: Optional custom prompt for content transformation.
            download_media: Whether to download note media after integration.
                (Previously this was always on; the default keeps that behavior.)

        Returns:
            ProcessingResult: Result record; also stored under its ``task_id``.
        """
        task_id = str(uuid.uuid4())
        start_time = datetime.now()

        config = ProcessingConfig(
            keyword=keyword,
            document_paths=document_paths,
            max_notes=max_notes,
            output_format=output_format,
            download_media=download_media,
            custom_prompt=custom_prompt,
        )
        result = ProcessingResult(task_id=task_id, config=config, success=False)

        try:
            # 1. Search XHS content (only when a keyword was given).
            #    NOTE(review): search_notes is called synchronously inside this
            #    coroutine — confirm the adapter is non-blocking or acceptable here.
            xhs_result = None
            if keyword:
                logger.info(f"开始搜索小红书内容: {keyword}")
                xhs_result = self.xhs_adapter.search_notes(
                    keyword=keyword,
                    max_notes=max_notes,
                )
                result.xhs_result = xhs_result
                logger.info(f"小红书搜索完成,找到 {len(xhs_result.notes)} 条笔记")

            # 2. Integrate local documents (only when paths were given).
            document_result = None
            if document_paths:
                logger.info(f"开始处理文档: {len(document_paths)} 个文件")
                document_result = self.document_adapter.integrate_documents(document_paths)
                result.document_result = document_result
                logger.info(f"文档处理完成,总长度: {document_result.total_length}")

            # 3. Merge both sources into the final formatted content.
            result.final_content = self._integrate_content(
                xhs_result=xhs_result,
                document_result=document_result,
                output_format=output_format,
                custom_prompt=custom_prompt,
            )

            # 4. Optionally download media referenced by the notes.
            if config.download_media and xhs_result:
                await self._download_media(xhs_result)

            result.success = True
            logger.info(f"任务 {task_id} 处理完成")

        except Exception as e:
            # Task-level boundary: record the failure instead of propagating,
            # so batch processing and statistics keep working.
            result.error_message = str(e)
            logger.error(f"任务 {task_id} 处理失败: {e}")

        # Timing, storage and statistics apply to both success and failure.
        result.processing_time = (datetime.now() - start_time).total_seconds()
        self.results[task_id] = result
        self._update_statistics(result)

        return result

    async def batch_process(self,
                            tasks: List[Dict[str, Any]],
                            output_format: str = "summary") -> List[ProcessingResult]:
        """
        Process several tasks concurrently.

        Args:
            tasks: Task dicts; each may carry ``keyword``, ``document_paths``,
                ``max_notes`` and ``custom_prompt``.
            output_format: Output format applied to every task.

        Returns:
            List[ProcessingResult]: One result per input task, in order.
            Exceptions raised by a task are converted into failed results.
        """
        async def process_task(task_config: Dict[str, Any]) -> ProcessingResult:
            return await self.process_content(
                keyword=task_config.get("keyword", ""),
                document_paths=task_config.get("document_paths"),
                output_format=output_format,
                max_notes=task_config.get("max_notes", 20),
                custom_prompt=task_config.get("custom_prompt"),
            )

        # Fan out with gather; return_exceptions keeps one failure from
        # cancelling the rest.
        coroutines = [process_task(task) for task in tasks]
        raw_results = await asyncio.gather(*coroutines, return_exceptions=True)

        # Convert raised exceptions into failed ProcessingResult records,
        # preserving as much of the original task config as possible.
        processed_results: List[ProcessingResult] = []
        for i, result in enumerate(raw_results):
            if isinstance(result, Exception):
                processed_results.append(ProcessingResult(
                    task_id=str(uuid.uuid4()),
                    config=ProcessingConfig(
                        keyword=tasks[i].get("keyword", ""),
                        document_paths=tasks[i].get("document_paths"),
                    ),
                    success=False,
                    error_message=str(result),
                ))
            else:
                processed_results.append(result)

        return processed_results

    def _integrate_content(self,
                           xhs_result: Optional[XHSSearchResult],
                           document_result: Optional[IntegratedContent],
                           output_format: str,
                           custom_prompt: Optional[str]) -> str:
        """
        Merge XHS notes and document text, then format the result.

        Args:
            xhs_result: XHS search result, if any.
            document_result: Integrated document content, if any.
            output_format: Target output format.
            custom_prompt: Custom prompt (currently unused by the fallback path).

        Returns:
            str: The formatted combined content, or a not-found message.
        """
        content_parts: List[str] = []

        if xhs_result and xhs_result.notes:
            # Only the first 5 notes are included to keep the prompt compact.
            xhs_content = "\n".join(
                f"标题: {note.title}\n内容: {note.content}\n作者: {note.author}\n"
                for note in xhs_result.notes[:5]
            )
            content_parts.append(f"=== 小红书相关内容 ===\n{xhs_content}")

        if document_result and document_result.integrated_text:
            content_parts.append(f"=== 文档相关内容 ===\n{document_result.integrated_text}")

        if not content_parts:
            return "没有找到相关内容"

        combined_content = "\n\n".join(content_parts)

        # Prefer the document adapter's transformation; fall back to the
        # simple formatter when it is unavailable or fails.
        if self.document_adapter.is_available():
            try:
                return self.document_adapter.transform_content(
                    combined_content, output_format
                )
            except Exception as e:
                logger.error(f"内容格式转换失败: {e}")

        return self._simple_format(combined_content, output_format)

    def _simple_format(self, content: str, output_format: str) -> str:
        """Minimal fallback formatting when the document adapter can't be used."""
        if output_format == "summary":
            return f"摘要:\n{content[:500]}..."
        elif output_format == "blog_post":
            return f"# 博客文章\n\n{content}"
        elif output_format == "travel_guide":
            return f"# 旅游攻略\n\n{content}"
        else:
            return content

    async def _download_media(self, xhs_result: XHSSearchResult):
        """
        Download every image and video referenced by the result's notes.

        Best-effort: each URL is attempted independently, so a single failed
        download no longer aborts the remaining ones (previously one failure
        stopped the whole batch).
        """
        for note in xhs_result.notes:
            for image_url in note.images:
                try:
                    await self.media_manager.download_image(image_url, note.note_id)
                except Exception as e:
                    logger.error(f"媒体下载失败: {e}")
            for video_url in note.videos:
                try:
                    await self.media_manager.download_video(video_url, note.note_id)
                except Exception as e:
                    logger.error(f"媒体下载失败: {e}")

    def _update_statistics(self, result: ProcessingResult):
        """Fold one finished task into the running statistics."""
        self.stats.total_tasks += 1
        self.stats.total_processing_time += result.processing_time

        if result.success:
            self.stats.successful_tasks += 1
            if result.xhs_result:
                self.stats.total_notes_processed += len(result.xhs_result.notes)
            if result.document_result:
                self.stats.total_documents_processed += len(result.document_result.documents)
        else:
            self.stats.failed_tasks += 1

        if self.stats.total_tasks > 0:
            self.stats.average_processing_time = (
                self.stats.total_processing_time / self.stats.total_tasks
            )

        self.stats.last_updated = datetime.now()

    def get_statistics(self) -> ProcessingStats:
        """Return the live statistics object (not a copy)."""
        return self.stats

    def get_result(self, task_id: str) -> Optional[ProcessingResult]:
        """Return the stored result for ``task_id``, or None if unknown."""
        return self.results.get(task_id)

    def get_all_results(self) -> Dict[str, ProcessingResult]:
        """Return a shallow copy of all stored results."""
        return self.results.copy()

    def clear_results(self):
        """Drop all stored results (statistics are kept)."""
        self.results.clear()

    def get_status(self) -> Dict[str, Any]:
        """Return a status snapshot of the service and its components."""
        return {
            "xhs_adapter": self.xhs_adapter.get_status(),
            "document_adapter": self.document_adapter.get_status(),
            # Managers may not implement get_status; report empty dicts then.
            "cookie_manager": self.cookie_manager.get_status() if hasattr(self.cookie_manager, 'get_status') else {},
            "media_manager": self.media_manager.get_status() if hasattr(self.media_manager, 'get_status') else {},
            "statistics": self.stats,
            "total_results": len(self.results),
        }