2025-07-15 15:47:47 +08:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
Content Integration Service
|
|
|
|
|
|
内容整合服务
|
|
|
|
|
|
|
|
|
|
|
|
core模块的主要服务,通过适配器与xhs_spider和document模块交互
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import asyncio
import logging
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Any

from .xhs_adapter import XHSAdapter, XHSSearchResult
from .document_adapter import DocumentAdapter, IntegratedContent
from .cookie_manager import CookieManager
from .media_manager import ImageStorageManager
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger, named after this module, shared by everything below.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ProcessingConfig:
    """Settings describing one content-integration task.

    Only ``keyword`` is required; every other field has a default.
    """

    # Search keyword for the task.
    keyword: str

    # Paths of the documents to integrate; None means "no documents".
    document_paths: Optional[List[str]] = None

    # Upper bound on the number of notes requested from the search.
    max_notes: int = 20

    # Name of the output format (for example "summary").
    output_format: str = "summary"

    # Whether media attached to the found notes should be downloaded.
    download_media: bool = True

    # Optional caller-supplied prompt text.
    custom_prompt: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ProcessingResult:
    """Outcome of one content-integration task.

    ``task_id``, ``config`` and ``success`` are required; the remaining
    fields start empty and are filled in while the task runs.
    """

    # Identifier of the task this result belongs to.
    task_id: str

    # Configuration the task was run with.
    config: ProcessingConfig

    # True when the task finished without raising.
    success: bool

    # Search result, when a search was performed.
    xhs_result: Optional[XHSSearchResult] = None

    # Document integration result, when documents were processed.
    document_result: Optional[IntegratedContent] = None

    # Final integrated text.
    final_content: str = ""

    # Text of the error that made the task fail, if any.
    error_message: str = ""

    # Duration of the task in seconds.
    processing_time: float = 0.0

    # Moment this result object was created.
    created_time: datetime = field(default_factory=datetime.now)
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ProcessingStats:
    """Running counters describing the work done by the service."""

    # Number of tasks recorded so far, successful or not.
    total_tasks: int = 0

    # Number of tasks that completed successfully.
    successful_tasks: int = 0

    # Number of tasks that failed.
    failed_tasks: int = 0

    # Sum of the processing times of all recorded tasks, in seconds.
    total_processing_time: float = 0.0

    # total_processing_time divided by total_tasks.
    average_processing_time: float = 0.0

    # Number of notes contained in the results of successful tasks.
    total_notes_processed: int = 0

    # Number of documents contained in the results of successful tasks.
    total_documents_processed: int = 0

    # Moment this statistics object was created.
    start_time: datetime = field(default_factory=datetime.now)

    # Moment the counters were last refreshed.
    last_updated: datetime = field(default_factory=datetime.now)
|
|
|
|
|
|
|
|
|
|
|
|
class ContentIntegrationService:
|
|
|
|
|
|
"""内容整合服务"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(
    self,
    cookie_config_path: str = "cookies.json",
    media_storage_path: str = "media",
    enable_logging: bool = True,
):
    """Create the service together with its adapters and managers.

    Args:
        cookie_config_path: Path of the cookie configuration file; handed
            to both the XHS adapter and the cookie manager.
        media_storage_path: Storage path handed to the image storage
            manager.
        enable_logging: Logging flag; it is only stored on the instance
            here.
    """
    # Keep the raw settings on the instance.
    self.cookie_config_path = cookie_config_path
    self.media_storage_path = media_storage_path
    self.enable_logging = enable_logging

    # Adapters towards the search and document modules.
    self.xhs_adapter = XHSAdapter(cookie_config_path)
    self.document_adapter = DocumentAdapter()

    # Managers for cookies and downloaded media.
    self.cookie_manager = CookieManager(cookie_config_path)
    self.media_manager = ImageStorageManager(media_storage_path)

    # Per-task results keyed by task id, plus the aggregate counters.
    self.results: Dict[str, ProcessingResult] = {}
    self.stats = ProcessingStats()

    logger.info("内容整合服务初始化完成")
|
|
|
|
|
|
|
|
|
|
|
|
async def process_content(self,
                          keyword: str,
                          document_paths: Optional[List[str]] = None,
                          output_format: str = "summary",
                          max_notes: int = 20,
                          custom_prompt: Optional[str] = None,
                          download_media: bool = True) -> ProcessingResult:
    """Run one content-integration task end to end.

    The task searches notes for ``keyword`` (when it is non-empty),
    integrates the given documents (when any are given), merges both into
    the final content and finally downloads the media of the found notes.
    Any ``Exception`` raised on the way is captured in the returned result
    instead of being propagated.

    Args:
        keyword: Search keyword; an empty value skips the search step.
        document_paths: Paths of the documents to integrate; a falsy value
            skips the document step.
        output_format: Name of the output format.
        max_notes: Maximum number of notes requested from the search.
        custom_prompt: Optional custom prompt text.
        download_media: Whether to download media of the found notes.
            Defaults to True, which matches the previous behaviour.

    Returns:
        ProcessingResult: The result of the task; ``success`` tells whether
        every step completed, ``error_message`` holds the failure text.
    """
    task_id = str(uuid.uuid4())
    # A monotonic clock is used for the duration: subtracting naive
    # datetime.now() values gives wrong results across DST switches or
    # system clock adjustments.
    started = time.perf_counter()

    config = ProcessingConfig(
        keyword=keyword,
        document_paths=document_paths,
        max_notes=max_notes,
        output_format=output_format,
        download_media=download_media,
        custom_prompt=custom_prompt,
    )

    # Start pessimistic; success is only set once every step has finished.
    result = ProcessingResult(
        task_id=task_id,
        config=config,
        success=False,
    )

    try:
        # 1. Search notes (only when a keyword was given).
        xhs_result = None
        if keyword:
            logger.info(f"开始搜索小红书内容: {keyword}")
            xhs_result = self.xhs_adapter.search_notes(
                keyword=keyword,
                max_notes=max_notes,
            )
            result.xhs_result = xhs_result
            logger.info(f"小红书搜索完成,找到 {len(xhs_result.notes)} 条笔记")

        # 2. Integrate documents (only when paths were given).
        document_result = None
        if document_paths:
            logger.info(f"开始处理文档: {len(document_paths)} 个文件")
            document_result = self.document_adapter.integrate_documents(document_paths)
            result.document_result = document_result
            logger.info(f"文档处理完成,总长度: {document_result.total_length}")

        # 3. Merge both sources into the final content.
        result.final_content = self._integrate_content(
            xhs_result=xhs_result,
            document_result=document_result,
            output_format=output_format,
            custom_prompt=custom_prompt,
        )

        # 4. Download media when requested and a search result exists.
        if config.download_media and xhs_result:
            await self._download_media(xhs_result)

        result.success = True
        logger.info(f"任务 {task_id} 处理完成")

    except Exception as e:
        result.error_message = str(e)
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"任务 {task_id} 处理失败: {e}")

    # Duration in seconds, recorded for successful and failed tasks alike.
    result.processing_time = time.perf_counter() - started

    # Store the result and refresh the aggregate statistics.
    self.results[task_id] = result
    self._update_statistics(result)

    return result
|
|
|
|
|
|
|
|
|
|
|
|
async def batch_process(self,
                        tasks: List[Dict[str, Any]],
                        output_format: str = "summary") -> List[ProcessingResult]:
    """Process several tasks concurrently.

    Each task is run through ``process_content``; all of them are awaited
    together with ``asyncio.gather``. A task whose coroutine ends with an
    exception is represented by a failed ``ProcessingResult`` so that the
    returned list always has one result per input task, in input order.

    Args:
        tasks: Task descriptions. The keys read from each one are
            ``keyword``, ``document_paths``, ``max_notes`` and
            ``custom_prompt``.
        output_format: Output format applied to every task.

    Returns:
        List[ProcessingResult]: One result per task.
    """

    async def process_task(task_config):
        return await self.process_content(
            keyword=task_config.get("keyword", ""),
            document_paths=task_config.get("document_paths"),
            output_format=output_format,
            max_notes=task_config.get("max_notes", 20),
            custom_prompt=task_config.get("custom_prompt"),
        )

    # Run every task concurrently; exceptions come back as list items.
    outcomes = await asyncio.gather(
        *(process_task(task) for task in tasks),
        return_exceptions=True,
    )

    processed_results = []
    for task_config, outcome in zip(tasks, outcomes):
        # BaseException, not Exception: asyncio.CancelledError derives from
        # BaseException, and a cancelled sub-task must not leak an
        # exception object into the list of results.
        if not isinstance(outcome, BaseException):
            processed_results.append(outcome)
            continue

        # A malformed (non-dict) task is what may have failed in the first
        # place, so do not call .get on it again without checking.
        fields = task_config if isinstance(task_config, dict) else {}
        processed_results.append(
            ProcessingResult(
                task_id=str(uuid.uuid4()),
                config=ProcessingConfig(
                    keyword=fields.get("keyword", ""),
                    document_paths=fields.get("document_paths"),
                    max_notes=fields.get("max_notes", 20),
                    output_format=output_format,
                    custom_prompt=fields.get("custom_prompt"),
                ),
                success=False,
                # Some exceptions (e.g. CancelledError) have an empty
                # message; fall back to the exception type name.
                error_message=str(outcome) or type(outcome).__name__,
            )
        )

    return processed_results
|
|
|
|
|
|
|
|
|
|
|
|
def _integrate_content(self,
                       xhs_result: Optional[XHSSearchResult],
                       document_result: Optional[IntegratedContent],
                       output_format: str,
                       custom_prompt: Optional[str]) -> str:
    """Merge search notes and document text into one formatted string.

    Args:
        xhs_result: Search result; only its first five notes are used.
        document_result: Document integration result.
        output_format: Name of the output format.
        custom_prompt: Custom prompt text; not used by this method.

    Returns:
        str: The formatted content, or a fixed "nothing found" message when
        neither source contributed any text.
    """
    sections = []

    # Section built from the notes: title, content and author per note.
    if xhs_result and xhs_result.notes:
        note_blocks = []
        for note in xhs_result.notes[:5]:  # first five notes only
            note_blocks.append(
                f"标题: {note.title}\n内容: {note.content}\n作者: {note.author}\n"
            )
        notes_text = "\n".join(note_blocks)
        sections.append(f"=== 小红书相关内容 ===\n{notes_text}")

    # Section built from the integrated document text.
    if document_result and document_result.integrated_text:
        sections.append(f"=== 文档相关内容 ===\n{document_result.integrated_text}")

    if not sections:
        return "没有找到相关内容"

    merged = "\n\n".join(sections)

    # Prefer the document adapter's transformation when it is available;
    # a failure there is logged and the simple formatter is used instead.
    if self.document_adapter.is_available():
        try:
            return self.document_adapter.transform_content(merged, output_format)
        except Exception as e:
            logger.error(f"内容格式转换失败: {e}")

    return self._simple_format(merged, output_format)
|
|
|
|
|
|
|
|
|
|
|
|
def _simple_format(self, content: str, output_format: str) -> str:
|
|
|
|
|
|
"""简单的内容格式化"""
|
|
|
|
|
|
if output_format == "summary":
|
|
|
|
|
|
return f"摘要:\n{content[:500]}..."
|
|
|
|
|
|
elif output_format == "blog_post":
|
|
|
|
|
|
return f"# 博客文章\n\n{content}"
|
|
|
|
|
|
elif output_format == "travel_guide":
|
|
|
|
|
|
return f"# 旅游攻略\n\n{content}"
|
|
|
|
|
|
else:
|
|
|
|
|
|
return content
|
|
|
|
|
|
|
|
|
|
|
|
async def _download_media(self, xhs_result: XHSSearchResult):
    """Download the images and videos of every note, best effort.

    A failing download is logged and skipped, so it no longer prevents the
    remaining media from being downloaded. As before, no ``Exception`` is
    propagated to the caller.

    Args:
        xhs_result: Search result whose notes provide ``images``,
            ``videos`` and ``note_id``.
    """

    async def attempt(download, url, note_id):
        # Isolate each download so one bad URL does not abort the rest.
        try:
            await download(url, note_id)
        except Exception as e:
            logger.error(f"媒体下载失败: {e}")

    try:
        for note in xhs_result.notes:
            # Images first, then videos, in the order the note lists them.
            for image_url in note.images:
                await attempt(self.media_manager.download_image, image_url, note.note_id)

            for video_url in note.videos:
                await attempt(self.media_manager.download_video, video_url, note.note_id)

    except Exception as e:
        # Reached only for problems outside a single download, such as a
        # note without the expected attributes.
        logger.error(f"媒体下载失败: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
def _update_statistics(self, result: ProcessingResult):
    """Fold one finished task into the aggregate statistics.

    Args:
        result: Result of the task that has just finished.
    """
    stats = self.stats

    # Every task counts towards the totals, whatever its outcome.
    stats.total_tasks += 1
    stats.total_processing_time += result.processing_time

    if not result.success:
        stats.failed_tasks += 1
    else:
        stats.successful_tasks += 1
        # Notes and documents are only counted for successful tasks.
        if result.xhs_result:
            stats.total_notes_processed += len(result.xhs_result.notes)
        if result.document_result:
            stats.total_documents_processed += len(result.document_result.documents)

    # Refresh the average processing time.
    if stats.total_tasks > 0:
        stats.average_processing_time = stats.total_processing_time / stats.total_tasks

    stats.last_updated = datetime.now()
|
|
|
|
|
|
|
|
|
|
|
|
def get_statistics(self) -> ProcessingStats:
    """Return the service's statistics object (the live one, not a copy)."""
    current_stats = self.stats
    return current_stats
|
|
|
|
|
|
|
|
|
|
|
|
def get_result(self, task_id: str) -> Optional[ProcessingResult]:
    """Return the stored result for ``task_id``, or None when there is none."""
    stored = self.results
    return stored.get(task_id)
|
|
|
|
|
|
|
|
|
|
|
|
def get_all_results(self) -> Dict[str, ProcessingResult]:
    """Return a shallow copy of the task-id to result mapping."""
    return dict(self.results)
|
|
|
|
|
|
|
|
|
|
|
|
def clear_results(self):
    """Remove every stored result; the mapping is emptied in place."""
    stored = self.results
    stored.clear()
|
|
|
|
|
|
|
|
|
|
|
|
def get_status(self) -> Dict[str, Any]:
    """Collect a status snapshot of the service and its components.

    Returns:
        Dict[str, Any]: The status of both adapters, the status of the
        cookie and media managers (an empty dict for a manager without a
        ``get_status`` attribute), the statistics object and the number of
        stored results.
    """

    def optional_status(component):
        # The managers are not required to offer get_status.
        if hasattr(component, 'get_status'):
            return component.get_status()
        return {}

    status: Dict[str, Any] = {}
    status["xhs_adapter"] = self.xhs_adapter.get_status()
    status["document_adapter"] = self.document_adapter.get_status()
    status["cookie_manager"] = optional_status(self.cookie_manager)
    status["media_manager"] = optional_status(self.media_manager)
    status["statistics"] = self.stats
    status["total_results"] = len(self.results)
    return status
|