TravelContentCreator/core/content_integration_service.py

349 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Content Integration Service
内容整合服务
core模块的主要服务通过适配器与xhs_spider和document模块交互
"""
import asyncio
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from datetime import datetime
import logging
import uuid
from .xhs_adapter import XHSAdapter, XHSSearchResult
from .document_adapter import DocumentAdapter, IntegratedContent
from .cookie_manager import CookieManager
from .media_manager import ImageStorageManager
logger = logging.getLogger(__name__)
@dataclass
class ProcessingConfig:
    """Input parameters describing one content-integration task."""

    # Search keyword handed to the XHS adapter; a falsy value skips the search.
    keyword: str
    # Paths of local documents to integrate; None or empty skips that step.
    document_paths: Optional[List[str]] = None
    # Upper bound on the number of notes requested from the XHS search.
    max_notes: int = 20
    # Target format name; the built-in fallback formatter recognises
    # "summary", "blog_post" and "travel_guide".
    output_format: str = "summary"
    # Whether media referenced by the found notes should be downloaded.
    download_media: bool = True
    # Optional caller-supplied prompt carried along with the task.
    custom_prompt: Optional[str] = None
@dataclass
class ProcessingResult:
    """Outcome of one content-integration task."""

    # Unique identifier of the task (the service generates a UUID4 string).
    task_id: str
    # The configuration the task was run with.
    config: ProcessingConfig
    # True only when every processing step finished without an exception.
    success: bool
    # Result of the XHS search, or None when no search was performed.
    xhs_result: Optional[XHSSearchResult] = None
    # Result of the document integration, or None when no documents were given.
    document_result: Optional[IntegratedContent] = None
    # The merged and formatted text produced by the task.
    final_content: str = ""
    # Text of the exception that made the task fail; empty on success.
    error_message: str = ""
    # Elapsed time of the task in seconds.
    processing_time: float = 0.0
    # Creation timestamp of this record (naive local time from datetime.now).
    created_time: datetime = field(default_factory=datetime.now)
@dataclass
class ProcessingStats:
    """Running counters describing the tasks handled by the service."""

    # Number of tasks recorded so far (successful and failed).
    total_tasks: int = 0
    # Number of tasks that finished successfully.
    successful_tasks: int = 0
    # Number of tasks that ended with an error.
    failed_tasks: int = 0
    # Sum of the processing time of all recorded tasks, in seconds.
    total_processing_time: float = 0.0
    # total_processing_time divided by total_tasks, in seconds.
    average_processing_time: float = 0.0
    # Number of XHS notes returned by successful tasks.
    total_notes_processed: int = 0
    # Number of documents integrated by successful tasks.
    total_documents_processed: int = 0
    # When this statistics object was created (naive local time).
    start_time: datetime = field(default_factory=datetime.now)
    # When the counters were last modified (naive local time).
    last_updated: datetime = field(default_factory=datetime.now)
class ContentIntegrationService:
    """Combine Xiaohongshu (XHS) search results and local documents into one text.

    The service talks to the XHS spider and the document module only through
    their adapters, keeps every task result in memory (keyed by task id) and
    maintains running statistics about the processed tasks.
    """

    # Maximum number of content characters kept by the fallback "summary" format.
    _SUMMARY_MAX_CHARS = 500

    def __init__(self,
                 cookie_config_path: str = "cookies.json",
                 media_storage_path: str = "media",
                 enable_logging: bool = True):
        """Create the adapters, managers and in-memory result store.

        Args:
            cookie_config_path: Path of the cookie configuration file; passed
                to both the XHS adapter and the cookie manager.
            media_storage_path: Directory handed to the media storage manager.
            enable_logging: Stored on the instance; this class itself does not
                consult the flag.
        """
        self.cookie_config_path = cookie_config_path
        self.media_storage_path = media_storage_path
        self.enable_logging = enable_logging

        # Adapters wrapping the external modules.
        self.xhs_adapter = XHSAdapter(cookie_config_path)
        self.document_adapter = DocumentAdapter()

        # Supporting managers.
        self.cookie_manager = CookieManager(cookie_config_path)
        self.media_manager = ImageStorageManager(media_storage_path)

        # Result storage (task id -> result) and running statistics.
        self.results: Dict[str, ProcessingResult] = {}
        self.stats = ProcessingStats()

        logger.info("内容整合服务初始化完成")

    async def process_content(self,
                              keyword: str,
                              document_paths: Optional[List[str]] = None,
                              output_format: str = "summary",
                              max_notes: int = 20,
                              custom_prompt: Optional[str] = None,
                              download_media: bool = True) -> ProcessingResult:
        """Run one integration task: search, read documents, merge, download.

        Any ``Exception`` raised by one of the processing steps is caught,
        logged and reported through ``error_message`` on the returned result
        (with ``success`` left at False). The result is always stored and
        counted in the statistics.

        Args:
            keyword: Search keyword; a falsy value skips the XHS search.
            document_paths: Documents to integrate; None/empty skips that step.
            output_format: Name of the desired output format.
            max_notes: Maximum number of notes to request from the search.
            custom_prompt: Optional custom prompt forwarded to the merge step.
            download_media: When True (the default, matching the previous
                behaviour) media of the found notes is downloaded.

        Returns:
            ProcessingResult: The outcome of the task.
        """
        task_id = str(uuid.uuid4())
        start_time = datetime.now()

        config = ProcessingConfig(
            keyword=keyword,
            document_paths=document_paths,
            max_notes=max_notes,
            output_format=output_format,
            download_media=download_media,
            custom_prompt=custom_prompt,
        )
        result = ProcessingResult(task_id=task_id, config=config, success=False)

        try:
            # 1. Search XHS content (only when a keyword is given).
            # NOTE: the adapter call is synchronous, so it runs inline on the
            # event loop thread.
            xhs_result = None
            if keyword:
                logger.info(f"开始搜索小红书内容: {keyword}")
                xhs_result = self.xhs_adapter.search_notes(
                    keyword=keyword,
                    max_notes=max_notes,
                )
                result.xhs_result = xhs_result
                logger.info(f"小红书搜索完成,找到 {len(xhs_result.notes)} 条笔记")

            # 2. Integrate the documents (only when paths are given).
            document_result = None
            if document_paths:
                logger.info(f"开始处理文档: {len(document_paths)} 个文件")
                document_result = self.document_adapter.integrate_documents(document_paths)
                result.document_result = document_result
                logger.info(f"文档处理完成,总长度: {document_result.total_length}")

            # 3. Merge both sources and format the text.
            result.final_content = self._integrate_content(
                xhs_result=xhs_result,
                document_result=document_result,
                output_format=output_format,
                custom_prompt=custom_prompt,
            )

            # 4. Download media files if requested.
            if config.download_media and xhs_result:
                await self._download_media(xhs_result)

            result.success = True
            logger.info(f"任务 {task_id} 处理完成")
        except Exception as e:
            result.error_message = str(e)
            # logger.exception keeps the message and adds the traceback.
            logger.exception(f"任务 {task_id} 处理失败: {e}")

        result.processing_time = (datetime.now() - start_time).total_seconds()

        # Store the result and update the statistics, success or not.
        self.results[task_id] = result
        self._update_statistics(result)
        return result

    async def batch_process(self,
                            tasks: List[Dict[str, Any]],
                            output_format: str = "summary") -> List[ProcessingResult]:
        """Process several tasks with ``asyncio.gather``.

        Args:
            tasks: Task dictionaries. Recognised keys: "keyword",
                "document_paths", "max_notes", "custom_prompt" and
                "download_media"; missing keys fall back to the defaults of
                ``process_content``.
            output_format: Output format applied to every task.

        Returns:
            List[ProcessingResult]: One result per task, in task order. A task
            whose coroutine ended with an exception (including cancellation)
            is represented by a failed placeholder result.
        """
        async def process_task(task_config):
            return await self.process_content(
                keyword=task_config.get("keyword", ""),
                document_paths=task_config.get("document_paths"),
                output_format=output_format,
                max_notes=task_config.get("max_notes", 20),
                custom_prompt=task_config.get("custom_prompt"),
                download_media=task_config.get("download_media", True),
            )

        outcomes = await asyncio.gather(
            *(process_task(task) for task in tasks),
            return_exceptions=True,
        )

        processed_results = []
        for task, outcome in zip(tasks, outcomes):
            # asyncio.CancelledError derives from BaseException, so checking
            # only for Exception would let it slip through as a "result".
            if isinstance(outcome, BaseException):
                outcome = ProcessingResult(
                    task_id=str(uuid.uuid4()),
                    config=ProcessingConfig(keyword=task.get("keyword", "")),
                    success=False,
                    # Some exceptions stringify to "", fall back to the type name.
                    error_message=str(outcome) or type(outcome).__name__,
                )
            processed_results.append(outcome)
        return processed_results

    def _integrate_content(self,
                           xhs_result: Optional[XHSSearchResult],
                           document_result: Optional[IntegratedContent],
                           output_format: str,
                           custom_prompt: Optional[str],
                           max_xhs_notes: int = 5) -> str:
        """Merge XHS notes and document text, then format the merged text.

        Args:
            xhs_result: XHS search result, or None.
            document_result: Document integration result, or None.
            output_format: Name of the desired output format.
            custom_prompt: Accepted for interface compatibility; not used by
                this method.
            max_xhs_notes: How many leading notes are included in the text.

        Returns:
            str: The formatted content, or a fixed "nothing found" message
            when neither source contributed any text.
        """
        content_parts = []

        # XHS section: title, content and author of the leading notes.
        if xhs_result and xhs_result.notes:
            xhs_content = "\n".join(
                f"标题: {note.title}\n内容: {note.content}\n作者: {note.author}\n"
                for note in xhs_result.notes[:max_xhs_notes]
            )
            content_parts.append(f"=== 小红书相关内容 ===\n{xhs_content}")

        # Document section.
        if document_result and document_result.integrated_text:
            content_parts.append(f"=== 文档相关内容 ===\n{document_result.integrated_text}")

        if not content_parts:
            return "没有找到相关内容"

        combined_content = "\n\n".join(content_parts)

        # Prefer the document adapter's transformation; when it is not
        # available or fails, fall back to the simple local formatter.
        if self.document_adapter.is_available():
            try:
                return self.document_adapter.transform_content(
                    combined_content,
                    output_format,
                )
            except Exception as e:
                logger.error(f"内容格式转换失败: {e}")

        return self._simple_format(combined_content, output_format)

    def _simple_format(self, content: str, output_format: str) -> str:
        """Format *content* without the document adapter.

        Args:
            content: The merged text.
            output_format: "summary", "blog_post" or "travel_guide"; any other
                value returns the content unchanged.

        Returns:
            str: The formatted text. For "summary" the content is cut to
            ``_SUMMARY_MAX_CHARS`` characters, and the trailing "..." is added
            only when something was actually cut off.
        """
        if output_format == "summary":
            limit = self._SUMMARY_MAX_CHARS
            if len(content) > limit:
                return f"摘要:\n{content[:limit]}..."
            return f"摘要:\n{content}"
        if output_format == "blog_post":
            return f"# 博客文章\n\n{content}"
        if output_format == "travel_guide":
            return f"# 旅游攻略\n\n{content}"
        return content

    async def _download_media(self, xhs_result: XHSSearchResult):
        """Download the images and videos of every note, best effort.

        A failing URL is logged and skipped so the remaining media is still
        attempted. This method never propagates an ``Exception``.
        """
        try:
            for note in xhs_result.notes:
                for image_url in note.images:
                    await self._download_one(
                        self.media_manager.download_image, image_url, note.note_id
                    )
                for video_url in note.videos:
                    await self._download_one(
                        self.media_manager.download_video, video_url, note.note_id
                    )
        except Exception as e:
            # Anything outside a single download (e.g. malformed note data).
            logger.error(f"媒体下载失败: {e}")

    async def _download_one(self, downloader, url, note_id) -> bool:
        """Await one download call; log and swallow its failure.

        Returns:
            bool: True when the download call completed, False when it raised.
        """
        try:
            await downloader(url, note_id)
        except Exception as e:
            logger.error(f"媒体下载失败 ({url}): {e}")
            return False
        return True

    def _update_statistics(self, result: ProcessingResult):
        """Fold one task result into the running statistics."""
        stats = self.stats
        stats.total_tasks += 1
        stats.total_processing_time += result.processing_time

        if result.success:
            stats.successful_tasks += 1
            # Notes and documents are only counted for successful tasks.
            if result.xhs_result:
                stats.total_notes_processed += len(result.xhs_result.notes)
            if result.document_result:
                stats.total_documents_processed += len(result.document_result.documents)
        else:
            stats.failed_tasks += 1

        # Guard against division by zero if the counter was reset externally.
        if stats.total_tasks > 0:
            stats.average_processing_time = (
                stats.total_processing_time / stats.total_tasks
            )
        stats.last_updated = datetime.now()

    def get_statistics(self) -> ProcessingStats:
        """Return the live statistics object (not a copy)."""
        return self.stats

    def get_result(self, task_id: str) -> Optional[ProcessingResult]:
        """Return the stored result for *task_id*, or None if unknown."""
        return self.results.get(task_id)

    def get_all_results(self) -> Dict[str, ProcessingResult]:
        """Return a shallow copy of the task id -> result mapping."""
        return self.results.copy()

    def clear_results(self):
        """Remove all stored results; the statistics are left untouched."""
        self.results.clear()

    def get_status(self) -> Dict[str, Any]:
        """Collect the status of the adapters and managers plus the statistics.

        The cookie and media managers report an empty dict when they do not
        provide a ``get_status`` method.
        """
        return {
            "xhs_adapter": self.xhs_adapter.get_status(),
            "document_adapter": self.document_adapter.get_status(),
            "cookie_manager": self.cookie_manager.get_status() if hasattr(self.cookie_manager, 'get_status') else {},
            "media_manager": self.media_manager.get_status() if hasattr(self.media_manager, 'get_status') else {},
            "statistics": self.stats,
            "total_results": len(self.results)
        }