优化了文本解析的输出字段

This commit is contained in:
jinye_huang 2025-07-15 18:20:36 +08:00
parent c66273b393
commit f6cff4e5c0
8 changed files with 48 additions and 402 deletions

View File

@ -71,15 +71,22 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
document_info=result["document_info"],
xhs_info=result["xhs_info"],
integrated_content=result["integrated_content"],
search_config=result["search_config"]
search_config=result["search_config"],
error_message=None # 成功时无错误信息
)
logger.info(f"内容整合成功,处理时间:{result['processing_time']}")
else:
from datetime import datetime
response = ContentIntegrationResponse(
success=False,
timestamp=result["timestamp"],
processing_time=result["processing_time"],
error_message=result["error_message"]
timestamp=result.get("timestamp", datetime.now().strftime("%Y%m%d_%H%M%S")),
processing_time=result.get("processing_time", "0秒"),
input_summary=result.get("input_summary"),
document_info=result.get("document_info"),
xhs_info=result.get("xhs_info"),
integrated_content=result.get("integrated_content"),
search_config=result.get("search_config"),
error_message=result.get("error_message")
)
logger.error(f"内容整合失败:{result['error_message']}")

View File

@ -8,13 +8,8 @@ Core Module
提供内容整合的核心服务,通过适配器模式与xhs_spider和document模块交互
"""
# 导入主要服务
from .content_integration_service import (
ContentIntegrationService,
ProcessingConfig,
ProcessingResult,
ProcessingStats
)
# 注意ContentIntegrationService已移至api.services模块
# 这里不再导入以避免与API服务冲突
# 导入适配器
from .xhs_adapter import XHSAdapter, XHSNote, XHSSearchResult
@ -30,14 +25,6 @@ __author__ = "TravelContentCreator Team"
# 导出所有公共接口
__all__ = [
# 主要服务
'ContentIntegrationService',
# 数据模型
'ProcessingConfig',
'ProcessingResult',
'ProcessingStats',
# 适配器
'XHSAdapter',
'DocumentAdapter',
@ -57,26 +44,4 @@ __all__ = [
'__author__'
]
# 便捷函数
def create_integration_service(cookie_config_path: str = "cookies.json",
                               media_storage_path: str = "media",
                               enable_logging: bool = True) -> ContentIntegrationService:
    """Build and return a content-integration service instance.

    Args:
        cookie_config_path: Path to the cookie configuration file.
        media_storage_path: Directory used for downloaded media files.
        enable_logging: Whether logging is enabled.

    Returns:
        ContentIntegrationService: A freshly constructed service instance.
    """
    options = {
        "cookie_config_path": cookie_config_path,
        "media_storage_path": media_storage_path,
        "enable_logging": enable_logging,
    }
    return ContentIntegrationService(**options)
# 添加到导出列表
__all__.append('create_integration_service')
# 注意ContentIntegrationService及相关模型已移至api.services模块

View File

@ -1,349 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Content Integration Service
内容整合服务
core模块的主要服务通过适配器与xhs_spider和document模块交互
"""
import asyncio
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from datetime import datetime
import logging
import uuid
from .xhs_adapter import XHSAdapter, XHSSearchResult
from .document_adapter import DocumentAdapter, IntegratedContent
from .cookie_manager import CookieManager
from .media_manager import ImageStorageManager
logger = logging.getLogger(__name__)
@dataclass
class ProcessingConfig:
    """Configuration for a single content-integration task."""
    keyword: str  # search keyword for the XHS (Xiaohongshu) lookup; empty skips search
    document_paths: Optional[List[str]] = None  # local document files to merge, if any
    max_notes: int = 20  # cap on the number of XHS notes fetched
    output_format: str = "summary"  # "summary" / "blog_post" / "travel_guide"; others pass through unchanged
    download_media: bool = True  # whether note images/videos are downloaded after integration
    custom_prompt: Optional[str] = None  # optional custom prompt forwarded to the pipeline
@dataclass
class ProcessingResult:
    """Outcome of one integration task, stored on the service keyed by task_id."""
    task_id: str  # UUID4 string assigned when the task starts
    config: ProcessingConfig  # the configuration this task ran with
    success: bool  # True only if every step completed without raising
    xhs_result: Optional[XHSSearchResult] = None  # raw XHS search result, when a keyword was given
    document_result: Optional[IntegratedContent] = None  # merged document content, when paths were given
    final_content: str = ""  # integrated and formatted output text
    error_message: str = ""  # populated from the exception message on failure
    processing_time: float = 0.0  # wall-clock seconds measured around the whole task
    created_time: datetime = field(default_factory=datetime.now)  # local time at result creation
@dataclass
class ProcessingStats:
    """Aggregate counters, refreshed after every finished task."""
    total_tasks: int = 0  # all tasks, successful or not
    successful_tasks: int = 0
    failed_tasks: int = 0
    total_processing_time: float = 0.0  # seconds summed over all tasks
    average_processing_time: float = 0.0  # total_processing_time / total_tasks
    total_notes_processed: int = 0  # XHS notes counted for successful tasks only
    total_documents_processed: int = 0  # documents counted for successful tasks only
    start_time: datetime = field(default_factory=datetime.now)  # time the stats object was created
    last_updated: datetime = field(default_factory=datetime.now)  # refreshed on every update
class ContentIntegrationService:
    """Content-integration service.

    Orchestrates XHS (Xiaohongshu) note search and local document merging
    through adapters, formats the combined text, optionally downloads media,
    and keeps per-task results plus aggregate statistics in memory.
    """

    def __init__(self,
                 cookie_config_path: str = "cookies.json",
                 media_storage_path: str = "media",
                 enable_logging: bool = True):
        """
        Initialize the service.

        Args:
            cookie_config_path: Path to the cookie configuration file.
            media_storage_path: Directory for downloaded media files.
            enable_logging: Whether logging is enabled (stored but not read
                elsewhere in this class; consumers may use it).
        """
        self.cookie_config_path = cookie_config_path
        self.media_storage_path = media_storage_path
        self.enable_logging = enable_logging
        # Adapters wrapping the external xhs_spider / document modules
        self.xhs_adapter = XHSAdapter(cookie_config_path)
        self.document_adapter = DocumentAdapter()
        # Managers for cookies and downloaded media
        self.cookie_manager = CookieManager(cookie_config_path)
        self.media_manager = ImageStorageManager(media_storage_path)
        # In-memory result store (task_id -> result) and aggregate statistics
        self.results: Dict[str, ProcessingResult] = {}
        self.stats = ProcessingStats()
        logger.info("内容整合服务初始化完成")

    async def process_content(self,
                              keyword: str,
                              document_paths: Optional[List[str]] = None,
                              output_format: str = "summary",
                              max_notes: int = 20,
                              custom_prompt: Optional[str] = None) -> ProcessingResult:
        """
        Run one content-integration task end to end.

        Args:
            keyword: Search keyword; an empty string skips the XHS search step.
            document_paths: Optional list of document paths to merge.
            output_format: Target output format (e.g. "summary").
            max_notes: Maximum number of XHS notes to fetch.
            custom_prompt: Optional custom prompt for the integration step.

        Returns:
            ProcessingResult: The task result (also stored under its task_id);
            on any exception, success is False and error_message carries str(e).
        """
        task_id = str(uuid.uuid4())
        start_time = datetime.now()
        config = ProcessingConfig(
            keyword=keyword,
            document_paths=document_paths,
            max_notes=max_notes,
            output_format=output_format,
            custom_prompt=custom_prompt
        )
        result = ProcessingResult(
            task_id=task_id,
            config=config,
            success=False
        )
        try:
            # 1. Search XHS content (only when a keyword is given)
            xhs_result = None
            if keyword:
                logger.info(f"开始搜索小红书内容: {keyword}")
                # NOTE(review): search_notes is called synchronously inside an
                # async method -- presumably it blocks the event loop; confirm.
                xhs_result = self.xhs_adapter.search_notes(
                    keyword=keyword,
                    max_notes=max_notes,
                )
                result.xhs_result = xhs_result
                logger.info(f"小红书搜索完成,找到 {len(xhs_result.notes)} 条笔记")
            # 2. Process documents (only when paths are given)
            document_result = None
            if document_paths:
                logger.info(f"开始处理文档: {len(document_paths)} 个文件")
                document_result = self.document_adapter.integrate_documents(document_paths)
                result.document_result = document_result
                logger.info(f"文档处理完成,总长度: {document_result.total_length}")
            # 3. Merge and format whatever was collected
            final_content = self._integrate_content(
                xhs_result=xhs_result,
                document_result=document_result,
                output_format=output_format,
                custom_prompt=custom_prompt
            )
            result.final_content = final_content
            # 4. Download media files (download_media defaults to True)
            if config.download_media and xhs_result:
                await self._download_media(xhs_result)
            result.success = True
            logger.info(f"任务 {task_id} 处理完成")
        except Exception as e:
            result.error_message = str(e)
            logger.error(f"任务 {task_id} 处理失败: {e}")
        # Elapsed time is recorded for both success and failure paths
        result.processing_time = (datetime.now() - start_time).total_seconds()
        # Store the result and refresh aggregate statistics
        self.results[task_id] = result
        self._update_statistics(result)
        return result

    async def batch_process(self,
                            tasks: List[Dict[str, Any]],
                            output_format: str = "summary") -> List[ProcessingResult]:
        """
        Process several tasks concurrently.

        Args:
            tasks: Task dicts; each may carry "keyword", "document_paths",
                "max_notes" and "custom_prompt".
            output_format: Output format applied to every task.

        Returns:
            List[ProcessingResult]: One result per input task, in order.
            Tasks that raised are converted into failed ProcessingResult
            entries instead of propagating.
        """
        results = []

        # Wrap one task dict into a process_content call.
        async def process_task(task_config):
            return await self.process_content(
                keyword=task_config.get("keyword", ""),
                document_paths=task_config.get("document_paths"),
                output_format=output_format,
                max_notes=task_config.get("max_notes", 20),
                custom_prompt=task_config.get("custom_prompt")
            )

        # Run all tasks concurrently; exceptions are returned, not raised.
        tasks_coroutines = [process_task(task) for task in tasks]
        results = await asyncio.gather(*tasks_coroutines, return_exceptions=True)
        # Convert raised exceptions into failed result objects.
        processed_results = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                error_result = ProcessingResult(
                    task_id=str(uuid.uuid4()),
                    config=ProcessingConfig(keyword=tasks[i].get("keyword", "")),
                    success=False,
                    error_message=str(result)
                )
                processed_results.append(error_result)
            else:
                processed_results.append(result)
        return processed_results

    def _integrate_content(self,
                           xhs_result: Optional[XHSSearchResult],
                           document_result: Optional[IntegratedContent],
                           output_format: str,
                           custom_prompt: Optional[str]) -> str:
        """
        Merge XHS and document content into one formatted string.

        Args:
            xhs_result: XHS search result (may be None).
            document_result: Merged document content (may be None).
            output_format: Target output format.
            custom_prompt: Accepted but not used inside this method.

        Returns:
            str: Formatted content, or a fixed "nothing found" message when
            both sources are empty.
        """
        content_parts = []
        # XHS section: only the first 5 notes are included
        if xhs_result and xhs_result.notes:
            xhs_content = "\n".join([
                f"标题: {note.title}\n内容: {note.content}\n作者: {note.author}\n"
                for note in xhs_result.notes[:5]  # only the first 5 notes
            ])
            content_parts.append(f"=== 小红书相关内容 ===\n{xhs_content}")
        # Document section
        if document_result and document_result.integrated_text:
            content_parts.append(f"=== 文档相关内容 ===\n{document_result.integrated_text}")
        # Nothing collected from either source
        if not content_parts:
            return "没有找到相关内容"
        combined_content = "\n\n".join(content_parts)
        # Prefer the document adapter's transformer; fall back to the simple
        # formatter when the adapter is unavailable or the transform fails.
        if self.document_adapter.is_available():
            try:
                formatted_content = self.document_adapter.transform_content(
                    combined_content,
                    output_format
                )
                return formatted_content
            except Exception as e:
                logger.error(f"内容格式转换失败: {e}")
        # Simple built-in formatting as the fallback path
        return self._simple_format(combined_content, output_format)

    def _simple_format(self, content: str, output_format: str) -> str:
        """Minimal fallback formatting; unknown formats return content unchanged."""
        if output_format == "summary":
            # Truncates to the first 500 characters
            return f"摘要:\n{content[:500]}..."
        elif output_format == "blog_post":
            return f"# 博客文章\n\n{content}"
        elif output_format == "travel_guide":
            return f"# 旅游攻略\n\n{content}"
        else:
            return content

    async def _download_media(self, xhs_result: XHSSearchResult):
        """Download all images and videos for every note; errors are logged, not raised."""
        try:
            for note in xhs_result.notes:
                # Download images
                for image_url in note.images:
                    await self.media_manager.download_image(image_url, note.note_id)
                # Download videos
                for video_url in note.videos:
                    await self.media_manager.download_video(video_url, note.note_id)
        except Exception as e:
            # Best-effort: a failed download aborts the remaining queue but
            # does not fail the task
            logger.error(f"媒体下载失败: {e}")

    def _update_statistics(self, result: ProcessingResult):
        """Fold one finished task into the aggregate statistics."""
        self.stats.total_tasks += 1
        self.stats.total_processing_time += result.processing_time
        if result.success:
            self.stats.successful_tasks += 1
            # Note/document counters are only accumulated for successful tasks
            if result.xhs_result:
                self.stats.total_notes_processed += len(result.xhs_result.notes)
            if result.document_result:
                self.stats.total_documents_processed += len(result.document_result.documents)
        else:
            self.stats.failed_tasks += 1
        # Recompute the running average processing time
        if self.stats.total_tasks > 0:
            self.stats.average_processing_time = (
                self.stats.total_processing_time / self.stats.total_tasks
            )
        self.stats.last_updated = datetime.now()

    def get_statistics(self) -> ProcessingStats:
        """Return the (live, mutable) aggregate statistics object."""
        return self.stats

    def get_result(self, task_id: str) -> Optional[ProcessingResult]:
        """Return the stored result for task_id, or None if unknown."""
        return self.results.get(task_id)

    def get_all_results(self) -> Dict[str, ProcessingResult]:
        """Return a shallow copy of the task_id -> result mapping."""
        return self.results.copy()

    def clear_results(self):
        """Drop all stored results (statistics are left untouched)."""
        self.results.clear()

    def get_status(self) -> Dict[str, Any]:
        """Snapshot of adapter/manager status plus statistics and result count."""
        return {
            "xhs_adapter": self.xhs_adapter.get_status(),
            "document_adapter": self.document_adapter.get_status(),
            # Managers may not expose get_status; fall back to an empty dict
            "cookie_manager": self.cookie_manager.get_status() if hasattr(self.cookie_manager, 'get_status') else {},
            "media_manager": self.media_manager.get_status() if hasattr(self.media_manager, 'get_status') else {},
            "statistics": self.stats,
            "total_results": len(self.results)
        }

View File

@ -21,24 +21,47 @@
{
"attractions": [
{
"name": "景区/酒店名称",
"name": "景区名称",
"address": "景区详细地址",
"trafficInfo": "交通指南详细信息",
"description": "景区空泛描述信息",
"advantage": "景区最大优势",
"highlight": "景区亮点描述",
"products": [
{
"product_name": "产品套餐名称",
"usage_rules": "产品使用规则详细说明",
"product_name": "产品名称",
"originPrice": "原始价格",
"realPrice": "实际价格",
"packageInfo": "产品详情完整描述",
"salesPeriod": "销售周期信息",
"stock": "库存数量",
"usage_rules": {
"rules": "使用规则详细说明",
"surcharge": "加价说明",
"reservation": "预约规则",
"refund": "退改政策",
"discounts": "优惠内容"
},
"transportation": {
"guide": "交通指南",
"guide": "交通指南详细信息",
"address": "详细地址"
},
"price": "产品价格信息",
"key_advantages": "产品最突出优势",
"key_advantages": "产品最大优势描述",
"highlights": "产品亮点详细描述",
"detailed_description": [
"1. 详细描述项目1",
"2. 详细描述项目2",
"..."
"1. 游泳池尺寸长xx米宽xx米深xx米",
"2. 泳池特色:如无边泳池、恒温泳池等",
"3. 开放时间xx:xx-xx:xx",
"4. 适合人群:儿童/成人等",
"5. 安全设施:救生员配置等",
"6. 周边景点xxx景点距离xx米",
"7. 风景描述:如海景、山景等",
"8. 小贴士:需要携带的物品等",
"9. 游玩路线建议",
"10. 推荐游玩时长"
]
}
]
}
]
}
}

View File

@ -342,7 +342,7 @@ class ContentPromptBuilder(BasePromptBuilder):
style_filename = topic.get("style", "")
style_content = self._find_and_read_file(style_filename, self.resource_config.style.paths) or "通用风格"
demand_filename = topic.get("target_audience", "")
demand_filename = topic.get("targetAudience", "")
demand_content = self._find_and_read_file(demand_filename, self.resource_config.demand.paths) or "通用用户画像"
object_name = topic.get("object", "")