236 lines
7.0 KiB
Python
236 lines
7.0 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
内容转换器模块
|
|||
|
|
使用LLM将解析的文档内容转换为标准化的景区和产品资料格式
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import logging
|
|||
|
|
from typing import Dict, Any, Optional, List
|
|||
|
|
from dataclasses import dataclass
|
|||
|
|
from datetime import datetime
|
|||
|
|
import uuid
|
|||
|
|
|
|||
|
|
from .content_integrator import IntegratedContent
|
|||
|
|
from core.ai.ai_agent import AIAgent
|
|||
|
|
from core.config.manager import ConfigManager
|
|||
|
|
from utils.file_io import OutputManager
|
|||
|
|
|
|||
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
@dataclass
class TransformedContent:
    """Result of rendering integrated content into one output format.

    All five fields are required and are accepted positionally in the order
    declared below.
    """

    # The integrated content the text was rendered from (kept by reference).
    original_content: "IntegratedContent"
    # The rendered output text.
    transformed_text: str
    # Name of the format that produced ``transformed_text``.
    format_type: str
    # Free-form bookkeeping about the transformation (counts, lengths, flags).
    transformation_metadata: Dict[str, Any]
    # Timestamp recorded when the transformation finished.
    transformed_at: datetime
|
|||
|
|
|
|||
|
|
class ContentTransformer:
    """Content transformer: renders integrated content into a named format.

    Each supported format maps to a callable taking
    ``(content, custom_prompt)`` and returning the rendered text. The
    built-in formats fill fixed text templates with ``str.format``.

    NOTE: none of the built-in transforms reads ``custom_prompt``; it is
    accepted so that every transform shares one call signature, and custom
    formats registered through ``add_custom_format`` may use it.
    """

    # Templates are kept as class constants so each transform method is only
    # the mapping from content attributes to placeholders. The text (including
    # the leading and trailing newline) is emitted verbatim at runtime.
    _ATTRACTION_STANDARD_TEMPLATE = """
# 景点信息整理

## 基本信息
- 文档来源: {document_count}个文档
- 主要主题: {key_topics}

## 详细内容
{combined_content}

## 内容摘要
{content_summary}

---
*基于提供的文档整理,如需更多信息请参考原始文档*
"""

    _PRODUCT_SALES_TEMPLATE = """
# 产品销售资料

## 产品特色
基于{document_count}个文档的信息整理:

{content_summary}

## 详细介绍
{combined_content}

## 关键卖点
{key_topics}

---
*内容整理自提供的文档资料*
"""

    _TRAVEL_GUIDE_TEMPLATE = """
# 旅游指南

## 概述
{content_summary}

## 详细信息
{combined_content}

## 重要提示
- 信息来源: {document_count}个文档
- 关键主题: {key_topics}

---
*本指南基于提供的文档整理,出行前请核实最新信息*
"""

    _BLOG_POST_TEMPLATE = """
# 博客文章

## 前言
本文基于{document_count}个文档资料整理而成。

## 主要内容

{combined_content}

## 总结
{content_summary}

## 相关主题
{key_topics}

---
*本文内容整理自多个文档资料*
"""

    _SUMMARY_TEMPLATE = """
# 文档内容摘要

## 文档统计
- 文档数量: {document_count}
- 文档类型: {document_types}
- 内容长度: {content_length}字符

## 内容摘要
{content_summary}

## 关键主题
{key_topics}

## 完整内容
{combined_content}
"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a transformer with the built-in formats registered.

        Args:
            config: Optional settings mapping; a falsy value is replaced by an
                empty dict. It is stored on the instance but not read by any
                method of this class.
        """
        self.config = config or {}
        # Dispatch table: format name -> transform callable.
        self.supported_formats = {
            'attraction_standard': self._transform_to_attraction_standard,
            'product_sales': self._transform_to_product_sales,
            'travel_guide': self._transform_to_travel_guide,
            'blog_post': self._transform_to_blog_post,
            'summary': self._transform_to_summary,
        }

    def transform_content(self,
                          integrated_content: "IntegratedContent",
                          format_type: str = 'summary',
                          custom_prompt: Optional[str] = None) -> "TransformedContent":
        """Render ``integrated_content`` in the requested format.

        Args:
            integrated_content: The integrated content to render.
            format_type: Name of a registered format.
            custom_prompt: Optional prompt forwarded to the transform
                callable. The built-in transforms ignore it.

        Returns:
            TransformedContent: The rendered text together with the source
            content, the format name, metadata and a ``datetime.now()``
            timestamp.

        Raises:
            ValueError: If ``format_type`` is not a registered format.
        """
        if format_type not in self.supported_formats:
            raise ValueError(f"不支持的格式类型: {format_type}")

        logger.info("开始转换内容,格式: %s", format_type)

        # Run the transform registered for this format.
        transform_func = self.supported_formats[format_type]
        transformed_text = transform_func(integrated_content, custom_prompt)

        # Build the transformation metadata. ``custom_prompt_used`` only
        # records that a prompt was supplied, not that the transform read it.
        transformation_metadata = {
            'format_type': format_type,
            'source_document_count': integrated_content.document_count,
            'source_content_length': integrated_content.total_content_length,
            'transformed_content_length': len(transformed_text),
            'key_topics_used': integrated_content.key_topics,
            'custom_prompt_used': custom_prompt is not None,
        }

        return TransformedContent(
            original_content=integrated_content,
            transformed_text=transformed_text,
            format_type=format_type,
            transformation_metadata=transformation_metadata,
            transformed_at=datetime.now(),
        )

    @staticmethod
    def _bulleted(topics, bullet: str) -> str:
        """Render each topic on its own line, prefixed by ``bullet`` and a space."""
        return "\n".join(f"{bullet} {topic}" for topic in topics)

    def _transform_to_attraction_standard(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the attraction-standard format.

        Lists at most the first five key topics, comma-separated.
        ``custom_prompt`` is not used.
        """
        return self._ATTRACTION_STANDARD_TEMPLATE.format(
            document_count=content.document_count,
            key_topics=", ".join(content.key_topics[:5]),
            combined_content=content.combined_content,
            content_summary=content.content_summary,
        )

    def _transform_to_product_sales(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the product-sales format.

        Lists at most the first eight key topics as "•" bullets.
        ``custom_prompt`` is not used.
        """
        return self._PRODUCT_SALES_TEMPLATE.format(
            document_count=content.document_count,
            content_summary=content.content_summary,
            combined_content=content.combined_content,
            key_topics=self._bulleted(content.key_topics[:8], "•"),
        )

    def _transform_to_travel_guide(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the travel-guide format.

        Lists at most the first five key topics, comma-separated.
        ``custom_prompt`` is not used.
        """
        return self._TRAVEL_GUIDE_TEMPLATE.format(
            content_summary=content.content_summary,
            combined_content=content.combined_content,
            document_count=content.document_count,
            key_topics=", ".join(content.key_topics[:5]),
        )

    def _transform_to_blog_post(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the blog-post format.

        Lists at most the first ten key topics as "-" bullets.
        ``custom_prompt`` is not used.
        """
        return self._BLOG_POST_TEMPLATE.format(
            document_count=content.document_count,
            combined_content=content.combined_content,
            content_summary=content.content_summary,
            key_topics=self._bulleted(content.key_topics[:10], "-"),
        )

    def _transform_to_summary(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the summary format.

        Includes per-type document counts, the total content length and every
        key topic as a "•" bullet. ``custom_prompt`` is not used.
        """
        doc_types = ", ".join(
            f"{k}({v}个)" for k, v in content.document_types.items()
        )
        return self._SUMMARY_TEMPLATE.format(
            document_count=content.document_count,
            document_types=doc_types,
            content_length=content.total_content_length,
            content_summary=content.content_summary,
            key_topics=self._bulleted(content.key_topics, "•"),
            combined_content=content.combined_content,
        )

    def get_supported_formats(self) -> List[str]:
        """Return the names of all registered formats, in registration order."""
        return list(self.supported_formats)

    def add_custom_format(self, format_name: str, transform_func):
        """Register (or replace) a format.

        Args:
            format_name: Name under which the format is registered. An
                existing entry with the same name is overwritten.
            transform_func: Callable invoked by ``transform_content`` as
                ``transform_func(integrated_content, custom_prompt)``; its
                return value becomes the transformed text.

        Raises:
            TypeError: If ``transform_func`` is not callable.
        """
        # Fail at registration time instead of later inside transform_content.
        if not callable(transform_func):
            raise TypeError(f"转换函数必须是可调用对象: {transform_func!r}")
        self.supported_formats[format_name] = transform_func
        logger.info("添加自定义格式: %s", format_name)
|