970 lines
94 KiB
JSON
970 lines
94 KiB
JSON
{
|
||
"file_path": "travel-algorithms/travel_algorithms/document_processing/content_transformer.py",
|
||
"file_size": 25896,
|
||
"line_count": 807,
|
||
"functions": [
|
||
{
|
||
"name": "__post_init__",
|
||
"line_start": 33,
|
||
"line_end": 36,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": null,
|
||
"docstring": "初始化后处理",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.transformed_at:\n self.transformed_at = datetime.now()",
|
||
"code_hash": "98adcf64ad6666b500fb2842b7e9ad72"
|
||
},
|
||
{
|
||
"name": "to_dict",
|
||
"line_start": 38,
|
||
"line_end": 49,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "转换为字典格式",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'transformed_text': self.transformed_text,\n 'format_type': self.format_type,\n 'transformation_metadata': self.transformation_metadata,\n 'transformed_at': self.transformed_at.isoformat(),\n 'structured_data': self.structured_data,\n 'quality_score': self.quality_score,\n 'original_summary': self.original_content.content_summary,\n 'original_document_count': self.original_content.document_count\n }",
|
||
"code_hash": "dcd995d469d689d1ef3e0a4541a74850"
|
||
},
|
||
{
|
||
"name": "get_summary",
|
||
"line_start": 51,
|
||
"line_end": 60,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "获取转换摘要",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取转换摘要\"\"\"\n return {\n 'format_type': self.format_type,\n 'transformed_length': len(self.transformed_text),\n 'quality_score': self.quality_score,\n 'source_documents': self.original_content.document_count,\n 'transformation_method': self.transformation_metadata.get('method'),\n 'transformed_at': self.transformed_at.isoformat()\n }",
|
||
"code_hash": "4195b93949396eff3edc6ae0001c235c"
|
||
},
|
||
{
|
||
"name": "__init__",
|
||
"line_start": 80,
|
||
"line_end": 110,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "config",
|
||
"type_hint": "AlgorithmConfig"
|
||
}
|
||
],
|
||
"return_type": null,
|
||
"docstring": "初始化内容转换器\n\nArgs:\n config: 算法配置",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容转换器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置\n self.task_model_config = config.ai_model.get_task_config(\"content_transformation\")\n \n # 格式转换方法映射\n self.format_methods = {\n 'attraction_standard': self._transform_to_attraction_standard,\n 'product_sales': self._transform_to_product_sales,\n 'travel_guide': self._transform_to_travel_guide,\n 'blog_post': self._transform_to_blog_post,\n 'summary': self._transform_to_summary,\n 'structured_data': self._transform_to_structured_data,\n 'marketing_copy': self._transform_to_marketing_copy,\n 'faq': self._transform_to_faq\n }\n \n logger.info(f\"内容转换器初始化完成,支持格式: {list(self.SUPPORTED_FORMATS.keys())}\")",
|
||
"code_hash": "bb54e03402fd9abaec8a920b03e82bb8"
|
||
},
|
||
{
|
||
"name": "_get_system_prompt",
|
||
"line_start": 599,
|
||
"line_end": 617,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "str",
|
||
"docstring": "获取系统提示词",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _get_system_prompt(self, format_type: str) -> str:\n \"\"\"获取系统提示词\"\"\"\n try:\n return self.prompt_manager.get_prompt(f\"content_transformation_{format_type}\", \"system\")\n except:\n # 使用默认系统提示词\n return f\"\"\"你是一个专业的内容编辑和转换专家,擅长将各种文档内容转换为 {self.SUPPORTED_FORMATS.get(format_type, format_type)} 格式。\n\n你的任务是:\n1. 理解和分析原始文档内容\n2. 提取关键信息和要点\n3. 按照目标格式的要求重新组织内容\n4. 确保信息准确、完整、易读\n\n要求:\n- 保持信息的准确性和完整性\n- 语言简洁明了,逻辑清晰\n- 适合目标受众阅读\n- 突出重点信息和特色内容\"\"\"",
|
||
"code_hash": "4d3dbd3d80fe6e57b42cff669f992171"
|
||
},
|
||
{
|
||
"name": "_build_user_prompt",
|
||
"line_start": 619,
|
||
"line_end": 650,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "format_name",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "format_template",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "str",
|
||
"docstring": "构建用户提示词",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _build_user_prompt(\n self,\n content: IntegratedContent,\n format_name: str,\n additional_requirements: Optional[str],\n format_template: str\n ) -> str:\n \"\"\"构建用户提示词\"\"\"\n \n prompt_parts = [\n f\"请将以下文档内容转换为{format_name}格式:\",\n \"\",\n f\"原始文档摘要:\",\n content.content_summary,\n \"\",\n f\"关键主题:{', '.join(content.key_topics)}\",\n \"\",\n f\"文档内容:\",\n content.combined_content,\n \"\",\n f\"转换要求:\",\n format_template\n ]\n \n if additional_requirements:\n prompt_parts.extend([\n \"\",\n f\"额外要求:\",\n additional_requirements\n ])\n \n return \"\\n\".join(prompt_parts)",
|
||
"code_hash": "d4556acf4f03ffbaa313456613c094b5"
|
||
},
|
||
{
|
||
"name": "_extract_structured_data",
|
||
"line_start": 652,
|
||
"line_end": 672,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "data_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Optional[Dict[str, Any]]",
|
||
"docstring": "从转换结果中提取结构化数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_structured_data(self, text: str, data_type: str) -> Optional[Dict[str, Any]]:\n \"\"\"从转换结果中提取结构化数据\"\"\"\n try:\n # 简单的关键信息提取\n structured = {}\n \n if data_type == \"attraction\":\n # 提取景区相关结构化信息\n structured = self._extract_attraction_data(text)\n elif data_type == \"product\":\n # 提取产品相关结构化信息\n structured = self._extract_product_data(text)\n elif data_type in [\"guide\", \"blog\", \"summary\", \"marketing\", \"faq\"]:\n # 提取通用结构化信息\n structured = self._extract_general_data(text)\n \n return structured if structured else None\n \n except Exception as e:\n logger.warning(f\"结构化数据提取失败: {e}\")\n return None",
|
||
"code_hash": "3ee64ced14f9e07fe0c4b2848c6889dd"
|
||
},
|
||
{
|
||
"name": "_extract_attraction_data",
|
||
"line_start": 674,
|
||
"line_end": 706,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "提取景区数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_attraction_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取景区数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'门票[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'票价[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'价格[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['ticket_price'] = float(match.group(1))\n break\n \n # 提取时间信息\n time_patterns = [\n r'开放时间[::]?\\s*([^\\n]+)',\n r'营业时间[::]?\\s*([^\\n]+)',\n r'游览时间[::]?\\s*([^\\n]+)'\n ]\n \n for pattern in time_patterns:\n match = re.search(pattern, text)\n if match:\n data['opening_hours'] = match.group(1).strip()\n break\n \n return data",
|
||
"code_hash": "f37d939e1bd1927abe6e43a5a86d161f"
|
||
},
|
||
{
|
||
"name": "_extract_product_data",
|
||
"line_start": 708,
|
||
"line_end": 727,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "提取产品数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_product_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取产品数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'原价[::]?\\s*(\\d+(?:\\.\\d+)?)',\n r'现价[::]?\\s*(\\d+(?:\\.\\d+)?)',\n r'售价[::]?\\s*(\\d+(?:\\.\\d+)?)'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['price'] = float(match.group(1))\n break\n \n return data",
|
||
"code_hash": "49a87013a6203a797ff1d4bc3e027f82"
|
||
},
|
||
{
|
||
"name": "_extract_general_data",
|
||
"line_start": 729,
|
||
"line_end": 737,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "提取通用数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_general_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取通用数据\"\"\"\n data = {\n 'word_count': len(text.split()),\n 'character_count': len(text),\n 'paragraph_count': len([p for p in text.split('\\n\\n') if p.strip()])\n }\n \n return data",
|
||
"code_hash": "2f772f6f6268c366f68dfd6eaa28d4ad"
|
||
},
|
||
{
|
||
"name": "_calculate_quality_score",
|
||
"line_start": 739,
|
||
"line_end": 790,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "original_content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "transformed_text",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "float",
|
||
"docstring": "计算转换质量评分",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _calculate_quality_score(\n self,\n original_content: IntegratedContent,\n transformed_text: str,\n format_type: str\n ) -> float:\n \"\"\"计算转换质量评分\"\"\"\n try:\n score = 0.0\n \n # 基于长度的评分(30%)\n original_length = len(original_content.combined_content)\n transformed_length = len(transformed_text)\n \n if original_length > 0:\n length_ratio = min(transformed_length / original_length, 1.0)\n score += length_ratio * 0.3\n \n # 基于内容完整性的评分(40%)\n key_topics = original_content.key_topics\n topics_found = sum(1 for topic in key_topics if topic in transformed_text)\n \n if key_topics:\n topic_coverage = topics_found / len(key_topics)\n score += topic_coverage * 0.4\n else:\n score += 0.2 # 如果没有关键主题,给予部分分数\n \n # 基于格式适配性的评分(30%)\n format_indicators = {\n 'attraction_standard': ['景区', '门票', '开放时间', '交通'],\n 'product_sales': ['产品', '价格', '优惠', '购买'],\n 'travel_guide': ['攻略', '行程', '推荐', '贴士'],\n 'blog_post': ['体验', '感受', '推荐', '分享'],\n 'summary': ['总结', '要点', '关键', '概述'],\n 'marketing_copy': ['优惠', '限时', '立即', '独家'],\n 'faq': ['问题', '答案', '如何', '什么']\n }\n \n format_words = format_indicators.get(format_type, [])\n if format_words:\n format_matches = sum(1 for word in format_words if word in transformed_text)\n format_score = min(format_matches / len(format_words), 1.0)\n score += format_score * 0.3\n else:\n score += 0.15 # 默认格式分数\n \n return min(score, 1.0) # 确保分数不超过1.0\n \n except Exception as e:\n logger.warning(f\"质量评分计算失败: {e}\")\n return 0.5 # 返回中等分数",
|
||
"code_hash": "5c0254104230fe26a1b7a472b54feef3"
|
||
},
|
||
{
|
||
"name": "get_supported_formats",
|
||
"line_start": 792,
|
||
"line_end": 794,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, str]",
|
||
"docstring": "获取支持的转换格式",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的转换格式\"\"\"\n return self.SUPPORTED_FORMATS.copy()",
|
||
"code_hash": "da58535e7c5de66716664cc49f2d4286"
|
||
},
|
||
{
|
||
"name": "is_supported_format",
|
||
"line_start": 796,
|
||
"line_end": 798,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "bool",
|
||
"docstring": "检查格式是否支持",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def is_supported_format(self, format_type: str) -> bool:\n \"\"\"检查格式是否支持\"\"\"\n return format_type in self.SUPPORTED_FORMATS",
|
||
"code_hash": "549606aebdf62433bc4275ee211b04bb"
|
||
},
|
||
{
|
||
"name": "get_transformation_stats",
|
||
"line_start": 800,
|
||
"line_end": 808,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "获取转换器统计信息",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def get_transformation_stats(self) -> Dict[str, Any]:\n \"\"\"获取转换器统计信息\"\"\"\n return {\n 'supported_formats': list(self.SUPPORTED_FORMATS.keys()),\n 'task_model_config': self.task_model_config,\n 'ai_model_info': self.ai_service.get_model_info(),\n 'json_repair_enabled': self.json_processor.enable_repair,\n 'available_methods': list(self.format_methods.keys())\n } ",
|
||
"code_hash": "bc0c00ae316130c09cd0b51551919892"
|
||
}
|
||
],
|
||
"classes": [
|
||
{
|
||
"name": "TransformedContent",
|
||
"line_start": 23,
|
||
"line_end": 60,
|
||
"bases": [],
|
||
"methods": [
|
||
{
|
||
"name": "__post_init__",
|
||
"line_start": 33,
|
||
"line_end": 36,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": null,
|
||
"docstring": "初始化后处理",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.transformed_at:\n self.transformed_at = datetime.now()",
|
||
"code_hash": "98adcf64ad6666b500fb2842b7e9ad72"
|
||
},
|
||
{
|
||
"name": "to_dict",
|
||
"line_start": 38,
|
||
"line_end": 49,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "转换为字典格式",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'transformed_text': self.transformed_text,\n 'format_type': self.format_type,\n 'transformation_metadata': self.transformation_metadata,\n 'transformed_at': self.transformed_at.isoformat(),\n 'structured_data': self.structured_data,\n 'quality_score': self.quality_score,\n 'original_summary': self.original_content.content_summary,\n 'original_document_count': self.original_content.document_count\n }",
|
||
"code_hash": "dcd995d469d689d1ef3e0a4541a74850"
|
||
},
|
||
{
|
||
"name": "get_summary",
|
||
"line_start": 51,
|
||
"line_end": 60,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "获取转换摘要",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取转换摘要\"\"\"\n return {\n 'format_type': self.format_type,\n 'transformed_length': len(self.transformed_text),\n 'quality_score': self.quality_score,\n 'source_documents': self.original_content.document_count,\n 'transformation_method': self.transformation_metadata.get('method'),\n 'transformed_at': self.transformed_at.isoformat()\n }",
|
||
"code_hash": "4195b93949396eff3edc6ae0001c235c"
|
||
}
|
||
],
|
||
"docstring": "转换后的内容",
|
||
"decorators": [
|
||
"dataclass"
|
||
],
|
||
"code": "class TransformedContent:\n \"\"\"转换后的内容\"\"\"\n original_content: IntegratedContent\n transformed_text: str\n format_type: str\n transformation_metadata: Dict[str, Any]\n transformed_at: datetime\n structured_data: Optional[Dict[str, Any]] = None\n quality_score: Optional[float] = None\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.transformed_at:\n self.transformed_at = datetime.now()\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'transformed_text': self.transformed_text,\n 'format_type': self.format_type,\n 'transformation_metadata': self.transformation_metadata,\n 'transformed_at': self.transformed_at.isoformat(),\n 'structured_data': self.structured_data,\n 'quality_score': self.quality_score,\n 'original_summary': self.original_content.content_summary,\n 'original_document_count': self.original_content.document_count\n }\n \n def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取转换摘要\"\"\"\n return {\n 'format_type': self.format_type,\n 'transformed_length': len(self.transformed_text),\n 'quality_score': self.quality_score,\n 'source_documents': self.original_content.document_count,\n 'transformation_method': self.transformation_metadata.get('method'),\n 'transformed_at': self.transformed_at.isoformat()\n }",
|
||
"code_hash": "259d1dd57fbef291fb58e70ebd072d58"
|
||
},
|
||
{
|
||
"name": "ContentTransformer",
|
||
"line_start": 63,
|
||
"line_end": 808,
|
||
"bases": [],
|
||
"methods": [
|
||
{
|
||
"name": "__init__",
|
||
"line_start": 80,
|
||
"line_end": 110,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "config",
|
||
"type_hint": "AlgorithmConfig"
|
||
}
|
||
],
|
||
"return_type": null,
|
||
"docstring": "初始化内容转换器\n\nArgs:\n config: 算法配置",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容转换器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置\n self.task_model_config = config.ai_model.get_task_config(\"content_transformation\")\n \n # 格式转换方法映射\n self.format_methods = {\n 'attraction_standard': self._transform_to_attraction_standard,\n 'product_sales': self._transform_to_product_sales,\n 'travel_guide': self._transform_to_travel_guide,\n 'blog_post': self._transform_to_blog_post,\n 'summary': self._transform_to_summary,\n 'structured_data': self._transform_to_structured_data,\n 'marketing_copy': self._transform_to_marketing_copy,\n 'faq': self._transform_to_faq\n }\n \n logger.info(f\"内容转换器初始化完成,支持格式: {list(self.SUPPORTED_FORMATS.keys())}\")",
|
||
"code_hash": "bb54e03402fd9abaec8a920b03e82bb8"
|
||
},
|
||
{
|
||
"name": "transform_content",
|
||
"line_start": 112,
|
||
"line_end": 190,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "integrated_content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "TransformedContent",
|
||
"docstring": "转换内容\n\nArgs:\n integrated_content: 整合后的内容\n format_type: 目标格式类型\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \nReturns:\n TransformedContent: 转换后的内容\n \nRaises:\n DocumentProcessingError: 转换失败时抛出",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def transform_content(\n self,\n integrated_content: IntegratedContent,\n format_type: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"\n 转换内容\n \n Args:\n integrated_content: 整合后的内容\n format_type: 目标格式类型\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n TransformedContent: 转换后的内容\n \n Raises:\n DocumentProcessingError: 转换失败时抛出\n \"\"\"\n if format_type not in self.SUPPORTED_FORMATS:\n raise DocumentProcessingError(f\"不支持的格式类型: {format_type}\")\n \n if not integrated_content.combined_content.strip():\n raise DocumentProcessingError(\"没有可转换的内容\")\n \n try:\n logger.info(f\"开始转换内容为 {format_type} 格式\")\n \n # 获取转换方法\n transform_method = self.format_methods[format_type]\n \n # 执行转换\n transformed_text, structured_data, metadata = await transform_method(\n integrated_content,\n custom_prompt,\n additional_requirements\n )\n \n # 计算质量评分\n quality_score = self._calculate_quality_score(\n integrated_content,\n transformed_text,\n format_type\n )\n \n # 收集转换元数据\n transformation_metadata = {\n 'method': transform_method.__name__,\n 'format_type': format_type,\n 'custom_prompt_used': bool(custom_prompt),\n 'additional_requirements': additional_requirements,\n 'source_document_count': integrated_content.document_count,\n 'source_content_length': len(integrated_content.combined_content),\n 'target_content_length': len(transformed_text),\n 'model_config': self.task_model_config,\n 'transformation_timestamp': datetime.now().isoformat(),\n **metadata\n }\n \n transformed_content = TransformedContent(\n original_content=integrated_content,\n transformed_text=transformed_text,\n format_type=format_type,\n transformation_metadata=transformation_metadata,\n transformed_at=datetime.now(),\n structured_data=structured_data,\n quality_score=quality_score\n )\n \n logger.info(f\"内容转换完成,输出长度: {len(transformed_text)}\")\n return transformed_content\n \n except Exception as e:\n error_msg = f\"内容转换失败 ({format_type}): {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
|
||
"code_hash": "0b8be16dfb781017c7d2853b9515fe95"
|
||
},
|
||
{
|
||
"name": "_transform_to_attraction_standard",
|
||
"line_start": 192,
|
||
"line_end": 235,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为景区标准信息格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_attraction_standard(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为景区标准信息格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"attraction_standard\")\n user_prompt = self._build_user_prompt(\n content,\n \"景区标准信息\",\n additional_requirements,\n \"\"\"\n 请按照以下结构整理景区信息:\n 1. 景区基本信息(名称、位置、类型、等级)\n 2. 门票信息(价格、优惠政策、购票方式)\n 3. 开放时间(营业时间、季节性变化)\n 4. 交通指南(公共交通、自驾路线、停车信息)\n 5. 景区特色(主要景点、特色活动、文化背景)\n 6. 服务设施(餐饮、住宿、购物、卫生间等)\n 7. 游览建议(推荐路线、游览时长、最佳时间)\n 8. 注意事项(安全提醒、禁止事项、特殊要求)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"景区标准信息转换\",\n **self.task_model_config\n )\n \n # 尝试提取结构化数据\n structured_data = self._extract_structured_data(result, \"attraction\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_attraction_standard'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "93c7f5d14b71dbca5dd8594bb0f67acb"
|
||
},
|
||
{
|
||
"name": "_transform_to_product_sales",
|
||
"line_start": 237,
|
||
"line_end": 278,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为产品销售介绍格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_product_sales(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为产品销售介绍格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"product_sales\")\n user_prompt = self._build_user_prompt(\n content,\n \"产品销售介绍\",\n additional_requirements,\n \"\"\"\n 请按照销售文案的要求整理产品信息:\n 1. 产品亮点(核心卖点、独特优势)\n 2. 产品详情(套餐内容、服务项目、规格说明)\n 3. 价格体系(原价、优惠价、性价比分析)\n 4. 适用人群(目标客户、使用场景)\n 5. 购买指南(预订方式、使用方法、有效期)\n 6. 客户保障(退改政策、服务承诺、联系方式)\n 7. 用户评价(客户反馈、推荐理由)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"产品销售介绍转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"product\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_product_sales'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "59238ad356a40708344362f36216f751"
|
||
},
|
||
{
|
||
"name": "_transform_to_travel_guide",
|
||
"line_start": 280,
|
||
"line_end": 322,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为旅游攻略格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_travel_guide(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为旅游攻略格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"travel_guide\")\n user_prompt = self._build_user_prompt(\n content,\n \"旅游攻略\",\n additional_requirements,\n \"\"\"\n 请整理成实用的旅游攻略:\n 1. 目的地概览(地理位置、气候特点、最佳旅游时间)\n 2. 行程规划(推荐天数、必游景点、路线安排)\n 3. 交通攻略(到达方式、当地交通、费用预算)\n 4. 住宿推荐(不同档次选择、位置建议、预订提醒)\n 5. 美食指南(特色菜品、推荐餐厅、小吃街区)\n 6. 购物指南(特产推荐、购物地点、注意事项)\n 7. 实用贴士(天气准备、必备物品、省钱技巧)\n 8. 安全提醒(注意事项、紧急联系方式)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"旅游攻略转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"guide\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_travel_guide'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "030bc5230ab8b397e096e69d1a6255f3"
|
||
},
|
||
{
|
||
"name": "_transform_to_blog_post",
|
||
"line_start": 324,
|
||
"line_end": 367,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为博客文章格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_blog_post(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为博客文章格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"blog_post\")\n user_prompt = self._build_user_prompt(\n content,\n \"博客文章\",\n additional_requirements,\n \"\"\"\n 请整理成吸引人的博客文章:\n 1. 引人入胜的标题\n 2. 开头引言(吸引读者兴趣)\n 3. 主体内容(详细介绍、个人体验、实用信息)\n 4. 精彩亮点(重点推荐、独特发现)\n 5. 实用建议(贴心提醒、经验分享)\n 6. 总结感悟(个人感受、推荐理由)\n 7. 互动结尾(鼓励评论、分享邀请)\n \n 要求:语言生动有趣,有个人色彩,适合社交媒体分享。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"博客文章转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"blog\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_blog_post'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "cb2a8be10972442c5c6ed8c9642625a0"
|
||
},
|
||
{
|
||
"name": "_transform_to_summary",
|
||
"line_start": 369,
|
||
"line_end": 409,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为内容摘要格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_summary(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为内容摘要格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"summary\")\n user_prompt = self._build_user_prompt(\n content,\n \"内容摘要\",\n additional_requirements,\n \"\"\"\n 请提供简洁的内容摘要:\n 1. 核心信息总结(主要内容、关键信息)\n 2. 重要数据提取(价格、时间、数量等)\n 3. 关键特色亮点(独特卖点、优势特征)\n 4. 实用信息汇总(联系方式、地址、网址等)\n \n 要求:简洁明了,突出重点,便于快速了解。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"内容摘要转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"summary\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_summary'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "e2815ea90e067be707101daf9cb3fd26"
|
||
},
|
||
{
|
||
"name": "_transform_to_structured_data",
|
||
"line_start": 411,
|
||
"line_end": 494,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为结构化数据格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_structured_data(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为结构化数据格式\"\"\"\n \n system_prompt = custom_prompt or \"\"\"你是一个数据结构化专家,负责将非结构化的文档内容转换为标准的JSON格式数据。\n\n请按照以下JSON结构提取和整理信息:\n{\n \"basic_info\": {\n \"name\": \"名称\",\n \"type\": \"类型\",\n \"location\": \"位置\",\n \"description\": \"简介\"\n },\n \"pricing\": {\n \"ticket_price\": \"门票价格\",\n \"package_price\": \"套餐价格\",\n \"discount_info\": \"优惠信息\"\n },\n \"schedule\": {\n \"opening_hours\": \"开放时间\",\n \"best_visit_time\": \"最佳游览时间\",\n \"duration\": \"建议游览时长\"\n },\n \"transportation\": {\n \"public_transport\": \"公共交通\",\n \"self_driving\": \"自驾信息\",\n \"parking\": \"停车信息\"\n },\n \"services\": {\n \"facilities\": [\"设施列表\"],\n \"dining\": [\"餐饮选择\"],\n \"accommodation\": [\"住宿选择\"]\n },\n \"highlights\": [\"亮点特色\"],\n \"tips\": [\"游览贴士\"],\n \"contact\": {\n \"phone\": \"联系电话\",\n \"website\": \"官方网站\",\n \"address\": \"详细地址\"\n }\n}\n\n请严格按照JSON格式输出,没有信息的字段可以设为null或空数组。\"\"\"\n \n user_prompt = f\"\"\"请将以下文档内容转换为结构化的JSON数据:\n\n{content.combined_content}\n\n{f'额外要求:{additional_requirements}' if additional_requirements else ''}\n\n请提取所有可用信息并按照指定的JSON结构格式化输出。\"\"\"\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"结构化数据转换\",\n **self.task_model_config\n )\n \n # 解析JSON结构化数据\n try:\n structured_data = self.json_processor.parse_llm_output(\n raw_output=result,\n expected_fields=[\"basic_info\", \"pricing\", \"schedule\", \"transportation\", \"services\"],\n required_fields=[\"basic_info\"]\n )\n except Exception as e:\n logger.warning(f\"结构化数据解析失败: {e}\")\n structured_data = None\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_structured_data',\n 'json_parsing_success': structured_data is not None\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "18d327ad60f5eece83291ee102c14426"
|
||
},
|
||
{
|
||
"name": "_transform_to_marketing_copy",
|
||
"line_start": 496,
|
||
"line_end": 539,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为营销文案格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_marketing_copy(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为营销文案格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"marketing_copy\")\n user_prompt = self._build_user_prompt(\n content,\n \"营销文案\",\n additional_requirements,\n \"\"\"\n 请创作有吸引力的营销文案:\n 1. 吸引眼球的标题(突出卖点、制造悬念)\n 2. 开头钩子(抓住痛点、激发兴趣)\n 3. 价值主张(核心利益、独特优势)\n 4. 社会证明(客户好评、权威认证)\n 5. 紧迫感营造(限时优惠、数量有限)\n 6. 行动召唤(明确指引、简化流程)\n 7. 风险消除(保障承诺、退款政策)\n \n 要求:语言有感染力,突出情感共鸣,促进转化行动。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"营销文案转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"marketing\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_marketing_copy'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "3f01b70214e0aa9e4a826ab8d36e6c0a"
|
||
},
|
||
{
|
||
"name": "_transform_to_faq",
|
||
"line_start": 541,
|
||
"line_end": 597,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "custom_prompt",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
}
|
||
],
|
||
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
|
||
"docstring": "转换为常见问题格式",
|
||
"is_async": true,
|
||
"decorators": [],
|
||
"code": " async def _transform_to_faq(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为常见问题格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"faq\")\n user_prompt = self._build_user_prompt(\n content,\n \"常见问题FAQ\",\n additional_requirements,\n \"\"\"\n 请整理成常见问题FAQ格式:\n \n 基本信息类:\n - 景区/产品介绍相关问题\n - 位置交通相关问题\n \n 预订购买类:\n - 门票价格和购买问题\n - 预订流程和注意事项\n \n 游览体验类:\n - 开放时间和游览时长\n - 设施服务和特色项目\n \n 实用贴士类:\n - 注意事项和安全提醒\n - 最佳游览时间和建议\n \n 售后服务类:\n - 退改政策和联系方式\n - 投诉建议和客服支持\n \n 每个问题要简洁明了,答案要准确详细。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"FAQ转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"faq\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_faq'\n }\n \n return result.strip(), structured_data, metadata",
|
||
"code_hash": "8730c4f107dde26dca26057eedb41f1f"
|
||
},
|
||
{
|
||
"name": "_get_system_prompt",
|
||
"line_start": 599,
|
||
"line_end": 617,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "str",
|
||
"docstring": "获取系统提示词",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _get_system_prompt(self, format_type: str) -> str:\n \"\"\"获取系统提示词\"\"\"\n try:\n return self.prompt_manager.get_prompt(f\"content_transformation_{format_type}\", \"system\")\n except:\n # 使用默认系统提示词\n return f\"\"\"你是一个专业的内容编辑和转换专家,擅长将各种文档内容转换为 {self.SUPPORTED_FORMATS.get(format_type, format_type)} 格式。\n\n你的任务是:\n1. 理解和分析原始文档内容\n2. 提取关键信息和要点\n3. 按照目标格式的要求重新组织内容\n4. 确保信息准确、完整、易读\n\n要求:\n- 保持信息的准确性和完整性\n- 语言简洁明了,逻辑清晰\n- 适合目标受众阅读\n- 突出重点信息和特色内容\"\"\"",
|
||
"code_hash": "4d3dbd3d80fe6e57b42cff669f992171"
|
||
},
|
||
{
|
||
"name": "_build_user_prompt",
|
||
"line_start": 619,
|
||
"line_end": 650,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "format_name",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "additional_requirements",
|
||
"type_hint": "Optional[str]"
|
||
},
|
||
{
|
||
"name": "format_template",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "str",
|
||
"docstring": "构建用户提示词",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _build_user_prompt(\n self,\n content: IntegratedContent,\n format_name: str,\n additional_requirements: Optional[str],\n format_template: str\n ) -> str:\n \"\"\"构建用户提示词\"\"\"\n \n prompt_parts = [\n f\"请将以下文档内容转换为{format_name}格式:\",\n \"\",\n f\"原始文档摘要:\",\n content.content_summary,\n \"\",\n f\"关键主题:{', '.join(content.key_topics)}\",\n \"\",\n f\"文档内容:\",\n content.combined_content,\n \"\",\n f\"转换要求:\",\n format_template\n ]\n \n if additional_requirements:\n prompt_parts.extend([\n \"\",\n f\"额外要求:\",\n additional_requirements\n ])\n \n return \"\\n\".join(prompt_parts)",
|
||
"code_hash": "d4556acf4f03ffbaa313456613c094b5"
|
||
},
|
||
{
|
||
"name": "_extract_structured_data",
|
||
"line_start": 652,
|
||
"line_end": 672,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "data_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Optional[Dict[str, Any]]",
|
||
"docstring": "从转换结果中提取结构化数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_structured_data(self, text: str, data_type: str) -> Optional[Dict[str, Any]]:\n \"\"\"从转换结果中提取结构化数据\"\"\"\n try:\n # 简单的关键信息提取\n structured = {}\n \n if data_type == \"attraction\":\n # 提取景区相关结构化信息\n structured = self._extract_attraction_data(text)\n elif data_type == \"product\":\n # 提取产品相关结构化信息\n structured = self._extract_product_data(text)\n elif data_type in [\"guide\", \"blog\", \"summary\", \"marketing\", \"faq\"]:\n # 提取通用结构化信息\n structured = self._extract_general_data(text)\n \n return structured if structured else None\n \n except Exception as e:\n logger.warning(f\"结构化数据提取失败: {e}\")\n return None",
|
||
"code_hash": "3ee64ced14f9e07fe0c4b2848c6889dd"
|
||
},
|
||
{
|
||
"name": "_extract_attraction_data",
|
||
"line_start": 674,
|
||
"line_end": 706,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "提取景区数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_attraction_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取景区数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'门票[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'票价[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'价格[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['ticket_price'] = float(match.group(1))\n break\n \n # 提取时间信息\n time_patterns = [\n r'开放时间[::]?\\s*([^\\n]+)',\n r'营业时间[::]?\\s*([^\\n]+)',\n r'游览时间[::]?\\s*([^\\n]+)'\n ]\n \n for pattern in time_patterns:\n match = re.search(pattern, text)\n if match:\n data['opening_hours'] = match.group(1).strip()\n break\n \n return data",
|
||
"code_hash": "f37d939e1bd1927abe6e43a5a86d161f"
|
||
},
|
||
{
|
||
"name": "_extract_product_data",
|
||
"line_start": 708,
|
||
"line_end": 727,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "提取产品数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_product_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取产品数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'原价[::]?\\s*(\\d+(?:\\.\\d+)?)',\n r'现价[::]?\\s*(\\d+(?:\\.\\d+)?)',\n r'售价[::]?\\s*(\\d+(?:\\.\\d+)?)'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['price'] = float(match.group(1))\n break\n \n return data",
|
||
"code_hash": "49a87013a6203a797ff1d4bc3e027f82"
|
||
},
|
||
{
|
||
"name": "_extract_general_data",
|
||
"line_start": 729,
|
||
"line_end": 737,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "text",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "提取通用数据",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _extract_general_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取通用数据\"\"\"\n data = {\n 'word_count': len(text.split()),\n 'character_count': len(text),\n 'paragraph_count': len([p for p in text.split('\\n\\n') if p.strip()])\n }\n \n return data",
|
||
"code_hash": "2f772f6f6268c366f68dfd6eaa28d4ad"
|
||
},
|
||
{
|
||
"name": "_calculate_quality_score",
|
||
"line_start": 739,
|
||
"line_end": 790,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "original_content",
|
||
"type_hint": "IntegratedContent"
|
||
},
|
||
{
|
||
"name": "transformed_text",
|
||
"type_hint": "str"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "float",
|
||
"docstring": "计算转换质量评分",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def _calculate_quality_score(\n self,\n original_content: IntegratedContent,\n transformed_text: str,\n format_type: str\n ) -> float:\n \"\"\"计算转换质量评分\"\"\"\n try:\n score = 0.0\n \n # 基于长度的评分(30%)\n original_length = len(original_content.combined_content)\n transformed_length = len(transformed_text)\n \n if original_length > 0:\n length_ratio = min(transformed_length / original_length, 1.0)\n score += length_ratio * 0.3\n \n # 基于内容完整性的评分(40%)\n key_topics = original_content.key_topics\n topics_found = sum(1 for topic in key_topics if topic in transformed_text)\n \n if key_topics:\n topic_coverage = topics_found / len(key_topics)\n score += topic_coverage * 0.4\n else:\n score += 0.2 # 如果没有关键主题,给予部分分数\n \n # 基于格式适配性的评分(30%)\n format_indicators = {\n 'attraction_standard': ['景区', '门票', '开放时间', '交通'],\n 'product_sales': ['产品', '价格', '优惠', '购买'],\n 'travel_guide': ['攻略', '行程', '推荐', '贴士'],\n 'blog_post': ['体验', '感受', '推荐', '分享'],\n 'summary': ['总结', '要点', '关键', '概述'],\n 'marketing_copy': ['优惠', '限时', '立即', '独家'],\n 'faq': ['问题', '答案', '如何', '什么']\n }\n \n format_words = format_indicators.get(format_type, [])\n if format_words:\n format_matches = sum(1 for word in format_words if word in transformed_text)\n format_score = min(format_matches / len(format_words), 1.0)\n score += format_score * 0.3\n else:\n score += 0.15 # 默认格式分数\n \n return min(score, 1.0) # 确保分数不超过1.0\n \n except Exception as e:\n logger.warning(f\"质量评分计算失败: {e}\")\n return 0.5 # 返回中等分数",
|
||
"code_hash": "5c0254104230fe26a1b7a472b54feef3"
|
||
},
|
||
{
|
||
"name": "get_supported_formats",
|
||
"line_start": 792,
|
||
"line_end": 794,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, str]",
|
||
"docstring": "获取支持的转换格式",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的转换格式\"\"\"\n return self.SUPPORTED_FORMATS.copy()",
|
||
"code_hash": "da58535e7c5de66716664cc49f2d4286"
|
||
},
|
||
{
|
||
"name": "is_supported_format",
|
||
"line_start": 796,
|
||
"line_end": 798,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
},
|
||
{
|
||
"name": "format_type",
|
||
"type_hint": "str"
|
||
}
|
||
],
|
||
"return_type": "bool",
|
||
"docstring": "检查格式是否支持",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def is_supported_format(self, format_type: str) -> bool:\n \"\"\"检查格式是否支持\"\"\"\n return format_type in self.SUPPORTED_FORMATS",
|
||
"code_hash": "549606aebdf62433bc4275ee211b04bb"
|
||
},
|
||
{
|
||
"name": "get_transformation_stats",
|
||
"line_start": 800,
|
||
"line_end": 808,
|
||
"args": [
|
||
{
|
||
"name": "self"
|
||
}
|
||
],
|
||
"return_type": "Dict[str, Any]",
|
||
"docstring": "获取转换器统计信息",
|
||
"is_async": false,
|
||
"decorators": [],
|
||
"code": " def get_transformation_stats(self) -> Dict[str, Any]:\n \"\"\"获取转换器统计信息\"\"\"\n return {\n 'supported_formats': list(self.SUPPORTED_FORMATS.keys()),\n 'task_model_config': self.task_model_config,\n 'ai_model_info': self.ai_service.get_model_info(),\n 'json_repair_enabled': self.json_processor.enable_repair,\n 'available_methods': list(self.format_methods.keys())\n } ",
|
||
"code_hash": "bc0c00ae316130c09cd0b51551919892"
|
||
}
|
||
],
|
||
"docstring": "内容转换器 - 重构版本\n使用AI模型将整合的文档内容转换为指定的标准化格式",
|
||
"decorators": [],
|
||
"code": "class ContentTransformer:\n \"\"\"\n 内容转换器 - 重构版本\n 使用AI模型将整合的文档内容转换为指定的标准化格式\n \"\"\"\n \n SUPPORTED_FORMATS = {\n 'attraction_standard': '景区标准信息格式',\n 'product_sales': '产品销售介绍格式',\n 'travel_guide': '旅游攻略格式',\n 'blog_post': '博客文章格式',\n 'summary': '内容摘要格式',\n 'structured_data': '结构化数据格式',\n 'marketing_copy': '营销文案格式',\n 'faq': '常见问题格式'\n }\n \n def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容转换器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置\n self.task_model_config = config.ai_model.get_task_config(\"content_transformation\")\n \n # 格式转换方法映射\n self.format_methods = {\n 'attraction_standard': self._transform_to_attraction_standard,\n 'product_sales': self._transform_to_product_sales,\n 'travel_guide': self._transform_to_travel_guide,\n 'blog_post': self._transform_to_blog_post,\n 'summary': self._transform_to_summary,\n 'structured_data': self._transform_to_structured_data,\n 'marketing_copy': self._transform_to_marketing_copy,\n 'faq': self._transform_to_faq\n }\n \n logger.info(f\"内容转换器初始化完成,支持格式: {list(self.SUPPORTED_FORMATS.keys())}\")\n \n async def transform_content(\n self,\n integrated_content: IntegratedContent,\n format_type: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"\n 转换内容\n \n Args:\n integrated_content: 整合后的内容\n format_type: 目标格式类型\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n TransformedContent: 转换后的内容\n \n Raises:\n DocumentProcessingError: 转换失败时抛出\n \"\"\"\n if format_type not in self.SUPPORTED_FORMATS:\n raise DocumentProcessingError(f\"不支持的格式类型: {format_type}\")\n \n if not integrated_content.combined_content.strip():\n raise DocumentProcessingError(\"没有可转换的内容\")\n \n try:\n logger.info(f\"开始转换内容为 {format_type} 格式\")\n \n # 获取转换方法\n transform_method = self.format_methods[format_type]\n \n # 执行转换\n transformed_text, structured_data, metadata = await transform_method(\n integrated_content,\n custom_prompt,\n additional_requirements\n )\n \n # 计算质量评分\n quality_score = self._calculate_quality_score(\n integrated_content,\n transformed_text,\n format_type\n )\n \n # 收集转换元数据\n transformation_metadata = {\n 'method': transform_method.__name__,\n 'format_type': format_type,\n 'custom_prompt_used': bool(custom_prompt),\n 'additional_requirements': additional_requirements,\n 'source_document_count': integrated_content.document_count,\n 'source_content_length': len(integrated_content.combined_content),\n 'target_content_length': len(transformed_text),\n 'model_config': self.task_model_config,\n 'transformation_timestamp': datetime.now().isoformat(),\n **metadata\n }\n \n transformed_content = TransformedContent(\n original_content=integrated_content,\n transformed_text=transformed_text,\n format_type=format_type,\n transformation_metadata=transformation_metadata,\n transformed_at=datetime.now(),\n structured_data=structured_data,\n quality_score=quality_score\n )\n \n logger.info(f\"内容转换完成,输出长度: {len(transformed_text)}\")\n return transformed_content\n \n except Exception as e:\n error_msg = f\"内容转换失败 ({format_type}): {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)\n \n async def _transform_to_attraction_standard(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为景区标准信息格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"attraction_standard\")\n user_prompt = self._build_user_prompt(\n content,\n \"景区标准信息\",\n additional_requirements,\n \"\"\"\n 请按照以下结构整理景区信息:\n 1. 景区基本信息(名称、位置、类型、等级)\n 2. 门票信息(价格、优惠政策、购票方式)\n 3. 开放时间(营业时间、季节性变化)\n 4. 交通指南(公共交通、自驾路线、停车信息)\n 5. 景区特色(主要景点、特色活动、文化背景)\n 6. 服务设施(餐饮、住宿、购物、卫生间等)\n 7. 游览建议(推荐路线、游览时长、最佳时间)\n 8. 注意事项(安全提醒、禁止事项、特殊要求)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"景区标准信息转换\",\n **self.task_model_config\n )\n \n # 尝试提取结构化数据\n structured_data = self._extract_structured_data(result, \"attraction\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_attraction_standard'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_product_sales(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为产品销售介绍格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"product_sales\")\n user_prompt = self._build_user_prompt(\n content,\n \"产品销售介绍\",\n additional_requirements,\n \"\"\"\n 请按照销售文案的要求整理产品信息:\n 1. 产品亮点(核心卖点、独特优势)\n 2. 产品详情(套餐内容、服务项目、规格说明)\n 3. 价格体系(原价、优惠价、性价比分析)\n 4. 适用人群(目标客户、使用场景)\n 5. 购买指南(预订方式、使用方法、有效期)\n 6. 客户保障(退改政策、服务承诺、联系方式)\n 7. 用户评价(客户反馈、推荐理由)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"产品销售介绍转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"product\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_product_sales'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_travel_guide(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为旅游攻略格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"travel_guide\")\n user_prompt = self._build_user_prompt(\n content,\n \"旅游攻略\",\n additional_requirements,\n \"\"\"\n 请整理成实用的旅游攻略:\n 1. 目的地概览(地理位置、气候特点、最佳旅游时间)\n 2. 行程规划(推荐天数、必游景点、路线安排)\n 3. 交通攻略(到达方式、当地交通、费用预算)\n 4. 住宿推荐(不同档次选择、位置建议、预订提醒)\n 5. 美食指南(特色菜品、推荐餐厅、小吃街区)\n 6. 购物指南(特产推荐、购物地点、注意事项)\n 7. 实用贴士(天气准备、必备物品、省钱技巧)\n 8. 安全提醒(注意事项、紧急联系方式)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"旅游攻略转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"guide\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_travel_guide'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_blog_post(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为博客文章格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"blog_post\")\n user_prompt = self._build_user_prompt(\n content,\n \"博客文章\",\n additional_requirements,\n \"\"\"\n 请整理成吸引人的博客文章:\n 1. 引人入胜的标题\n 2. 开头引言(吸引读者兴趣)\n 3. 主体内容(详细介绍、个人体验、实用信息)\n 4. 精彩亮点(重点推荐、独特发现)\n 5. 实用建议(贴心提醒、经验分享)\n 6. 总结感悟(个人感受、推荐理由)\n 7. 互动结尾(鼓励评论、分享邀请)\n \n 要求:语言生动有趣,有个人色彩,适合社交媒体分享。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"博客文章转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"blog\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_blog_post'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_summary(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为内容摘要格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"summary\")\n user_prompt = self._build_user_prompt(\n content,\n \"内容摘要\",\n additional_requirements,\n \"\"\"\n 请提供简洁的内容摘要:\n 1. 核心信息总结(主要内容、关键信息)\n 2. 重要数据提取(价格、时间、数量等)\n 3. 关键特色亮点(独特卖点、优势特征)\n 4. 实用信息汇总(联系方式、地址、网址等)\n \n 要求:简洁明了,突出重点,便于快速了解。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"内容摘要转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"summary\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_summary'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_structured_data(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为结构化数据格式\"\"\"\n \n system_prompt = custom_prompt or \"\"\"你是一个数据结构化专家,负责将非结构化的文档内容转换为标准的JSON格式数据。\n\n请按照以下JSON结构提取和整理信息:\n{\n \"basic_info\": {\n \"name\": \"名称\",\n \"type\": \"类型\",\n \"location\": \"位置\",\n \"description\": \"简介\"\n },\n \"pricing\": {\n \"ticket_price\": \"门票价格\",\n \"package_price\": \"套餐价格\",\n \"discount_info\": \"优惠信息\"\n },\n \"schedule\": {\n \"opening_hours\": \"开放时间\",\n \"best_visit_time\": \"最佳游览时间\",\n \"duration\": \"建议游览时长\"\n },\n \"transportation\": {\n \"public_transport\": \"公共交通\",\n \"self_driving\": \"自驾信息\",\n \"parking\": \"停车信息\"\n },\n \"services\": {\n \"facilities\": [\"设施列表\"],\n \"dining\": [\"餐饮选择\"],\n \"accommodation\": [\"住宿选择\"]\n },\n \"highlights\": [\"亮点特色\"],\n \"tips\": [\"游览贴士\"],\n \"contact\": {\n \"phone\": \"联系电话\",\n \"website\": \"官方网站\",\n \"address\": \"详细地址\"\n }\n}\n\n请严格按照JSON格式输出,没有信息的字段可以设为null或空数组。\"\"\"\n \n user_prompt = f\"\"\"请将以下文档内容转换为结构化的JSON数据:\n\n{content.combined_content}\n\n{f'额外要求:{additional_requirements}' if additional_requirements else ''}\n\n请提取所有可用信息并按照指定的JSON结构格式化输出。\"\"\"\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"结构化数据转换\",\n **self.task_model_config\n )\n \n # 解析JSON结构化数据\n try:\n structured_data = self.json_processor.parse_llm_output(\n raw_output=result,\n expected_fields=[\"basic_info\", \"pricing\", \"schedule\", \"transportation\", \"services\"],\n required_fields=[\"basic_info\"]\n )\n except Exception as e:\n logger.warning(f\"结构化数据解析失败: {e}\")\n structured_data = None\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_structured_data',\n 'json_parsing_success': structured_data is not None\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_marketing_copy(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为营销文案格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"marketing_copy\")\n user_prompt = self._build_user_prompt(\n content,\n \"营销文案\",\n additional_requirements,\n \"\"\"\n 请创作有吸引力的营销文案:\n 1. 吸引眼球的标题(突出卖点、制造悬念)\n 2. 开头钩子(抓住痛点、激发兴趣)\n 3. 价值主张(核心利益、独特优势)\n 4. 社会证明(客户好评、权威认证)\n 5. 紧迫感营造(限时优惠、数量有限)\n 6. 行动召唤(明确指引、简化流程)\n 7. 风险消除(保障承诺、退款政策)\n \n 要求:语言有感染力,突出情感共鸣,促进转化行动。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"营销文案转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"marketing\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_marketing_copy'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_faq(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为常见问题格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"faq\")\n user_prompt = self._build_user_prompt(\n content,\n \"常见问题FAQ\",\n additional_requirements,\n \"\"\"\n 请整理成常见问题FAQ格式:\n \n 基本信息类:\n - 景区/产品介绍相关问题\n - 位置交通相关问题\n \n 预订购买类:\n - 门票价格和购买问题\n - 预订流程和注意事项\n \n 游览体验类:\n - 开放时间和游览时长\n - 设施服务和特色项目\n \n 实用贴士类:\n - 注意事项和安全提醒\n - 最佳游览时间和建议\n \n 售后服务类:\n - 退改政策和联系方式\n - 投诉建议和客服支持\n \n 每个问题要简洁明了,答案要准确详细。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"FAQ转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"faq\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_faq'\n }\n \n return result.strip(), structured_data, metadata\n \n def _get_system_prompt(self, format_type: str) -> str:\n \"\"\"获取系统提示词\"\"\"\n try:\n return self.prompt_manager.get_prompt(f\"content_transformation_{format_type}\", \"system\")\n except:\n # 使用默认系统提示词\n return f\"\"\"你是一个专业的内容编辑和转换专家,擅长将各种文档内容转换为 {self.SUPPORTED_FORMATS.get(format_type, format_type)} 格式。\n\n你的任务是:\n1. 理解和分析原始文档内容\n2. 提取关键信息和要点\n3. 按照目标格式的要求重新组织内容\n4. 确保信息准确、完整、易读\n\n要求:\n- 保持信息的准确性和完整性\n- 语言简洁明了,逻辑清晰\n- 适合目标受众阅读\n- 突出重点信息和特色内容\"\"\"\n \n def _build_user_prompt(\n self,\n content: IntegratedContent,\n format_name: str,\n additional_requirements: Optional[str],\n format_template: str\n ) -> str:\n \"\"\"构建用户提示词\"\"\"\n \n prompt_parts = [\n f\"请将以下文档内容转换为{format_name}格式:\",\n \"\",\n f\"原始文档摘要:\",\n content.content_summary,\n \"\",\n f\"关键主题:{', '.join(content.key_topics)}\",\n \"\",\n f\"文档内容:\",\n content.combined_content,\n \"\",\n f\"转换要求:\",\n format_template\n ]\n \n if additional_requirements:\n prompt_parts.extend([\n \"\",\n f\"额外要求:\",\n additional_requirements\n ])\n \n return \"\\n\".join(prompt_parts)\n \n def _extract_structured_data(self, text: str, data_type: str) -> Optional[Dict[str, Any]]:\n \"\"\"从转换结果中提取结构化数据\"\"\"\n try:\n # 简单的关键信息提取\n structured = {}\n \n if data_type == \"attraction\":\n # 提取景区相关结构化信息\n structured = self._extract_attraction_data(text)\n elif data_type == \"product\":\n # 提取产品相关结构化信息\n structured = self._extract_product_data(text)\n elif data_type in [\"guide\", \"blog\", \"summary\", \"marketing\", \"faq\"]:\n # 提取通用结构化信息\n structured = self._extract_general_data(text)\n \n return structured if structured else None\n \n except Exception as e:\n logger.warning(f\"结构化数据提取失败: {e}\")\n return None\n \n def _extract_attraction_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取景区数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'门票[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'票价[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'价格[::]?\\s*(\\d+(?:\\.\\d+)?)\\s*元'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['ticket_price'] = float(match.group(1))\n break\n \n # 提取时间信息\n time_patterns = [\n r'开放时间[::]?\\s*([^\\n]+)',\n r'营业时间[::]?\\s*([^\\n]+)',\n r'游览时间[::]?\\s*([^\\n]+)'\n ]\n \n for pattern in time_patterns:\n match = re.search(pattern, text)\n if match:\n data['opening_hours'] = match.group(1).strip()\n break\n \n return data\n \n def _extract_product_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取产品数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'原价[::]?\\s*(\\d+(?:\\.\\d+)?)',\n r'现价[::]?\\s*(\\d+(?:\\.\\d+)?)',\n r'售价[::]?\\s*(\\d+(?:\\.\\d+)?)'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['price'] = float(match.group(1))\n break\n \n return data\n \n def _extract_general_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取通用数据\"\"\"\n data = {\n 'word_count': len(text.split()),\n 'character_count': len(text),\n 'paragraph_count': len([p for p in text.split('\\n\\n') if p.strip()])\n }\n \n return data\n \n def _calculate_quality_score(\n self,\n original_content: IntegratedContent,\n transformed_text: str,\n format_type: str\n ) -> float:\n \"\"\"计算转换质量评分\"\"\"\n try:\n score = 0.0\n \n # 基于长度的评分(30%)\n original_length = len(original_content.combined_content)\n transformed_length = len(transformed_text)\n \n if original_length > 0:\n length_ratio = min(transformed_length / original_length, 1.0)\n score += length_ratio * 0.3\n \n # 基于内容完整性的评分(40%)\n key_topics = original_content.key_topics\n topics_found = sum(1 for topic in key_topics if topic in transformed_text)\n \n if key_topics:\n topic_coverage = topics_found / len(key_topics)\n score += topic_coverage * 0.4\n else:\n score += 0.2 # 如果没有关键主题,给予部分分数\n \n # 基于格式适配性的评分(30%)\n format_indicators = {\n 'attraction_standard': ['景区', '门票', '开放时间', '交通'],\n 'product_sales': ['产品', '价格', '优惠', '购买'],\n 'travel_guide': ['攻略', '行程', '推荐', '贴士'],\n 'blog_post': ['体验', '感受', '推荐', '分享'],\n 'summary': ['总结', '要点', '关键', '概述'],\n 'marketing_copy': ['优惠', '限时', '立即', '独家'],\n 'faq': ['问题', '答案', '如何', '什么']\n }\n \n format_words = format_indicators.get(format_type, [])\n if format_words:\n format_matches = sum(1 for word in format_words if word in transformed_text)\n format_score = min(format_matches / len(format_words), 1.0)\n score += format_score * 0.3\n else:\n score += 0.15 # 默认格式分数\n \n return min(score, 1.0) # 确保分数不超过1.0\n \n except Exception as e:\n logger.warning(f\"质量评分计算失败: {e}\")\n return 0.5 # 返回中等分数\n \n def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的转换格式\"\"\"\n return self.SUPPORTED_FORMATS.copy()\n \n def is_supported_format(self, format_type: str) -> bool:\n \"\"\"检查格式是否支持\"\"\"\n return format_type in self.SUPPORTED_FORMATS\n \n def get_transformation_stats(self) -> Dict[str, Any]:\n \"\"\"获取转换器统计信息\"\"\"\n return {\n 'supported_formats': list(self.SUPPORTED_FORMATS.keys()),\n 'task_model_config': self.task_model_config,\n 'ai_model_info': self.ai_service.get_model_info(),\n 'json_repair_enabled': self.json_processor.enable_repair,\n 'available_methods': list(self.format_methods.keys())\n } ",
|
||
"code_hash": "4c7799e2549b35c98c7461ad6027491b"
|
||
}
|
||
],
|
||
"imports": [
|
||
{
|
||
"type": "import",
|
||
"modules": [
|
||
"logging"
|
||
],
|
||
"aliases": []
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "typing",
|
||
"names": [
|
||
"Dict",
|
||
"Any",
|
||
"Optional",
|
||
"List"
|
||
],
|
||
"aliases": [],
|
||
"level": 0
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "dataclasses",
|
||
"names": [
|
||
"dataclass"
|
||
],
|
||
"aliases": [],
|
||
"level": 0
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "datetime",
|
||
"names": [
|
||
"datetime"
|
||
],
|
||
"aliases": [],
|
||
"level": 0
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "content_integrator",
|
||
"names": [
|
||
"IntegratedContent"
|
||
],
|
||
"aliases": [],
|
||
"level": 1
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "config",
|
||
"names": [
|
||
"AlgorithmConfig"
|
||
],
|
||
"aliases": [],
|
||
"level": 2
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "core",
|
||
"names": [
|
||
"AIService",
|
||
"PromptManager",
|
||
"JSONProcessor"
|
||
],
|
||
"aliases": [],
|
||
"level": 2
|
||
},
|
||
{
|
||
"type": "from_import",
|
||
"module": "exceptions",
|
||
"names": [
|
||
"ContentGenerationError",
|
||
"DocumentProcessingError"
|
||
],
|
||
"aliases": [],
|
||
"level": 2
|
||
},
|
||
{
|
||
"type": "import",
|
||
"modules": [
|
||
"re"
|
||
],
|
||
"aliases": []
|
||
},
|
||
{
|
||
"type": "import",
|
||
"modules": [
|
||
"re"
|
||
],
|
||
"aliases": []
|
||
}
|
||
],
|
||
"constants": [
|
||
{
|
||
"name": "SUPPORTED_FORMATS",
|
||
"value": {
|
||
"attraction_standard": "景区标准信息格式",
|
||
"product_sales": "产品销售介绍格式",
|
||
"travel_guide": "旅游攻略格式",
|
||
"blog_post": "博客文章格式",
|
||
"summary": "内容摘要格式",
|
||
"structured_data": "结构化数据格式",
|
||
"marketing_copy": "营销文案格式",
|
||
"faq": "常见问题格式"
|
||
},
|
||
"type": "dict",
|
||
"line": 69
|
||
}
|
||
],
|
||
"docstring": "Content Transformer\n内容转换器 - 重构版本,使用AI将整合的文档内容转换为标准化格式",
|
||
"content_hash": "a3f673c6cd48bb63f2f1e1755df0b527"
|
||
} |