2025-07-31 15:35:23 +08:00

970 lines
94 KiB
JSON
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"file_path": "travel-algorithms/travel_algorithms/document_processing/content_transformer.py",
"file_size": 25896,
"line_count": 807,
"functions": [
{
"name": "__post_init__",
"line_start": 33,
"line_end": 36,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.transformed_at:\n self.transformed_at = datetime.now()",
"code_hash": "98adcf64ad6666b500fb2842b7e9ad72"
},
{
"name": "to_dict",
"line_start": 38,
"line_end": 49,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'transformed_text': self.transformed_text,\n 'format_type': self.format_type,\n 'transformation_metadata': self.transformation_metadata,\n 'transformed_at': self.transformed_at.isoformat(),\n 'structured_data': self.structured_data,\n 'quality_score': self.quality_score,\n 'original_summary': self.original_content.content_summary,\n 'original_document_count': self.original_content.document_count\n }",
"code_hash": "dcd995d469d689d1ef3e0a4541a74850"
},
{
"name": "get_summary",
"line_start": 51,
"line_end": 60,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取转换摘要",
"is_async": false,
"decorators": [],
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取转换摘要\"\"\"\n return {\n 'format_type': self.format_type,\n 'transformed_length': len(self.transformed_text),\n 'quality_score': self.quality_score,\n 'source_documents': self.original_content.document_count,\n 'transformation_method': self.transformation_metadata.get('method'),\n 'transformed_at': self.transformed_at.isoformat()\n }",
"code_hash": "4195b93949396eff3edc6ae0001c235c"
},
{
"name": "__init__",
"line_start": 80,
"line_end": 110,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "AlgorithmConfig"
}
],
"return_type": null,
"docstring": "初始化内容转换器\n\nArgs:\n config: 算法配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容转换器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置\n self.task_model_config = config.ai_model.get_task_config(\"content_transformation\")\n \n # 格式转换方法映射\n self.format_methods = {\n 'attraction_standard': self._transform_to_attraction_standard,\n 'product_sales': self._transform_to_product_sales,\n 'travel_guide': self._transform_to_travel_guide,\n 'blog_post': self._transform_to_blog_post,\n 'summary': self._transform_to_summary,\n 'structured_data': self._transform_to_structured_data,\n 'marketing_copy': self._transform_to_marketing_copy,\n 'faq': self._transform_to_faq\n }\n \n logger.info(f\"内容转换器初始化完成,支持格式: {list(self.SUPPORTED_FORMATS.keys())}\")",
"code_hash": "bb54e03402fd9abaec8a920b03e82bb8"
},
{
"name": "_get_system_prompt",
"line_start": 599,
"line_end": 617,
"args": [
{
"name": "self"
},
{
"name": "format_type",
"type_hint": "str"
}
],
"return_type": "str",
"docstring": "获取系统提示词",
"is_async": false,
"decorators": [],
"code": " def _get_system_prompt(self, format_type: str) -> str:\n \"\"\"获取系统提示词\"\"\"\n try:\n return self.prompt_manager.get_prompt(f\"content_transformation_{format_type}\", \"system\")\n except:\n # 使用默认系统提示词\n return f\"\"\"你是一个专业的内容编辑和转换专家,擅长将各种文档内容转换为 {self.SUPPORTED_FORMATS.get(format_type, format_type)} 格式。\n\n你的任务是\n1. 理解和分析原始文档内容\n2. 提取关键信息和要点\n3. 按照目标格式的要求重新组织内容\n4. 确保信息准确、完整、易读\n\n要求\n- 保持信息的准确性和完整性\n- 语言简洁明了,逻辑清晰\n- 适合目标受众阅读\n- 突出重点信息和特色内容\"\"\"",
"code_hash": "4d3dbd3d80fe6e57b42cff669f992171"
},
{
"name": "_build_user_prompt",
"line_start": 619,
"line_end": 650,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "format_name",
"type_hint": "str"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
},
{
"name": "format_template",
"type_hint": "str"
}
],
"return_type": "str",
"docstring": "构建用户提示词",
"is_async": false,
"decorators": [],
"code": " def _build_user_prompt(\n self,\n content: IntegratedContent,\n format_name: str,\n additional_requirements: Optional[str],\n format_template: str\n ) -> str:\n \"\"\"构建用户提示词\"\"\"\n \n prompt_parts = [\n f\"请将以下文档内容转换为{format_name}格式:\",\n \"\",\n f\"原始文档摘要:\",\n content.content_summary,\n \"\",\n f\"关键主题:{', '.join(content.key_topics)}\",\n \"\",\n f\"文档内容:\",\n content.combined_content,\n \"\",\n f\"转换要求:\",\n format_template\n ]\n \n if additional_requirements:\n prompt_parts.extend([\n \"\",\n f\"额外要求:\",\n additional_requirements\n ])\n \n return \"\\n\".join(prompt_parts)",
"code_hash": "d4556acf4f03ffbaa313456613c094b5"
},
{
"name": "_extract_structured_data",
"line_start": 652,
"line_end": 672,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
},
{
"name": "data_type",
"type_hint": "str"
}
],
"return_type": "Optional[Dict[str, Any]]",
"docstring": "从转换结果中提取结构化数据",
"is_async": false,
"decorators": [],
"code": " def _extract_structured_data(self, text: str, data_type: str) -> Optional[Dict[str, Any]]:\n \"\"\"从转换结果中提取结构化数据\"\"\"\n try:\n # 简单的关键信息提取\n structured = {}\n \n if data_type == \"attraction\":\n # 提取景区相关结构化信息\n structured = self._extract_attraction_data(text)\n elif data_type == \"product\":\n # 提取产品相关结构化信息\n structured = self._extract_product_data(text)\n elif data_type in [\"guide\", \"blog\", \"summary\", \"marketing\", \"faq\"]:\n # 提取通用结构化信息\n structured = self._extract_general_data(text)\n \n return structured if structured else None\n \n except Exception as e:\n logger.warning(f\"结构化数据提取失败: {e}\")\n return None",
"code_hash": "3ee64ced14f9e07fe0c4b2848c6889dd"
},
{
"name": "_extract_attraction_data",
"line_start": 674,
"line_end": 706,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, Any]",
"docstring": "提取景区数据",
"is_async": false,
"decorators": [],
"code": " def _extract_attraction_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取景区数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'门票[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'票价[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'价格[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['ticket_price'] = float(match.group(1))\n break\n \n # 提取时间信息\n time_patterns = [\n r'开放时间[:]?\\s*([^\\n]+)',\n r'营业时间[:]?\\s*([^\\n]+)',\n r'游览时间[:]?\\s*([^\\n]+)'\n ]\n \n for pattern in time_patterns:\n match = re.search(pattern, text)\n if match:\n data['opening_hours'] = match.group(1).strip()\n break\n \n return data",
"code_hash": "f37d939e1bd1927abe6e43a5a86d161f"
},
{
"name": "_extract_product_data",
"line_start": 708,
"line_end": 727,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, Any]",
"docstring": "提取产品数据",
"is_async": false,
"decorators": [],
"code": " def _extract_product_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取产品数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'原价[:]?\\s*(\\d+(?:\\.\\d+)?)',\n r'现价[:]?\\s*(\\d+(?:\\.\\d+)?)',\n r'售价[:]?\\s*(\\d+(?:\\.\\d+)?)'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['price'] = float(match.group(1))\n break\n \n return data",
"code_hash": "49a87013a6203a797ff1d4bc3e027f82"
},
{
"name": "_extract_general_data",
"line_start": 729,
"line_end": 737,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, Any]",
"docstring": "提取通用数据",
"is_async": false,
"decorators": [],
"code": " def _extract_general_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取通用数据\"\"\"\n data = {\n 'word_count': len(text.split()),\n 'character_count': len(text),\n 'paragraph_count': len([p for p in text.split('\\n\\n') if p.strip()])\n }\n \n return data",
"code_hash": "2f772f6f6268c366f68dfd6eaa28d4ad"
},
{
"name": "_calculate_quality_score",
"line_start": 739,
"line_end": 790,
"args": [
{
"name": "self"
},
{
"name": "original_content",
"type_hint": "IntegratedContent"
},
{
"name": "transformed_text",
"type_hint": "str"
},
{
"name": "format_type",
"type_hint": "str"
}
],
"return_type": "float",
"docstring": "计算转换质量评分",
"is_async": false,
"decorators": [],
"code": " def _calculate_quality_score(\n self,\n original_content: IntegratedContent,\n transformed_text: str,\n format_type: str\n ) -> float:\n \"\"\"计算转换质量评分\"\"\"\n try:\n score = 0.0\n \n # 基于长度的评分30%\n original_length = len(original_content.combined_content)\n transformed_length = len(transformed_text)\n \n if original_length > 0:\n length_ratio = min(transformed_length / original_length, 1.0)\n score += length_ratio * 0.3\n \n # 基于内容完整性的评分40%\n key_topics = original_content.key_topics\n topics_found = sum(1 for topic in key_topics if topic in transformed_text)\n \n if key_topics:\n topic_coverage = topics_found / len(key_topics)\n score += topic_coverage * 0.4\n else:\n score += 0.2 # 如果没有关键主题,给予部分分数\n \n # 基于格式适配性的评分30%\n format_indicators = {\n 'attraction_standard': ['景区', '门票', '开放时间', '交通'],\n 'product_sales': ['产品', '价格', '优惠', '购买'],\n 'travel_guide': ['攻略', '行程', '推荐', '贴士'],\n 'blog_post': ['体验', '感受', '推荐', '分享'],\n 'summary': ['总结', '要点', '关键', '概述'],\n 'marketing_copy': ['优惠', '限时', '立即', '独家'],\n 'faq': ['问题', '答案', '如何', '什么']\n }\n \n format_words = format_indicators.get(format_type, [])\n if format_words:\n format_matches = sum(1 for word in format_words if word in transformed_text)\n format_score = min(format_matches / len(format_words), 1.0)\n score += format_score * 0.3\n else:\n score += 0.15 # 默认格式分数\n \n return min(score, 1.0) # 确保分数不超过1.0\n \n except Exception as e:\n logger.warning(f\"质量评分计算失败: {e}\")\n return 0.5 # 返回中等分数",
"code_hash": "5c0254104230fe26a1b7a472b54feef3"
},
{
"name": "get_supported_formats",
"line_start": 792,
"line_end": 794,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的转换格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的转换格式\"\"\"\n return self.SUPPORTED_FORMATS.copy()",
"code_hash": "da58535e7c5de66716664cc49f2d4286"
},
{
"name": "is_supported_format",
"line_start": 796,
"line_end": 798,
"args": [
{
"name": "self"
},
{
"name": "format_type",
"type_hint": "str"
}
],
"return_type": "bool",
"docstring": "检查格式是否支持",
"is_async": false,
"decorators": [],
"code": " def is_supported_format(self, format_type: str) -> bool:\n \"\"\"检查格式是否支持\"\"\"\n return format_type in self.SUPPORTED_FORMATS",
"code_hash": "549606aebdf62433bc4275ee211b04bb"
},
{
"name": "get_transformation_stats",
"line_start": 800,
"line_end": 808,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取转换器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_transformation_stats(self) -> Dict[str, Any]:\n \"\"\"获取转换器统计信息\"\"\"\n return {\n 'supported_formats': list(self.SUPPORTED_FORMATS.keys()),\n 'task_model_config': self.task_model_config,\n 'ai_model_info': self.ai_service.get_model_info(),\n 'json_repair_enabled': self.json_processor.enable_repair,\n 'available_methods': list(self.format_methods.keys())\n } ",
"code_hash": "bc0c00ae316130c09cd0b51551919892"
}
],
"classes": [
{
"name": "TransformedContent",
"line_start": 23,
"line_end": 60,
"bases": [],
"methods": [
{
"name": "__post_init__",
"line_start": 33,
"line_end": 36,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.transformed_at:\n self.transformed_at = datetime.now()",
"code_hash": "98adcf64ad6666b500fb2842b7e9ad72"
},
{
"name": "to_dict",
"line_start": 38,
"line_end": 49,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'transformed_text': self.transformed_text,\n 'format_type': self.format_type,\n 'transformation_metadata': self.transformation_metadata,\n 'transformed_at': self.transformed_at.isoformat(),\n 'structured_data': self.structured_data,\n 'quality_score': self.quality_score,\n 'original_summary': self.original_content.content_summary,\n 'original_document_count': self.original_content.document_count\n }",
"code_hash": "dcd995d469d689d1ef3e0a4541a74850"
},
{
"name": "get_summary",
"line_start": 51,
"line_end": 60,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取转换摘要",
"is_async": false,
"decorators": [],
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取转换摘要\"\"\"\n return {\n 'format_type': self.format_type,\n 'transformed_length': len(self.transformed_text),\n 'quality_score': self.quality_score,\n 'source_documents': self.original_content.document_count,\n 'transformation_method': self.transformation_metadata.get('method'),\n 'transformed_at': self.transformed_at.isoformat()\n }",
"code_hash": "4195b93949396eff3edc6ae0001c235c"
}
],
"docstring": "转换后的内容",
"decorators": [
"dataclass"
],
"code": "class TransformedContent:\n \"\"\"转换后的内容\"\"\"\n original_content: IntegratedContent\n transformed_text: str\n format_type: str\n transformation_metadata: Dict[str, Any]\n transformed_at: datetime\n structured_data: Optional[Dict[str, Any]] = None\n quality_score: Optional[float] = None\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.transformed_at:\n self.transformed_at = datetime.now()\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'transformed_text': self.transformed_text,\n 'format_type': self.format_type,\n 'transformation_metadata': self.transformation_metadata,\n 'transformed_at': self.transformed_at.isoformat(),\n 'structured_data': self.structured_data,\n 'quality_score': self.quality_score,\n 'original_summary': self.original_content.content_summary,\n 'original_document_count': self.original_content.document_count\n }\n \n def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取转换摘要\"\"\"\n return {\n 'format_type': self.format_type,\n 'transformed_length': len(self.transformed_text),\n 'quality_score': self.quality_score,\n 'source_documents': self.original_content.document_count,\n 'transformation_method': self.transformation_metadata.get('method'),\n 'transformed_at': self.transformed_at.isoformat()\n }",
"code_hash": "259d1dd57fbef291fb58e70ebd072d58"
},
{
"name": "ContentTransformer",
"line_start": 63,
"line_end": 808,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 80,
"line_end": 110,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "AlgorithmConfig"
}
],
"return_type": null,
"docstring": "初始化内容转换器\n\nArgs:\n config: 算法配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容转换器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置\n self.task_model_config = config.ai_model.get_task_config(\"content_transformation\")\n \n # 格式转换方法映射\n self.format_methods = {\n 'attraction_standard': self._transform_to_attraction_standard,\n 'product_sales': self._transform_to_product_sales,\n 'travel_guide': self._transform_to_travel_guide,\n 'blog_post': self._transform_to_blog_post,\n 'summary': self._transform_to_summary,\n 'structured_data': self._transform_to_structured_data,\n 'marketing_copy': self._transform_to_marketing_copy,\n 'faq': self._transform_to_faq\n }\n \n logger.info(f\"内容转换器初始化完成,支持格式: {list(self.SUPPORTED_FORMATS.keys())}\")",
"code_hash": "bb54e03402fd9abaec8a920b03e82bb8"
},
{
"name": "transform_content",
"line_start": 112,
"line_end": 190,
"args": [
{
"name": "self"
},
{
"name": "integrated_content",
"type_hint": "IntegratedContent"
},
{
"name": "format_type",
"type_hint": "str"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "TransformedContent",
"docstring": "转换内容\n\nArgs:\n integrated_content: 整合后的内容\n format_type: 目标格式类型\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \nReturns:\n TransformedContent: 转换后的内容\n \nRaises:\n DocumentProcessingError: 转换失败时抛出",
"is_async": true,
"decorators": [],
"code": " async def transform_content(\n self,\n integrated_content: IntegratedContent,\n format_type: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"\n 转换内容\n \n Args:\n integrated_content: 整合后的内容\n format_type: 目标格式类型\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n TransformedContent: 转换后的内容\n \n Raises:\n DocumentProcessingError: 转换失败时抛出\n \"\"\"\n if format_type not in self.SUPPORTED_FORMATS:\n raise DocumentProcessingError(f\"不支持的格式类型: {format_type}\")\n \n if not integrated_content.combined_content.strip():\n raise DocumentProcessingError(\"没有可转换的内容\")\n \n try:\n logger.info(f\"开始转换内容为 {format_type} 格式\")\n \n # 获取转换方法\n transform_method = self.format_methods[format_type]\n \n # 执行转换\n transformed_text, structured_data, metadata = await transform_method(\n integrated_content,\n custom_prompt,\n additional_requirements\n )\n \n # 计算质量评分\n quality_score = self._calculate_quality_score(\n integrated_content,\n transformed_text,\n format_type\n )\n \n # 收集转换元数据\n transformation_metadata = {\n 'method': transform_method.__name__,\n 'format_type': format_type,\n 'custom_prompt_used': bool(custom_prompt),\n 'additional_requirements': additional_requirements,\n 'source_document_count': integrated_content.document_count,\n 'source_content_length': len(integrated_content.combined_content),\n 'target_content_length': len(transformed_text),\n 'model_config': self.task_model_config,\n 'transformation_timestamp': datetime.now().isoformat(),\n **metadata\n }\n \n transformed_content = TransformedContent(\n original_content=integrated_content,\n transformed_text=transformed_text,\n format_type=format_type,\n transformation_metadata=transformation_metadata,\n transformed_at=datetime.now(),\n structured_data=structured_data,\n quality_score=quality_score\n )\n \n logger.info(f\"内容转换完成,输出长度: {len(transformed_text)}\")\n return transformed_content\n \n except Exception as e:\n error_msg = f\"内容转换失败 ({format_type}): {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
"code_hash": "0b8be16dfb781017c7d2853b9515fe95"
},
{
"name": "_transform_to_attraction_standard",
"line_start": 192,
"line_end": 235,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为景区标准信息格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_attraction_standard(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为景区标准信息格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"attraction_standard\")\n user_prompt = self._build_user_prompt(\n content,\n \"景区标准信息\",\n additional_requirements,\n \"\"\"\n 请按照以下结构整理景区信息:\n 1. 景区基本信息(名称、位置、类型、等级)\n 2. 门票信息(价格、优惠政策、购票方式)\n 3. 开放时间(营业时间、季节性变化)\n 4. 交通指南(公共交通、自驾路线、停车信息)\n 5. 景区特色(主要景点、特色活动、文化背景)\n 6. 服务设施(餐饮、住宿、购物、卫生间等)\n 7. 游览建议(推荐路线、游览时长、最佳时间)\n 8. 注意事项(安全提醒、禁止事项、特殊要求)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"景区标准信息转换\",\n **self.task_model_config\n )\n \n # 尝试提取结构化数据\n structured_data = self._extract_structured_data(result, \"attraction\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_attraction_standard'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "93c7f5d14b71dbca5dd8594bb0f67acb"
},
{
"name": "_transform_to_product_sales",
"line_start": 237,
"line_end": 278,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为产品销售介绍格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_product_sales(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为产品销售介绍格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"product_sales\")\n user_prompt = self._build_user_prompt(\n content,\n \"产品销售介绍\",\n additional_requirements,\n \"\"\"\n 请按照销售文案的要求整理产品信息:\n 1. 产品亮点(核心卖点、独特优势)\n 2. 产品详情(套餐内容、服务项目、规格说明)\n 3. 价格体系(原价、优惠价、性价比分析)\n 4. 适用人群(目标客户、使用场景)\n 5. 购买指南(预订方式、使用方法、有效期)\n 6. 客户保障(退改政策、服务承诺、联系方式)\n 7. 用户评价(客户反馈、推荐理由)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"产品销售介绍转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"product\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_product_sales'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "59238ad356a40708344362f36216f751"
},
{
"name": "_transform_to_travel_guide",
"line_start": 280,
"line_end": 322,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为旅游攻略格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_travel_guide(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为旅游攻略格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"travel_guide\")\n user_prompt = self._build_user_prompt(\n content,\n \"旅游攻略\",\n additional_requirements,\n \"\"\"\n 请整理成实用的旅游攻略:\n 1. 目的地概览(地理位置、气候特点、最佳旅游时间)\n 2. 行程规划(推荐天数、必游景点、路线安排)\n 3. 交通攻略(到达方式、当地交通、费用预算)\n 4. 住宿推荐(不同档次选择、位置建议、预订提醒)\n 5. 美食指南(特色菜品、推荐餐厅、小吃街区)\n 6. 购物指南(特产推荐、购物地点、注意事项)\n 7. 实用贴士(天气准备、必备物品、省钱技巧)\n 8. 安全提醒(注意事项、紧急联系方式)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"旅游攻略转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"guide\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_travel_guide'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "030bc5230ab8b397e096e69d1a6255f3"
},
{
"name": "_transform_to_blog_post",
"line_start": 324,
"line_end": 367,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为博客文章格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_blog_post(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为博客文章格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"blog_post\")\n user_prompt = self._build_user_prompt(\n content,\n \"博客文章\",\n additional_requirements,\n \"\"\"\n 请整理成吸引人的博客文章:\n 1. 引人入胜的标题\n 2. 开头引言(吸引读者兴趣)\n 3. 主体内容(详细介绍、个人体验、实用信息)\n 4. 精彩亮点(重点推荐、独特发现)\n 5. 实用建议(贴心提醒、经验分享)\n 6. 总结感悟(个人感受、推荐理由)\n 7. 互动结尾(鼓励评论、分享邀请)\n \n 要求:语言生动有趣,有个人色彩,适合社交媒体分享。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"博客文章转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"blog\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_blog_post'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "cb2a8be10972442c5c6ed8c9642625a0"
},
{
"name": "_transform_to_summary",
"line_start": 369,
"line_end": 409,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为内容摘要格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_summary(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为内容摘要格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"summary\")\n user_prompt = self._build_user_prompt(\n content,\n \"内容摘要\",\n additional_requirements,\n \"\"\"\n 请提供简洁的内容摘要:\n 1. 核心信息总结(主要内容、关键信息)\n 2. 重要数据提取(价格、时间、数量等)\n 3. 关键特色亮点(独特卖点、优势特征)\n 4. 实用信息汇总(联系方式、地址、网址等)\n \n 要求:简洁明了,突出重点,便于快速了解。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"内容摘要转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"summary\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_summary'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "e2815ea90e067be707101daf9cb3fd26"
},
{
"name": "_transform_to_structured_data",
"line_start": 411,
"line_end": 494,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为结构化数据格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_structured_data(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为结构化数据格式\"\"\"\n \n system_prompt = custom_prompt or \"\"\"你是一个数据结构化专家负责将非结构化的文档内容转换为标准的JSON格式数据。\n\n请按照以下JSON结构提取和整理信息\n{\n \"basic_info\": {\n \"name\": \"名称\",\n \"type\": \"类型\",\n \"location\": \"位置\",\n \"description\": \"简介\"\n },\n \"pricing\": {\n \"ticket_price\": \"门票价格\",\n \"package_price\": \"套餐价格\",\n \"discount_info\": \"优惠信息\"\n },\n \"schedule\": {\n \"opening_hours\": \"开放时间\",\n \"best_visit_time\": \"最佳游览时间\",\n \"duration\": \"建议游览时长\"\n },\n \"transportation\": {\n \"public_transport\": \"公共交通\",\n \"self_driving\": \"自驾信息\",\n \"parking\": \"停车信息\"\n },\n \"services\": {\n \"facilities\": [\"设施列表\"],\n \"dining\": [\"餐饮选择\"],\n \"accommodation\": [\"住宿选择\"]\n },\n \"highlights\": [\"亮点特色\"],\n \"tips\": [\"游览贴士\"],\n \"contact\": {\n \"phone\": \"联系电话\",\n \"website\": \"官方网站\",\n \"address\": \"详细地址\"\n }\n}\n\n请严格按照JSON格式输出没有信息的字段可以设为null或空数组。\"\"\"\n \n user_prompt = f\"\"\"请将以下文档内容转换为结构化的JSON数据\n\n{content.combined_content}\n\n{f'额外要求:{additional_requirements}' if additional_requirements else ''}\n\n请提取所有可用信息并按照指定的JSON结构格式化输出。\"\"\"\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"结构化数据转换\",\n **self.task_model_config\n )\n \n # 解析JSON结构化数据\n try:\n structured_data = self.json_processor.parse_llm_output(\n raw_output=result,\n expected_fields=[\"basic_info\", \"pricing\", \"schedule\", \"transportation\", \"services\"],\n required_fields=[\"basic_info\"]\n )\n except Exception as e:\n logger.warning(f\"结构化数据解析失败: {e}\")\n structured_data = None\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_structured_data',\n 'json_parsing_success': structured_data is not None\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "18d327ad60f5eece83291ee102c14426"
},
{
"name": "_transform_to_marketing_copy",
"line_start": 496,
"line_end": 539,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为营销文案格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_marketing_copy(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为营销文案格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"marketing_copy\")\n user_prompt = self._build_user_prompt(\n content,\n \"营销文案\",\n additional_requirements,\n \"\"\"\n 请创作有吸引力的营销文案:\n 1. 吸引眼球的标题(突出卖点、制造悬念)\n 2. 开头钩子(抓住痛点、激发兴趣)\n 3. 价值主张(核心利益、独特优势)\n 4. 社会证明(客户好评、权威认证)\n 5. 紧迫感营造(限时优惠、数量有限)\n 6. 行动召唤(明确指引、简化流程)\n 7. 风险消除(保障承诺、退款政策)\n \n 要求:语言有感染力,突出情感共鸣,促进转化行动。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"营销文案转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"marketing\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_marketing_copy'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "3f01b70214e0aa9e4a826ab8d36e6c0a"
},
{
"name": "_transform_to_faq",
"line_start": 541,
"line_end": 597,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]",
"docstring": "转换为常见问题格式",
"is_async": true,
"decorators": [],
"code": " async def _transform_to_faq(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为常见问题格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"faq\")\n user_prompt = self._build_user_prompt(\n content,\n \"常见问题FAQ\",\n additional_requirements,\n \"\"\"\n 请整理成常见问题FAQ格式\n \n 基本信息类:\n - 景区/产品介绍相关问题\n - 位置交通相关问题\n \n 预订购买类:\n - 门票价格和购买问题\n - 预订流程和注意事项\n \n 游览体验类:\n - 开放时间和游览时长\n - 设施服务和特色项目\n \n 实用贴士类:\n - 注意事项和安全提醒\n - 最佳游览时间和建议\n \n 售后服务类:\n - 退改政策和联系方式\n - 投诉建议和客服支持\n \n 每个问题要简洁明了,答案要准确详细。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"FAQ转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"faq\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_faq'\n }\n \n return result.strip(), structured_data, metadata",
"code_hash": "8730c4f107dde26dca26057eedb41f1f"
},
{
"name": "_get_system_prompt",
"line_start": 599,
"line_end": 617,
"args": [
{
"name": "self"
},
{
"name": "format_type",
"type_hint": "str"
}
],
"return_type": "str",
"docstring": "获取系统提示词",
"is_async": false,
"decorators": [],
"code": " def _get_system_prompt(self, format_type: str) -> str:\n \"\"\"获取系统提示词\"\"\"\n try:\n return self.prompt_manager.get_prompt(f\"content_transformation_{format_type}\", \"system\")\n except:\n # 使用默认系统提示词\n return f\"\"\"你是一个专业的内容编辑和转换专家,擅长将各种文档内容转换为 {self.SUPPORTED_FORMATS.get(format_type, format_type)} 格式。\n\n你的任务是\n1. 理解和分析原始文档内容\n2. 提取关键信息和要点\n3. 按照目标格式的要求重新组织内容\n4. 确保信息准确、完整、易读\n\n要求\n- 保持信息的准确性和完整性\n- 语言简洁明了,逻辑清晰\n- 适合目标受众阅读\n- 突出重点信息和特色内容\"\"\"",
"code_hash": "4d3dbd3d80fe6e57b42cff669f992171"
},
{
"name": "_build_user_prompt",
"line_start": 619,
"line_end": 650,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "IntegratedContent"
},
{
"name": "format_name",
"type_hint": "str"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
},
{
"name": "format_template",
"type_hint": "str"
}
],
"return_type": "str",
"docstring": "构建用户提示词",
"is_async": false,
"decorators": [],
"code": " def _build_user_prompt(\n self,\n content: IntegratedContent,\n format_name: str,\n additional_requirements: Optional[str],\n format_template: str\n ) -> str:\n \"\"\"构建用户提示词\"\"\"\n \n prompt_parts = [\n f\"请将以下文档内容转换为{format_name}格式:\",\n \"\",\n f\"原始文档摘要:\",\n content.content_summary,\n \"\",\n f\"关键主题:{', '.join(content.key_topics)}\",\n \"\",\n f\"文档内容:\",\n content.combined_content,\n \"\",\n f\"转换要求:\",\n format_template\n ]\n \n if additional_requirements:\n prompt_parts.extend([\n \"\",\n f\"额外要求:\",\n additional_requirements\n ])\n \n return \"\\n\".join(prompt_parts)",
"code_hash": "d4556acf4f03ffbaa313456613c094b5"
},
{
"name": "_extract_structured_data",
"line_start": 652,
"line_end": 672,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
},
{
"name": "data_type",
"type_hint": "str"
}
],
"return_type": "Optional[Dict[str, Any]]",
"docstring": "从转换结果中提取结构化数据",
"is_async": false,
"decorators": [],
"code": " def _extract_structured_data(self, text: str, data_type: str) -> Optional[Dict[str, Any]]:\n \"\"\"从转换结果中提取结构化数据\"\"\"\n try:\n # 简单的关键信息提取\n structured = {}\n \n if data_type == \"attraction\":\n # 提取景区相关结构化信息\n structured = self._extract_attraction_data(text)\n elif data_type == \"product\":\n # 提取产品相关结构化信息\n structured = self._extract_product_data(text)\n elif data_type in [\"guide\", \"blog\", \"summary\", \"marketing\", \"faq\"]:\n # 提取通用结构化信息\n structured = self._extract_general_data(text)\n \n return structured if structured else None\n \n except Exception as e:\n logger.warning(f\"结构化数据提取失败: {e}\")\n return None",
"code_hash": "3ee64ced14f9e07fe0c4b2848c6889dd"
},
{
"name": "_extract_attraction_data",
"line_start": 674,
"line_end": 706,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, Any]",
"docstring": "提取景区数据",
"is_async": false,
"decorators": [],
"code": " def _extract_attraction_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取景区数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'门票[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'票价[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'价格[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['ticket_price'] = float(match.group(1))\n break\n \n # 提取时间信息\n time_patterns = [\n r'开放时间[:]?\\s*([^\\n]+)',\n r'营业时间[:]?\\s*([^\\n]+)',\n r'游览时间[:]?\\s*([^\\n]+)'\n ]\n \n for pattern in time_patterns:\n match = re.search(pattern, text)\n if match:\n data['opening_hours'] = match.group(1).strip()\n break\n \n return data",
"code_hash": "f37d939e1bd1927abe6e43a5a86d161f"
},
{
"name": "_extract_product_data",
"line_start": 708,
"line_end": 727,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, Any]",
"docstring": "提取产品数据",
"is_async": false,
"decorators": [],
"code": " def _extract_product_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取产品数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'原价[:]?\\s*(\\d+(?:\\.\\d+)?)',\n r'现价[:]?\\s*(\\d+(?:\\.\\d+)?)',\n r'售价[:]?\\s*(\\d+(?:\\.\\d+)?)'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['price'] = float(match.group(1))\n break\n \n return data",
"code_hash": "49a87013a6203a797ff1d4bc3e027f82"
},
{
"name": "_extract_general_data",
"line_start": 729,
"line_end": 737,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, Any]",
"docstring": "提取通用数据",
"is_async": false,
"decorators": [],
"code": " def _extract_general_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取通用数据\"\"\"\n data = {\n 'word_count': len(text.split()),\n 'character_count': len(text),\n 'paragraph_count': len([p for p in text.split('\\n\\n') if p.strip()])\n }\n \n return data",
"code_hash": "2f772f6f6268c366f68dfd6eaa28d4ad"
},
{
"name": "_calculate_quality_score",
"line_start": 739,
"line_end": 790,
"args": [
{
"name": "self"
},
{
"name": "original_content",
"type_hint": "IntegratedContent"
},
{
"name": "transformed_text",
"type_hint": "str"
},
{
"name": "format_type",
"type_hint": "str"
}
],
"return_type": "float",
"docstring": "计算转换质量评分",
"is_async": false,
"decorators": [],
"code": " def _calculate_quality_score(\n self,\n original_content: IntegratedContent,\n transformed_text: str,\n format_type: str\n ) -> float:\n \"\"\"计算转换质量评分\"\"\"\n try:\n score = 0.0\n \n # 基于长度的评分30%\n original_length = len(original_content.combined_content)\n transformed_length = len(transformed_text)\n \n if original_length > 0:\n length_ratio = min(transformed_length / original_length, 1.0)\n score += length_ratio * 0.3\n \n # 基于内容完整性的评分40%\n key_topics = original_content.key_topics\n topics_found = sum(1 for topic in key_topics if topic in transformed_text)\n \n if key_topics:\n topic_coverage = topics_found / len(key_topics)\n score += topic_coverage * 0.4\n else:\n score += 0.2 # 如果没有关键主题,给予部分分数\n \n # 基于格式适配性的评分30%\n format_indicators = {\n 'attraction_standard': ['景区', '门票', '开放时间', '交通'],\n 'product_sales': ['产品', '价格', '优惠', '购买'],\n 'travel_guide': ['攻略', '行程', '推荐', '贴士'],\n 'blog_post': ['体验', '感受', '推荐', '分享'],\n 'summary': ['总结', '要点', '关键', '概述'],\n 'marketing_copy': ['优惠', '限时', '立即', '独家'],\n 'faq': ['问题', '答案', '如何', '什么']\n }\n \n format_words = format_indicators.get(format_type, [])\n if format_words:\n format_matches = sum(1 for word in format_words if word in transformed_text)\n format_score = min(format_matches / len(format_words), 1.0)\n score += format_score * 0.3\n else:\n score += 0.15 # 默认格式分数\n \n return min(score, 1.0) # 确保分数不超过1.0\n \n except Exception as e:\n logger.warning(f\"质量评分计算失败: {e}\")\n return 0.5 # 返回中等分数",
"code_hash": "5c0254104230fe26a1b7a472b54feef3"
},
{
"name": "get_supported_formats",
"line_start": 792,
"line_end": 794,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的转换格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的转换格式\"\"\"\n return self.SUPPORTED_FORMATS.copy()",
"code_hash": "da58535e7c5de66716664cc49f2d4286"
},
{
"name": "is_supported_format",
"line_start": 796,
"line_end": 798,
"args": [
{
"name": "self"
},
{
"name": "format_type",
"type_hint": "str"
}
],
"return_type": "bool",
"docstring": "检查格式是否支持",
"is_async": false,
"decorators": [],
"code": " def is_supported_format(self, format_type: str) -> bool:\n \"\"\"检查格式是否支持\"\"\"\n return format_type in self.SUPPORTED_FORMATS",
"code_hash": "549606aebdf62433bc4275ee211b04bb"
},
{
"name": "get_transformation_stats",
"line_start": 800,
"line_end": 808,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取转换器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_transformation_stats(self) -> Dict[str, Any]:\n \"\"\"获取转换器统计信息\"\"\"\n return {\n 'supported_formats': list(self.SUPPORTED_FORMATS.keys()),\n 'task_model_config': self.task_model_config,\n 'ai_model_info': self.ai_service.get_model_info(),\n 'json_repair_enabled': self.json_processor.enable_repair,\n 'available_methods': list(self.format_methods.keys())\n } ",
"code_hash": "bc0c00ae316130c09cd0b51551919892"
}
],
"docstring": "内容转换器 - 重构版本\n使用AI模型将整合的文档内容转换为指定的标准化格式",
"decorators": [],
"code": "class ContentTransformer:\n \"\"\"\n 内容转换器 - 重构版本\n 使用AI模型将整合的文档内容转换为指定的标准化格式\n \"\"\"\n \n SUPPORTED_FORMATS = {\n 'attraction_standard': '景区标准信息格式',\n 'product_sales': '产品销售介绍格式',\n 'travel_guide': '旅游攻略格式',\n 'blog_post': '博客文章格式',\n 'summary': '内容摘要格式',\n 'structured_data': '结构化数据格式',\n 'marketing_copy': '营销文案格式',\n 'faq': '常见问题格式'\n }\n \n def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容转换器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置\n self.task_model_config = config.ai_model.get_task_config(\"content_transformation\")\n \n # 格式转换方法映射\n self.format_methods = {\n 'attraction_standard': self._transform_to_attraction_standard,\n 'product_sales': self._transform_to_product_sales,\n 'travel_guide': self._transform_to_travel_guide,\n 'blog_post': self._transform_to_blog_post,\n 'summary': self._transform_to_summary,\n 'structured_data': self._transform_to_structured_data,\n 'marketing_copy': self._transform_to_marketing_copy,\n 'faq': self._transform_to_faq\n }\n \n logger.info(f\"内容转换器初始化完成,支持格式: {list(self.SUPPORTED_FORMATS.keys())}\")\n \n async def transform_content(\n self,\n integrated_content: IntegratedContent,\n format_type: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"\n 转换内容\n \n Args:\n integrated_content: 整合后的内容\n format_type: 目标格式类型\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n TransformedContent: 转换后的内容\n \n Raises:\n DocumentProcessingError: 转换失败时抛出\n \"\"\"\n if format_type not in self.SUPPORTED_FORMATS:\n raise DocumentProcessingError(f\"不支持的格式类型: {format_type}\")\n \n if not integrated_content.combined_content.strip():\n raise DocumentProcessingError(\"没有可转换的内容\")\n \n try:\n logger.info(f\"开始转换内容为 {format_type} 格式\")\n \n # 获取转换方法\n transform_method = self.format_methods[format_type]\n \n # 执行转换\n transformed_text, structured_data, metadata = await transform_method(\n integrated_content,\n custom_prompt,\n additional_requirements\n )\n \n # 计算质量评分\n quality_score = self._calculate_quality_score(\n integrated_content,\n transformed_text,\n format_type\n )\n \n # 收集转换元数据\n transformation_metadata = {\n 'method': transform_method.__name__,\n 'format_type': format_type,\n 'custom_prompt_used': bool(custom_prompt),\n 'additional_requirements': additional_requirements,\n 'source_document_count': integrated_content.document_count,\n 'source_content_length': len(integrated_content.combined_content),\n 'target_content_length': len(transformed_text),\n 'model_config': self.task_model_config,\n 'transformation_timestamp': datetime.now().isoformat(),\n **metadata\n }\n \n transformed_content = TransformedContent(\n original_content=integrated_content,\n transformed_text=transformed_text,\n format_type=format_type,\n transformation_metadata=transformation_metadata,\n transformed_at=datetime.now(),\n structured_data=structured_data,\n quality_score=quality_score\n )\n \n logger.info(f\"内容转换完成,输出长度: {len(transformed_text)}\")\n return transformed_content\n \n except Exception as e:\n error_msg = f\"内容转换失败 ({format_type}): {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)\n \n async def _transform_to_attraction_standard(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为景区标准信息格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"attraction_standard\")\n user_prompt = self._build_user_prompt(\n content,\n \"景区标准信息\",\n additional_requirements,\n \"\"\"\n 请按照以下结构整理景区信息:\n 1. 景区基本信息(名称、位置、类型、等级)\n 2. 门票信息(价格、优惠政策、购票方式)\n 3. 开放时间(营业时间、季节性变化)\n 4. 交通指南(公共交通、自驾路线、停车信息)\n 5. 景区特色(主要景点、特色活动、文化背景)\n 6. 服务设施(餐饮、住宿、购物、卫生间等)\n 7. 游览建议(推荐路线、游览时长、最佳时间)\n 8. 注意事项(安全提醒、禁止事项、特殊要求)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"景区标准信息转换\",\n **self.task_model_config\n )\n \n # 尝试提取结构化数据\n structured_data = self._extract_structured_data(result, \"attraction\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_attraction_standard'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_product_sales(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为产品销售介绍格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"product_sales\")\n user_prompt = self._build_user_prompt(\n content,\n \"产品销售介绍\",\n additional_requirements,\n \"\"\"\n 请按照销售文案的要求整理产品信息:\n 1. 产品亮点(核心卖点、独特优势)\n 2. 产品详情(套餐内容、服务项目、规格说明)\n 3. 价格体系(原价、优惠价、性价比分析)\n 4. 适用人群(目标客户、使用场景)\n 5. 购买指南(预订方式、使用方法、有效期)\n 6. 客户保障(退改政策、服务承诺、联系方式)\n 7. 用户评价(客户反馈、推荐理由)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"产品销售介绍转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"product\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_product_sales'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_travel_guide(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为旅游攻略格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"travel_guide\")\n user_prompt = self._build_user_prompt(\n content,\n \"旅游攻略\",\n additional_requirements,\n \"\"\"\n 请整理成实用的旅游攻略:\n 1. 目的地概览(地理位置、气候特点、最佳旅游时间)\n 2. 行程规划(推荐天数、必游景点、路线安排)\n 3. 交通攻略(到达方式、当地交通、费用预算)\n 4. 住宿推荐(不同档次选择、位置建议、预订提醒)\n 5. 美食指南(特色菜品、推荐餐厅、小吃街区)\n 6. 购物指南(特产推荐、购物地点、注意事项)\n 7. 实用贴士(天气准备、必备物品、省钱技巧)\n 8. 安全提醒(注意事项、紧急联系方式)\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"旅游攻略转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"guide\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_travel_guide'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_blog_post(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为博客文章格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"blog_post\")\n user_prompt = self._build_user_prompt(\n content,\n \"博客文章\",\n additional_requirements,\n \"\"\"\n 请整理成吸引人的博客文章:\n 1. 引人入胜的标题\n 2. 开头引言(吸引读者兴趣)\n 3. 主体内容(详细介绍、个人体验、实用信息)\n 4. 精彩亮点(重点推荐、独特发现)\n 5. 实用建议(贴心提醒、经验分享)\n 6. 总结感悟(个人感受、推荐理由)\n 7. 互动结尾(鼓励评论、分享邀请)\n \n 要求:语言生动有趣,有个人色彩,适合社交媒体分享。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"博客文章转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"blog\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_blog_post'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_summary(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为内容摘要格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"summary\")\n user_prompt = self._build_user_prompt(\n content,\n \"内容摘要\",\n additional_requirements,\n \"\"\"\n 请提供简洁的内容摘要:\n 1. 核心信息总结(主要内容、关键信息)\n 2. 重要数据提取(价格、时间、数量等)\n 3. 关键特色亮点(独特卖点、优势特征)\n 4. 实用信息汇总(联系方式、地址、网址等)\n \n 要求:简洁明了,突出重点,便于快速了解。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"内容摘要转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"summary\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_summary'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_structured_data(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为结构化数据格式\"\"\"\n \n system_prompt = custom_prompt or \"\"\"你是一个数据结构化专家负责将非结构化的文档内容转换为标准的JSON格式数据。\n\n请按照以下JSON结构提取和整理信息\n{\n \"basic_info\": {\n \"name\": \"名称\",\n \"type\": \"类型\",\n \"location\": \"位置\",\n \"description\": \"简介\"\n },\n \"pricing\": {\n \"ticket_price\": \"门票价格\",\n \"package_price\": \"套餐价格\",\n \"discount_info\": \"优惠信息\"\n },\n \"schedule\": {\n \"opening_hours\": \"开放时间\",\n \"best_visit_time\": \"最佳游览时间\",\n \"duration\": \"建议游览时长\"\n },\n \"transportation\": {\n \"public_transport\": \"公共交通\",\n \"self_driving\": \"自驾信息\",\n \"parking\": \"停车信息\"\n },\n \"services\": {\n \"facilities\": [\"设施列表\"],\n \"dining\": [\"餐饮选择\"],\n \"accommodation\": [\"住宿选择\"]\n },\n \"highlights\": [\"亮点特色\"],\n \"tips\": [\"游览贴士\"],\n \"contact\": {\n \"phone\": \"联系电话\",\n \"website\": \"官方网站\",\n \"address\": \"详细地址\"\n }\n}\n\n请严格按照JSON格式输出没有信息的字段可以设为null或空数组。\"\"\"\n \n user_prompt = f\"\"\"请将以下文档内容转换为结构化的JSON数据\n\n{content.combined_content}\n\n{f'额外要求:{additional_requirements}' if additional_requirements else ''}\n\n请提取所有可用信息并按照指定的JSON结构格式化输出。\"\"\"\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"结构化数据转换\",\n **self.task_model_config\n )\n \n # 解析JSON结构化数据\n try:\n structured_data = self.json_processor.parse_llm_output(\n raw_output=result,\n expected_fields=[\"basic_info\", \"pricing\", \"schedule\", \"transportation\", \"services\"],\n required_fields=[\"basic_info\"]\n )\n except Exception as e:\n logger.warning(f\"结构化数据解析失败: {e}\")\n structured_data = None\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_structured_data',\n 'json_parsing_success': structured_data is not None\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_marketing_copy(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为营销文案格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"marketing_copy\")\n user_prompt = self._build_user_prompt(\n content,\n \"营销文案\",\n additional_requirements,\n \"\"\"\n 请创作有吸引力的营销文案:\n 1. 吸引眼球的标题(突出卖点、制造悬念)\n 2. 开头钩子(抓住痛点、激发兴趣)\n 3. 价值主张(核心利益、独特优势)\n 4. 社会证明(客户好评、权威认证)\n 5. 紧迫感营造(限时优惠、数量有限)\n 6. 行动召唤(明确指引、简化流程)\n 7. 风险消除(保障承诺、退款政策)\n \n 要求:语言有感染力,突出情感共鸣,促进转化行动。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"营销文案转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"marketing\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_marketing_copy'\n }\n \n return result.strip(), structured_data, metadata\n \n async def _transform_to_faq(\n self,\n content: IntegratedContent,\n custom_prompt: Optional[str],\n additional_requirements: Optional[str]\n ) -> tuple[str, Optional[Dict[str, Any]], Dict[str, Any]]:\n \"\"\"转换为常见问题格式\"\"\"\n \n system_prompt = custom_prompt or self._get_system_prompt(\"faq\")\n user_prompt = self._build_user_prompt(\n content,\n \"常见问题FAQ\",\n additional_requirements,\n \"\"\"\n 请整理成常见问题FAQ格式\n \n 基本信息类:\n - 景区/产品介绍相关问题\n - 位置交通相关问题\n \n 预订购买类:\n - 门票价格和购买问题\n - 预订流程和注意事项\n \n 游览体验类:\n - 开放时间和游览时长\n - 设施服务和特色项目\n \n 实用贴士类:\n - 注意事项和安全提醒\n - 最佳游览时间和建议\n \n 售后服务类:\n - 退改政策和联系方式\n - 投诉建议和客服支持\n \n 每个问题要简洁明了,答案要准确详细。\n \"\"\"\n )\n \n result, tokens_input, tokens_output, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"FAQ转换\",\n **self.task_model_config\n )\n \n structured_data = self._extract_structured_data(result, \"faq\")\n \n metadata = {\n 'tokens_input': tokens_input,\n 'tokens_output': tokens_output,\n 'elapsed_time': elapsed_time,\n 'extraction_method': 'ai_faq'\n }\n \n return result.strip(), structured_data, metadata\n \n def _get_system_prompt(self, format_type: str) -> str:\n \"\"\"获取系统提示词\"\"\"\n try:\n return self.prompt_manager.get_prompt(f\"content_transformation_{format_type}\", \"system\")\n except:\n # 使用默认系统提示词\n return f\"\"\"你是一个专业的内容编辑和转换专家,擅长将各种文档内容转换为 {self.SUPPORTED_FORMATS.get(format_type, format_type)} 格式。\n\n你的任务是\n1. 理解和分析原始文档内容\n2. 提取关键信息和要点\n3. 按照目标格式的要求重新组织内容\n4. 确保信息准确、完整、易读\n\n要求\n- 保持信息的准确性和完整性\n- 语言简洁明了,逻辑清晰\n- 适合目标受众阅读\n- 突出重点信息和特色内容\"\"\"\n \n def _build_user_prompt(\n self,\n content: IntegratedContent,\n format_name: str,\n additional_requirements: Optional[str],\n format_template: str\n ) -> str:\n \"\"\"构建用户提示词\"\"\"\n \n prompt_parts = [\n f\"请将以下文档内容转换为{format_name}格式:\",\n \"\",\n f\"原始文档摘要:\",\n content.content_summary,\n \"\",\n f\"关键主题:{', '.join(content.key_topics)}\",\n \"\",\n f\"文档内容:\",\n content.combined_content,\n \"\",\n f\"转换要求:\",\n format_template\n ]\n \n if additional_requirements:\n prompt_parts.extend([\n \"\",\n f\"额外要求:\",\n additional_requirements\n ])\n \n return \"\\n\".join(prompt_parts)\n \n def _extract_structured_data(self, text: str, data_type: str) -> Optional[Dict[str, Any]]:\n \"\"\"从转换结果中提取结构化数据\"\"\"\n try:\n # 简单的关键信息提取\n structured = {}\n \n if data_type == \"attraction\":\n # 提取景区相关结构化信息\n structured = self._extract_attraction_data(text)\n elif data_type == \"product\":\n # 提取产品相关结构化信息\n structured = self._extract_product_data(text)\n elif data_type in [\"guide\", \"blog\", \"summary\", \"marketing\", \"faq\"]:\n # 提取通用结构化信息\n structured = self._extract_general_data(text)\n \n return structured if structured else None\n \n except Exception as e:\n logger.warning(f\"结构化数据提取失败: {e}\")\n return None\n \n def _extract_attraction_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取景区数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'门票[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'票价[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元',\n r'价格[:]?\\s*(\\d+(?:\\.\\d+)?)\\s*元'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['ticket_price'] = float(match.group(1))\n break\n \n # 提取时间信息\n time_patterns = [\n r'开放时间[:]?\\s*([^\\n]+)',\n r'营业时间[:]?\\s*([^\\n]+)',\n r'游览时间[:]?\\s*([^\\n]+)'\n ]\n \n for pattern in time_patterns:\n match = re.search(pattern, text)\n if match:\n data['opening_hours'] = match.group(1).strip()\n break\n \n return data\n \n def _extract_product_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取产品数据\"\"\"\n import re\n \n data = {}\n \n # 提取价格信息\n price_patterns = [\n r'原价[:]?\\s*(\\d+(?:\\.\\d+)?)',\n r'现价[:]?\\s*(\\d+(?:\\.\\d+)?)',\n r'售价[:]?\\s*(\\d+(?:\\.\\d+)?)'\n ]\n \n for pattern in price_patterns:\n match = re.search(pattern, text)\n if match:\n data['price'] = float(match.group(1))\n break\n \n return data\n \n def _extract_general_data(self, text: str) -> Dict[str, Any]:\n \"\"\"提取通用数据\"\"\"\n data = {\n 'word_count': len(text.split()),\n 'character_count': len(text),\n 'paragraph_count': len([p for p in text.split('\\n\\n') if p.strip()])\n }\n \n return data\n \n def _calculate_quality_score(\n self,\n original_content: IntegratedContent,\n transformed_text: str,\n format_type: str\n ) -> float:\n \"\"\"计算转换质量评分\"\"\"\n try:\n score = 0.0\n \n # 基于长度的评分30%\n original_length = len(original_content.combined_content)\n transformed_length = len(transformed_text)\n \n if original_length > 0:\n length_ratio = min(transformed_length / original_length, 1.0)\n score += length_ratio * 0.3\n \n # 基于内容完整性的评分40%\n key_topics = original_content.key_topics\n topics_found = sum(1 for topic in key_topics if topic in transformed_text)\n \n if key_topics:\n topic_coverage = topics_found / len(key_topics)\n score += topic_coverage * 0.4\n else:\n score += 0.2 # 如果没有关键主题,给予部分分数\n \n # 基于格式适配性的评分30%\n format_indicators = {\n 'attraction_standard': ['景区', '门票', '开放时间', '交通'],\n 'product_sales': ['产品', '价格', '优惠', '购买'],\n 'travel_guide': ['攻略', '行程', '推荐', '贴士'],\n 'blog_post': ['体验', '感受', '推荐', '分享'],\n 'summary': ['总结', '要点', '关键', '概述'],\n 'marketing_copy': ['优惠', '限时', '立即', '独家'],\n 'faq': ['问题', '答案', '如何', '什么']\n }\n \n format_words = format_indicators.get(format_type, [])\n if format_words:\n format_matches = sum(1 for word in format_words if word in transformed_text)\n format_score = min(format_matches / len(format_words), 1.0)\n score += format_score * 0.3\n else:\n score += 0.15 # 默认格式分数\n \n return min(score, 1.0) # 确保分数不超过1.0\n \n except Exception as e:\n logger.warning(f\"质量评分计算失败: {e}\")\n return 0.5 # 返回中等分数\n \n def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的转换格式\"\"\"\n return self.SUPPORTED_FORMATS.copy()\n \n def is_supported_format(self, format_type: str) -> bool:\n \"\"\"检查格式是否支持\"\"\"\n return format_type in self.SUPPORTED_FORMATS\n \n def get_transformation_stats(self) -> Dict[str, Any]:\n \"\"\"获取转换器统计信息\"\"\"\n return {\n 'supported_formats': list(self.SUPPORTED_FORMATS.keys()),\n 'task_model_config': self.task_model_config,\n 'ai_model_info': self.ai_service.get_model_info(),\n 'json_repair_enabled': self.json_processor.enable_repair,\n 'available_methods': list(self.format_methods.keys())\n } ",
"code_hash": "4c7799e2549b35c98c7461ad6027491b"
}
],
"imports": [
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"Dict",
"Any",
"Optional",
"List"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "dataclasses",
"names": [
"dataclass"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "datetime",
"names": [
"datetime"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "content_integrator",
"names": [
"IntegratedContent"
],
"aliases": [],
"level": 1
},
{
"type": "from_import",
"module": "config",
"names": [
"AlgorithmConfig"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "core",
"names": [
"AIService",
"PromptManager",
"JSONProcessor"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "exceptions",
"names": [
"ContentGenerationError",
"DocumentProcessingError"
],
"aliases": [],
"level": 2
},
{
"type": "import",
"modules": [
"re"
],
"aliases": []
},
{
"type": "import",
"modules": [
"re"
],
"aliases": []
}
],
"constants": [
{
"name": "SUPPORTED_FORMATS",
"value": {
"attraction_standard": "景区标准信息格式",
"product_sales": "产品销售介绍格式",
"travel_guide": "旅游攻略格式",
"blog_post": "博客文章格式",
"summary": "内容摘要格式",
"structured_data": "结构化数据格式",
"marketing_copy": "营销文案格式",
"faq": "常见问题格式"
},
"type": "dict",
"line": 69
}
],
"docstring": "Content Transformer\n内容转换器 - 重构版本使用AI将整合的文档内容转换为标准化格式",
"content_hash": "a3f673c6cd48bb63f2f1e1755df0b527"
}