{
"file_path": "travel-algorithms/travel_algorithms/content_generation/content_generator.py",
"file_size": 12265,
"line_count": 353,
"functions": [
{
"name": "__init__",
"line_start": 26,
"line_end": 48,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "AlgorithmConfig"
}
],
"return_type": null,
"docstring": "初始化内容生成器\n\nArgs:\n config: 算法配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容生成器\n\n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.output_manager = OutputManager(config.output)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n \n # 初始化JSON处理器\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置和字段配置\n self.task_model_config = config.ai_model.get_task_config(\"content_generation\")\n self.field_config = config.content_generation.result_field_mapping.get(\"content_generation\", {})\n \n logger.info(f\"内容生成器初始化完成,使用模型参数: {self.task_model_config}\")",
"code_hash": "a4aa3983bc905147da0ff79e7a39f46a"
},
{
"name": "_parse_content_result",
"line_start": 151,
"line_end": 181,
"args": [
{
"name": "self"
},
{
"name": "raw_content",
"type_hint": "str"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "解析内容生成结果\n\nArgs:\n raw_content: AI原始输出\n topic: 原始主题信息\n\nReturns:\n 解析后的内容字典",
"is_async": false,
"decorators": [],
"code": " def _parse_content_result(self, raw_content: str, topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 解析内容生成结果\n\n Args:\n raw_content: AI原始输出\n topic: 原始主题信息\n\n Returns:\n 解析后的内容字典\n \"\"\"\n try:\n # 使用JSON处理器解析\n parsed_data = self.json_processor.parse_llm_output(\n raw_output=raw_content,\n expected_fields=self.field_config.get(\"expected_fields\", [\"title\", \"content\", \"tag\"]),\n required_fields=self.field_config.get(\"required_fields\", [\"title\", \"content\"])\n )\n\n # 根据实际提示词返回格式进行标准化\n if isinstance(parsed_data, dict):\n content_data = self._normalize_content_data(parsed_data, topic)\n else:\n # 如果不是预期的字典格式,创建默认结构\n content_data = self._create_fallback_content_data(raw_content, topic)\n\n return content_data\n\n except Exception as e:\n logger.warning(f\"JSON解析失败使用回退方案: {e}\")\n return self._create_fallback_content_data(raw_content, topic)",
"code_hash": "adf9e723962b8af5a76599afd15f51ee"
},
{
"name": "_normalize_content_data",
"line_start": 183,
"line_end": 235,
"args": [
{
"name": "self"
},
{
"name": "parsed_data",
"type_hint": "Dict[str, Any]"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "标准化内容数据格式\n\nArgs:\n parsed_data: 解析后的数据\n topic: 原始主题信息\n\nReturns:\n 标准化后的内容字典",
"is_async": false,
"decorators": [],
"code": " def _normalize_content_data(self, parsed_data: Dict[str, Any], topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 标准化内容数据格式\n\n Args:\n parsed_data: 解析后的数据\n topic: 原始主题信息\n\n Returns:\n 标准化后的内容字典\n \"\"\"\n # 根据实际提示词的输出格式进行标准化\n normalized_data = {\n # 核心字段根据实际system.txt的输出格式\n \"title\": parsed_data.get(\"title\", \"\"),\n \"content\": parsed_data.get(\"content\", \"\"),\n \"tag\": parsed_data.get(\"tag\", \"\"),\n \n # 计算字段\n \"word_count\": len(parsed_data.get(\"content\", \"\")),\n \"title_length\": len(parsed_data.get(\"title\", \"\")),\n \"tag_count\": len(parsed_data.get(\"tag\", \"\").split(\"#\")) - 1 if parsed_data.get(\"tag\") else 0,\n \n # 元数据\n \"metadata\": {\n \"source_topic\": topic,\n \"generation_config\": self.task_model_config,\n \"field_config\": self.field_config,\n \"generated_at\": self.output_manager.run_id,\n \"format_version\": \"v1\",\n \"original_parsed_data\": parsed_data\n }\n }\n\n # 验证关键字段\n if not normalized_data[\"title\"] or not normalized_data[\"content\"]:\n logger.warning(\"解析结果缺少关键字段,尝试从原始数据提取\")\n \n # 尝试其他可能的字段名\n alternative_mappings = {\n \"title\": [\"标题\", \"主题\", \"title\", \"headline\"],\n \"content\": [\"内容\", \"正文\", \"content\", \"text\", \"body\"],\n \"tag\": [\"标签\", \"tag\", \"tags\", \"labels\"]\n }\n \n for std_field, alternatives in alternative_mappings.items():\n if not normalized_data[std_field]:\n for alt_field in alternatives:\n if alt_field in parsed_data and parsed_data[alt_field]:\n normalized_data[std_field] = parsed_data[alt_field]\n break\n\n return normalized_data",
"code_hash": "5bfb220ef95ac52397bb577a0a8ce558"
},
{
"name": "_create_fallback_content_data",
"line_start": 237,
"line_end": 283,
"args": [
{
"name": "self"
},
{
"name": "raw_content",
"type_hint": "str"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "创建回退的内容数据当JSON解析失败时\n\nArgs:\n raw_content: 原始内容\n topic: 主题信息\n\nReturns:\n 回退的内容字典",
"is_async": false,
"decorators": [],
"code": " def _create_fallback_content_data(self, raw_content: str, topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 创建回退的内容数据当JSON解析失败时\n\n Args:\n raw_content: 原始内容\n topic: 主题信息\n\n Returns:\n 回退的内容字典\n \"\"\"\n # 尝试从原始文本中提取结构化信息\n extracted_title = \"\"\n extracted_content = raw_content\n extracted_tag = \"\"\n\n # 简单的文本解析逻辑\n lines = raw_content.split('\\n')\n if lines:\n first_line = lines[0].strip()\n # 如果第一行较短且不以句号结尾,可能是标题\n if len(first_line) < 100 and not first_line.endswith('。'):\n extracted_title = first_line\n extracted_content = '\\n'.join(lines[1:]).strip()\n\n # 寻找标签(通常包含#符号)\n for line in lines:\n if '#' in line:\n extracted_tag = line.strip()\n break\n\n return {\n \"title\": extracted_title,\n \"content\": extracted_content,\n \"tag\": extracted_tag,\n \"word_count\": len(extracted_content),\n \"title_length\": len(extracted_title),\n \"tag_count\": extracted_tag.count('#'),\n \"metadata\": {\n \"source_topic\": topic,\n \"generation_config\": self.task_model_config,\n \"generated_at\": self.output_manager.run_id,\n \"format_version\": \"fallback\",\n \"parsing_method\": \"text_extraction\",\n \"original_raw_content\": raw_content[:500] + \"...\" if len(raw_content) > 500 else raw_content\n }\n }",
"code_hash": "461e0066e4cb7732bacb1e734eff651e"
},
{
"name": "get_generation_stats",
"line_start": 336,
"line_end": 354,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取生成统计信息\n\nReturns:\n 统计信息字典",
"is_async": false,
"decorators": [],
"code": " def get_generation_stats(self) -> Dict[str, Any]:\n \"\"\"\n 获取生成统计信息\n\n Returns:\n 统计信息字典\n \"\"\"\n return {\n \"task_model_config\": self.task_model_config,\n \"field_config\": self.field_config,\n \"output_directory\": str(self.output_manager.run_output_dir),\n \"ai_model_info\": self.ai_service.get_model_info(),\n \"prompt_templates\": self.prompt_manager.get_available_templates().get(\"content_generation\", {}),\n \"json_processor_enabled\": self.json_processor.enable_repair,\n \"content_config\": {\n \"enable_auto_judge\": self.config.content_generation.enable_auto_judge,\n \"judge_threshold\": self.config.content_generation.judge_threshold\n }\n } ",
"code_hash": "eb3347b32eb79d5e7ed3b2d84b3ffc31"
}
],
"classes": [
{
"name": "ContentGenerator",
"line_start": 20,
"line_end": 354,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 26,
"line_end": 48,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "AlgorithmConfig"
}
],
"return_type": null,
"docstring": "初始化内容生成器\n\nArgs:\n config: 算法配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容生成器\n\n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.output_manager = OutputManager(config.output)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n \n # 初始化JSON处理器\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置和字段配置\n self.task_model_config = config.ai_model.get_task_config(\"content_generation\")\n self.field_config = config.content_generation.result_field_mapping.get(\"content_generation\", {})\n \n logger.info(f\"内容生成器初始化完成,使用模型参数: {self.task_model_config}\")",
"code_hash": "a4aa3983bc905147da0ff79e7a39f46a"
},
{
"name": "generate_content",
"line_start": 50,
"line_end": 149,
"args": [
{
"name": "self"
},
{
"name": "style_content",
"type_hint": "str"
},
{
"name": "demand_content",
"type_hint": "str"
},
{
"name": "object_content",
"type_hint": "str"
},
{
"name": "refer_content",
"type_hint": "str"
},
{
"name": "product_content",
"type_hint": "str"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Tuple[str, Dict[str, Any]]",
"docstring": "生成内容\n\nArgs:\n style_content: 风格内容\n demand_content: 需求内容\n object_content: 对象内容(景区信息)\n refer_content: 参考内容\n product_content: 产品内容\n topic: 主题信息(可选)\n **kwargs: 其他参数\n\nReturns:\n Tuple[请求ID, 生成的内容字典]\n\nRaises:\n ContentGenerationError: 生成失败时抛出",
"is_async": true,
"decorators": [],
"code": " async def generate_content(\n self,\n style_content: str,\n demand_content: str,\n object_content: str,\n refer_content: str,\n product_content: str,\n topic: Optional[Dict[str, Any]] = None,\n **kwargs\n ) -> Tuple[str, Dict[str, Any]]:\n \"\"\"\n 生成内容\n\n Args:\n style_content: 风格内容\n demand_content: 需求内容\n object_content: 对象内容(景区信息)\n refer_content: 参考内容\n product_content: 产品内容\n topic: 主题信息(可选)\n **kwargs: 其他参数\n\n Returns:\n Tuple[请求ID, 生成的内容字典]\n\n Raises:\n ContentGenerationError: 生成失败时抛出\n \"\"\"\n try:\n topic_title = topic.get('title', 'Unknown') if topic else 'Direct Content'\n logger.info(f\"开始生成内容,主题: {topic_title}\")\n\n # 1. 构建提示词\n system_prompt = self.prompt_manager.get_prompt(\"content_generation\", \"system\")\n user_prompt_template = self.prompt_manager.get_prompt(\"content_generation\", \"user\")\n \n # 格式化用户提示词(根据实际模板格式)\n user_prompt = self.prompt_manager.format_prompt(\n user_prompt_template,\n style_content=style_content,\n demand_content=demand_content,\n object_content=object_content,\n refer_content=refer_content,\n product_content=product_content,\n **kwargs\n )\n\n # 创建子目录保存内容\n topic_id = topic.get('id', 'direct_content') if topic else 'direct_content'\n subfolder = f\"topic_{topic_id}\"\n\n # 保存提示词(如果配置允许)\n if self.config.output.save_prompts:\n self.output_manager.save_text(system_prompt, \"system_prompt\", \"content_generation\", subfolder)\n self.output_manager.save_text(user_prompt, \"user_prompt\", \"content_generation\", subfolder)\n\n # 2. 调用AI生成\n content, input_tokens, output_tokens, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"内容生成\",\n **self.task_model_config\n )\n\n # 保存原始响应(如果配置允许)\n if self.config.output.save_raw_responses:\n self.output_manager.save_text(content, \"raw_response\", \"content_generation\", subfolder)\n\n # 3. 解析和结构化内容\n content_data = self._parse_content_result(content, topic)\n\n # 4. 
保存结果\n self.output_manager.save_json(content_data, \"content\", \"\", subfolder)\n \n # 5. 保存元数据\n metadata = {\n \"topic\": topic,\n \"materials\": {\n \"style_content\": style_content[:100] + \"...\" if len(style_content) > 100 else style_content,\n \"demand_content\": demand_content[:100] + \"...\" if len(demand_content) > 100 else demand_content,\n \"object_content\": object_content[:100] + \"...\" if len(object_content) > 100 else object_content,\n \"product_content\": product_content[:100] + \"...\" if len(product_content) > 100 else product_content\n },\n \"field_config\": self.field_config,\n \"model_config\": self.task_model_config,\n \"tokens\": {\n \"input\": input_tokens,\n \"output\": output_tokens\n },\n \"elapsed_time\": elapsed_time\n }\n self.output_manager.save_metadata(metadata, \"content_generation\", subfolder)\n\n logger.info(f\"内容生成完成,字数: {len(content_data.get('content', ''))}\")\n return self.output_manager.run_id, content_data\n\n except Exception as e:\n error_msg = f\"内容生成失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise ContentGenerationError(error_msg)",
"code_hash": "196ed64edcbabdd0086aa86557a10918"
},
{
"name": "_parse_content_result",
"line_start": 151,
"line_end": 181,
"args": [
{
"name": "self"
},
{
"name": "raw_content",
"type_hint": "str"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "解析内容生成结果\n\nArgs:\n raw_content: AI原始输出\n topic: 原始主题信息\n\nReturns:\n 解析后的内容字典",
"is_async": false,
"decorators": [],
"code": " def _parse_content_result(self, raw_content: str, topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 解析内容生成结果\n\n Args:\n raw_content: AI原始输出\n topic: 原始主题信息\n\n Returns:\n 解析后的内容字典\n \"\"\"\n try:\n # 使用JSON处理器解析\n parsed_data = self.json_processor.parse_llm_output(\n raw_output=raw_content,\n expected_fields=self.field_config.get(\"expected_fields\", [\"title\", \"content\", \"tag\"]),\n required_fields=self.field_config.get(\"required_fields\", [\"title\", \"content\"])\n )\n\n # 根据实际提示词返回格式进行标准化\n if isinstance(parsed_data, dict):\n content_data = self._normalize_content_data(parsed_data, topic)\n else:\n # 如果不是预期的字典格式,创建默认结构\n content_data = self._create_fallback_content_data(raw_content, topic)\n\n return content_data\n\n except Exception as e:\n logger.warning(f\"JSON解析失败使用回退方案: {e}\")\n return self._create_fallback_content_data(raw_content, topic)",
"code_hash": "adf9e723962b8af5a76599afd15f51ee"
},
{
"name": "_normalize_content_data",
"line_start": 183,
"line_end": 235,
"args": [
{
"name": "self"
},
{
"name": "parsed_data",
"type_hint": "Dict[str, Any]"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "标准化内容数据格式\n\nArgs:\n parsed_data: 解析后的数据\n topic: 原始主题信息\n\nReturns:\n 标准化后的内容字典",
"is_async": false,
"decorators": [],
"code": " def _normalize_content_data(self, parsed_data: Dict[str, Any], topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 标准化内容数据格式\n\n Args:\n parsed_data: 解析后的数据\n topic: 原始主题信息\n\n Returns:\n 标准化后的内容字典\n \"\"\"\n # 根据实际提示词的输出格式进行标准化\n normalized_data = {\n # 核心字段根据实际system.txt的输出格式\n \"title\": parsed_data.get(\"title\", \"\"),\n \"content\": parsed_data.get(\"content\", \"\"),\n \"tag\": parsed_data.get(\"tag\", \"\"),\n \n # 计算字段\n \"word_count\": len(parsed_data.get(\"content\", \"\")),\n \"title_length\": len(parsed_data.get(\"title\", \"\")),\n \"tag_count\": len(parsed_data.get(\"tag\", \"\").split(\"#\")) - 1 if parsed_data.get(\"tag\") else 0,\n \n # 元数据\n \"metadata\": {\n \"source_topic\": topic,\n \"generation_config\": self.task_model_config,\n \"field_config\": self.field_config,\n \"generated_at\": self.output_manager.run_id,\n \"format_version\": \"v1\",\n \"original_parsed_data\": parsed_data\n }\n }\n\n # 验证关键字段\n if not normalized_data[\"title\"] or not normalized_data[\"content\"]:\n logger.warning(\"解析结果缺少关键字段,尝试从原始数据提取\")\n \n # 尝试其他可能的字段名\n alternative_mappings = {\n \"title\": [\"标题\", \"主题\", \"title\", \"headline\"],\n \"content\": [\"内容\", \"正文\", \"content\", \"text\", \"body\"],\n \"tag\": [\"标签\", \"tag\", \"tags\", \"labels\"]\n }\n \n for std_field, alternatives in alternative_mappings.items():\n if not normalized_data[std_field]:\n for alt_field in alternatives:\n if alt_field in parsed_data and parsed_data[alt_field]:\n normalized_data[std_field] = parsed_data[alt_field]\n break\n\n return normalized_data",
"code_hash": "5bfb220ef95ac52397bb577a0a8ce558"
},
{
"name": "_create_fallback_content_data",
"line_start": 237,
"line_end": 283,
"args": [
{
"name": "self"
},
{
"name": "raw_content",
"type_hint": "str"
},
{
"name": "topic",
"type_hint": "Optional[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "创建回退的内容数据当JSON解析失败时\n\nArgs:\n raw_content: 原始内容\n topic: 主题信息\n\nReturns:\n 回退的内容字典",
"is_async": false,
"decorators": [],
"code": " def _create_fallback_content_data(self, raw_content: str, topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 创建回退的内容数据当JSON解析失败时\n\n Args:\n raw_content: 原始内容\n topic: 主题信息\n\n Returns:\n 回退的内容字典\n \"\"\"\n # 尝试从原始文本中提取结构化信息\n extracted_title = \"\"\n extracted_content = raw_content\n extracted_tag = \"\"\n\n # 简单的文本解析逻辑\n lines = raw_content.split('\\n')\n if lines:\n first_line = lines[0].strip()\n # 如果第一行较短且不以句号结尾,可能是标题\n if len(first_line) < 100 and not first_line.endswith('。'):\n extracted_title = first_line\n extracted_content = '\\n'.join(lines[1:]).strip()\n\n # 寻找标签(通常包含#符号)\n for line in lines:\n if '#' in line:\n extracted_tag = line.strip()\n break\n\n return {\n \"title\": extracted_title,\n \"content\": extracted_content,\n \"tag\": extracted_tag,\n \"word_count\": len(extracted_content),\n \"title_length\": len(extracted_title),\n \"tag_count\": extracted_tag.count('#'),\n \"metadata\": {\n \"source_topic\": topic,\n \"generation_config\": self.task_model_config,\n \"generated_at\": self.output_manager.run_id,\n \"format_version\": \"fallback\",\n \"parsing_method\": \"text_extraction\",\n \"original_raw_content\": raw_content[:500] + \"...\" if len(raw_content) > 500 else raw_content\n }\n }",
"code_hash": "461e0066e4cb7732bacb1e734eff651e"
},
{
"name": "generate_content_batch",
"line_start": 285,
"line_end": 311,
"args": [
{
"name": "self"
},
{
"name": "content_requests",
"type_hint": "List[Dict[str, Any]]"
}
],
"return_type": "Dict[str, Dict[str, Any]]",
"docstring": "批量生成内容\n\nArgs:\n content_requests: 内容生成请求列表\n\nReturns:\n 批次ID->内容字典的映射",
"is_async": true,
"decorators": [],
"code": " async def generate_content_batch(\n self,\n content_requests: List[Dict[str, Any]]\n ) -> Dict[str, Dict[str, Any]]:\n \"\"\"\n 批量生成内容\n\n Args:\n content_requests: 内容生成请求列表\n\n Returns:\n 批次ID->内容字典的映射\n \"\"\"\n results = {}\n \n for i, request in enumerate(content_requests):\n try:\n logger.info(f\"批量生成内容 {i+1}/{len(content_requests)}\")\n \n request_id, content_data = await self.generate_content(**request)\n results[f\"request_{i+1}\"] = content_data\n \n except Exception as e:\n logger.error(f\"批量生成第 {i+1} 项失败: {e}\")\n results[f\"request_{i+1}\"] = {\"error\": str(e)}\n \n return results",
"code_hash": "aefbf0a4c79966d70c56a35d0ca43ee3"
},
{
"name": "test_generation",
"line_start": 313,
"line_end": 334,
"args": [
{
"name": "self"
}
],
"return_type": "bool",
"docstring": "测试内容生成功能\n\nReturns:\n 测试是否成功",
"is_async": true,
"decorators": [],
"code": " async def test_generation(self) -> bool:\n \"\"\"\n 测试内容生成功能\n\n Returns:\n 测试是否成功\n \"\"\"\n try:\n test_materials = {\n \"style_content\": \"攻略风格文案\",\n \"demand_content\": \"年轻人周末游需求\",\n \"object_content\": \"上海外滩景区信息\",\n \"refer_content\": \"参考文案范例\",\n \"product_content\": \"外滩一日游产品\"\n }\n \n _, content_data = await self.generate_content(**test_materials)\n return len(content_data.get('content', '')) > 50\n \n except Exception as e:\n logger.error(f\"内容生成测试失败: {e}\")\n return False",
"code_hash": "e66518f839586eee90c7ac8425115fea"
},
{
"name": "get_generation_stats",
"line_start": 336,
"line_end": 354,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取生成统计信息\n\nReturns:\n 统计信息字典",
"is_async": false,
"decorators": [],
"code": " def get_generation_stats(self) -> Dict[str, Any]:\n \"\"\"\n 获取生成统计信息\n\n Returns:\n 统计信息字典\n \"\"\"\n return {\n \"task_model_config\": self.task_model_config,\n \"field_config\": self.field_config,\n \"output_directory\": str(self.output_manager.run_output_dir),\n \"ai_model_info\": self.ai_service.get_model_info(),\n \"prompt_templates\": self.prompt_manager.get_available_templates().get(\"content_generation\", {}),\n \"json_processor_enabled\": self.json_processor.enable_repair,\n \"content_config\": {\n \"enable_auto_judge\": self.config.content_generation.enable_auto_judge,\n \"judge_threshold\": self.config.content_generation.judge_threshold\n }\n } ",
"code_hash": "eb3347b32eb79d5e7ed3b2d84b3ffc31"
}
],
"docstring": "内容生成器 - 重构版本\n负责根据主题生成详细的旅游内容支持真实的字段格式",
"decorators": [],
"code": "class ContentGenerator:\n \"\"\"\n 内容生成器 - 重构版本\n 负责根据主题生成详细的旅游内容,支持真实的字段格式\n \"\"\"\n\n def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化内容生成器\n\n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.ai_service = AIService(config.ai_model)\n self.output_manager = OutputManager(config.output)\n self.prompt_manager = PromptManager(config.prompts, config.resources)\n \n # 初始化JSON处理器\n self.json_processor = JSONProcessor(\n enable_repair=config.content_generation.enable_json_repair,\n max_repair_attempts=config.content_generation.json_repair_attempts\n )\n \n # 获取任务特定的模型配置和字段配置\n self.task_model_config = config.ai_model.get_task_config(\"content_generation\")\n self.field_config = config.content_generation.result_field_mapping.get(\"content_generation\", {})\n \n logger.info(f\"内容生成器初始化完成,使用模型参数: {self.task_model_config}\")\n\n async def generate_content(\n self,\n style_content: str,\n demand_content: str,\n object_content: str,\n refer_content: str,\n product_content: str,\n topic: Optional[Dict[str, Any]] = None,\n **kwargs\n ) -> Tuple[str, Dict[str, Any]]:\n \"\"\"\n 生成内容\n\n Args:\n style_content: 风格内容\n demand_content: 需求内容\n object_content: 对象内容(景区信息)\n refer_content: 参考内容\n product_content: 产品内容\n topic: 主题信息(可选)\n **kwargs: 其他参数\n\n Returns:\n Tuple[请求ID, 生成的内容字典]\n\n Raises:\n ContentGenerationError: 生成失败时抛出\n \"\"\"\n try:\n topic_title = topic.get('title', 'Unknown') if topic else 'Direct Content'\n logger.info(f\"开始生成内容,主题: {topic_title}\")\n\n # 1. 
构建提示词\n system_prompt = self.prompt_manager.get_prompt(\"content_generation\", \"system\")\n user_prompt_template = self.prompt_manager.get_prompt(\"content_generation\", \"user\")\n \n # 格式化用户提示词(根据实际模板格式)\n user_prompt = self.prompt_manager.format_prompt(\n user_prompt_template,\n style_content=style_content,\n demand_content=demand_content,\n object_content=object_content,\n refer_content=refer_content,\n product_content=product_content,\n **kwargs\n )\n\n # 创建子目录保存内容\n topic_id = topic.get('id', 'direct_content') if topic else 'direct_content'\n subfolder = f\"topic_{topic_id}\"\n\n # 保存提示词(如果配置允许)\n if self.config.output.save_prompts:\n self.output_manager.save_text(system_prompt, \"system_prompt\", \"content_generation\", subfolder)\n self.output_manager.save_text(user_prompt, \"user_prompt\", \"content_generation\", subfolder)\n\n # 2. 调用AI生成\n content, input_tokens, output_tokens, elapsed_time = await self.ai_service.generate_text(\n system_prompt=system_prompt,\n user_prompt=user_prompt,\n stage=\"内容生成\",\n **self.task_model_config\n )\n\n # 保存原始响应(如果配置允许)\n if self.config.output.save_raw_responses:\n self.output_manager.save_text(content, \"raw_response\", \"content_generation\", subfolder)\n\n # 3. 解析和结构化内容\n content_data = self._parse_content_result(content, topic)\n\n # 4. 保存结果\n self.output_manager.save_json(content_data, \"content\", \"\", subfolder)\n \n # 5. 
保存元数据\n metadata = {\n \"topic\": topic,\n \"materials\": {\n \"style_content\": style_content[:100] + \"...\" if len(style_content) > 100 else style_content,\n \"demand_content\": demand_content[:100] + \"...\" if len(demand_content) > 100 else demand_content,\n \"object_content\": object_content[:100] + \"...\" if len(object_content) > 100 else object_content,\n \"product_content\": product_content[:100] + \"...\" if len(product_content) > 100 else product_content\n },\n \"field_config\": self.field_config,\n \"model_config\": self.task_model_config,\n \"tokens\": {\n \"input\": input_tokens,\n \"output\": output_tokens\n },\n \"elapsed_time\": elapsed_time\n }\n self.output_manager.save_metadata(metadata, \"content_generation\", subfolder)\n\n logger.info(f\"内容生成完成,字数: {len(content_data.get('content', ''))}\")\n return self.output_manager.run_id, content_data\n\n except Exception as e:\n error_msg = f\"内容生成失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise ContentGenerationError(error_msg)\n\n def _parse_content_result(self, raw_content: str, topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 解析内容生成结果\n\n Args:\n raw_content: AI原始输出\n topic: 原始主题信息\n\n Returns:\n 解析后的内容字典\n \"\"\"\n try:\n # 使用JSON处理器解析\n parsed_data = self.json_processor.parse_llm_output(\n raw_output=raw_content,\n expected_fields=self.field_config.get(\"expected_fields\", [\"title\", \"content\", \"tag\"]),\n required_fields=self.field_config.get(\"required_fields\", [\"title\", \"content\"])\n )\n\n # 根据实际提示词返回格式进行标准化\n if isinstance(parsed_data, dict):\n content_data = self._normalize_content_data(parsed_data, topic)\n else:\n # 如果不是预期的字典格式,创建默认结构\n content_data = self._create_fallback_content_data(raw_content, topic)\n\n return content_data\n\n except Exception as e:\n logger.warning(f\"JSON解析失败使用回退方案: {e}\")\n return self._create_fallback_content_data(raw_content, topic)\n\n def _normalize_content_data(self, parsed_data: Dict[str, Any], topic: 
Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 标准化内容数据格式\n\n Args:\n parsed_data: 解析后的数据\n topic: 原始主题信息\n\n Returns:\n 标准化后的内容字典\n \"\"\"\n # 根据实际提示词的输出格式进行标准化\n normalized_data = {\n # 核心字段根据实际system.txt的输出格式\n \"title\": parsed_data.get(\"title\", \"\"),\n \"content\": parsed_data.get(\"content\", \"\"),\n \"tag\": parsed_data.get(\"tag\", \"\"),\n \n # 计算字段\n \"word_count\": len(parsed_data.get(\"content\", \"\")),\n \"title_length\": len(parsed_data.get(\"title\", \"\")),\n \"tag_count\": len(parsed_data.get(\"tag\", \"\").split(\"#\")) - 1 if parsed_data.get(\"tag\") else 0,\n \n # 元数据\n \"metadata\": {\n \"source_topic\": topic,\n \"generation_config\": self.task_model_config,\n \"field_config\": self.field_config,\n \"generated_at\": self.output_manager.run_id,\n \"format_version\": \"v1\",\n \"original_parsed_data\": parsed_data\n }\n }\n\n # 验证关键字段\n if not normalized_data[\"title\"] or not normalized_data[\"content\"]:\n logger.warning(\"解析结果缺少关键字段,尝试从原始数据提取\")\n \n # 尝试其他可能的字段名\n alternative_mappings = {\n \"title\": [\"标题\", \"主题\", \"title\", \"headline\"],\n \"content\": [\"内容\", \"正文\", \"content\", \"text\", \"body\"],\n \"tag\": [\"标签\", \"tag\", \"tags\", \"labels\"]\n }\n \n for std_field, alternatives in alternative_mappings.items():\n if not normalized_data[std_field]:\n for alt_field in alternatives:\n if alt_field in parsed_data and parsed_data[alt_field]:\n normalized_data[std_field] = parsed_data[alt_field]\n break\n\n return normalized_data\n\n def _create_fallback_content_data(self, raw_content: str, topic: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:\n \"\"\"\n 创建回退的内容数据当JSON解析失败时\n\n Args:\n raw_content: 原始内容\n topic: 主题信息\n\n Returns:\n 回退的内容字典\n \"\"\"\n # 尝试从原始文本中提取结构化信息\n extracted_title = \"\"\n extracted_content = raw_content\n extracted_tag = \"\"\n\n # 简单的文本解析逻辑\n lines = raw_content.split('\\n')\n if lines:\n first_line = lines[0].strip()\n # 如果第一行较短且不以句号结尾,可能是标题\n if len(first_line) < 100 and not 
first_line.endswith('。'):\n extracted_title = first_line\n extracted_content = '\\n'.join(lines[1:]).strip()\n\n # 寻找标签(通常包含#符号)\n for line in lines:\n if '#' in line:\n extracted_tag = line.strip()\n break\n\n return {\n \"title\": extracted_title,\n \"content\": extracted_content,\n \"tag\": extracted_tag,\n \"word_count\": len(extracted_content),\n \"title_length\": len(extracted_title),\n \"tag_count\": extracted_tag.count('#'),\n \"metadata\": {\n \"source_topic\": topic,\n \"generation_config\": self.task_model_config,\n \"generated_at\": self.output_manager.run_id,\n \"format_version\": \"fallback\",\n \"parsing_method\": \"text_extraction\",\n \"original_raw_content\": raw_content[:500] + \"...\" if len(raw_content) > 500 else raw_content\n }\n }\n\n async def generate_content_batch(\n self,\n content_requests: List[Dict[str, Any]]\n ) -> Dict[str, Dict[str, Any]]:\n \"\"\"\n 批量生成内容\n\n Args:\n content_requests: 内容生成请求列表\n\n Returns:\n 批次ID->内容字典的映射\n \"\"\"\n results = {}\n \n for i, request in enumerate(content_requests):\n try:\n logger.info(f\"批量生成内容 {i+1}/{len(content_requests)}\")\n \n request_id, content_data = await self.generate_content(**request)\n results[f\"request_{i+1}\"] = content_data\n \n except Exception as e:\n logger.error(f\"批量生成第 {i+1} 项失败: {e}\")\n results[f\"request_{i+1}\"] = {\"error\": str(e)}\n \n return results\n\n async def test_generation(self) -> bool:\n \"\"\"\n 测试内容生成功能\n\n Returns:\n 测试是否成功\n \"\"\"\n try:\n test_materials = {\n \"style_content\": \"攻略风格文案\",\n \"demand_content\": \"年轻人周末游需求\",\n \"object_content\": \"上海外滩景区信息\",\n \"refer_content\": \"参考文案范例\",\n \"product_content\": \"外滩一日游产品\"\n }\n \n _, content_data = await self.generate_content(**test_materials)\n return len(content_data.get('content', '')) > 50\n \n except Exception as e:\n logger.error(f\"内容生成测试失败: {e}\")\n return False\n\n def get_generation_stats(self) -> Dict[str, Any]:\n \"\"\"\n 获取生成统计信息\n\n Returns:\n 统计信息字典\n \"\"\"\n return {\n 
\"task_model_config\": self.task_model_config,\n \"field_config\": self.field_config,\n \"output_directory\": str(self.output_manager.run_output_dir),\n \"ai_model_info\": self.ai_service.get_model_info(),\n \"prompt_templates\": self.prompt_manager.get_available_templates().get(\"content_generation\", {}),\n \"json_processor_enabled\": self.json_processor.enable_repair,\n \"content_config\": {\n \"enable_auto_judge\": self.config.content_generation.enable_auto_judge,\n \"judge_threshold\": self.config.content_generation.judge_threshold\n }\n } ",
"code_hash": "274f5ef2a9e350ed1c1759427106d4f3"
}
],
"imports": [
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"Dict",
"Any",
"List",
"Optional",
"Tuple"
],
"aliases": [],
"level": 0
},
{
"type": "import",
"modules": [
"json"
],
"aliases": []
},
{
"type": "from_import",
"module": "config",
"names": [
"AlgorithmConfig"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "core",
"names": [
"AIService",
"OutputManager",
"PromptManager",
"JSONProcessor"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "exceptions",
"names": [
"ContentGenerationError"
],
"aliases": [],
"level": 2
}
],
"constants": [],
"docstring": "Content Generator\n内容生成器 - 重构版本使用动态提示词和JSON处理支持真实的内容格式",
"content_hash": "fc162f397f337e584d5951ca18c75f9a"
}