2025-07-31 15:35:23 +08:00

331 lines
30 KiB
JSON
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"file_path": "travel-algorithms/travel_algorithms/content_generation/topic_parser.py",
"file_size": 6727,
"line_count": 205,
"functions": [
{
"name": "__init__",
"line_start": 23,
"line_end": 29,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化解析器",
"is_async": false,
"decorators": [],
"code": " def __init__(self):\n \"\"\"初始化解析器\"\"\"\n self.json_pattern = re.compile(r'```json\\s*(.*?)\\s*```', re.DOTALL)\n self.fallback_patterns = [\n re.compile(r'\\{.*\\}', re.DOTALL), # 匹配JSON对象\n re.compile(r'\\[.*\\]', re.DOTALL), # 匹配JSON数组\n ]",
"code_hash": "2f157d84e962e5b08cd0c5789ec1e2c4"
},
{
"name": "parse",
"line_start": 31,
"line_end": 58,
"args": [
{
"name": "self"
},
{
"name": "raw_content",
"type_hint": "str"
}
],
"return_type": "List[Dict[str, Any]]",
"docstring": "解析主题内容\n\nArgs:\n raw_content: AI生成的原始内容\n\nReturns:\n 解析后的主题列表",
"is_async": false,
"decorators": [],
"code": " def parse(self, raw_content: str) -> List[Dict[str, Any]]:\n \"\"\"\n 解析主题内容\n\n Args:\n raw_content: AI生成的原始内容\n\n Returns:\n 解析后的主题列表\n \"\"\"\n if not raw_content or not raw_content.strip():\n logger.warning(\"输入内容为空\")\n return []\n\n # 尝试多种解析方法\n topics = (\n self._parse_json_block(raw_content) or\n self._parse_direct_json(raw_content) or\n self._parse_structured_text(raw_content) or\n []\n )\n\n if topics:\n logger.info(f\"成功解析出 {len(topics)} 个主题\")\n return self._validate_topics(topics)\n else:\n logger.error(\"未能解析出任何有效主题\")\n return []",
"code_hash": "fd6289431a9b868b63cb71ea21fd73b7"
},
{
"name": "_parse_json_block",
"line_start": 60,
"line_end": 79,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[List[Dict[str, Any]]]",
"docstring": "解析```json```代码块中的JSON",
"is_async": false,
"decorators": [],
"code": " def _parse_json_block(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析```json```代码块中的JSON\"\"\"\n try:\n matches = self.json_pattern.findall(content)\n if matches:\n json_str = matches[0].strip()\n data = json.loads(json_str)\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n # 如果是字典尝试提取topics字段\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception) as e:\n logger.debug(f\"JSON代码块解析失败: {e}\")\n return None",
"code_hash": "a70c065e9c40432ae3e59cc2e73d486a"
},
{
"name": "_parse_direct_json",
"line_start": 81,
"line_end": 108,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[List[Dict[str, Any]]]",
"docstring": "直接解析JSON内容",
"is_async": false,
"decorators": [],
"code": " def _parse_direct_json(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"直接解析JSON内容\"\"\"\n try:\n # 尝试直接解析整个内容\n data = json.loads(content.strip())\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception):\n # 尝试使用fallback patterns\n for pattern in self.fallback_patterns:\n matches = pattern.findall(content)\n for match in matches:\n try:\n data = json.loads(match)\n if isinstance(data, list) and len(data) > 0:\n return data\n elif isinstance(data, dict):\n return [data]\n except json.JSONDecodeError:\n continue\n return None",
"code_hash": "bbc1f94feb212a1b0ee92114f9c95bc5"
},
{
"name": "_parse_structured_text",
"line_start": 110,
"line_end": 159,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[List[Dict[str, Any]]]",
"docstring": "解析结构化文本格式",
"is_async": false,
"decorators": [],
"code": " def _parse_structured_text(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析结构化文本格式\"\"\"\n try:\n topics = []\n lines = content.split('\\n')\n current_topic = {}\n \n for line in lines:\n line = line.strip()\n if not line:\n continue\n \n # 检测主题开始(数字编号)\n if re.match(r'^\\d+[\\.\\)]\\s*', line):\n if current_topic:\n topics.append(current_topic)\n current_topic = {}\n title = re.sub(r'^\\d+[\\.\\)]\\s*', '', line)\n current_topic['title'] = title\n \n # 检测字段\n elif ':' in line and current_topic:\n key, value = line.split(':', 1)\n key = key.strip().lower()\n value = value.strip()\n \n # 映射中文字段到英文\n field_mapping = {\n '标题': 'title',\n '主题': 'title', \n '目标': 'target',\n '目标人群': 'target',\n '内容': 'content',\n '描述': 'content',\n '关键词': 'keywords',\n '标签': 'tags'\n }\n \n mapped_key = field_mapping.get(key, key)\n current_topic[mapped_key] = value\n \n # 添加最后一个主题\n if current_topic:\n topics.append(current_topic)\n \n return topics if topics else None\n \n except Exception as e:\n logger.debug(f\"结构化文本解析失败: {e}\")\n return None",
"code_hash": "826c5e5160fe9b910eb34e70e657c661"
},
{
"name": "_validate_topics",
"line_start": 161,
"line_end": 193,
"args": [
{
"name": "self"
},
{
"name": "topics",
"type_hint": "List[Dict[str, Any]]"
}
],
"return_type": "List[Dict[str, Any]]",
"docstring": "验证和标准化主题数据",
"is_async": false,
"decorators": [],
"code": " def _validate_topics(self, topics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n \"\"\"验证和标准化主题数据\"\"\"\n validated_topics = []\n \n for i, topic in enumerate(topics):\n if not isinstance(topic, dict):\n logger.warning(f\"主题 {i+1} 不是字典格式,跳过\")\n continue\n \n # 确保必需字段存在\n validated_topic = {\n 'id': topic.get('id', f\"topic_{i+1}\"),\n 'title': topic.get('title', topic.get('主题', f\"主题{i+1}\")),\n 'content': topic.get('content', topic.get('描述', topic.get('内容', ''))),\n 'target': topic.get('target', topic.get('目标人群', '通用')),\n 'keywords': topic.get('keywords', topic.get('关键词', [])),\n 'tags': topic.get('tags', topic.get('标签', [])),\n 'metadata': topic.get('metadata', {})\n }\n \n # 确保keywords和tags是列表\n if isinstance(validated_topic['keywords'], str):\n validated_topic['keywords'] = [kw.strip() for kw in validated_topic['keywords'].split(',')]\n if isinstance(validated_topic['tags'], str):\n validated_topic['tags'] = [tag.strip() for tag in validated_topic['tags'].split(',')]\n \n # 只保留有标题的主题\n if validated_topic['title']:\n validated_topics.append(validated_topic)\n else:\n logger.warning(f\"主题 {i+1} 缺少标题,跳过\")\n \n return validated_topics",
"code_hash": "3dfeb3356b59c3101937584477962a06"
},
{
"name": "parse_single_topic",
"line_start": 195,
"line_end": 206,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[Dict[str, Any]]",
"docstring": "解析单个主题\n\nArgs:\n content: 主题内容\n\nReturns:\n 解析后的主题字典",
"is_async": false,
"decorators": [],
"code": " def parse_single_topic(self, content: str) -> Optional[Dict[str, Any]]:\n \"\"\"\n 解析单个主题\n\n Args:\n content: 主题内容\n\n Returns:\n 解析后的主题字典\n \"\"\"\n topics = self.parse(content)\n return topics[0] if topics else None ",
"code_hash": "4b6e0b313fdf155b6f375314ae7da82b"
}
],
"classes": [
{
"name": "TopicParser",
"line_start": 17,
"line_end": 206,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 23,
"line_end": 29,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化解析器",
"is_async": false,
"decorators": [],
"code": " def __init__(self):\n \"\"\"初始化解析器\"\"\"\n self.json_pattern = re.compile(r'```json\\s*(.*?)\\s*```', re.DOTALL)\n self.fallback_patterns = [\n re.compile(r'\\{.*\\}', re.DOTALL), # 匹配JSON对象\n re.compile(r'\\[.*\\]', re.DOTALL), # 匹配JSON数组\n ]",
"code_hash": "2f157d84e962e5b08cd0c5789ec1e2c4"
},
{
"name": "parse",
"line_start": 31,
"line_end": 58,
"args": [
{
"name": "self"
},
{
"name": "raw_content",
"type_hint": "str"
}
],
"return_type": "List[Dict[str, Any]]",
"docstring": "解析主题内容\n\nArgs:\n raw_content: AI生成的原始内容\n\nReturns:\n 解析后的主题列表",
"is_async": false,
"decorators": [],
"code": " def parse(self, raw_content: str) -> List[Dict[str, Any]]:\n \"\"\"\n 解析主题内容\n\n Args:\n raw_content: AI生成的原始内容\n\n Returns:\n 解析后的主题列表\n \"\"\"\n if not raw_content or not raw_content.strip():\n logger.warning(\"输入内容为空\")\n return []\n\n # 尝试多种解析方法\n topics = (\n self._parse_json_block(raw_content) or\n self._parse_direct_json(raw_content) or\n self._parse_structured_text(raw_content) or\n []\n )\n\n if topics:\n logger.info(f\"成功解析出 {len(topics)} 个主题\")\n return self._validate_topics(topics)\n else:\n logger.error(\"未能解析出任何有效主题\")\n return []",
"code_hash": "fd6289431a9b868b63cb71ea21fd73b7"
},
{
"name": "_parse_json_block",
"line_start": 60,
"line_end": 79,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[List[Dict[str, Any]]]",
"docstring": "解析```json```代码块中的JSON",
"is_async": false,
"decorators": [],
"code": " def _parse_json_block(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析```json```代码块中的JSON\"\"\"\n try:\n matches = self.json_pattern.findall(content)\n if matches:\n json_str = matches[0].strip()\n data = json.loads(json_str)\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n # 如果是字典尝试提取topics字段\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception) as e:\n logger.debug(f\"JSON代码块解析失败: {e}\")\n return None",
"code_hash": "a70c065e9c40432ae3e59cc2e73d486a"
},
{
"name": "_parse_direct_json",
"line_start": 81,
"line_end": 108,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[List[Dict[str, Any]]]",
"docstring": "直接解析JSON内容",
"is_async": false,
"decorators": [],
"code": " def _parse_direct_json(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"直接解析JSON内容\"\"\"\n try:\n # 尝试直接解析整个内容\n data = json.loads(content.strip())\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception):\n # 尝试使用fallback patterns\n for pattern in self.fallback_patterns:\n matches = pattern.findall(content)\n for match in matches:\n try:\n data = json.loads(match)\n if isinstance(data, list) and len(data) > 0:\n return data\n elif isinstance(data, dict):\n return [data]\n except json.JSONDecodeError:\n continue\n return None",
"code_hash": "bbc1f94feb212a1b0ee92114f9c95bc5"
},
{
"name": "_parse_structured_text",
"line_start": 110,
"line_end": 159,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[List[Dict[str, Any]]]",
"docstring": "解析结构化文本格式",
"is_async": false,
"decorators": [],
"code": " def _parse_structured_text(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析结构化文本格式\"\"\"\n try:\n topics = []\n lines = content.split('\\n')\n current_topic = {}\n \n for line in lines:\n line = line.strip()\n if not line:\n continue\n \n # 检测主题开始(数字编号)\n if re.match(r'^\\d+[\\.\\)]\\s*', line):\n if current_topic:\n topics.append(current_topic)\n current_topic = {}\n title = re.sub(r'^\\d+[\\.\\)]\\s*', '', line)\n current_topic['title'] = title\n \n # 检测字段\n elif ':' in line and current_topic:\n key, value = line.split(':', 1)\n key = key.strip().lower()\n value = value.strip()\n \n # 映射中文字段到英文\n field_mapping = {\n '标题': 'title',\n '主题': 'title', \n '目标': 'target',\n '目标人群': 'target',\n '内容': 'content',\n '描述': 'content',\n '关键词': 'keywords',\n '标签': 'tags'\n }\n \n mapped_key = field_mapping.get(key, key)\n current_topic[mapped_key] = value\n \n # 添加最后一个主题\n if current_topic:\n topics.append(current_topic)\n \n return topics if topics else None\n \n except Exception as e:\n logger.debug(f\"结构化文本解析失败: {e}\")\n return None",
"code_hash": "826c5e5160fe9b910eb34e70e657c661"
},
{
"name": "_validate_topics",
"line_start": 161,
"line_end": 193,
"args": [
{
"name": "self"
},
{
"name": "topics",
"type_hint": "List[Dict[str, Any]]"
}
],
"return_type": "List[Dict[str, Any]]",
"docstring": "验证和标准化主题数据",
"is_async": false,
"decorators": [],
"code": " def _validate_topics(self, topics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n \"\"\"验证和标准化主题数据\"\"\"\n validated_topics = []\n \n for i, topic in enumerate(topics):\n if not isinstance(topic, dict):\n logger.warning(f\"主题 {i+1} 不是字典格式,跳过\")\n continue\n \n # 确保必需字段存在\n validated_topic = {\n 'id': topic.get('id', f\"topic_{i+1}\"),\n 'title': topic.get('title', topic.get('主题', f\"主题{i+1}\")),\n 'content': topic.get('content', topic.get('描述', topic.get('内容', ''))),\n 'target': topic.get('target', topic.get('目标人群', '通用')),\n 'keywords': topic.get('keywords', topic.get('关键词', [])),\n 'tags': topic.get('tags', topic.get('标签', [])),\n 'metadata': topic.get('metadata', {})\n }\n \n # 确保keywords和tags是列表\n if isinstance(validated_topic['keywords'], str):\n validated_topic['keywords'] = [kw.strip() for kw in validated_topic['keywords'].split(',')]\n if isinstance(validated_topic['tags'], str):\n validated_topic['tags'] = [tag.strip() for tag in validated_topic['tags'].split(',')]\n \n # 只保留有标题的主题\n if validated_topic['title']:\n validated_topics.append(validated_topic)\n else:\n logger.warning(f\"主题 {i+1} 缺少标题,跳过\")\n \n return validated_topics",
"code_hash": "3dfeb3356b59c3101937584477962a06"
},
{
"name": "parse_single_topic",
"line_start": 195,
"line_end": 206,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "Optional[Dict[str, Any]]",
"docstring": "解析单个主题\n\nArgs:\n content: 主题内容\n\nReturns:\n 解析后的主题字典",
"is_async": false,
"decorators": [],
"code": " def parse_single_topic(self, content: str) -> Optional[Dict[str, Any]]:\n \"\"\"\n 解析单个主题\n\n Args:\n content: 主题内容\n\n Returns:\n 解析后的主题字典\n \"\"\"\n topics = self.parse(content)\n return topics[0] if topics else None ",
"code_hash": "4b6e0b313fdf155b6f375314ae7da82b"
}
],
"docstring": "主题解析器\n负责从AI生成的文本中解析出结构化的主题数据",
"decorators": [],
"code": "class TopicParser:\n \"\"\"\n 主题解析器\n 负责从AI生成的文本中解析出结构化的主题数据\n \"\"\"\n\n def __init__(self):\n \"\"\"初始化解析器\"\"\"\n self.json_pattern = re.compile(r'```json\\s*(.*?)\\s*```', re.DOTALL)\n self.fallback_patterns = [\n re.compile(r'\\{.*\\}', re.DOTALL), # 匹配JSON对象\n re.compile(r'\\[.*\\]', re.DOTALL), # 匹配JSON数组\n ]\n\n def parse(self, raw_content: str) -> List[Dict[str, Any]]:\n \"\"\"\n 解析主题内容\n\n Args:\n raw_content: AI生成的原始内容\n\n Returns:\n 解析后的主题列表\n \"\"\"\n if not raw_content or not raw_content.strip():\n logger.warning(\"输入内容为空\")\n return []\n\n # 尝试多种解析方法\n topics = (\n self._parse_json_block(raw_content) or\n self._parse_direct_json(raw_content) or\n self._parse_structured_text(raw_content) or\n []\n )\n\n if topics:\n logger.info(f\"成功解析出 {len(topics)} 个主题\")\n return self._validate_topics(topics)\n else:\n logger.error(\"未能解析出任何有效主题\")\n return []\n\n def _parse_json_block(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析```json```代码块中的JSON\"\"\"\n try:\n matches = self.json_pattern.findall(content)\n if matches:\n json_str = matches[0].strip()\n data = json.loads(json_str)\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n # 如果是字典尝试提取topics字段\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception) as e:\n logger.debug(f\"JSON代码块解析失败: {e}\")\n return None\n\n def _parse_direct_json(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"直接解析JSON内容\"\"\"\n try:\n # 尝试直接解析整个内容\n data = json.loads(content.strip())\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception):\n # 尝试使用fallback patterns\n for pattern in self.fallback_patterns:\n matches = pattern.findall(content)\n for match in matches:\n try:\n data = json.loads(match)\n if isinstance(data, list) and len(data) > 0:\n return data\n elif isinstance(data, dict):\n return [data]\n except json.JSONDecodeError:\n continue\n return None\n\n def _parse_structured_text(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析结构化文本格式\"\"\"\n try:\n topics = []\n lines = content.split('\\n')\n current_topic = {}\n \n for line in lines:\n line = line.strip()\n if not line:\n continue\n \n # 检测主题开始(数字编号)\n if re.match(r'^\\d+[\\.\\)]\\s*', line):\n if current_topic:\n topics.append(current_topic)\n current_topic = {}\n title = re.sub(r'^\\d+[\\.\\)]\\s*', '', line)\n current_topic['title'] = title\n \n # 检测字段\n elif ':' in line and current_topic:\n key, value = line.split(':', 1)\n key = key.strip().lower()\n value = value.strip()\n \n # 映射中文字段到英文\n field_mapping = {\n '标题': 'title',\n '主题': 'title', \n '目标': 'target',\n '目标人群': 'target',\n '内容': 'content',\n '描述': 'content',\n '关键词': 'keywords',\n '标签': 'tags'\n }\n \n mapped_key = field_mapping.get(key, key)\n current_topic[mapped_key] = value\n \n # 添加最后一个主题\n if current_topic:\n topics.append(current_topic)\n \n return topics if topics else None\n \n except Exception as e:\n logger.debug(f\"结构化文本解析失败: {e}\")\n return None\n\n def _validate_topics(self, topics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n \"\"\"验证和标准化主题数据\"\"\"\n validated_topics = []\n \n for i, topic in enumerate(topics):\n if not isinstance(topic, dict):\n logger.warning(f\"主题 {i+1} 不是字典格式,跳过\")\n continue\n \n # 确保必需字段存在\n validated_topic = {\n 'id': topic.get('id', f\"topic_{i+1}\"),\n 'title': topic.get('title', topic.get('主题', f\"主题{i+1}\")),\n 'content': topic.get('content', topic.get('描述', topic.get('内容', ''))),\n 'target': topic.get('target', topic.get('目标人群', '通用')),\n 'keywords': topic.get('keywords', topic.get('关键词', [])),\n 'tags': topic.get('tags', topic.get('标签', [])),\n 'metadata': topic.get('metadata', {})\n }\n \n # 确保keywords和tags是列表\n if isinstance(validated_topic['keywords'], str):\n validated_topic['keywords'] = [kw.strip() for kw in validated_topic['keywords'].split(',')]\n if isinstance(validated_topic['tags'], str):\n validated_topic['tags'] = [tag.strip() for tag in validated_topic['tags'].split(',')]\n \n # 只保留有标题的主题\n if validated_topic['title']:\n validated_topics.append(validated_topic)\n else:\n logger.warning(f\"主题 {i+1} 缺少标题,跳过\")\n \n return validated_topics\n\n def parse_single_topic(self, content: str) -> Optional[Dict[str, Any]]:\n \"\"\"\n 解析单个主题\n\n Args:\n content: 主题内容\n\n Returns:\n 解析后的主题字典\n \"\"\"\n topics = self.parse(content)\n return topics[0] if topics else None ",
"code_hash": "e18d720056356adcfd93c009f5594b79"
}
],
"imports": [
{
"type": "import",
"modules": [
"json"
],
"aliases": []
},
{
"type": "import",
"modules": [
"re"
],
"aliases": []
},
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"List",
"Dict",
"Any",
"Optional"
],
"aliases": [],
"level": 0
}
],
"constants": [],
"docstring": "Topic Parser\n主题解析器 - 从原项目迁移解析AI生成的主题内容",
"content_hash": "d74c0a64f33726e70ed1151d33e7d937"
}