{ "file_path": "travel-algorithms/travel_algorithms/content_generation/topic_parser.py", "file_size": 6727, "line_count": 205, "functions": [ { "name": "__init__", "line_start": 23, "line_end": 29, "args": [ { "name": "self" } ], "return_type": null, "docstring": "初始化解析器", "is_async": false, "decorators": [], "code": " def __init__(self):\n \"\"\"初始化解析器\"\"\"\n self.json_pattern = re.compile(r'```json\\s*(.*?)\\s*```', re.DOTALL)\n self.fallback_patterns = [\n re.compile(r'\\{.*\\}', re.DOTALL), # 匹配JSON对象\n re.compile(r'\\[.*\\]', re.DOTALL), # 匹配JSON数组\n ]", "code_hash": "2f157d84e962e5b08cd0c5789ec1e2c4" }, { "name": "parse", "line_start": 31, "line_end": 58, "args": [ { "name": "self" }, { "name": "raw_content", "type_hint": "str" } ], "return_type": "List[Dict[str, Any]]", "docstring": "解析主题内容\n\nArgs:\n raw_content: AI生成的原始内容\n\nReturns:\n 解析后的主题列表", "is_async": false, "decorators": [], "code": " def parse(self, raw_content: str) -> List[Dict[str, Any]]:\n \"\"\"\n 解析主题内容\n\n Args:\n raw_content: AI生成的原始内容\n\n Returns:\n 解析后的主题列表\n \"\"\"\n if not raw_content or not raw_content.strip():\n logger.warning(\"输入内容为空\")\n return []\n\n # 尝试多种解析方法\n topics = (\n self._parse_json_block(raw_content) or\n self._parse_direct_json(raw_content) or\n self._parse_structured_text(raw_content) or\n []\n )\n\n if topics:\n logger.info(f\"成功解析出 {len(topics)} 个主题\")\n return self._validate_topics(topics)\n else:\n logger.error(\"未能解析出任何有效主题\")\n return []", "code_hash": "fd6289431a9b868b63cb71ea21fd73b7" }, { "name": "_parse_json_block", "line_start": 60, "line_end": 79, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[List[Dict[str, Any]]]", "docstring": "解析```json```代码块中的JSON", "is_async": false, "decorators": [], "code": " def _parse_json_block(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析```json```代码块中的JSON\"\"\"\n try:\n matches = self.json_pattern.findall(content)\n if matches:\n json_str = matches[0].strip()\n data = json.loads(json_str)\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n # 如果是字典,尝试提取topics字段\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception) as e:\n logger.debug(f\"JSON代码块解析失败: {e}\")\n return None", "code_hash": "a70c065e9c40432ae3e59cc2e73d486a" }, { "name": "_parse_direct_json", "line_start": 81, "line_end": 108, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[List[Dict[str, Any]]]", "docstring": "直接解析JSON内容", "is_async": false, "decorators": [], "code": " def _parse_direct_json(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"直接解析JSON内容\"\"\"\n try:\n # 尝试直接解析整个内容\n data = json.loads(content.strip())\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception):\n # 尝试使用fallback patterns\n for pattern in self.fallback_patterns:\n matches = pattern.findall(content)\n for match in matches:\n try:\n data = json.loads(match)\n if isinstance(data, list) and len(data) > 0:\n return data\n elif isinstance(data, dict):\n return [data]\n except json.JSONDecodeError:\n continue\n return None", "code_hash": "bbc1f94feb212a1b0ee92114f9c95bc5" }, { "name": "_parse_structured_text", "line_start": 110, "line_end": 159, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[List[Dict[str, Any]]]", "docstring": "解析结构化文本格式", "is_async": false, "decorators": [], "code": " def _parse_structured_text(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析结构化文本格式\"\"\"\n try:\n topics = []\n lines = content.split('\\n')\n current_topic = {}\n \n for line in lines:\n line = line.strip()\n if not line:\n continue\n \n # 检测主题开始(数字编号)\n if re.match(r'^\\d+[\\.\\)]\\s*', line):\n if current_topic:\n topics.append(current_topic)\n current_topic = {}\n title = re.sub(r'^\\d+[\\.\\)]\\s*', '', line)\n current_topic['title'] = title\n \n # 检测字段\n elif ':' in line and current_topic:\n key, value = line.split(':', 1)\n key = key.strip().lower()\n value = value.strip()\n \n # 映射中文字段到英文\n field_mapping = {\n '标题': 'title',\n '主题': 'title', \n '目标': 'target',\n '目标人群': 'target',\n '内容': 'content',\n '描述': 'content',\n '关键词': 'keywords',\n '标签': 'tags'\n }\n \n mapped_key = field_mapping.get(key, key)\n current_topic[mapped_key] = value\n \n # 添加最后一个主题\n if current_topic:\n topics.append(current_topic)\n \n return topics if topics else None\n \n except Exception as e:\n logger.debug(f\"结构化文本解析失败: {e}\")\n return None", "code_hash": "826c5e5160fe9b910eb34e70e657c661" }, { "name": "_validate_topics", "line_start": 161, "line_end": 193, "args": [ { "name": "self" }, { "name": "topics", "type_hint": "List[Dict[str, Any]]" } ], "return_type": "List[Dict[str, Any]]", "docstring": "验证和标准化主题数据", "is_async": false, "decorators": [], "code": " def _validate_topics(self, topics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n \"\"\"验证和标准化主题数据\"\"\"\n validated_topics = []\n \n for i, topic in enumerate(topics):\n if not isinstance(topic, dict):\n logger.warning(f\"主题 {i+1} 不是字典格式,跳过\")\n continue\n \n # 确保必需字段存在\n validated_topic = {\n 'id': topic.get('id', f\"topic_{i+1}\"),\n 'title': topic.get('title', topic.get('主题', f\"主题{i+1}\")),\n 'content': topic.get('content', topic.get('描述', topic.get('内容', ''))),\n 'target': topic.get('target', topic.get('目标人群', '通用')),\n 'keywords': topic.get('keywords', topic.get('关键词', [])),\n 'tags': topic.get('tags', topic.get('标签', [])),\n 'metadata': topic.get('metadata', {})\n }\n \n # 确保keywords和tags是列表\n if isinstance(validated_topic['keywords'], str):\n validated_topic['keywords'] = [kw.strip() for kw in validated_topic['keywords'].split(',')]\n if isinstance(validated_topic['tags'], str):\n validated_topic['tags'] = [tag.strip() for tag in validated_topic['tags'].split(',')]\n \n # 只保留有标题的主题\n if validated_topic['title']:\n validated_topics.append(validated_topic)\n else:\n logger.warning(f\"主题 {i+1} 缺少标题,跳过\")\n \n return validated_topics", "code_hash": "3dfeb3356b59c3101937584477962a06" }, { "name": "parse_single_topic", "line_start": 195, "line_end": 206, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[Dict[str, Any]]", "docstring": "解析单个主题\n\nArgs:\n content: 主题内容\n\nReturns:\n 解析后的主题字典", "is_async": false, "decorators": [], "code": " def parse_single_topic(self, content: str) -> Optional[Dict[str, Any]]:\n \"\"\"\n 解析单个主题\n\n Args:\n content: 主题内容\n\n Returns:\n 解析后的主题字典\n \"\"\"\n topics = self.parse(content)\n return topics[0] if topics else None ", "code_hash": "4b6e0b313fdf155b6f375314ae7da82b" } ], "classes": [ { "name": "TopicParser", "line_start": 17, "line_end": 206, "bases": [], "methods": [ { "name": "__init__", "line_start": 23, "line_end": 29, "args": [ { "name": "self" } ], "return_type": null, "docstring": "初始化解析器", "is_async": false, "decorators": [], "code": " def __init__(self):\n \"\"\"初始化解析器\"\"\"\n self.json_pattern = re.compile(r'```json\\s*(.*?)\\s*```', re.DOTALL)\n self.fallback_patterns = [\n re.compile(r'\\{.*\\}', re.DOTALL), # 匹配JSON对象\n re.compile(r'\\[.*\\]', re.DOTALL), # 匹配JSON数组\n ]", "code_hash": "2f157d84e962e5b08cd0c5789ec1e2c4" }, { "name": "parse", "line_start": 31, "line_end": 58, "args": [ { "name": "self" }, { "name": "raw_content", "type_hint": "str" } ], "return_type": "List[Dict[str, Any]]", "docstring": "解析主题内容\n\nArgs:\n raw_content: AI生成的原始内容\n\nReturns:\n 解析后的主题列表", "is_async": false, "decorators": [], "code": " def parse(self, raw_content: str) -> List[Dict[str, Any]]:\n \"\"\"\n 解析主题内容\n\n Args:\n raw_content: AI生成的原始内容\n\n Returns:\n 解析后的主题列表\n \"\"\"\n if not raw_content or not raw_content.strip():\n logger.warning(\"输入内容为空\")\n return []\n\n # 尝试多种解析方法\n topics = (\n self._parse_json_block(raw_content) or\n self._parse_direct_json(raw_content) or\n self._parse_structured_text(raw_content) or\n []\n )\n\n if topics:\n logger.info(f\"成功解析出 {len(topics)} 个主题\")\n return self._validate_topics(topics)\n else:\n logger.error(\"未能解析出任何有效主题\")\n return []", "code_hash": "fd6289431a9b868b63cb71ea21fd73b7" }, { "name": "_parse_json_block", "line_start": 60, "line_end": 79, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[List[Dict[str, Any]]]", "docstring": "解析```json```代码块中的JSON", "is_async": false, "decorators": [], "code": " def _parse_json_block(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析```json```代码块中的JSON\"\"\"\n try:\n matches = self.json_pattern.findall(content)\n if matches:\n json_str = matches[0].strip()\n data = json.loads(json_str)\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n # 如果是字典,尝试提取topics字段\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception) as e:\n logger.debug(f\"JSON代码块解析失败: {e}\")\n return None", "code_hash": "a70c065e9c40432ae3e59cc2e73d486a" }, { "name": "_parse_direct_json", "line_start": 81, "line_end": 108, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[List[Dict[str, Any]]]", "docstring": "直接解析JSON内容", "is_async": false, "decorators": [], "code": " def _parse_direct_json(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"直接解析JSON内容\"\"\"\n try:\n # 尝试直接解析整个内容\n data = json.loads(content.strip())\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception):\n # 尝试使用fallback patterns\n for pattern in self.fallback_patterns:\n matches = pattern.findall(content)\n for match in matches:\n try:\n data = json.loads(match)\n if isinstance(data, list) and len(data) > 0:\n return data\n elif isinstance(data, dict):\n return [data]\n except json.JSONDecodeError:\n continue\n return None", "code_hash": "bbc1f94feb212a1b0ee92114f9c95bc5" }, { "name": "_parse_structured_text", "line_start": 110, "line_end": 159, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[List[Dict[str, Any]]]", "docstring": "解析结构化文本格式", "is_async": false, "decorators": [], "code": " def _parse_structured_text(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析结构化文本格式\"\"\"\n try:\n topics = []\n lines = content.split('\\n')\n current_topic = {}\n \n for line in lines:\n line = line.strip()\n if not line:\n continue\n \n # 检测主题开始(数字编号)\n if re.match(r'^\\d+[\\.\\)]\\s*', line):\n if current_topic:\n topics.append(current_topic)\n current_topic = {}\n title = re.sub(r'^\\d+[\\.\\)]\\s*', '', line)\n current_topic['title'] = title\n \n # 检测字段\n elif ':' in line and current_topic:\n key, value = line.split(':', 1)\n key = key.strip().lower()\n value = value.strip()\n \n # 映射中文字段到英文\n field_mapping = {\n '标题': 'title',\n '主题': 'title', \n '目标': 'target',\n '目标人群': 'target',\n '内容': 'content',\n '描述': 'content',\n '关键词': 'keywords',\n '标签': 'tags'\n }\n \n mapped_key = field_mapping.get(key, key)\n current_topic[mapped_key] = value\n \n # 添加最后一个主题\n if current_topic:\n topics.append(current_topic)\n \n return topics if topics else None\n \n except Exception as e:\n logger.debug(f\"结构化文本解析失败: {e}\")\n return None", "code_hash": "826c5e5160fe9b910eb34e70e657c661" }, { "name": "_validate_topics", "line_start": 161, "line_end": 193, "args": [ { "name": "self" }, { "name": "topics", "type_hint": "List[Dict[str, Any]]" } ], "return_type": "List[Dict[str, Any]]", "docstring": "验证和标准化主题数据", "is_async": false, "decorators": [], "code": " def _validate_topics(self, topics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n \"\"\"验证和标准化主题数据\"\"\"\n validated_topics = []\n \n for i, topic in enumerate(topics):\n if not isinstance(topic, dict):\n logger.warning(f\"主题 {i+1} 不是字典格式,跳过\")\n continue\n \n # 确保必需字段存在\n validated_topic = {\n 'id': topic.get('id', f\"topic_{i+1}\"),\n 'title': topic.get('title', topic.get('主题', f\"主题{i+1}\")),\n 'content': topic.get('content', topic.get('描述', topic.get('内容', ''))),\n 'target': topic.get('target', topic.get('目标人群', '通用')),\n 'keywords': topic.get('keywords', topic.get('关键词', [])),\n 'tags': topic.get('tags', topic.get('标签', [])),\n 'metadata': topic.get('metadata', {})\n }\n \n # 确保keywords和tags是列表\n if isinstance(validated_topic['keywords'], str):\n validated_topic['keywords'] = [kw.strip() for kw in validated_topic['keywords'].split(',')]\n if isinstance(validated_topic['tags'], str):\n validated_topic['tags'] = [tag.strip() for tag in validated_topic['tags'].split(',')]\n \n # 只保留有标题的主题\n if validated_topic['title']:\n validated_topics.append(validated_topic)\n else:\n logger.warning(f\"主题 {i+1} 缺少标题,跳过\")\n \n return validated_topics", "code_hash": "3dfeb3356b59c3101937584477962a06" }, { "name": "parse_single_topic", "line_start": 195, "line_end": 206, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "Optional[Dict[str, Any]]", "docstring": "解析单个主题\n\nArgs:\n content: 主题内容\n\nReturns:\n 解析后的主题字典", "is_async": false, "decorators": [], "code": " def parse_single_topic(self, content: str) -> Optional[Dict[str, Any]]:\n \"\"\"\n 解析单个主题\n\n Args:\n content: 主题内容\n\n Returns:\n 解析后的主题字典\n \"\"\"\n topics = self.parse(content)\n return topics[0] if topics else None ", "code_hash": "4b6e0b313fdf155b6f375314ae7da82b" } ], "docstring": "主题解析器\n负责从AI生成的文本中解析出结构化的主题数据", "decorators": [], "code": "class TopicParser:\n \"\"\"\n 主题解析器\n 负责从AI生成的文本中解析出结构化的主题数据\n \"\"\"\n\n def __init__(self):\n \"\"\"初始化解析器\"\"\"\n self.json_pattern = re.compile(r'```json\\s*(.*?)\\s*```', re.DOTALL)\n self.fallback_patterns = [\n re.compile(r'\\{.*\\}', re.DOTALL), # 匹配JSON对象\n re.compile(r'\\[.*\\]', re.DOTALL), # 匹配JSON数组\n ]\n\n def parse(self, raw_content: str) -> List[Dict[str, Any]]:\n \"\"\"\n 解析主题内容\n\n Args:\n raw_content: AI生成的原始内容\n\n Returns:\n 解析后的主题列表\n \"\"\"\n if not raw_content or not raw_content.strip():\n logger.warning(\"输入内容为空\")\n return []\n\n # 尝试多种解析方法\n topics = (\n self._parse_json_block(raw_content) or\n self._parse_direct_json(raw_content) or\n self._parse_structured_text(raw_content) or\n []\n )\n\n if topics:\n logger.info(f\"成功解析出 {len(topics)} 个主题\")\n return self._validate_topics(topics)\n else:\n logger.error(\"未能解析出任何有效主题\")\n return []\n\n def _parse_json_block(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析```json```代码块中的JSON\"\"\"\n try:\n matches = self.json_pattern.findall(content)\n if matches:\n json_str = matches[0].strip()\n data = json.loads(json_str)\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n # 如果是字典,尝试提取topics字段\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception) as e:\n logger.debug(f\"JSON代码块解析失败: {e}\")\n return None\n\n def _parse_direct_json(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"直接解析JSON内容\"\"\"\n try:\n # 尝试直接解析整个内容\n data = json.loads(content.strip())\n \n if isinstance(data, list):\n return data\n elif isinstance(data, dict):\n if 'topics' in data:\n return data['topics']\n else:\n return [data]\n return None\n except (json.JSONDecodeError, Exception):\n # 尝试使用fallback patterns\n for pattern in self.fallback_patterns:\n matches = pattern.findall(content)\n for match in matches:\n try:\n data = json.loads(match)\n if isinstance(data, list) and len(data) > 0:\n return data\n elif isinstance(data, dict):\n return [data]\n except json.JSONDecodeError:\n continue\n return None\n\n def _parse_structured_text(self, content: str) -> Optional[List[Dict[str, Any]]]:\n \"\"\"解析结构化文本格式\"\"\"\n try:\n topics = []\n lines = content.split('\\n')\n current_topic = {}\n \n for line in lines:\n line = line.strip()\n if not line:\n continue\n \n # 检测主题开始(数字编号)\n if re.match(r'^\\d+[\\.\\)]\\s*', line):\n if current_topic:\n topics.append(current_topic)\n current_topic = {}\n title = re.sub(r'^\\d+[\\.\\)]\\s*', '', line)\n current_topic['title'] = title\n \n # 检测字段\n elif ':' in line and current_topic:\n key, value = line.split(':', 1)\n key = key.strip().lower()\n value = value.strip()\n \n # 映射中文字段到英文\n field_mapping = {\n '标题': 'title',\n '主题': 'title', \n '目标': 'target',\n '目标人群': 'target',\n '内容': 'content',\n '描述': 'content',\n '关键词': 'keywords',\n '标签': 'tags'\n }\n \n mapped_key = field_mapping.get(key, key)\n current_topic[mapped_key] = value\n \n # 添加最后一个主题\n if current_topic:\n topics.append(current_topic)\n \n return topics if topics else None\n \n except Exception as e:\n logger.debug(f\"结构化文本解析失败: {e}\")\n return None\n\n def _validate_topics(self, topics: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n \"\"\"验证和标准化主题数据\"\"\"\n validated_topics = []\n \n for i, topic in enumerate(topics):\n if not isinstance(topic, dict):\n logger.warning(f\"主题 {i+1} 不是字典格式,跳过\")\n continue\n \n # 确保必需字段存在\n validated_topic = {\n 'id': topic.get('id', f\"topic_{i+1}\"),\n 'title': topic.get('title', topic.get('主题', f\"主题{i+1}\")),\n 'content': topic.get('content', topic.get('描述', topic.get('内容', ''))),\n 'target': topic.get('target', topic.get('目标人群', '通用')),\n 'keywords': topic.get('keywords', topic.get('关键词', [])),\n 'tags': topic.get('tags', topic.get('标签', [])),\n 'metadata': topic.get('metadata', {})\n }\n \n # 确保keywords和tags是列表\n if isinstance(validated_topic['keywords'], str):\n validated_topic['keywords'] = [kw.strip() for kw in validated_topic['keywords'].split(',')]\n if isinstance(validated_topic['tags'], str):\n validated_topic['tags'] = [tag.strip() for tag in validated_topic['tags'].split(',')]\n \n # 只保留有标题的主题\n if validated_topic['title']:\n validated_topics.append(validated_topic)\n else:\n logger.warning(f\"主题 {i+1} 缺少标题,跳过\")\n \n return validated_topics\n\n def parse_single_topic(self, content: str) -> Optional[Dict[str, Any]]:\n \"\"\"\n 解析单个主题\n\n Args:\n content: 主题内容\n\n Returns:\n 解析后的主题字典\n \"\"\"\n topics = self.parse(content)\n return topics[0] if topics else None ", "code_hash": "e18d720056356adcfd93c009f5594b79" } ], "imports": [ { "type": "import", "modules": [ "json" ], "aliases": [] }, { "type": "import", "modules": [ "re" ], "aliases": [] }, { "type": "import", "modules": [ "logging" ], "aliases": [] }, { "type": "from_import", "module": "typing", "names": [ "List", "Dict", "Any", "Optional" ], "aliases": [], "level": 0 } ], "constants": [], "docstring": "Topic Parser\n主题解析器 - 从原项目迁移,解析AI生成的主题内容", "content_hash": "d74c0a64f33726e70ed1151d33e7d937" }