# TravelContentCreator/core/topic_parser.py


import re
import json
import os


class TopicParser:
    """Topic parser class, responsible for parsing and processing generated topics."""

    @staticmethod
    def parse_line(line):
        """Parse a single topic, extracting the content of each tag."""
        print("Parsing topic:")
        print(line)
        print("--------------------------------")
        try:
            # Clean the text: strip surrounding whitespace and newlines.
            line = line.strip()
            # Skip content that is empty or too short.
            if len(line) < 10:
                return {"error": True}
            # Extract the content of each tag.
            index = TopicParser.extract_tag_content(line, "index")
            date = TopicParser.extract_tag_content(line, "date")
            logic = TopicParser.extract_tag_content(line, "logic")
            obj = TopicParser.extract_tag_content(line, "object")  # renamed locally to avoid shadowing the builtin `object`
            product = TopicParser.extract_tag_content(line, "product")
            product_logic = TopicParser.extract_tag_content(line, "product_logic")
            style = TopicParser.extract_tag_content(line, "style")
            style_logic = TopicParser.extract_tag_content(line, "style_logic")
            target_audience = TopicParser.extract_tag_content(line, "target_audience")
            target_audience_logic = TopicParser.extract_tag_content(line, "target_audience_logic")
            # Verify that all required fields are present.
            if not all([date, logic, product, product_logic, style, style_logic]):
                print(f"Missing required fields: date={bool(date)}, logic={bool(logic)}, "
                      f"product={bool(product)}, product_logic={bool(product_logic)}, "
                      f"style={bool(style)}, style_logic={bool(style_logic)}")
                return {"error": True}
            return {
                "index": index,
                "date": date,
                "logic": logic,
                "object": obj,
                "product": product,
                "product_logic": product_logic,
                "style": style,
                "style_logic": style_logic,
                "target_audience": target_audience,
                "target_audience_logic": target_audience_logic,
                "error": False
            }
        except Exception as e:
            print(f"Error while parsing topic: {e}")
            return {"index": "", "date": "", "logic": "", "object": "", "product": "",
                    "product_logic": "", "style": "", "style_logic": "",
                    "target_audience": "", "target_audience_logic": "", "error": True}

    @staticmethod
    def extract_tag_content(text, tag_name):
        """Extract the content of the given tag from the text."""
        try:
            start_tag = f"<{tag_name}>"
            end_tag = f"</{tag_name}>"
            # Use a regular expression to find all matches, avoiding interference from nested tags.
            pattern = f"{start_tag}(.*?){end_tag}"
            matches = re.findall(pattern, text, re.DOTALL)
            if matches:
                # Return the first match, stripped of surrounding whitespace.
                return matches[0].strip()
            # If the regex match fails, fall back to a plain string search.
            if start_tag in text and end_tag in text:
                start_index = text.index(start_tag) + len(start_tag)
                end_index = text.index(end_tag, start_index)
                return text[start_index:end_index].strip()
            return ""
        except Exception as e:
            print(f"Error while extracting content of tag <{tag_name}>: {e}")
            return ""

    @staticmethod
    def parse_topics(result):
        """Parse multiple topics and return the list of parsed topics."""
        print("\nSplitting topic results...")
        # First try to split on "###".
        topics = result.split("###")
        # If splitting yields at most two items, the format is probably different;
        # try splitting on numbered headings ("1.", "2.", ...) instead.
        if len(topics) <= 2:
            print("Splitting on '###' failed, trying to split on numeric indices...")
            # Split on a number followed by a dot on its own line.
            topics = re.split(r'\n\s*\d+\.\s*\n', result)
            # Drop a possible leading blank item.
            if topics and not topics[0].strip():
                topics = topics[1:]
        # If the above still fails, try splitting directly on <date> tags.
        if len(topics) <= 2:
            print("Splitting on numeric indices failed, trying to split on <date> tags...")
            # Find the positions of all <date> tags.
            date_positions = [m.start() for m in re.finditer(r'<date>', result)]
            if len(date_positions) > 1:
                topics = []
                for i in range(len(date_positions)):
                    start = date_positions[i]
                    end = date_positions[i + 1] if i + 1 < len(date_positions) else len(result)
                    topic = result[start:end].strip()
                    if topic:
                        topics.append(topic)
        # If all of the above fail, try splitting on the "---" separator.
        if len(topics) <= 2:
            print("Splitting on <date> tags failed, trying the '---' separator...")
            topics = result.split("---")
            # Drop blank items.
            topics = [topic.strip() for topic in topics if topic.strip()]
            print(f"Got {len(topics)} topics after splitting on '---'")
        print(f"Initial split produced {len(topics)} topics")
        # Parse each topic.
        result_list = []
        for i, topic in enumerate(topics):
            if not topic.strip():
                continue
            print(f"\nProcessing topic {i + 1}")
            parsed_data = TopicParser.parse_line(topic)
            if not parsed_data["error"]:
                result_list.append(parsed_data)
            else:
                print(f"Topic {i + 1} could not be parsed, skipping")
        print(f"Number of topics parsed successfully: {len(result_list)}")
        # If no topics were parsed, try an alternative approach.
        if len(result_list) == 0:
            print("All topics failed to parse, trying to re-detect the topic format...")
            # All tag patterns that may need to be extracted.
            tag_patterns = [
                (r'<index>(.*?)</index>', 'index'),
                (r'<date>(.*?)</date>', 'date'),
                (r'<logic>(.*?)</logic>', 'logic'),
                (r'<object>(.*?)</object>', 'object'),
                (r'<product>(.*?)</product>', 'product'),
                (r'<product_logic>(.*?)</product_logic>', 'product_logic'),
                (r'<style>(.*?)</style>', 'style'),
                (r'<style_logic>(.*?)</style_logic>', 'style_logic'),
                (r'<target_audience>(.*?)</target_audience>', 'target_audience'),
                (r'<target_audience_logic>(.*?)</target_audience_logic>', 'target_audience_logic'),
            ]
            # Locate every <date> tag to determine where each topic starts.
            date_matches = re.finditer(r'<date>(.*?)</date>', result, re.DOTALL)
            topics_data = []
            for date_match in date_matches:
                date_start = date_match.start()
                date_end = date_match.end()
                date_value = date_match.group(1).strip()
                # Search from the end of the current <date> tag for the next one, or the end of the text.
                next_date_match = re.search(r'<date>', result[date_end:])
                topic_end = date_end + next_date_match.start() if next_date_match else len(result)
                # Extract the text of the current topic.
                topic_text = result[date_start:topic_end]
                # Parse the topic data.
                topic_data = {'date': date_value, 'error': False}
                # Extract the content of the remaining tags.
                for pattern, key in tag_patterns:
                    if key == 'date':  # date has already been extracted above
                        continue
                    match = re.search(pattern, topic_text, re.DOTALL)
                    topic_data[key] = match.group(1).strip() if match else ""
                # Check that the basic required fields are present.
                required_fields = ['date', 'logic', 'product', 'product_logic', 'style', 'style_logic']
                if all(topic_data.get(field, "") for field in required_fields):
                    topics_data.append(topic_data)
                    print(f"Successfully parsed a topic: {topic_data['date']}")
                else:
                    print("Topic is missing required fields, skipping")
            result_list = topics_data
            print(f"Number of topics parsed via tag matching: {len(result_list)}")
        print(f"Final number of topics parsed successfully: {len(result_list)}")
        # Reassign indices so that every topic has a unique, well-formed index.
        used_indices = set()
        # Step 1: keep existing indices that parse as unique integers.
        for item in result_list:
            if 'index' in item and item['index']:
                try:
                    # Try to convert the index to an integer.
                    index_value = int(item['index'].strip())
                    # If the index is already taken, mark it as None so it is reassigned below.
                    if index_value in used_indices:
                        item['index'] = None
                    else:
                        item['index'] = str(index_value)
                        used_indices.add(index_value)
                except (ValueError, TypeError):
                    # Conversion failed; mark as None so it is reassigned below.
                    item['index'] = None
            else:
                item['index'] = None
        # Step 2: assign new values to all invalid or duplicate indices.
        next_available_index = 1
        for item in result_list:
            if item['index'] is None:
                # Find the next unused index.
                while next_available_index in used_indices:
                    next_available_index += 1
                item['index'] = str(next_available_index)
                used_indices.add(next_available_index)
                next_available_index += 1
        print("All topic indices have been reassigned to ensure uniqueness")
        return result_list
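
    # Illustrative shape of the list returned by parse_topics (values made up):
    #   [{"index": "1", "date": "2024-06-01", "logic": "...", "object": "...",
    #     "product": "...", "product_logic": "...", "style": "...", "style_logic": "...",
    #     "target_audience": "...", "target_audience_logic": "...", "error": False}, ...]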

    @staticmethod
    def save_topics(result_list, output_dir, run_id, result):
        """Save the parsed topics to a JSON file."""
        # Make sure the output directory exists.
        os.makedirs(output_dir, exist_ok=True)
        # Save the results as formatted JSON.
        json_path = os.path.join(output_dir, f"result_list_{run_id}.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(result_list, f, ensure_ascii=False, indent=4)
        # If no topics were found, write an error log.
        if len(result_list) == 0:
            error_log_file = os.path.join(output_dir, f"error_log_{run_id}.txt")
            with open(error_log_file, "w", encoding="utf-8") as f:
                f.write("Could not parse any topics. Original content follows:\n\n")
                f.write(result)
            print(f"Topic parsing failed completely, error log written to {error_log_file}")
            return False, None
        return True, json_path

    @staticmethod
    def load_topics_from_json(json_path):
        """Load topic data from a JSON file."""
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                topics = json.load(f)
            print(f"Loaded {len(topics)} topics from {json_path}")
            return topics
        except Exception as e:
            print(f"Failed to load topic data: {e}")
            return []
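

# Minimal usage sketch (an assumption, not part of the original module: the sample
# string below and the "output" directory / "demo_run" id are made-up illustrations).
if __name__ == "__main__":
    sample_result = (
        "<index>1</index><date>2024-06-01</date><logic>example logic</logic>"
        "<object>example object</object><product>example product</product>"
        "<product_logic>example product logic</product_logic><style>example style</style>"
        "<style_logic>example style logic</style_logic>"
        "<target_audience>example audience</target_audience>"
        "<target_audience_logic>example audience logic</target_audience_logic>"
    )
    # Parse the raw text into topic dicts, save them, then reload them from disk.
    parsed = TopicParser.parse_topics(sample_result)
    ok, path = TopicParser.save_topics(parsed, "output", "demo_run", sample_result)
    if ok:
        print(TopicParser.load_topics_from_json(path))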