2025-05-23 13:59:17 +08:00

700 lines
26 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json
import shutil
import csv
import traceback
import re
import argparse
from datetime import datetime
import sqlite3
import logging
import base64
# Configure logging: INFO level, timestamped messages, emitted to the console
# via a StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
# Module-level logger shared by the database helpers below.
logger = logging.getLogger(__name__)
# Built-in database recording helpers
def init_database(db_path):
    """Create (if needed) the SQLite database and its `contents` table.

    Args:
        db_path: Filesystem path of the SQLite database file.

    Returns:
        sqlite3.Connection: An open connection on success, or None on failure.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        conn.execute("PRAGMA foreign_keys = OFF")  # FK constraints intentionally disabled
        cursor = conn.cursor()
        # Content table: one row per generated "i_j" entry, keyed by entry_id.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS contents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                entry_id TEXT NOT NULL UNIQUE,
                output_txt_path TEXT,
                poster_path TEXT,
                article_json_path TEXT,
                product TEXT,
                object TEXT,
                date TEXT,
                logic TEXT,
                judge_status INTEGER,
                is_distributed INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # Index for fast lookups by entry_id
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_contents_entry_id ON contents(entry_id)")
        conn.commit()
        logger.info("数据库初始化成功")
        return conn
    except sqlite3.Error as e:
        logger.error(f"初始化数据库失败: {e}")
        # Fix: close the half-open connection instead of leaking it when
        # table/index creation fails after connect() succeeded.
        if conn is not None:
            try:
                conn.close()
            except sqlite3.Error:
                pass
        return None
def record_to_database(
    db_path,
    entry_id,
    output_txt_path=None,
    poster_path=None,
    article_json_path=None,
    product=None,
    object=None,
    date=None,
    logic=None,
    judge_status=None,
    is_distributed=0
):
    """Insert or replace one content row in the distribution database.

    Args:
        db_path: SQLite database file path; created and initialized on demand.
        entry_id: Unique entry identifier (the "i_j" directory name).
        output_txt_path: Path of the rendered article text file.
        poster_path: Path of the copied poster image.
        article_json_path: Path of the source article JSON.
        product: Product name from the topic metadata.
        object: Object name from the topic metadata. (Shadows the builtin
            `object`; kept as-is because callers pass it by keyword.)
        date: Date string from the topic metadata.
        logic: Logic description from the topic metadata.
        judge_status: 1/0 review outcome, or None when unknown.
        is_distributed: Distribution flag; defaults to 0 (not distributed).

    Returns:
        bool: True when the row was written, False on any failure.
    """
    try:
        # Create and initialize the database on first use.
        if not os.path.exists(db_path):
            logger.info(f"数据库文件不存在: {db_path},将自动创建")
            conn = init_database(db_path)
            if not conn:
                return False
        else:
            try:
                conn = sqlite3.connect(db_path)
                conn.execute("PRAGMA foreign_keys = OFF")  # FK constraints intentionally disabled
            except sqlite3.Error as e:
                logger.error(f"连接数据库失败: {e}")
                return False
        try:
            cursor = conn.cursor()
            # Normalize optional text fields to empty strings;
            # judge_status is allowed to remain NULL.
            data = (
                entry_id,
                output_txt_path or '',
                poster_path or '',
                article_json_path or '',
                product or '',
                object or '',
                date or '',
                logic or '',
                judge_status if judge_status is not None else None,
                is_distributed
            )
            # Upsert keyed on the UNIQUE entry_id column.
            cursor.execute("""
                INSERT OR REPLACE INTO contents
                (entry_id, output_txt_path, poster_path, article_json_path,
                 product, object, date, logic, judge_status, is_distributed)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, data)
            conn.commit()
            logger.info(f"已将内容 {entry_id} 记录到数据库")
            return True
        except Exception as e:
            logger.error(f"记录内容到数据库失败: {e}")
            # Fix: narrowed from bare `except:` clauses, which would also
            # swallow KeyboardInterrupt/SystemExit.
            try:
                conn.rollback()
            except sqlite3.Error:
                pass
            return False
        finally:
            try:
                conn.close()
            except sqlite3.Error:
                pass
    except Exception as e:
        logger.error(f"记录提取内容时发生错误: {e}")
        return False
def _decode_b64_field(data, key, ok_label, warn_label):
    """Base64-decode a UTF-8 text field from an article JSON dict.

    Args:
        data: Parsed article JSON dict.
        key: Name of the base64 field (e.g. "title_base64").
        ok_label: Field name used in the success log message.
        warn_label: Field name used in the failure log message.

    Returns:
        The decoded text, or None when the key is absent or decoding fails.
    """
    if key not in data:
        return None
    try:
        decoded = base64.b64decode(data[key]).decode('utf-8')
        print(f" - 成功从base64解码{ok_label}")
        return decoded
    except Exception as e:
        print(f" - 警告: {warn_label}base64解码失败: {e}")
        return None
def convert_json_to_txt_content(json_path, prefer_original=False):
    """
    Read an article JSON file, pick title/content/tags, strip Markdown bold
    markers, and return the formatted plain text.

    The JSON's `judged` flag decides which content set is used:
    - judged=True: the reviewed title/content/tags
    - judged=False: the original title/content/tags
    In both cases the base64-encoded fields take priority because they
    preserve special characters and newlines correctly.

    Args:
        json_path: Path to the article JSON file.
        prefer_original: Kept for backward compatibility; no longer used.

    Returns:
        tuple: (formatted text, error message) — exactly one is None.
    """
    print(f" - 正在读取 JSON: {json_path}")
    if not os.path.exists(json_path):
        print(f" - 警告: JSON 文件不存在: {json_path}")
        return None, f"文件未找到: {json_path}"
    try:
        with open(json_path, 'r', encoding='utf-8') as f_json:
            data = json.load(f_json)
        judged = data.get('judged', False)
        print(f" - 文件状态: judged={judged}")
        # Decode every possible base64 field (preferred representation).
        # Deduplicated: the original repeated this try/except stanza six times.
        title = _decode_b64_field(data, "title_base64", "审核后标题", "标题")
        content = _decode_b64_field(data, "content_base64", "审核后内容", "内容")
        original_title = _decode_b64_field(data, "original_title_base64", "原始标题", "原始标题")
        original_content = _decode_b64_field(data, "original_content_base64", "原始内容", "原始内容")
        tags = _decode_b64_field(data, "tags_base64", "审核后标签", "标签")
        original_tags = _decode_b64_field(data, "original_tags_base64", "原始标签", "原始标签")
        # Fall back to the plain (non-base64) fields where decoding failed
        # or the base64 variant was absent.
        if title is None and "title" in data:
            title = data["title"]
            print(f" - 使用普通字段标题")
        if content is None and "content" in data:
            content = data["content"]
            print(f" - 使用普通字段内容")
        if original_title is None and "original_title" in data:
            original_title = data["original_title"]
            print(f" - 使用普通字段原始标题")
        if original_content is None and "original_content" in data:
            original_content = data["original_content"]
            print(f" - 使用普通字段原始内容")
        if tags is None and "tags" in data:
            tags = data["tags"]
            print(f" - 使用普通字段标签")
        elif tags is None and "tag" in data:
            tags = data["tag"]
            print(f" - 使用普通tag字段作为标签")
        if original_tags is None and "original_tags" in data:
            original_tags = data["original_tags"]
            print(f" - 使用普通字段原始标签")
        # Choose reviewed vs. original content based on the judged flag.
        if judged:
            print(f" - 使用审核后内容 (judged=True)")
            final_title, final_content, final_tags = title, content, tags
        else:
            print(f" - 使用原始内容 (judged=False)")
            final_title, final_content, final_tags = original_title, original_content, original_tags
        # Guarantee a non-empty placeholder for every field.
        final_title = final_title or "未找到标题"
        final_content = final_content or "未找到内容"
        final_tags = final_tags or "未找到标签"
        # Strip Markdown bold markers but keep newlines intact.
        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', final_content)
        # Assemble the output text, preserving all newlines in the content;
        # tags are appended only when actually found.
        result = final_title + "\n\n" + content_no_format
        if final_tags and final_tags != "未找到标签":
            result += "\n\n" + final_tags
        print(f" - 内容处理完成,最终文本长度: {len(result)} 字符")
        return result, None
    except json.JSONDecodeError:
        print(f" - 错误: JSON 格式无效: {json_path}")
        return None, f"无效的 JSON 格式: {json_path}"
    except Exception as e:
        print(f" - 错误: 处理 JSON 时出错: {e}")
        traceback.print_exc()
        return None, f"处理 JSON 时出错: {e}"
def process_txt_content(txt_path):
    """Read a TXT file, strip Markdown formatting, and return the plain text.

    Args:
        txt_path: Path to the TXT file.

    Returns:
        tuple: (processed content, error message) — exactly one is None.
    """
    print(f" - 正在读取TXT: {txt_path}")
    if not os.path.exists(txt_path):
        print(f" - 警告: TXT文件不存在: {txt_path}")
        return None, f"文件未找到: {txt_path}"
    try:
        with open(txt_path, 'r', encoding='utf-8') as source:
            text = source.read()
        # Apply the Markdown-stripping rules one at a time, preserving
        # newlines: bold, italic, links ([text](url) -> text), ATX headings.
        markdown_rules = [
            (r'\*\*(.*?)\*\*', r'\1', 0),
            (r'\*(.*?)\*', r'\1', 0),
            (r'\[(.*?)\]\(.*?\)', r'\1', 0),
            (r'^#+ (.*?)$', r'\1', re.MULTILINE),
        ]
        for pattern, replacement, flags in markdown_rules:
            text = re.sub(pattern, replacement, text, flags=flags)
        return text, None
    except Exception as e:
        print(f" - 错误: 处理TXT时出错: {e}")
        return None, f"处理TXT时出错: {e}"
def load_topic_data(source_dir, run_id):
    """Load topic-selection data for a run.

    Args:
        source_dir: Directory containing `tweet_topic_<run_id>.json`.
        run_id: Run identifier used in the file name.

    Returns:
        dict: Topic dicts keyed by str(topic index); empty when the file is
        missing or unreadable.
    """
    topic_file_path = os.path.join(source_dir, f"tweet_topic_{run_id}.json")
    topic_data = {}
    if os.path.exists(topic_file_path):
        try:
            with open(topic_file_path, 'r', encoding='utf-8') as f:
                topics = json.load(f)
            # Re-key the topic list by index.
            for topic in topics:
                index = topic.get("index")
                # Fix: `if index:` silently dropped a legitimate index of 0.
                # Also normalize the key to str so lookups by the "i_j"
                # directory prefix (always a string) match even when the JSON
                # stores numeric indices.
                if index is not None:
                    topic_data[str(index)] = topic
            print(f"成功加载选题数据,共{len(topic_data)}")
        except Exception as e:
            print(f"加载选题数据时出错: {e}")
    else:
        print(f"警告: 未找到选题文件: {topic_file_path}")
    return topic_data
def process_result_directory(source_dir, output_dir, run_id=None, prefer_original=False, db_path=None):
    """
    Process the given result directory: extract content from each i_j entry,
    render it into the output directory, record entries in the database, and
    write a CSV manifest.

    Args:
        source_dir: Source directory path (contains i_j subdirectories).
        output_dir: Output directory path.
        run_id: Optional run ID; the source directory name is used when omitted.
        prefer_original: Whether to prefer original content, ignoring the
            judge_success result (forwarded to convert_json_to_txt_content).
        db_path: Database path; a default path is used when not provided.
    """
    if not os.path.isdir(source_dir):
        print(f"错误: 源目录不存在: {source_dir}")
        return
    # Default database path when none was supplied
    if db_path is None:
        db_path = '/root/autodl-tmp/TravelContentCreator/distribution.db'
    # Whether database recording is enabled.
    # NOTE(review): hard-coded True — the --disable-db flag parsed in main()
    # is never propagated here, so recording cannot actually be turned off.
    db_enabled = True
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"确保输出目录存在: {output_dir}")
    # Derive run_id from the source directory name when not given
    if not run_id:
        run_id = os.path.basename(source_dir)
    # Load topic-selection data (keyed by topic index)
    topic_data = load_topic_data(source_dir, run_id)
    # CSV manifest: header row first, including topic-related fields
    csv_path = os.path.join(output_dir, f"manifest_{run_id}.csv")
    csv_data = [
        [
            "EntryID",
            "TopicIndex",
            "VariantIndex",
            "Date",
            "Logic",
            "Object",
            "Product",
            "ProductLogic",
            "Style",
            "StyleLogic",
            "TargetAudience",
            "TargetAudienceLogic",
            "SourcePath",
            "ArticleJsonPath",
            "OutputTxtPath",
            "PosterPath",
            "AdditionalImagesCount",
            "Status",
            "Details",
            "JudgeStatus",
            "ContentSource",
            "RecordedInDB",
            "IsDistributed"
        ]
    ]
    # Find all subdirectories named like "i_j" (topicIndex_variantIndex)
    entry_pattern = re.compile(r"^(\d+)_(\d+)$")
    entries = []
    for item in os.listdir(source_dir):
        item_path = os.path.join(source_dir, item)
        match = entry_pattern.match(item)
        if os.path.isdir(item_path) and match:
            entries.append(item)
    if not entries:
        print(f"警告: 在源目录中未找到任何i_j格式的子目录")
        return
    print(f"找到 {len(entries)} 个条目目录")
    # Process each entry directory
    for entry in sorted(entries):
        entry_path = os.path.join(source_dir, entry)
        output_entry_path = os.path.join(output_dir, entry)
        print(f"\n处理条目: {entry}")
        # Parse topic_index and variant_index from the directory name
        match = entry_pattern.match(entry)
        topic_index = match.group(1)
        variant_index = match.group(2)
        # Topic metadata for this entry (empty dict when unknown)
        topic_info = topic_data.get(topic_index, {})
        # Manifest record for this entry; keys mirror the CSV header row
        record = {
            "EntryID": entry,
            "TopicIndex": topic_index,
            "VariantIndex": variant_index,
            "Date": topic_info.get("date", ""),
            "Logic": topic_info.get("logic", ""),
            "Object": topic_info.get("object", ""),
            "Product": topic_info.get("product", ""),
            "ProductLogic": topic_info.get("product_logic", ""),
            "Style": topic_info.get("style", ""),
            "StyleLogic": topic_info.get("style_logic", ""),
            "TargetAudience": topic_info.get("target_audience", ""),
            "TargetAudienceLogic": topic_info.get("target_audience_logic", ""),
            "SourcePath": entry_path,
            "ArticleJsonPath": "",
            "OutputTxtPath": "",
            "PosterPath": "",
            "AdditionalImagesCount": 0,
            "Status": "Processing",
            "Details": "",
            "JudgeStatus": "",
            "ContentSource": "unknown",
            "RecordedInDB": "No",
            "IsDistributed": "No"
        }
        # Create the per-entry output directory
        try:
            os.makedirs(output_entry_path, exist_ok=True)
        except Exception as e:
            record["Status"] = "Failed"
            record["Details"] = f"创建输出目录失败: {e}"
            csv_data.append([record[col] for col in csv_data[0]])
            print(f" - 错误: {record['Details']}")
            continue
        # 1. Article content — prefer the JSON file
        output_txt_path = os.path.join(output_entry_path, "article.txt")
        record["OutputTxtPath"] = output_txt_path
        # Read article.json
        json_path = os.path.join(entry_path, "article.json")
        record["ArticleJsonPath"] = json_path
        content_processed = False
        # Try to extract content from JSON first
        if os.path.exists(json_path):
            try:
                # Read the review status from the JSON file
                with open(json_path, 'r', encoding='utf-8') as f_json:
                    article_data = json.load(f_json)
                # Extract judge_success status (fall back to the judged flag)
                if "judge_success" in article_data:
                    record["JudgeStatus"] = str(article_data["judge_success"])
                elif "judged" in article_data:
                    record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核"
                # Convert the JSON article to plain text
                processed_content, error = convert_json_to_txt_content(json_path, prefer_original)
                if error:
                    print(f" - 警告: 从JSON提取内容失败: {error}")
                else:
                    try:
                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
                            f_txt.write(processed_content)
                        print(f" - 成功从JSON提取并写入内容到: {output_txt_path}")
                        record["ContentSource"] = "json_file"
                        content_processed = True
                    except Exception as e:
                        print(f" - 警告: 写入从JSON提取的内容时出错: {e}")
            except Exception as e:
                print(f" - 警告: 处理JSON文件时出错: {e}")
        # Fall back to an existing TXT file when JSON extraction failed
        if not content_processed:
            input_txt_path = os.path.join(entry_path, "article.txt")
            if os.path.exists(input_txt_path):
                processed_content, error = process_txt_content(input_txt_path)
                if error:
                    record["Status"] = "Partial"
                    record["Details"] += f"文章处理失败: {error}; "
                    print(f" - 错误: {record['Details']}")
                else:
                    try:
                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
                            f_txt.write(processed_content)
                        print(f" - 成功写入处理后的文本文件: {output_txt_path}")
                        record["ContentSource"] = "txt_file"
                        content_processed = True
                    except Exception as e:
                        record["Status"] = "Partial"
                        record["Details"] += f"写入文本文件失败: {e}; "
                        print(f" - 错误: {record['Details']}")
            else:
                record["Status"] = "Partial"
                record["Details"] += "无法从JSON或TXT获取内容; "
                print(f" - 警告: {record['Details']}")
        # 2. Poster image
        poster_dir = os.path.join(entry_path, "poster")
        poster_jpg_path = os.path.join(poster_dir, "poster.jpg")
        output_poster_path = os.path.join(output_entry_path, "poster.jpg")
        record["PosterPath"] = output_poster_path
        if os.path.exists(poster_jpg_path):
            try:
                shutil.copy2(poster_jpg_path, output_poster_path)
                print(f" - 成功复制海报图片: {output_poster_path}")
            except Exception as e:
                record["Status"] = "Partial"
                record["Details"] += f"复制海报图片失败: {e}; "
                print(f" - 错误: {record['Details']}")
        else:
            record["Status"] = "Partial"
            record["Details"] += "海报图片不存在; "
            print(f" - 警告: {record['Details']}")
        # 3. Additional images
        image_dir = os.path.join(entry_path, "image")
        output_image_dir = os.path.join(output_entry_path, "additional_images")
        if os.path.exists(image_dir) and os.path.isdir(image_dir):
            try:
                os.makedirs(output_image_dir, exist_ok=True)
                image_count = 0
                for filename in os.listdir(image_dir):
                    if filename.startswith("additional_") and filename.endswith(".jpg"):
                        source_file = os.path.join(image_dir, filename)
                        dest_file = os.path.join(output_image_dir, filename)
                        # Copy the image
                        shutil.copy2(source_file, dest_file)
                        image_count += 1
                record["AdditionalImagesCount"] = image_count
                print(f" - 复制了 {image_count} 张额外图片到: {output_image_dir}")
            except Exception as e:
                record["Status"] = "Partial"
                record["Details"] += f"处理额外图片时出错: {e}; "
                print(f" - 错误: {record['Details']}")
        else:
            record["AdditionalImagesCount"] = 0
            print(f" - 没有找到额外图片目录")
        # Finalize the entry status
        if record["Status"] == "Processing":
            record["Status"] = "Success"
            record["Details"] = "处理成功完成"
        # 4. Record the entry in the database
        if db_enabled:
            try:
                # Map the JudgeStatus string to an integer judge_status value
                if record["JudgeStatus"] == "True":
                    judge_status = 1
                elif record["JudgeStatus"] == "False":
                    judge_status = 0
                else:
                    judge_status = None
                # Call the database recording helper
                success = record_to_database(
                    db_path,
                    entry_id=record["EntryID"],
                    output_txt_path=record["OutputTxtPath"],
                    poster_path=record["PosterPath"],
                    article_json_path=record["ArticleJsonPath"],
                    product=record["Product"],
                    object=record["Object"],
                    date=record["Date"],
                    logic=record["Logic"],
                    judge_status=judge_status,
                    is_distributed=0  # default: not yet distributed
                )
                if success:
                    record["RecordedInDB"] = "Yes"
                    print(f" - 成功将内容记录到数据库")
                else:
                    record["RecordedInDB"] = "Failed"
                    print(f" - 警告: 内容记录到数据库失败")
            except Exception as e:
                record["RecordedInDB"] = "Error"
                print(f" - 错误: 记录到数据库时发生异常: {e}")
                traceback.print_exc()  # print the full exception stack trace
        else:
            record["RecordedInDB"] = "Disabled"
            print(f" - 信息: 数据库记录功能已禁用")
        # Append this entry's record to the CSV data
        csv_data.append([record[col] for col in csv_data[0]])
    # Write the CSV manifest
    try:
        print(f"\n正在写入清单CSV: {csv_path}")
        with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f_csv:
            writer = csv.writer(f_csv)
            writer.writerows(csv_data)
        print(f"清单CSV生成成功")
    except Exception as e:
        print(f"写入CSV文件时出错: {e}")
        traceback.print_exc()
    print(f"\n处理完成. 共处理 {len(entries)} 个条目.")
    print(f"结果保存在: {output_dir}")
def main():
    """CLI entry point: parse arguments, print a run banner, and kick off processing."""
    parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录")
    parser.add_argument("--source", type=str, help="源目录路径")
    parser.add_argument("--output", type=str, help="输出目录路径")
    parser.add_argument("--run-id", type=str, help="自定义运行ID")
    parser.add_argument("--prefer-original", action="store_true", help="优先使用原始内容,忽略审核结果")
    parser.add_argument("--db-path", type=str, help="数据库路径,若不提供则使用默认路径")
    parser.add_argument("--disable-db", action="store_true", help="禁用数据库记录功能")
    args = parser.parse_args()
    # Hard-coded fallbacks for any option the user omitted.
    source = args.source or "/root/autodl-tmp/TravelContentCreator/result/2025-05-21_16-18-24"
    output = args.output or "/root/autodl-tmp/TravelContentCreator/output/2025-05-21_16-18-24"
    run_id = args.run_id or os.path.basename(source)
    prefer_original = args.prefer_original
    db_path = args.db_path or '/root/autodl-tmp/TravelContentCreator/distribution.db'
    banner = "-" * 60
    print(banner)
    print(f"开始提取和渲染流程")
    print(f"源目录: {source}")
    print(f"输出目录: {output}")
    print(f"运行ID: {run_id}")
    if prefer_original:
        print("内容模式: 优先使用原始内容")
    else:
        print("内容模式: 根据审核结果选择内容")
    # NOTE(review): --disable-db only affects this banner;
    # process_result_directory() hard-codes db_enabled = True, so the flag is
    # never actually propagated.
    if args.disable_db:
        print("数据库记录: 已禁用")
    else:
        print(f"数据库记录: 已启用 (路径: {db_path})")
    print(banner)
    process_result_directory(source, output, run_id, prefer_original, db_path)
    print("\n脚本执行完毕.")
if __name__ == "__main__":
    main()