From 2c39d981f4eea979f390e06dd8a6d5530fbac74b Mon Sep 17 00:00:00 2001
From: jinye_huang <jinye_huang@foxmail.com>
Date: Mon, 12 May 2025 15:43:32 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E7=BB=93=E6=9E=9C?=
 =?UTF-8?q?=E6=96=87=E6=9C=AC=E7=9A=84=E6=B8=B2=E6=9F=93=E6=96=B9=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/extract_and_render.py | 95 ++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 23 deletions(-)

diff --git a/scripts/extract_and_render.py b/scripts/extract_and_render.py
index 9f34134..0dcf16d 100644
--- a/scripts/extract_and_render.py
+++ b/scripts/extract_and_render.py
@@ -10,10 +10,18 @@ import re
 import argparse
 from datetime import datetime
 
-def convert_json_to_txt_content(json_path):
+def convert_json_to_txt_content(json_path, prefer_original=False):
     """
     读取 JSON 文件，提取标题、内容和标签，移除 Markdown 格式，
     并返回格式化文本。
+    
+    根据judge_success字段决定使用原始内容还是审核后内容：
+    - judge_success为True且prefer_original=False时使用title/content
+    - 其他情况下，若original_title和original_content同时存在则使用原始内容，否则回退到title/content
+    
+    Args:
+        json_path: JSON文件路径
+        prefer_original: 是否优先使用原始内容，无视judge_success结果（仅当original_title和original_content同时存在时生效）
     """
     print(f"    - 正在读取 JSON: {json_path}")
     if not os.path.exists(json_path):
@@ -24,10 +32,35 @@ def convert_json_to_txt_content(json_path):
         with open(json_path, 'r', encoding='utf-8') as f_json:
             data = json.load(f_json)
 
-        # 提取字段
-        title = data.get('title', '未找到标题')
-        content = data.get('content', '未找到内容')
-        tags = data.get('tags', data.get('tag', '未找到标签'))
+        # 根据judge_success选择标题和内容
+        judge_success = data.get('judge_success', None)
+        
+        if prefer_original and 'original_title' in data and 'original_content' in data:
+            # 优先使用原始内容
+            title = data.get('original_title', '未找到原始标题')
+            content = data.get('original_content', '未找到原始内容')
+            print(f"    - 优先使用原始内容 (prefer_original=True)")
+        elif judge_success is True and not prefer_original:
+            # 使用审核后的内容
+            title = data.get('title', '未找到标题')
+            content = data.get('content', '未找到内容')
+            print(f"    - 使用审核后内容 (judge_success=True)")
+        elif 'original_title' in data and 'original_content' in data:
+            # 使用原始内容
+            title = data.get('original_title', '未找到原始标题')
+            content = data.get('original_content', '未找到原始内容')
+            print(f"    - 使用原始内容 (judge_success={judge_success})")
+        else:
+            # 若无original字段，使用常规字段
+            title = data.get('title', '未找到标题')
+            content = data.get('content', '未找到内容')
+            print(f"    - 使用常规内容 (无judge结果)")
+        
+        # 解决tag/tags字段重复问题：优先使用tags字段，tags缺失或为空时回退到旧的tag字段
+        tags = data.get('tags', '')
+        if not tags and 'tag' in data:
+            tags = data.get('tag', '未找到标签')
+            print(f"    - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)")
         
         # 移除Markdown格式
         content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
@@ -74,7 +107,7 @@ def load_topic_data(source_dir, run_id):
     
     return topic_data
 
-def process_result_directory(source_dir, output_dir, run_id=None):
+def process_result_directory(source_dir, output_dir, run_id=None, prefer_original=False):
     """
     处理指定的结果目录，提取内容并渲染到输出目录。
     
@@ -82,6 +115,7 @@ def process_result_directory(source_dir, output_dir, run_id=None):
         source_dir: 源目录路径，包含i_j子目录
         output_dir: 输出目录路径
         run_id: 可选的运行ID，如果不提供则使用源目录名
+        prefer_original: 是否优先使用原始内容，无视judge_success结果（文本内容仅在original_title和original_content同时存在时才会取自原始字段）
     """
     if not os.path.isdir(source_dir):
         print(f"错误: 源目录不存在: {source_dir}")
@@ -121,7 +155,8 @@ def process_result_directory(source_dir, output_dir, run_id=None):
             "AdditionalImagesCount",
             "Status",
             "Details",
-            "JudgeStatus"
+            "JudgeStatus",
+            "ContentSource"
         ]
     ]
     
@@ -177,7 +212,8 @@ def process_result_directory(source_dir, output_dir, run_id=None):
             "AdditionalImagesCount": 0,
             "Status": "Processing",
             "Details": "",
-            "JudgeStatus": ""
+            "JudgeStatus": "",
+            "ContentSource": "unknown"
         }
         
         # 创建输出条目目录
@@ -209,7 +245,7 @@ def process_result_directory(source_dir, output_dir, run_id=None):
             except Exception as e:
                 print(f"  - 错误: 读取article.json失败: {e}")
                 
-            txt_content, error = convert_json_to_txt_content(json_path)
+            txt_content, error = convert_json_to_txt_content(json_path, prefer_original)
             if error:
                 record["Status"] = "Partial"
                 record["Details"] += f"文章处理失败: {error}; "
@@ -219,6 +255,17 @@ def process_result_directory(source_dir, output_dir, run_id=None):
                     with open(txt_path, 'w', encoding='utf-8') as f_txt:
                         f_txt.write(txt_content)
                     print(f"  - 成功写入文本文件: {txt_path}")
+                    
+                    # 记录内容来源（注意：此处独立于convert_json_to_txt_content重新推断，缺少original字段时可能与实际写入的内容不一致）
+                    if prefer_original:
+                        record["ContentSource"] = "original_preferred"
+                    elif article_data.get("judge_success") is True:
+                        record["ContentSource"] = "judged"
+                    elif "original_title" in article_data:
+                        record["ContentSource"] = "original"
+                    else:
+                        record["ContentSource"] = "default"
+                        
                 except Exception as e:
                     record["Status"] = "Partial"
                     record["Details"] += f"写入文本文件失败: {e}; "
@@ -298,30 +345,32 @@ def process_result_directory(source_dir, output_dir, run_id=None):
     print(f"结果保存在: {output_dir}")
 
 def main():
-    # parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录")
-    # parser.add_argument("--source", type=str, help="源目录路径")
-    # parser.add_argument("--output", type=str, help="输出目录路径")
-    # parser.add_argument("--run-id", type=str, help="自定义运行ID")
+    parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录")
+    parser.add_argument("--source", type=str, help="源目录路径")
+    parser.add_argument("--output", type=str, help="输出目录路径")
+    parser.add_argument("--run-id", type=str, help="自定义运行ID")
+    parser.add_argument("--prefer-original", action="store_true", help="优先使用原始内容，忽略审核结果")
     
-    # args = parser.parse_args()
+    args = parser.parse_args()
     
-    # # 默认值设置
-    # source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-11_00-26-30"
-    # output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-11_00-26-30"
-    # run_id = args.run_id if args.run_id else os.path.basename(source)
-    
-    source = "/root/autodl-tmp/TravelContentCreator/result/2025-05-12_09-33-12"
-    output = "/root/autodl-tmp/TravelContentCreator/output/2025-05-12_09-33-12"
-    run_id = os.path.basename(source)
+    # 默认值设置
+    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-12_09-33-12"
+    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-12_09-33-12"
+    run_id = args.run_id if args.run_id else os.path.basename(source)
+    prefer_original = args.prefer_original
     
     print("-" * 60)
     print(f"开始提取和渲染流程")
     print(f"源目录: {source}")
     print(f"输出目录: {output}")
     print(f"运行ID: {run_id}")
+    if prefer_original:
+        print("内容模式: 优先使用原始内容")
+    else:
+        print("内容模式: 根据审核结果选择内容")
     print("-" * 60)
     
-    process_result_directory(source, output, run_id)
+    process_result_directory(source, output, run_id, prefer_original)
     
     print("\n脚本执行完毕.")