From 2c39d981f4eea979f390e06dd8a6d5530fbac74b Mon Sep 17 00:00:00 2001 From: jinye_huang Date: Mon, 12 May 2025 15:43:32 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E7=BB=93=E6=9E=9C?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E7=9A=84=E6=B8=B2=E6=9F=93=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/extract_and_render.py | 95 ++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 23 deletions(-) diff --git a/scripts/extract_and_render.py b/scripts/extract_and_render.py index 9f34134..0dcf16d 100644 --- a/scripts/extract_and_render.py +++ b/scripts/extract_and_render.py @@ -10,10 +10,18 @@ import re import argparse from datetime import datetime -def convert_json_to_txt_content(json_path): +def convert_json_to_txt_content(json_path, prefer_original=False): """ 读取 JSON 文件,提取标题、内容和标签,移除 Markdown 格式, 并返回格式化文本。 + + 根据judge_success字段决定使用原始内容还是审核后内容: + - judge_success为True时使用title/content(除非prefer_original=True) + - judge_success为False时使用original_title/original_content + + Args: + json_path: JSON文件路径 + prefer_original: 是否优先使用原始内容,无视judge_success结果 """ print(f" - 正在读取 JSON: {json_path}") if not os.path.exists(json_path): @@ -24,10 +32,35 @@ def convert_json_to_txt_content(json_path): with open(json_path, 'r', encoding='utf-8') as f_json: data = json.load(f_json) - # 提取字段 - title = data.get('title', '未找到标题') - content = data.get('content', '未找到内容') - tags = data.get('tags', data.get('tag', '未找到标签')) + # 根据judge_success选择标题和内容 + judge_success = data.get('judge_success', None) + + if prefer_original and 'original_title' in data and 'original_content' in data: + # 优先使用原始内容 + title = data.get('original_title', '未找到原始标题') + content = data.get('original_content', '未找到原始内容') + print(f" - 优先使用原始内容 (prefer_original=True)") + elif judge_success is True and not prefer_original: + # 使用审核后的内容 + title = data.get('title', '未找到标题') + content = data.get('content', '未找到内容') + print(f" - 使用审核后内容 (judge_success=True)") + elif 'original_title' in data and 'original_content' in data: + # 使用原始内容 + title = data.get('original_title', '未找到原始标题') + content = data.get('original_content', '未找到原始内容') + print(f" - 使用原始内容 (judge_success={judge_success})") + else: + # 若无original字段,使用常规字段 + title = data.get('title', '未找到标题') + content = data.get('content', '未找到内容') + print(f" - 使用常规内容 (无judge结果)") + + # 解决tag/tags字段重复问题,按照修正后的处理逻辑,只使用tags字段 + tags = data.get('tags', '') + if not tags and 'tag' in data: + tags = data.get('tag', '未找到标签') + print(f" - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)") # 移除Markdown格式 content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content) @@ -74,7 +107,7 @@ def load_topic_data(source_dir, run_id): return topic_data -def process_result_directory(source_dir, output_dir, run_id=None): +def process_result_directory(source_dir, output_dir, run_id=None, prefer_original=False): """ 处理指定的结果目录,提取内容并渲染到输出目录。 @@ -82,6 +115,7 @@ def process_result_directory(source_dir, output_dir, run_id=None): source_dir: 源目录路径,包含i_j子目录 output_dir: 输出目录路径 run_id: 可选的运行ID,如果不提供则使用源目录名 + prefer_original: 是否优先使用原始内容,无视judge_success结果 """ if not os.path.isdir(source_dir): print(f"错误: 源目录不存在: {source_dir}") @@ -121,7 +155,8 @@ def process_result_directory(source_dir, output_dir, run_id=None): "AdditionalImagesCount", "Status", "Details", - "JudgeStatus" + "JudgeStatus", + "ContentSource" ] ] @@ -177,7 +212,8 @@ def process_result_directory(source_dir, output_dir, run_id=None): "AdditionalImagesCount": 0, "Status": "Processing", "Details": "", - "JudgeStatus": "" + "JudgeStatus": "", + "ContentSource": "unknown" } # 创建输出条目目录 @@ -209,7 +245,7 @@ def process_result_directory(source_dir, output_dir, run_id=None): except Exception as e: print(f" - 错误: 读取article.json失败: {e}") - txt_content, error = convert_json_to_txt_content(json_path) + txt_content, error = convert_json_to_txt_content(json_path, prefer_original) if error: record["Status"] = "Partial" record["Details"] += f"文章处理失败: {error}; " @@ -219,6 +255,17 @@ def process_result_directory(source_dir, output_dir, run_id=None): with open(txt_path, 'w', encoding='utf-8') as f_txt: f_txt.write(txt_content) print(f" - 成功写入文本文件: {txt_path}") + + # 记录内容来源 + if prefer_original: + record["ContentSource"] = "original_preferred" + elif article_data.get("judge_success") is True: + record["ContentSource"] = "judged" + elif "original_title" in article_data: + record["ContentSource"] = "original" + else: + record["ContentSource"] = "default" + except Exception as e: record["Status"] = "Partial" record["Details"] += f"写入文本文件失败: {e}; " @@ -298,30 +345,32 @@ def process_result_directory(source_dir, output_dir, run_id=None): print(f"结果保存在: {output_dir}") def main(): - # parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录") - # parser.add_argument("--source", type=str, help="源目录路径") - # parser.add_argument("--output", type=str, help="输出目录路径") - # parser.add_argument("--run-id", type=str, help="自定义运行ID") + parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录") + parser.add_argument("--source", type=str, help="源目录路径") + parser.add_argument("--output", type=str, help="输出目录路径") + parser.add_argument("--run-id", type=str, help="自定义运行ID") + parser.add_argument("--prefer-original", action="store_true", help="优先使用原始内容,忽略审核结果") - # args = parser.parse_args() + args = parser.parse_args() - # # 默认值设置 - # source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-11_00-26-30" - # output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-11_00-26-30" - # run_id = args.run_id if args.run_id else os.path.basename(source) - - source = "/root/autodl-tmp/TravelContentCreator/result/2025-05-12_09-33-12" - output = "/root/autodl-tmp/TravelContentCreator/output/2025-05-12_09-33-12" - run_id = os.path.basename(source) + # 默认值设置 + source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-12_09-33-12" + output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-12_09-33-12" + run_id = args.run_id if args.run_id else os.path.basename(source) + prefer_original = args.prefer_original print("-" * 60) print(f"开始提取和渲染流程") print(f"源目录: {source}") print(f"输出目录: {output}") print(f"运行ID: {run_id}") + if prefer_original: + print("内容模式: 优先使用原始内容") + else: + print("内容模式: 根据审核结果选择内容") print("-" * 60) - process_result_directory(source, output, run_id) + process_result_directory(source, output, run_id, prefer_original) print("\n脚本执行完毕.")