修改了结果文本的渲染方法
This commit is contained in:
parent
6415966034
commit
2c39d981f4
@ -10,10 +10,18 @@ import re
|
|||||||
import argparse
|
import argparse
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
def convert_json_to_txt_content(json_path):
|
def convert_json_to_txt_content(json_path, prefer_original=False):
|
||||||
"""
|
"""
|
||||||
读取 JSON 文件,提取标题、内容和标签,移除 Markdown 格式,
|
读取 JSON 文件,提取标题、内容和标签,移除 Markdown 格式,
|
||||||
并返回格式化文本。
|
并返回格式化文本。
|
||||||
|
|
||||||
|
根据judge_success字段决定使用原始内容还是审核后内容:
|
||||||
|
- judge_success为True时使用title/content(除非prefer_original=True)
|
||||||
|
- judge_success为False时使用original_title/original_content
|
||||||
|
|
||||||
|
Args:
|
||||||
|
json_path: JSON文件路径
|
||||||
|
prefer_original: 是否优先使用原始内容,无视judge_success结果
|
||||||
"""
|
"""
|
||||||
print(f" - 正在读取 JSON: {json_path}")
|
print(f" - 正在读取 JSON: {json_path}")
|
||||||
if not os.path.exists(json_path):
|
if not os.path.exists(json_path):
|
||||||
@ -24,10 +32,35 @@ def convert_json_to_txt_content(json_path):
|
|||||||
with open(json_path, 'r', encoding='utf-8') as f_json:
|
with open(json_path, 'r', encoding='utf-8') as f_json:
|
||||||
data = json.load(f_json)
|
data = json.load(f_json)
|
||||||
|
|
||||||
# 提取字段
|
# 根据judge_success选择标题和内容
|
||||||
title = data.get('title', '未找到标题')
|
judge_success = data.get('judge_success', None)
|
||||||
content = data.get('content', '未找到内容')
|
|
||||||
tags = data.get('tags', data.get('tag', '未找到标签'))
|
if prefer_original and 'original_title' in data and 'original_content' in data:
|
||||||
|
# 优先使用原始内容
|
||||||
|
title = data.get('original_title', '未找到原始标题')
|
||||||
|
content = data.get('original_content', '未找到原始内容')
|
||||||
|
print(f" - 优先使用原始内容 (prefer_original=True)")
|
||||||
|
elif judge_success is True and not prefer_original:
|
||||||
|
# 使用审核后的内容
|
||||||
|
title = data.get('title', '未找到标题')
|
||||||
|
content = data.get('content', '未找到内容')
|
||||||
|
print(f" - 使用审核后内容 (judge_success=True)")
|
||||||
|
elif 'original_title' in data and 'original_content' in data:
|
||||||
|
# 使用原始内容
|
||||||
|
title = data.get('original_title', '未找到原始标题')
|
||||||
|
content = data.get('original_content', '未找到原始内容')
|
||||||
|
print(f" - 使用原始内容 (judge_success={judge_success})")
|
||||||
|
else:
|
||||||
|
# 若无original字段,使用常规字段
|
||||||
|
title = data.get('title', '未找到标题')
|
||||||
|
content = data.get('content', '未找到内容')
|
||||||
|
print(f" - 使用常规内容 (无judge结果)")
|
||||||
|
|
||||||
|
# 解决tag/tags字段重复问题,按照修正后的处理逻辑,只使用tags字段
|
||||||
|
tags = data.get('tags', '')
|
||||||
|
if not tags and 'tag' in data:
|
||||||
|
tags = data.get('tag', '未找到标签')
|
||||||
|
print(f" - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)")
|
||||||
|
|
||||||
# 移除Markdown格式
|
# 移除Markdown格式
|
||||||
content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
|
content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
|
||||||
@ -74,7 +107,7 @@ def load_topic_data(source_dir, run_id):
|
|||||||
|
|
||||||
return topic_data
|
return topic_data
|
||||||
|
|
||||||
def process_result_directory(source_dir, output_dir, run_id=None):
|
def process_result_directory(source_dir, output_dir, run_id=None, prefer_original=False):
|
||||||
"""
|
"""
|
||||||
处理指定的结果目录,提取内容并渲染到输出目录。
|
处理指定的结果目录,提取内容并渲染到输出目录。
|
||||||
|
|
||||||
@ -82,6 +115,7 @@ def process_result_directory(source_dir, output_dir, run_id=None):
|
|||||||
source_dir: 源目录路径,包含i_j子目录
|
source_dir: 源目录路径,包含i_j子目录
|
||||||
output_dir: 输出目录路径
|
output_dir: 输出目录路径
|
||||||
run_id: 可选的运行ID,如果不提供则使用源目录名
|
run_id: 可选的运行ID,如果不提供则使用源目录名
|
||||||
|
prefer_original: 是否优先使用原始内容,无视judge_success结果
|
||||||
"""
|
"""
|
||||||
if not os.path.isdir(source_dir):
|
if not os.path.isdir(source_dir):
|
||||||
print(f"错误: 源目录不存在: {source_dir}")
|
print(f"错误: 源目录不存在: {source_dir}")
|
||||||
@ -121,7 +155,8 @@ def process_result_directory(source_dir, output_dir, run_id=None):
|
|||||||
"AdditionalImagesCount",
|
"AdditionalImagesCount",
|
||||||
"Status",
|
"Status",
|
||||||
"Details",
|
"Details",
|
||||||
"JudgeStatus"
|
"JudgeStatus",
|
||||||
|
"ContentSource"
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -177,7 +212,8 @@ def process_result_directory(source_dir, output_dir, run_id=None):
|
|||||||
"AdditionalImagesCount": 0,
|
"AdditionalImagesCount": 0,
|
||||||
"Status": "Processing",
|
"Status": "Processing",
|
||||||
"Details": "",
|
"Details": "",
|
||||||
"JudgeStatus": ""
|
"JudgeStatus": "",
|
||||||
|
"ContentSource": "unknown"
|
||||||
}
|
}
|
||||||
|
|
||||||
# 创建输出条目目录
|
# 创建输出条目目录
|
||||||
@ -209,7 +245,7 @@ def process_result_directory(source_dir, output_dir, run_id=None):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" - 错误: 读取article.json失败: {e}")
|
print(f" - 错误: 读取article.json失败: {e}")
|
||||||
|
|
||||||
txt_content, error = convert_json_to_txt_content(json_path)
|
txt_content, error = convert_json_to_txt_content(json_path, prefer_original)
|
||||||
if error:
|
if error:
|
||||||
record["Status"] = "Partial"
|
record["Status"] = "Partial"
|
||||||
record["Details"] += f"文章处理失败: {error}; "
|
record["Details"] += f"文章处理失败: {error}; "
|
||||||
@ -219,6 +255,17 @@ def process_result_directory(source_dir, output_dir, run_id=None):
|
|||||||
with open(txt_path, 'w', encoding='utf-8') as f_txt:
|
with open(txt_path, 'w', encoding='utf-8') as f_txt:
|
||||||
f_txt.write(txt_content)
|
f_txt.write(txt_content)
|
||||||
print(f" - 成功写入文本文件: {txt_path}")
|
print(f" - 成功写入文本文件: {txt_path}")
|
||||||
|
|
||||||
|
# 记录内容来源
|
||||||
|
if prefer_original:
|
||||||
|
record["ContentSource"] = "original_preferred"
|
||||||
|
elif article_data.get("judge_success") is True:
|
||||||
|
record["ContentSource"] = "judged"
|
||||||
|
elif "original_title" in article_data:
|
||||||
|
record["ContentSource"] = "original"
|
||||||
|
else:
|
||||||
|
record["ContentSource"] = "default"
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
record["Status"] = "Partial"
|
record["Status"] = "Partial"
|
||||||
record["Details"] += f"写入文本文件失败: {e}; "
|
record["Details"] += f"写入文本文件失败: {e}; "
|
||||||
@ -298,30 +345,32 @@ def process_result_directory(source_dir, output_dir, run_id=None):
|
|||||||
print(f"结果保存在: {output_dir}")
|
print(f"结果保存在: {output_dir}")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录")
|
parser = argparse.ArgumentParser(description="从TravelContentCreator结果目录提取内容并渲染到指定目录")
|
||||||
# parser.add_argument("--source", type=str, help="源目录路径")
|
parser.add_argument("--source", type=str, help="源目录路径")
|
||||||
# parser.add_argument("--output", type=str, help="输出目录路径")
|
parser.add_argument("--output", type=str, help="输出目录路径")
|
||||||
# parser.add_argument("--run-id", type=str, help="自定义运行ID")
|
parser.add_argument("--run-id", type=str, help="自定义运行ID")
|
||||||
|
parser.add_argument("--prefer-original", action="store_true", help="优先使用原始内容,忽略审核结果")
|
||||||
|
|
||||||
# args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# # 默认值设置
|
# 默认值设置
|
||||||
# source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-11_00-26-30"
|
source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-12_09-33-12"
|
||||||
# output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-11_00-26-30"
|
output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-12_09-33-12"
|
||||||
# run_id = args.run_id if args.run_id else os.path.basename(source)
|
run_id = args.run_id if args.run_id else os.path.basename(source)
|
||||||
|
prefer_original = args.prefer_original
|
||||||
source = "/root/autodl-tmp/TravelContentCreator/result/2025-05-12_09-33-12"
|
|
||||||
output = "/root/autodl-tmp/TravelContentCreator/output/2025-05-12_09-33-12"
|
|
||||||
run_id = os.path.basename(source)
|
|
||||||
|
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
print(f"开始提取和渲染流程")
|
print(f"开始提取和渲染流程")
|
||||||
print(f"源目录: {source}")
|
print(f"源目录: {source}")
|
||||||
print(f"输出目录: {output}")
|
print(f"输出目录: {output}")
|
||||||
print(f"运行ID: {run_id}")
|
print(f"运行ID: {run_id}")
|
||||||
|
if prefer_original:
|
||||||
|
print("内容模式: 优先使用原始内容")
|
||||||
|
else:
|
||||||
|
print("内容模式: 根据审核结果选择内容")
|
||||||
print("-" * 60)
|
print("-" * 60)
|
||||||
|
|
||||||
process_result_directory(source, output, run_id)
|
process_result_directory(source, output, run_id, prefer_original)
|
||||||
|
|
||||||
print("\n脚本执行完毕.")
|
print("\n脚本执行完毕.")
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user