Changed how the distribution script reads content

jinye_huang 2025-05-21 09:49:41 +08:00
parent 674082e7d7
commit 881a33786b
2 changed files with 163 additions and 120 deletions

View File

@@ -16,8 +16,8 @@ EMAIL_FROM="zwysendemail@163.com"
 EMAIL_PASSWORD="NMhVGFmCJkGEy3B5"
 # EMAIL_FROM="zowoyomedia@163.com"
 # EMAIL_PASSWORD="SDj5fK6Tk9YevmsD"
-SUBJECT="文旅小红书带货笔记内容0519"
-ZIP_FILENAME="文旅小红书带货笔记内容0519"
+SUBJECT="文旅小红书带货笔记内容0520"
+ZIP_FILENAME="文旅小红书带货笔记内容0520"

 # Distribution settings
 ARTICLE_PER_USER=1
@@ -33,14 +33,14 @@ UNDISTRIBUTED_ONLY=true # Only distribute content that has not been distributed yet

 # Content filter settings
 TARGET_PRODUCT="" # Empty means no product filter
-TARGET_OBJECT="北洛秘境盛季酒店" # Empty means no attraction filter
+TARGET_OBJECT="极爽冲浪馆" # Empty means no attraction filter

 # User filter settings
 TARGET_USER_ID="" # Empty means no user-ID filter
 TARGET_USER_EMAIL="" # Empty means no user-email filter

 # Mandatory attachment settings
-FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt"
+FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt, /root/autodl-tmp/TravelContentCreator/hotel_img/poster/极爽冲浪-门票.jpg"

 # Create required directories
 mkdir -p "$LOG_DIR"
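FORCE_ATTACHMENTS now carries two comma-separated paths instead of one, so whatever consumes the setting has to split on the comma and trim whitespace. A minimal sketch of how such a value could be parsed on the Python side (the helper name parse_force_attachments is illustrative, not something defined in this repository):

import os

def parse_force_attachments(raw: str) -> list:
    """Split a comma-separated attachment setting into individual, trimmed paths."""
    paths = [p.strip() for p in raw.split(",") if p.strip()]
    for p in paths:
        if not os.path.exists(p):
            print(f"Warning: forced attachment not found: {p}")
    return paths

# Value mirroring the new FORCE_ATTACHMENTS above
raw = ("/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt, "
       "/root/autodl-tmp/TravelContentCreator/hotel_img/poster/极爽冲浪-门票.jpg")
print(parse_force_attachments(raw))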

View File

@@ -137,16 +137,15 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
     Read the JSON file, extract the title, content and tags, strip Markdown formatting,
     and return the formatted text.

-    Choose between the original and the reviewed content based on the judge_success field:
-    - If judge_success is True, use title/content (unless prefer_original=True)
-    - If judge_success is False, use original_title/original_content
-    Base64-encoded content is supported:
-    - If title_base64 and content_base64 fields are detected, they take priority
+    Choose what content to use based on the status field in the JSON file:
+    - If judged=True, use the reviewed content
+    - If judged=False, use the original content
+    In every case the base64-encoded fields take priority, because they preserve special characters and line breaks correctly.

     Args:
         json_path: path to the JSON file
-        prefer_original: whether to prefer the original content, ignoring the judge_success result
+        prefer_original: kept for compatibility but no longer used
     """
     print(f" - 正在读取 JSON: {json_path}")
     if not os.path.exists(json_path):
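Stated on its own, the revised rule is: decode whichever *_base64 fields exist, fall back to the plain fields, then pick the reviewed or the original variant according to judged. A minimal sketch of that selection logic, assuming the field names shown in the diff (a condensed illustration, not a drop-in replacement for the function below):

import base64

def pick_fields(data: dict):
    """Return (title, content, tags) according to the judged flag."""
    def read(name):
        # Prefer the *_base64 variant: it survives special characters and newlines.
        if f"{name}_base64" in data:
            try:
                return base64.b64decode(data[f"{name}_base64"]).decode("utf-8")
            except Exception:
                pass  # fall back to the plain field below
        return data.get(name)

    prefix = "" if data.get("judged", False) else "original_"
    return (read(prefix + "title") or "未找到标题",
            read(prefix + "content") or "未找到内容",
            read(prefix + "tags") or "未找到标签")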
@@ -157,110 +156,134 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
         with open(json_path, 'r', encoding='utf-8') as f_json:
             data = json.load(f_json)

-        # Check for base64-encoded content first
+        # Extract the status field
+        judged = data.get('judged', False)
+        print(f" - 文件状态: judged={judged}")
+
+        # Initialize variables
         title = None
         content = None
         original_title = None
         original_content = None
         tags = None
         original_tags = None

-        # Try to read the content from the base64 fields
-        try:
-            # Prefer the base64-encoded content
-            if "title_base64" in data:
-                title = base64.b64decode(data["title_base64"]).decode('utf-8')
-                print(f" - 成功从base64解码标题")
-
-            if "content_base64" in data:
-                content = base64.b64decode(data["content_base64"]).decode('utf-8')
-                print(f" - 成功从base64解码内容")
-
-            if "tags_base64" in data:
-                tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
-                print(f" - 成功从base64解码标签")
-            elif "tags" in data:
-                tags = data.get("tags", "")
-            elif "tag" in data:
-                tags = data.get("tag", "")
-
-            # Check for base64 versions of the original content
-            if "original_title_base64" in data:
-                original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
-            if "original_content_base64" in data:
-                original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
-            if "original_tags_base64" in data:
-                original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
-            elif "original_tags" in data:
-                original_tags = data.get("original_tags", "")
-
-            # If prefer_original is True and original content exists, use the original content
-            if prefer_original and original_title and original_content:
-                title = original_title
-                content = original_content
-                tags = original_tags if original_tags else tags
-                print(f" - 使用解码后的原始内容 (prefer_original=True)")
-        except Exception as e:
-            print(f" - 警告: base64解码失败: {e},将尝试使用普通字段")
-            title = None
-            content = None
-
-        # If base64 decoding failed or there are no base64 fields, use the original logic
-        if title is None or content is None:
-            # Choose the title and content according to judge_success
-            judge_success = data.get('judge_success', None)
-
-            if prefer_original and 'original_title' in data and 'original_content' in data:
-                # Prefer the original content
-                title = data.get('original_title', '未找到原始标题')
-                content = data.get('original_content', '未找到原始内容')
-                # Prefer the original tags
-                tags = data.get('original_tags', data.get('tags', '未找到标签'))
-                print(f" - 优先使用原始内容 (prefer_original=True)")
-            elif judge_success is True and not prefer_original:
-                # Use the reviewed content
-                title = data.get('title', '未找到标题')
-                content = data.get('content', '未找到内容')
-                tags = data.get('tags', '未找到标签')
-                print(f" - 使用审核后内容 (judge_success=True)")
-            elif 'original_title' in data and 'original_content' in data:
-                # Use the original content
-                title = data.get('original_title', '未找到原始标题')
-                content = data.get('original_content', '未找到原始内容')
-                # Prefer the original tags
-                tags = data.get('original_tags', data.get('tags', '未找到标签'))
-                print(f" - 使用原始内容 (judge_success={judge_success})")
-            else:
-                # No original_* fields: use the regular fields
-                title = data.get('title', '未找到标题')
-                content = data.get('content', '未找到内容')
-                tags = data.get('tags', '未找到标签')
-                print(f" - 使用常规内容 (无judge结果)")
-
-        # Work around the duplicated tag/tags fields: per the corrected logic, only use the tags field
-        if not tags and 'tag' in data:
-            tags = data.get('tag', '未找到标签')
-            print(f" - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)")
+        # ================= Decode all possible base64 fields =================
+        # Decode the title and content fields
+        if "title_base64" in data:
+            try:
+                title = base64.b64decode(data["title_base64"]).decode('utf-8')
+                print(f" - 成功从base64解码审核后标题")
+            except Exception as e:
+                print(f" - 警告: 标题base64解码失败: {e}")
+
+        if "content_base64" in data:
+            try:
+                content = base64.b64decode(data["content_base64"]).decode('utf-8')
+                print(f" - 成功从base64解码审核后内容")
+            except Exception as e:
+                print(f" - 警告: 内容base64解码失败: {e}")
+
+        # Decode the original title and content fields
+        if "original_title_base64" in data:
+            try:
+                original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
+                print(f" - 成功从base64解码原始标题")
+            except Exception as e:
+                print(f" - 警告: 原始标题base64解码失败: {e}")
+
+        if "original_content_base64" in data:
+            try:
+                original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
+                print(f" - 成功从base64解码原始内容")
+            except Exception as e:
+                print(f" - 警告: 原始内容base64解码失败: {e}")
+
+        # Decode the tag fields
+        if "tags_base64" in data:
+            try:
+                tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
+                print(f" - 成功从base64解码审核后标签")
+            except Exception as e:
+                print(f" - 警告: 标签base64解码失败: {e}")
+
+        if "original_tags_base64" in data:
+            try:
+                original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
+                print(f" - 成功从base64解码原始标签")
+            except Exception as e:
+                print(f" - 警告: 原始标签base64解码失败: {e}")
+
+        # ================= Fall back to the non-base64 fields =================
+        # If base64 decoding failed, try the plain fields
+        if title is None and "title" in data:
+            title = data["title"]
+            print(f" - 使用普通字段标题")
+
+        if content is None and "content" in data:
+            content = data["content"]
+            print(f" - 使用普通字段内容")
+
+        if original_title is None and "original_title" in data:
+            original_title = data["original_title"]
+            print(f" - 使用普通字段原始标题")
+
+        if original_content is None and "original_content" in data:
+            original_content = data["original_content"]
+            print(f" - 使用普通字段原始内容")
+
+        if tags is None and "tags" in data:
+            tags = data["tags"]
+            print(f" - 使用普通字段标签")
+        elif tags is None and "tag" in data:
+            tags = data["tag"]
+            print(f" - 使用普通tag字段作为标签")
+
+        if original_tags is None and "original_tags" in data:
+            original_tags = data["original_tags"]
+            print(f" - 使用普通字段原始标签")
+
+        # ================= Decide which content to use based on the status field =================
+        final_title = None
+        final_content = None
+        final_tags = None
+
+        # Simplified logic: if reviewed, use the reviewed content; otherwise use the original content
+        if judged:
+            print(f" - 使用审核后内容 (judged=True)")
+            final_title = title
+            final_content = content
+            final_tags = tags
+        else:
+            print(f" - 使用原始内容 (judged=False)")
+            final_title = original_title
+            final_content = original_content
+            final_tags = original_tags
+
+        # Make sure every field has a value
+        final_title = final_title or "未找到标题"
+        final_content = final_content or "未找到内容"
+        final_tags = final_tags or "未找到标签"

         # Remove Markdown formatting but keep line breaks
-        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
+        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', final_content)

-        # Assemble the output text, keeping all line breaks of the original content
-        result = ""
-        if title:
-            result += title + "\n\n"
-        if content_no_format:
-            result += content_no_format
-        if tags and tags != "未找到标签":
-            result += "\n\n" + tags
+        # Assemble the output text, keeping all line breaks of the content
+        result = final_title + "\n\n" + content_no_format
+
+        if final_tags and final_tags != "未找到标签":
+            result += "\n\n" + final_tags
+
+        print(f" - 内容处理完成,最终文本长度: {len(result)} 字符")

         return result, None
     except json.JSONDecodeError:
         print(f" - 错误: JSON 格式无效: {json_path}")
         return None, f"无效的 JSON 格式: {json_path}"
     except Exception as e:
         print(f" - 错误: 处理 JSON 时出错: {e}")
+        traceback.print_exc()
         return None, f"处理 JSON 时出错: {e}"
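All of the decode blocks above follow one pattern: attempt the base64 field, warn on failure, and leave the variable as None so the plain-field fallback can take over. The reason the script stores text as base64 at all is that the encoding round-trips newlines and special characters untouched; a quick self-contained check (the sample text is made up):

import base64
import json

title = "标题第一行\n标题第二行 🌊"  # multi-line text with an emoji
payload = {"title_base64": base64.b64encode(title.encode("utf-8")).decode("ascii")}

restored = base64.b64decode(payload["title_base64"]).decode("utf-8")
assert restored == title  # newlines and special characters survive the round trip
print(json.dumps(payload, ensure_ascii=False))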
 def process_txt_content(txt_path):
@@ -462,17 +485,20 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
                 print(f" - 错误: {record['Details']}")
                 continue

-            # 1. Process article.txt
-            input_txt_path = os.path.join(entry_path, "article.txt")
+            # 1. Process the article content - prefer the JSON file
             output_txt_path = os.path.join(output_entry_path, "article.txt")
             record["OutputTxtPath"] = output_txt_path

-            # Read article.json (only to obtain judge_status)
+            # Read article.json
             json_path = os.path.join(entry_path, "article.json")
             record["ArticleJsonPath"] = json_path
+            content_processed = False

+            # Extract the content from JSON first
             if os.path.exists(json_path):
                 try:
-                    # Extract the review status from the JSON file
                     with open(json_path, 'r', encoding='utf-8') as f_json:
                         article_data = json.load(f_json)
                     # Extract the judge_success status
@@ -480,31 +506,48 @@
                         record["JudgeStatus"] = str(article_data["judge_success"])
                     elif "judged" in article_data:
                         record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核"
+
+                    # Process the JSON file with convert_json_to_txt_content
+                    processed_content, error = convert_json_to_txt_content(json_path, prefer_original)
+                    if error:
+                        print(f" - 警告: 从JSON提取内容失败: {error}")
+                    else:
+                        try:
+                            with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
+                                f_txt.write(processed_content)
+                            print(f" - 成功从JSON提取并写入内容到: {output_txt_path}")
+                            record["ContentSource"] = "json_file"
+                            content_processed = True
+                        except Exception as e:
+                            print(f" - 警告: 写入从JSON提取的内容时出错: {e}")
                 except Exception as e:
-                    print(f" - 警告: 读取article.json失败: {e}")
+                    print(f" - 警告: 处理JSON文件时出错: {e}")

-            # Process the article.txt file
-            if os.path.exists(input_txt_path):
-                processed_content, error = process_txt_content(input_txt_path)
-                if error:
-                    record["Status"] = "Partial"
-                    record["Details"] += f"文章处理失败: {error}; "
-                    print(f" - 错误: {record['Details']}")
-                else:
-                    try:
-                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
-                            f_txt.write(processed_content)
-                        print(f" - 成功写入处理后的文本文件: {output_txt_path}")
-                        record["ContentSource"] = "txt_file"
-                    except Exception as e:
-                        record["Status"] = "Partial"
-                        record["Details"] += f"写入文本文件失败: {e}; "
-                        print(f" - 错误: {record['Details']}")
-            else:
-                record["Status"] = "Partial"
-                record["Details"] += "文章TXT文件不存在; "
-                print(f" - 警告: {record['Details']}")
+            # If extracting content from JSON failed, fall back to the existing TXT file
+            if not content_processed:
+                input_txt_path = os.path.join(entry_path, "article.txt")
+                if os.path.exists(input_txt_path):
+                    processed_content, error = process_txt_content(input_txt_path)
+                    if error:
+                        record["Status"] = "Partial"
+                        record["Details"] += f"文章处理失败: {error}; "
+                        print(f" - 错误: {record['Details']}")
+                    else:
+                        try:
+                            with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
+                                f_txt.write(processed_content)
+                            print(f" - 成功写入处理后的文本文件: {output_txt_path}")
+                            record["ContentSource"] = "txt_file"
+                            content_processed = True
+                        except Exception as e:
+                            record["Status"] = "Partial"
+                            record["Details"] += f"写入文本文件失败: {e}; "
+                            print(f" - 错误: {record['Details']}")
+                else:
+                    record["Status"] = "Partial"
+                    record["Details"] += "无法从JSON或TXT获取内容; "
+                    print(f" - 警告: {record['Details']}")

             # 2. Process the poster images
             poster_dir = os.path.join(entry_path, "poster")
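The reworked article step boils down to: try article.json first, set content_processed on success, and only then fall back to article.txt; if neither works, the record is marked Partial. A condensed sketch of that fallback flow, assuming convert_json_to_txt_content and process_txt_content from this script are in scope:

import os

def extract_article(entry_path, output_txt_path, prefer_original=False):
    """Write the article text and return its source: 'json_file', 'txt_file' or None."""
    json_path = os.path.join(entry_path, "article.json")
    if os.path.exists(json_path):
        text, error = convert_json_to_txt_content(json_path, prefer_original)
        if text and not error:
            with open(output_txt_path, "w", encoding="utf-8") as f:
                f.write(text)
            return "json_file"

    txt_path = os.path.join(entry_path, "article.txt")
    if os.path.exists(txt_path):
        text, error = process_txt_content(txt_path)
        if text and not error:
            with open(output_txt_path, "w", encoding="utf-8") as f:
                f.write(text)
            return "txt_file"

    return None  # caller marks the record as Partial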
@@ -627,8 +670,8 @@ def main():
     args = parser.parse_args()

     # Default values
-    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-19_17-51-07"
-    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-19_17-51-07"
+    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-20_15-37-25"
+    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-20_15-37-25"
     run_id = args.run_id if args.run_id else os.path.basename(source)
     prefer_original = args.prefer_original
     db_path = args.db_path if args.db_path else '/root/autodl-tmp/TravelContentCreator/distribution.db'
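Since run_id falls back to the basename of the source directory, the default run id after this commit is simply the new timestamp folder name; a quick check of that derivation:

import os

source = "/root/autodl-tmp/TravelContentCreator/result/2025-05-20_15-37-25"
print(os.path.basename(source))  # -> 2025-05-20_15-37-25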