修改了分发的读取方式
This commit is contained in:
parent
674082e7d7
commit
881a33786b
@ -16,8 +16,8 @@ EMAIL_FROM="zwysendemail@163.com"
|
|||||||
EMAIL_PASSWORD="NMhVGFmCJkGEy3B5"
|
EMAIL_PASSWORD="NMhVGFmCJkGEy3B5"
|
||||||
# EMAIL_FROM="zowoyomedia@163.com"
|
# EMAIL_FROM="zowoyomedia@163.com"
|
||||||
# EMAIL_PASSWORD="SDj5fK6Tk9YevmsD"
|
# EMAIL_PASSWORD="SDj5fK6Tk9YevmsD"
|
||||||
SUBJECT="文旅小红书带货笔记内容0519"
|
SUBJECT="文旅小红书带货笔记内容0520"
|
||||||
ZIP_FILENAME="文旅小红书带货笔记内容0519"
|
ZIP_FILENAME="文旅小红书带货笔记内容0520"
|
||||||
|
|
||||||
# 设置分发配置
|
# 设置分发配置
|
||||||
ARTICLE_PER_USER=1
|
ARTICLE_PER_USER=1
|
||||||
@ -33,14 +33,14 @@ UNDISTRIBUTED_ONLY=true # 只分发未分发的内容
|
|||||||
|
|
||||||
# 内容筛选配置
|
# 内容筛选配置
|
||||||
TARGET_PRODUCT="" # 为空则不筛选特定产品
|
TARGET_PRODUCT="" # 为空则不筛选特定产品
|
||||||
TARGET_OBJECT="北洛秘境盛季酒店" # 为空则不筛选特定景点
|
TARGET_OBJECT="极爽冲浪馆" # 为空则不筛选特定景点
|
||||||
|
|
||||||
# 用户筛选配置
|
# 用户筛选配置
|
||||||
TARGET_USER_ID="" # 为空则不筛选特定用户ID
|
TARGET_USER_ID="" # 为空则不筛选特定用户ID
|
||||||
TARGET_USER_EMAIL="" # 为空则不筛选特定用户邮箱
|
TARGET_USER_EMAIL="" # 为空则不筛选特定用户邮箱
|
||||||
|
|
||||||
# 强制性附件配置
|
# 强制性附件配置
|
||||||
FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt"
|
FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt, /root/autodl-tmp/TravelContentCreator/hotel_img/poster/极爽冲浪-门票.jpg"
|
||||||
|
|
||||||
# 创建必要的目录
|
# 创建必要的目录
|
||||||
mkdir -p "$LOG_DIR"
|
mkdir -p "$LOG_DIR"
|
||||||
|
|||||||
@ -137,16 +137,15 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
|
|||||||
读取 JSON 文件,提取标题、内容和标签,移除 Markdown 格式,
|
读取 JSON 文件,提取标题、内容和标签,移除 Markdown 格式,
|
||||||
并返回格式化文本。
|
并返回格式化文本。
|
||||||
|
|
||||||
根据judge_success字段决定使用原始内容还是审核后内容:
|
根据JSON文件中的状态字段决定使用什么内容:
|
||||||
- judge_success为True时使用title/content(除非prefer_original=True)
|
- 如果judged=True,使用审核后内容
|
||||||
- judge_success为False时使用original_title/original_content
|
- 如果judged=False,使用原始内容
|
||||||
|
|
||||||
支持base64编码的内容:
|
所有情况都优先使用base64编码的字段,因为这些字段能正确保留特殊字符和换行符
|
||||||
- 如果检测到title_base64和content_base64字段,将优先使用这些字段
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
json_path: JSON文件路径
|
json_path: JSON文件路径
|
||||||
prefer_original: 是否优先使用原始内容,无视judge_success结果
|
prefer_original: 参数保留但不再使用
|
||||||
"""
|
"""
|
||||||
print(f" - 正在读取 JSON: {json_path}")
|
print(f" - 正在读取 JSON: {json_path}")
|
||||||
if not os.path.exists(json_path):
|
if not os.path.exists(json_path):
|
||||||
@ -157,110 +156,134 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
|
|||||||
with open(json_path, 'r', encoding='utf-8') as f_json:
|
with open(json_path, 'r', encoding='utf-8') as f_json:
|
||||||
data = json.load(f_json)
|
data = json.load(f_json)
|
||||||
|
|
||||||
# 优先检查是否有base64编码的内容
|
# 提取状态字段
|
||||||
|
judged = data.get('judged', False)
|
||||||
|
|
||||||
|
print(f" - 文件状态: judged={judged}")
|
||||||
|
|
||||||
|
# 初始化变量
|
||||||
title = None
|
title = None
|
||||||
content = None
|
content = None
|
||||||
original_title = None
|
original_title = None
|
||||||
original_content = None
|
original_content = None
|
||||||
tags = None
|
tags = None
|
||||||
|
original_tags = None
|
||||||
|
|
||||||
# 尝试从base64字段获取内容
|
# =================解码所有可能的base64字段=================
|
||||||
try:
|
# 解码标题和内容字段
|
||||||
# 优先使用base64编码的内容
|
if "title_base64" in data:
|
||||||
if "title_base64" in data:
|
try:
|
||||||
title = base64.b64decode(data["title_base64"]).decode('utf-8')
|
title = base64.b64decode(data["title_base64"]).decode('utf-8')
|
||||||
print(f" - 成功从base64解码标题")
|
print(f" - 成功从base64解码审核后标题")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - 警告: 标题base64解码失败: {e}")
|
||||||
|
|
||||||
if "content_base64" in data:
|
if "content_base64" in data:
|
||||||
|
try:
|
||||||
content = base64.b64decode(data["content_base64"]).decode('utf-8')
|
content = base64.b64decode(data["content_base64"]).decode('utf-8')
|
||||||
print(f" - 成功从base64解码内容")
|
print(f" - 成功从base64解码审核后内容")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - 警告: 内容base64解码失败: {e}")
|
||||||
|
|
||||||
if "tags_base64" in data:
|
# 解码原始标题和内容字段
|
||||||
tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
|
if "original_title_base64" in data:
|
||||||
print(f" - 成功从base64解码标签")
|
try:
|
||||||
elif "tags" in data:
|
|
||||||
tags = data.get("tags", "")
|
|
||||||
elif "tag" in data:
|
|
||||||
tags = data.get("tag", "")
|
|
||||||
|
|
||||||
# 检查是否有原始内容的base64
|
|
||||||
if "original_title_base64" in data:
|
|
||||||
original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
|
original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
|
||||||
|
print(f" - 成功从base64解码原始标题")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - 警告: 原始标题base64解码失败: {e}")
|
||||||
|
|
||||||
if "original_content_base64" in data:
|
if "original_content_base64" in data:
|
||||||
|
try:
|
||||||
original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
|
original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
|
||||||
|
print(f" - 成功从base64解码原始内容")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - 警告: 原始内容base64解码失败: {e}")
|
||||||
|
|
||||||
if "original_tags_base64" in data:
|
# 解码标签字段
|
||||||
|
if "tags_base64" in data:
|
||||||
|
try:
|
||||||
|
tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
|
||||||
|
print(f" - 成功从base64解码审核后标签")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - 警告: 标签base64解码失败: {e}")
|
||||||
|
|
||||||
|
if "original_tags_base64" in data:
|
||||||
|
try:
|
||||||
original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
|
original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
|
||||||
elif "original_tags" in data:
|
print(f" - 成功从base64解码原始标签")
|
||||||
original_tags = data.get("original_tags", "")
|
except Exception as e:
|
||||||
|
print(f" - 警告: 原始标签base64解码失败: {e}")
|
||||||
|
|
||||||
# 如果prefer_original为True且有原始内容,使用原始内容
|
# =================回退到非base64字段=================
|
||||||
if prefer_original and original_title and original_content:
|
# 如果base64解码失败,尝试使用普通字段
|
||||||
title = original_title
|
if title is None and "title" in data:
|
||||||
content = original_content
|
title = data["title"]
|
||||||
tags = original_tags if original_tags else tags
|
print(f" - 使用普通字段标题")
|
||||||
print(f" - 使用解码后的原始内容 (prefer_original=True)")
|
|
||||||
except Exception as e:
|
|
||||||
print(f" - 警告: base64解码失败: {e},将尝试使用普通字段")
|
|
||||||
title = None
|
|
||||||
content = None
|
|
||||||
|
|
||||||
# 如果base64解码失败或不存在base64字段,则使用原始逻辑
|
if content is None and "content" in data:
|
||||||
if title is None or content is None:
|
content = data["content"]
|
||||||
# 根据judge_success选择标题和内容
|
print(f" - 使用普通字段内容")
|
||||||
judge_success = data.get('judge_success', None)
|
|
||||||
|
|
||||||
if prefer_original and 'original_title' in data and 'original_content' in data:
|
if original_title is None and "original_title" in data:
|
||||||
# 优先使用原始内容
|
original_title = data["original_title"]
|
||||||
title = data.get('original_title', '未找到原始标题')
|
print(f" - 使用普通字段原始标题")
|
||||||
content = data.get('original_content', '未找到原始内容')
|
|
||||||
# 优先使用原始标签
|
|
||||||
tags = data.get('original_tags', data.get('tags', '未找到标签'))
|
|
||||||
print(f" - 优先使用原始内容 (prefer_original=True)")
|
|
||||||
elif judge_success is True and not prefer_original:
|
|
||||||
# 使用审核后的内容
|
|
||||||
title = data.get('title', '未找到标题')
|
|
||||||
content = data.get('content', '未找到内容')
|
|
||||||
tags = data.get('tags', '未找到标签')
|
|
||||||
print(f" - 使用审核后内容 (judge_success=True)")
|
|
||||||
elif 'original_title' in data and 'original_content' in data:
|
|
||||||
# 使用原始内容
|
|
||||||
title = data.get('original_title', '未找到原始标题')
|
|
||||||
content = data.get('original_content', '未找到原始内容')
|
|
||||||
# 优先使用原始标签
|
|
||||||
tags = data.get('original_tags', data.get('tags', '未找到标签'))
|
|
||||||
print(f" - 使用原始内容 (judge_success={judge_success})")
|
|
||||||
else:
|
|
||||||
# 若无original字段,使用常规字段
|
|
||||||
title = data.get('title', '未找到标题')
|
|
||||||
content = data.get('content', '未找到内容')
|
|
||||||
tags = data.get('tags', '未找到标签')
|
|
||||||
print(f" - 使用常规内容 (无judge结果)")
|
|
||||||
|
|
||||||
# 解决tag/tags字段重复问题,按照修正后的处理逻辑,只使用tags字段
|
if original_content is None and "original_content" in data:
|
||||||
if not tags and 'tag' in data:
|
original_content = data["original_content"]
|
||||||
tags = data.get('tag', '未找到标签')
|
print(f" - 使用普通字段原始内容")
|
||||||
print(f" - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)")
|
|
||||||
|
if tags is None and "tags" in data:
|
||||||
|
tags = data["tags"]
|
||||||
|
print(f" - 使用普通字段标签")
|
||||||
|
elif tags is None and "tag" in data:
|
||||||
|
tags = data["tag"]
|
||||||
|
print(f" - 使用普通tag字段作为标签")
|
||||||
|
|
||||||
|
if original_tags is None and "original_tags" in data:
|
||||||
|
original_tags = data["original_tags"]
|
||||||
|
print(f" - 使用普通字段原始标签")
|
||||||
|
|
||||||
|
# =================根据状态字段决定使用哪些内容=================
|
||||||
|
final_title = None
|
||||||
|
final_content = None
|
||||||
|
final_tags = None
|
||||||
|
|
||||||
|
# 简化逻辑:如果已审核,使用审核后内容;否则使用原始内容
|
||||||
|
if judged:
|
||||||
|
print(f" - 使用审核后内容 (judged=True)")
|
||||||
|
final_title = title
|
||||||
|
final_content = content
|
||||||
|
final_tags = tags
|
||||||
|
else:
|
||||||
|
print(f" - 使用原始内容 (judged=False)")
|
||||||
|
final_title = original_title
|
||||||
|
final_content = original_content
|
||||||
|
final_tags = original_tags
|
||||||
|
|
||||||
|
# 确保所有字段都有值
|
||||||
|
final_title = final_title or "未找到标题"
|
||||||
|
final_content = final_content or "未找到内容"
|
||||||
|
final_tags = final_tags or "未找到标签"
|
||||||
|
|
||||||
# 移除Markdown格式,但保留换行符
|
# 移除Markdown格式,但保留换行符
|
||||||
content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
|
content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', final_content)
|
||||||
|
|
||||||
# 组合输出文本,保留原始内容的所有换行符
|
# 组合输出文本,保留内容的所有换行符
|
||||||
result = ""
|
result = final_title + "\n\n" + content_no_format
|
||||||
if title:
|
|
||||||
result += title + "\n\n"
|
|
||||||
if content_no_format:
|
|
||||||
result += content_no_format
|
|
||||||
if tags and tags != "未找到标签":
|
|
||||||
result += "\n\n" + tags
|
|
||||||
|
|
||||||
|
if final_tags and final_tags != "未找到标签":
|
||||||
|
result += "\n\n" + final_tags
|
||||||
|
|
||||||
|
print(f" - 内容处理完成,最终文本长度: {len(result)} 字符")
|
||||||
return result, None
|
return result, None
|
||||||
|
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
print(f" - 错误: JSON 格式无效: {json_path}")
|
print(f" - 错误: JSON 格式无效: {json_path}")
|
||||||
return None, f"无效的 JSON 格式: {json_path}"
|
return None, f"无效的 JSON 格式: {json_path}"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" - 错误: 处理 JSON 时出错: {e}")
|
print(f" - 错误: 处理 JSON 时出错: {e}")
|
||||||
|
traceback.print_exc()
|
||||||
return None, f"处理 JSON 时出错: {e}"
|
return None, f"处理 JSON 时出错: {e}"
|
||||||
|
|
||||||
def process_txt_content(txt_path):
|
def process_txt_content(txt_path):
|
||||||
@ -462,17 +485,20 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
|
|||||||
print(f" - 错误: {record['Details']}")
|
print(f" - 错误: {record['Details']}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 1. 处理article.txt
|
# 1. 处理article内容 - 优先使用JSON文件
|
||||||
input_txt_path = os.path.join(entry_path, "article.txt")
|
|
||||||
output_txt_path = os.path.join(output_entry_path, "article.txt")
|
output_txt_path = os.path.join(output_entry_path, "article.txt")
|
||||||
record["OutputTxtPath"] = output_txt_path
|
record["OutputTxtPath"] = output_txt_path
|
||||||
|
|
||||||
# 读取article.json,仅用于获取judge_status
|
# 读取article.json
|
||||||
json_path = os.path.join(entry_path, "article.json")
|
json_path = os.path.join(entry_path, "article.json")
|
||||||
record["ArticleJsonPath"] = json_path
|
record["ArticleJsonPath"] = json_path
|
||||||
|
|
||||||
|
content_processed = False
|
||||||
|
|
||||||
|
# 优先从JSON提取内容
|
||||||
if os.path.exists(json_path):
|
if os.path.exists(json_path):
|
||||||
try:
|
try:
|
||||||
|
# 从JSON文件提取审核状态
|
||||||
with open(json_path, 'r', encoding='utf-8') as f_json:
|
with open(json_path, 'r', encoding='utf-8') as f_json:
|
||||||
article_data = json.load(f_json)
|
article_data = json.load(f_json)
|
||||||
# 提取judge_success状态
|
# 提取judge_success状态
|
||||||
@ -480,31 +506,48 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
|
|||||||
record["JudgeStatus"] = str(article_data["judge_success"])
|
record["JudgeStatus"] = str(article_data["judge_success"])
|
||||||
elif "judged" in article_data:
|
elif "judged" in article_data:
|
||||||
record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核"
|
record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核"
|
||||||
|
|
||||||
|
# 使用convert_json_to_txt_content函数处理JSON文件
|
||||||
|
processed_content, error = convert_json_to_txt_content(json_path, prefer_original)
|
||||||
|
|
||||||
|
if error:
|
||||||
|
print(f" - 警告: 从JSON提取内容失败: {error}")
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
|
||||||
|
f_txt.write(processed_content)
|
||||||
|
print(f" - 成功从JSON提取并写入内容到: {output_txt_path}")
|
||||||
|
record["ContentSource"] = "json_file"
|
||||||
|
content_processed = True
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - 警告: 写入从JSON提取的内容时出错: {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" - 警告: 读取article.json失败: {e}")
|
print(f" - 警告: 处理JSON文件时出错: {e}")
|
||||||
|
|
||||||
# 处理article.txt文件
|
# 如果从JSON提取内容失败,尝试使用现有的TXT文件
|
||||||
if os.path.exists(input_txt_path):
|
if not content_processed:
|
||||||
processed_content, error = process_txt_content(input_txt_path)
|
input_txt_path = os.path.join(entry_path, "article.txt")
|
||||||
if error:
|
if os.path.exists(input_txt_path):
|
||||||
record["Status"] = "Partial"
|
processed_content, error = process_txt_content(input_txt_path)
|
||||||
record["Details"] += f"文章处理失败: {error}; "
|
if error:
|
||||||
print(f" - 错误: {record['Details']}")
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
|
|
||||||
f_txt.write(processed_content)
|
|
||||||
print(f" - 成功写入处理后的文本文件: {output_txt_path}")
|
|
||||||
record["ContentSource"] = "txt_file"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
record["Status"] = "Partial"
|
record["Status"] = "Partial"
|
||||||
record["Details"] += f"写入文本文件失败: {e}; "
|
record["Details"] += f"文章处理失败: {error}; "
|
||||||
print(f" - 错误: {record['Details']}")
|
print(f" - 错误: {record['Details']}")
|
||||||
else:
|
else:
|
||||||
record["Status"] = "Partial"
|
try:
|
||||||
record["Details"] += "文章TXT文件不存在; "
|
with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
|
||||||
print(f" - 警告: {record['Details']}")
|
f_txt.write(processed_content)
|
||||||
|
print(f" - 成功写入处理后的文本文件: {output_txt_path}")
|
||||||
|
record["ContentSource"] = "txt_file"
|
||||||
|
content_processed = True
|
||||||
|
except Exception as e:
|
||||||
|
record["Status"] = "Partial"
|
||||||
|
record["Details"] += f"写入文本文件失败: {e}; "
|
||||||
|
print(f" - 错误: {record['Details']}")
|
||||||
|
else:
|
||||||
|
record["Status"] = "Partial"
|
||||||
|
record["Details"] += "无法从JSON或TXT获取内容; "
|
||||||
|
print(f" - 警告: {record['Details']}")
|
||||||
|
|
||||||
# 2. 处理海报图片
|
# 2. 处理海报图片
|
||||||
poster_dir = os.path.join(entry_path, "poster")
|
poster_dir = os.path.join(entry_path, "poster")
|
||||||
@ -627,8 +670,8 @@ def main():
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# 默认值设置
|
# 默认值设置
|
||||||
source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-19_17-51-07"
|
source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-20_15-37-25"
|
||||||
output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-19_17-51-07"
|
output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-20_15-37-25"
|
||||||
run_id = args.run_id if args.run_id else os.path.basename(source)
|
run_id = args.run_id if args.run_id else os.path.basename(source)
|
||||||
prefer_original = args.prefer_original
|
prefer_original = args.prefer_original
|
||||||
db_path = args.db_path if args.db_path else '/root/autodl-tmp/TravelContentCreator/distribution.db'
|
db_path = args.db_path if args.db_path else '/root/autodl-tmp/TravelContentCreator/distribution.db'
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user