From 881a33786ba53e8ada61bd09f183416babd4995a Mon Sep 17 00:00:00 2001 From: jinye_huang Date: Wed, 21 May 2025 09:49:41 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E5=88=86=E5=8F=91?= =?UTF-8?q?=E7=9A=84=E8=AF=BB=E5=8F=96=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/distribution/distribute_example.sh | 8 +- scripts/distribution/extract_and_render.py | 275 ++++++++++++--------- 2 files changed, 163 insertions(+), 120 deletions(-) diff --git a/scripts/distribution/distribute_example.sh b/scripts/distribution/distribute_example.sh index 4b05543..0ca4798 100755 --- a/scripts/distribution/distribute_example.sh +++ b/scripts/distribution/distribute_example.sh @@ -16,8 +16,8 @@ EMAIL_FROM="zwysendemail@163.com" EMAIL_PASSWORD="NMhVGFmCJkGEy3B5" # EMAIL_FROM="zowoyomedia@163.com" # EMAIL_PASSWORD="SDj5fK6Tk9YevmsD" -SUBJECT="文旅小红书带货笔记内容0519" -ZIP_FILENAME="文旅小红书带货笔记内容0519" +SUBJECT="文旅小红书带货笔记内容0520" +ZIP_FILENAME="文旅小红书带货笔记内容0520" # 设置分发配置 ARTICLE_PER_USER=1 @@ -33,14 +33,14 @@ UNDISTRIBUTED_ONLY=true # 只分发未分发的内容 # 内容筛选配置 TARGET_PRODUCT="" # 为空则不筛选特定产品 -TARGET_OBJECT="北洛秘境盛季酒店" # 为空则不筛选特定景点 +TARGET_OBJECT="极爽冲浪馆" # 为空则不筛选特定景点 # 用户筛选配置 TARGET_USER_ID="" # 为空则不筛选特定用户ID TARGET_USER_EMAIL="" # 为空则不筛选特定用户邮箱 # 强制性附件配置 -FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt" +FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt, /root/autodl-tmp/TravelContentCreator/hotel_img/poster/极爽冲浪-门票.jpg" # 创建必要的目录 mkdir -p "$LOG_DIR" diff --git a/scripts/distribution/extract_and_render.py b/scripts/distribution/extract_and_render.py index 5d30cee..f3ad3e4 100644 --- a/scripts/distribution/extract_and_render.py +++ b/scripts/distribution/extract_and_render.py @@ -137,16 +137,15 @@ def convert_json_to_txt_content(json_path, prefer_original=False): 读取 JSON 文件,提取标题、内容和标签,移除 Markdown 格式, 并返回格式化文本。 - 根据judge_success字段决定使用原始内容还是审核后内容: - - judge_success为True时使用title/content(除非prefer_original=True) - - judge_success为False时使用original_title/original_content + 根据JSON文件中的状态字段决定使用什么内容: + - 如果judged=True,使用审核后内容 + - 如果judged=False,使用原始内容 - 支持base64编码的内容: - - 如果检测到title_base64和content_base64字段,将优先使用这些字段 + 所有情况都优先使用base64编码的字段,因为这些字段能正确保留特殊字符和换行符 Args: json_path: JSON文件路径 - prefer_original: 是否优先使用原始内容,无视judge_success结果 + prefer_original: 参数保留但不再使用 """ print(f" - 正在读取 JSON: {json_path}") if not os.path.exists(json_path): @@ -157,110 +156,134 @@ def convert_json_to_txt_content(json_path, prefer_original=False): with open(json_path, 'r', encoding='utf-8') as f_json: data = json.load(f_json) - # 优先检查是否有base64编码的内容 + # 提取状态字段 + judged = data.get('judged', False) + + print(f" - 文件状态: judged={judged}") + + # 初始化变量 title = None content = None original_title = None original_content = None tags = None + original_tags = None - # 尝试从base64字段获取内容 - try: - # 优先使用base64编码的内容 - if "title_base64" in data: + # =================解码所有可能的base64字段================= + # 解码标题和内容字段 + if "title_base64" in data: + try: title = base64.b64decode(data["title_base64"]).decode('utf-8') - print(f" - 成功从base64解码标题") - - if "content_base64" in data: - content = base64.b64decode(data["content_base64"]).decode('utf-8') - print(f" - 成功从base64解码内容") - - if "tags_base64" in data: - tags = base64.b64decode(data["tags_base64"]).decode('utf-8') - print(f" - 成功从base64解码标签") - elif "tags" in data: - tags = data.get("tags", "") - elif "tag" in data: - tags = data.get("tag", "") - - # 检查是否有原始内容的base64 - if "original_title_base64" in data: - original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8') - - if "original_content_base64" in data: - original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8') - - if "original_tags_base64" in data: - original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8') - elif "original_tags" in data: - original_tags = data.get("original_tags", "") - - # 如果prefer_original为True且有原始内容,使用原始内容 - if prefer_original and original_title and original_content: - title = original_title - content = original_content - tags = original_tags if original_tags else tags - print(f" - 使用解码后的原始内容 (prefer_original=True)") - except Exception as e: - print(f" - 警告: base64解码失败: {e},将尝试使用普通字段") - title = None - content = None + print(f" - 成功从base64解码审核后标题") + except Exception as e: + print(f" - 警告: 标题base64解码失败: {e}") - # 如果base64解码失败或不存在base64字段,则使用原始逻辑 - if title is None or content is None: - # 根据judge_success选择标题和内容 - judge_success = data.get('judge_success', None) + if "content_base64" in data: + try: + content = base64.b64decode(data["content_base64"]).decode('utf-8') + print(f" - 成功从base64解码审核后内容") + except Exception as e: + print(f" - 警告: 内容base64解码失败: {e}") + + # 解码原始标题和内容字段 + if "original_title_base64" in data: + try: + original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8') + print(f" - 成功从base64解码原始标题") + except Exception as e: + print(f" - 警告: 原始标题base64解码失败: {e}") + + if "original_content_base64" in data: + try: + original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8') + print(f" - 成功从base64解码原始内容") + except Exception as e: + print(f" - 警告: 原始内容base64解码失败: {e}") + + # 解码标签字段 + if "tags_base64" in data: + try: + tags = base64.b64decode(data["tags_base64"]).decode('utf-8') + print(f" - 成功从base64解码审核后标签") + except Exception as e: + print(f" - 警告: 标签base64解码失败: {e}") + + if "original_tags_base64" in data: + try: + original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8') + print(f" - 成功从base64解码原始标签") + except Exception as e: + print(f" - 警告: 原始标签base64解码失败: {e}") + + # =================回退到非base64字段================= + # 如果base64解码失败,尝试使用普通字段 + if title is None and "title" in data: + title = data["title"] + print(f" - 使用普通字段标题") - if prefer_original and 'original_title' in data and 'original_content' in data: - # 优先使用原始内容 - title = data.get('original_title', '未找到原始标题') - content = data.get('original_content', '未找到原始内容') - # 优先使用原始标签 - tags = data.get('original_tags', data.get('tags', '未找到标签')) - print(f" - 优先使用原始内容 (prefer_original=True)") - elif judge_success is True and not prefer_original: - # 使用审核后的内容 - title = data.get('title', '未找到标题') - content = data.get('content', '未找到内容') - tags = data.get('tags', '未找到标签') - print(f" - 使用审核后内容 (judge_success=True)") - elif 'original_title' in data and 'original_content' in data: - # 使用原始内容 - title = data.get('original_title', '未找到原始标题') - content = data.get('original_content', '未找到原始内容') - # 优先使用原始标签 - tags = data.get('original_tags', data.get('tags', '未找到标签')) - print(f" - 使用原始内容 (judge_success={judge_success})") - else: - # 若无original字段,使用常规字段 - title = data.get('title', '未找到标题') - content = data.get('content', '未找到内容') - tags = data.get('tags', '未找到标签') - print(f" - 使用常规内容 (无judge结果)") + if content is None and "content" in data: + content = data["content"] + print(f" - 使用普通字段内容") - # 解决tag/tags字段重复问题,按照修正后的处理逻辑,只使用tags字段 - if not tags and 'tag' in data: - tags = data.get('tag', '未找到标签') - print(f" - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)") + if original_title is None and "original_title" in data: + original_title = data["original_title"] + print(f" - 使用普通字段原始标题") + + if original_content is None and "original_content" in data: + original_content = data["original_content"] + print(f" - 使用普通字段原始内容") + + if tags is None and "tags" in data: + tags = data["tags"] + print(f" - 使用普通字段标签") + elif tags is None and "tag" in data: + tags = data["tag"] + print(f" - 使用普通tag字段作为标签") + + if original_tags is None and "original_tags" in data: + original_tags = data["original_tags"] + print(f" - 使用普通字段原始标签") + + # =================根据状态字段决定使用哪些内容================= + final_title = None + final_content = None + final_tags = None + + # 简化逻辑:如果已审核,使用审核后内容;否则使用原始内容 + if judged: + print(f" - 使用审核后内容 (judged=True)") + final_title = title + final_content = content + final_tags = tags + else: + print(f" - 使用原始内容 (judged=False)") + final_title = original_title + final_content = original_content + final_tags = original_tags + + # 确保所有字段都有值 + final_title = final_title or "未找到标题" + final_content = final_content or "未找到内容" + final_tags = final_tags or "未找到标签" # 移除Markdown格式,但保留换行符 - content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content) + content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', final_content) - # 组合输出文本,保留原始内容的所有换行符 - result = "" - if title: - result += title + "\n\n" - if content_no_format: - result += content_no_format - if tags and tags != "未找到标签": - result += "\n\n" + tags + # 组合输出文本,保留内容的所有换行符 + result = final_title + "\n\n" + content_no_format + + if final_tags and final_tags != "未找到标签": + result += "\n\n" + final_tags + print(f" - 内容处理完成,最终文本长度: {len(result)} 字符") return result, None + except json.JSONDecodeError: print(f" - 错误: JSON 格式无效: {json_path}") return None, f"无效的 JSON 格式: {json_path}" except Exception as e: print(f" - 错误: 处理 JSON 时出错: {e}") + traceback.print_exc() return None, f"处理 JSON 时出错: {e}" def process_txt_content(txt_path): @@ -462,17 +485,20 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina print(f" - 错误: {record['Details']}") continue - # 1. 处理article.txt - input_txt_path = os.path.join(entry_path, "article.txt") + # 1. 处理article内容 - 优先使用JSON文件 output_txt_path = os.path.join(output_entry_path, "article.txt") record["OutputTxtPath"] = output_txt_path - # 读取article.json,仅用于获取judge_status + # 读取article.json json_path = os.path.join(entry_path, "article.json") record["ArticleJsonPath"] = json_path + content_processed = False + + # 优先从JSON提取内容 if os.path.exists(json_path): try: + # 从JSON文件提取审核状态 with open(json_path, 'r', encoding='utf-8') as f_json: article_data = json.load(f_json) # 提取judge_success状态 @@ -480,31 +506,48 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina record["JudgeStatus"] = str(article_data["judge_success"]) elif "judged" in article_data: record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核" + + # 使用convert_json_to_txt_content函数处理JSON文件 + processed_content, error = convert_json_to_txt_content(json_path, prefer_original) + + if error: + print(f" - 警告: 从JSON提取内容失败: {error}") + else: + try: + with open(output_txt_path, 'w', encoding='utf-8') as f_txt: + f_txt.write(processed_content) + print(f" - 成功从JSON提取并写入内容到: {output_txt_path}") + record["ContentSource"] = "json_file" + content_processed = True + except Exception as e: + print(f" - 警告: 写入从JSON提取的内容时出错: {e}") except Exception as e: - print(f" - 警告: 读取article.json失败: {e}") + print(f" - 警告: 处理JSON文件时出错: {e}") - # 处理article.txt文件 - if os.path.exists(input_txt_path): - processed_content, error = process_txt_content(input_txt_path) - if error: - record["Status"] = "Partial" - record["Details"] += f"文章处理失败: {error}; " - print(f" - 错误: {record['Details']}") - else: - try: - with open(output_txt_path, 'w', encoding='utf-8') as f_txt: - f_txt.write(processed_content) - print(f" - 成功写入处理后的文本文件: {output_txt_path}") - record["ContentSource"] = "txt_file" - - except Exception as e: + # 如果从JSON提取内容失败,尝试使用现有的TXT文件 + if not content_processed: + input_txt_path = os.path.join(entry_path, "article.txt") + if os.path.exists(input_txt_path): + processed_content, error = process_txt_content(input_txt_path) + if error: record["Status"] = "Partial" - record["Details"] += f"写入文本文件失败: {e}; " + record["Details"] += f"文章处理失败: {error}; " print(f" - 错误: {record['Details']}") - else: - record["Status"] = "Partial" - record["Details"] += "文章TXT文件不存在; " - print(f" - 警告: {record['Details']}") + else: + try: + with open(output_txt_path, 'w', encoding='utf-8') as f_txt: + f_txt.write(processed_content) + print(f" - 成功写入处理后的文本文件: {output_txt_path}") + record["ContentSource"] = "txt_file" + content_processed = True + except Exception as e: + record["Status"] = "Partial" + record["Details"] += f"写入文本文件失败: {e}; " + print(f" - 错误: {record['Details']}") + else: + record["Status"] = "Partial" + record["Details"] += "无法从JSON或TXT获取内容; " + print(f" - 警告: {record['Details']}") # 2. 处理海报图片 poster_dir = os.path.join(entry_path, "poster") @@ -627,8 +670,8 @@ def main(): args = parser.parse_args() # 默认值设置 - source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-19_17-51-07" - output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-19_17-51-07" + source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-20_15-37-25" + output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-20_15-37-25" run_id = args.run_id if args.run_id else os.path.basename(source) prefer_original = args.prefer_original db_path = args.db_path if args.db_path else '/root/autodl-tmp/TravelContentCreator/distribution.db'