修改了分发的读取方式

2025-05-21 09:49:41 +08:00 · 2025-05-21 09:49:41 +08:00 · 881a33786b
commit 881a33786b
parent 674082e7d7
2 changed files with 163 additions and 120 deletions
--- a/scripts/distribution/distribute_example.sh
+++ b/scripts/distribution/distribute_example.sh
@@ -16,8 +16,8 @@ EMAIL_FROM="zwysendemail@163.com"
 EMAIL_PASSWORD="NMhVGFmCJkGEy3B5"
 # EMAIL_FROM="zowoyomedia@163.com"
 # EMAIL_PASSWORD="SDj5fK6Tk9YevmsD"
-SUBJECT="文旅小红书带货笔记内容0519"
+SUBJECT="文旅小红书带货笔记内容0520"
-ZIP_FILENAME="文旅小红书带货笔记内容0519"
+ZIP_FILENAME="文旅小红书带货笔记内容0520"
 # 设置分发配置
 ARTICLE_PER_USER=1
@@ -33,14 +33,14 @@ UNDISTRIBUTED_ONLY=true  # 只分发未分发的内容
 # 内容筛选配置
 TARGET_PRODUCT=""  # 为空则不筛选特定产品
-TARGET_OBJECT="北洛秘境盛季酒店"  # 为空则不筛选特定景点
+TARGET_OBJECT="极爽冲浪馆"  # 为空则不筛选特定景点
 # 用户筛选配置
 TARGET_USER_ID=""  # 为空则不筛选特定用户ID
 TARGET_USER_EMAIL=""  # 为空则不筛选特定用户邮箱
 # 强制性附件配置
-FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt"
+FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt, /root/autodl-tmp/TravelContentCreator/hotel_img/poster/极爽冲浪-门票.jpg"
 # 创建必要的目录
 mkdir -p "$LOG_DIR"
--- a/scripts/distribution/extract_and_render.py
+++ b/scripts/distribution/extract_and_render.py
@@ -137,16 +137,15 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
    读取 JSON 文件，提取标题、内容和标签，移除 Markdown 格式，
    并返回格式化文本。
-    根据judge_success字段决定使用原始内容还是审核后内容：
+    根据JSON文件中的状态字段决定使用什么内容：
-    - judge_success为True时使用title/content（除非prefer_original=True）
+    - 如果judged=True，使用审核后内容
-    - judge_success为False时使用original_title/original_content
+    - 如果judged=False，使用原始内容
-    支持base64编码的内容：
+    所有情况都优先使用base64编码的字段，因为这些字段能正确保留特殊字符和换行符
    - 如果检测到title_base64和content_base64字段，将优先使用这些字段
    Args:
        json_path: JSON文件路径
-        prefer_original: 是否优先使用原始内容，无视judge_success结果
+        prefer_original: 参数保留但不再使用
    """
    print(f"    - 正在读取 JSON: {json_path}")
    if not os.path.exists(json_path):
@@ -157,110 +156,134 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
        with open(json_path, 'r', encoding='utf-8') as f_json:
            data = json.load(f_json)
-        # 优先检查是否有base64编码的内容
+        # 提取状态字段
        judged = data.get('judged', False)
        print(f"    - 文件状态: judged={judged}")
        # 初始化变量
        title = None
        content = None
        original_title = None
        original_content = None
        tags = None
        original_tags = None
-        # 尝试从base64字段获取内容
+        # =================解码所有可能的base64字段=================
-        try:
+        # 解码标题和内容字段
-            # 优先使用base64编码的内容
+        if "title_base64" in data:
-            if "title_base64" in data:
+            try:
                title = base64.b64decode(data["title_base64"]).decode('utf-8')
-                print(f"    - 成功从base64解码标题")
+                print(f"    - 成功从base64解码审核后标题")
            except Exception as e:
                print(f"    - 警告: 标题base64解码失败: {e}")
-            if "content_base64" in data:
+        if "content_base64" in data:
            try:
                content = base64.b64decode(data["content_base64"]).decode('utf-8')
-                print(f"    - 成功从base64解码内容")
+                print(f"    - 成功从base64解码审核后内容")
            except Exception as e:
                print(f"    - 警告: 内容base64解码失败: {e}")
-            if "tags_base64" in data:
+        # 解码原始标题和内容字段
-                tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
+        if "original_title_base64" in data:
-                print(f"    - 成功从base64解码标签")
+            try:
            elif "tags" in data:
                tags = data.get("tags", "")
            elif "tag" in data:
                tags = data.get("tag", "")
            # 检查是否有原始内容的base64
            if "original_title_base64" in data:
                original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
                print(f"    - 成功从base64解码原始标题")
            except Exception as e:
                print(f"    - 警告: 原始标题base64解码失败: {e}")
-            if "original_content_base64" in data:
+        if "original_content_base64" in data:
            try:
                original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
                print(f"    - 成功从base64解码原始内容")
            except Exception as e:
                print(f"    - 警告: 原始内容base64解码失败: {e}")
-            if "original_tags_base64" in data:
+        # 解码标签字段
        if "tags_base64" in data:
            try:
                tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
                print(f"    - 成功从base64解码审核后标签")
            except Exception as e:
                print(f"    - 警告: 标签base64解码失败: {e}")
        if "original_tags_base64" in data:
            try:
                original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
-            elif "original_tags" in data:
+                print(f"    - 成功从base64解码原始标签")
-                original_tags = data.get("original_tags", "")
+            except Exception as e:
                print(f"    - 警告: 原始标签base64解码失败: {e}")
-            # 如果prefer_original为True且有原始内容，使用原始内容
+        # =================回退到非base64字段=================
-            if prefer_original and original_title and original_content:
+        # 如果base64解码失败，尝试使用普通字段
-                title = original_title
+        if title is None and "title" in data:
-                content = original_content
+            title = data["title"]
-                tags = original_tags if original_tags else tags
+            print(f"    - 使用普通字段标题")
                print(f"    - 使用解码后的原始内容 (prefer_original=True)")
        except Exception as e:
            print(f"    - 警告: base64解码失败: {e}，将尝试使用普通字段")
            title = None
            content = None
-        # 如果base64解码失败或不存在base64字段，则使用原始逻辑
+        if content is None and "content" in data:
-        if title is None or content is None:
+            content = data["content"]
-            # 根据judge_success选择标题和内容
+            print(f"    - 使用普通字段内容")
            judge_success = data.get('judge_success', None)
-            if prefer_original and 'original_title' in data and 'original_content' in data:
+        if original_title is None and "original_title" in data:
-                # 优先使用原始内容
+            original_title = data["original_title"]
-                title = data.get('original_title', '未找到原始标题')
+            print(f"    - 使用普通字段原始标题")
                content = data.get('original_content', '未找到原始内容')
                # 优先使用原始标签
                tags = data.get('original_tags', data.get('tags', '未找到标签'))
                print(f"    - 优先使用原始内容 (prefer_original=True)")
            elif judge_success is True and not prefer_original:
                # 使用审核后的内容
                title = data.get('title', '未找到标题')
                content = data.get('content', '未找到内容')
                tags = data.get('tags', '未找到标签')
                print(f"    - 使用审核后内容 (judge_success=True)")
            elif 'original_title' in data and 'original_content' in data:
                # 使用原始内容
                title = data.get('original_title', '未找到原始标题')
                content = data.get('original_content', '未找到原始内容')
                # 优先使用原始标签
                tags = data.get('original_tags', data.get('tags', '未找到标签'))
                print(f"    - 使用原始内容 (judge_success={judge_success})")
            else:
                # 若无original字段，使用常规字段
                title = data.get('title', '未找到标题')
                content = data.get('content', '未找到内容')
                tags = data.get('tags', '未找到标签')
                print(f"    - 使用常规内容 (无judge结果)")
-            # 解决tag/tags字段重复问题，按照修正后的处理逻辑，只使用tags字段
+        if original_content is None and "original_content" in data:
-            if not tags and 'tag' in data:
+            original_content = data["original_content"]
-                tags = data.get('tag', '未找到标签')
+            print(f"    - 使用普通字段原始内容")
-                print(f"    - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)")
+            
        if tags is None and "tags" in data:
            tags = data["tags"]
            print(f"    - 使用普通字段标签")
        elif tags is None and "tag" in data:
            tags = data["tag"]
            print(f"    - 使用普通tag字段作为标签")
        if original_tags is None and "original_tags" in data:
            original_tags = data["original_tags"]
            print(f"    - 使用普通字段原始标签")
        # =================根据状态字段决定使用哪些内容=================
        final_title = None
        final_content = None
        final_tags = None
        # 简化逻辑：如果已审核，使用审核后内容；否则使用原始内容
        if judged:
            print(f"    - 使用审核后内容 (judged=True)")
            final_title = title
            final_content = content
            final_tags = tags
        else:
            print(f"    - 使用原始内容 (judged=False)")
            final_title = original_title
            final_content = original_content
            final_tags = original_tags
        # 确保所有字段都有值
        final_title = final_title or "未找到标题"
        final_content = final_content or "未找到内容"
        final_tags = final_tags or "未找到标签"
        # 移除Markdown格式，但保留换行符
-        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
+        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', final_content)
-        # 组合输出文本，保留原始内容的所有换行符
+        # 组合输出文本，保留内容的所有换行符
-        result = ""
+        result = final_title + "\n\n" + content_no_format
        if title:
            result += title + "\n\n"
        if content_no_format:
            result += content_no_format
        if tags and tags != "未找到标签":
            result += "\n\n" + tags
        if final_tags and final_tags != "未找到标签":
            result += "\n\n" + final_tags
        print(f"    - 内容处理完成，最终文本长度: {len(result)} 字符")
        return result, None
    except json.JSONDecodeError:
        print(f"    - 错误: JSON 格式无效: {json_path}")
        return None, f"无效的 JSON 格式: {json_path}"
    except Exception as e:
        print(f"    - 错误: 处理 JSON 时出错: {e}")
        traceback.print_exc()
        return None, f"处理 JSON 时出错: {e}"
 def process_txt_content(txt_path):
@@ -462,17 +485,20 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
            print(f"  - 错误: {record['Details']}")
            continue
-        # 1. 处理article.txt
+        # 1. 处理article内容 - 优先使用JSON文件
        input_txt_path = os.path.join(entry_path, "article.txt")
        output_txt_path = os.path.join(output_entry_path, "article.txt")
        record["OutputTxtPath"] = output_txt_path
-        # 读取article.json，仅用于获取judge_status
+        # 读取article.json
        json_path = os.path.join(entry_path, "article.json")
        record["ArticleJsonPath"] = json_path
        content_processed = False
        # 优先从JSON提取内容
        if os.path.exists(json_path):
            try:
                # 从JSON文件提取审核状态
                with open(json_path, 'r', encoding='utf-8') as f_json:
                    article_data = json.load(f_json)
                    # 提取judge_success状态
@@ -480,31 +506,48 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
                        record["JudgeStatus"] = str(article_data["judge_success"])
                    elif "judged" in article_data:
                        record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核"
                # 使用convert_json_to_txt_content函数处理JSON文件
                processed_content, error = convert_json_to_txt_content(json_path, prefer_original)
                if error:
                    print(f"  - 警告: 从JSON提取内容失败: {error}")
                else:
                    try:
                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
                            f_txt.write(processed_content)
                        print(f"  - 成功从JSON提取并写入内容到: {output_txt_path}")
                        record["ContentSource"] = "json_file"
                        content_processed = True
                    except Exception as e:
                        print(f"  - 警告: 写入从JSON提取的内容时出错: {e}")
            except Exception as e:
-                print(f"  - 警告: 读取article.json失败: {e}")
+                print(f"  - 警告: 处理JSON文件时出错: {e}")
-        # 处理article.txt文件
+        # 如果从JSON提取内容失败，尝试使用现有的TXT文件
-        if os.path.exists(input_txt_path):
+        if not content_processed:
-            processed_content, error = process_txt_content(input_txt_path)
+            input_txt_path = os.path.join(entry_path, "article.txt")
-            if error:
+            if os.path.exists(input_txt_path):
-                record["Status"] = "Partial"
+                processed_content, error = process_txt_content(input_txt_path)
-                record["Details"] += f"文章处理失败: {error}; "
+                if error:
                print(f"  - 错误: {record['Details']}")
            else:
                try:
                    with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
                        f_txt.write(processed_content)
                    print(f"  - 成功写入处理后的文本文件: {output_txt_path}")
                    record["ContentSource"] = "txt_file"
                except Exception as e:
                    record["Status"] = "Partial"
-                    record["Details"] += f"写入文本文件失败: {e}; "
+                    record["Details"] += f"文章处理失败: {error}; "
                    print(f"  - 错误: {record['Details']}")
-        else:
+                else:
-            record["Status"] = "Partial"
+                    try:
-            record["Details"] += "文章TXT文件不存在; "
+                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
-            print(f"  - 警告: {record['Details']}")
+                            f_txt.write(processed_content)
                        print(f"  - 成功写入处理后的文本文件: {output_txt_path}")
                        record["ContentSource"] = "txt_file"
                        content_processed = True
                    except Exception as e:
                        record["Status"] = "Partial"
                        record["Details"] += f"写入文本文件失败: {e}; "
                        print(f"  - 错误: {record['Details']}")
            else:
                record["Status"] = "Partial"
                record["Details"] += "无法从JSON或TXT获取内容; "
                print(f"  - 警告: {record['Details']}")
        # 2. 处理海报图片
        poster_dir = os.path.join(entry_path, "poster")
@@ -627,8 +670,8 @@ def main():
    args = parser.parse_args()
    # 默认值设置
-    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-19_17-51-07"
+    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-20_15-37-25"
-    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-19_17-51-07"
+    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-20_15-37-25"
    run_id = args.run_id if args.run_id else os.path.basename(source)
    prefer_original = args.prefer_original
    db_path = args.db_path if args.db_path else '/root/autodl-tmp/TravelContentCreator/distribution.db'