From 881a33786ba53e8ada61bd09f183416babd4995a Mon Sep 17 00:00:00 2001
From: jinye_huang <jinye_huang@foxmail.com>
Date: Wed, 21 May 2025 09:49:41 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E5=88=86=E5=8F=91?=
 =?UTF-8?q?=E7=9A=84=E8=AF=BB=E5=8F=96=E6=96=B9=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/distribution/distribute_example.sh |   8 +-
 scripts/distribution/extract_and_render.py | 275 ++++++++++++---------
 2 files changed, 163 insertions(+), 120 deletions(-)

diff --git a/scripts/distribution/distribute_example.sh b/scripts/distribution/distribute_example.sh
index 4b05543..0ca4798 100755
--- a/scripts/distribution/distribute_example.sh
+++ b/scripts/distribution/distribute_example.sh
@@ -16,8 +16,8 @@ EMAIL_FROM="zwysendemail@163.com"
 EMAIL_PASSWORD="NMhVGFmCJkGEy3B5"
 # EMAIL_FROM="zowoyomedia@163.com"
 # EMAIL_PASSWORD="SDj5fK6Tk9YevmsD"
-SUBJECT="文旅小红书带货笔记内容0519"
-ZIP_FILENAME="文旅小红书带货笔记内容0519"
+SUBJECT="文旅小红书带货笔记内容0520"
+ZIP_FILENAME="文旅小红书带货笔记内容0520"
 
 # 设置分发配置
 ARTICLE_PER_USER=1
@@ -33,14 +33,14 @@ UNDISTRIBUTED_ONLY=true  # 只分发未分发的内容
 
 # 内容筛选配置
 TARGET_PRODUCT=""  # 为空则不筛选特定产品
-TARGET_OBJECT="北洛秘境盛季酒店"  # 为空则不筛选特定景点
+TARGET_OBJECT="极爽冲浪馆"  # 为空则不筛选特定景点
 
 # 用户筛选配置
 TARGET_USER_ID=""  # 为空则不筛选特定用户ID
 TARGET_USER_EMAIL=""  # 为空则不筛选特定用户邮箱
 
 # 强制性附件配置
-FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt"
+FORCE_ATTACHMENTS="/root/autodl-tmp/TravelContentCreator/hotel_img/标题参考格式-精选.txt, /root/autodl-tmp/TravelContentCreator/hotel_img/poster/极爽冲浪-门票.jpg"
 
 # 创建必要的目录
 mkdir -p "$LOG_DIR"
diff --git a/scripts/distribution/extract_and_render.py b/scripts/distribution/extract_and_render.py
index 5d30cee..f3ad3e4 100644
--- a/scripts/distribution/extract_and_render.py
+++ b/scripts/distribution/extract_and_render.py
@@ -137,16 +137,15 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
     读取 JSON 文件，提取标题、内容和标签，移除 Markdown 格式，
     并返回格式化文本。
     
-    根据judge_success字段决定使用原始内容还是审核后内容：
-    - judge_success为True时使用title/content（除非prefer_original=True）
-    - judge_success为False时使用original_title/original_content
+    根据JSON文件中的状态字段决定使用什么内容：
+    - 如果judged=True，使用审核后内容
+    - 如果judged=False，使用原始内容
     
-    支持base64编码的内容：
-    - 如果检测到title_base64和content_base64字段，将优先使用这些字段
+    所有情况都优先使用base64编码的字段，因为这些字段能正确保留特殊字符和换行符
     
     Args:
         json_path: JSON文件路径
-        prefer_original: 是否优先使用原始内容，无视judge_success结果
+        prefer_original: 参数保留但不再使用
     """
     print(f"    - 正在读取 JSON: {json_path}")
     if not os.path.exists(json_path):
@@ -157,110 +156,134 @@ def convert_json_to_txt_content(json_path, prefer_original=False):
         with open(json_path, 'r', encoding='utf-8') as f_json:
             data = json.load(f_json)
 
-        # 优先检查是否有base64编码的内容
+        # 提取状态字段
+        judged = data.get('judged', False)
+        
+        print(f"    - 文件状态: judged={judged}")
+        
+        # 初始化变量
         title = None
         content = None
         original_title = None
         original_content = None
         tags = None
+        original_tags = None
         
-        # 尝试从base64字段获取内容
-        try:
-            # 优先使用base64编码的内容
-            if "title_base64" in data:
+        # =================解码所有可能的base64字段=================
+        # 解码标题和内容字段
+        if "title_base64" in data:
+            try:
                 title = base64.b64decode(data["title_base64"]).decode('utf-8')
-                print(f"    - 成功从base64解码标题")
-            
-            if "content_base64" in data:
-                content = base64.b64decode(data["content_base64"]).decode('utf-8')
-                print(f"    - 成功从base64解码内容")
-                
-            if "tags_base64" in data:
-                tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
-                print(f"    - 成功从base64解码标签")
-            elif "tags" in data:
-                tags = data.get("tags", "")
-            elif "tag" in data:
-                tags = data.get("tag", "")
-                
-            # 检查是否有原始内容的base64
-            if "original_title_base64" in data:
-                original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
-            
-            if "original_content_base64" in data:
-                original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
-                
-            if "original_tags_base64" in data:
-                original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
-            elif "original_tags" in data:
-                original_tags = data.get("original_tags", "")
-                
-            # 如果prefer_original为True且有原始内容，使用原始内容
-            if prefer_original and original_title and original_content:
-                title = original_title
-                content = original_content
-                tags = original_tags if original_tags else tags
-                print(f"    - 使用解码后的原始内容 (prefer_original=True)")
-        except Exception as e:
-            print(f"    - 警告: base64解码失败: {e}，将尝试使用普通字段")
-            title = None
-            content = None
+                print(f"    - 成功从base64解码审核后标题")
+            except Exception as e:
+                print(f"    - 警告: 标题base64解码失败: {e}")
         
-        # 如果base64解码失败或不存在base64字段，则使用原始逻辑
-        if title is None or content is None:
-            # 根据judge_success选择标题和内容
-            judge_success = data.get('judge_success', None)
+        if "content_base64" in data:
+            try:
+                content = base64.b64decode(data["content_base64"]).decode('utf-8')
+                print(f"    - 成功从base64解码审核后内容")
+            except Exception as e:
+                print(f"    - 警告: 内容base64解码失败: {e}")
+        
+        # 解码原始标题和内容字段
+        if "original_title_base64" in data:
+            try:
+                original_title = base64.b64decode(data["original_title_base64"]).decode('utf-8')
+                print(f"    - 成功从base64解码原始标题")
+            except Exception as e:
+                print(f"    - 警告: 原始标题base64解码失败: {e}")
+        
+        if "original_content_base64" in data:
+            try:
+                original_content = base64.b64decode(data["original_content_base64"]).decode('utf-8')
+                print(f"    - 成功从base64解码原始内容")
+            except Exception as e:
+                print(f"    - 警告: 原始内容base64解码失败: {e}")
+        
+        # 解码标签字段
+        if "tags_base64" in data:
+            try:
+                tags = base64.b64decode(data["tags_base64"]).decode('utf-8')
+                print(f"    - 成功从base64解码审核后标签")
+            except Exception as e:
+                print(f"    - 警告: 标签base64解码失败: {e}")
+                
+        if "original_tags_base64" in data:
+            try:
+                original_tags = base64.b64decode(data["original_tags_base64"]).decode('utf-8')
+                print(f"    - 成功从base64解码原始标签")
+            except Exception as e:
+                print(f"    - 警告: 原始标签base64解码失败: {e}")
+                
+        # =================回退到非base64字段=================
+        # 如果base64解码失败，尝试使用普通字段
+        if title is None and "title" in data:
+            title = data["title"]
+            print(f"    - 使用普通字段标题")
             
-            if prefer_original and 'original_title' in data and 'original_content' in data:
-                # 优先使用原始内容
-                title = data.get('original_title', '未找到原始标题')
-                content = data.get('original_content', '未找到原始内容')
-                # 优先使用原始标签
-                tags = data.get('original_tags', data.get('tags', '未找到标签'))
-                print(f"    - 优先使用原始内容 (prefer_original=True)")
-            elif judge_success is True and not prefer_original:
-                # 使用审核后的内容
-                title = data.get('title', '未找到标题')
-                content = data.get('content', '未找到内容')
-                tags = data.get('tags', '未找到标签')
-                print(f"    - 使用审核后内容 (judge_success=True)")
-            elif 'original_title' in data and 'original_content' in data:
-                # 使用原始内容
-                title = data.get('original_title', '未找到原始标题')
-                content = data.get('original_content', '未找到原始内容')
-                # 优先使用原始标签
-                tags = data.get('original_tags', data.get('tags', '未找到标签'))
-                print(f"    - 使用原始内容 (judge_success={judge_success})")
-            else:
-                # 若无original字段，使用常规字段
-                title = data.get('title', '未找到标题')
-                content = data.get('content', '未找到内容')
-                tags = data.get('tags', '未找到标签')
-                print(f"    - 使用常规内容 (无judge结果)")
+        if content is None and "content" in data:
+            content = data["content"]
+            print(f"    - 使用普通字段内容")
             
-            # 解决tag/tags字段重复问题，按照修正后的处理逻辑，只使用tags字段
-            if not tags and 'tag' in data:
-                tags = data.get('tag', '未找到标签')
-                print(f"    - 使用tag字段作为标签 (该字段将在后续版本中统一为tags)")
+        if original_title is None and "original_title" in data:
+            original_title = data["original_title"]
+            print(f"    - 使用普通字段原始标题")
+            
+        if original_content is None and "original_content" in data:
+            original_content = data["original_content"]
+            print(f"    - 使用普通字段原始内容")
+            
+        if tags is None and "tags" in data:
+            tags = data["tags"]
+            print(f"    - 使用普通字段标签")
+        elif tags is None and "tag" in data:
+            tags = data["tag"]
+            print(f"    - 使用普通tag字段作为标签")
+            
+        if original_tags is None and "original_tags" in data:
+            original_tags = data["original_tags"]
+            print(f"    - 使用普通字段原始标签")
+            
+        # =================根据状态字段决定使用哪些内容=================
+        final_title = None
+        final_content = None
+        final_tags = None
+        
+        # 简化逻辑：如果已审核，使用审核后内容；否则使用原始内容
+        if judged:
+            print(f"    - 使用审核后内容 (judged=True)")
+            final_title = title
+            final_content = content
+            final_tags = tags
+        else:
+            print(f"    - 使用原始内容 (judged=False)")
+            final_title = original_title
+            final_content = original_content
+            final_tags = original_tags
+        
+        # 确保所有字段都有值
+        final_title = final_title or "未找到标题"
+        final_content = final_content or "未找到内容"
+        final_tags = final_tags or "未找到标签"
         
         # 移除Markdown格式，但保留换行符
-        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
+        content_no_format = re.sub(r'\*\*(.*?)\*\*', r'\1', final_content)
         
-        # 组合输出文本，保留原始内容的所有换行符
-        result = ""
-        if title:
-            result += title + "\n\n"
-        if content_no_format:
-            result += content_no_format
-        if tags and tags != "未找到标签":
-            result += "\n\n" + tags
+        # 组合输出文本，保留内容的所有换行符
+        result = final_title + "\n\n" + content_no_format
+        
+        if final_tags and final_tags != "未找到标签":
+            result += "\n\n" + final_tags
             
+        print(f"    - 内容处理完成，最终文本长度: {len(result)} 字符")
         return result, None
+            
     except json.JSONDecodeError:
         print(f"    - 错误: JSON 格式无效: {json_path}")
         return None, f"无效的 JSON 格式: {json_path}"
     except Exception as e:
         print(f"    - 错误: 处理 JSON 时出错: {e}")
+        traceback.print_exc()
         return None, f"处理 JSON 时出错: {e}"
 
 def process_txt_content(txt_path):
@@ -462,17 +485,20 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
             print(f"  - 错误: {record['Details']}")
             continue
         
-        # 1. 处理article.txt
-        input_txt_path = os.path.join(entry_path, "article.txt")
+        # 1. 处理article内容 - 优先使用JSON文件
         output_txt_path = os.path.join(output_entry_path, "article.txt")
         record["OutputTxtPath"] = output_txt_path
         
-        # 读取article.json，仅用于获取judge_status
+        # 读取article.json
         json_path = os.path.join(entry_path, "article.json")
         record["ArticleJsonPath"] = json_path
         
+        content_processed = False
+        
+        # 优先从JSON提取内容
         if os.path.exists(json_path):
             try:
+                # 从JSON文件提取审核状态
                 with open(json_path, 'r', encoding='utf-8') as f_json:
                     article_data = json.load(f_json)
                     # 提取judge_success状态
@@ -480,31 +506,48 @@ def process_result_directory(source_dir, output_dir, run_id=None, prefer_origina
                         record["JudgeStatus"] = str(article_data["judge_success"])
                     elif "judged" in article_data:
                         record["JudgeStatus"] = "已审核" if article_data["judged"] else "未审核"
+                
+                # 使用convert_json_to_txt_content函数处理JSON文件
+                processed_content, error = convert_json_to_txt_content(json_path, prefer_original)
+                
+                if error:
+                    print(f"  - 警告: 从JSON提取内容失败: {error}")
+                else:
+                    try:
+                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
+                            f_txt.write(processed_content)
+                        print(f"  - 成功从JSON提取并写入内容到: {output_txt_path}")
+                        record["ContentSource"] = "json_file"
+                        content_processed = True
+                    except Exception as e:
+                        print(f"  - 警告: 写入从JSON提取的内容时出错: {e}")
             except Exception as e:
-                print(f"  - 警告: 读取article.json失败: {e}")
+                print(f"  - 警告: 处理JSON文件时出错: {e}")
         
-        # 处理article.txt文件
-        if os.path.exists(input_txt_path):
-            processed_content, error = process_txt_content(input_txt_path)
-            if error:
-                record["Status"] = "Partial"
-                record["Details"] += f"文章处理失败: {error}; "
-                print(f"  - 错误: {record['Details']}")
-            else:
-                try:
-                    with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
-                        f_txt.write(processed_content)
-                    print(f"  - 成功写入处理后的文本文件: {output_txt_path}")
-                    record["ContentSource"] = "txt_file"
-                        
-                except Exception as e:
+        # 如果从JSON提取内容失败，尝试使用现有的TXT文件
+        if not content_processed:
+            input_txt_path = os.path.join(entry_path, "article.txt")
+            if os.path.exists(input_txt_path):
+                processed_content, error = process_txt_content(input_txt_path)
+                if error:
                     record["Status"] = "Partial"
-                    record["Details"] += f"写入文本文件失败: {e}; "
+                    record["Details"] += f"文章处理失败: {error}; "
                     print(f"  - 错误: {record['Details']}")
-        else:
-            record["Status"] = "Partial"
-            record["Details"] += "文章TXT文件不存在; "
-            print(f"  - 警告: {record['Details']}")
+                else:
+                    try:
+                        with open(output_txt_path, 'w', encoding='utf-8') as f_txt:
+                            f_txt.write(processed_content)
+                        print(f"  - 成功写入处理后的文本文件: {output_txt_path}")
+                        record["ContentSource"] = "txt_file"
+                        content_processed = True
+                    except Exception as e:
+                        record["Status"] = "Partial"
+                        record["Details"] += f"写入文本文件失败: {e}; "
+                        print(f"  - 错误: {record['Details']}")
+            else:
+                record["Status"] = "Partial"
+                record["Details"] += "无法从JSON或TXT获取内容; "
+                print(f"  - 警告: {record['Details']}")
         
         # 2. 处理海报图片
         poster_dir = os.path.join(entry_path, "poster")
@@ -627,8 +670,8 @@ def main():
     args = parser.parse_args()
     
     # 默认值设置
-    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-19_17-51-07"
-    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-19_17-51-07"
+    source = args.source if args.source else "/root/autodl-tmp/TravelContentCreator/result/2025-05-20_15-37-25"
+    output = args.output if args.output else "/root/autodl-tmp/TravelContentCreator/output/2025-05-20_15-37-25"
     run_id = args.run_id if args.run_id else os.path.basename(source)
     prefer_original = args.prefer_original
     db_path = args.db_path if args.db_path else '/root/autodl-tmp/TravelContentCreator/distribution.db'