修改了存储方式

2025-05-18 22:29:29 +08:00 · 2025-05-18 22:29:29 +08:00 · c8c4031696
commit c8c4031696
parent e9ac187a3b
2 changed files with 130 additions and 26 deletions
--- a/utils/pycache/output_handler.cpython-312.pyc
+++ b/utils/pycache/output_handler.cpython-312.pyc
--- a/utils/output_handler.py
+++ b/utils/output_handler.py
@ -3,6 +3,37 @@ import simplejson as json
 import logging
 from abc import ABC, abstractmethod
 import traceback
+import base64
+
+# Custom JSON encoder: sanitize the payload, then let the stock encoder serialize it.
+class SafeJSONEncoder(json.JSONEncoder):
+    """Encoder that drops "error"/"raw_result" keys and strips unsafe characters.
+
+    Strings keep only printable ASCII (32-126) plus newline, carriage return and
+    tab. Serialization itself is delegated to the base class, so key escaping,
+    ``indent`` and numbers/booleans/None are handled by the library.
+    """
+
+    _DROPPED_KEYS = ("error", "raw_result")
+
+    def _clean(self, obj):
+        """Return a sanitized copy of *obj*; values other than dict/list/str pass through."""
+        if isinstance(obj, dict):
+            return {key: self._clean(value) for key, value in obj.items()
+                    if key not in self._DROPPED_KEYS}
+        if isinstance(obj, list):
+            return [self._clean(item) for item in obj]
+        if isinstance(obj, str):
+            return ''.join(c for c in obj if c in '\n\r\t' or 32 <= ord(c) <= 126)
+        return obj
+
+    def encode(self, obj):
+        """Encode *obj*; a top-level str skips ``iterencode`` in the base class, so clean it here."""
+        return super().encode(self._clean(obj) if isinstance(obj, str) else obj)
+
+    def iterencode(self, obj, _one_shot=False):
+        """Return an iterator of JSON chunks for the sanitized *obj*."""
+        return super().iterencode(self._clean(obj), _one_shot)

 class OutputHandler(ABC):
    """Abstract base class for handling the output of the generation pipeline."""
@ -70,7 +101,7 @@ class FileSystemOutputHandler(OutputHandler):
        topics_path = os.path.join(run_dir, f"tweet_topic_{run_id}.json")
        try:
            with open(topics_path, "w", encoding="utf-8") as f:
-                json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True)
+                json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
            logging.info(f"Topics list saved successfully to: {topics_path}")
        except Exception as e:
            logging.exception(f"Error saving topic JSON file to {topics_path}:")
@ -115,9 +146,49 @@ class FileSystemOutputHandler(OutputHandler):
            if "tags" in output_data and "original_tags" not in output_data:
                output_data["original_tags"] = output_data["tags"]
        
+        # 保存原始值用于调试
+        original_title = output_data.get("title", "")
+        original_content = output_data.get("content", "")
+        
+        # 添加Base64编码内容
+        try:
+            # 编码标题和内容
+            title_base64 = base64.b64encode(output_data.get("title", "").encode('utf-8')).decode('ascii')
+            content_base64 = base64.b64encode(output_data.get("content", "").encode('utf-8')).decode('ascii')
+            
+            # 添加到输出数据
+            output_data["title_base64"] = title_base64
+            output_data["content_base64"] = content_base64
+            
+            # 如果有原始内容，也编码
+            if "original_title" in output_data and output_data["original_title"]:
+                output_data["original_title_base64"] = base64.b64encode(
+                    output_data["original_title"].encode('utf-8')).decode('ascii')
+            if "original_content" in output_data and output_data["original_content"]:
+                output_data["original_content_base64"] = base64.b64encode(
+                    output_data["original_content"].encode('utf-8')).decode('ascii')
+                
+            logging.info("成功添加Base64编码内容")
+        except Exception as e:
+            logging.error(f"Base64编码内容时出错: {e}")
+        
        # 对内容进行深度清理，确保安全序列化
        try:
+            # 暂存judge_success状态
+            judge_success = output_data.get("judge_success", False)
+            
+            # 深度清理
            output_data = self._sanitize_content_for_json(output_data)
+            
+            # 恢复judge_success状态
+            output_data["judge_success"] = judge_success
+            
+            # 移除可能的错误标志 - 我们通过尝试序列化来决定是否设置它
+            if "error" in output_data:
+                del output_data["error"]
+            if "raw_result" in output_data:
+                del output_data["raw_result"]
+                
            logging.info("内容已经过安全清理，可以序列化")
        except Exception as e:
            logging.error(f"内容清理过程中出错: {e}")
@ -126,23 +197,38 @@ class FileSystemOutputHandler(OutputHandler):
        content_path = os.path.join(variant_dir, "article.json")
        try:
            with open(content_path, "w", encoding="utf-8") as f:
-                json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
+                # 使用自定义的SafeJSONEncoder
+                json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
            logging.info(f"Content JSON saved to: {content_path}")
        except Exception as e:
            logging.exception(f"Failed to save content JSON to {content_path}: {e}")
-            # 如果序列化失败，记录原始内容用于调试
-            debug_path = os.path.join(variant_dir, "debug_content.txt")
-            try:
-                with open(debug_path, "w", encoding="utf-8") as f:
-                    for key, value in output_data.items():
-                        if isinstance(value, str):
-                            f.write(f"{key}: (length: {len(value)})\n")
-                            f.write(f"{repr(value[:200])}...\n\n")
-                        else:
-                            f.write(f"{key}: {type(value)}\n")
-                logging.info(f"Debug content saved to: {debug_path}")
-            except Exception as debug_err:
-                logging.error(f"Failed to save debug content: {debug_err}")
+            
+        # 创建一份article.txt文件以便直接查看
+        txt_path = os.path.join(variant_dir, "article.txt")
+        try:
+            # 使用原始内容
+            with open(txt_path, "w", encoding="utf-8") as f:
+                f.write(f"{original_title}\n\n{original_content}")
+            logging.info(f"Article text saved to: {txt_path}")
+        except Exception as e:
+            logging.error(f"Failed to save article.txt: {e}")
+            
+        # 记录调试信息，无论是否成功
+        debug_path = os.path.join(variant_dir, "debug_content.txt")
+        try:
+            with open(debug_path, "w", encoding="utf-8") as f:
+                f.write(f"原始标题: {original_title}\n\n")
+                f.write(f"原始内容: {original_content}\n\n")
+                f.write("---处理后---\n\n")
+                for key, value in output_data.items():
+                    if isinstance(value, str):
+                        f.write(f"{key}: (length: {len(value)})\n")
+                        f.write(f"{repr(value[:200])}...\n\n")
+                    else:
+                        f.write(f"{key}: {type(value)}\n")
+            logging.info(f"调试内容已保存到: {debug_path}")
+        except Exception as debug_err:
+            logging.error(f"保存调试内容失败: {debug_err}")
            
        # Save content prompt
        prompt_path = os.path.join(variant_dir, "tweet_prompt.txt")
@ -153,6 +239,12 @@ class FileSystemOutputHandler(OutputHandler):
            logging.info(f"Content prompt saved to: {prompt_path}")
        except Exception as e:
             logging.exception(f"Failed to save content prompt to {prompt_path}: {e}")
+             
+    def _ultra_safe_clean(self, text):
+        """Strictest cleanup: keep only printable ASCII (32-126); non-str input yields ""."""
+        if isinstance(text, str):
+            return ''.join(filter(lambda ch: 32 <= ord(ch) <= 126, text))
+        return ""

    def handle_poster_configs(self, run_id: str, topic_index: int, config_data: list | dict):
        """Saves the complete poster configuration list/dict for a topic."""
@ -160,7 +252,7 @@ class FileSystemOutputHandler(OutputHandler):
        config_path = os.path.join(run_dir, f"topic_{topic_index}_poster_configs.json")
        try:
            with open(config_path, 'w', encoding='utf-8') as f_cfg_topic:
-                 json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True)
+                 json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
            logging.info(f"Saved complete poster configurations for topic {topic_index} to: {config_path}")
        except Exception as save_err:
            logging.error(f"Failed to save complete poster configurations for topic {topic_index} to {config_path}: {save_err}")
@ -216,7 +308,7 @@ class FileSystemOutputHandler(OutputHandler):
                metadata_path = os.path.join(os.path.dirname(save_path), metadata_filename)
                try:
                    with open(metadata_path, 'w', encoding='utf-8') as f:
-                        json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True)
+                        json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
                    logging.info(f"保存{image_type}元数据到: {metadata_path}")
                except Exception as me:
                    logging.error(f"无法保存{image_type}元数据到{metadata_path}: {me}")
@ -245,6 +337,11 @@ class FileSystemOutputHandler(OutputHandler):
            # 处理字典类型
            sanitized_dict = {}
            for key, value in data.items():
+                # 移除error标志，我们会在最终验证后重新设置它
+                if key == "error":
+                    continue
+                if key == "raw_result":
+                    continue
                sanitized_dict[key] = self._sanitize_content_for_json(value)
            return sanitized_dict
        elif isinstance(data, list):
@ -256,22 +353,29 @@ class FileSystemOutputHandler(OutputHandler):
            # 1. 首先，替换所有字面的"\n"为真正的换行符
            if r'\n' in data:
                data = data.replace(r'\n', '\n')
-                
-            # 2. 移除所有控制字符（ASCII 0-31，除了\n, \r, \t）
-            cleaned = ''
+            
+            # 2. 使用更强的处理方式 - 只保留绝对安全的字符
+            # - ASCII 32-126 (标准可打印ASCII字符)
+            # - 换行、回车、制表符
+            # - 去除所有其他控制字符和潜在问题字符
+            safe_chars = []
            for char in data:
-                # 允许常见的空白字符
-                if char in '\n\r\t' or ord(char) >= 32:
-                    cleaned += char
+                if char in '\n\r\t' or (32 <= ord(char) <= 126):
+                    safe_chars.append(char)
+                elif ord(char) > 127:  # 非ASCII字符 (包括emoji)
+                    # 转换为Unicode转义序列
+                    safe_chars.append(f"\\u{ord(char):04x}".encode().decode('unicode-escape'))
+            
+            cleaned = ''.join(safe_chars)
                    
            # 3. 验证字符串可以被安全序列化
            try:
                json.dumps(cleaned, ensure_ascii=False)
                return cleaned
            except Exception as e:
-                logging.warning(f"字符串清理后仍无法序列化，尝试更严格的清理: {e}")
-                # 如果仍然无法序列化，使用更严格的清理
-                return ''.join(c for c in cleaned if ord(c) < 65536 and (c in '\n\r\t' or ord(c) >= 32))
+                logging.warning(f"字符串清理后仍无法序列化，使用保守处理: {e}")
+                # 最保守的处理 - 只保留ASCII字符
+                return ''.join(c for c in cleaned if ord(c) < 128)
        else:
            # 其他类型（数字、布尔值等）原样返回
            return data