class SafeJSONEncoder(json.JSONEncoder):
    """JSON encoder that sanitises its input before serialising it.

    Before encoding, the object tree is rewritten as follows:

    * dict entries whose key is ``"error"`` or ``"raw_result"`` are dropped,
      at every nesting level;
    * every string value is reduced to printable ASCII (code points 32-126)
      plus newline, carriage return and tab -- all other characters,
      including non-ASCII text, are removed;
    * tuples are treated like lists;
    * every other value is passed through untouched.

    The sanitised tree is then handed to the base ``JSONEncoder``, so dict
    keys are escaped correctly and constructor options such as ``indent`` or
    ``ensure_ascii`` are honoured.
    """

    # Keys that are never written to the output.
    _DROPPED_KEYS = frozenset({"error", "raw_result"})
    # Whitespace control characters that survive string sanitising.
    _KEPT_WHITESPACE = "\n\r\t"

    def _sanitize(self, obj):
        """Return a sanitised copy of *obj* (see the class docstring)."""
        if isinstance(obj, dict):
            return {
                key: self._sanitize(value)
                for key, value in obj.items()
                if key not in self._DROPPED_KEYS
            }
        if isinstance(obj, (list, tuple)):
            return [self._sanitize(item) for item in obj]
        if isinstance(obj, str):
            return "".join(
                char
                for char in obj
                if char in self._KEPT_WHITESPACE or 32 <= ord(char) <= 126
            )
        return obj

    def encode(self, obj):
        """Return the JSON text for the sanitised form of *obj*.

        Args:
            obj: Any value the base encoder can serialise.

        Returns:
            The serialised JSON document as a string.
        """
        # The base ``encode`` serialises a top-level string directly and
        # routes everything else through ``self.iterencode``. Only the string
        # case therefore has to be sanitised here; sanitising containers in
        # both places would just repeat the work.
        if isinstance(obj, str):
            obj = self._sanitize(obj)
        return super().encode(obj)

    def iterencode(self, obj, _one_shot=False):
        """Yield the JSON text for the sanitised form of *obj* in chunks.

        Args:
            obj: Any value the base encoder can serialise.
            _one_shot: Forwarded unchanged to the base implementation.

        Returns:
            An iterator of string chunks, as produced by the base encoder.
        """
        # Delegate to the base class rather than to ``self.encode``: the base
        # ``encode`` calls ``iterencode`` itself, so routing one through the
        # other recurses without end for numbers, booleans and ``None``.
        return super().iterencode(self._sanitize(obj), _one_shot)
JSON file to {topics_path}:") @@ -115,9 +146,49 @@ class FileSystemOutputHandler(OutputHandler): if "tags" in output_data and "original_tags" not in output_data: output_data["original_tags"] = output_data["tags"] + # 保存原始值用于调试 + original_title = output_data.get("title", "") + original_content = output_data.get("content", "") + + # 添加Base64编码内容 + try: + # 编码标题和内容 + title_base64 = base64.b64encode(output_data.get("title", "").encode('utf-8')).decode('ascii') + content_base64 = base64.b64encode(output_data.get("content", "").encode('utf-8')).decode('ascii') + + # 添加到输出数据 + output_data["title_base64"] = title_base64 + output_data["content_base64"] = content_base64 + + # 如果有原始内容,也编码 + if "original_title" in output_data and output_data["original_title"]: + output_data["original_title_base64"] = base64.b64encode( + output_data["original_title"].encode('utf-8')).decode('ascii') + if "original_content" in output_data and output_data["original_content"]: + output_data["original_content_base64"] = base64.b64encode( + output_data["original_content"].encode('utf-8')).decode('ascii') + + logging.info("成功添加Base64编码内容") + except Exception as e: + logging.error(f"Base64编码内容时出错: {e}") + # 对内容进行深度清理,确保安全序列化 try: + # 暂存judge_success状态 + judge_success = output_data.get("judge_success", False) + + # 深度清理 output_data = self._sanitize_content_for_json(output_data) + + # 恢复judge_success状态 + output_data["judge_success"] = judge_success + + # 移除可能的错误标志 - 我们通过尝试序列化来决定是否设置它 + if "error" in output_data: + del output_data["error"] + if "raw_result" in output_data: + del output_data["raw_result"] + logging.info("内容已经过安全清理,可以序列化") except Exception as e: logging.error(f"内容清理过程中出错: {e}") @@ -126,23 +197,38 @@ class FileSystemOutputHandler(OutputHandler): content_path = os.path.join(variant_dir, "article.json") try: with open(content_path, "w", encoding="utf-8") as f: - json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True) + # 使用自定义的SafeJSONEncoder + json.dump(output_data, f, 
ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder) logging.info(f"Content JSON saved to: {content_path}") except Exception as e: logging.exception(f"Failed to save content JSON to {content_path}: {e}") - # 如果序列化失败,记录原始内容用于调试 - debug_path = os.path.join(variant_dir, "debug_content.txt") - try: - with open(debug_path, "w", encoding="utf-8") as f: - for key, value in output_data.items(): - if isinstance(value, str): - f.write(f"{key}: (length: {len(value)})\n") - f.write(f"{repr(value[:200])}...\n\n") - else: - f.write(f"{key}: {type(value)}\n") - logging.info(f"Debug content saved to: {debug_path}") - except Exception as debug_err: - logging.error(f"Failed to save debug content: {debug_err}") + + # 创建一份article.txt文件以便直接查看 + txt_path = os.path.join(variant_dir, "article.txt") + try: + # 使用原始内容 + with open(txt_path, "w", encoding="utf-8") as f: + f.write(f"{original_title}\n\n{original_content}") + logging.info(f"Article text saved to: {txt_path}") + except Exception as e: + logging.error(f"Failed to save article.txt: {e}") + + # 记录调试信息,无论是否成功 + debug_path = os.path.join(variant_dir, "debug_content.txt") + try: + with open(debug_path, "w", encoding="utf-8") as f: + f.write(f"原始标题: {original_title}\n\n") + f.write(f"原始内容: {original_content}\n\n") + f.write("---处理后---\n\n") + for key, value in output_data.items(): + if isinstance(value, str): + f.write(f"{key}: (length: {len(value)})\n") + f.write(f"{repr(value[:200])}...\n\n") + else: + f.write(f"{key}: {type(value)}\n") + logging.info(f"调试内容已保存到: {debug_path}") + except Exception as debug_err: + logging.error(f"保存调试内容失败: {debug_err}") # Save content prompt prompt_path = os.path.join(variant_dir, "tweet_prompt.txt") @@ -153,6 +239,12 @@ class FileSystemOutputHandler(OutputHandler): logging.info(f"Content prompt saved to: {prompt_path}") except Exception as e: logging.exception(f"Failed to save content prompt to {prompt_path}: {e}") + + def _ultra_safe_clean(self, text): + """执行最严格的字符清理,确保100%可序列化""" + 
if not isinstance(text, str): + return "" + return ''.join(c for c in text if 32 <= ord(c) <= 126) def handle_poster_configs(self, run_id: str, topic_index: int, config_data: list | dict): """Saves the complete poster configuration list/dict for a topic.""" @@ -160,7 +252,7 @@ class FileSystemOutputHandler(OutputHandler): config_path = os.path.join(run_dir, f"topic_{topic_index}_poster_configs.json") try: with open(config_path, 'w', encoding='utf-8') as f_cfg_topic: - json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True) + json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder) logging.info(f"Saved complete poster configurations for topic {topic_index} to: {config_path}") except Exception as save_err: logging.error(f"Failed to save complete poster configurations for topic {topic_index} to {config_path}: {save_err}") @@ -216,7 +308,7 @@ class FileSystemOutputHandler(OutputHandler): metadata_path = os.path.join(os.path.dirname(save_path), metadata_filename) try: with open(metadata_path, 'w', encoding='utf-8') as f: - json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True) + json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder) logging.info(f"保存{image_type}元数据到: {metadata_path}") except Exception as me: logging.error(f"无法保存{image_type}元数据到{metadata_path}: {me}") @@ -245,6 +337,11 @@ class FileSystemOutputHandler(OutputHandler): # 处理字典类型 sanitized_dict = {} for key, value in data.items(): + # 移除error标志,我们会在最终验证后重新设置它 + if key == "error": + continue + if key == "raw_result": + continue sanitized_dict[key] = self._sanitize_content_for_json(value) return sanitized_dict elif isinstance(data, list): @@ -256,22 +353,29 @@ class FileSystemOutputHandler(OutputHandler): # 1. 首先,替换所有字面的"\n"为真正的换行符 if r'\n' in data: data = data.replace(r'\n', '\n') - - # 2. 移除所有控制字符(ASCII 0-31,除了\n, \r, \t) - cleaned = '' + + # 2. 
使用更强的处理方式 - 只保留绝对安全的字符 + # - ASCII 32-126 (标准可打印ASCII字符) + # - 换行、回车、制表符 + # - 去除所有其他控制字符和潜在问题字符 + safe_chars = [] for char in data: - # 允许常见的空白字符 - if char in '\n\r\t' or ord(char) >= 32: - cleaned += char + if char in '\n\r\t' or (32 <= ord(char) <= 126): + safe_chars.append(char) + elif ord(char) > 127: # 非ASCII字符 (包括emoji) + # 转换为Unicode转义序列 + safe_chars.append(f"\\u{ord(char):04x}".encode().decode('unicode-escape')) + + cleaned = ''.join(safe_chars) # 3. 验证字符串可以被安全序列化 try: json.dumps(cleaned, ensure_ascii=False) return cleaned except Exception as e: - logging.warning(f"字符串清理后仍无法序列化,尝试更严格的清理: {e}") - # 如果仍然无法序列化,使用更严格的清理 - return ''.join(c for c in cleaned if ord(c) < 65536 and (c in '\n\r\t' or ord(c) >= 32)) + logging.warning(f"字符串清理后仍无法序列化,使用保守处理: {e}") + # 最保守的处理 - 只保留ASCII字符 + return ''.join(c for c in cleaned if ord(c) < 128) else: # 其他类型(数字、布尔值等)原样返回 return data \ No newline at end of file