修改了存储方式

This commit is contained in:
jinye_huang 2025-05-18 22:29:29 +08:00
parent e9ac187a3b
commit c8c4031696
2 changed files with 130 additions and 26 deletions

View File

@ -3,6 +3,37 @@ import simplejson as json
import logging
from abc import ABC, abstractmethod
import traceback
import base64
# 自定义JSON编码器强制处理所有可能的JSON序列化问题
class SafeJSONEncoder(json.JSONEncoder):
"""安全的JSON编码器可以处理所有类型的字符串"""
def encode(self, obj):
"""重写encode方法确保任何字符串都能被安全编码"""
if isinstance(obj, dict):
# 处理字典:递归处理每个值
return '{' + ','.join(f'"{key}":{self.encode(value)}'
for key, value in obj.items()
if key not in ["error", "raw_result"]) + '}'
elif isinstance(obj, list):
# 处理列表:递归处理每个项
return '[' + ','.join(self.encode(item) for item in obj) + ']'
elif isinstance(obj, str):
# 安全处理字符串:移除可能导致问题的字符
safe_str = ''
for char in obj:
if char in '\n\r\t' or (32 <= ord(char) <= 126):
safe_str += char
# 跳过所有其他字符
return json.JSONEncoder.encode(self, safe_str)
else:
# 其他类型:使用默认处理
return json.JSONEncoder.encode(self, obj)
def iterencode(self, obj, _one_shot=False):
"""重写iterencode方法确保能处理迭代编码"""
return self.encode(obj)
class OutputHandler(ABC):
"""Abstract base class for handling the output of the generation pipeline."""
@ -70,7 +101,7 @@ class FileSystemOutputHandler(OutputHandler):
topics_path = os.path.join(run_dir, f"tweet_topic_{run_id}.json")
try:
with open(topics_path, "w", encoding="utf-8") as f:
json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True)
json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
logging.info(f"Topics list saved successfully to: {topics_path}")
except Exception as e:
logging.exception(f"Error saving topic JSON file to {topics_path}:")
@ -115,9 +146,49 @@ class FileSystemOutputHandler(OutputHandler):
if "tags" in output_data and "original_tags" not in output_data:
output_data["original_tags"] = output_data["tags"]
# 保存原始值用于调试
original_title = output_data.get("title", "")
original_content = output_data.get("content", "")
# 添加Base64编码内容
try:
# 编码标题和内容
title_base64 = base64.b64encode(output_data.get("title", "").encode('utf-8')).decode('ascii')
content_base64 = base64.b64encode(output_data.get("content", "").encode('utf-8')).decode('ascii')
# 添加到输出数据
output_data["title_base64"] = title_base64
output_data["content_base64"] = content_base64
# 如果有原始内容,也编码
if "original_title" in output_data and output_data["original_title"]:
output_data["original_title_base64"] = base64.b64encode(
output_data["original_title"].encode('utf-8')).decode('ascii')
if "original_content" in output_data and output_data["original_content"]:
output_data["original_content_base64"] = base64.b64encode(
output_data["original_content"].encode('utf-8')).decode('ascii')
logging.info("成功添加Base64编码内容")
except Exception as e:
logging.error(f"Base64编码内容时出错: {e}")
# 对内容进行深度清理,确保安全序列化
try:
# 暂存judge_success状态
judge_success = output_data.get("judge_success", False)
# 深度清理
output_data = self._sanitize_content_for_json(output_data)
# 恢复judge_success状态
output_data["judge_success"] = judge_success
# 移除可能的错误标志 - 我们通过尝试序列化来决定是否设置它
if "error" in output_data:
del output_data["error"]
if "raw_result" in output_data:
del output_data["raw_result"]
logging.info("内容已经过安全清理,可以序列化")
except Exception as e:
logging.error(f"内容清理过程中出错: {e}")
@ -126,23 +197,38 @@ class FileSystemOutputHandler(OutputHandler):
content_path = os.path.join(variant_dir, "article.json")
try:
with open(content_path, "w", encoding="utf-8") as f:
json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
# 使用自定义的SafeJSONEncoder
json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
logging.info(f"Content JSON saved to: {content_path}")
except Exception as e:
logging.exception(f"Failed to save content JSON to {content_path}: {e}")
# 如果序列化失败,记录原始内容用于调试
# 创建一份article.txt文件以便直接查看
txt_path = os.path.join(variant_dir, "article.txt")
try:
# 使用原始内容
with open(txt_path, "w", encoding="utf-8") as f:
f.write(f"{original_title}\n\n{original_content}")
logging.info(f"Article text saved to: {txt_path}")
except Exception as e:
logging.error(f"Failed to save article.txt: {e}")
# 记录调试信息,无论是否成功
debug_path = os.path.join(variant_dir, "debug_content.txt")
try:
with open(debug_path, "w", encoding="utf-8") as f:
f.write(f"原始标题: {original_title}\n\n")
f.write(f"原始内容: {original_content}\n\n")
f.write("---处理后---\n\n")
for key, value in output_data.items():
if isinstance(value, str):
f.write(f"{key}: (length: {len(value)})\n")
f.write(f"{repr(value[:200])}...\n\n")
else:
f.write(f"{key}: {type(value)}\n")
logging.info(f"Debug content saved to: {debug_path}")
logging.info(f"调试内容已保存到: {debug_path}")
except Exception as debug_err:
logging.error(f"Failed to save debug content: {debug_err}")
logging.error(f"保存调试内容失败: {debug_err}")
# Save content prompt
prompt_path = os.path.join(variant_dir, "tweet_prompt.txt")
@ -154,13 +240,19 @@ class FileSystemOutputHandler(OutputHandler):
except Exception as e:
logging.exception(f"Failed to save content prompt to {prompt_path}: {e}")
def _ultra_safe_clean(self, text):
    """Reduce *text* to the strictest serializable form: printable ASCII only.

    Characters outside the 32-126 range (including all whitespace controls)
    are dropped; any non-string input collapses to an empty string, so the
    result is always guaranteed to JSON-serialize.
    """
    if not isinstance(text, str):
        return ""
    printable = [ch for ch in text if 32 <= ord(ch) <= 126]
    return ''.join(printable)
def handle_poster_configs(self, run_id: str, topic_index: int, config_data: list | dict):
    """Persist the full poster configuration payload for one topic.

    Writes pretty-printed JSON (non-ASCII preserved, NaN ignored, content
    sanitized via SafeJSONEncoder) to ``topic_<index>_poster_configs.json``
    inside the run directory. Failures are logged, never raised.
    """
    target_path = os.path.join(
        self._get_run_dir(run_id),
        f"topic_{topic_index}_poster_configs.json",
    )
    try:
        with open(target_path, 'w', encoding='utf-8') as cfg_file:
            json.dump(
                config_data,
                cfg_file,
                ensure_ascii=False,
                indent=4,
                ignore_nan=True,
                cls=SafeJSONEncoder,
            )
        logging.info(f"Saved complete poster configurations for topic {topic_index} to: {target_path}")
    except Exception as save_err:
        logging.error(f"Failed to save complete poster configurations for topic {topic_index} to {target_path}: {save_err}")
@ -216,7 +308,7 @@ class FileSystemOutputHandler(OutputHandler):
metadata_path = os.path.join(os.path.dirname(save_path), metadata_filename)
try:
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True)
json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
logging.info(f"保存{image_type}元数据到: {metadata_path}")
except Exception as me:
logging.error(f"无法保存{image_type}元数据到{metadata_path}: {me}")
@ -245,6 +337,11 @@ class FileSystemOutputHandler(OutputHandler):
# 处理字典类型
sanitized_dict = {}
for key, value in data.items():
# 移除error标志我们会在最终验证后重新设置它
if key == "error":
continue
if key == "raw_result":
continue
sanitized_dict[key] = self._sanitize_content_for_json(value)
return sanitized_dict
elif isinstance(data, list):
@ -257,21 +354,28 @@ class FileSystemOutputHandler(OutputHandler):
if r'\n' in data:
data = data.replace(r'\n', '\n')
# 2. 移除所有控制字符ASCII 0-31除了\n, \r, \t
cleaned = ''
# 2. 使用更强的处理方式 - 只保留绝对安全的字符
# - ASCII 32-126 (标准可打印ASCII字符)
# - 换行、回车、制表符
# - 去除所有其他控制字符和潜在问题字符
safe_chars = []
for char in data:
# 允许常见的空白字符
if char in '\n\r\t' or ord(char) >= 32:
cleaned += char
if char in '\n\r\t' or (32 <= ord(char) <= 126):
safe_chars.append(char)
elif ord(char) > 127: # 非ASCII字符 (包括emoji)
# 转换为Unicode转义序列
safe_chars.append(f"\\u{ord(char):04x}".encode().decode('unicode-escape'))
cleaned = ''.join(safe_chars)
# 3. 验证字符串可以被安全序列化
try:
json.dumps(cleaned, ensure_ascii=False)
return cleaned
except Exception as e:
logging.warning(f"字符串清理后仍无法序列化,尝试更严格的清理: {e}")
# 如果仍然无法序列化,使用更严格的清理
return ''.join(c for c in cleaned if ord(c) < 65536 and (c in '\n\r\t' or ord(c) >= 32))
logging.warning(f"字符串清理后仍无法序列化,使用保守处理: {e}")
# 最保守的处理 - 只保留ASCII字符
return ''.join(c for c in cleaned if ord(c) < 128)
else:
# 其他类型(数字、布尔值等)原样返回
return data