修改了存储方式
This commit is contained in:
parent
e9ac187a3b
commit
c8c4031696
Binary file not shown.
@ -3,6 +3,37 @@ import simplejson as json
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
import traceback
|
||||
import base64
|
||||
|
||||
# 自定义JSON编码器,强制处理所有可能的JSON序列化问题
|
||||
class SafeJSONEncoder(json.JSONEncoder):
    """JSON encoder that sanitizes a value before serializing it.

    Before delegating to the base encoder, the value is walked recursively:

    * dict entries whose key is ``"error"`` or ``"raw_result"`` are dropped;
    * every string value is reduced to printable ASCII (code points 32-126)
      plus newline, carriage return and tab;
    * dicts and lists are processed recursively; any other value (numbers,
      booleans, ``None``, tuples, ...) is handed to the base encoder as is.

    Dict keys are not filtered; they are escaped by the base encoder.
    Constructor options such as ``indent``, ``ensure_ascii`` or
    ``sort_keys`` are honoured because the actual encoding is done by the
    base class.

    NOTE(review): the string filter discards all non-ASCII text (CJK,
    emoji, ...), not only control characters -- confirm this is intended.
    """

    # Keys that are never written to the output, at any nesting depth.
    _DROPPED_KEYS = frozenset({"error", "raw_result"})
    # Control characters that survive the string filter.
    _KEPT_CONTROL_CHARS = "\n\r\t"

    def encode(self, obj):
        """Return the JSON text for the sanitized form of *obj*."""
        # The base ``encode`` may call ``self.iterencode`` internally, which
        # sanitizes again; sanitizing is idempotent, so that is harmless.
        return super().encode(self._sanitize_value(obj))

    def iterencode(self, obj, _one_shot=False):
        """Yield JSON chunks for the sanitized form of *obj*.

        This is the entry point used by ``json.dump``. It delegates to the
        base implementation (never back to ``encode``), so scalars do not
        trigger unbounded recursion and output is streamed chunk by chunk.
        """
        return super().iterencode(self._sanitize_value(obj), _one_shot=_one_shot)

    def _sanitize_value(self, value):
        """Return a sanitized copy of *value*; the input is not mutated."""
        if isinstance(value, dict):
            return {
                key: self._sanitize_value(item)
                for key, item in value.items()
                if key not in self._DROPPED_KEYS
            }
        if isinstance(value, list):
            return [self._sanitize_value(item) for item in value]
        if isinstance(value, str):
            return self._clean_text(value)
        return value

    def _clean_text(self, text):
        """Keep printable ASCII and ``\\n``, ``\\r``, ``\\t``; drop the rest."""
        kept = self._KEPT_CONTROL_CHARS
        return "".join(
            char for char in text if char in kept or 32 <= ord(char) <= 126
        )
|
||||
|
||||
class OutputHandler(ABC):
|
||||
"""Abstract base class for handling the output of the generation pipeline."""
|
||||
@ -70,7 +101,7 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
topics_path = os.path.join(run_dir, f"tweet_topic_{run_id}.json")
|
||||
try:
|
||||
with open(topics_path, "w", encoding="utf-8") as f:
|
||||
json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True)
|
||||
json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
|
||||
logging.info(f"Topics list saved successfully to: {topics_path}")
|
||||
except Exception as e:
|
||||
logging.exception(f"Error saving topic JSON file to {topics_path}:")
|
||||
@ -115,9 +146,49 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
if "tags" in output_data and "original_tags" not in output_data:
|
||||
output_data["original_tags"] = output_data["tags"]
|
||||
|
||||
# 保存原始值用于调试
|
||||
original_title = output_data.get("title", "")
|
||||
original_content = output_data.get("content", "")
|
||||
|
||||
# 添加Base64编码内容
|
||||
try:
|
||||
# 编码标题和内容
|
||||
title_base64 = base64.b64encode(output_data.get("title", "").encode('utf-8')).decode('ascii')
|
||||
content_base64 = base64.b64encode(output_data.get("content", "").encode('utf-8')).decode('ascii')
|
||||
|
||||
# 添加到输出数据
|
||||
output_data["title_base64"] = title_base64
|
||||
output_data["content_base64"] = content_base64
|
||||
|
||||
# 如果有原始内容,也编码
|
||||
if "original_title" in output_data and output_data["original_title"]:
|
||||
output_data["original_title_base64"] = base64.b64encode(
|
||||
output_data["original_title"].encode('utf-8')).decode('ascii')
|
||||
if "original_content" in output_data and output_data["original_content"]:
|
||||
output_data["original_content_base64"] = base64.b64encode(
|
||||
output_data["original_content"].encode('utf-8')).decode('ascii')
|
||||
|
||||
logging.info("成功添加Base64编码内容")
|
||||
except Exception as e:
|
||||
logging.error(f"Base64编码内容时出错: {e}")
|
||||
|
||||
# 对内容进行深度清理,确保安全序列化
|
||||
try:
|
||||
# 暂存judge_success状态
|
||||
judge_success = output_data.get("judge_success", False)
|
||||
|
||||
# 深度清理
|
||||
output_data = self._sanitize_content_for_json(output_data)
|
||||
|
||||
# 恢复judge_success状态
|
||||
output_data["judge_success"] = judge_success
|
||||
|
||||
# 移除可能的错误标志 - 我们通过尝试序列化来决定是否设置它
|
||||
if "error" in output_data:
|
||||
del output_data["error"]
|
||||
if "raw_result" in output_data:
|
||||
del output_data["raw_result"]
|
||||
|
||||
logging.info("内容已经过安全清理,可以序列化")
|
||||
except Exception as e:
|
||||
logging.error(f"内容清理过程中出错: {e}")
|
||||
@ -126,23 +197,38 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
content_path = os.path.join(variant_dir, "article.json")
|
||||
try:
|
||||
with open(content_path, "w", encoding="utf-8") as f:
|
||||
json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
|
||||
# 使用自定义的SafeJSONEncoder
|
||||
json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
|
||||
logging.info(f"Content JSON saved to: {content_path}")
|
||||
except Exception as e:
|
||||
logging.exception(f"Failed to save content JSON to {content_path}: {e}")
|
||||
# 如果序列化失败,记录原始内容用于调试
|
||||
debug_path = os.path.join(variant_dir, "debug_content.txt")
|
||||
try:
|
||||
with open(debug_path, "w", encoding="utf-8") as f:
|
||||
for key, value in output_data.items():
|
||||
if isinstance(value, str):
|
||||
f.write(f"{key}: (length: {len(value)})\n")
|
||||
f.write(f"{repr(value[:200])}...\n\n")
|
||||
else:
|
||||
f.write(f"{key}: {type(value)}\n")
|
||||
logging.info(f"Debug content saved to: {debug_path}")
|
||||
except Exception as debug_err:
|
||||
logging.error(f"Failed to save debug content: {debug_err}")
|
||||
|
||||
# 创建一份article.txt文件以便直接查看
|
||||
txt_path = os.path.join(variant_dir, "article.txt")
|
||||
try:
|
||||
# 使用原始内容
|
||||
with open(txt_path, "w", encoding="utf-8") as f:
|
||||
f.write(f"{original_title}\n\n{original_content}")
|
||||
logging.info(f"Article text saved to: {txt_path}")
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to save article.txt: {e}")
|
||||
|
||||
# 记录调试信息,无论是否成功
|
||||
debug_path = os.path.join(variant_dir, "debug_content.txt")
|
||||
try:
|
||||
with open(debug_path, "w", encoding="utf-8") as f:
|
||||
f.write(f"原始标题: {original_title}\n\n")
|
||||
f.write(f"原始内容: {original_content}\n\n")
|
||||
f.write("---处理后---\n\n")
|
||||
for key, value in output_data.items():
|
||||
if isinstance(value, str):
|
||||
f.write(f"{key}: (length: {len(value)})\n")
|
||||
f.write(f"{repr(value[:200])}...\n\n")
|
||||
else:
|
||||
f.write(f"{key}: {type(value)}\n")
|
||||
logging.info(f"调试内容已保存到: {debug_path}")
|
||||
except Exception as debug_err:
|
||||
logging.error(f"保存调试内容失败: {debug_err}")
|
||||
|
||||
# Save content prompt
|
||||
prompt_path = os.path.join(variant_dir, "tweet_prompt.txt")
|
||||
@ -153,6 +239,12 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
logging.info(f"Content prompt saved to: {prompt_path}")
|
||||
except Exception as e:
|
||||
logging.exception(f"Failed to save content prompt to {prompt_path}: {e}")
|
||||
|
||||
def _ultra_safe_clean(self, text):
|
||||
"""执行最严格的字符清理,确保100%可序列化"""
|
||||
if not isinstance(text, str):
|
||||
return ""
|
||||
return ''.join(c for c in text if 32 <= ord(c) <= 126)
|
||||
|
||||
def handle_poster_configs(self, run_id: str, topic_index: int, config_data: list | dict):
|
||||
"""Saves the complete poster configuration list/dict for a topic."""
|
||||
@ -160,7 +252,7 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
config_path = os.path.join(run_dir, f"topic_{topic_index}_poster_configs.json")
|
||||
try:
|
||||
with open(config_path, 'w', encoding='utf-8') as f_cfg_topic:
|
||||
json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True)
|
||||
json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
|
||||
logging.info(f"Saved complete poster configurations for topic {topic_index} to: {config_path}")
|
||||
except Exception as save_err:
|
||||
logging.error(f"Failed to save complete poster configurations for topic {topic_index} to {config_path}: {save_err}")
|
||||
@ -216,7 +308,7 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
metadata_path = os.path.join(os.path.dirname(save_path), metadata_filename)
|
||||
try:
|
||||
with open(metadata_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True)
|
||||
json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True, cls=SafeJSONEncoder)
|
||||
logging.info(f"保存{image_type}元数据到: {metadata_path}")
|
||||
except Exception as me:
|
||||
logging.error(f"无法保存{image_type}元数据到{metadata_path}: {me}")
|
||||
@ -245,6 +337,11 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
# 处理字典类型
|
||||
sanitized_dict = {}
|
||||
for key, value in data.items():
|
||||
# 移除error标志,我们会在最终验证后重新设置它
|
||||
if key == "error":
|
||||
continue
|
||||
if key == "raw_result":
|
||||
continue
|
||||
sanitized_dict[key] = self._sanitize_content_for_json(value)
|
||||
return sanitized_dict
|
||||
elif isinstance(data, list):
|
||||
@ -256,22 +353,29 @@ class FileSystemOutputHandler(OutputHandler):
|
||||
# 1. 首先,替换所有字面的"\n"为真正的换行符
|
||||
if r'\n' in data:
|
||||
data = data.replace(r'\n', '\n')
|
||||
|
||||
# 2. 移除所有控制字符(ASCII 0-31,除了\n, \r, \t)
|
||||
cleaned = ''
|
||||
|
||||
# 2. 使用更强的处理方式 - 只保留绝对安全的字符
|
||||
# - ASCII 32-126 (标准可打印ASCII字符)
|
||||
# - 换行、回车、制表符
|
||||
# - 去除所有其他控制字符和潜在问题字符
|
||||
safe_chars = []
|
||||
for char in data:
|
||||
# 允许常见的空白字符
|
||||
if char in '\n\r\t' or ord(char) >= 32:
|
||||
cleaned += char
|
||||
if char in '\n\r\t' or (32 <= ord(char) <= 126):
|
||||
safe_chars.append(char)
|
||||
elif ord(char) > 127: # 非ASCII字符 (包括emoji)
|
||||
# 转换为Unicode转义序列
|
||||
safe_chars.append(f"\\u{ord(char):04x}".encode().decode('unicode-escape'))
|
||||
|
||||
cleaned = ''.join(safe_chars)
|
||||
|
||||
# 3. 验证字符串可以被安全序列化
|
||||
try:
|
||||
json.dumps(cleaned, ensure_ascii=False)
|
||||
return cleaned
|
||||
except Exception as e:
|
||||
logging.warning(f"字符串清理后仍无法序列化,尝试更严格的清理: {e}")
|
||||
# 如果仍然无法序列化,使用更严格的清理
|
||||
return ''.join(c for c in cleaned if ord(c) < 65536 and (c in '\n\r\t' or ord(c) >= 32))
|
||||
logging.warning(f"字符串清理后仍无法序列化,使用保守处理: {e}")
|
||||
# 最保守的处理 - 只保留ASCII字符
|
||||
return ''.join(c for c in cleaned if ord(c) < 128)
|
||||
else:
|
||||
# 其他类型(数字、布尔值等)原样返回
|
||||
return data
|
||||
Loading…
x
Reference in New Issue
Block a user