修复了吞换行的问题
This commit is contained in:
parent
3b6a01d3a4
commit
b7c5f92e89
Binary file not shown.
@@ -427,12 +427,12 @@ class ContentJudger:
|
||||
return None
|
||||
|
||||
def _sanitize_json_text(self, text):
|
||||
"""彻底清理文本,确保可以安全解析为JSON"""
|
||||
# 步骤1: 处理控制字符
|
||||
"""彻底清理文本,确保可以安全解析为JSON,同时保留换行符"""
|
||||
# 步骤1: 处理控制字符,但保留换行符、回车和制表符
|
||||
cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
|
||||
|
||||
# 步骤2: 特殊处理换行符,将实际换行转换为\n字符串
|
||||
cleaned = cleaned.replace('\n', '\\n').replace('\r', '\\r')
|
||||
# 不再将实际换行符转换为\n字符串,保留原始换行符
|
||||
# cleaned = cleaned.replace('\n', '\\n').replace('\r', '\\r')
|
||||
|
||||
# 步骤3: 处理内容字段中开始或结束可能存在的多余空格或引号
|
||||
cleaned = re.sub(r'"content"\s*:\s*"\s*', '"content":"', cleaned)
|
||||
@@ -482,8 +482,12 @@ class ContentJudger:
|
||||
|
||||
if content_end > 0:
|
||||
content = text[quote_pos+1:content_end].replace('\\"', '"')
|
||||
# 特殊处理换行符
|
||||
content = content.replace('\\n', '\n').replace('\\r', '\r')
|
||||
# 处理反斜杠转义的换行符,如果字符串中有'\n',将其转换为实际换行符
|
||||
# 但如果已经是实际的换行符,则保留
|
||||
if '\\n' in content:
|
||||
content = content.replace('\\n', '\n')
|
||||
if '\\r' in content:
|
||||
content = content.replace('\\r', '\r')
|
||||
result['content'] = content.strip()
|
||||
|
||||
# 查找analysis字段
|
||||
@@ -503,6 +507,11 @@ class ContentJudger:
|
||||
|
||||
if analysis_end > 0:
|
||||
analysis = text[quote_pos+1:analysis_end].replace('\\"', '"')
|
||||
# 处理反斜杠转义的换行符
|
||||
if '\\n' in analysis:
|
||||
analysis = analysis.replace('\\n', '\n')
|
||||
if '\\r' in analysis:
|
||||
analysis = analysis.replace('\\r', '\r')
|
||||
result['analysis'] = analysis.strip()
|
||||
|
||||
return result if 'title' in result and 'content' in result else None
|
||||
@@ -555,7 +564,7 @@ class ContentJudger:
|
||||
|
||||
def _prepare_content_for_serialization(self, content_dict):
|
||||
"""
|
||||
对内容进行处理,确保可以安全序列化为JSON,同时保留emoji字符
|
||||
对内容进行处理,确保可以安全序列化为JSON,同时保留emoji字符和换行符
|
||||
|
||||
Args:
|
||||
content_dict: 内容字典
|
||||
@@ -570,14 +579,22 @@ class ContentJudger:
|
||||
for key, value in content_dict.items():
|
||||
# 处理字符串类型的值
|
||||
if isinstance(value, str):
|
||||
# 第一步:彻底清理所有控制字符
|
||||
safe_value = re.sub(r'[\x00-\x1F\x7F]', '', value)
|
||||
# 第一步:清理控制字符,但保留换行符、回车和制表符
|
||||
safe_value = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', value)
|
||||
|
||||
# 第二步:将emoji字符转换为相应的Unicode转义序列
|
||||
# 这样能确保JSON序列化安全,同时保留emoji语义
|
||||
# 确保文本中的反斜杠换行符(如\\n)被转换为实际换行符
|
||||
if '\\n' in safe_value:
|
||||
safe_value = safe_value.replace('\\n', '\n')
|
||||
if '\\r' in safe_value:
|
||||
safe_value = safe_value.replace('\\r', '\r')
|
||||
|
||||
# 第二步:将emoji字符和其他非ASCII字符转换为相应的Unicode转义序列
|
||||
char_list = []
|
||||
for char in safe_value:
|
||||
if ord(char) > 127: # 非ASCII字符
|
||||
# 保留常见的控制字符(换行符、回车、制表符)
|
||||
if char in '\n\r\t':
|
||||
char_list.append(char)
|
||||
elif ord(char) > 127: # 非ASCII字符
|
||||
# 尝试保留高位字符(包括emoji)
|
||||
try:
|
||||
# 验证这个字符是否可以安全序列化
|
||||
@@ -591,18 +608,18 @@ class ContentJudger:
|
||||
|
||||
processed_value = ''.join(char_list)
|
||||
|
||||
# 对于内容字段,特别注意保存换行符
|
||||
if key == "content" and '\\n' in processed_value:
|
||||
processed_value = processed_value.replace('\\n', '\n')
|
||||
|
||||
# 最终验证这个值是否可以安全序列化
|
||||
try:
|
||||
json.dumps(processed_value, ensure_ascii=False)
|
||||
safe_dict[key] = processed_value
|
||||
except Exception as e:
|
||||
logging.warning(f"处理后的'{key}'值仍无法序列化: {e},将进行更严格处理")
|
||||
# 更严格的处理:只保留ASCII字符
|
||||
safe_dict[key] = ''.join(c for c in processed_value if ord(c) < 128)
|
||||
# 更严格的处理:保留ASCII字符和基本控制字符
|
||||
safe_value = ''
|
||||
for c in processed_value:
|
||||
if c in '\n\r\t' or (32 <= ord(c) < 127):
|
||||
safe_value += c
|
||||
safe_dict[key] = safe_value
|
||||
else:
|
||||
safe_dict[key] = value
|
||||
|
||||
@@ -615,10 +632,16 @@ class ContentJudger:
|
||||
json.loads(json_str)
|
||||
except Exception as e:
|
||||
logging.error(f"最终字典序列化验证失败: {e}")
|
||||
# 如果依然失败,返回一个绝对安全的结果
|
||||
# 如果依然失败,返回一个绝对安全的结果,但保留换行符
|
||||
safe_content = ''
|
||||
original_content = content_dict.get("content", "内容包含无法安全序列化的字符")
|
||||
for c in original_content:
|
||||
if c in '\n\r\t' or (32 <= ord(c) < 127):
|
||||
safe_content += c
|
||||
|
||||
return {
|
||||
"title": re.sub(r'[^\x20-\x7E]', '', content_dict.get("title", "序列化处理失败")),
|
||||
"content": re.sub(r'[^\x20-\x7E]', '', "内容包含无法安全序列化的字符,已移除所有非ASCII字符"),
|
||||
"content": safe_content,
|
||||
"judge_success": content_dict.get("judge_success", False),
|
||||
"error": True,
|
||||
"raw_result": str(e)
|
||||
@@ -626,11 +649,10 @@ class ContentJudger:
|
||||
|
||||
return safe_dict
|
||||
except Exception as e:
|
||||
logging.error(f"处理内容以确保安全序列化时出错: {e}")
|
||||
# 如果处理失败,返回一个基本的安全字典
|
||||
logging.error(f"序列化准备过程中发生意外错误: {e}")
|
||||
return {
|
||||
"title": "序列化处理失败",
|
||||
"content": "内容包含无法安全序列化的字符",
|
||||
"content": "处理内容时发生意外错误",
|
||||
"judge_success": False,
|
||||
"error": True,
|
||||
"raw_result": str(e)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user