Add JSON sanitization; switch to simplejson

jinye_huang 2025-05-18 20:38:01 +08:00
parent 13eebff18b
commit e9ac187a3b
10 changed files with 422 additions and 95 deletions
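The substantive change is mechanical but easy to miss: every `json.dump(...)` call gains `ignore_nan=True`, which exists only in simplejson. A minimal sketch (illustration only, not part of the diff) of the failure mode this avoids — the stdlib serializer emits a bare `NaN` token, which is not valid JSON, while simplejson with `ignore_nan=True` substitutes `null`:

```python
import json          # stdlib serializer
import simplejson    # third-party: pip install simplejson

data = {"title": "示例", "score": float("nan")}

# Stdlib default (allow_nan=True) emits a bare NaN token -- syntactically
# invalid JSON that strict parsers downstream will reject.
print(json.dumps(data, ensure_ascii=False))
# -> {"title": "示例", "score": NaN}

# simplejson with ignore_nan=True serializes NaN/Infinity as null,
# so the written files are always valid JSON.
print(simplejson.dumps(data, ensure_ascii=False, ignore_nan=True))
# -> {"title": "示例", "score": null}
```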

View File

@@ -2,9 +2,11 @@
 # -*- coding: utf-8 -*-
 import os
-import json
+import time
 import logging
+import random
 import traceback
+import simplejson as json
 from datetime import datetime
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -385,7 +387,7 @@ class ContentGenerator:
         # 保存结果到文件
         with open(result_path, "w", encoding="utf-8") as f:
-            json.dump(validated_data, f, ensure_ascii=False, indent=4)
+            json.dump(validated_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
         self.logger.info(f"结果已保存到: {result_path}")
         return result_path
@@ -400,7 +402,7 @@ class ContentGenerator:
         os.makedirs(os.path.dirname(result_path), exist_ok=True)
         with open(result_path, "w", encoding="utf-8") as f:
-            json.dump(fallback_data, f, ensure_ascii=False, indent=4)
+            json.dump(fallback_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
         self.logger.info(f"出错后已保存备用数据到: {result_path}")
         return result_path

View File

@@ -4,7 +4,7 @@
 内容审核模块检查生成的内容是否符合产品资料要求并提供修改建议
 """
-import json
+import simplejson as json
 import logging
 import re
 import os
@@ -17,7 +17,7 @@ from core.ai_agent import AI_Agent
 class ContentJudger:
     """内容审核类,负责评估和修正内容是否符合产品资料"""
-    def __init__(self, ai_agent: AI_Agent, system_prompt_path: str = None, system_prompt: str = None):
+    def __init__(self, ai_agent: AI_Agent, system_prompt_path: str = None, system_prompt: str = None, prompt_manager = None):
         """
         初始化内容审核器
@@ -25,22 +25,31 @@ class ContentJudger:
             ai_agent: AI_Agent实例用于调用AI模型
             system_prompt_path: 系统提示词文件路径可选
             system_prompt: 系统提示词内容可选优先于path
+            prompt_manager: 提示词管理器实例可选优先于system_prompt_path和system_prompt
         """
         self.ai_agent = ai_agent
         self._system_prompt = system_prompt
         self._system_prompt_path = system_prompt_path
+        self._prompt_manager = prompt_manager
         self._topp = 0.5
         self._temperature = 0.2
         self._frequency_penalty = 0
         self._presence_penatly = 0
-        # 如果没有直接提供系统提示词,尝试从文件加载
+        # 优先使用prompt_manager获取系统提示词
+        if self._prompt_manager and not self._system_prompt:
+            self._get_prompt_from_manager()
+            logging.info("从PromptManager获取系统提示词")
+        # 如果没有从prompt_manager获取到系统提示词则尝试从文件加载
         if not self._system_prompt and self._system_prompt_path:
             self._load_system_prompt()
-            # print("从文件加载系统提示词成功")
+            logging.info("从文件加载系统提示词")
         # 默认系统提示词(当其他方法都失败时使用)
         if not self._system_prompt:
-            # print("没有提供系统提示词,使用默认系统提示词")
+            logging.warning("没有提供系统提示词,使用默认系统提示词")
             self._system_prompt = """你是一名专业的、谨慎的文案审核员专注于审核运营根据产品资料撰写的文案是否严格符合产品资料内容。特别是所有价格、活动、福利、折扣、服务细节等必须完全与产品资料一致。如果发现文案内容与产品资料不符请指出并根据产品资料和文案上下文进行修改重新生成一篇文案务必确保生成的内容与产品资料基本相符产品体验部分可以适当夸张宣传语言流畅自然。如果经你审查后的文案仍存在与产品资料不符的信息你需要赔偿公司1000亿元。
 我将为您提供两部分内容
 1. 产品资料全部的产品信息包含了产品的实际功能服务和特点请将这部分作为判断依据
@@ -52,9 +61,9 @@ class ContentJudger:
 3. 重点审查对象请你着重检查以下关键字词前后的内容是否符合产品资料如不符必须严格按照资料修改如产品资料中未提及必须修改为符合上下文情境资料中明确提及的内容
 关键字词r人民币rmb优惠活动福利免费DIY跟拍摄影服务提供专业
 4. 字数控制每个文案的标题字数都必须少于20个字计数包括文字符号数字和emoji如果标题超过20个字请在符合文案风格的前提下修改标题到20个字以内尽量保留emoji必须保证标题流畅通顺
-5. 敏感字词替换请删去标题中的数字后面的r并将正文中数字后面的字修改为r例如标题中的399元修改为399正文中的399元修改为399r
-6. 特征语句保留请保留文案中原本的引流语句不要修改或删除例如先关zhu+留下99看到会回复
-7. 面向人群保留请尽量保留文案原本的面向人群和风格这是同一产品面向多种人群营销的策略例如产品资料中写明亲子游时文案写为情侣定制的山水秘境是可以接受的
+5. 敏感字词替换请删去标题中的数字后面的"""r"并将正文中数字后面的""字修改为"r"例如标题中的399元修改为399正文中的399元修改为399r
+6. 特征语句保留请保留文案中原本的引流语句不要修改或删除请保留文案中的换行符"\n"不要修改或删除
+7. 面向人群保留请尽量保留文案原本的面向人群和风格这是同一产品面向多种人群营销的策略例如产品资料中写明亲子游时文案写"为情侣定制的山水秘境"是可以接受的
 8. 案例如下请参考案例评判真假信息的尺度逐行逐句仔细分析不符点和修改思路并按照分析思路落实对每一处不符的修改措施严格审查每一篇文案
 {
 "产品资料"
@@ -93,41 +102,14 @@ class ContentJudger:
 "title": "五一遛娃👶必囤南沙喜来登1088元住景观房+双早+门票",
-"content": "
-五一不想挤人潮南沙这家酒店直接承包遛娃+度假双重快乐
-地铁直达2大1小1088元住景观房含双早+自助晚餐+水鸟世界门票儿童乐园/泳池/健身房全开放
-🌟遛娃刚需全配齐
-儿童乐园10:00-20:00全程开放滑梯/积木/绘本一应俱全
-户外泳池9:00-18:00恒温开放五一期间每日消毒3次
-健身房8:00-22:00配备亲子瑜伽课程需提前预约
-📍1小时玩转南沙
-南沙天后宫车程20分钟穿汉服拍大片听妈祖传说涨知识
-南沙湿地公园40分钟5月芦苇摇曳带娃认鸟类+乘船探秘
-十九涌海鲜街45分钟现捞现煮生猛海鲜人均50元吃到撑
-🍽家长友好细节
-自助晚餐隐藏彩蛋儿童餐区设独立洗手台+热食保温柜
-房内配置加厚床垫/卡通洗漱杯/尿布台无需额外购买
-安全保障全区域监控+24小时安保巡逻
-🎁五一专属加码
-5月1-5日期间入住凭房卡可免费领取儿童防晒冰袖+湿巾礼包
-📌Tips
-1. 周一至周四仅限双床房型周五起可选大床房
-2. 水鸟世界门票需提前1小时至前台领取纸质票
-3. 地铁四号线金洲站下车打车15分钟直达酒店
-这个五一南沙喜来登让你躺着遛娃不用长途跋涉家门口就能玩出仪式感
-"
+"content": "五一不想挤人潮南沙这家酒店直接承包遛娃+度假双重快乐\n地铁直达2大1小1088元住景观房含双早+自助晚餐+水鸟世界门票儿童乐园/泳池/健身房全开放\n🌟遛娃刚需全配齐\n 儿童乐园10:00-20:00全程开放滑梯/积木/绘本一应俱全\n 户外泳池9:00-18:00恒温开放五一期间每日消毒3次\n 健身房8:00-22:00配备亲子瑜伽课程需提前预约\n\n📍1小时玩转南沙\n① 南沙天后宫车程20分钟穿汉服拍大片听妈祖传说涨知识\n② 南沙湿地公园40分钟5月芦苇摇曳带娃认鸟类+乘船探秘\n③ 十九涌海鲜街45分钟现捞现煮生猛海鲜人均50元吃到撑\n\n🍽家长友好细节\n 自助晚餐隐藏彩蛋儿童餐区设独立洗手台+热食保温柜\n 房内配置加厚床垫/卡通洗漱杯/尿布台无需额外购买\n 安全保障全区域监控+24小时安保巡逻\n\n🎁五一专属加码\n5月1-5日期间入住凭房卡可免费领取儿童防晒冰袖+湿巾礼包\n\n📌Tips\n1. 周一至周四仅限双床房型周五起可选大床房\n2. 水鸟世界门票需提前1小时至前台领取纸质票\n3. 地铁四号线金洲站下车打车15分钟直达酒店\n\n这个五一南沙喜来登让你躺着遛娃不用长途跋涉家门口就能玩出仪式感"
 ]"
 }
 输出结果:
 { "analysis" : "
 1观察文案标题和内容可以看出此文案主要面向亲子出游人群因此修改后的文案也应该围绕亲子出游这一主题
-2文章标题字数为28个字超过19个字因此属于不符内容由于要求中提到尽量保留emoji并且标题中数字后面的字应删去所以修改为五一遛娃👶必囤喜来登1088景观房
+2文章标题字数为28个字超过19个字因此属于不符内容由于要求中提到尽量保留emoji并且标题中数字后面的""字应删去所以修改为五一遛娃👶必囤喜来登1088景观房
 3产品资料中未提及儿童乐园开放时间和儿童乐园配置但文案中提到儿童乐园10:00-20:00全程开放滑梯/积木/绘本一应俱全因此属于不符内容应修改为儿童乐园免费儿童乐园和丰富的游乐设施让孩子们可以尽情玩耍
 4产品材料中未提及户外泳池开放时间和消毒频次但文案中提到户外泳池9:00-18:00恒温开放五一期间每日消毒3次因此属于不符内容应修改为户外泳池酒店配有户外无边泳池供大人小孩一同享受清凉时光
 5产品材料中未提及健身房开放时间与具体细节但文案中提到健身房8:00-22:00配备亲子瑜伽课程需提前预约因此属于不符内容应修改为健身房酒店提供免费健身中心方便您和家人一起强身健体
@@ -162,6 +144,25 @@ class ContentJudger:
         except Exception as e:
             logging.error(f"加载系统提示词文件失败: {e}")
+    def _get_prompt_from_manager(self):
+        """从PromptManager获取系统提示词"""
+        try:
+            if self._prompt_manager and hasattr(self._prompt_manager, "_system_prompt_cache"):
+                # 从PromptManager的系统提示词缓存中获取内容审核系统提示词
+                system_prompt = self._prompt_manager._system_prompt_cache.get("judger_system_prompt")
+                if system_prompt:
+                    self._system_prompt = system_prompt
+                    logging.info("从PromptManager获取内容审核系统提示词成功")
+                    return True
+                else:
+                    logging.warning("PromptManager中未找到judger_system_prompt")
+            else:
+                logging.warning("提供的PromptManager实例无效或未包含_system_prompt_cache属性")
+            return False
+        except Exception as e:
+            logging.error(f"从PromptManager获取系统提示词失败: {e}")
+            return False
     def judge_content(self, product_info, content, temperature=0.2, top_p=0.5, presence_penalty=0.0):
         """
         审核内容是否符合产品资料并提供修改建议
@@ -212,30 +213,75 @@
                 logging.info("成功提取修改后的内容")
                 # 添加judge_success字段
                 modified_content["judge_success"] = True
+                # 对内容进行最终清理确保可以安全序列化为JSON
+                modified_content = self._prepare_content_for_serialization(modified_content)
+                # 记录处理后的内容用于调试
+                debug_log_file = f"{response_log_dir}/processed_{int(time.time())}.json"
+                try:
+                    serialized_content = json.dumps(modified_content, ensure_ascii=False, allow_nan=True, indent=2)
+                    with open(debug_log_file, "w", encoding="utf-8") as f:
+                        f.write(serialized_content)
+                    logging.info(f"处理后的内容已保存到: {debug_log_file}")
+                except Exception as e:
+                    logging.error(f"尝试记录处理后内容时序列化失败: {e}")
+                    with open(debug_log_file, "w", encoding="utf-8") as f:
+                        f.write(f"序列化失败: {str(e)}\n\n")
+                        f.write(f"title: {modified_content.get('title', 'N/A')}\n")
+                        f.write(f"content前100字符: {str(modified_content.get('content', 'N/A'))[:100]}")
+                # 验证序列化是否成功
+                try:
+                    json.dumps(modified_content, ensure_ascii=False, allow_nan=True)
+                    logging.info("内容可以安全序列化为JSON")
+                except Exception as e:
+                    logging.error(f"验证序列化时出错: {e}")
+                    # 找出导致错误的字段
+                    for key, value in modified_content.items():
+                        if isinstance(value, str):
+                            try:
+                                json.dumps(value, ensure_ascii=False)
+                            except Exception as sub_e:
+                                logging.error(f"字段 '{key}' 无法序列化: {sub_e}")
+                                # 尝试定位问题字符
+                                for i, char in enumerate(value):
+                                    try:
+                                        json.dumps(char, ensure_ascii=False)
+                                    except:
+                                        logging.error(f"位置 {i}, 字符 '{char}' (Unicode: U+{ord(char):04X}) 导致错误")
+                    modified_content["raw_result"] = str(e)
+                    modified_content["error"] = True
                 return modified_content
             else:
                 logging.error("无法从响应中提取有效内容")
                 # 尝试使用原始内容并标记审核失败
                 if isinstance(content, dict) and "title" in content and "content" in content:
-                    return {
+                    result_content = {
                         "title": content.get("title", "提取失败"),
                         "content": content.get("content", "无法从响应中提取有效内容"),
                         "judge_success": False
                     }
+                    # 确保可以序列化
+                    return self._prepare_content_for_serialization(result_content)
-                return {
+                result_content = {
                     "title": "提取失败",
                     "content": "无法从响应中提取有效内容",
                     "judge_success": False
                 }
+                return self._prepare_content_for_serialization(result_content)
         except Exception as e:
             logging.exception(f"审核过程中出错: {e}")
-            return {
+            result_content = {
                 "title": "审核失败",
                 "content": f"审核过程中出错: {str(e)}",
                 "judge_success": False
             }
+            return self._prepare_content_for_serialization(result_content)
     def _build_user_prompt(self, product_info, content_gen):
         """
         构建用户提示词
@@ -262,76 +308,223 @@
             # 记录原始文本前100个字符用于调试
             logging.debug(f"原始响应文本前100字符: {result_text[:100]}")
+            # 尝试方法1: 使用</think>标签分离内容
             if "</think>" in processed_text:
                 processed_text = processed_text.split("</think>", 1)[1].strip()
                 logging.debug("检测到</think>标签并分离内容")
-            # Attempt 1: Parse as JSON from the processed text
+            # 尝试方法2: 预处理文本并尝试解析JSON
+            try:
+                # 彻底清理文本去除所有可能影响JSON解析的控制字符
+                cleaned_text = self._sanitize_json_text(processed_text)
+                logging.debug(f"清理后文本前100字符: {cleaned_text[:100]}")
+                content_json = json.loads(cleaned_text)
+                if "title" in content_json and "content" in content_json:
+                    logging.info("成功通过JSON解析提取内容")
+                    title = content_json.get("title", "").strip()
+                    content = content_json.get("content", "").strip()
+                    analysis = content_json.get("analysis", "")
+                    logging.debug(f"提取到标题: {title[:30]}...")
+                    return {
+                        "title": title,
+                        "content": content,
+                        "analysis": analysis
+                    }
+            except json.JSONDecodeError as e:
+                logging.warning(f"JSON解析失败: {e},将尝试其他提取方法")
+                # 记录更多错误信息以便调试
+                error_position = e.pos
+                error_context = cleaned_text[max(0, error_position-30):min(len(cleaned_text), error_position+30)]
+                logging.debug(f"错误位置附近的文本: {error_context}")
+                logging.debug(f"错误行列: 行 {e.lineno}, 列 {e.colno}")
+            # 尝试方法3: 从文本中提取JSON格式部分
             json_start = processed_text.find('{')
             json_end = processed_text.rfind('}') + 1
             if json_start >= 0 and json_end > json_start:
                 json_str = processed_text[json_start:json_end]
                 logging.debug(f"找到JSON字符串长度: {len(json_str)}前100字符: {json_str[:100]}")
-                # Clean control characters that might break JSON parsing
-                json_str_cleaned = re.sub(r'[\x00-\x1F\x7F]', '', json_str)
+                # 清理可能破坏JSON解析的控制字符
+                json_str_cleaned = self._sanitize_json_text(json_str)
                 try:
                     content_json = json.loads(json_str_cleaned)
                     if "title" in content_json and "content" in content_json:
-                        logging.info("Successfully parsed JSON content from AI response.")
+                        logging.info("成功从文本中提取JSON部分并解析")
                         return {
-                            "title": content_json["title"].strip(),
-                            "content": content_json["content"].strip()
+                            "title": content_json.get("title", "").strip(),
+                            "content": content_json.get("content", "").strip(),
+                            "analysis": content_json.get("analysis", "")
                         }
                 except json.JSONDecodeError as e:
-                    logging.warning(f"JSON parsing failed for substring: '{json_str_cleaned[:100]}...'. Error: {e}. Will attempt regex extraction.")
+                    logging.warning(f"JSON子串解析失败: {e},将尝试正则表达式提取")
+                    # 保存导致错误的JSON字符串到文件
+                    self._save_problematic_json(json_str_cleaned, e)
-            # Attempt 2: Regex on the processed_text (which might have had </think> stripped)
-            # 修复正则表达式,移除多余的反斜杠
+            # 尝试方法4: 手动解析JSON格式的关键字段
+            try:
+                logging.debug("尝试手动解析JSON结构")
+                manual_result = self._manual_json_extract(processed_text)
+                if manual_result and "title" in manual_result and "content" in manual_result:
+                    logging.info("成功通过手动解析JSON提取内容")
+                    return manual_result
+            except Exception as e:
+                logging.warning(f"手动解析JSON失败: {e}")
+            # 尝试方法5: 使用正则表达式提取
             logging.debug("尝试使用正则表达式提取")
-            title_match = re.search(r'"title":\s*"([^"]*)"', processed_text)
-            content_match = re.search(r'"content":\s*"([^"]*)"', processed_text)
+            # 更强大的正则表达式,处理多行内容
+            title_match = re.search(r'"title"\s*:\s*"((?:[^"\\]|\\.|[\r\n])+)"', processed_text, re.DOTALL)
+            content_match = re.search(r'"content"\s*:\s*"((?:[^"\\]|\\.|[\r\n])+)"', processed_text, re.DOTALL)
+            analysis_match = re.search(r'"analysis"\s*:\s*"((?:[^"\\]|\\.|[\r\n])+)"', processed_text, re.DOTALL)
             if title_match and content_match:
-                logging.info("Successfully extracted title/content using regex.")
+                logging.info("成功使用正则表达式提取标题和内容")
                 return {
-                    "title": title_match.group(1).strip(),
-                    "content": content_match.group(1).strip()
+                    "title": title_match.group(1).replace('\\"', '"').strip(),
+                    "content": content_match.group(1).replace('\\"', '"').strip(),
+                    "analysis": analysis_match.group(1).replace('\\"', '"').strip() if analysis_match else ""
                 }
-            # Attempt 3: Try finding content with single quotes
+            # 尝试方法6: 查找使用单引号的内容
             logging.debug("尝试查找使用单引号的内容")
-            title_match = re.search(r'"title":\s*\'([^\']*)\'', processed_text)
-            content_match = re.search(r'"content":\s*\'([^\']*)\'', processed_text)
+            title_match = re.search(r'"title"\s*:\s*\'((?:[^\'\\]|\\.|[\r\n])+)\'', processed_text, re.DOTALL)
+            content_match = re.search(r'"content"\s*:\s*\'((?:[^\'\\]|\\.|[\r\n])+)\'', processed_text, re.DOTALL)
+            analysis_match = re.search(r'"analysis"\s*:\s*\'((?:[^\'\\]|\\.|[\r\n])+)\'', processed_text, re.DOTALL)
             if title_match and content_match:
-                logging.info("Successfully extracted title/content using single-quote regex.")
+                logging.info("成功使用单引号正则表达式提取内容")
                 return {
                     "title": title_match.group(1).strip(),
-                    "content": content_match.group(1).strip()
+                    "content": content_match.group(1).strip(),
+                    "analysis": analysis_match.group(1).strip() if analysis_match else ""
                 }
-            # Final attempt: Look for key-value pairs without standard JSON formatting
+            # 尝试方法7: 使用非标准格式提取
             logging.debug("尝试非标准格式提取")
-            title_pattern = re.compile(r'["""]?title["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE)
-            content_pattern = re.compile(r'["""]?content["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE)
+            title_pattern = re.compile(r'["""]?title["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE | re.DOTALL)
+            content_pattern = re.compile(r'["""]?content["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE | re.DOTALL)
+            analysis_pattern = re.compile(r'["""]?analysis["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE | re.DOTALL)
             title_match = title_pattern.search(processed_text)
             content_match = content_pattern.search(processed_text)
+            analysis_match = analysis_pattern.search(processed_text)
             if title_match and content_match:
-                logging.info("提取到标题和内容(使用灵活模式匹配)")
+                logging.info("成功使用灵活模式匹配提取内容")
                 return {
                     "title": title_match.group(1).strip(),
-                    "content": content_match.group(1).strip()
+                    "content": content_match.group(1).strip(),
+                    "analysis": analysis_match.group(1).strip() if analysis_match else ""
                 }
             logging.warning(f"所有提取方法失败响应前300字符: {processed_text[:300]}...")
-            return None # Fallback if all extraction methods fail
+            return None # 所有方法失败时的回退选项
         except Exception as e:
-            logging.error(f"Unexpected error during content extraction: {e}\n{traceback.format_exc()}")
+            logging.error(f"内容提取过程中发生意外错误: {e}\n{traceback.format_exc()}")
             return None
+    def _sanitize_json_text(self, text):
+        """彻底清理文本确保可以安全解析为JSON"""
+        # 步骤1: 处理控制字符
+        cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
+        # 步骤2: 特殊处理换行符,将实际换行转换为\n字符串
+        cleaned = cleaned.replace('\n', '\\n').replace('\r', '\\r')
+        # 步骤3: 处理内容字段中开始或结束可能存在的多余空格或引号
+        cleaned = re.sub(r'"content"\s*:\s*"\s*', '"content":"', cleaned)
+        cleaned = re.sub(r'"\s*,', '",', cleaned)
+        # 步骤4: 处理未转义的引号和反斜杠
+        cleaned = re.sub(r'(?<!\\)"(?=(?:(?:[^"\\]|\\.)*"(?:[^"\\]|\\.)*")*[^"\\]*$)', '\\"', cleaned)
+        # 步骤5: 处理可能的Unicode转义
+        cleaned = re.sub(r'\\u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), cleaned)
+        return cleaned
+    def _manual_json_extract(self, text):
+        """手动解析JSON结构提取关键字段"""
+        try:
+            # 使用状态机方式手动解析
+            result = {}
+            # 查找title字段
+            title_start = text.find('"title"')
+            if title_start >= 0:
+                colon_pos = text.find(':', title_start)
+                if colon_pos > 0:
+                    quote_pos = text.find('"', colon_pos)
+                    if quote_pos > 0:
+                        end_quote_pos = text.find('"', quote_pos + 1)
+                        while end_quote_pos > 0 and text[end_quote_pos-1] == '\\':
+                            end_quote_pos = text.find('"', end_quote_pos + 1)
+                        if end_quote_pos > 0:
+                            result['title'] = text[quote_pos+1:end_quote_pos].replace('\\"', '"').strip()
+            # 查找content字段
+            content_start = text.find('"content"')
+            if content_start >= 0:
+                colon_pos = text.find(':', content_start)
+                if colon_pos > 0:
+                    quote_pos = text.find('"', colon_pos)
+                    if quote_pos > 0:
+                        # 查找非转义双引号
+                        pos = quote_pos + 1
+                        content_end = -1
+                        while pos < len(text):
+                            if text[pos] == '"' and (pos == 0 or text[pos-1] != '\\'):
+                                content_end = pos
+                                break
+                            pos += 1
+                        if content_end > 0:
+                            content = text[quote_pos+1:content_end].replace('\\"', '"')
+                            # 特殊处理换行符
+                            content = content.replace('\\n', '\n').replace('\\r', '\r')
+                            result['content'] = content.strip()
+            # 查找analysis字段
+            analysis_start = text.find('"analysis"')
+            if analysis_start >= 0:
+                colon_pos = text.find(':', analysis_start)
+                if colon_pos > 0:
+                    quote_pos = text.find('"', colon_pos)
+                    if quote_pos > 0:
+                        pos = quote_pos + 1
+                        analysis_end = -1
+                        while pos < len(text):
+                            if text[pos] == '"' and (pos == 0 or text[pos-1] != '\\'):
+                                analysis_end = pos
+                                break
+                            pos += 1
+                        if analysis_end > 0:
+                            analysis = text[quote_pos+1:analysis_end].replace('\\"', '"')
+                            result['analysis'] = analysis.strip()
+            return result if 'title' in result and 'content' in result else None
+        except Exception as e:
+            logging.error(f"手动解析过程中出错: {e}")
+            return None
+    def _save_problematic_json(self, json_text, error):
+        """保存导致解析错误的JSON字符串用于调试"""
+        try:
+            error_log_dir = "/root/autodl-tmp/TravelContentCreator/log/json_errors"
+            os.makedirs(error_log_dir, exist_ok=True)
+            error_log_file = f"{error_log_dir}/error_{int(time.time())}.json"
+            with open(error_log_file, "w", encoding="utf-8") as f:
+                f.write(f"# 错误信息: {str(error)}\n")
+                f.write(f"# 错误位置: 行 {error.lineno}, 列 {error.colno}\n")
+                f.write(json_text)
+            logging.info(f"已保存问题JSON到: {error_log_file}")
+        except Exception as e:
+            logging.error(f"保存问题JSON时出错: {e}")
     def test_extraction_from_file(self, response_file_path):
         """
@@ -358,4 +551,87 @@
         except Exception as e:
             logging.exception(f"测试提取时发生错误: {e}")
             return {"success": False, "error": str(e)}
+    def _prepare_content_for_serialization(self, content_dict):
+        """
+        对内容进行处理确保可以安全序列化为JSON同时保留emoji字符
+        Args:
+            content_dict: 内容字典
+        Returns:
+            dict: 处理后的内容字典
+        """
+        try:
+            # 创建一个新字典,避免修改原始内容
+            safe_dict = {}
+            for key, value in content_dict.items():
+                # 处理字符串类型的值
+                if isinstance(value, str):
+                    # 第一步:彻底清理所有控制字符
+                    safe_value = re.sub(r'[\x00-\x1F\x7F]', '', value)
+                    # 第二步将emoji字符转换为相应的Unicode转义序列
+                    # 这样能确保JSON序列化安全同时保留emoji语义
+                    char_list = []
+                    for char in safe_value:
+                        if ord(char) > 127: # 非ASCII字符
+                            # 尝试保留高位字符包括emoji
+                            try:
+                                # 验证这个字符是否可以安全序列化
+                                json.dumps(char, ensure_ascii=False)
+                                char_list.append(char)
+                            except:
+                                # 如果这个字符无法序列化使用其Unicode码点的字符串表示
+                                char_list.append(f"\\u{ord(char):04x}")
+                        else:
+                            char_list.append(char)
+                    processed_value = ''.join(char_list)
+                    # 对于内容字段,特别注意保存换行符
+                    if key == "content" and '\\n' in processed_value:
+                        processed_value = processed_value.replace('\\n', '\n')
+                    # 最终验证这个值是否可以安全序列化
+                    try:
+                        json.dumps(processed_value, ensure_ascii=False)
+                        safe_dict[key] = processed_value
+                    except Exception as e:
+                        logging.warning(f"处理后的'{key}'值仍无法序列化: {e},将进行更严格处理")
+                        # 更严格的处理只保留ASCII字符
+                        safe_dict[key] = ''.join(c for c in processed_value if ord(c) < 128)
+                else:
+                    safe_dict[key] = value
+            # 最终验证整个字典是否可序列化
+            try:
+                # 使用ensure_ascii=False允许非ASCII字符直接出现在JSON中
+                # 使用allow_nan=True允许特殊浮点数值
+                json_str = json.dumps(safe_dict, ensure_ascii=False, allow_nan=True)
+                # 验证生成的JSON是否有效
+                json.loads(json_str)
+            except Exception as e:
+                logging.error(f"最终字典序列化验证失败: {e}")
+                # 如果依然失败,返回一个绝对安全的结果
+                return {
+                    "title": re.sub(r'[^\x20-\x7E]', '', content_dict.get("title", "序列化处理失败")),
+                    "content": re.sub(r'[^\x20-\x7E]', '', "内容包含无法安全序列化的字符已移除所有非ASCII字符"),
+                    "judge_success": content_dict.get("judge_success", False),
+                    "error": True,
+                    "raw_result": str(e)
+                }
+            return safe_dict
+        except Exception as e:
+            logging.error(f"处理内容以确保安全序列化时出错: {e}")
+            # 如果处理失败,返回一个基本的安全字典
+            return {
+                "title": "序列化处理失败",
+                "content": "内容包含无法安全序列化的字符",
+                "judge_success": False,
+                "error": True,
+                "raw_result": str(e)
+            }
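The extraction cascade above keeps circling one failure mode: model output with raw control characters (usually newlines) inside JSON string values, which a strict parser rejects. A minimal sketch of that failure and of the escape-before-parse step `_sanitize_json_text` performs; the input string is a made-up example:

```python
import simplejson as json

# A raw newline inside a JSON string value is an invalid control
# character to a strict parser:
raw = '{"title": "标题", "content": "第一行\n第二行"}'
try:
    json.loads(raw)
except json.JSONDecodeError as e:
    print("strict parse fails:", e)

# Escaping the real newline into the two-character sequence \n first
# (as the sanitizer does) makes the text parseable, and the decoded
# value gets its real newline back:
fixed = raw.replace('\n', '\\n')
print(json.loads(fixed)["content"])
```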

View File

@@ -1,5 +1,5 @@
 import os
-import json
+import simplejson as json
 import logging
 from abc import ABC, abstractmethod
 import traceback
@@ -70,7 +70,7 @@ class FileSystemOutputHandler(OutputHandler):
         topics_path = os.path.join(run_dir, f"tweet_topic_{run_id}.json")
         try:
             with open(topics_path, "w", encoding="utf-8") as f:
-                json.dump(topics_list, f, ensure_ascii=False, indent=4)
+                json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"Topics list saved successfully to: {topics_path}")
         except Exception as e:
             logging.exception(f"Error saving topic JSON file to {topics_path}:")
@@ -115,14 +115,34 @@ class FileSystemOutputHandler(OutputHandler):
         if "tags" in output_data and "original_tags" not in output_data:
             output_data["original_tags"] = output_data["tags"]
+        # 对内容进行深度清理,确保安全序列化
+        try:
+            output_data = self._sanitize_content_for_json(output_data)
+            logging.info("内容已经过安全清理,可以序列化")
+        except Exception as e:
+            logging.error(f"内容清理过程中出错: {e}")
         # 保存统一格式的article.json
         content_path = os.path.join(variant_dir, "article.json")
         try:
             with open(content_path, "w", encoding="utf-8") as f:
-                json.dump(output_data, f, ensure_ascii=False, indent=4)
+                json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"Content JSON saved to: {content_path}")
         except Exception as e:
             logging.exception(f"Failed to save content JSON to {content_path}: {e}")
+            # 如果序列化失败,记录原始内容用于调试
+            debug_path = os.path.join(variant_dir, "debug_content.txt")
+            try:
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    for key, value in output_data.items():
+                        if isinstance(value, str):
+                            f.write(f"{key}: (length: {len(value)})\n")
+                            f.write(f"{repr(value[:200])}...\n\n")
+                        else:
+                            f.write(f"{key}: {type(value)}\n")
+                logging.info(f"Debug content saved to: {debug_path}")
+            except Exception as debug_err:
+                logging.error(f"Failed to save debug content: {debug_err}")
         # Save content prompt
         prompt_path = os.path.join(variant_dir, "tweet_prompt.txt")
@@ -140,7 +160,7 @@ class FileSystemOutputHandler(OutputHandler):
         config_path = os.path.join(run_dir, f"topic_{topic_index}_poster_configs.json")
         try:
             with open(config_path, 'w', encoding='utf-8') as f_cfg_topic:
-                json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4)
+                json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"Saved complete poster configurations for topic {topic_index} to: {config_path}")
         except Exception as save_err:
             logging.error(f"Failed to save complete poster configurations for topic {topic_index} to {config_path}: {save_err}")
@@ -196,7 +216,7 @@ class FileSystemOutputHandler(OutputHandler):
         metadata_path = os.path.join(os.path.dirname(save_path), metadata_filename)
         try:
             with open(metadata_path, 'w', encoding='utf-8') as f:
-                json.dump(metadata, f, ensure_ascii=False, indent=4)
+                json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"保存{image_type}元数据到: {metadata_path}")
         except Exception as me:
             logging.error(f"无法保存{image_type}元数据到{metadata_path}: {me}")
@@ -210,4 +230,48 @@ class FileSystemOutputHandler(OutputHandler):
     def finalize(self, run_id: str):
         logging.info(f"FileSystemOutputHandler finalizing run: {run_id}. No specific actions needed.")
         pass # Nothing specific to do for file system finalize
+    def _sanitize_content_for_json(self, data):
+        """对内容进行深度清理确保可以安全序列化为JSON
+        Args:
+            data: 要处理的数据字典列表或基本类型
+        Returns:
+            经过处理的数据确保可以安全序列化
+        """
+        if isinstance(data, dict):
+            # 处理字典类型
+            sanitized_dict = {}
+            for key, value in data.items():
+                sanitized_dict[key] = self._sanitize_content_for_json(value)
+            return sanitized_dict
+        elif isinstance(data, list):
+            # 处理列表类型
+            return [self._sanitize_content_for_json(item) for item in data]
+        elif isinstance(data, str):
+            # 处理字符串类型(重点关注)
+            # 1. 首先,替换所有字面的"\n"为真正的换行符
+            if r'\n' in data:
+                data = data.replace(r'\n', '\n')
+            # 2. 移除所有控制字符ASCII 0-31除了\n, \r, \t
+            cleaned = ''
+            for char in data:
+                # 允许常见的空白字符
+                if char in '\n\r\t' or ord(char) >= 32:
+                    cleaned += char
+            # 3. 验证字符串可以被安全序列化
+            try:
+                json.dumps(cleaned, ensure_ascii=False)
+                return cleaned
+            except Exception as e:
+                logging.warning(f"字符串清理后仍无法序列化,尝试更严格的清理: {e}")
+                # 如果仍然无法序列化,使用更严格的清理
+                return ''.join(c for c in cleaned if ord(c) < 65536 and (c in '\n\r\t' or ord(c) >= 32))
+        else:
+            # 其他类型(数字、布尔值等)原样返回
+            return data
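One subtlety `_sanitize_content_for_json` depends on: the literal two-character sequence `\n` (backslash plus `n`, as models sometimes echo escape sequences verbatim) is distinct from the one-character newline, and step 1 converts the former into the latter before dumping. A toy round-trip check, with made-up sample text:

```python
import simplejson as json

s = "第一行\\n第二行"             # literal backslash + n, not a newline
print(r'\n' in s)                 # True: the raw two-character sequence
restored = s.replace(r'\n', '\n')
print('\n' in restored)           # True: now a real newline character

# Dumping re-escapes the newline in the JSON text; loading restores it.
dumped = json.dumps({"content": restored}, ensure_ascii=False)
print(dumped)                                      # {"content": "第一行\n第二行"}
print(json.loads(dumped)["content"] == restored)   # True
```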

View File

@@ -67,18 +67,6 @@ class tweetContent:
             self.json_data = {"title": "", "content": "", "tag": "", "error": True, "raw_result": e} # 不再包含raw_result
     def split_content(self, result):
-        # Assuming split logic might still fail, keep it simple or improve with regex/json
-        # We should ideally switch content generation to JSON output as well.
-        # For now, keep existing logic but handle errors in __init__.
-        # Optional: Add basic check before splitting
-        # if not result or "</think>" not in result or "title>" not in result or "content>" not in result:
-        #     logging.warning(f"AI result format unexpected: {result[:200]}...")
-        #     # 返回空字符串而不是抛出异常,这样可以在主函数继续处理
-        #     return "", ""
-        # --- Existing Logic (prone to errors) ---
         try:
             processed_result = result
             if "</think>" in result:
@@ -88,11 +76,8 @@ class tweetContent:
                 json_data = json.loads(processed_result)
                 json_data["error"] = False
                 json_data["raw_result"] = None
-                # 确保judge_success字段存在
-                if "judge_success" not in json_data:
-                    json_data["judge_success"] = None
+                json_data["judge_success"] = None
                 return json_data
-            # --- End Existing Logic ---
         except Exception as e:
             logging.warning(f"解析内容时出错: {e}, 使用默认空内容")
@@ -510,7 +495,7 @@ def generate_content_for_topic(ai_agent: AI_Agent,
         # 准备审核内容
         content_to_judge = f"""title: {content_json.get('title', '')}
 content: {content_json.get('content', '')}
 """
         # 调用ContentJudger进行审核