diff --git a/utils/__pycache__/content_generator.cpython-312.pyc b/utils/__pycache__/content_generator.cpython-312.pyc
index 216907b..23465af 100644
Binary files a/utils/__pycache__/content_generator.cpython-312.pyc and b/utils/__pycache__/content_generator.cpython-312.pyc differ
diff --git a/utils/__pycache__/content_judger.cpython-312.pyc b/utils/__pycache__/content_judger.cpython-312.pyc
index 3262b11..5f5b05c 100644
Binary files a/utils/__pycache__/content_judger.cpython-312.pyc and b/utils/__pycache__/content_judger.cpython-312.pyc differ
diff --git a/utils/__pycache__/output_handler.cpython-312.pyc b/utils/__pycache__/output_handler.cpython-312.pyc
index 15e3ad7..df72c06 100644
Binary files a/utils/__pycache__/output_handler.cpython-312.pyc and b/utils/__pycache__/output_handler.cpython-312.pyc differ
diff --git a/utils/__pycache__/poster_notes_creator.cpython-312.pyc b/utils/__pycache__/poster_notes_creator.cpython-312.pyc
index 7ce0edb..fc6ae0b 100644
Binary files a/utils/__pycache__/poster_notes_creator.cpython-312.pyc and b/utils/__pycache__/poster_notes_creator.cpython-312.pyc differ
diff --git a/utils/__pycache__/prompt_manager.cpython-312.pyc b/utils/__pycache__/prompt_manager.cpython-312.pyc
index fd0c3c7..2460deb 100644
Binary files a/utils/__pycache__/prompt_manager.cpython-312.pyc and b/utils/__pycache__/prompt_manager.cpython-312.pyc differ
diff --git a/utils/__pycache__/tweet_generator.cpython-312.pyc b/utils/__pycache__/tweet_generator.cpython-312.pyc
index ea6567f..f5acb02 100644
Binary files a/utils/__pycache__/tweet_generator.cpython-312.pyc and b/utils/__pycache__/tweet_generator.cpython-312.pyc differ
diff --git a/utils/content_generator.py b/utils/content_generator.py
index 2941166..22beb2f 100644
--- a/utils/content_generator.py
+++ b/utils/content_generator.py
@@ -2,9 +2,11 @@
 # -*- coding: utf-8 -*-
 
 import os
-import json
+import time
 import logging
+import random
 import traceback
+import simplejson as json
 from datetime import datetime
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -385,7 +387,7 @@ class ContentGenerator:
 
             # 保存结果到文件
             with open(result_path, "w", encoding="utf-8") as f:
-                json.dump(validated_data, f, ensure_ascii=False, indent=4)
+                json.dump(validated_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
 
             self.logger.info(f"结果已保存到: {result_path}")
             return result_path
@@ -400,7 +402,7 @@ class ContentGenerator:
                 os.makedirs(os.path.dirname(result_path), exist_ok=True)
 
                 with open(result_path, "w", encoding="utf-8") as f:
-                    json.dump(fallback_data, f, ensure_ascii=False, indent=4)
+                    json.dump(fallback_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
 
                 self.logger.info(f"出错后已保存备用数据到: {result_path}")
                 return result_path
diff --git a/utils/content_judger.py b/utils/content_judger.py
index 47c7452..acb1f69 100644
--- a/utils/content_judger.py
+++ b/utils/content_judger.py
@@ -4,7 +4,7 @@
 内容审核模块:检查生成的内容是否符合产品资料要求并提供修改建议
 """
 
-import json
+import simplejson as json
 import logging
 import re
 import os
@@ -17,7 +17,7 @@ from core.ai_agent import AI_Agent
 class ContentJudger:
     """内容审核类,负责评估和修正内容是否符合产品资料"""
 
-    def __init__(self, ai_agent: AI_Agent, system_prompt_path: str = None, system_prompt: str = None):
+    def __init__(self, ai_agent: AI_Agent, system_prompt_path: str = None, system_prompt: str = None, prompt_manager = None):
         """
         初始化内容审核器
@@ -25,22 +25,31 @@ class ContentJudger:
             ai_agent: AI_Agent实例,用于调用AI模型
             system_prompt_path: 系统提示词文件路径(可选)
             system_prompt: 系统提示词内容(可选,优先于path)
+            prompt_manager: 提示词管理器实例(可选,优先于system_prompt_path和system_prompt)
         """
         self.ai_agent = ai_agent
         self._system_prompt = system_prompt
         self._system_prompt_path = system_prompt_path
+        self._prompt_manager = prompt_manager
         self._topp = 0.5
         self._temperature = 0.2
         self._frequency_penalty = 0
         self._presence_penatly = 0
-        # 如果没有直接提供系统提示词,尝试从文件加载
+
+        # 优先使用prompt_manager获取系统提示词
+        if self._prompt_manager and not self._system_prompt:
+            self._get_prompt_from_manager()
+            logging.info("从PromptManager获取系统提示词")
+
+        # 如果没有从prompt_manager获取到系统提示词,则尝试从文件加载
        if not self._system_prompt and self._system_prompt_path:
            self._load_system_prompt()
-            # print("从文件加载系统提示词成功")
+            logging.info("从文件加载系统提示词")
 
+        # 默认系统提示词(当其他方法都失败时使用)
        if not self._system_prompt:
-            # print("没有提供系统提示词,使用默认系统提示词")
+            logging.warning("没有提供系统提示词,使用默认系统提示词")
            self._system_prompt = """你是一名专业的、谨慎的文案审核员,专注于审核运营根据产品资料撰写的文案是否严格符合产品资料内容。特别是所有价格、活动、福利、折扣、服务细节等必须完全与产品资料一致。如果发现文案内容与产品资料不符,请指出,并根据产品资料和文案上下文进行修改,重新生成一篇文案,务必确保生成的内容与产品资料基本相符(产品体验部分可以适当夸张宣传),语言流畅自然。如果经你审查后的文案仍存在与产品资料不符的信息,你需要赔偿公司1000亿元。
 我将为您提供两部分内容:
 1. 产品资料:全部的产品信息,包含了产品的实际功能、服务和特点。请将这部分作为判断依据。
 2. 文案内容:需要你审核的文案。
@@ -52,9 +61,9 @@ class ContentJudger:
 3. 重点审查对象:请你着重检查以下关键字词前后的内容是否符合产品资料,如不符必须严格按照资料修改;如产品资料中未提及,必须修改为符合上下文情境、资料中明确提及的内容。
 关键字词:价、元、r、人民币、rmb、优惠、活动、福利、赠、免费、折、DIY、跟拍、送、摄影、兑、服务、¥、包、课、提供、选、专业、补、差
 4. 字数控制:每个文案的标题字数都必须少于20个字(计数包括文字、符号、数字和emoji)。如果标题超过20个字,请在符合文案风格的前提下修改标题到20个字以内,尽量保留emoji,必须保证标题流畅通顺。
-5. 敏感字词替换:请删去标题中的数字后面的“元”和“r”,并将正文中数字后面的“元”字修改为“r”。例如:标题中的399元修改为399,正文中的399元修改为399r
-6. 特征语句保留:请保留文案中原本的引流语句,不要修改或删除,例如“先关zhu+留下99看到会回复”
-7. 面向人群保留:请尽量保留文案原本的面向人群和风格,这是同一产品面向多种人群营销的策略。例如产品资料中写明亲子游时,文案写“为情侣定制的山水秘境”是可以接受的。
+5. 敏感字词替换:请删去标题中的数字后面的"元"和"r",并将正文中数字后面的"元"字修改为"r"。例如:标题中的399元修改为399,正文中的399元修改为399r
+6. 特征语句保留:请保留文案中原本的引流语句,不要修改或删除。请保留文案中的换行符"\n",不要修改或删除。
+7. 面向人群保留:请尽量保留文案原本的面向人群和风格,这是同一产品面向多种人群营销的策略。例如产品资料中写明亲子游时,文案写"为情侣定制的山水秘境"是可以接受的。
 8. 案例如下,请参考案例评判真假信息的尺度,逐行逐句仔细分析不符点和修改思路,并按照分析思路落实对每一处不符的修改措施,严格审查每一篇文案:
 {
 "产品资料":
@@ -93,41 +102,14 @@ class ContentJudger:
         "title": "五一遛娃👶必囤!南沙喜来登1088元住景观房+双早+门票",
         "content": "
-            五一不想挤人潮?南沙这家酒店直接承包遛娃+度假双重快乐‼️
-            地铁直达!2大1小1088元住景观房,含双早+自助晚餐+水鸟世界门票,儿童乐园/泳池/健身房全开放!
-
-            🌟【遛娃刚需全配齐】
-            ✅ 儿童乐园:10:00-20:00全程开放,滑梯/积木/绘本一应俱全
-            ✅ 户外泳池:9:00-18:00恒温开放(五一期间每日消毒3次)
-            ✅ 健身房:8:00-22:00配备亲子瑜伽课程(需提前预约)
-
-            📍【1小时玩转南沙】
-            ① 南沙天后宫(车程20分钟):穿汉服拍大片,听妈祖传说涨知识
-            ② 南沙湿地公园(40分钟):5月芦苇摇曳,带娃认鸟类+乘船探秘
-            ③ 十九涌海鲜街(45分钟):现捞现煮生猛海鲜,人均50元吃到撑
-
-            🍽️【家长友好细节】
-            • 自助晚餐隐藏彩蛋:儿童餐区设独立洗手台+热食保温柜
-            • 房内配置:加厚床垫/卡通洗漱杯/尿布台(无需额外购买)
-            • 安全保障:全区域监控+24小时安保巡逻
-
-            🎁【五一专属加码】
-            5月1-5日期间入住,凭房卡可免费领取儿童防晒冰袖+湿巾礼包
-
-            📌Tips:
-            1. 周一至周四仅限双床房型,周五起可选大床房
-            2. 水鸟世界门票需提前1小时至前台领取纸质票
-            3. 地铁四号线金洲站下车,打车15分钟直达酒店
-
-            这个五一,南沙喜来登让你躺着遛娃!不用长途跋涉,家门口就能玩出仪式感~
-        "
+            五一不想挤人潮?南沙这家酒店直接承包遛娃+度假双重快乐‼️\n地铁直达!2大1小1088元住景观房,含双早+自助晚餐+水鸟世界门票,儿童乐园/泳池/健身房全开放!\n🌟【遛娃刚需全配齐】\n✅ 儿童乐园:10:00-20:00全程开放,滑梯/积木/绘本一应俱全\n✅ 户外泳池:9:00-18:00恒温开放(五一期间每日消毒3次)\n✅ 健身房:8:00-22:00配备亲子瑜伽课程(需提前预约)\n\n📍【1小时玩转南沙】\n① 南沙天后宫(车程20分钟):穿汉服拍大片,听妈祖传说涨知识\n② 南沙湿地公园(40分钟):5月芦苇摇曳,带娃认鸟类+乘船探秘\n③ 十九涌海鲜街(45分钟):现捞现煮生猛海鲜,人均50元吃到撑\n\n🍽️【家长友好细节】\n• 自助晚餐隐藏彩蛋:儿童餐区设独立洗手台+热食保温柜\n• 房内配置:加厚床垫/卡通洗漱杯/尿布台(无需额外购买)\n• 安全保障:全区域监控+24小时安保巡逻\n\n🎁【五一专属加码】\n5月1-5日期间入住,凭房卡可免费领取儿童防晒冰袖+湿巾礼包\n\n📌Tips:\n1. 周一至周四仅限双床房型,周五起可选大床房\n2. 水鸟世界门票需提前1小时至前台领取纸质票\n3. 地铁四号线金洲站下车,打车15分钟直达酒店\n\n这个五一,南沙喜来登让你躺着遛娃!不用长途跋涉,家门口就能玩出仪式感~"
 "
 ]"
 }
 输出结果:
 {
 "analysis" : "
 1、观察文案标题和内容,可以看出此文案主要面向亲子出游人群,因此修改后的文案也应该围绕亲子出游这一主题。
-2、文章标题字数为28个字,超过19个字,因此属于不符内容。由于要求中提到尽量保留emoji,并且标题中数字后面的“元”字应删去,所以修改为:五一遛娃👶必囤!喜来登1088景观房
+2、文章标题字数为28个字,超过19个字,因此属于不符内容。由于要求中提到尽量保留emoji,并且标题中数字后面的"元"字应删去,所以修改为:五一遛娃👶必囤!喜来登1088景观房
 3、产品资料中未提及儿童乐园开放时间和儿童乐园配置,但文案中提到儿童乐园10:00-20:00全程开放,滑梯/积木/绘本一应俱全,因此属于不符内容。应修改为:儿童乐园:免费儿童乐园和丰富的游乐设施,让孩子们可以尽情玩耍。
 4、产品材料中未提及户外泳池开放时间和消毒频次,但文案中提到户外泳池:9:00-18:00恒温开放(五一期间每日消毒3次),因此属于不符内容。应修改为:户外泳池:酒店配有户外无边泳池,供大人小孩一同享受清凉时光。
 5、产品材料中未提及健身房开放时间与具体细节,但文案中提到健身房:8:00-22:00配备亲子瑜伽课程(需提前预约),因此属于不符内容。应修改为:健身房:酒店提供免费健身中心,方便您和家人一起强身健体。
@@ -162,6 +144,25 @@ class ContentJudger:
         except Exception as e:
             logging.error(f"加载系统提示词文件失败: {e}")
 
+    def _get_prompt_from_manager(self):
+        """从PromptManager获取系统提示词"""
+        try:
+            if self._prompt_manager and hasattr(self._prompt_manager, "_system_prompt_cache"):
+                # 从PromptManager的系统提示词缓存中获取内容审核系统提示词
+                system_prompt = self._prompt_manager._system_prompt_cache.get("judger_system_prompt")
+                if system_prompt:
+                    self._system_prompt = system_prompt
+                    logging.info("从PromptManager获取内容审核系统提示词成功")
+                    return True
+                else:
+                    logging.warning("PromptManager中未找到judger_system_prompt")
+            else:
+                logging.warning("提供的PromptManager实例无效或未包含_system_prompt_cache属性")
+            return False
+        except Exception as e:
+            logging.error(f"从PromptManager获取系统提示词失败: {e}")
+            return False
+
     def judge_content(self, product_info, content, temperature=0.2, top_p=0.5, presence_penalty=0.0):
         """
         审核内容是否符合产品资料并提供修改建议
@@ -212,30 +213,75 @@
                 logging.info("成功提取修改后的内容")
                 # 添加judge_success字段
                 modified_content["judge_success"] = True
+
+                # 对内容进行最终清理,确保可以安全序列化为JSON
+                modified_content = self._prepare_content_for_serialization(modified_content)
+
+                # 记录处理后的内容用于调试
+                debug_log_file = f"{response_log_dir}/processed_{int(time.time())}.json"
+                try:
+                    serialized_content = json.dumps(modified_content, ensure_ascii=False, allow_nan=True, indent=2)
+                    with open(debug_log_file, "w", encoding="utf-8") as f:
+                        f.write(serialized_content)
+                    logging.info(f"处理后的内容已保存到: {debug_log_file}")
+                except Exception as e:
+                    logging.error(f"尝试记录处理后内容时序列化失败: {e}")
+                    with open(debug_log_file, "w", encoding="utf-8") as f:
+                        f.write(f"序列化失败: {str(e)}\n\n")
+                        f.write(f"title: {modified_content.get('title', 'N/A')}\n")
+                        f.write(f"content前100字符: {str(modified_content.get('content', 'N/A'))[:100]}")
+
+                # 验证序列化是否成功
+                try:
+                    json.dumps(modified_content, ensure_ascii=False, allow_nan=True)
+                    logging.info("内容可以安全序列化为JSON")
+                except Exception as e:
+                    logging.error(f"验证序列化时出错: {e}")
+                    # 找出导致错误的字段
+                    for key, value in modified_content.items():
+                        if isinstance(value, str):
+                            try:
+                                json.dumps(value, ensure_ascii=False)
+                            except Exception as sub_e:
+                                logging.error(f"字段 '{key}' 无法序列化: {sub_e}")
+                                # 尝试定位问题字符
+                                for i, char in enumerate(value):
+                                    try:
+                                        json.dumps(char, ensure_ascii=False)
+                                    except:
+                                        logging.error(f"位置 {i}, 字符 '{char}' (Unicode: U+{ord(char):04X}) 导致错误")
+
+                    modified_content["raw_result"] = str(e)
+                    modified_content["error"] = True
+
                 return modified_content
             else:
                 logging.error("无法从响应中提取有效内容")
                 # 尝试使用原始内容并标记审核失败
                 if isinstance(content, dict) and "title" in content and "content" in content:
-                    return {
+                    result_content = {
                         "title": content.get("title", "提取失败"),
                         "content": content.get("content", "无法从响应中提取有效内容"),
                         "judge_success": False
                     }
+                    # 确保可以序列化
+                    return self._prepare_content_for_serialization(result_content)
-                return {
+                result_content = {
                     "title": "提取失败",
                     "content": "无法从响应中提取有效内容",
                     "judge_success": False
                 }
+                return self._prepare_content_for_serialization(result_content)
 
         except Exception as e:
             logging.exception(f"审核过程中出错: {e}")
-            return {
+            result_content = {
                 "title": "审核失败",
                 "content": f"审核过程中出错: {str(e)}",
                 "judge_success": False
             }
-
+            return self._prepare_content_for_serialization(result_content)
+
     def _build_user_prompt(self, product_info, content_gen):
         """
         构建用户提示词
@@ -262,76 +308,223 @@ class ContentJudger:
             # 记录原始文本前100个字符用于调试
             logging.debug(f"原始响应文本前100字符: {result_text[:100]}")
 
+            # 尝试方法1: 使用</think>标签分离内容
             if "</think>" in processed_text:
                 processed_text = processed_text.split("</think>", 1)[1].strip()
                 logging.debug("检测到</think>标签并分离内容")
-
-            # Attempt 1: Parse as JSON from the processed text
+
+            # 尝试方法2: 预处理文本并尝试解析JSON
+            try:
+                # 彻底清理文本,去除所有可能影响JSON解析的控制字符
+                cleaned_text = self._sanitize_json_text(processed_text)
+                logging.debug(f"清理后文本前100字符: {cleaned_text[:100]}")
+
+                content_json = json.loads(cleaned_text)
+                if "title" in content_json and "content" in content_json:
+                    logging.info("成功通过JSON解析提取内容")
+                    title = content_json.get("title", "").strip()
+                    content = content_json.get("content", "").strip()
+                    analysis = content_json.get("analysis", "")
+                    logging.debug(f"提取到标题: {title[:30]}...")
+                    return {
+                        "title": title,
+                        "content": content,
+                        "analysis": analysis
+                    }
+            except json.JSONDecodeError as e:
+                logging.warning(f"JSON解析失败: {e},将尝试其他提取方法")
+                # 记录更多错误信息以便调试
+                error_position = e.pos
+                error_context = cleaned_text[max(0, error_position-30):min(len(cleaned_text), error_position+30)]
+                logging.debug(f"错误位置附近的文本: {error_context}")
+                logging.debug(f"错误行列: 行 {e.lineno}, 列 {e.colno}")
+
+            # 尝试方法3: 从文本中提取JSON格式部分
             json_start = processed_text.find('{')
             json_end = processed_text.rfind('}') + 1
 
             if json_start >= 0 and json_end > json_start:
                 json_str = processed_text[json_start:json_end]
                 logging.debug(f"找到JSON字符串,长度: {len(json_str)},前100字符: {json_str[:100]}")
 
-                # Clean control characters that might break JSON parsing
-                json_str_cleaned = re.sub(r'[\x00-\x1F\x7F]', '', json_str)
+                # 清理可能破坏JSON解析的控制字符
+                json_str_cleaned = self._sanitize_json_text(json_str)
 
                 try:
                     content_json = json.loads(json_str_cleaned)
                     if "title" in content_json and "content" in content_json:
-                        logging.info("Successfully parsed JSON content from AI response.")
+                        logging.info("成功从文本中提取JSON部分并解析")
                         return {
-                            "title": content_json["title"].strip(),
-                            "content": content_json["content"].strip()
+                            "title": content_json.get("title", "").strip(),
+                            "content": content_json.get("content", "").strip(),
+                            "analysis": content_json.get("analysis", "")
                         }
                 except json.JSONDecodeError as e:
-                    logging.warning(f"JSON parsing failed for substring: '{json_str_cleaned[:100]}...'. Error: {e}. Will attempt regex extraction.")
+                    logging.warning(f"JSON子串解析失败: {e},将尝试正则表达式提取")
+                    # 保存导致错误的JSON字符串到文件
+                    self._save_problematic_json(json_str_cleaned, e)
 
-            # Attempt 2: Regex on the processed_text (which might have had </think> stripped)
-            # 修复正则表达式,移除多余的反斜杠
+            # 尝试方法4: 手动解析JSON格式的关键字段
+            try:
+                logging.debug("尝试手动解析JSON结构")
+                manual_result = self._manual_json_extract(processed_text)
+                if manual_result and "title" in manual_result and "content" in manual_result:
+                    logging.info("成功通过手动解析JSON提取内容")
+                    return manual_result
+            except Exception as e:
+                logging.warning(f"手动解析JSON失败: {e}")
+
+            # 尝试方法5: 使用正则表达式提取
             logging.debug("尝试使用正则表达式提取")
-            title_match = re.search(r'"title":\s*"([^"]*)"', processed_text)
-            content_match = re.search(r'"content":\s*"([^"]*)"', processed_text)
+            # 更强大的正则表达式,处理多行内容
+            title_match = re.search(r'"title"\s*:\s*"((?:[^"\\]|\\.|[\r\n])+)"', processed_text, re.DOTALL)
+            content_match = re.search(r'"content"\s*:\s*"((?:[^"\\]|\\.|[\r\n])+)"', processed_text, re.DOTALL)
+            analysis_match = re.search(r'"analysis"\s*:\s*"((?:[^"\\]|\\.|[\r\n])+)"', processed_text, re.DOTALL)
 
             if title_match and content_match:
-                logging.info("Successfully extracted title/content using regex.")
+                logging.info("成功使用正则表达式提取标题和内容")
                 return {
-                    "title": title_match.group(1).strip(),
-                    "content": content_match.group(1).strip()
+                    "title": title_match.group(1).replace('\\"', '"').strip(),
+                    "content": content_match.group(1).replace('\\"', '"').strip(),
+                    "analysis": analysis_match.group(1).replace('\\"', '"').strip() if analysis_match else ""
                 }
 
-            # Attempt 3: Try finding content with single quotes
+            # 尝试方法6: 查找使用单引号的内容
             logging.debug("尝试查找使用单引号的内容")
-            title_match = re.search(r'"title":\s*\'([^\']*)\'', processed_text)
-            content_match = re.search(r'"content":\s*\'([^\']*)\'', processed_text)
+            title_match = re.search(r'"title"\s*:\s*\'((?:[^\'\\]|\\.|[\r\n])+)\'', processed_text, re.DOTALL)
+            content_match = re.search(r'"content"\s*:\s*\'((?:[^\'\\]|\\.|[\r\n])+)\'', processed_text, re.DOTALL)
+            analysis_match = re.search(r'"analysis"\s*:\s*\'((?:[^\'\\]|\\.|[\r\n])+)\'', processed_text, re.DOTALL)
 
             if title_match and content_match:
-                logging.info("Successfully extracted title/content using single-quote regex.")
+                logging.info("成功使用单引号正则表达式提取内容")
                 return {
                     "title": title_match.group(1).strip(),
-                    "content": content_match.group(1).strip()
+                    "content": content_match.group(1).strip(),
+                    "analysis": analysis_match.group(1).strip() if analysis_match else ""
                 }
 
-            # Final attempt: Look for key-value pairs without standard JSON formatting
+            # 尝试方法7: 使用非标准格式提取
             logging.debug("尝试非标准格式提取")
-            title_pattern = re.compile(r'["""]?title["""]?[::]\s*["""]([^"""]+)["""]', re.IGNORECASE)
-            content_pattern = re.compile(r'["""]?content["""]?[::]\s*["""]([^"""]+)["""]', re.IGNORECASE)
+            title_pattern = re.compile(r'["""]?title["""]?[::]\s*["""]([^"""]+)["""]', re.IGNORECASE | re.DOTALL)
+            content_pattern = re.compile(r'["""]?content["""]?[::]\s*["""]([^"""]+)["""]', re.IGNORECASE | re.DOTALL)
+            analysis_pattern = re.compile(r'["""]?analysis["""]?[::]\s*["""]([^"""]+)["""]', re.IGNORECASE | re.DOTALL)
 
             title_match = title_pattern.search(processed_text)
             content_match = content_pattern.search(processed_text)
+            analysis_match = analysis_pattern.search(processed_text)
 
             if title_match and content_match:
-                logging.info("提取到标题和内容(使用灵活模式匹配)")
+                logging.info("成功使用灵活模式匹配提取内容")
                 return {
                     "title": title_match.group(1).strip(),
-                    "content": content_match.group(1).strip()
+                    "content": content_match.group(1).strip(),
+                    "analysis": analysis_match.group(1).strip() if analysis_match else ""
                 }
 
             logging.warning(f"所有提取方法失败,响应前300字符: {processed_text[:300]}...")
-            return None # Fallback if all extraction methods fail
+            return None # 所有方法失败时的回退选项
 
         except Exception as e:
-            logging.error(f"Unexpected error during content extraction: {e}\n{traceback.format_exc()}")
+            logging.error(f"内容提取过程中发生意外错误: {e}\n{traceback.format_exc()}")
             return None
+
+    def _sanitize_json_text(self, text):
+        """彻底清理文本,确保可以安全解析为JSON"""
+        # 步骤1: 处理控制字符
+        cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
+
+        # 步骤2: 特殊处理换行符,将实际换行转换为\n字符串
+        cleaned = cleaned.replace('\n', '\\n').replace('\r', '\\r')
+
+        # 步骤3: 处理内容字段中开始或结束可能存在的多余空格或引号
+        cleaned = re.sub(r'"content"\s*:\s*"\s*', '"content":"', cleaned)
+        cleaned = re.sub(r'"\s*,', '",', cleaned)
+
+        # 步骤4: 处理未转义的引号和反斜杠
+        cleaned = re.sub(r'(?<!\\)\\(?!["\\/bfnrtu])', r'\\\\', cleaned)
+
+        return cleaned
+
+    def _manual_json_extract(self, text):
+        """手动解析JSON结构,提取title、content和analysis字段"""
+        try:
+            result = {}
+
+            # 查找title字段
+            title_start = text.find('"title"')
+            if title_start >= 0:
+                colon_pos = text.find(':', title_start)
+                if colon_pos > 0:
+                    quote_pos = text.find('"', colon_pos)
+                    if quote_pos > 0:
+                        end_quote_pos = text.find('"', quote_pos + 1)
+                        while end_quote_pos > 0 and text[end_quote_pos-1] == '\\':
+                            end_quote_pos = text.find('"', end_quote_pos + 1)
+                        if end_quote_pos > 0:
+                            result['title'] = text[quote_pos+1:end_quote_pos].replace('\\"', '"').strip()
+
+            # 查找content字段
+            content_start = text.find('"content"')
+            if content_start >= 0:
+                colon_pos = text.find(':', content_start)
+                if colon_pos > 0:
+                    quote_pos = text.find('"', colon_pos)
+                    if quote_pos > 0:
+                        # 查找非转义双引号
+                        pos = quote_pos + 1
+                        content_end = -1
+                        while pos < len(text):
+                            if text[pos] == '"' and (pos == 0 or text[pos-1] != '\\'):
+                                content_end = pos
+                                break
+                            pos += 1
+
+                        if content_end > 0:
+                            content = text[quote_pos+1:content_end].replace('\\"', '"')
+                            # 特殊处理换行符
+                            content = content.replace('\\n', '\n').replace('\\r', '\r')
+                            result['content'] = content.strip()
+
+            # 查找analysis字段
+            analysis_start = text.find('"analysis"')
+            if analysis_start >= 0:
+                colon_pos = text.find(':', analysis_start)
+                if colon_pos > 0:
+                    quote_pos = text.find('"', colon_pos)
+                    if quote_pos > 0:
+                        pos = quote_pos + 1
+                        analysis_end = -1
+                        while pos < len(text):
+                            if text[pos] == '"' and (pos == 0 or text[pos-1] != '\\'):
+                                analysis_end = pos
+                                break
+                            pos += 1
+
+                        if analysis_end > 0:
+                            analysis = text[quote_pos+1:analysis_end].replace('\\"', '"')
+                            result['analysis'] = analysis.strip()
+
+            return result if 'title' in result and 'content' in result else None
+        except Exception as e:
+            logging.error(f"手动解析过程中出错: {e}")
+            return None
+
+    def _save_problematic_json(self, json_text, error):
+        """保存导致解析错误的JSON字符串,用于调试"""
+        try:
+            error_log_dir = "/root/autodl-tmp/TravelContentCreator/log/json_errors"
+            os.makedirs(error_log_dir, exist_ok=True)
+            error_log_file = f"{error_log_dir}/error_{int(time.time())}.json"
+
+            with open(error_log_file, "w", encoding="utf-8") as f:
+                f.write(f"# 错误信息: {str(error)}\n")
+                f.write(f"# 错误位置: 行 {error.lineno}, 列 {error.colno}\n")
+                f.write(json_text)
+
+            logging.info(f"已保存问题JSON到: {error_log_file}")
+        except Exception as e:
+            logging.error(f"保存问题JSON时出错: {e}")
 
     def test_extraction_from_file(self, response_file_path):
         """
@@ -358,4 +551,87 @@ class ContentJudger:
         except Exception as e:
             logging.exception(f"测试提取时发生错误: {e}")
-            return {"success": False, "error": str(e)}
\ No newline at end of file
+            return {"success": False, "error": str(e)}
+
+    def _prepare_content_for_serialization(self, content_dict):
+        """
+        对内容进行处理,确保可以安全序列化为JSON,同时保留emoji字符
+
+        Args:
+            content_dict: 内容字典
+
+        Returns:
+            dict: 处理后的内容字典
+        """
+        try:
+            # 创建一个新字典,避免修改原始内容
+            safe_dict = {}
+
+            for key, value in content_dict.items():
+                # 处理字符串类型的值
+                if isinstance(value, str):
+                    # 第一步:彻底清理所有控制字符
+                    safe_value = re.sub(r'[\x00-\x1F\x7F]', '', value)
+
+                    # 第二步:将emoji字符转换为相应的Unicode转义序列
+                    # 这样能确保JSON序列化安全,同时保留emoji语义
+                    char_list = []
+                    for char in safe_value:
+                        if ord(char) > 127:  # 非ASCII字符
+                            # 尝试保留高位字符(包括emoji)
+                            try:
+                                # 验证这个字符是否可以安全序列化
+                                json.dumps(char, ensure_ascii=False)
+                                char_list.append(char)
+                            except:
+                                # 如果这个字符无法序列化,使用其Unicode码点的字符串表示
+                                char_list.append(f"\\u{ord(char):04x}")
+                        else:
+                            char_list.append(char)
+
+                    processed_value = ''.join(char_list)
+
+                    # 对于内容字段,特别注意保存换行符
+                    if key == "content" and '\\n' in processed_value:
+                        processed_value = processed_value.replace('\\n', '\n')
+
+                    # 最终验证这个值是否可以安全序列化
+                    try:
+                        json.dumps(processed_value, ensure_ascii=False)
+                        safe_dict[key] = processed_value
+                    except Exception as e:
+                        logging.warning(f"处理后的'{key}'值仍无法序列化: {e},将进行更严格处理")
+                        # 更严格的处理:只保留ASCII字符
+                        safe_dict[key] = ''.join(c for c in processed_value if ord(c) < 128)
+                else:
+                    safe_dict[key] = value
+
+            # 最终验证整个字典是否可序列化
+            try:
+                # 使用ensure_ascii=False允许非ASCII字符直接出现在JSON中
+                # 使用allow_nan=True允许特殊浮点数值
+                json_str = json.dumps(safe_dict, ensure_ascii=False, allow_nan=True)
+                # 验证生成的JSON是否有效
+                json.loads(json_str)
+            except Exception as e:
+                logging.error(f"最终字典序列化验证失败: {e}")
+                # 如果依然失败,返回一个绝对安全的结果
+                return {
+                    "title": re.sub(r'[^\x20-\x7E]', '', content_dict.get("title", "序列化处理失败")),
+                    "content": re.sub(r'[^\x20-\x7E]', '', "内容包含无法安全序列化的字符,已移除所有非ASCII字符"),
+                    "judge_success": content_dict.get("judge_success", False),
+                    "error": True,
+                    "raw_result": str(e)
+                }
+
+            return safe_dict
+        except Exception as e:
+            logging.error(f"处理内容以确保安全序列化时出错: {e}")
+            # 如果处理失败,返回一个基本的安全字典
+            return {
+                "title": "序列化处理失败",
+                "content": "内容包含无法安全序列化的字符",
+                "judge_success": False,
+                "error": True,
+                "raw_result": str(e)
+            }
\ No newline at end of file
diff --git a/utils/output_handler.py b/utils/output_handler.py
index 247d8e3..c97185e 100644
--- a/utils/output_handler.py
+++ b/utils/output_handler.py
@@ -1,5 +1,5 @@
 import os
-import json
+import simplejson as json
 import logging
 from abc import ABC, abstractmethod
 import traceback
@@ -70,7 +70,7 @@ class FileSystemOutputHandler(OutputHandler):
         topics_path = os.path.join(run_dir, f"tweet_topic_{run_id}.json")
         try:
             with open(topics_path, "w", encoding="utf-8") as f:
-                json.dump(topics_list, f, ensure_ascii=False, indent=4)
+                json.dump(topics_list, f, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"Topics list saved successfully to: {topics_path}")
         except Exception as e:
             logging.exception(f"Error saving topic JSON file to {topics_path}:")
@@ -115,14 +115,34 @@ class FileSystemOutputHandler(OutputHandler):
         if "tags" in output_data and "original_tags" not in output_data:
             output_data["original_tags"] = output_data["tags"]
 
+        # 对内容进行深度清理,确保安全序列化
+        try:
+            output_data = self._sanitize_content_for_json(output_data)
+            logging.info("内容已经过安全清理,可以序列化")
+        except Exception as e:
+            logging.error(f"内容清理过程中出错: {e}")
+
         # 保存统一格式的article.json
         content_path = os.path.join(variant_dir, "article.json")
         try:
             with open(content_path, "w", encoding="utf-8") as f:
-                json.dump(output_data, f, ensure_ascii=False, indent=4)
+                json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"Content JSON saved to: {content_path}")
         except Exception as e:
             logging.exception(f"Failed to save content JSON to {content_path}: {e}")
+            # 如果序列化失败,记录原始内容用于调试
+            debug_path = os.path.join(variant_dir, "debug_content.txt")
+            try:
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    for key, value in output_data.items():
+                        if isinstance(value, str):
+                            f.write(f"{key}: (length: {len(value)})\n")
+                            f.write(f"{repr(value[:200])}...\n\n")
+                        else:
+                            f.write(f"{key}: {type(value)}\n")
+                logging.info(f"Debug content saved to: {debug_path}")
+            except Exception as debug_err:
+                logging.error(f"Failed to save debug content: {debug_err}")
 
         # Save content prompt
         prompt_path = os.path.join(variant_dir, "tweet_prompt.txt")
@@ -140,7 +160,7 @@ class FileSystemOutputHandler(OutputHandler):
         config_path = os.path.join(run_dir, f"topic_{topic_index}_poster_configs.json")
         try:
             with open(config_path, 'w', encoding='utf-8') as f_cfg_topic:
-                json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4)
+                json.dump(config_data, f_cfg_topic, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"Saved complete poster configurations for topic {topic_index} to: {config_path}")
         except Exception as save_err:
             logging.error(f"Failed to save complete poster configurations for topic {topic_index} to {config_path}: {save_err}")
@@ -196,7 +216,7 @@ class FileSystemOutputHandler(OutputHandler):
         metadata_path = os.path.join(os.path.dirname(save_path), metadata_filename)
         try:
             with open(metadata_path, 'w', encoding='utf-8') as f:
-                json.dump(metadata, f, ensure_ascii=False, indent=4)
+                json.dump(metadata, f, ensure_ascii=False, indent=4, ignore_nan=True)
             logging.info(f"保存{image_type}元数据到: {metadata_path}")
         except Exception as me:
             logging.error(f"无法保存{image_type}元数据到{metadata_path}: {me}")
@@ -210,4 +230,48 @@
 
     def finalize(self, run_id: str):
         logging.info(f"FileSystemOutputHandler finalizing run: {run_id}. No specific actions needed.")
-        pass # Nothing specific to do for file system finalize
\ No newline at end of file
+        pass # Nothing specific to do for file system finalize
+
+    def _sanitize_content_for_json(self, data):
+        """对内容进行深度清理,确保可以安全序列化为JSON
+
+        Args:
+            data: 要处理的数据(字典、列表或基本类型)
+
+        Returns:
+            经过处理的数据,确保可以安全序列化
+        """
+        if isinstance(data, dict):
+            # 处理字典类型
+            sanitized_dict = {}
+            for key, value in data.items():
+                sanitized_dict[key] = self._sanitize_content_for_json(value)
+            return sanitized_dict
+        elif isinstance(data, list):
+            # 处理列表类型
+            return [self._sanitize_content_for_json(item) for item in data]
+        elif isinstance(data, str):
+            # 处理字符串类型(重点关注)
+
+            # 1. 首先,替换所有字面的"\n"为真正的换行符
+            if r'\n' in data:
+                data = data.replace(r'\n', '\n')
+
+            # 2. 移除所有控制字符(ASCII 0-31,除了\n, \r, \t)
+            cleaned = ''
+            for char in data:
+                # 允许常见的空白字符
+                if char in '\n\r\t' or ord(char) >= 32:
+                    cleaned += char
+
+            # 3. 验证字符串可以被安全序列化
+            try:
+                json.dumps(cleaned, ensure_ascii=False)
+                return cleaned
+            except Exception as e:
+                logging.warning(f"字符串清理后仍无法序列化,尝试更严格的清理: {e}")
+                # 如果仍然无法序列化,使用更严格的清理
+                return ''.join(c for c in cleaned if ord(c) < 65536 and (c in '\n\r\t' or ord(c) >= 32))
+        else:
+            # 其他类型(数字、布尔值等)原样返回
+            return data
\ No newline at end of file
diff --git a/utils/tweet_generator.py b/utils/tweet_generator.py
index 1c85b45..7bf8725 100644
--- a/utils/tweet_generator.py
+++ b/utils/tweet_generator.py
@@ -67,18 +67,6 @@ class tweetContent:
         self.json_data = {"title": "", "content": "", "tag": "", "error": True, "raw_result": e}  # 不再包含raw_result
 
     def split_content(self, result):
-        # Assuming split logic might still fail, keep it simple or improve with regex/json
-        # We should ideally switch content generation to JSON output as well.
-        # For now, keep existing logic but handle errors in __init__.
-
-        # Optional: Add basic check before splitting
-        # if not result or "<think>" not in result or "<title>" not in result or "<content>" not in result:
-        #     logging.warning(f"AI result format unexpected: {result[:200]}...")
-        #     # 返回空字符串而不是抛出异常,这样可以在主函数继续处理
-
-        #     return "", ""
-
-        # --- Existing Logic (prone to errors) ---
         try:
             processed_result = result
             if "</think>" in result:
@@ -88,11 +76,8 @@ class tweetContent:
             json_data = json.loads(processed_result)
             json_data["error"] = False
             json_data["raw_result"] = None
-            # 确保judge_success字段存在
-            if "judge_success" not in json_data:
-                json_data["judge_success"] = None
+            json_data["judge_success"] = None
             return json_data
-        # --- End Existing Logic ---
 
         except Exception as e:
             logging.warning(f"解析内容时出错: {e}, 使用默认空内容")
@@ -510,7 +495,7 @@ def generate_content_for_topic(ai_agent: AI_Agent,
 
     # 准备审核内容
     content_to_judge = f"""title: {content_json.get('title', '')}
-content: {content_json.get('content', '')}
+ content: {content_json.get('content', '')}
 """
 
     # 调用ContentJudger进行审核
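Notes on the changes above:

1. The switch from the stdlib json module to simplejson is what makes the new ignore_nan=True keyword meaningful: the standard library serializes float("nan") as the bare token NaN, which is not valid JSON and breaks strict parsers downstream, while simplejson (3.2+) can replace it with null. A minimal sketch of the difference:

    import json as stdlib_json
    import simplejson

    data = {"title": "demo", "score": float("nan")}
    print(stdlib_json.dumps(data))                  # {"title": "demo", "score": NaN}  -- invalid JSON
    print(simplejson.dumps(data, ignore_nan=True))  # {"title": "demo", "score": null} -- RFC-compliant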
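2. The extraction rework in _extract_modified_content assumes the model may prefix its answer with a <think>...</think> reasoning block and may wrap the JSON in extra prose, so it peels the reasoning off first and then tries progressively looser parsers. A condensed, self-contained sketch of that fallback chain (the function name and return shape are illustrative, not the module's API):

    import json
    import re

    def extract_title_content(raw: str):
        text = raw.split("</think>", 1)[1].strip() if "</think>" in raw else raw
        # 1) direct parse of the whole text
        try:
            obj = json.loads(text)
            if "title" in obj and "content" in obj:
                return obj
        except json.JSONDecodeError:
            pass
        # 2) widest {...} slice, for JSON embedded in prose
        start, end = text.find("{"), text.rfind("}") + 1
        if 0 <= start < end:
            try:
                obj = json.loads(text[start:end])
                if "title" in obj and "content" in obj:
                    return obj
            except json.JSONDecodeError:
                pass
        # 3) field-level regex as a last resort
        t = re.search(r'"title"\s*:\s*"((?:[^"\\]|\\.)*)"', text, re.DOTALL)
        c = re.search(r'"content"\s*:\s*"((?:[^"\\]|\\.)*)"', text, re.DOTALL)
        return {"title": t.group(1), "content": c.group(1)} if t and c else None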
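3. _prepare_content_for_serialization and _sanitize_content_for_json follow the same pattern: strip control characters, then prove the result survives a round trip before writing it to disk. If the same guarantee is needed elsewhere, a small helper along these lines would do (the extra UTF-8 encode step also catches lone surrogates, which json.dumps with ensure_ascii=False lets through and which only fail later at file-write time):

    import re
    import simplejson as json

    _CTRL = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')  # keeps \t, \n, \r

    def prove_json_safe(value: str) -> str:
        cleaned = _CTRL.sub('', value)
        cleaned.encode('utf-8')  # raises UnicodeEncodeError on lone surrogates
        json.loads(json.dumps(cleaned, ensure_ascii=False))  # raises if not round-trippable
        return cleaned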
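4. One caveat on the emoji fallback in _prepare_content_for_serialization: a hand-rolled f"\\u{ord(char):04x}" escape is only correct for code points up to U+FFFF. JSON requires a surrogate pair above that, which json.dumps already emits, so deferring to the serializer is the safer route:

    import json

    ch = "👶"                   # U+1F476, above the Basic Multilingual Plane
    print(json.dumps(ch))       # "\ud83d\udc76" -- valid JSON surrogate pair
    print(f"\\u{ord(ch):04x}")  # \u1f476        -- not a valid JSON escape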