diff --git a/utils/__pycache__/content_judger.cpython-312.pyc b/utils/__pycache__/content_judger.cpython-312.pyc index a00ffc6..4e7640e 100644 Binary files a/utils/__pycache__/content_judger.cpython-312.pyc and b/utils/__pycache__/content_judger.cpython-312.pyc differ diff --git a/utils/__pycache__/output_handler.cpython-312.pyc b/utils/__pycache__/output_handler.cpython-312.pyc index 7720a1e..03ae793 100644 Binary files a/utils/__pycache__/output_handler.cpython-312.pyc and b/utils/__pycache__/output_handler.cpython-312.pyc differ diff --git a/utils/__pycache__/tweet_generator.cpython-312.pyc b/utils/__pycache__/tweet_generator.cpython-312.pyc index 25c416b..6d6effd 100644 Binary files a/utils/__pycache__/tweet_generator.cpython-312.pyc and b/utils/__pycache__/tweet_generator.cpython-312.pyc differ diff --git a/utils/content_judger.py b/utils/content_judger.py index feab470..813c6d7 100644 --- a/utils/content_judger.py +++ b/utils/content_judger.py @@ -12,6 +12,7 @@ import traceback import sys import base64 import re +import random sys.path.append('/root/autodl-tmp/TravelContentCreator') # 添加项目根目录 from core.ai_agent import AI_Agent @@ -66,8 +67,8 @@ class ContentJudger: 6. 特征语句保留:请保留文案中原本的引流语句,不要修改或删除。请保留文案中的换行符 \\n,不要修改或删除换行符。 7. 面向人群保留:请尽量保留文案原本的面向人群和风格,这是同一产品面向多种人群营销的策略。例如产品资料中写明亲子游时,文案写"为情侣定制的山水秘境"是可以接受的。 8. 案例如下,请参考案例评判真假信息的尺度,逐行逐句仔细分析不符点和修改思路,并按照分析思路落实对每一处不符的修改措施,严格审查每一篇文案: -{ -"产品资料": +[ + "产品资料": "周末不加收【南沙越秀喜来登】1088元/套,豪华客房1间1晚+双人自助早餐+自助晚餐+2大1小水鸟世界门票,免费儿童乐园,户外泳池+健身房~ 不想待在家,又想带娃出去玩?更不想开长途车、人挤人?为你推荐路程短、不塞车、景点多、坐地铁就能直达的溜娃地! 南沙越秀喜来登是广州南沙区首家国际品牌酒店,坐拥广州南大门,拥有得天独厚的中心位置,可俯瞰蕉门河美景,车程短,不出广州也能玩! @@ -97,15 +98,10 @@ class ContentJudger: 酒店地址:广东省广州市南沙区海熙大街79-80号 导航关键词:广州南沙越秀喜来登酒店" - - -"生成文案":"[ + + "生成文案": "title": "五一遛娃👶必囤!南沙喜来登1088元住景观房+双早+门票", - - "content": " - 五一不想挤人潮?南沙这家酒店直接承包遛娃+度假双重快乐‼️\n地铁直达!2大1小1088元住景观房,含双早+自助晚餐+水鸟世界门票,儿童乐园/泳池/健身房全开放!\n🌟【遛娃刚需全配齐】\n✅ 儿童乐园:10:00-20:00全程开放,滑梯/积木/绘本一应俱全\n✅ 户外泳池:9:00-18:00恒温开放(五一期间每日消毒3次)\n✅ 健身房:8:00-22:00配备亲子瑜伽课程(需提前预约)\n\n📍【1小时玩转南沙】\n① 南沙天后宫(车程20分钟):穿汉服拍大片,听妈祖传说涨知识\n② 南沙湿地公园(40分钟):5月芦苇摇曳,带娃认鸟类+乘船探秘\n③ 十九涌海鲜街(45分钟):现捞现煮生猛海鲜,人均50元吃到撑\n\n🍽️【家长友好细节】\n• 自助晚餐隐藏彩蛋:儿童餐区设独立洗手台+热食保温柜\n• 房内配置:加厚床垫/卡通洗漱杯/尿布台(无需额外购买)\n• 安全保障:全区域监控+24小时安保巡逻\n\n🎁【五一专属加码】\n5月1-5日期间入住,凭房卡可免费领取儿童防晒冰袖+湿巾礼包\n\n📌Tips:\n1. 周一至周四仅限双床房型,周五起可选大床房\n2. 水鸟世界门票需提前1小时至前台领取纸质票\n3. 地铁四号线金洲站下车,打车15分钟直达酒店\n\n这个五一,南沙喜来登让你躺着遛娃!不用长途跋涉,家门口就能玩出仪式感~" " -]" -} + "content": "五一不想挤人潮?南沙这家酒店直接承包遛娃+度假双重快乐‼️\n地铁直达!2大1小1088元住景观房,含双早+自助晚餐+水鸟世界门票,儿童乐园/泳池/健身房全开放!\n🌟【遛娃刚需全配齐】\n✅ 儿童乐园:10:00-20:00全程开放,滑梯/积木/绘本一应俱全\n✅ 户外泳池:9:00-18:00恒温开放(五一期间每日消毒3次)\n✅ 健身房:8:00-22:00配备亲子瑜伽课程(需提前预约)\n\n📍【1小时玩转南沙】\n① 南沙天后宫(车程20分钟):穿汉服拍大片,听妈祖传说涨知识\n② 南沙湿地公园(40分钟):5月芦苇摇曳,带娃认鸟类+乘船探秘\n③ 十九涌海鲜街(45分钟):现捞现煮生猛海鲜,人均50元吃到撑\n\n🍽️【家长友好细节】\n• 自助晚餐隐藏彩蛋:儿童餐区设独立洗手台+热食保温柜\n• 房内配置:加厚床垫/卡通洗漱杯/尿布台(无需额外购买)\n• 安全保障:全区域监控+24小时安保巡逻\n\n🎁【五一专属加码】\n5月1-5日期间入住,凭房卡可免费领取儿童防晒冰袖+湿巾礼包\n\n📌Tips:\n1. 周一至周四仅限双床房型,周五起可选大床房\n2. 水鸟世界门票需提前1小时至前台领取纸质票\n3. 地铁四号线金洲站下车,打车15分钟直达酒店\n\n这个五一,南沙喜来登让你躺着遛娃!不用长途跋涉,家门口就能玩出仪式感~" 输出结果: { @@ -113,8 +109,8 @@ class ContentJudger: "title": "五一遛娃👶必囤!喜来登1088景观房", "content": "五一不想挤人潮?南沙这家酒店直接承包遛娃+度假双重快乐‼️\n地铁直达!2大1小1088r住景观房,含双早+自助晚餐+水鸟世界门票,儿童乐园/泳池/健身房全开放!\n🌟【遛娃刚需全配齐】\n✅ 儿童乐园:酒店设有免费儿童乐园,提供丰富的游乐设施,让孩子们尽情玩耍\n✅ 户外泳池:酒店配有户外无边泳池,供大人小孩一同享受清凉时光 \n✅ 健身房:酒店提供免费健身中心,适合家庭成员共同锻炼。\n\n📍【1小时玩转南沙】\n① 南沙天后宫(车程20分钟):穿汉服拍大片,听妈祖传说涨知识\n② 南沙湿地公园(40分钟):5月芦苇摇曳,带娃认鸟类+乘船探秘\n③ 十九涌海鲜街(45分钟):现捞现煮生猛海鲜,人均50r吃到撑 \n\n🍽️【家长友好细节】 \n• 自助餐厅:供应鲜美海鲜、精美甜品等任君选择,大人小孩都爱吃 \n• 房内配置:55英寸超大纯平电视+独立的浴缸+超大的落地玻璃窗,尽览蕉门河风景,尽享亲子度假时光 \n• 安全保障:酒店设有完善的监控系统和安保措施,全力保障您与家人的安全 \n\n🎁【套餐专属福利】\n1、豪华客房一间一晚(周一至四只开放双床房) \n2、2大1小自助早晚餐 \n3、赠送2大1小水鸟世界门票(酒店前台领取),无需额外购买 \n\n📌Tips: \n1. 周一至周四仅限双床房型,周五起可选大床房 \n2. 酒店前台领取水鸟世界纸质门票 \n3. 地铁四号线金洲站下车,打车15分钟直达酒店 \n\n这个五一,南沙喜来登让你躺着遛娃!不用长途跋涉,家门口就能玩出仪式感~\n" } - -8. 必须按照以下格式输出修改后内容,不需要输出无关内容 +] +9. 必须按照以下格式输出修改后内容,不需要输出无关内容 { "analysis" : "分析过程", "title": "修改后的标题", @@ -154,58 +150,133 @@ class ContentJudger: logging.error(f"从PromptManager获取系统提示词失败: {e}") return False - def _split_content(self, result): - """ - 参考tweet_generator的处理方式,解析AI返回的内容 + def _preprocess_for_json(self, text): + """预处理文本,处理JSON结构中的问题字符""" + if not isinstance(text, str): + return text - Args: - result: AI返回的原始结果 - - Returns: - dict: 解析后的JSON数据 - """ try: - # 处理AI可能返回的思考部分 - processed_result = result - if "" in result: - processed_result = result.split("")[1] # 取标签后的内容 + # 1. 处理特殊Unicode字符和标点符号 + char_map = { + '"': '"', # 特殊Unicode引号替换为标准双引号 + '"': '"', # 特殊Unicode引号替换为标准双引号 + ''': "'", # 特殊Unicode单引号替换为标准单引号 + ''': "'", # 特殊Unicode单引号替换为标准单引号 + ',': ',', # 中文逗号替换为英文逗号 + ':': ':', # 中文冒号替换为英文冒号 + '(': '(', # 中文括号替换为英文括号 + ')': ')', # 中文括号替换为英文括号 + '\u200b': '', # 零宽空格直接移除 + '\u200c': '', # 零宽不连字直接移除 + '\u200d': '', # 零宽连字直接移除 + '\u2028': ' ', # 行分隔符替换为空格 + '\u2029': ' ' # 段落分隔符替换为空格 + } - # 直接尝试解析JSON - json_data = json.loads(processed_result) - json_data["error"] = False - json_data["judge_success"] = True - return json_data + # 应用字符替换 + for char, replacement in char_map.items(): + text = text.replace(char, replacement) - except json.JSONDecodeError as json_err: - # JSON解析失败,记录错误并尝试更基本的处理方法 - logging.warning(f"解析内容时出错: {json_err}, 尝试提取JSON部分") + # 2. 处理控制字符 (ASCII < 32) + cleaned_text = "" + for i, char in enumerate(text): + if ord(char) < 32: # ASCII 32以下是控制字符 + if char in ['\n', '\r', '\t']: # 保留这些常用控制字符 + cleaned_text += char + else: # 删除其他控制字符 + logging.debug(f"移除位置{i}的无效控制字符(ASCII: {ord(char)})") + continue + else: + cleaned_text += char - try: - # 尝试找到JSON部分(从第一个{到最后一个}) - json_start = processed_result.find('{') - json_end = processed_result.rfind('}') + 1 + # 3. 处理JSON结构特定问题 + # 处理大括号附近的换行符和空白 + if cleaned_text.startswith('{\n'): + cleaned_text = '{' + cleaned_text[2:] + + if cleaned_text.startswith('{ '): + cleaned_text = '{' + cleaned_text[2:] - if json_start >= 0 and json_end > json_start: - json_str = processed_result[json_start:json_end] - json_data = json.loads(json_str) - json_data["error"] = False - json_data["judge_success"] = True - return json_data - except Exception as e: - logging.error(f"尝试提取JSON部分失败: {e}") - + if '\n}' in cleaned_text: + cleaned_text = cleaned_text.replace('\n}', '}') + + if ' }' in cleaned_text: + cleaned_text = cleaned_text.replace(' }', '}') + + # 4. 处理转义序列 - 保留\n、\r、\t的转义,移除其他转义 + import re + + # 第一步:将要保留的转义序列临时替换为安全标记 + safe_replacements = { + r'\\n': '@NEWLINE@', # 保留换行转义 + r'\\r': '@RETURN@', # 保留回车转义 + r'\\t': '@TAB@', # 保留制表符转义 + } + + # 应用安全替换 + for pattern, replacement in safe_replacements.items(): + cleaned_text = re.sub(pattern, replacement, cleaned_text) + + # 第二步:移除除JSON必要转义外的所有反斜杠转义 + # 处理常见的多余转义情况 + cleaned_text = re.sub(r'\\([^\\/"bfnrtu])', r'\1', cleaned_text) # 移除非特殊字符前的反斜杠 + cleaned_text = cleaned_text.replace('\\"', '"') # 将转义的双引号还原为普通双引号 + cleaned_text = cleaned_text.replace('\\\'', '\'') # 将转义的单引号还原为普通单引号 + cleaned_text = cleaned_text.replace('\\\\', '\\') # 将双反斜杠替换为单反斜杠 + + # 第三步:将安全标记替换回原始转义序列 + reverse_replacements = { + '@NEWLINE@': '\\n', # 还原换行转义 + '@RETURN@': '\\r', # 还原回车转义 + '@TAB@': '\\t', # 还原制表符转义 + } + + # 应用反向替换 + for marker, escape_seq in reverse_replacements.items(): + cleaned_text = cleaned_text.replace(marker, escape_seq) + + # 第四步:再次检查并修复字符串内的换行符(确保100%处理) + # 这个额外的步骤确保没有任何字符串值中包含实际的换行符 + pattern = r'"([^"\\]*(\\.[^"\\]*)*)"' # 匹配所有JSON字符串(包括已经有转义字符的) + + def fix_remaining_newlines(match): + string_value = match.group(1) + # 确保所有实际换行符都被转义 + fixed_value = string_value.replace('\n', '\\n').replace('\r', '\\r') + return f'"{fixed_value}"' + + cleaned_text = re.sub(pattern, fix_remaining_newlines, cleaned_text) + + # 5. 确保逗号后换行不会导致问题 + cleaned_text = cleaned_text.replace(',\n', ', ') # 替换逗号后的换行为空格 + + # 6. 尝试解析检验 + try: + # 尝试进行轻度解析验证 + json.loads(cleaned_text) + # 如果能成功解析,直接返回 + return cleaned_text + except json.JSONDecodeError as e: + logging.debug(f"预处理后JSON仍有问题:{e},尝试最后的修复...") + # 最后的处理:使用simplejson替代内置json库尝试修复 + try: + import simplejson + # 加载后再保存,让simplejson自己处理一些小问题 + fixed_json = simplejson.loads(cleaned_text, strict=False) + return simplejson.dumps(fixed_json) + except: + # simplejson也失败了,继续后续流程 + pass + + # 7. 记录处理后的文本,以便调试 + logging.debug(f"JSON预处理后的文本长度: {len(cleaned_text)}") + return cleaned_text + except Exception as e: - logging.error(f"解析内容时出错: {e}") - - # 所有解析方法都失败,返回一个默认结果 - return { - "title": "", - "content": "", - "error": True, - "judge_success": False, - "analysis": f"内容解析失败,错误信息: {str(e)}" - } - + logging.exception(f"JSON预处理过程中出错: {e}") + # 发生异常时,返回原始文本,不做修改 + return text + def judge_content(self, product_info, content, temperature=0.2, top_p=0.5, presence_penalty=0.0): """审核内容""" logging.info("开始内容审核流程") @@ -220,48 +291,159 @@ class ContentJudger: system_prompt=self._system_prompt, user_prompt=user_prompt, file_folder=None, - temperature=self._temperature, - top_p=self._topp, - presence_penalty=self._presence_penatly, + temperature=temperature, # 使用传入的参数 + top_p=top_p, # 使用传入的参数 + presence_penalty=presence_penalty, # 使用传入的参数 ) # 保存原始响应以便调试 self._save_response(result, response_id) + logging.info(f"AI响应长度: {len(result)} 字符") - # 使用简化的解析方法处理响应 - content_json = self._split_content(result) + # 尝试多种方法提取JSON + json_obj = None + error_msg = None - # 检查解析结果是否有错误 - if content_json.get("error", False): - logging.warning(f"内容解析失败,使用原内容") - return self._create_fallback_result(content) + # 方法1: 提取{...}的JSON部分 + try: + # 移除思考部分 + processed_result = result.split("", 1)[-1].strip() if "" in result else result + + # 找到最外层的大括号 + json_start = processed_result.find('{') + json_end = processed_result.rfind('}') + 1 + + if json_start >= 0 and json_end > json_start: + # 提取JSON字符串 + json_str = processed_result[json_start:json_end] + + # 预处理JSON字符串 + json_str = self._preprocess_for_json(json_str) + + # 尝试解析JSON + json_obj = json.loads(json_str) + logging.info("方法1成功解析JSON") + except Exception as e: + error_msg = f"方法1解析JSON失败: {e}" + logging.debug(error_msg) + # 继续尝试其他方法 - # 检查必要字段是否存在 - if "title" not in content_json or "content" not in content_json: - logging.warning(f"解析结果缺少必要字段 'title' 或 'content'") - content_json["judge_success"] = False - return self._create_fallback_result(content) + # 方法2: 尝试多行解析,逐行检查是否有合法JSON + if not json_obj: + try: + lines = result.split('\n') + for i, line in enumerate(lines): + line = line.strip() + if line.startswith('{') and line.endswith('}'): + try: + # 尝试处理和解析这一行 + processed_line = self._preprocess_for_json(line) + json_obj = json.loads(processed_line) + logging.info(f"方法2在第{i+1}行成功解析JSON") + break + except: + # 继续尝试下一行 + pass + except Exception as e: + if not error_msg: + error_msg = f"方法2解析JSON失败: {e}" + logging.debug(error_msg) - # 添加Base64编码内容 - result_dict = { - "judge_success": content_json.get("judge_success", True), + # 方法3: 尝试使用正则表达式匹配最可能的JSON部分 + if not json_obj: + try: + import re + # 尝试匹配 {..."title":...,"content":...} + json_pattern = r'\{[^{}]*"title"[^{}]*"content"[^{}]*\}' + matches = re.findall(json_pattern, result, re.DOTALL) + + if matches: + for match in matches: + try: + processed_match = self._preprocess_for_json(match) + json_obj = json.loads(processed_match) + logging.info("方法3成功解析JSON") + break + except: + # 继续尝试下一个匹配 + pass + except Exception as e: + if not error_msg: + error_msg = f"方法3解析JSON失败: {e}" + logging.debug(error_msg) + + # 处理解析结果 + if json_obj and isinstance(json_obj, dict): + # 验证关键字段 + if "title" in json_obj and "content" in json_obj: + # 构建结果字典 + result_dict = { + "judge_success": True, + "judged": True, + "title": json_obj["title"], + "content": json_obj["content"], + "title_base64": base64.b64encode(json_obj["title"].encode('utf-8')).decode('utf-8'), + "content_base64": base64.b64encode(json_obj["content"].encode('utf-8')).decode('utf-8') + } + + # 添加分析字段(如果存在) + if "analysis" in json_obj: + result_dict["analysis"] = json_obj["analysis"] + result_dict["analysis_base64"] = base64.b64encode(json_obj["analysis"].encode('utf-8')).decode('utf-8') + + logging.info(f"成功提取内容: 标题({len(json_obj['title'])}字符), 内容({len(json_obj['content'])}字符)") + return result_dict + else: + # JSON对象缺少必要字段 + logging.warning("解析的JSON缺少必要字段'title'或'content'") + error_msg = "缺少必要字段'title'或'content'" + # 保存错误日志 + self._save_error_json(json.dumps(json_obj), error_msg, response_id) + else: + # 未找到有效的JSON + if error_msg: + logging.warning(f"JSON解析失败: {error_msg}") + else: + logging.warning("找不到有效的JSON结构") + + # 保存可能的JSON字符串以供调试 + if json_start >= 0 and json_end > json_start: + json_str = processed_result[json_start:json_end] + self._save_error_json(json_str, error_msg or "解析失败", response_id) + + # 所有方法都失败,返回空内容 + logging.info("内容审核过程未能产生有效结果,返回空内容") + empty_result = { + "judge_success": False, "judged": True, - "title": content_json["title"], - "content": content_json["content"], - "title_base64": base64.b64encode(content_json["title"].encode('utf-8')).decode('utf-8'), - "content_base64": base64.b64encode(content_json["content"].encode('utf-8')).decode('utf-8') + "title": "", + "content": "", + "title_base64": base64.b64encode("".encode('utf-8')).decode('utf-8'), + "content_base64": base64.b64encode("".encode('utf-8')).decode('utf-8') } - # 如果有analysis字段,也包含 - if "analysis" in content_json: - result_dict["analysis"] = content_json["analysis"] - result_dict["analysis_base64"] = base64.b64encode(content_json["analysis"].encode('utf-8')).decode('utf-8') - - return result_dict + if error_msg: + empty_result["analysis"] = f"内容审核失败: {error_msg}" + empty_result["analysis_base64"] = base64.b64encode(f"内容审核失败: {error_msg}".encode('utf-8')).decode('utf-8') + return empty_result + except Exception as e: + # 捕获所有异常 + error_traceback = traceback.format_exc() logging.exception(f"审核过程中出错: {e}") - return self._create_fallback_result(content, error_msg=str(e)) + logging.debug(f"详细错误: {error_traceback}") + + return { + "judge_success": False, + "judged": True, + "title": "", + "content": "", + "title_base64": base64.b64encode("".encode('utf-8')).decode('utf-8'), + "content_base64": base64.b64encode("".encode('utf-8')).decode('utf-8'), + "analysis": f"内容审核过程出错: {e}", + "analysis_base64": base64.b64encode(f"内容审核过程出错: {e}".encode('utf-8')).decode('utf-8') + } def _save_response(self, response, response_id): """保存原始响应""" @@ -273,6 +455,29 @@ class ContentJudger: except Exception as e: logging.error(f"保存原始响应失败: {e}") + def _save_error_json(self, json_str, error, response_id): + """保存错误的JSON字符串以供调试""" + try: + error_log_dir = "/root/autodl-tmp/TravelContentCreator/log/json_errors" + os.makedirs(error_log_dir, exist_ok=True) + + # 创建包含错误信息和原始JSON的日志 + error_info = { + "error_message": str(error), + "error_type": error.__class__.__name__ if hasattr(error, "__class__") else "Unknown", + "timestamp": int(time.time()), + "response_id": response_id, + "json_string": json_str + } + + # 保存到文件 + with open(f"{error_log_dir}/error_{response_id}.json", "w", encoding="utf-8") as f: + json.dump(error_info, f, ensure_ascii=False, indent=2) + + logging.info(f"已保存错误JSON到 {error_log_dir}/error_{response_id}.json") + except Exception as e: + logging.error(f"保存错误JSON失败: {e}") + def _create_fallback_result(self, content, error_msg="解析失败"): """创建回退结果""" if isinstance(content, str): @@ -328,4 +533,86 @@ class ContentJudger: ## 运营生成的文案(需要审核的内容): {content_str} -""" \ No newline at end of file +""" + + def judge_content_with_retry(self, product_info, content, max_retries=3, temperature=0.2, top_p=0.5, presence_penalty=0.0): + """ + 带重试机制的内容审核方法,当检测到空内容时自动重试 + + Args: + product_info: 产品资料 + content: 需要审核的内容 + max_retries: 最大重试次数 + temperature, top_p, presence_penalty: AI生成参数 + + Returns: + dict: 审核结果,如果所有重试都失败,则返回最后一次的失败结果 + """ + retry_count = 0 + last_result = None + + logging.info(f"开始内容审核流程,最大重试次数: {max_retries},初始温度参数: {temperature}") + + while retry_count <= max_retries: + current_attempt = retry_count + 1 + + if retry_count > 0: + # 每次重试增加温度参数,增加多样性 + adjusted_temperature = min(temperature + (retry_count * 0.1), 0.9) + logging.info(f"🔄 内容审核重试 ({current_attempt}/{max_retries+1}),调整温度参数为: {adjusted_temperature:.2f}") + else: + adjusted_temperature = temperature + logging.info(f"⏳ 内容审核首次尝试 (1/{max_retries+1}),使用默认温度: {adjusted_temperature:.2f}") + + # 调用基本的审核方法 + result = self.judge_content( + product_info, + content, + temperature=adjusted_temperature, + top_p=top_p, + presence_penalty=presence_penalty + ) + + last_result = result + + # 检查结果是否为空内容 + if result.get("judge_success", False) and result.get("title") and result.get("content"): + # 成功获取有效内容,返回结果 + if retry_count > 0: + logging.info(f"✅ 成功!在第{retry_count}次重试后获取有效内容(共尝试{current_attempt}次)") + else: + logging.info(f"✅ 成功!首次尝试已获取有效内容") + + # 添加审核内容长度统计 + title_len = len(result.get("title", "")) + content_len = len(result.get("content", "")) + logging.info(f"📊 审核结果统计:标题长度={title_len}字符,内容长度={content_len}字符") + + return result + else: + # 记录当前尝试的结果状态 + title_len = len(result.get("title", "")) + content_len = len(result.get("content", "")) + logging.warning(f"❌ 审核尝试 {current_attempt}/{max_retries+1} 失败,judge_success={result.get('judge_success')},标题长度={title_len},内容长度={content_len}") + + # 重试次数增加 + retry_count += 1 + + if retry_count <= max_retries: + # 在重试前稍微等待,避免过快请求 + delay = 1 + random.random() * 2 # 1-3秒随机延迟 + remaining = max_retries - retry_count + 1 + logging.info(f"⏱️ 等待{delay:.1f}秒后进行第{retry_count+1}次尝试,剩余{remaining}次尝试机会") + time.sleep(delay) + else: + logging.warning(f"⛔ 已达到最大重试次数,共尝试{current_attempt}次均未获取满意结果") + + # 所有重试都失败,返回最后一次结果 + logging.warning(f"⚠️ {max_retries+1}次尝试后仍未获取有效内容,将返回最后一次结果") + + # 记录最后返回内容的基本信息 + title_len = len(last_result.get("title", "")) + content_len = len(last_result.get("content", "")) + logging.info(f"📄 最终返回内容:judge_success={last_result.get('judge_success')},标题长度={title_len}字符,内容长度={content_len}字符") + + return last_result \ No newline at end of file diff --git a/utils/output_handler.py b/utils/output_handler.py index b6d9b14..784d5e1 100644 --- a/utils/output_handler.py +++ b/utils/output_handler.py @@ -164,10 +164,19 @@ class FileSystemOutputHandler(OutputHandler): if "tags" in input_data and "original_tags" not in input_data: input_data["original_tags"] = input_data["tags"] + # 统一审核分析字段,优先使用judge_analysis,其次使用不良内容分析 + if "judge_analysis" not in input_data and "不良内容分析" in input_data: + input_data["judge_analysis"] = input_data["不良内容分析"] + elif "不良内容分析" not in input_data and "judge_analysis" in input_data: + input_data["不良内容分析"] = input_data["judge_analysis"] + # 保存原始值用于txt文件生成和调试 - original_title = input_data.get("title", "") - original_content = input_data.get("content", "") - original_tags = input_data.get("tags", "") + original_title = input_data.get("original_title", input_data.get("title", "")) + original_content = input_data.get("original_content", input_data.get("content", "")) + original_tags = input_data.get("original_tags", input_data.get("tags", "")) + judge_title = input_data.get("title", "") + judge_content = input_data.get("content", "") + judge_tags = input_data.get("tags", "") original_judge_analysis = input_data.get("judge_analysis", "") # 创建一个只包含元数据和base64编码的输出数据对象 @@ -201,9 +210,10 @@ class FileSystemOutputHandler(OutputHandler): if "original_tags" in input_data and input_data["original_tags"]: output_data["original_tags_base64"] = base64.b64encode(input_data["original_tags"].encode('utf-8')).decode('ascii') - # 5. 审核分析 - if "judge_analysis" in input_data and input_data["judge_analysis"]: - output_data["judge_analysis_base64"] = base64.b64encode(input_data["judge_analysis"].encode('utf-8')).decode('ascii') + # 5. 审核分析 - 检查judge_analysis和不良内容分析两个字段 + judge_analysis = input_data.get("judge_analysis", input_data.get("不良内容分析", "")) + if judge_analysis: + output_data["judge_analysis_base64"] = base64.b64encode(judge_analysis.encode('utf-8')).decode('ascii') logging.info("成功添加Base64编码内容") except Exception as e: @@ -226,20 +236,47 @@ class FileSystemOutputHandler(OutputHandler): # 创建一份article.txt文件以便直接查看 txt_path = os.path.join(variant_dir, "article.txt") try: - # 使用原始内容,保留所有换行符 + # 重新组织内容显示,明确区分原始内容和审核后内容 with open(txt_path, "w", encoding="utf-8") as f: - if original_title: + # 根据审核状态决定显示哪些内容 + is_judged = input_data.get("judged", False) + is_judge_success = input_data.get("judge_success", False) + + if is_judged and is_judge_success: + # 显示审核后的内容 + f.write(f"{judge_title}\n\n") + if judge_content: + f.write(judge_content) + if judge_tags: + f.write(f"\n\n{judge_tags}") + + # 在最后添加原始内容作为参考 + if original_title != judge_title or original_content != judge_content: + f.write("\n\n=== 原始内容 ===\n") + f.write(f"{original_title}\n\n") + if original_content: + f.write(original_content) + if original_tags and original_tags != judge_tags: + f.write(f"\n\n{original_tags}") + elif is_judged and not is_judge_success: + # 审核失败,显示审核失败信息和原始内容 + f.write("审核失败\n\n") f.write(f"{original_title}\n\n") + if original_content: + f.write(original_content) + if original_tags: + f.write(f"\n\n{original_tags}") + else: + # 未审核,直接显示原始内容 + f.write(f"{original_title}\n\n") + if original_content: + f.write(original_content) + if original_tags: + f.write(f"\n\n{original_tags}") - # 保持原始内容的所有换行符 - if original_content: - f.write(original_content) - - if original_tags: - f.write(f"\n\n{original_tags}") - + # 添加审核分析信息(如果有) if original_judge_analysis: - f.write(f"\n\n审核分析:\n{original_judge_analysis}") + f.write(f"\n\n=== 审核分析 ===\n{original_judge_analysis}") logging.info(f"Article text saved to: {txt_path}") except Exception as e: @@ -253,8 +290,16 @@ class FileSystemOutputHandler(OutputHandler): f.write(f"原始内容: {original_content}\n\n") if original_tags: f.write(f"原始标签: {original_tags}\n\n") + + if is_judged: + f.write(f"审核状态: {'成功' if is_judge_success else '失败'}\n") + if is_judge_success: + f.write(f"审核后标题: {judge_title}\n\n") + f.write(f"审核后内容: {judge_content}\n\n") + if original_judge_analysis: f.write(f"审核分析: {original_judge_analysis}\n\n") + f.write("---处理后---\n\n") for key, value in output_data.items(): if isinstance(value, str): @@ -335,7 +380,7 @@ class FileSystemOutputHandler(OutputHandler): # 保存配置到JSON文件 config_file_path = os.path.join(variant_dir, f"topic_{topic_index}_poster_configs.json") with open(config_file_path, 'w', encoding='utf-8') as f: - json.dump(processed_configs, f, ensure_ascii=False, indent=4, cls=self.SafeJSONEncoder) + json.dump(processed_configs, f, ensure_ascii=False, indent=4) logging.info(f"Successfully saved poster configs to {config_file_path}") except Exception as e: logging.error(f"Error saving poster configs: {e}") diff --git a/utils/tweet_generator.py b/utils/tweet_generator.py index 23b90c8..e03f417 100644 --- a/utils/tweet_generator.py +++ b/utils/tweet_generator.py @@ -133,52 +133,94 @@ def generate_topics(ai_agent, system_prompt, user_prompt, run_id, temperature=0. def generate_single_content(ai_agent, system_prompt, user_prompt, item, run_id, - article_index, variant_index, temperature=0.3, top_p=0.4, presence_penalty=1.5): + article_index, variant_index, temperature=0.3, top_p=0.4, presence_penalty=1.5, + max_retries=3): """Generates single content variant data. Returns (content_json, user_prompt) or (None, None).""" logging.info(f"Generating content for topic {article_index}, variant {variant_index}") - try: - if not system_prompt or not user_prompt: - logging.error("System or User prompt is empty. Cannot generate content.") - return None, None + + if not system_prompt or not user_prompt: + logging.error("System or User prompt is empty. Cannot generate content.") + return None, None + + logging.debug(f"Using pre-constructed prompts. User prompt length: {len(user_prompt)}") + + # 实现重试逻辑 + retry_count = 0 + last_result = None + last_tokens = None + last_time_cost = None + + while retry_count <= max_retries: + try: + # 只有重试时增加延迟和调整参数 + if retry_count > 0: + # 添加随机延迟避免频繁请求 + delay = 1 + random.random() * 2 # 1-3秒随机延迟 + logging.info(f"内容生成重试 ({retry_count}/{max_retries}),等待{delay:.1f}秒后尝试...") + time.sleep(delay) + + # 调整温度参数,增加多样性 + adjusted_temperature = min(temperature + (retry_count * 0.1), 0.9) + logging.info(f"调整温度参数为: {adjusted_temperature}") + else: + adjusted_temperature = temperature - logging.debug(f"Using pre-constructed prompts. User prompt length: {len(user_prompt)}") - - time.sleep(random.random() * 0.5) - - # Generate content (non-streaming work returns result, tokens, time_cost) - result, tokens, time_cost = ai_agent.work( - system_prompt, user_prompt, "", temperature, top_p, presence_penalty - ) - - if result is None: # Check if AI call failed - logging.error(f"AI agent work failed for {article_index}_{variant_index}. No result returned.") - return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt # 添加judge_success字段 + # Generate content (non-streaming work returns result, tokens, time_cost) + result, tokens, time_cost = ai_agent.work( + system_prompt, user_prompt, "", adjusted_temperature, top_p, presence_penalty + ) - logging.info(f"Content generation for {article_index}_{variant_index} completed in {time_cost:.2f}s. Estimated tokens: {tokens}") + last_result = result + last_tokens = tokens + last_time_cost = time_cost + + if result is None: # Check if AI call failed completely + logging.error(f"AI agent work failed for {article_index}_{variant_index}. No result returned.") + retry_count += 1 + continue + + logging.info(f"Content generation for {article_index}_{variant_index} completed in {time_cost:.2f}s. Estimated tokens: {tokens}") - # --- Create tweetContent object (handles parsing) --- - # Pass user_prompt instead of full prompt? Yes, user_prompt is what we need later. - tweet_content = tweetContent(result, user_prompt, run_id, article_index, variant_index) - - # --- Remove Saving Logic --- - # run_specific_output_dir = os.path.join(output_dir, run_id) # output_dir no longer available - # variant_result_dir = os.path.join(run_specific_output_dir, f"{article_index}_{variant_index}") - # os.makedirs(variant_result_dir, exist_ok=True) - # content_save_path = os.path.join(variant_result_dir, "article.json") - # prompt_save_path = os.path.join(variant_result_dir, "tweet_prompt.txt") - # tweet_content.save_content(content_save_path) # Method removed - # tweet_content.save_prompt(prompt_save_path) # Method removed - # --- End Remove Saving Logic --- - - # Return the data needed by the output handler - content_json = tweet_content.get_json_data() - prompt_data = tweet_content.get_prompt() # Get the stored user prompt - - return content_json, prompt_data # Return data pair - - except Exception as e: - logging.exception(f"Error generating single content for {article_index}_{variant_index}:") - return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt # 添加judge_success字段 + # --- Create tweetContent object (handles parsing) --- + tweet_content = tweetContent(result, user_prompt, run_id, article_index, variant_index) + content_json = tweet_content.get_json_data() + + # 检查是否成功解析到有效内容 + if not content_json.get("error", False) and content_json.get("title") and content_json.get("content"): + # 成功获取有效内容 + if retry_count > 0: + logging.info(f"在第{retry_count}次重试后成功获取有效内容") + # 返回成功结果 + return content_json, user_prompt + else: + logging.warning(f"内容解析失败或内容不完整,结果: {content_json.get('error')}, 标题长度: {len(content_json.get('title', ''))}, 内容长度: {len(content_json.get('content', ''))}") + + # 如果到这里,说明内容生成或解析有问题,需要重试 + retry_count += 1 + + except Exception as e: + logging.exception(f"Error during content generation attempt {retry_count+1} for {article_index}_{variant_index}:") + retry_count += 1 + + if retry_count <= max_retries: + logging.info(f"将尝试第{retry_count}次重试...") + else: + logging.error(f"达到最大重试次数({max_retries}),无法生成有效内容") + + # 所有重试都失败,返回最后一次的结果(即使不完整) + logging.warning(f"在{max_retries}次尝试后仍未生成有效内容,返回最后一次结果") + + # 如果有最后一次结果,尝试使用它 + if last_result: + try: + tweet_content = tweetContent(last_result, user_prompt, run_id, article_index, variant_index) + content_json = tweet_content.get_json_data() + return content_json, user_prompt + except Exception as e: + logging.exception(f"Error processing last result: {e}") + + # 完全失败的情况,返回空内容 + return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompts_dir, resource_dir, variants=2, temperature=0.3, start_index=0, end_index=None): @@ -457,8 +499,8 @@ def generate_content_for_topic(ai_agent: AI_Agent, logging.info("成功获取产品资料,初始化ContentJudger...") # 从配置中读取系统提示词路径(脚本级别无法直接获取,需要传递) # 使用ai_agent的model_name或api_url判断是否使用主AI模型,避免额外资源占用 - content_judger_system_prompt_path = prompt_manager._system_prompt_cache.get("judger_system_prompt") - content_judger = ContentJudger(ai_agent, system_prompt_path=content_judger_system_prompt_path) + content_judger_system_prompt = prompt_manager._system_prompt_cache.get("judger_system_prompt") + content_judger = ContentJudger(ai_agent, system_prompt=content_judger_system_prompt) else: logging.warning("未能获取产品资料,内容审核功能将被跳过") enable_content_judge = False @@ -521,9 +563,9 @@ def generate_content_for_topic(ai_agent: AI_Agent, content_json["judged"] = True # 添加judge_success状态 content_json["judge_success"] = judged_result.get("judge_success", False) - # 可选:保存审核分析结果 - if "不良内容分析" in judged_result: - content_json["judge_analysis"] = judged_result["不良内容分析"] + # 处理分析结果,优先使用"analysis"字段,兼容"不良内容分析"字段 + if "analysis" in judged_result: + content_json["judge_analysis"] = judged_result["analysis"] else: logging.warning(f" 审核结果缺少title或content字段,保留原内容") content_json["judge_success"] = False