修复了曾导致提取失败的审核器问题

This commit is contained in:
jinye_huang 2025-05-12 09:55:18 +08:00
parent cd44bbda6f
commit 8085a9a723
6 changed files with 169 additions and 39 deletions

View File

@ -51,9 +51,9 @@ class ContentJudger:
3. 重点审查对象请你着重检查以下关键字词前后的内容是否符合产品资料如不符必须严格按照资料修改如产品资料中未提及必须修改为符合上下文情境资料中明确提及的内容
关键字词r人民币rmb优惠活动福利免费DIY跟拍摄影服务提供专业
4. 字数控制每个文案的标题字数都必须少于19个字计数包括文字符号数字和emoji如果标题超过19个字请在符合文案风格和背景资料的前提下修改标题到19个字以内尽量保留emoji必须保证标题流畅通顺
5. 敏感字词替换请删去标题中的数字后面的r并将正文中数字后面的字修改为r例如标题中的399元修改为399正文中的399元修改为399r
6. 特征语句保留请保留文案中原本的引流语句不要修改或删除例如先关zhu+留下99看到会回复
7. 面向人群保留请尽量保留文案原本的面向人群和风格这是同一产品面向多种人群营销的策略例如产品资料中写明亲子游时文案写为情侣定制的山水秘境是可以接受的
5. 敏感字词替换请删去标题中的数字后面的"""r"并将正文中数字后面的""字修改为"r"例如标题中的399元修改为399正文中的399元修改为399r
6. 特征语句保留请保留文案中原本的引流语句不要修改或删除例如"先关zhu+留下99看到会回复"
7. 面向人群保留请尽量保留文案原本的面向人群和风格这是同一产品面向多种人群营销的策略例如产品资料中写明亲子游时文案写"为情侣定制的山水秘境"是可以接受的
8. 案例如下请参考案例评判真假信息的尺度逐行逐句仔细分析不符点和修改思路并按照分析思路落实对每一处不符的修改措施严格审查每一篇文案
{
"产品资料"
@ -126,7 +126,7 @@ class ContentJudger:
输出结果:
{ "不良内容分析" : "
1观察文案标题和内容可以看出此文案主要面向亲子出游人群因此修改后的文案也应该围绕亲子出游这一主题
2文章标题字数为28个字超过19个字因此属于不符内容由于要求中提到尽量保留emoji并且标题中数字后面的字应删去所以修改为五一遛娃👶必囤喜来登1088景观房
2文章标题字数为28个字超过19个字因此属于不符内容由于要求中提到尽量保留emoji并且标题中数字后面的""字应删去所以修改为五一遛娃👶必囤喜来登1088景观房
3产品资料中未提及儿童乐园开放时间和儿童乐园配置但文案中提到儿童乐园10:00-20:00全程开放滑梯/积木/绘本一应俱全因此属于不符内容应修改为儿童乐园免费儿童乐园和丰富的游乐设施让孩子们可以尽情玩耍
4产品材料中未提及户外泳池开放时间和消毒频次但文案中提到户外泳池9:00-18:00恒温开放五一期间每日消毒3次因此属于不符内容应修改为户外泳池酒店配有户外无边泳池供大人小孩一同享受清凉时光
5产品材料中未提及健身房开放时间与具体细节但文案中提到健身房8:00-22:00配备亲子瑜伽课程需提前预约因此属于不符内容应修改为健身房酒店提供免费健身中心方便您和家人一起强身健体
@ -174,7 +174,7 @@ class ContentJudger:
presence_penalty: 存在惩罚参数
Returns:
dict: 审核后的结果JSON包含修改后的title和content
dict: 审核后的结果JSON包含修改后的title和content以及judge_success状态
"""
logging.info("开始内容审核流程")
# 构建用户提示词
@ -198,16 +198,43 @@ class ContentJudger:
end_time = time.time()
logging.info(f"AI模型响应完成耗时{end_time - start_time:.2f}")
# 保存原始响应用于调试
response_log_dir = "/root/autodl-tmp/TravelContentCreator/log/judge_responses"
os.makedirs(response_log_dir, exist_ok=True)
response_log_file = f"{response_log_dir}/response_{int(time.time())}.txt"
with open(response_log_file, "w", encoding="utf-8") as f:
f.write(result)
logging.info(f"原始响应已保存到: {response_log_file}")
# 提取修改后的内容
modified_content = self._extract_modified_content(result)
if modified_content:
logging.info("成功提取修改后的内容")
# 添加judge_success字段
modified_content["judge_success"] = True
return modified_content
else:
return {"title": "提取失败", "content": "无法从响应中提取有效内容"}
logging.error("无法从响应中提取有效内容")
# 尝试使用原始内容并标记审核失败
if isinstance(content, dict) and "title" in content and "content" in content:
return {
"title": content.get("title", "提取失败"),
"content": content.get("content", "无法从响应中提取有效内容"),
"judge_success": False
}
return {
"title": "提取失败",
"content": "无法从响应中提取有效内容",
"judge_success": False
}
except Exception as e:
return {"title": "审核失败", "content": f"审核过程中出错: {str(e)}"}
logging.exception(f"审核过程中出错: {e}")
return {
"title": "审核失败",
"content": f"审核过程中出错: {str(e)}",
"judge_success": False
}
def _build_user_prompt(self, product_info, content_gen):
"""
@ -229,21 +256,106 @@ class ContentJudger:
"""
def _extract_modified_content(self, result_text):
"""从检测结果文本中提取修改后的文案内容"""
try:
processed_text = result_text # Work on a copy of the input text
# 记录原始文本前100个字符用于调试
logging.debug(f"原始响应文本前100字符: {result_text[:100]}")
if "</think>" in processed_text:
processed_text = processed_text.split("</think>", 1)[1].strip()
logging.debug("检测到</think>标签并分离内容")
# Attempt 1: Parse as JSON from the processed text
json_start = processed_text.find('{')
json_end = processed_text.rfind('}') + 1
if json_start >= 0 and json_end > json_start:
json_str = processed_text[json_start:json_end]
logging.debug(f"找到JSON字符串长度: {len(json_str)}前100字符: {json_str[:100]}")
# Clean control characters that might break JSON parsing
json_str_cleaned = re.sub(r'[\x00-\x1F\x7F]', '', json_str)
try:
content_json = json.loads(json_str_cleaned)
if "title" in content_json and "content" in content_json:
logging.info("Successfully parsed JSON content from AI response.")
return {
"title": content_json["title"].strip(),
"content": content_json["content"].strip()
}
except json.JSONDecodeError as e:
logging.warning(f"JSON parsing failed for substring: '{json_str_cleaned[:100]}...'. Error: {e}. Will attempt regex extraction.")
# Attempt 2: Regex on the processed_text (which might have had </think> stripped)
# 修复正则表达式,移除多余的反斜杠
logging.debug("尝试使用正则表达式提取")
title_match = re.search(r'"title":\s*"([^"]*)"', processed_text)
content_match = re.search(r'"content":\s*"([^"]*)"', processed_text)
if title_match and content_match:
logging.info("Successfully extracted title/content using regex.")
return {
"title": title_match.group(1).strip(),
"content": content_match.group(1).strip()
}
# Attempt 3: Try finding content with single quotes
logging.debug("尝试查找使用单引号的内容")
title_match = re.search(r'"title":\s*\'([^\']*)\'', processed_text)
content_match = re.search(r'"content":\s*\'([^\']*)\'', processed_text)
if title_match and content_match:
logging.info("Successfully extracted title/content using single-quote regex.")
return {
"title": title_match.group(1).strip(),
"content": content_match.group(1).strip()
}
# Final attempt: Look for key-value pairs without standard JSON formatting
logging.debug("尝试非标准格式提取")
title_pattern = re.compile(r'["""]?title["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE)
content_pattern = re.compile(r'["""]?content["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE)
title_match = title_pattern.search(processed_text)
content_match = content_pattern.search(processed_text)
if title_match and content_match:
logging.info("提取到标题和内容(使用灵活模式匹配)")
return {
"title": title_match.group(1).strip(),
"content": content_match.group(1).strip()
}
logging.warning(f"所有提取方法失败响应前300字符: {processed_text[:300]}...")
return None # Fallback if all extraction methods fail
except Exception as e:
logging.error(f"Unexpected error during content extraction: {e}\n{traceback.format_exc()}")
return None
def test_extraction_from_file(self, response_file_path):
    """Read a saved AI response from disk and run content extraction on it.

    Debug helper for replaying previously logged raw responses through
    ``_extract_modified_content``.

    Args:
        response_file_path: Path to a saved raw-response text file
            (UTF-8 encoded).

    Returns:
        dict: ``{"success": True, "result": <extracted dict>}`` on success,
        or ``{"success": False, "error": <message>}`` on failure.
    """
    # NOTE(review): the extracted diff interleaved old and new lines here;
    # this body reconstructs the post-commit version of the method.
    try:
        logging.info(f"从文件测试提取: {response_file_path}")
        with open(response_file_path, 'r', encoding='utf-8') as f:
            response_text = f.read()
        result = self._extract_modified_content(response_text)
        if result:
            logging.info(f"成功从文件提取内容: {result.get('title', '')[:30]}...")
            return {"success": True, "result": result}
        else:
            logging.error(f"从文件中提取内容失败")
            return {"success": False, "error": "提取失败"}
    except Exception as e:
        logging.exception(f"测试提取时发生错误: {e}")
        return {"success": False, "error": str(e)}

View File

@ -1,6 +1,7 @@
import os
import random
import json
import logging
class ResourceLoader:
"""资源加载器,用于加载提示词和参考资料"""
@ -13,11 +14,11 @@ class ResourceLoader:
content = f.read()
return content
else:
print(f"文件不存在: {file_path}")
logging.warning(f"文件不存在: {file_path}")
# Return None for non-existent file to distinguish from empty file
return None
except Exception as e:
print(f"加载文件 '{file_path}' 内容失败: {e}")
logging.warning(f"加载文件 '{file_path}' 内容失败: {e}")
# Return None on error as well
return None
@ -26,10 +27,10 @@ class ResourceLoader:
"""加载Refer目录下的指定文件内容"""
refer_content = ""
if not file_path or not os.path.isfile(file_path):
print(f"Warning: Refer directory '{file_path}' not found or invalid.")
logging.warning(f"Warning: Refer directory '{file_path}' not found or invalid.")
return ""
try:
if True: # print(file_path)
if True:
if os.path.isfile(file_path) and file_path.endswith(".txt"):
# Use the updated load_file_content
content = ResourceLoader.load_file_content(file_path)
@ -49,7 +50,7 @@ class ResourceLoader:
# 检查必要的键是否存在
if "title" not in file_content or "description" not in file_content or "examples" not in file_content:
print(f"Warning: JSON文件 '{file_path}' 缺少必要的键(title/description/examples)")
logging.warning(f"Warning: JSON文件 '{file_path}' 缺少必要的键(title/description/examples)")
title_content = file_content["title"]
description_content = file_content["description"]
@ -66,12 +67,12 @@ class ResourceLoader:
refer_content += f"## {file_path}\n{content}\n\n"
else:
print(f"Warning: JSON文件 '{file_path}' 的examples不是有效列表")
logging.warning(f"Warning: JSON文件 '{file_path}' 的examples不是有效列表")
except Exception as json_err:
print(f"处理JSON文件 '{file_path}' 失败: {json_err}")
logging.warning(f"处理JSON文件 '{file_path}' 失败: {json_err}")
return refer_content
except Exception as e:
print(f"加载Refer目录文件失败: {e}")
logging.warning(f"加载Refer目录文件失败: {e}")
return ""
@staticmethod
@ -98,7 +99,7 @@ class ResourceLoader:
return None
except Exception as e:
print(f"查找文件 '{file_name}''{directory}' 失败: {e}")
logging.warning(f"查找文件 '{file_name}''{directory}' 失败: {e}")
return None
@staticmethod
@ -125,7 +126,7 @@ class ResourceLoader:
f.write(f"```\n{result}\n```\n\n")
f.write("--------------------------------\n\n")
except Exception as e:
print(f"更新汇总文件时出错: {e}")
logging.warning(f"更新汇总文件时出错: {e}")
@staticmethod
def save_article(result, prompt, output_dir, run_id, article_index, variant_index):
@ -145,5 +146,5 @@ class ResourceLoader:
return filepath
except Exception as e:
print(f"保存文章时出错: {e}")
logging.warning(f"保存文章时出错: {e}")
return None

View File

@ -88,14 +88,22 @@ class tweetContent:
json_data = json.loads(processed_result)
json_data["error"] = False
json_data["raw_result"] = None
# 确保judge_success字段存在
if "judge_success" not in json_data:
json_data["judge_success"] = None
return json_data
# --- End Existing Logic ---
except Exception as e:
logging.warning(f"解析内容时出错: {e}, 返回空字符串")
json_data["error"] = True
json_data["raw_result"] = e
return json_data
logging.warning(f"解析内容时出错: {e}, 使用默认空内容")
# 创建一个新的json_data而不是使用未定义的变量
return {
"title": "",
"content": "",
"error": True,
"raw_result": str(e),
"judge_success": False
}
def get_json_data(self):
"""Returns the generated JSON data dictionary."""
@ -159,7 +167,7 @@ def generate_single_content(ai_agent, system_prompt, user_prompt, item, run_id,
if result is None: # Check if AI call failed
logging.error(f"AI agent work failed for {article_index}_{variant_index}. No result returned.")
return {"title": "", "content": "", "error": True}, user_prompt # 返回空字段而不是None
return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt # 添加judge_success字段
logging.info(f"Content generation for {article_index}_{variant_index} completed in {time_cost:.2f}s. Estimated tokens: {tokens}")
@ -185,13 +193,13 @@ def generate_single_content(ai_agent, system_prompt, user_prompt, item, run_id,
except Exception as e:
logging.exception(f"Error generating single content for {article_index}_{variant_index}:")
return {"title": "", "content": "", "error": True}, user_prompt # 返回空字段而不是None
return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt # 添加judge_success字段
def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompts_dir, resource_dir,
variants=2, temperature=0.3, start_index=0, end_index=None):
"""根据选题生成内容"""
if not topics:
print("没有选题,无法生成内容")
logging.warning("没有选题,无法生成内容")
return
# 确定处理范围
@ -199,7 +207,7 @@ def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompt
end_index = len(topics)
topics_to_process = topics[start_index:end_index]
print(f"准备处理{len(topics_to_process)}个选题...")
logging.info(f"准备处理{len(topics_to_process)}个选题...")
# 创建汇总文件
# summary_file = ResourceLoader.create_summary_file(output_dir, run_id, len(topics_to_process))
@ -207,11 +215,11 @@ def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompt
# 处理每个选题
processed_results = []
for i, item in enumerate(topics_to_process):
print(f"处理第 {i+1}/{len(topics_to_process)} 篇文章")
logging.info(f"处理第 {i+1}/{len(topics_to_process)} 篇文章")
# 为每个选题生成多个变体
for j in range(variants):
print(f"正在生成变体 {j+1}/{variants}")
logging.info(f"正在生成变体 {j+1}/{variants}")
# 调用单篇文章生成函数
tweet_content, result = generate_single_content(
@ -225,7 +233,7 @@ def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompt
# if j == 0:
# ResourceLoader.update_summary(summary_file, i+1, user_prompt, result)
print(f"完成{len(processed_results)}篇文章生成")
logging.info(f"完成{len(processed_results)}篇文章生成")
return processed_results
@ -520,15 +528,24 @@ content: {content_json.get('content', '')}
content_json["content"] = judged_result["content"]
# 添加审核标记
content_json["judged"] = True
# 添加judge_success状态
content_json["judge_success"] = judged_result.get("judge_success", False)
# 可选:保存审核分析结果
if "不良内容分析" in judged_result:
content_json["judge_analysis"] = judged_result["不良内容分析"]
else:
logging.warning(f" 审核结果缺少title或content字段保留原内容")
content_json["judge_success"] = False
else:
logging.warning(f" 内容审核返回无效结果,保留原内容")
content_json["judge_success"] = False
except Exception as judge_err:
logging.exception(f" 内容审核过程出错: {judge_err},保留原内容")
content_json["judge_success"] = False
else:
# 未启用内容审核时,添加相应标记
content_json["judged"] = False
content_json["judge_success"] = None
# Use the output handler to process/save the result
output_handler.handle_content_variant(
@ -859,7 +876,7 @@ def generate_posters_for_topic(topic_item: dict,
collage_img = collage_images[0] # 获取第一个 PIL Image
used_image_files = used_image_filenames[0] if used_image_filenames else [] # 获取使用的图片文件名
logging.info(f"Collage image generated successfully (in memory). Used images: {used_image_files}")
print(f"拼贴图使用的图片文件: {used_image_files}")
logging.info(f"拼贴图使用的图片文件: {used_image_files}")
# --- 使用 Handler 保存 Collage 图片和使用的图片文件信息 ---
output_handler.handle_generated_image(