修复了曾导致提取失败的审核器问题

This commit is contained in:
jinye_huang 2025-05-12 09:55:18 +08:00
parent cd44bbda6f
commit 8085a9a723
6 changed files with 169 additions and 39 deletions

View File

@ -51,9 +51,9 @@ class ContentJudger:
3. 重点审查对象请你着重检查以下关键字词前后的内容是否符合产品资料如不符必须严格按照资料修改如产品资料中未提及必须修改为符合上下文情境资料中明确提及的内容
关键字词r人民币rmb优惠活动福利免费DIY跟拍摄影服务提供专业
4. 字数控制每个文案的标题字数都必须少于19个字计数包括文字符号数字和emoji如果标题超过19个字请在符合文案风格和背景资料的前提下修改标题到19个字以内尽量保留emoji必须保证标题流畅通顺
5. 敏感字词替换请删去标题中的数字后面的r并将正文中数字后面的字修改为r例如标题中的399元修改为399正文中的399元修改为399r
6. 特征语句保留请保留文案中原本的引流语句不要修改或删除例如先关zhu+留下99看到会回复
7. 面向人群保留请尽量保留文案原本的面向人群和风格这是同一产品面向多种人群营销的策略例如产品资料中写明亲子游时文案写为情侣定制的山水秘境是可以接受的
5. 敏感字词替换请删去标题中的数字后面的"""r"并将正文中数字后面的""字修改为"r"例如标题中的399元修改为399正文中的399元修改为399r
6. 特征语句保留请保留文案中原本的引流语句不要修改或删除例如"先关zhu+留下99看到会回复"
7. 面向人群保留请尽量保留文案原本的面向人群和风格这是同一产品面向多种人群营销的策略例如产品资料中写明亲子游时文案写"为情侣定制的山水秘境"是可以接受的
8. 案例如下请参考案例评判真假信息的尺度逐行逐句仔细分析不符点和修改思路并按照分析思路落实对每一处不符的修改措施严格审查每一篇文案
{
"产品资料"
@ -126,7 +126,7 @@ class ContentJudger:
输出结果:
{ "不良内容分析" : "
1观察文案标题和内容可以看出此文案主要面向亲子出游人群因此修改后的文案也应该围绕亲子出游这一主题
2文章标题字数为28个字超过19个字因此属于不符内容由于要求中提到尽量保留emoji并且标题中数字后面的字应删去所以修改为五一遛娃👶必囤喜来登1088景观房
2文章标题字数为28个字超过19个字因此属于不符内容由于要求中提到尽量保留emoji并且标题中数字后面的""字应删去所以修改为五一遛娃👶必囤喜来登1088景观房
3产品资料中未提及儿童乐园开放时间和儿童乐园配置但文案中提到儿童乐园10:00-20:00全程开放滑梯/积木/绘本一应俱全因此属于不符内容应修改为儿童乐园免费儿童乐园和丰富的游乐设施让孩子们可以尽情玩耍
4产品材料中未提及户外泳池开放时间和消毒频次但文案中提到户外泳池9:00-18:00恒温开放五一期间每日消毒3次因此属于不符内容应修改为户外泳池酒店配有户外无边泳池供大人小孩一同享受清凉时光
5产品材料中未提及健身房开放时间与具体细节但文案中提到健身房8:00-22:00配备亲子瑜伽课程需提前预约因此属于不符内容应修改为健身房酒店提供免费健身中心方便您和家人一起强身健体
@ -174,7 +174,7 @@ class ContentJudger:
presence_penalty: 存在惩罚参数
Returns:
dict: 审核后的结果JSON包含修改后的title和content
dict: 审核后的结果JSON包含修改后的title和content以及judge_success状态
"""
logging.info("开始内容审核流程")
# 构建用户提示词
@ -198,16 +198,43 @@ class ContentJudger:
end_time = time.time()
logging.info(f"AI模型响应完成耗时{end_time - start_time:.2f}")
# 保存原始响应用于调试
response_log_dir = "/root/autodl-tmp/TravelContentCreator/log/judge_responses"
os.makedirs(response_log_dir, exist_ok=True)
response_log_file = f"{response_log_dir}/response_{int(time.time())}.txt"
with open(response_log_file, "w", encoding="utf-8") as f:
f.write(result)
logging.info(f"原始响应已保存到: {response_log_file}")
# 提取修改后的内容
modified_content = self._extract_modified_content(result)
if modified_content:
logging.info("成功提取修改后的内容")
# 添加judge_success字段
modified_content["judge_success"] = True
return modified_content
else:
return {"title": "提取失败", "content": "无法从响应中提取有效内容"}
logging.error("无法从响应中提取有效内容")
# 尝试使用原始内容并标记审核失败
if isinstance(content, dict) and "title" in content and "content" in content:
return {
"title": content.get("title", "提取失败"),
"content": content.get("content", "无法从响应中提取有效内容"),
"judge_success": False
}
return {
"title": "提取失败",
"content": "无法从响应中提取有效内容",
"judge_success": False
}
except Exception as e:
return {"title": "审核失败", "content": f"审核过程中出错: {str(e)}"}
logging.exception(f"审核过程中出错: {e}")
return {
"title": "审核失败",
"content": f"审核过程中出错: {str(e)}",
"judge_success": False
}
def _build_user_prompt(self, product_info, content_gen):
"""
@ -229,21 +256,106 @@ class ContentJudger:
"""
def _extract_modified_content(self, result_text):
"""从检测结果文本中提取修改后的文案内容"""
try:
processed_text = result_text # Work on a copy of the input text
# 记录原始文本前100个字符用于调试
logging.debug(f"原始响应文本前100字符: {result_text[:100]}")
if "</think>" in processed_text:
processed_text = processed_text.split("</think>", 1)[1].strip()
logging.debug("检测到</think>标签并分离内容")
# Attempt 1: Parse as JSON from the processed text
json_start = processed_text.find('{')
json_end = processed_text.rfind('}') + 1
if json_start >= 0 and json_end > json_start:
json_str = processed_text[json_start:json_end]
logging.debug(f"找到JSON字符串长度: {len(json_str)}前100字符: {json_str[:100]}")
# Clean control characters that might break JSON parsing
json_str_cleaned = re.sub(r'[\x00-\x1F\x7F]', '', json_str)
try:
content_json = json.loads(json_str_cleaned)
if "title" in content_json and "content" in content_json:
logging.info("Successfully parsed JSON content from AI response.")
return {
"title": content_json["title"].strip(),
"content": content_json["content"].strip()
}
except json.JSONDecodeError as e:
logging.warning(f"JSON parsing failed for substring: '{json_str_cleaned[:100]}...'. Error: {e}. Will attempt regex extraction.")
# Attempt 2: Regex on the processed_text (which might have had </think> stripped)
# 修复正则表达式,移除多余的反斜杠
logging.debug("尝试使用正则表达式提取")
title_match = re.search(r'"title":\s*"([^"]*)"', processed_text)
content_match = re.search(r'"content":\s*"([^"]*)"', processed_text)
if title_match and content_match:
logging.info("Successfully extracted title/content using regex.")
return {
"title": title_match.group(1).strip(),
"content": content_match.group(1).strip()
}
# Attempt 3: Try finding content with single quotes
logging.debug("尝试查找使用单引号的内容")
title_match = re.search(r'"title":\s*\'([^\']*)\'', processed_text)
content_match = re.search(r'"content":\s*\'([^\']*)\'', processed_text)
if title_match and content_match:
logging.info("Successfully extracted title/content using single-quote regex.")
return {
"title": title_match.group(1).strip(),
"content": content_match.group(1).strip()
}
# Final attempt: Look for key-value pairs without standard JSON formatting
logging.debug("尝试非标准格式提取")
title_pattern = re.compile(r'["""]?title["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE)
content_pattern = re.compile(r'["""]?content["""]?[:]\s*["""]([^"""]+)["""]', re.IGNORECASE)
title_match = title_pattern.search(processed_text)
content_match = content_pattern.search(processed_text)
if title_match and content_match:
logging.info("提取到标题和内容(使用灵活模式匹配)")
return {
"title": title_match.group(1).strip(),
"content": content_match.group(1).strip()
}
logging.warning(f"所有提取方法失败响应前300字符: {processed_text[:300]}...")
return None # Fallback if all extraction methods fail
except Exception as e:
logging.error(f"Unexpected error during content extraction: {e}\n{traceback.format_exc()}")
return None
def test_extraction_from_file(self, response_file_path):
    """Read a saved AI response from disk and run content extraction on it.

    Debug helper for replaying previously logged raw responses through
    ``_extract_modified_content``.

    Args:
        response_file_path: Path to a saved raw-response text file
            (UTF-8 encoded).

    Returns:
        dict: ``{"success": True, "result": <extracted dict>}`` on success,
        or ``{"success": False, "error": <message>}`` on failure.
    """
    # NOTE(review): the extracted diff interleaved old and new lines here;
    # this body reconstructs the post-commit version of the method.
    try:
        logging.info(f"从文件测试提取: {response_file_path}")
        with open(response_file_path, 'r', encoding='utf-8') as f:
            response_text = f.read()
        result = self._extract_modified_content(response_text)
        if result:
            logging.info(f"成功从文件提取内容: {result.get('title', '')[:30]}...")
            return {"success": True, "result": result}
        else:
            logging.error(f"从文件中提取内容失败")
            return {"success": False, "error": "提取失败"}
    except Exception as e:
        logging.exception(f"测试提取时发生错误: {e}")
        return {"success": False, "error": str(e)}

View File

@ -1,6 +1,7 @@
import os
import random
import json
import logging
class ResourceLoader:
"""资源加载器,用于加载提示词和参考资料"""
@ -13,11 +14,11 @@ class ResourceLoader:
content = f.read()
return content
else:
print(f"文件不存在: {file_path}")
logging.warning(f"文件不存在: {file_path}")
# Return None for non-existent file to distinguish from empty file
return None
except Exception as e:
print(f"加载文件 '{file_path}' 内容失败: {e}")
logging.warning(f"加载文件 '{file_path}' 内容失败: {e}")
# Return None on error as well
return None
@ -26,10 +27,10 @@ class ResourceLoader:
"""加载Refer目录下的指定文件内容"""
refer_content = ""
if not file_path or not os.path.isfile(file_path):
print(f"Warning: Refer directory '{file_path}' not found or invalid.")
logging.warning(f"Warning: Refer directory '{file_path}' not found or invalid.")
return ""
try:
if True: # print(file_path)
if True:
if os.path.isfile(file_path) and file_path.endswith(".txt"):
# Use the updated load_file_content
content = ResourceLoader.load_file_content(file_path)
@ -49,7 +50,7 @@ class ResourceLoader:
# 检查必要的键是否存在
if "title" not in file_content or "description" not in file_content or "examples" not in file_content:
print(f"Warning: JSON文件 '{file_path}' 缺少必要的键(title/description/examples)")
logging.warning(f"Warning: JSON文件 '{file_path}' 缺少必要的键(title/description/examples)")
title_content = file_content["title"]
description_content = file_content["description"]
@ -66,12 +67,12 @@ class ResourceLoader:
refer_content += f"## {file_path}\n{content}\n\n"
else:
print(f"Warning: JSON文件 '{file_path}' 的examples不是有效列表")
logging.warning(f"Warning: JSON文件 '{file_path}' 的examples不是有效列表")
except Exception as json_err:
print(f"处理JSON文件 '{file_path}' 失败: {json_err}")
logging.warning(f"处理JSON文件 '{file_path}' 失败: {json_err}")
return refer_content
except Exception as e:
print(f"加载Refer目录文件失败: {e}")
logging.warning(f"加载Refer目录文件失败: {e}")
return ""
@staticmethod
@ -98,7 +99,7 @@ class ResourceLoader:
return None
except Exception as e:
print(f"查找文件 '{file_name}''{directory}' 失败: {e}")
logging.warning(f"查找文件 '{file_name}''{directory}' 失败: {e}")
return None
@staticmethod
@ -125,7 +126,7 @@ class ResourceLoader:
f.write(f"```\n{result}\n```\n\n")
f.write("--------------------------------\n\n")
except Exception as e:
print(f"更新汇总文件时出错: {e}")
logging.warning(f"更新汇总文件时出错: {e}")
@staticmethod
def save_article(result, prompt, output_dir, run_id, article_index, variant_index):
@ -145,5 +146,5 @@ class ResourceLoader:
return filepath
except Exception as e:
print(f"保存文章时出错: {e}")
logging.warning(f"保存文章时出错: {e}")
return None

View File

@ -88,14 +88,22 @@ class tweetContent:
json_data = json.loads(processed_result)
json_data["error"] = False
json_data["raw_result"] = None
# 确保judge_success字段存在
if "judge_success" not in json_data:
json_data["judge_success"] = None
return json_data
# --- End Existing Logic ---
except Exception as e:
logging.warning(f"解析内容时出错: {e}, 返回空字符串")
json_data["error"] = True
json_data["raw_result"] = e
return json_data
logging.warning(f"解析内容时出错: {e}, 使用默认空内容")
# 创建一个新的json_data而不是使用未定义的变量
return {
"title": "",
"content": "",
"error": True,
"raw_result": str(e),
"judge_success": False
}
def get_json_data(self):
"""Returns the generated JSON data dictionary."""
@ -159,7 +167,7 @@ def generate_single_content(ai_agent, system_prompt, user_prompt, item, run_id,
if result is None: # Check if AI call failed
logging.error(f"AI agent work failed for {article_index}_{variant_index}. No result returned.")
return {"title": "", "content": "", "error": True}, user_prompt # 返回空字段而不是None
return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt # 添加judge_success字段
logging.info(f"Content generation for {article_index}_{variant_index} completed in {time_cost:.2f}s. Estimated tokens: {tokens}")
@ -185,13 +193,13 @@ def generate_single_content(ai_agent, system_prompt, user_prompt, item, run_id,
except Exception as e:
logging.exception(f"Error generating single content for {article_index}_{variant_index}:")
return {"title": "", "content": "", "error": True}, user_prompt # 返回空字段而不是None
return {"title": "", "content": "", "error": True, "judge_success": False}, user_prompt # 添加judge_success字段
def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompts_dir, resource_dir,
variants=2, temperature=0.3, start_index=0, end_index=None):
"""根据选题生成内容"""
if not topics:
print("没有选题,无法生成内容")
logging.warning("没有选题,无法生成内容")
return
# 确定处理范围
@ -199,7 +207,7 @@ def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompt
end_index = len(topics)
topics_to_process = topics[start_index:end_index]
print(f"准备处理{len(topics_to_process)}个选题...")
logging.info(f"准备处理{len(topics_to_process)}个选题...")
# 创建汇总文件
# summary_file = ResourceLoader.create_summary_file(output_dir, run_id, len(topics_to_process))
@ -207,11 +215,11 @@ def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompt
# 处理每个选题
processed_results = []
for i, item in enumerate(topics_to_process):
print(f"处理第 {i+1}/{len(topics_to_process)} 篇文章")
logging.info(f"处理第 {i+1}/{len(topics_to_process)} 篇文章")
# 为每个选题生成多个变体
for j in range(variants):
print(f"正在生成变体 {j+1}/{variants}")
logging.info(f"正在生成变体 {j+1}/{variants}")
# 调用单篇文章生成函数
tweet_content, result = generate_single_content(
@ -225,7 +233,7 @@ def generate_content(ai_agent, system_prompt, topics, output_dir, run_id, prompt
# if j == 0:
# ResourceLoader.update_summary(summary_file, i+1, user_prompt, result)
print(f"完成{len(processed_results)}篇文章生成")
logging.info(f"完成{len(processed_results)}篇文章生成")
return processed_results
@ -520,15 +528,24 @@ content: {content_json.get('content', '')}
content_json["content"] = judged_result["content"]
# 添加审核标记
content_json["judged"] = True
# 添加judge_success状态
content_json["judge_success"] = judged_result.get("judge_success", False)
# 可选:保存审核分析结果
if "不良内容分析" in judged_result:
content_json["judge_analysis"] = judged_result["不良内容分析"]
else:
logging.warning(f" 审核结果缺少title或content字段保留原内容")
content_json["judge_success"] = False
else:
logging.warning(f" 内容审核返回无效结果,保留原内容")
content_json["judge_success"] = False
except Exception as judge_err:
logging.exception(f" 内容审核过程出错: {judge_err},保留原内容")
content_json["judge_success"] = False
else:
# 未启用内容审核时,添加相应标记
content_json["judged"] = False
content_json["judge_success"] = None
# Use the output handler to process/save the result
output_handler.handle_content_variant(
@ -859,7 +876,7 @@ def generate_posters_for_topic(topic_item: dict,
collage_img = collage_images[0] # 获取第一个 PIL Image
used_image_files = used_image_filenames[0] if used_image_filenames else [] # 获取使用的图片文件名
logging.info(f"Collage image generated successfully (in memory). Used images: {used_image_files}")
print(f"拼贴图使用的图片文件: {used_image_files}")
logging.info(f"拼贴图使用的图片文件: {used_image_files}")
# --- 使用 Handler 保存 Collage 图片和使用的图片文件信息 ---
output_handler.handle_generated_image(