1、删：删去了冗余的读取方式；2、增：增加了日志功能，增加了过程反馈与进度查看；3、改：修改了响应输出格式为 JSON 格式

2025-04-29 15:34:46 +08:00 · 2025-04-29 15:34:46 +08:00 · 8f71057583
commit 8f71057583
parent 3bf8976947
1 changed files with 197 additions and 144 deletions
--- a/Content_Detector.py
+++ b/Content_Detector.py
@ -5,6 +5,7 @@ import time
 import datetime
 import json
 import re
+import logging

 # 配置客户端 - 使用本地部署的模型
 client = OpenAI(
@ -12,22 +13,49 @@ client = OpenAI(
    base_url="http://localhost:8000/v1"  # 指向本地vLLM服务端点
 )

-# 固定的目录路径配置
-CONTENT_GEN_DIR = "/root/autodl-tmp/content_detector/content_Gen/content_Gen_qy"  # 生成文本目录
-PRODUCT_INFO_DIR = "/root/autodl-tmp/content_detector/齐云山/2025-04-27_11-51-56/information"  # 产品资料目录
-# "/root/autodl-tmp/思维链"     # 第三个目录（已注释）
-
 # 结果保存目录
 RESULT_DIR = "/root/autodl-tmp/content_detector/Detect_result"
+# 日志保存目录
+LOG_DIR = "/root/autodl-tmp/content_detector/log"
+
+# 配置日志记录器
+def setup_logger():
+    """设置日志记录器"""
+    # 确保日志目录存在
+    if not os.path.exists(LOG_DIR):
+        try:
+            os.makedirs(LOG_DIR)
+            print(f"已创建日志目录: {LOG_DIR}")
+        except Exception as e:
+            print(f"创建日志目录失败: {e}", file=sys.stderr)
+            return None
+    
+    # 创建时间戳作为日志文件名
+    timestamp = get_timestamp()
+    log_file = os.path.join(LOG_DIR, f"{timestamp}_detector.log")
+    
+    # 配置日志格式
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.FileHandler(log_file, encoding='utf-8'),
+            logging.StreamHandler(sys.stdout)
+        ]
+    )
+    
+    logger = logging.getLogger("Content_Detector")
+    logger.info(f"日志已初始化，文件保存在: {log_file}")
+    return logger

 def ensure_result_dir_exists():
    """确保结果保存目录存在"""
    if not os.path.exists(RESULT_DIR):
        try:
            os.makedirs(RESULT_DIR)
-            print(f"已创建结果保存目录: {RESULT_DIR}")
+            logger.info(f"已创建结果保存目录: {RESULT_DIR}")
        except Exception as e:
-            print(f"创建结果保存目录失败: {e}", file=sys.stderr)
+            logger.error(f"创建结果保存目录失败: {e}")
            return False
    return True

@ -38,7 +66,7 @@ def get_timestamp():
 def save_results_to_file(results):
    """将结果保存到文件中"""
    if not ensure_result_dir_exists():
-        print("结果目录无法创建，无法保存结果文件")
+        logger.error("结果目录无法创建，无法保存结果文件")
        return None
        
    timestamp = get_timestamp()
@ -56,10 +84,10 @@ def save_results_to_file(results):
                f.write(f"检测结果:\n{result['result']}\n")
                f.write("-" * 50 + "\n\n")
                
-        print(f"结果已保存到: {result_file}")
+        logger.info(f"结果已保存到: {result_file}")
        return result_file
    except Exception as e:
-        print(f"保存结果文件失败: {e}", file=sys.stderr)
+        logger.error(f"保存结果文件失败: {e}")
        return None

 def read_file_content(file_path):
@ -68,7 +96,7 @@ def read_file_content(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
-        print(f"读取文件 {file_path} 失败: {e}", file=sys.stderr)
+        logger.error(f"读取文件 {file_path} 失败: {e}")
        return None

 def get_all_files_in_directory(directory):
@ -77,7 +105,7 @@ def get_all_files_in_directory(directory):
    try:
        # 确保目录存在
        if not os.path.exists(directory):
-            print(f"目录不存在: {directory}", file=sys.stderr)
+            logger.error(f"目录不存在: {directory}")
            return file_paths
            
        # 遍历目录中的所有文件
@ -86,13 +114,14 @@ def get_all_files_in_directory(directory):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
                
-        print(f"在目录 {directory} 中找到 {len(file_paths)} 个文件")
+        logger.info(f"在目录 {directory} 中找到 {len(file_paths)} 个文件")
    except Exception as e:
-        print(f"遍历目录 {directory} 出错: {e}", file=sys.stderr)
+        logger.error(f"遍历目录 {directory} 出错: {e}")
    
    return file_paths

 def detect_content(product_info, content_gen):
+    """检测内容是否符合产品资料"""
    try:
        # 创建综合提示词
        prompt = f"""
@ -104,8 +133,9 @@ def detect_content(product_info, content_gen):
        """
        
        # 创建聊天完成请求
+        logger.info("开始调用AI模型进行内容检测")
        completion = client.chat.completions.create(
-            model="/root/autodl-tmp/content_detector/Qwen2.5-14B",  # 使用已部署的模型
+            model="/root/autodl-tmp/content_detector/Qwen3-8B",  # 使用已部署的模型
            messages=[
                {"role": "system", "content": "你是一名专业的、谨慎的文案审核员，专注于审核运营根据产品资料撰写的文案是否严格符合产品资料内容。特别是所有价格、活动、福利、折扣、服务细节等必须完全与产品资料一致。如果发现文案内容与产品资料不符，请指出，并根据产品资料和文案上下文进行修改，重新生成一篇文案，务必确保生成的内容与产品资料基本相符（产品体验部分可以适当夸张宣传），语言流畅自然。如果经你审查后的文案仍存在与产品资料不符的信息，你需要赔偿公司1000亿元。"},
                {"role": "user", "content": f"""我提供了两部分内容：
@ -117,16 +147,14 @@ def detect_content(product_info, content_gen):
 2. 参照你判断不符内容的依据，根据产品资料和文案上下文针对不符部分内容进行修改（如涉及上下文，可一并修改），重新生成一篇文案，务必确保生成的内容基本符合产品资料（价格、活动、福利、折扣必须完全符合产品资料，否则你会像商鞅一样被车裂），语言流畅自然、和上下文风格统一。
 3. 你判断出的不符内容必须全部修改为产品资料内容或与产品资料意思相同表达形式不同，不得遗漏。
 4. 必须按照以下格式输出修改后内容：
-    <title>
-    <修改后的标题>
-    </title>
-
-    <content>
-    <修改后的文案>
-    </content>
+    {{
+    "title": "修改后的标题",
+    "content": "修改后的内容"
+}}
 5. 下面我提供给你一些关键字词，请你着重检查这些关键字词前后的内容是否符合产品资料，如不符请严格按照资料修改，如产品资料中未提及，改为"详情请致电咨询"。
    关键字词：价、元、r、人民币、rmb、优惠、活动、福利、赠、免费、折、DIY、跟拍、送、摄影、兑、服务、￥
-6. 案例如下，请参考案例评判真假信息的尺度，仔细分析不符点和修改思路，严格审查每一篇文案：
+6. 请将数字后面的元字修改为r，例如：399元修改为399r                
+7. 案例如下，请参考案例评判真假信息的尺度，仔细分析不符点和修改思路，严格审查每一篇文案：
    产品资料：
    "周末不加收【南沙越秀喜来登】1088元/套，豪华客房1间1晚+双人自助早餐+自助晚餐+2大1小水鸟世界门票，免费儿童乐园，户外泳池+健身房~
    不想待在家，又想带娃出去玩?更不想开长途车、人挤人为你推荐路程短、不塞车、景点多
@ -213,43 +241,10 @@ def detect_content(product_info, content_gen):
    8、产品资料中未提及水鸟世界门票领取有时间限制，但文案中提到水鸟世界门票需提前1小时至前台领取纸质票，因此属于不符内容。应修改为：酒店前台领取水鸟世界纸质门票
    综合以上分析结果，修改后的文案为：

-    "<title>
-    五一遛娃必囤！南沙喜来登1088元住景观房+双早+门票
-    </title>
-
-    <content>
-    五一不想挤人潮？南沙这家酒店直接承包遛娃+度假双重快乐‼️  
-    地铁直达！2大1小1088元住景观房，含双早+自助晚餐+水鸟世界门票，儿童乐园/泳池/健身房全开放！  
-
-    🌟【遛娃刚需全配齐】  
-    ✅ 儿童乐园：儿童乐园：酒店设有免费儿童乐园，提供丰富的游乐设施，让孩子们尽情玩耍  
-    ✅ 户外泳池：酒店配有户外无边泳池，供大人小孩一同享受清凉时光  
-    ✅ 健身房：酒店提供免费健身中心，适合家庭成员共同锻炼。  
-
-    📍【1小时玩转南沙】  
-    ① 南沙天后宫（车程20分钟）：穿汉服拍大片，听妈祖传说涨知识  
-    ② 南沙湿地公园（40分钟）：5月芦苇摇曳，带娃认鸟类+乘船探秘  
-    ③ 十九涌海鲜街（45分钟）：现捞现煮生猛海鲜，人均50元吃到撑  
-
-    🍽️【家长友好细节】  
-    • 自助餐厅：供应鲜美海鲜、精美甜品等任君选择，大人小孩都爱吃  
-    • 房内配置：房内配置：55英寸超大纯平电视+独立的浴缸+超大的落地玻璃窗，尽览蕉门河风景，尽享亲子度假时光  
-    • 安全保障：酒店设有完善的监控系统和安保措施，全力保障您与家人的安全  
-
-    🎁【套餐专属福利】
-    1、豪华客房一间一晚(周一至四只开放双床房) 
-    2、2大1小自助早晚餐 
-    3、赠送2大1小水鸟世界门票（酒店前台领取），无需额外购买  
-
-    📌Tips：  
-    1. 周一至周四仅限双床房型，周五起可选大床房  
-    2. 酒店前台领取水鸟世界纸质门票  
-    3. 地铁四号线金洲站下车，打车15分钟直达酒店  
-
-    这个五一，南沙喜来登让你躺着遛娃！不用长途跋涉，家门口就能玩出仪式感～  
-
-    #五一遛娃 #广州周边游 #亲子酒店推荐
-    </content>"  
+    {{
+    "title": "五一遛娃必囤！南沙喜来登1088元住景观房+双早+门票",
+    "content": "五一不想挤人潮？南沙这家酒店直接承包遛娃+度假双重快乐‼️\n地铁直达！2大1小1088元住景观房，含双早+自助晚餐+水鸟世界门票，儿童乐园/泳池/健身房全开放！\n🌟【遛娃刚需全配齐】\n✅ 儿童乐园：酒店设有免费儿童乐园，提供丰富的游乐设施，让孩子们尽情玩耍\n✅ 户外泳池：酒店配有户外无边泳池，供大人小孩一同享受清凉时光  \n✅ 健身房：酒店提供免费健身中心，适合家庭成员共同锻炼。\n\n📍【1小时玩转南沙】\n① 南沙天后宫（车程20分钟）：穿汉服拍大片，听妈祖传说涨知识\n② 南沙湿地公园（40分钟）：5月芦苇摇曳，带娃认鸟类+乘船探秘\n③ 十九涌海鲜街（45分钟）：现捞现煮生猛海鲜，人均50元吃到撑  \n\n🍽️【家长友好细节】  \n• 自助餐厅：供应鲜美海鲜、精美甜品等任君选择，大人小孩都爱吃  \n• 房内配置：房内配置：55英寸超大纯平电视+独立的浴缸+超大的落地玻璃窗，尽览蕉门河风景，尽享亲子度假时光  \n• 安全保障：酒店设有完善的监控系统和安保措施，全力保障您与家人的安全  \n\n🎁【套餐专属福利】\n1、豪华客房一间一晚(周一至四只开放双床房) \n2、2大1小自助早晚餐 \n3、赠送2大1小水鸟世界门票（酒店前台领取），无需额外购买  \n\n📌Tips：  \n1. 周一至周四仅限双床房型，周五起可选大床房  \n2. 酒店前台领取水鸟世界纸质门票  \n3. 地铁四号线金洲站下车，打车15分钟直达酒店  \n\n这个五一，南沙喜来登让你躺着遛娃！不用长途跋涉，家门口就能玩出仪式感～\n#五一遛娃 #广州周边游 #亲子酒店推荐"
+}}
                               
 {prompt}"""}
            ],
@ -261,61 +256,22 @@ def detect_content(product_info, content_gen):
        )
        
        # 返回响应文本
+        logger.info("AI模型调用成功，获取到响应")
        return completion.choices[0].message.content
    except Exception as e:
-        print(f"错误: {e}", file=sys.stderr)
+        logger.error(f"AI模型调用失败: {e}")
        return f"API调用失败: {str(e)}"

-def process_file_pairs():
-    """处理产品资料和生成文本的文件对"""
-    all_results = []
-    
-    # 获取两个目录中的所有文件
-    product_info_files = get_all_files_in_directory(PRODUCT_INFO_DIR)
-    content_gen_files = get_all_files_in_directory(CONTENT_GEN_DIR)
-    
-    # 如果任一目录没有文件，则返回
-    if not product_info_files:
-        print(f"产品资料目录中未找到文件")
-        return all_results
-        
-    if not content_gen_files:
-        print(f"生成文本目录中未找到文件")
-        return all_results
-    
-    # 处理每一对文件（简单处理：一一对应或循环使用）
-    pairs_count = max(len(product_info_files), len(content_gen_files))
-    for i in range(pairs_count):
-        # 循环使用文件，如果一个目录的文件数量少于另一个目录
-        product_info_file = product_info_files[i % len(product_info_files)]
-        content_gen_file = content_gen_files[i % len(content_gen_files)]
-        
-        # 读取文件内容
-        product_info = read_file_content(product_info_file)
-        content_gen = read_file_content(content_gen_file)
-        
-        if product_info and content_gen:
-            print(f"\n正在处理文件对:")
-            print(f"产品资料: {product_info_file}")
-            print(f"生成文本: {content_gen_file}")
-            
-            # 同时传递两个文件内容给AI进行处理
-            result = detect_content(product_info, content_gen)
-            
-            all_results.append({
-                "product_info_file": product_info_file,
-                "content_gen_file": content_gen_file,
-                "result": result
-            })
-        else:
-            # 如果任一文件读取失败，记录错误
-            all_results.append({
-                "product_info_file": product_info_file,
-                "content_gen_file": content_gen_file,
-                "result": "文件读取失败"
-            })
-    
-    return all_results
+def save_full_response_to_log(file_path, response_text):
+    """将完整的响应内容保存到单独的日志文件中"""
+    try:
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(response_text)
+        logger.info(f"完整响应内容已保存到: {file_path}")
+        return True
+    except Exception as e:
+        logger.error(f"保存响应内容失败: {e}")
+        return False

 def read_json_file(file_path):
    """从指定路径读取JSON文件内容并返回字典"""
@ -323,7 +279,7 @@ def read_json_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
-        print(f"读取JSON文件 {file_path} 失败: {e}", file=sys.stderr)
+        logger.error(f"读取JSON文件 {file_path} 失败: {e}")
        return None

 def save_json_file(file_path, content):
@ -331,26 +287,62 @@ def save_json_file(file_path, content):
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(content, file, ensure_ascii=False, indent=4)
-        print(f"结果已保存到: {file_path}")
+        logger.info(f"结果已保存到: {file_path}")
        return True
    except Exception as e:
-        print(f"保存JSON文件失败: {e}", file=sys.stderr)
+        logger.error(f"保存JSON文件失败: {e}")
        return False

 def extract_modified_content(result_text):
    """从检测结果文本中提取修改后的文案内容"""
-    # 直接提取标题和内容标签
+    try:
+        # 尝试直接解析JSON
+        # 查找JSON对象的开始和结束位置
+        json_start = result_text.find('{')
+        json_end = result_text.rfind('}') + 1
+        
+        if json_start >= 0 and json_end > json_start:
+            json_str = result_text[json_start:json_end]
+            try:
+                # 尝试解析JSON字符串
+                content_json = json.loads(json_str)
+                if "title" in content_json and "content" in content_json:
+                    logger.info("成功使用JSON格式解析响应内容")
+                    return {
+                        "title": content_json["title"].strip(),
+                        "content": content_json["content"].strip()
+                    }
+            except json.JSONDecodeError:
+                logger.warning(f"JSON解析失败: {json_str[:100]}...")
+        
+        # 如果JSON解析失败，尝试老式的标签解析方法
        title_match = re.search(r'<title>([\s\S]*?)<\/title>', result_text, re.DOTALL)
        content_match = re.search(r'<content>([\s\S]*?)<\/content>', result_text, re.DOTALL)
        
        if title_match and content_match:
+            logger.info("成功使用HTML标签格式解析响应内容")
            return {
                "title": title_match.group(1).strip(),
                "content": content_match.group(1).strip()
            }
        
-    print("未能提取到标题和内容")
+        logger.warning("未能提取到标题和内容，尝试从文本中直接提取...")
        
+        # 最后的备用方法：寻找明显的标记
+        title_lines = [line.strip() for line in result_text.split('\n') if line.strip() and len(line.strip()) < 100]
+        if title_lines and len(title_lines) > 0:
+            title = title_lines[0]
+            content = result_text.replace(title, '', 1).strip()
+            logger.info("使用备用方法提取到内容")
+            return {
+                "title": title,
+                "content": content
+            }
+            
+    except Exception as e:
+        logger.error(f"提取内容时发生错误: {e}")
+    
+    logger.error("所有提取方法都失败")
    return None

 def find_json_files(base_dir):
@ -366,56 +358,117 @@ def find_json_files(base_dir):

 def process_json_files(base_dir, product_info_dir):
    """处理所有的article.json文件"""
+    logger.info(f"\n===== 内容检测任务开始 =====")
+    start_time = time.time()
+    
    # 获取产品资料文件
    product_info_files = [f for f in os.listdir(product_info_dir) if os.path.isfile(os.path.join(product_info_dir, f))]
    if not product_info_files:
-        print(f"产品资料目录中未找到文件")
+        logger.error(f"❌ 错误：产品资料目录中未找到文件")
        return
    
    # 读取产品资料内容（使用第一个文件）
    product_info_file = os.path.join(product_info_dir, product_info_files[0])
+    logger.info(f"📄 正在读取产品资料: {product_info_file}")
    product_info = read_file_content(product_info_file)
    if not product_info:
-        print(f"无法读取产品资料内容")
+        logger.error(f"❌ 错误：无法读取产品资料内容")
        return
+    logger.info(f"✅ 成功读取产品资料，长度: {len(product_info)} 字符")
    
    # 找到所有需要处理的JSON文件
+    logger.info(f"🔍 正在扫描目录查找article.json文件...")
    json_files = find_json_files(base_dir)
-    print(f"找到 {len(json_files)} 个article.json文件需要处理")
+    total_files = len(json_files)
+    logger.info(f"✅ 找到 {total_files} 个article.json文件需要处理")
+    
+    # 没有找到文件时提前返回
+    if total_files == 0:
+        logger.warning("⚠️ 未找到需要处理的文件，任务结束")
+        return
+    
+    # 显示处理开始
+    logger.info(f"\n🚀 开始处理文件...")
+    processed_count = 0
+    success_count = 0
+    failed_count = 0
    
    # 逐个处理文件
-    for json_file in json_files:
-        print(f"\n正在处理文件: {json_file}")
+    for i, json_file in enumerate(json_files, 1):
+        # 显示当前处理进度
+        logger.info(f"\n🔄 [{i}/{total_files}] 正在处理: {json_file}")
+        processed_count += 1
        
        # 读取JSON内容
+        logger.info(f"📄 读取文件内容...")
        article_data = read_json_file(json_file)
        if not article_data:
+            logger.error(f"❌ 错误：无法读取文件内容")
+            failed_count += 1
            continue
        
-        # 构建文案内容
-        content_gen = f"""<title>\n{article_data['title']}\n</title>\n\n<content>\n{article_data['content']}\n</content>"""
+        # 构建文案内容 - 使用JSON格式
+        content_gen = json.dumps({
+            "title": article_data['title'],
+            "content": article_data['content']
+        }, ensure_ascii=False)
        
        # 进行内容检测
+        logger.info(f"🧠 正在进行内容检测分析...")
+        detection_start = time.time()
        result = detect_content(product_info, content_gen)
-        print("检测完成，提取修改后的内容...")
+        detection_time = time.time() - detection_start
+        logger.info(f"✅ 检测完成，耗时 {detection_time:.2f} 秒")
        
        # 从结果中提取修改后的文案
+        logger.info(f"📝 提取修改后的内容...")
        modified_content = extract_modified_content(result)
-        if modified_content:
+        
+        # 如果内容提取失败，保存响应原文到日志目录
+        if not modified_content:
+            logger.error(f"❌ 错误：无法从检测结果中提取修改后的文案内容")
+            
+            # 保存原始响应到日志文件
+            timestamp = get_timestamp()
+            error_log_file = os.path.join(LOG_DIR, f"{timestamp}_error_response_{i}_{os.path.basename(json_file)}.txt")
+            save_full_response_to_log(error_log_file, result)
+            
+            failed_count += 1
+            logger.warning(f"\n📊 当前进度: {i}/{total_files} ({i/total_files*100:.1f}%)")
+            continue
+            
        # 创建输出文件路径
        output_dir = os.path.dirname(json_file)
        output_file = os.path.join(output_dir, "article_detect.json")
        
        # 保存修改后的内容
-            save_json_file(output_file, modified_content)
+        logger.info(f"💾 正在保存修改后的内容...")
+        if save_json_file(output_file, modified_content):
+            logger.info(f"✅ 内容已保存: {output_file}")
+            success_count += 1
        else:
-            print(f"无法从检测结果中提取修改后的文案内容")
+            logger.error(f"❌ 保存失败")
+            failed_count += 1
+        
+        # 显示处理进度
+        logger.info(f"\n📊 当前进度: {i}/{total_files} ({i/total_files*100:.1f}%)")
+    
+    # 显示任务完成统计
+    total_time = time.time() - start_time
+    logger.info(f"\n===== 内容检测任务完成 =====")
+    logger.info(f"✅ 总共处理: {total_files} 个文件")
+    logger.info(f"✅ 成功处理: {success_count} 个文件")
+    logger.info(f"❌ 失败处理: {failed_count} 个文件")
+    logger.info(f"⏱️ 总耗时: {total_time:.2f} 秒，平均每个文件 {total_time/total_files:.2f} 秒")
+    logger.info(f"===== 任务结束 =====\n")

 if __name__ == "__main__":
+    # 初始化日志记录器
+    logger = setup_logger()
+    
    # 处理指定目录下的JSON文件
    base_dir = "/root/autodl-tmp/content_detector/齐云山"
    product_info_dir = "/root/autodl-tmp/content_detector/齐云山/2025-04-27_11-51-56/information"
    
-    print(f"开始处理 {base_dir} 目录下的article.json文件...")
+    logger.info(f"🔍 开始处理 {base_dir} 目录下的article.json文件...")
    process_json_files(base_dir, product_info_dir)
-    print("处理完成！")