资料整合模块测试脚本

2025-07-17 13:57:20 +08:00 · 2025-07-17 13:57:20 +08:00 · fe5fe5e5e2
commit fe5fe5e5e2
parent 3a9e946297
1 changed files with 688 additions and 0 deletions
--- a/tests/test_integration_simple.py
+++ b/tests/test_integration_simple.py
@@ -0,0 +1,688 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 简化版离线整合测试脚本
 使用已有数据测试integration模块的不同效果
 调用真实AI进行内容整合
 """
 import os
 import json
 import asyncio
 from typing import Dict, List, Optional, Any
 from datetime import datetime
 from pathlib import Path
 from dataclasses import dataclass
 # Put the project root on sys.path so the project-local packages imported below can be found
 import sys
 # This script lives in tests/, so the project root is one level above the script's directory
 project_root = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(project_root))
 # 导入核心模块
 from core.ai.ai_agent import AIAgent
 from core.config import ConfigManager, AIModelConfig
 from utils.prompts import PromptTemplate
@dataclass
 class SimpleNote:
    """Simplified data model for a single note."""
    # Value of the "note_id" field of the offline data ('' when absent)
    note_id: str
    # Note title
    title: str
    # Text body of the note; may be empty
    content: str
    # Author name
    author: str
    # Like count (loaded from the "like_count" field)
    likes: int
    # Comment count (loaded from the "comment_count" field)
    comments: int
    # Share count (loaded from the "share_count" field)
    shares: int
    # Link to the note
    note_url: str
    # Image entries attached to the note (loaded from the "images" field)
    images: List[str]
@dataclass 
 class SimpleSearchResult:
    """Simplified search result: a keyword together with the notes found for it."""
    # Keyword the result is reported for
    keyword: str
    # Notes belonging to this result
    notes: List[SimpleNote]
    # Number of notes; get_search_result sets it to len(notes)
    total_count: int
    # Success flag; get_search_result always sets it to True
    success: bool
 class OfflineIntegrationTester:
    """离线整合测试器"""
    def __init__(self, data_file: str):
        """
        Set up the tester: configuration, AI agent, prompt templates and offline data.

        Args:
            data_file: Path of the offline JSON data file to load.
        """
        self.data_file = data_file
        self.raw_data = None
        self.processed_notes = []
        # Configuration comes first because the AI agent is built from it
        manager = self.config_manager = ConfigManager()
        manager.load_from_directory("config", server_mode=True)
        model_config = manager.get_config('ai_model', AIModelConfig)
        self.ai_agent = AIAgent(model_config)
        # Prompt templates used for the integration stage
        template_paths = {
            "system_prompt_path": "resource/prompt/integration/system.txt",
            "user_prompt_path": "resource/prompt/integration/user.txt",
        }
        self.prompt_template = PromptTemplate(**template_paths)
        # Read the offline data into self.raw_data / self.processed_notes
        self._load_data()
        loaded_count = len(self.processed_notes)
        print(f"✅ 离线测试器初始化完成，加载了 {loaded_count} 条笔记数据")
        print(f"🤖 AI代理已初始化，模型: {model_config.model}")
    def _load_data(self):
        """加载离线数据"""
        try:
            with open(self.data_file, 'r', encoding='utf-8') as f:
                self.raw_data = json.load(f)
            # 转换为SimpleNote对象
            if 'results' in self.raw_data and self.raw_data['results']:
                result = self.raw_data['results'][0]  # 取第一个搜索结果
                for note_data in result.get('notes', []):
                    note = SimpleNote(
                        note_id=note_data.get('note_id', ''),
                        title=note_data.get('title', ''),
                        content=note_data.get('content', ''),
                        author=note_data.get('author', ''),
                        likes=note_data.get('like_count', 0),
                        comments=note_data.get('comment_count', 0),
                        shares=note_data.get('share_count', 0),
                        note_url=note_data.get('note_url', ''),
                        images=note_data.get('images', [])
                    )
                    self.processed_notes.append(note)
        except Exception as e:
            print(f"❌ 数据加载失败: {e}")
            self.processed_notes = []
    def get_search_result(self, keyword: Optional[str] = None) -> SimpleSearchResult:
        """
        Wrap the loaded notes in a SimpleSearchResult.

        Args:
            keyword: Keyword to report on the result. When empty or None, the
                'keyword' stored with the first entry of the loaded data's
                'results' list is used; if the data has no such entry,
                '测试关键词' is used instead.

        Returns:
            A SimpleSearchResult holding every loaded note, with total_count set
            to the number of notes and success set to True.
        """
        if not keyword:
            keyword = '测试关键词'
            # _load_data accepts data without a usable 'results' list, so this
            # lookup must not assume one exists either.
            results = self.raw_data.get('results') if isinstance(self.raw_data, dict) else None
            if isinstance(results, list) and results and isinstance(results[0], dict):
                keyword = results[0].get('keyword', keyword)
        return SimpleSearchResult(
            keyword=keyword,
            notes=self.processed_notes,
            total_count=len(self.processed_notes),
            success=True
        )
    def create_mock_documents(self, content_type: str = "travel_guide") -> Dict[str, str]:
        """创建模拟文档内容"""
        mock_docs = {
            "travel_guide": {
                "上海馥桂萌宠园游玩攻略.txt": """上海馥桂萌宠园游玩攻略
 一、园区简介
 馥桂萌宠园位于上海市嘉定区，是一个集萌宠互动、户外休闲、亲子娱乐为一体的综合性主题公园。
 二、开放信息
 - 开放时间：9:00-17:00（周一至周日）
 - 门票价格：成人票80元，儿童票50元（1.2米以下免费）
 - 地址：上海市嘉定区朱桥镇
 - 交通：地铁11号线嘉定西站转公交
 三、主要区域
 1. 萌宠互动区：可与羊驼、袋鼠、小兔子等动物亲密接触
 2. 户外游乐区：秋千、滑梯等儿童设施
 3. 餐饮休息区：提供简餐和饮品
 4. 桂花林区：春秋季节桂花飘香，适合拍照
 四、游玩建议
 - 建议游玩时间：3-4小时
 - 最佳游玩季节：春秋两季
 - 携带物品：防晒用品、湿巾、零食
 - 注意事项：爱护动物，文明游园""",
                "萌宠园交通指南.md": """# 馥桂萌宠园交通指南
 ## 公共交通
 ### 地铁+公交
 1. 地铁11号线至嘉定西站
 2. 转乘嘉定69路公交至朱桥站
 3. 步行约10分钟即可到达
 ## 自驾路线
 从市区出发：
 1. 走外环高速(A20)
 2. 转嘉定环线(A5)
 3. 在朱桥出口下高速
 4. 沿指示牌行驶约5分钟
 ## 停车信息
 - 园区提供免费停车场
 - 停车位约200个
 - 节假日建议早到，避免停车位紧张"""
            },
            "product_info": {
                "门票套餐信息.txt": """门票套餐信息
 基础门票：
 - 成人票：80元/人
 - 儿童票：50元/人（3-12岁）
 - 老年票：60元/人（65岁以上）
 - 免票：1.2米以下儿童
 套餐选择：
 1. 家庭套票（2大1小）：180元
 2. 亲子套票（1大1小）：120元  
 3. 团体票（10人以上）：70元/人
 增值服务：
 - 动物喂食包：20元/份
 - 拍照服务：50元/次
 - 导览服务：100元/团"""
            },
            "user_reviews": {
                "游客评价汇总.txt": """游客评价汇总
 正面评价：
 ✅ 动物种类丰富，互动性强
 ✅ 环境优美，适合拍照
 ✅ 工作人员服务态度好
 ✅ 停车方便，交通便利
 ✅ 价格合理，性价比高
 负面反馈：
 ❌ 节假日人流量大
 ❌ 餐饮选择相对有限
 ❌ 部分设施需要维护
 ❌ 雨天游玩体验一般
 综合评分：4.2/5.0"""
            }
        }
        return mock_docs.get(content_type, mock_docs["travel_guide"])
    def _format_xhs_notes(self, notes: List[SimpleNote]) -> str:
        """格式化小红书笔记内容"""
        if not notes:
            return "暂无相关笔记内容"
        formatted_notes = []
        for i, note in enumerate(notes, 1):
            note_content = f"""第{i}条笔记：
 标题：{note.title}
 作者：{note.author}
 内容：{note.content if note.content else '（无文字内容）'}
 互动数据：👍{note.likes} 💬{note.comments} 🔄{note.shares}
 笔记链接：{note.note_url}
 """
            if note.images:
                note_content += f"图片数量：{len(note.images)}张\n"
            formatted_notes.append(note_content)
        return "\n".join(formatted_notes)
    def _format_document_content(self, documents: Dict[str, str]) -> str:
        """格式化文档内容"""
        if not documents:
            return "暂无文档内容"
        formatted_docs = []
        for filename, content in documents.items():
            doc_content = f"""文档：{filename}
 内容：
 {content}
 """
            formatted_docs.append(doc_content)
        return "\n".join(formatted_docs)
    def test_basic_integration(self):
        """测试基础整合功能"""
        print("\n" + "="*60)
        print("🔄 测试基础整合功能")
        print("="*60)
        # 获取数据
        search_result = self.get_search_result("上海馥桂萌宠园攻略")
        documents = self.create_mock_documents("travel_guide")
        print(f"📚 文档处理结果：")
        print(f"   - 文档数量：{len(documents)}")
        total_doc_length = sum(len(content) for content in documents.values())
        print(f"   - 总内容长度：{total_doc_length}")
        print(f"\n📱 小红书数据：")
        print(f"   - 笔记数量：{len(search_result.notes)}")
        print(f"   - 总点赞数：{sum(note.likes for note in search_result.notes)}")
        print(f"   - 总评论数：{sum(note.comments for note in search_result.notes)}")
        # 显示部分内容示例
        print(f"\n📝 内容示例：")
        first_doc = list(documents.values())[0]
        print(f"   文档摘要：{first_doc[:100]}...")
        if search_result.notes:
            popular_note = max(search_result.notes, key=lambda x: x.likes)
            print(f"   热门笔记：{popular_note.title} (👍{popular_note.likes})")
        return {
            "search_result": search_result,
            "documents": documents
        }
    def test_different_document_types(self):
        """测试不同文档类型的整合效果"""
        print("\n" + "="*60)
        print("📊 测试不同文档类型的整合效果")
        print("="*60)
        doc_types = ["travel_guide", "product_info", "user_reviews"]
        results = {}
        for doc_type in doc_types:
            print(f"\n🔸 处理文档类型：{doc_type}")
            documents = self.create_mock_documents(doc_type)
            total_length = sum(len(content) for content in documents.values())
            results[doc_type] = {
                "document_count": len(documents),
                "content_length": total_length,
                "files": list(documents.keys())
            }
            print(f"   ✅ 文档数量：{results[doc_type]['document_count']}")
            print(f"   ✅ 内容长度：{results[doc_type]['content_length']}")
            print(f"   ✅ 文件列表：{results[doc_type]['files']}")
        return results
    def test_content_filtering(self):
        """测试内容过滤和筛选"""
        print("\n" + "="*60)
        print("🔍 测试内容过滤和筛选")
        print("="*60)
        search_result = self.get_search_result()
        # 按点赞数筛选
        high_quality_notes = [note for note in search_result.notes if note.likes >= 50]
        print(f"📈 高质量笔记（点赞≥50）：{len(high_quality_notes)} 条")
        # 按评论数筛选
        interactive_notes = [note for note in search_result.notes if note.comments >= 10]
        print(f"💬 高互动笔记（评论≥10）：{len(interactive_notes)} 条")
        # 按标题关键词筛选
        guide_notes = [note for note in search_result.notes if '攻略' in note.title or '指南' in note.title]
        print(f"📋 攻略类笔记：{len(guide_notes)} 条")
        # 显示统计信息
        if search_result.notes:
            avg_likes = sum(note.likes for note in search_result.notes) / len(search_result.notes)
            avg_comments = sum(note.comments for note in search_result.notes) / len(search_result.notes)
            print(f"\n📊 统计信息：")
            print(f"   平均点赞数：{avg_likes:.1f}")
            print(f"   平均评论数：{avg_comments:.1f}")
            # 找出最受欢迎的笔记
            most_liked = max(search_result.notes, key=lambda x: x.likes)
            most_commented = max(search_result.notes, key=lambda x: x.comments)
            print(f"\n🏆 最受欢迎：")
            print(f"   最多点赞：《{most_liked.title}》- {most_liked.likes} 赞")
            print(f"   最多评论：《{most_commented.title}》- {most_commented.comments} 评论")
        return {
            "total_notes": len(search_result.notes),
            "high_quality_notes": len(high_quality_notes),
            "interactive_notes": len(interactive_notes), 
            "guide_notes": len(guide_notes)
        }
    async def test_real_ai_integration(self):
        """测试真实AI整合功能"""
        print("\n" + "="*60)
        print("🤖 测试真实AI整合功能")
        print("="*60)
        try:
            # 准备数据
            search_result = self.get_search_result()
            documents = self.create_mock_documents("travel_guide")
            print("🔄 正在准备数据...")
            # 格式化文档内容
            document_content = self._format_document_content(documents)
            # 格式化小红书笔记内容
            xhs_content = self._format_xhs_notes(search_result.notes)
            # 构建prompt
            system_prompt = self.prompt_template.get_system_prompt()
            user_prompt = self.prompt_template.build_user_prompt(
                keywords=search_result.keyword or "馥桂萌宠园",
                document_content=document_content or "暂无文档内容",
                xhs_notes_content=xhs_content or "暂无笔记内容"
            )
            print(f"📝 System Prompt长度: {len(system_prompt)} 字符")
            print(f"📝 User Prompt长度: {len(user_prompt)} 字符")
            # 调用真实AI
            print("🔄 正在调用AI进行内容整合...")
            start_time = datetime.now()
            response_text, input_tokens, output_tokens, time_cost = await self.ai_agent.generate_text(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                use_stream=True,
                stage="content_integration",
            )
            end_time = datetime.now()
            print("✅ AI整合完成")
            print(f"📄 生成内容长度：{len(response_text)} 字符")
            print(f"🎯 输入Token数：{input_tokens}")
            print(f"🎯 输出Token数：{output_tokens}")
            print(f"⏱️ 处理时间：{time_cost:.2f} 秒")
            # 尝试解析JSON
            try:
                # 使用file_io模块的JSON处理功能
                from utils.file_io import process_llm_json_text
                parsed_json = process_llm_json_text(response_text)
                if parsed_json:
                    print("✅ JSON解析成功")
                    print(f"📊 解析结果包含 {len(parsed_json)} 个顶级字段")
                    # 显示解析后的结构
                    if isinstance(parsed_json, dict):
                        print("🏗️ 数据结构：")
                        for key, value in parsed_json.items():
                            if isinstance(value, dict):
                                print(f"   {key}: 包含 {len(value)} 个子字段")
                            elif isinstance(value, list):
                                print(f"   {key}: 列表，包含 {len(value)} 个项目")
                            else:
                                print(f"   {key}: {type(value).__name__}")
                    formatted_response = json.dumps(parsed_json, ensure_ascii=False, indent=2)
                else:
                    print("⚠️ JSON解析失败，使用原始响应")
                    formatted_response = response_text
            except Exception as parse_error:
                print(f"⚠️ JSON解析出错: {parse_error}")
                formatted_response = response_text
            print(f"\n📝 AI响应预览：")
            preview_text = formatted_response[:500] + "..." if len(formatted_response) > 500 else formatted_response
            print(preview_text)
            return {
                "ai_response": formatted_response,
                "content_length": len(response_text),
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "processing_time": time_cost,
                "source_notes": len(search_result.notes),
                "source_docs": len(documents),
                "json_parsed": parsed_json is not None
            }
        except Exception as e:
            print(f"❌ AI整合测试失败：{e}")
            import traceback
            traceback.print_exc()
            return None
    def test_export_formats(self):
        """测试不同导出格式"""
        print("\n" + "="*60)
        print("📤 测试不同导出格式")
        print("="*60)
        search_result = self.get_search_result()
        documents = self.create_mock_documents("travel_guide")
        # 模拟不同格式的导出
        formats = {
            "summary": "简要摘要格式",
            "blog_post": "博客文章格式", 
            "travel_guide": "旅游攻略格式",
            "product_sales": "产品销售格式",
            "attraction_standard": "景点标准格式"
        }
        export_results = {}
        for format_type, description in formats.items():
            print(f"🔸 生成{description}...")
            # 模拟不同格式的内容生成
            mock_content = self._generate_format_content(format_type, search_result, documents)
            export_results[format_type] = {
                "content": mock_content,
                "length": len(mock_content),
                "format": format_type
            }
            print(f"   ✅ 长度：{len(mock_content)} 字符")
        return export_results
    def _generate_format_content(self, format_type: str, search_result: SimpleSearchResult, documents: Dict[str, str]) -> str:
        """根据格式类型生成相应内容"""
        base_info = f"基于{len(search_result.notes)}篇小红书游记和官方资料"
        if format_type == "summary":
            return f"【简要摘要】{base_info}，馥桂萌宠园是嘉定区优质亲子游目的地，特色萌宠互动体验，适合周末家庭出游。"
        elif format_type == "blog_post":
            return f"""# 周末带娃新选择 | 上海馥桂萌宠园深度体验
 今天想和大家分享一个超棒的亲子游目的地——位于嘉定的馥桂萌宠园！{base_info}，这里真的是遛娃神器！
 ## 为什么推荐这里？
 🦘 可以近距离接触羊驼、袋鼠等萌宠
 🌸 环境优美，特别是桂花季节
 👨‍👩‍👧‍👦 设施完善，非常适合家庭游
 详细攻略请看下文..."""
        elif format_type == "travel_guide":
            return f"""## 上海馥桂萌宠园游玩攻略
 ### 基本信息
 - 地址：上海市嘉定区朱桥镇
 - 门票：成人80元，儿童50元
 - 开放时间：9:00-17:00
 ### 交通指南
 地铁11号线嘉定西站转公交，或自驾直达
 ### 游玩亮点
 {base_info}，园区主要特色包括萌宠互动、户外游乐、休闲拍照等。
 ### 注意事项
 建议游玩3-4小时，春秋季节体验最佳。"""
        elif format_type == "product_sales":
            return f"""🔥【限时特惠】上海馥桂萌宠园门票
 ⭐ 产品亮点：
 • {base_info}，口碑爆棚！
 • 🦘 独特萌宠互动体验
 • 🌸 网红拍照打卡圣地  
 • 👨‍👩‍👧‍👦 亲子游首选目的地
 💰 特价套餐：
 原价130元，现价只要99元！
 包含：成人门票1张 + 动物喂食包1份
 📞 立即预订：xxx-xxxx-xxxx"""
        else:  # attraction_standard
            return f"""景点名称：上海馥桂萌宠园
 景点类型：主题公园/动物园
 地理位置：上海市嘉定区朱桥镇
 开放状态：正常营业
 门票价格：80-50元
 适宜人群：亲子家庭
 推荐指数：4.2/5.0
 游玩时长：3-4小时
 最佳季节：春秋两季
 {base_info}，具有良好的口碑和游客体验。"""
    def test_parameter_effects(self):
        """测试不同参数的效果"""
        print("\n" + "="*60)
        print("⚙️ 测试不同参数的效果")
        print("="*60)
        search_result = self.get_search_result()
        # 测试不同的过滤阈值
        thresholds = [
            {"likes": 20, "comments": 5},
            {"likes": 50, "comments": 10},
            {"likes": 100, "comments": 20}
        ]
        print("🔧 测试不同过滤阈值的效果：")
        for i, threshold in enumerate(thresholds, 1):
            high_likes = [note for note in search_result.notes if note.likes >= threshold["likes"]]
            high_comments = [note for note in search_result.notes if note.comments >= threshold["comments"]]
            print(f"   阈值{i}（点赞≥{threshold['likes']}, 评论≥{threshold['comments']}）:")
            print(f"     - 符合点赞条件：{len(high_likes)} 条")
            print(f"     - 符合评论条件：{len(high_comments)} 条")
        # 测试不同的内容长度限制
        length_limits = [100, 200, 500]
        print(f"\n📏 测试不同内容长度限制的效果：")
        for limit in length_limits:
            documents = self.create_mock_documents("travel_guide")
            truncated_docs = {}
            for filename, content in documents.items():
                truncated_docs[filename] = content[:limit] + "..." if len(content) > limit else content
            total_length = sum(len(content) for content in truncated_docs.values())
            print(f"   限制{limit}字符：总长度 {total_length} 字符")
        return {
            "threshold_tests": len(thresholds),
            "length_tests": len(length_limits)
        }
 async def main():
    """
    Run the offline integration test suite against the real AI.

    Returns early (after printing a message) when the data file does not exist
    in the current working directory or the tester cannot be initialised.
    Otherwise runs every test of OfflineIntegrationTester, writes the result of
    the AI integration test to "ai_results.json" and prints a summary. Errors
    raised while the tests run are printed together with a traceback.
    """
    print("🚀 启动真实AI离线整合测试...")
    # The data file is looked up relative to the current working directory
    data_file = "batch_search_20250717_104407.json"
    if not os.path.exists(data_file):
        print(f"❌ 数据文件不存在：{data_file}")
        return
    # Build the tester
    try:
        tester = OfflineIntegrationTester(data_file)
    except Exception as e:
        print(f"❌ 测试器初始化失败：{e}")
        import traceback
        traceback.print_exc()
        return
    try:
        separator = "="*60
        print(f"\n{separator}")
        print("🧪 开始执行真实AI离线整合测试套件")
        print(separator)
        # 1. Basic integration test
        tester.test_basic_integration()
        # 2. Document type test
        doc_type_results = tester.test_different_document_types()
        # 3. Content filtering test
        filter_results = tester.test_content_filtering()
        # 4. Real AI integration test (async)
        ai_results = await tester.test_real_ai_integration()
        # Explicit UTF-8: the payload is dumped with ensure_ascii=False, so it can
        # contain characters the platform's default encoding cannot represent.
        with open("ai_results.json", "w", encoding="utf-8") as f:
            json.dump(ai_results, f, ensure_ascii=False, indent=2)
        # 5. Export format test
        tester.test_export_formats()
        # 6. Parameter effect test
        tester.test_parameter_effects()
        # Summary
        print(f"\n{separator}")
        print("📋 测试结果总结")
        print(separator)
        print("✅ 基础整合测试：成功")
        print(f"✅ 文档类型测试：处理了 {len(doc_type_results)} 种类型")
        print(f"✅ 内容过滤测试：识别出 {filter_results['high_quality_notes']} 条高质量笔记")
        if ai_results:
            print("✅ 真实AI整合测试：成功")
            print(f"   - 处理时间：{ai_results['processing_time']:.2f}秒")
            print(f"   - 输入Token：{ai_results['input_tokens']}")
            print(f"   - 输出Token：{ai_results['output_tokens']}")
            print(f"   - JSON解析：{'成功' if ai_results['json_parsed'] else '失败'}")
        else:
            print("❌ 真实AI整合测试：失败")
    except Exception as e:
        print(f"❌ 测试过程中发生错误：{e}")
        import traceback
        traceback.print_exc()
 # Script entry point: run the async main() when the file is executed directly
 if __name__ == "__main__":
    asyncio.run(main())