# bangbang-aigc-server/tests/test_integration_simple.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
简化版离线整合测试脚本
使用已有数据测试integration模块的不同效果
调用真实AI进行内容整合
"""

import os
import json
import asyncio
from typing import Dict, List, Optional, Any
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass

# Put the directory that contains this file at the front of sys.path,
# presumably so that the project packages imported below can be resolved.
# NOTE(review): if this script lives in a tests/ subdirectory, the project
# root is probably one level further up — confirm the intended location.
import sys
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

# 导入核心模块
from core.ai.ai_agent import AIAgent
from core.config import ConfigManager, AIModelConfig
from utils.prompts import PromptTemplate

@dataclass
class SimpleNote:
    """Simplified data model for a single note from the offline data file."""
    note_id: str
    title: str
    content: str  # text body of the note; may be an empty string
    author: str
    likes: int  # filled from the 'like_count' field of the raw data
    comments: int  # filled from the 'comment_count' field of the raw data
    shares: int  # filled from the 'share_count' field of the raw data
    note_url: str
    images: List[str]
    
@dataclass
class SimpleSearchResult:
    """Simplified search result: a keyword together with its notes."""
    keyword: str
    notes: List[SimpleNote]
    total_count: int  # callers in this file pass len(notes)
    success: bool

class OfflineIntegrationTester:
    """离线整合测试器"""
    
    def __init__(self, data_file: str):
        """Set up configuration, the AI agent, the prompt template and the notes.

        Args:
            data_file: Path of the offline JSON data file to load.
        """
        self.data_file = data_file
        self.raw_data = None
        self.processed_notes = []

        # Configuration is read from the "config" directory in server mode.
        self.config_manager = ConfigManager()
        self.config_manager.load_from_directory("config", server_mode=True)

        # The AI agent is built from the 'ai_model' configuration entry.
        model_config = self.config_manager.get_config('ai_model', AIModelConfig)
        self.ai_agent = AIAgent(model_config)

        # Prompt template files used for the integration step.
        prompt_paths = {
            "system_prompt_path": "resource/prompt/integration/system.txt",
            "user_prompt_path": "resource/prompt/integration/user.txt",
        }
        self.prompt_template = PromptTemplate(**prompt_paths)

        # Populate raw_data / processed_notes from the data file.
        self._load_data()

        loaded_count = len(self.processed_notes)
        print(f"✅ 离线测试器初始化完成，加载了 {loaded_count} 条笔记数据")
        print(f"🤖 AI代理已初始化，模型: {model_config.model}")
    def _load_data(self):
        """加载离线数据"""
        try:
            with open(self.data_file, 'r', encoding='utf-8') as f:
                self.raw_data = json.load(f)
            
            # 转换为SimpleNote对象
            if 'results' in self.raw_data and self.raw_data['results']:
                result = self.raw_data['results'][0]  # 取第一个搜索结果
                
                for note_data in result.get('notes', []):
                    note = SimpleNote(
                        note_id=note_data.get('note_id', ''),
                        title=note_data.get('title', ''),
                        content=note_data.get('content', ''),
                        author=note_data.get('author', ''),
                        likes=note_data.get('like_count', 0),
                        comments=note_data.get('comment_count', 0),
                        shares=note_data.get('share_count', 0),
                        note_url=note_data.get('note_url', ''),
                        images=note_data.get('images', [])
                    )
                    self.processed_notes.append(note)
                    
        except Exception as e:
            print(f"❌ 数据加载失败: {e}")
            self.processed_notes = []
    
    def get_search_result(self, keyword: str = None) -> SimpleSearchResult:
        """获取搜索结果"""
        if not keyword:
            keyword = self.raw_data['results'][0]['keyword'] if self.raw_data else '测试关键词'
        
        return SimpleSearchResult(
            keyword=keyword,
            notes=self.processed_notes,
            total_count=len(self.processed_notes),
            success=True
        )
    
    def create_mock_documents(self, content_type: str = "travel_guide") -> Dict[str, str]:
        """创建模拟文档内容"""
        
        mock_docs = {
            "travel_guide": {
                "上海馥桂萌宠园游玩攻略.txt": """上海馥桂萌宠园游玩攻略

一、园区简介
馥桂萌宠园位于上海市嘉定区，是一个集萌宠互动、户外休闲、亲子娱乐为一体的综合性主题公园。

二、开放信息
- 开放时间：9:00-17:00（周一至周日）
- 门票价格：成人票80元，儿童票50元（1.2米以下免费）
- 地址：上海市嘉定区朱桥镇
- 交通：地铁11号线嘉定西站转公交

三、主要区域
1. 萌宠互动区：可与羊驼、袋鼠、小兔子等动物亲密接触
2. 户外游乐区：秋千、滑梯等儿童设施
3. 餐饮休息区：提供简餐和饮品
4. 桂花林区：春秋季节桂花飘香，适合拍照

四、游玩建议
- 建议游玩时间：3-4小时
- 最佳游玩季节：春秋两季
- 携带物品：防晒用品、湿巾、零食
- 注意事项：爱护动物，文明游园""",
                
                "萌宠园交通指南.md": """# 馥桂萌宠园交通指南

## 公共交通
### 地铁+公交
1. 地铁11号线至嘉定西站
2. 转乘嘉定69路公交至朱桥站
3. 步行约10分钟即可到达

## 自驾路线
从市区出发：
1. 走外环高速(A20)
2. 转嘉定环线(A5)
3. 在朱桥出口下高速
4. 沿指示牌行驶约5分钟

## 停车信息
- 园区提供免费停车场
- 停车位约200个
- 节假日建议早到，避免停车位紧张"""
            },
            
            "product_info": {
                "门票套餐信息.txt": """门票套餐信息

基础门票：
- 成人票：80元/人
- 儿童票：50元/人（3-12岁）
- 老年票：60元/人（65岁以上）
- 免票：1.2米以下儿童

套餐选择：
1. 家庭套票（2大1小）：180元
2. 亲子套票（1大1小）：120元  
3. 团体票（10人以上）：70元/人

增值服务：
- 动物喂食包：20元/份
- 拍照服务：50元/次
- 导览服务：100元/团"""
            },
            
            "user_reviews": {
                "游客评价汇总.txt": """游客评价汇总

正面评价：
✅ 动物种类丰富，互动性强
✅ 环境优美，适合拍照
✅ 工作人员服务态度好
✅ 停车方便，交通便利
✅ 价格合理，性价比高

负面反馈：
❌ 节假日人流量大
❌ 餐饮选择相对有限
❌ 部分设施需要维护
❌ 雨天游玩体验一般

综合评分：4.2/5.0"""
            }
        }
        
        return mock_docs.get(content_type, mock_docs["travel_guide"])
    
    def _format_xhs_notes(self, notes: List[SimpleNote]) -> str:
        """格式化小红书笔记内容"""
        if not notes:
            return "暂无相关笔记内容"
        
        formatted_notes = []
        for i, note in enumerate(notes, 1):
            note_content = f"""第{i}条笔记：
标题：{note.title}
作者：{note.author}
内容：{note.content if note.content else '（无文字内容）'}
互动数据：👍{note.likes} 💬{note.comments} 🔄{note.shares}
笔记链接：{note.note_url}
"""
            if note.images:
                note_content += f"图片数量：{len(note.images)}张\n"
            
            formatted_notes.append(note_content)
        
        return "\n".join(formatted_notes)
    
    def _format_document_content(self, documents: Dict[str, str]) -> str:
        """格式化文档内容"""
        if not documents:
            return "暂无文档内容"
        
        formatted_docs = []
        for filename, content in documents.items():
            doc_content = f"""文档：{filename}
内容：
{content}
"""
            formatted_docs.append(doc_content)
        
        return "\n".join(formatted_docs)
    
    def test_basic_integration(self):
        """测试基础整合功能"""
        print("\n" + "="*60)
        print("🔄 测试基础整合功能")
        print("="*60)
        
        # 获取数据
        search_result = self.get_search_result("上海馥桂萌宠园攻略")
        documents = self.create_mock_documents("travel_guide")
        
        print(f"📚 文档处理结果：")
        print(f"   - 文档数量：{len(documents)}")
        total_doc_length = sum(len(content) for content in documents.values())
        print(f"   - 总内容长度：{total_doc_length}")
        
        print(f"\n📱 小红书数据：")
        print(f"   - 笔记数量：{len(search_result.notes)}")
        print(f"   - 总点赞数：{sum(note.likes for note in search_result.notes)}")
        print(f"   - 总评论数：{sum(note.comments for note in search_result.notes)}")
        
        # 显示部分内容示例
        print(f"\n📝 内容示例：")
        first_doc = list(documents.values())[0]
        print(f"   文档摘要：{first_doc[:100]}...")
        
        if search_result.notes:
            popular_note = max(search_result.notes, key=lambda x: x.likes)
            print(f"   热门笔记：{popular_note.title} (👍{popular_note.likes})")
        
        return {
            "search_result": search_result,
            "documents": documents
        }
    
    def test_different_document_types(self):
        """测试不同文档类型的整合效果"""
        print("\n" + "="*60)
        print("📊 测试不同文档类型的整合效果")
        print("="*60)
        
        doc_types = ["travel_guide", "product_info", "user_reviews"]
        results = {}
        
        for doc_type in doc_types:
            print(f"\n🔸 处理文档类型：{doc_type}")
            
            documents = self.create_mock_documents(doc_type)
            total_length = sum(len(content) for content in documents.values())
            
            results[doc_type] = {
                "document_count": len(documents),
                "content_length": total_length,
                "files": list(documents.keys())
            }
            
            print(f"   ✅ 文档数量：{results[doc_type]['document_count']}")
            print(f"   ✅ 内容长度：{results[doc_type]['content_length']}")
            print(f"   ✅ 文件列表：{results[doc_type]['files']}")
        
        return results
    
    def test_content_filtering(self):
        """测试内容过滤和筛选"""
        print("\n" + "="*60)
        print("🔍 测试内容过滤和筛选")
        print("="*60)
        
        search_result = self.get_search_result()
        
        # 按点赞数筛选
        high_quality_notes = [note for note in search_result.notes if note.likes >= 50]
        print(f"📈 高质量笔记（点赞≥50）：{len(high_quality_notes)} 条")
        
        # 按评论数筛选
        interactive_notes = [note for note in search_result.notes if note.comments >= 10]
        print(f"💬 高互动笔记（评论≥10）：{len(interactive_notes)} 条")
        
        # 按标题关键词筛选
        guide_notes = [note for note in search_result.notes if '攻略' in note.title or '指南' in note.title]
        print(f"📋 攻略类笔记：{len(guide_notes)} 条")
        
        # 显示统计信息
        if search_result.notes:
            avg_likes = sum(note.likes for note in search_result.notes) / len(search_result.notes)
            avg_comments = sum(note.comments for note in search_result.notes) / len(search_result.notes)
            
            print(f"\n📊 统计信息：")
            print(f"   平均点赞数：{avg_likes:.1f}")
            print(f"   平均评论数：{avg_comments:.1f}")
            
            # 找出最受欢迎的笔记
            most_liked = max(search_result.notes, key=lambda x: x.likes)
            most_commented = max(search_result.notes, key=lambda x: x.comments)
            
            print(f"\n🏆 最受欢迎：")
            print(f"   最多点赞：《{most_liked.title}》- {most_liked.likes} 赞")
            print(f"   最多评论：《{most_commented.title}》- {most_commented.comments} 评论")
        
        return {
            "total_notes": len(search_result.notes),
            "high_quality_notes": len(high_quality_notes),
            "interactive_notes": len(interactive_notes), 
            "guide_notes": len(guide_notes)
        }
    
    async def test_real_ai_integration(self):
        """测试真实AI整合功能"""
        print("\n" + "="*60)
        print("🤖 测试真实AI整合功能")
        print("="*60)
        
        try:
            # 准备数据
            search_result = self.get_search_result()
            documents = self.create_mock_documents("travel_guide")
            
            print("🔄 正在准备数据...")
            
            # 格式化文档内容
            document_content = self._format_document_content(documents)
            
            # 格式化小红书笔记内容
            xhs_content = self._format_xhs_notes(search_result.notes)
            
            # 构建prompt
            system_prompt = self.prompt_template.get_system_prompt()
            user_prompt = self.prompt_template.build_user_prompt(
                keywords=search_result.keyword or "馥桂萌宠园",
                document_content=document_content or "暂无文档内容",
                xhs_notes_content=xhs_content or "暂无笔记内容"
            )
            
            print(f"📝 System Prompt长度: {len(system_prompt)} 字符")
            print(f"📝 User Prompt长度: {len(user_prompt)} 字符")
            
            # 调用真实AI
            print("🔄 正在调用AI进行内容整合...")
            start_time = datetime.now()
            
            response_text, input_tokens, output_tokens, time_cost = await self.ai_agent.generate_text(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                use_stream=True,
                stage="content_integration",
            )
            
            end_time = datetime.now()
            
            print("✅ AI整合完成")
            print(f"📄 生成内容长度：{len(response_text)} 字符")
            print(f"🎯 输入Token数：{input_tokens}")
            print(f"🎯 输出Token数：{output_tokens}")
            print(f"⏱️ 处理时间：{time_cost:.2f} 秒")
            
            # 尝试解析JSON
            try:
                # 使用file_io模块的JSON处理功能
                from utils.file_io import process_llm_json_text
                parsed_json = process_llm_json_text(response_text)
                
                if parsed_json:
                    print("✅ JSON解析成功")
                    print(f"📊 解析结果包含 {len(parsed_json)} 个顶级字段")
                    
                    # 显示解析后的结构
                    if isinstance(parsed_json, dict):
                        print("🏗️ 数据结构：")
                        for key, value in parsed_json.items():
                            if isinstance(value, dict):
                                print(f"   {key}: 包含 {len(value)} 个子字段")
                            elif isinstance(value, list):
                                print(f"   {key}: 列表，包含 {len(value)} 个项目")
                            else:
                                print(f"   {key}: {type(value).__name__}")
                    
                    formatted_response = json.dumps(parsed_json, ensure_ascii=False, indent=2)
                else:
                    print("⚠️ JSON解析失败，使用原始响应")
                    formatted_response = response_text
                    
            except Exception as parse_error:
                print(f"⚠️ JSON解析出错: {parse_error}")
                formatted_response = response_text
            
            print(f"\n📝 AI响应预览：")
            preview_text = formatted_response[:500] + "..." if len(formatted_response) > 500 else formatted_response
            print(preview_text)
            
            return {
                "ai_response": formatted_response,
                "content_length": len(response_text),
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "processing_time": time_cost,
                "source_notes": len(search_result.notes),
                "source_docs": len(documents),
                "json_parsed": parsed_json is not None
            }
            
        except Exception as e:
            print(f"❌ AI整合测试失败：{e}")
            import traceback
            traceback.print_exc()
            return None
    
    def test_export_formats(self):
        """测试不同导出格式"""
        print("\n" + "="*60)
        print("📤 测试不同导出格式")
        print("="*60)
        
        search_result = self.get_search_result()
        documents = self.create_mock_documents("travel_guide")
        
        # 模拟不同格式的导出
        formats = {
            "summary": "简要摘要格式",
            "blog_post": "博客文章格式", 
            "travel_guide": "旅游攻略格式",
            "product_sales": "产品销售格式",
            "attraction_standard": "景点标准格式"
        }
        
        export_results = {}
        
        for format_type, description in formats.items():
            print(f"🔸 生成{description}...")
            
            # 模拟不同格式的内容生成
            mock_content = self._generate_format_content(format_type, search_result, documents)
            
            export_results[format_type] = {
                "content": mock_content,
                "length": len(mock_content),
                "format": format_type
            }
            
            print(f"   ✅ 长度：{len(mock_content)} 字符")
        
        return export_results
    
    def _generate_format_content(self, format_type: str, search_result: SimpleSearchResult, documents: Dict[str, str]) -> str:
        """根据格式类型生成相应内容"""
        
        base_info = f"基于{len(search_result.notes)}篇小红书游记和官方资料"
        
        if format_type == "summary":
            return f"【简要摘要】{base_info}，馥桂萌宠园是嘉定区优质亲子游目的地，特色萌宠互动体验，适合周末家庭出游。"
        
        elif format_type == "blog_post":
            return f"""# 周末带娃新选择 | 上海馥桂萌宠园深度体验

今天想和大家分享一个超棒的亲子游目的地——位于嘉定的馥桂萌宠园！{base_info}，这里真的是遛娃神器！

## 为什么推荐这里？
🦘 可以近距离接触羊驼、袋鼠等萌宠
🌸 环境优美，特别是桂花季节
👨‍👩‍👧‍👦 设施完善，非常适合家庭游

详细攻略请看下文..."""

        elif format_type == "travel_guide":
            return f"""## 上海馥桂萌宠园游玩攻略

### 基本信息
- 地址：上海市嘉定区朱桥镇
- 门票：成人80元，儿童50元
- 开放时间：9:00-17:00

### 交通指南
地铁11号线嘉定西站转公交，或自驾直达

### 游玩亮点
{base_info}，园区主要特色包括萌宠互动、户外游乐、休闲拍照等。

### 注意事项
建议游玩3-4小时，春秋季节体验最佳。"""

        elif format_type == "product_sales":
            return f"""🔥【限时特惠】上海馥桂萌宠园门票

⭐ 产品亮点：
• {base_info}，口碑爆棚！
• 🦘 独特萌宠互动体验
• 🌸 网红拍照打卡圣地  
• 👨‍👩‍👧‍👦 亲子游首选目的地

💰 特价套餐：
原价130元，现价只要99元！
包含：成人门票1张 + 动物喂食包1份

📞 立即预订：xxx-xxxx-xxxx"""

        else:  # attraction_standard
            return f"""景点名称：上海馥桂萌宠园
景点类型：主题公园/动物园
地理位置：上海市嘉定区朱桥镇
开放状态：正常营业
门票价格：80-50元
适宜人群：亲子家庭
推荐指数：4.2/5.0
游玩时长：3-4小时
最佳季节：春秋两季

{base_info}，具有良好的口碑和游客体验。"""
    
    def test_parameter_effects(self):
        """测试不同参数的效果"""
        print("\n" + "="*60)
        print("⚙️ 测试不同参数的效果")
        print("="*60)
        
        search_result = self.get_search_result()
        
        # 测试不同的过滤阈值
        thresholds = [
            {"likes": 20, "comments": 5},
            {"likes": 50, "comments": 10},
            {"likes": 100, "comments": 20}
        ]
        
        print("🔧 测试不同过滤阈值的效果：")
        for i, threshold in enumerate(thresholds, 1):
            high_likes = [note for note in search_result.notes if note.likes >= threshold["likes"]]
            high_comments = [note for note in search_result.notes if note.comments >= threshold["comments"]]
            
            print(f"   阈值{i}（点赞≥{threshold['likes']}, 评论≥{threshold['comments']}）:")
            print(f"     - 符合点赞条件：{len(high_likes)} 条")
            print(f"     - 符合评论条件：{len(high_comments)} 条")
        
        # 测试不同的内容长度限制
        length_limits = [100, 200, 500]
        
        print(f"\n📏 测试不同内容长度限制的效果：")
        for limit in length_limits:
            documents = self.create_mock_documents("travel_guide")
            truncated_docs = {}
            for filename, content in documents.items():
                truncated_docs[filename] = content[:limit] + "..." if len(content) > limit else content
            
            total_length = sum(len(content) for content in truncated_docs.values())
            print(f"   限制{limit}字符：总长度 {total_length} 字符")
        
        return {
            "threshold_tests": len(thresholds),
            "length_tests": len(length_limits)
        }

async def main():
    """Run the offline integration test suite and print a summary.

    The suite needs the data file "batch_search_20250717_104407.json" in the
    current working directory and returns early when it is missing or when
    the tester cannot be constructed. The result of the real-AI test is
    written to "ai_results.json" (``null`` when that test failed). Errors
    raised while running the tests are printed with a traceback instead of
    propagating.
    """
    print("🚀 启动真实AI离线整合测试...")

    # Check that the data file exists.
    data_file = "batch_search_20250717_104407.json"
    if not os.path.exists(data_file):
        print(f"❌ 数据文件不存在：{data_file}")
        return

    # Build the tester.
    try:
        tester = OfflineIntegrationTester(data_file)
    except Exception as e:
        print(f"❌ 测试器初始化失败：{e}")
        import traceback
        traceback.print_exc()
        return

    try:
        print(f"\n{'='*60}")
        print("🧪 开始执行真实AI离线整合测试套件")
        print(f"{'='*60}")

        # 1. Basic integration test
        tester.test_basic_integration()

        # 2. Document type test
        doc_type_results = tester.test_different_document_types()

        # 3. Content filtering test
        filter_results = tester.test_content_filtering()

        # 4. Real AI integration test (async)
        ai_results = await tester.test_real_ai_integration()
        # ensure_ascii=False writes Chinese text and emoji verbatim, so the
        # file must be opened as UTF-8 rather than the locale default.
        with open("ai_results.json", "w", encoding="utf-8") as f:
            json.dump(ai_results, f, ensure_ascii=False, indent=2)

        # 5. Export format test
        tester.test_export_formats()

        # 6. Parameter effect test
        tester.test_parameter_effects()

        # Summary. Only the first four tests are reported here; the export
        # format and parameter tests print their own output while running.
        print(f"\n{'='*60}")
        print("📋 测试结果总结")
        print(f"{'='*60}")

        print("✅ 基础整合测试：成功")
        print(f"✅ 文档类型测试：处理了 {len(doc_type_results)} 种类型")
        print(f"✅ 内容过滤测试：识别出 {filter_results['high_quality_notes']} 条高质量笔记")

        if ai_results:
            print("✅ 真实AI整合测试：成功")
            print(f"   - 处理时间：{ai_results['processing_time']:.2f}秒")
            print(f"   - 输入Token：{ai_results['input_tokens']}")
            print(f"   - 输出Token：{ai_results['output_tokens']}")
            print(f"   - JSON解析：{'成功' if ai_results['json_parsed'] else '失败'}")
        else:
            print("❌ 真实AI整合测试：失败")

    except Exception as e:
        print(f"❌ 测试过程中发生错误：{e}")
        import traceback
        traceback.print_exc()

# Run the async test suite only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())