bangbang-aigc-server/travel-algorithms/test/test_document_processing.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Document Processing Test Script

Test script for the document-processing features; it exercises the
refactored document-processing pipeline.
"""

import asyncio
import json
from pathlib import Path
from datetime import datetime

from travel_algorithms import (
    create_document_pipeline,
    create_default_config,
    DocumentProcessor,
    TextExtractor,
    ContentIntegrator,
    ContentTransformer
)


async def test_complete_pipeline():
    """Run the whole document-processing pipeline against generated sample files.

    Builds a config and a pipeline, writes one text file and one CSV file
    into ``test_documents/``, then asks the document processor to convert
    that directory into each target format in turn.  Every successful
    conversion is written to ``test_output/document_processing/`` both as
    plain text (``<format>_result.txt``) and as a full JSON dump
    (``<format>_full.json``).

    A failing format is reported and recorded; it does not stop the
    remaining formats from being attempted.

    Returns:
        dict: maps each format key to the processor's result on success,
        or to ``{'error': <message>}`` when that conversion raised.
    """

    print("📄 文档处理完整流水线测试")
    print("=" * 50)

    # 1. Build the configuration and the pipeline.
    print("\n📋 创建配置和流水线...")
    config = create_default_config(
        resource_base_directory="./resource",
        ai_model="qwen-plus"
    )

    pipeline = create_document_pipeline(config)
    document_processor = pipeline["document_processor"]

    print("✅ 文档处理流水线创建成功")

    # 2. Report the formats the processor says it supports.
    print("\n📊 支持的格式:")
    supported_formats = document_processor.get_supported_formats()
    print(f"输入格式: {list(supported_formats['input_formats'].keys())}")
    print(f"输出格式: {list(supported_formats['output_formats'].keys())}")

    # 3. Write the sample input documents.
    print("\n📝 创建测试文档...")
    test_docs_dir = Path("test_documents")
    test_docs_dir.mkdir(exist_ok=True)

    # Sample plain-text document.
    attraction_text = """北京故宫博物院

故宫博物院位于北京市中心，是中国明清两代的皇家宫殿，旧称紫禁城。故宫占地面积约72万平方米，建筑面积约15万平方米，是世界上现存规模最大、保存最为完整的木质结构古建筑之一。

开放时间：
- 4月1日-10月31日：8:30-17:00
- 11月1日-3月31日：8:30-16:30
- 周一闭馆（法定节假日除外）

门票价格：
- 旺季（4-10月）：60元
- 淡季（11-3月）：40元
- 学生票半价

交通指南：
地铁：1号线天安门东站、天安门西站
公交：1路、2路、10路、20路等多路公交车

游览建议：
建议游览时间3-4小时，推荐路线：午门→太和殿→中和殿→保和殿→乾清宫→坤宁宫→御花园

注意事项：
1. 需要网上实名预约购票
2. 禁止携带打火机等易燃物品
3. 保持安静，爱护文物
4. 部分区域禁止拍照"""
    test_txt = test_docs_dir / "景区介绍.txt"
    test_txt.write_text(attraction_text, encoding='utf-8')

    # Sample CSV document.
    price_csv = """票种,价格,优惠政策,适用人群
成人票,60元,无,18-59岁成人
学生票,30元,半价优惠,在校学生
老人票,30元,半价优惠,60岁以上老人
儿童票,免费,身高1.2米以下免票,儿童
年票,300元,全年无限次,所有人群"""
    test_csv = test_docs_dir / "价格信息.csv"
    test_csv.write_text(price_csv, encoding='utf-8')

    print(f"✅ 测试文档创建完成: {test_docs_dir}")

    # 4. Convert the sample documents into each target format.
    formats_to_test = [
        ('summary', '内容摘要'),
        ('attraction_standard', '景区标准信息'),
        ('travel_guide', '旅游攻略'),
        ('marketing_copy', '营销文案'),
        ('structured_data', '结构化数据')
    ]

    # Create the output directory once, up front: it does not depend on the
    # format, and it must exist even if every conversion fails, because the
    # closing message points the user at it.
    output_dir = Path("test_output/document_processing")
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {}
    failed_formats = []  # display names of the formats whose conversion raised

    for format_type, format_name in formats_to_test:
        try:
            print(f"\n🎯 测试转换为 {format_name} 格式...")

            result = await document_processor.process_documents(
                sources=test_docs_dir,
                target_format=format_type,
                additional_requirements=f"请重点突出{format_name}的特点"
            )

            print(f"✅ {format_name} 转换成功")
            print(f"  - 处理文档数: {result['processing_summary']['total_documents']}")
            print(f"  - 输出长度: {len(result['final_output'])} 字符")
            print(f"  - 质量评分: {result['transformed_content']['quality_score']:.2f}")

            # Persist the final text and the complete result.
            result_path = output_dir / f"{format_type}_result.txt"
            result_path.write_text(result['final_output'], encoding='utf-8')

            with open(output_dir / f"{format_type}_full.json", 'w', encoding='utf-8') as f:
                # default=str stringifies any value json cannot encode natively.
                json.dump(result, f, ensure_ascii=False, indent=2, default=str)

            results[format_type] = result

        except Exception as e:
            # Best effort: report the failure and move on to the next format.
            print(f"❌ {format_name} 转换失败: {e}")
            results[format_type] = {'error': str(e)}
            failed_formats.append(format_name)

    print("\n📁 所有结果已保存至: test_output/document_processing/")
    print(f"✅ 完整流水线测试完成，处理了 {len(formats_to_test)} 种格式")
    if failed_formats:
        # Make failures visible in the summary instead of only in the scroll-back.
        print(f"⚠️ 其中 {len(failed_formats)} 种格式转换失败: {', '.join(failed_formats)}")

    return results


async def test_individual_components():
    """Exercise the text extractor, content integrator and content transformer separately.

    Each component is taken from a pipeline built with the default config
    and driven in its own ``try`` block, so a failure in one is printed and
    does not abort the others.  The transformer consumes the integrator's
    output; when integration produced nothing, the transformer step is
    skipped with an explicit message.

    A temporary ``test_simple.txt`` is written to the current directory for
    the extractor and removed afterwards whether or not extraction succeeds.
    """

    print("\n🔧 独立组件功能测试")
    print("=" * 50)

    config = create_default_config()
    pipeline = create_document_pipeline(config)

    # --- Text extractor ---
    print("\n📥 测试文本提取器...")
    text_extractor = pipeline["text_extractor"]

    # Minimal throw-away input file.
    test_file = Path("test_simple.txt")
    test_file.write_text(
        "这是一个简单的测试文档。\n包含一些基本信息用于测试提取功能。",
        encoding='utf-8',
    )

    try:
        extracted_doc = text_extractor.extract_from_file(test_file)
        print("✅ 文本提取成功:")
        print(f"  - 文件: {extracted_doc.filename}")
        print(f"  - 类型: {extracted_doc.file_type}")
        print(f"  - 内容长度: {len(extracted_doc.content)}")
        print(f"  - 提取方法: {extracted_doc.extraction_method}")
    except Exception as e:
        print(f"❌ 文本提取失败: {e}")
    finally:
        test_file.unlink(missing_ok=True)

    # --- Content integrator ---
    print("\n🔗 测试内容整合器...")
    content_integrator = pipeline["content_integrator"]

    # Hand-built documents, so this step does not depend on the extractor.
    from travel_algorithms.document_processing import ExtractedDocument

    mock_docs = [
        ExtractedDocument(
            filename="doc1.txt",
            file_type="Plain Text",
            content="北京故宫是明清两代的皇家宫殿，具有重要的历史价值。",
            metadata={},
            extracted_at=datetime.now(),
            file_size=100
        ),
        ExtractedDocument(
            filename="doc2.txt",
            file_type="Plain Text",
            content="故宫门票价格：旺季60元，淡季40元。开放时间每日8:30-17:00。",
            metadata={},
            extracted_at=datetime.now(),
            file_size=80
        )
    ]

    # Explicit sentinel: stays None when integration raises, so the next
    # step can tell whether it has any input to work with.
    integrated_content = None
    try:
        integrated_content = content_integrator.integrate_documents(mock_docs)
        print("✅ 内容整合成功:")
        print(f"  - 文档数量: {integrated_content.document_count}")
        print(f"  - 总内容长度: {integrated_content.total_content_length}")
        print(f"  - 关键主题: {integrated_content.key_topics}")
        print(f"  - 文档类型: {integrated_content.document_types}")
    except Exception as e:
        print(f"❌ 内容整合失败: {e}")

    # --- Content transformer ---
    print("\n🔄 测试内容转换器...")
    content_transformer = pipeline["content_transformer"]

    if integrated_content is None:
        # Nothing to transform; say so instead of skipping silently.
        print("⏭️ 内容整合未成功，跳过内容转换测试")
    else:
        try:
            transformed_content = await content_transformer.transform_content(
                integrated_content=integrated_content,
                format_type='summary'
            )
            print("✅ 内容转换成功:")
            print(f"  - 转换格式: {transformed_content.format_type}")
            print(f"  - 输出长度: {len(transformed_content.transformed_text)}")
            print(f"  - 质量评分: {transformed_content.quality_score:.2f}")
            print(f"  - 转换预览: {transformed_content.transformed_text[:100]}...")
        except Exception as e:
            print(f"❌ 内容转换失败: {e}")

    print("\n✅ 独立组件测试完成")


def test_configuration():
    """Print the document-processing settings and the components of a default pipeline.

    Reads the ``document_processing`` section of the default config and
    prints its limits, then builds a pipeline from the same config and
    prints each entry's key together with the class name of its value.
    """

    print("\n⚙️ 配置功能测试")
    print("=" * 50)

    config = create_default_config()

    # Document-processing limits; the file-size value is divided by 1024*1024
    # for display in MB.
    doc_config = config.document_processing
    print("📄 文档处理配置:")
    print(f"  - 最大文件大小: {doc_config.max_file_size / (1024*1024):.1f} MB")
    print(f"  - 最大内容长度: {doc_config.max_content_length:,} 字符")
    print(f"  - 最大文档数: {doc_config.max_documents}")
    print(f"  - 最大主题数: {doc_config.max_topics}")
    print(f"  - 启用内容整合: {doc_config.enable_content_integration}")

    # Components exposed by the pipeline.  Every Python object has
    # ``__class__``, so no guard is needed before reading it.
    pipeline = create_document_pipeline(config)
    print("\n🧩 可用组件:")
    for name, component in pipeline.items():
        print(f"  - {name}: {component.__class__.__name__}")

    print("\n✅ 配置测试完成")


async def main():
    """Entry coroutine: print the banner, run the three test stages, print a summary.

    The stages run in a fixed order: configuration, individual components,
    then the full pipeline.  The value returned by the pipeline stage is
    discarded.
    """

    banner_lines = (
        "🚀 文档处理系统全面测试",
        "📖 测试重构后的文档处理功能",
        "\n包含的功能:",
        "  ✨ 多格式文档提取 (PDF, Word, Excel, TXT, CSV, JSON, XML, HTML)",
        "  ✨ 智能内容整合和主题提取",
        "  ✨ AI驱动的格式转换",
        "  ✨ 质量评分和元数据跟踪",
        "  ✨ 批量处理和流水线管理",
        "\n" + "=" * 60,
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Stage 1: configuration (synchronous).
    test_configuration()

    # Stage 2: each component on its own.
    await test_individual_components()

    # Stage 3: the end-to-end pipeline.
    await test_complete_pipeline()

    summary_lines = (
        "\n" + "=" * 60,
        "🎊 所有测试完成！",
        "\n📁 生成的文件:",
        "  - test_documents/ - 测试文档目录",
        "  - test_output/document_processing/ - 处理结果",
        "📖 查看生成的文件了解处理效果",
    )
    for summary_line in summary_lines:
        print(summary_line)


# Run the full suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())