From 12aaf88aa2d7d4f87e37fc5e90dfa85c6ad228dd Mon Sep 17 00:00:00 2001
From: jinye_huang <jinye_huang@foxmail.com>
Date: Mon, 14 Jul 2025 16:01:06 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E9=80=9A=E8=BF=87=E4=BA=86?=
 =?UTF-8?q?=E5=A4=9A=E7=A7=8D=E4=B8=8D=E5=90=8C=E6=A8=A1=E5=BC=8F=E7=9A=84?=
 =?UTF-8?q?=E8=AF=86=E5=88=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/document_processing_example_simple.py | 221 ++++++++++++++++++++
 1 file changed, 221 insertions(+)
 create mode 100644 tests/document_processing_example_simple.py

diff --git a/tests/document_processing_example_simple.py b/tests/document_processing_example_simple.py
new file mode 100644
index 0000000..2a21b7c
--- /dev/null
+++ b/tests/document_processing_example_simple.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+文档内容提取示例
+专注于文档内容提取功能，不包含网络搜索
+"""
+
+import os
+import sys
+from pathlib import Path
+
+# This file lives in tests/, so the repository root is one level above its directory.
+project_root = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(project_root))
+
+from document import TextExtractor, ContentIntegrator, ContentTransformer
+from api.services.document_service import DocumentService
+
+def test_single_document_extraction():
+    """Extract one document with DocumentService.extract_text_only and print the result."""
+    print("=== 单个文档提取测试 ===")
+    service = DocumentService()
+
+    # Placeholder path, looked up relative to the current working directory.
+    test_file = "test_document.pdf"
+    if not os.path.exists(test_file):
+        print(f"测试文件不存在: {test_file}")
+        print("请将一个测试文档放在当前目录下，或修改test_file变量")
+        return
+
+    print(f"提取文档: {test_file}")
+    result = service.extract_text_only(test_file)
+    if not result['success']:
+        print(f"提取失败: {result['error']}")
+        return
+
+    doc = result['document']
+    print(f"文件名: {doc['filename']}")
+    print(f"文件类型: {doc['file_type']}")
+    print(f"文件大小: {doc['file_size']} 字节")
+    print(f"内容长度: {doc['content_length']} 字符")
+    print(f"页数: {doc['page_count']}")
+    print(f"提取时间: {doc['extracted_at']}")
+    print("\n内容预览:")
+    text = doc['content']
+    # Show at most the first 500 characters; an ellipsis marks the cut.
+    if len(text) > 500:
+        text = text[:500] + "..."
+    print(text)
+
+def test_multiple_documents_processing(save_path=None):
+    """Run the sample documents through DocumentService and print the report.
+
+    Args:
+        save_path: Optional file path. When truthy, the transformed content is
+            written there as UTF-8; otherwise a preview of at most 1000
+            characters is printed so the preview header is never left empty.
+    """
+    print("\n=== 多个文档处理测试 ===")
+    service = DocumentService()
+
+    # Sample inputs (relative to the working directory); missing ones are dropped below.
+    test_files = [
+        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
+        "document/sample_documents/ai营销括客话术和QA整理.docx",
+        "document/sample_documents/附件1-服务器租赁发票20250702.pdf"
+    ]
+    existing_files = [f for f in test_files if os.path.exists(f)]
+    if not existing_files:
+        print("没有找到测试文件")
+        print("请将测试文档放在当前目录下，或修改test_files变量")
+        return
+
+    print(f"处理文档: {existing_files}")
+    result = service.process_multiple_documents(existing_files, output_format='summary')
+    if not result['success']:
+        print(f"处理失败: {result['error']}")
+        return
+
+    print("处理摘要:")
+    summary = result['processing_summary']
+    print(f"  总文件数: {summary['total_files']}")
+    print(f"  成功提取: {summary['successful_extractions']}")
+    print(f"  失败提取: {summary['failed_extractions']}")
+
+    print("\n文档列表:")
+    for i, doc in enumerate(result['documents'], 1):
+        print(f"  {i}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} 字符")
+
+    print("\n整合内容:")
+    integrated = result['integrated_content']
+    print(f"  文档数量: {integrated['document_count']}")
+    print(f"  总内容长度: {integrated['total_content_length']}")
+    print(f"  文档类型: {integrated['document_types']}")
+    print(f"  关键主题: {integrated['key_topics']}")
+
+    print("\n内容摘要:")
+    print(integrated['content_summary'])
+
+    print("\n转换后的内容:")
+    transformed = result['transformed_content']
+    print(f"  格式类型: {transformed['format_type']}")
+    print(f"  转换时间: {transformed['transformed_at']}")
+    print("\n转换内容预览:")
+    content = transformed['content']
+    if save_path:
+        with open(save_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        print(f"转换后的内容已保存到: {save_path}")
+    else:
+        print(content[:1000] + "..." if len(content) > 1000 else content)
+
+def test_component_usage():
+    """Exercise TextExtractor, ContentIntegrator and ContentTransformer directly.
+
+    Creates a small UTF-8 sample file in the working directory when it is
+    missing, then passes it through the three components in turn.
+    """
+
+    print("\n=== 组件单独使用测试 ===")
+    test_file = "test_document.txt"
+
+    # Seed the sample document only when it is not already on disk.
+    if not os.path.exists(test_file):
+        with open(test_file, 'w', encoding='utf-8') as f:
+            f.write("""
+这是一个测试文档。
+
+主要内容包括：
+1. 文档提取功能测试
+2. 内容整合功能测试
+3. 内容转换功能测试
+
+测试文档包含中文内容，用于验证文本提取和处理功能。
+            """)
+        print(f"创建测试文件: {test_file}")
+
+    # Step 1: run the extractor on the sample file.
+    print("\n1. 文本提取器测试")
+    text_extractor = TextExtractor()
+    extracted_doc = text_extractor.extract(test_file)
+    print("提取结果:")
+    print(f"  文件名: {extracted_doc.filename}")
+    print(f"  文件类型: {extracted_doc.file_type}")
+    print(f"  内容长度: {len(extracted_doc.content)}")
+    print(f"  内容: {extracted_doc.content}")
+
+    # Step 2: integrate the single extracted document.
+    print("\n2. 内容整合器测试")
+    content_integrator = ContentIntegrator()
+    integrated_content = content_integrator.integrate_documents([extracted_doc])
+    print("整合结果:")
+    print(f"  文档数量: {integrated_content.document_count}")
+    print(f"  总内容长度: {integrated_content.total_content_length}")
+    print(f"  文档类型: {integrated_content.document_types}")
+    print(f"  关键主题: {integrated_content.key_topics}")
+    print(f"  内容摘要: {integrated_content.content_summary}")
+
+    # Step 3: transform the integrated content once per listed format.
+    print("\n3. 内容转换器测试")
+    transformer = ContentTransformer()
+    formats = ('summary', 'attraction_standard', 'product_sales', 'travel_guide', 'blog_post')
+    for format_type in formats:
+        print(f"\n--- {format_type} 格式 ---")
+        rendered = transformer.transform_content(integrated_content, format_type=format_type)
+        print("转换结果预览:")
+        preview = rendered.transformed_text
+        if len(preview) > 300:
+            preview = preview[:300] + "..."
+        print(preview)
+
+def test_service_info():
+    """Print what DocumentService reports about itself.
+
+    Covers the supported file types, the supported output formats and the status dict.
+    """
+    print("\n=== 服务信息测试 ===")
+    service = DocumentService()
+
+    print("支持的文件类型:")
+    print(f"  {service.get_supported_file_types()}")
+
+    print("\n支持的输出格式:")
+    print(f"  {service.get_supported_formats()}")
+
+    print("\n服务状态:")
+    status = service.get_service_status()
+    print(f"  服务名: {status['service_name']}")
+    print(f"  状态: {status['status']}")
+    print(f"  组件: {status['components']}")
+    print(f"  时间戳: {status['timestamp']}")
+
+def main():
+    """Run every demo in sequence and report the outcome.
+
+    Returns:
+        0 when all demos finish, 1 when any of them raises, so the script's
+        exit status reflects a failure instead of always being 0.
+    """
+    print("文档内容提取示例程序")
+    print("=" * 50)
+
+    try:
+        test_service_info()
+        test_component_usage()
+        test_single_document_extraction()
+        test_multiple_documents_processing("document/sample_documents/test.txt")
+    except Exception as e:
+        # Top-level boundary: print the error with its traceback, then signal failure.
+        print(f"测试过程中发生错误: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    print("\n" + "=" * 50)
+    print("测试完成！")
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file