bangbang-aigc-server/tests/document_processing_example_simple.py

221 lines
7.5 KiB
Python
Raw Permalink Normal View History

2025-07-31 15:35:23 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document content extraction example.

Focuses on document content extraction only; web search is not included.
"""
import os
import sys
from pathlib import Path
# Put this script's directory at the front of the Python path, presumably so
# that the `document` and `api` packages imported below can be resolved.
# NOTE(review): Path(__file__).parent is the directory that contains this
# script. If the script lives in a sub-directory of the project (e.g. tests/),
# the real project root would be one level higher -- confirm.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
from document import TextExtractor, ContentIntegrator, ContentTransformer
from api.services.document_service import DocumentService
def test_single_document_extraction():
    """Extract the text of one document and print its metadata and a preview.

    The document is looked up under the relative path ``test_document.pdf``.
    If that file does not exist, a hint is printed and no extraction is
    attempted. On success the metadata fields of the returned document are
    printed, followed by at most the first 500 characters of its content;
    on failure the reported error is printed instead.
    """
    print("=== 单个文档提取测试 ===")

    doc_service = DocumentService()

    # Placeholder path: point this at a real document before running.
    test_file = "test_document.pdf"
    if not os.path.exists(test_file):
        print(f"测试文件不存在: {test_file}")
        print("请将一个测试文档放在当前目录下或修改test_file变量")
        return

    # Text-only extraction (no integration / transformation step).
    print(f"提取文档: {test_file}")
    outcome = doc_service.extract_text_only(test_file)

    # Handle the failure case first so the happy path stays flat.
    if not outcome['success']:
        print(f"提取失败: {outcome['error']}")
        return

    info = outcome['document']
    print(f"文件名: {info['filename']}")
    print(f"文件类型: {info['file_type']}")
    print(f"文件大小: {info['file_size']} 字节")
    print(f"内容长度: {info['content_length']} 字符")
    print(f"页数: {info['page_count']}")
    print(f"提取时间: {info['extracted_at']}")

    # Show the content, truncated with an ellipsis when longer than 500 chars.
    print("\n内容预览:")
    if len(info['content']) > 500:
        print(info['content'][:500] + "...")
    else:
        print(info['content'])
def test_multiple_documents_processing(save_path=None):
    """Process several sample documents together and print the results.

    Only the sample files that actually exist on disk are handed to the
    document service. The processing summary, the per-document list, the
    integrated content and the metadata of the transformed content are
    printed; the transformed text itself is only written to ``save_path``.

    Args:
        save_path: Optional path of a file that receives the transformed
            content, written as UTF-8. When falsy (the default ``None``),
            nothing is written. The default makes the function callable
            without arguments, e.g. by a test runner that would otherwise
            try to resolve ``save_path`` as a fixture.
    """
    print("\n=== 多个文档处理测试 ===")

    service = DocumentService()

    # Sample inputs; replace with real file paths as needed.
    test_files = [
        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
        "document/sample_documents/ai营销括客话术和QA整理.docx",
        "document/sample_documents/附件1-服务器租赁发票20250702.pdf",
    ]

    # Keep only the files that are present on disk.
    existing_files = [f for f in test_files if os.path.exists(f)]
    if not existing_files:
        print("没有找到测试文件")
        print("请将测试文档放在当前目录下或修改test_files变量")
        return

    print(f"处理文档: {existing_files}")

    result = service.process_multiple_documents(existing_files, output_format='summary')
    if not result['success']:
        print(f"处理失败: {result['error']}")
        return

    print("处理摘要:")
    summary = result['processing_summary']
    print(f" 总文件数: {summary['total_files']}")
    print(f" 成功提取: {summary['successful_extractions']}")
    print(f" 失败提取: {summary['failed_extractions']}")

    print("\n文档列表:")
    for i, doc in enumerate(result['documents'], 1):
        print(f" {i}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} 字符")

    print("\n整合内容:")
    integrated = result['integrated_content']
    print(f" 文档数量: {integrated['document_count']}")
    print(f" 总内容长度: {integrated['total_content_length']}")
    print(f" 文档类型: {integrated['document_types']}")
    print(f" 关键主题: {integrated['key_topics']}")

    print("\n内容摘要:")
    print(integrated['content_summary'])

    print("\n转换后的内容:")
    transformed = result['transformed_content']
    print(f" 格式类型: {transformed['format_type']}")
    print(f" 转换时间: {transformed['transformed_at']}")

    # Only the heading is printed here; the transformed text is not echoed to
    # the console and is persisted to save_path instead (when one is given).
    print("\n转换内容预览:")
    content = transformed['content']

    if save_path:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"转换后的内容已保存到: {save_path}")
def test_component_usage():
    """Exercise the extractor, integrator and transformer one after another.

    A small UTF-8 text file named ``test_document.txt`` is created under that
    relative path when it does not exist yet (it is not removed afterwards).
    The file is extracted, the extracted document is integrated, and the
    integrated content is transformed into each of the listed formats; a
    preview of at most 300 characters is printed per format.
    """
    print("\n=== 组件单独使用测试 ===")

    test_file = "test_document.txt"

    # Create the sample input on first run.
    if not os.path.exists(test_file):
        sample_text = """
这是一个测试文档
主要内容包括
1. 文档提取功能测试
2. 内容整合功能测试
3. 内容转换功能测试
测试文档包含中文内容用于验证文本提取和处理功能
"""
        with open(test_file, 'w', encoding='utf-8') as handle:
            handle.write(sample_text)
        print(f"创建测试文件: {test_file}")

    # Step 1: text extractor
    print("\n1. 文本提取器测试")
    extracted_doc = TextExtractor().extract(test_file)
    print("提取结果:")
    print(f" 文件名: {extracted_doc.filename}")
    print(f" 文件类型: {extracted_doc.file_type}")
    print(f" 内容长度: {len(extracted_doc.content)}")
    print(f" 内容: {extracted_doc.content}")

    # Step 2: content integrator (fed with the single extracted document)
    print("\n2. 内容整合器测试")
    merged = ContentIntegrator().integrate_documents([extracted_doc])
    print("整合结果:")
    print(f" 文档数量: {merged.document_count}")
    print(f" 总内容长度: {merged.total_content_length}")
    print(f" 文档类型: {merged.document_types}")
    print(f" 关键主题: {merged.key_topics}")
    print(f" 内容摘要: {merged.content_summary}")

    # Step 3: content transformer, tried once per output format
    print("\n3. 内容转换器测试")
    transformer = ContentTransformer()
    format_names = ('summary', 'attraction_standard', 'product_sales', 'travel_guide', 'blog_post')
    for fmt in format_names:
        print(f"\n--- {fmt} 格式 ---")
        converted = transformer.transform_content(merged, format_type=fmt)
        print("转换结果预览:")
        text = converted.transformed_text
        # Truncate long output with an ellipsis.
        if len(text) > 300:
            print(text[:300] + "...")
        else:
            print(text)
def test_service_info():
    """Print what the document service reports about itself.

    Queries the supported file types, the supported output formats and the
    service status, and prints each in turn.
    """
    print("\n=== 服务信息测试 ===")

    doc_service = DocumentService()

    print("支持的文件类型:")
    print(f" {doc_service.get_supported_file_types()}")

    print("\n支持的输出格式:")
    print(f" {doc_service.get_supported_formats()}")

    print("\n服务状态:")
    state = doc_service.get_service_status()
    print(f" 服务名: {state['service_name']}")
    print(f" 状态: {state['status']}")
    print(f" 组件: {state['components']}")
    print(f" 时间戳: {state['timestamp']}")
def main():
    """Run all example scenarios in sequence.

    Any exception raised by a scenario is caught here: its message and the
    full traceback are printed, and the remaining scenarios are skipped.
    """
    separator = "=" * 50
    print("文档内容提取示例程序")
    print(separator)

    try:
        # Service information
        test_service_info()
        # Components used individually
        test_component_usage()
        # Single document extraction
        test_single_document_extraction()
        # Multiple document processing, saving the transformed text to a file
        test_multiple_documents_processing("document/sample_documents/test.txt")

        print("\n" + separator)
        print("测试完成!")
    except Exception as exc:
        print(f"测试过程中发生错误: {exc!s}")
        # Imported lazily: only needed when something went wrong.
        import traceback
        traceback.print_exc()
# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()