bangbang-aigc-server/tests/document_processing_example_simple.py
2025-07-31 15:35:23 +08:00

221 lines
7.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
文档内容提取示例
专注于文档内容提取功能,不包含网络搜索
"""
import os
import sys
from pathlib import Path
# Put the directory that contains this script at the front of sys.path so the
# imports that follow are resolved against it first.
# NOTE(review): the original comment called this the "project root", but the
# value is simply this file's own directory — confirm that the `document` and
# `api` packages are actually importable from there.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
from document import TextExtractor, ContentIntegrator, ContentTransformer
from api.services.document_service import DocumentService
def test_single_document_extraction():
    """Run a text-only extraction on one document and print the outcome.

    The input is ``test_document.pdf``, looked up relative to the current
    working directory. When the file is missing a hint is printed and the
    function returns without calling the service. On success the document's
    metadata fields are printed followed by a content preview capped at 500
    characters; on failure the error reported by the service is printed.
    """
    print("=== 单个文档提取测试 ===")

    doc_service = DocumentService()

    # Placeholder path: point this at a real document before running.
    test_file = "test_document.pdf"

    # Guard clause: without an input file there is nothing to extract.
    if not os.path.exists(test_file):
        print(f"测试文件不存在: {test_file}")
        print("请将一个测试文档放在当前目录下或修改test_file变量")
        return

    # Text-only extraction.
    print(f"提取文档: {test_file}")
    outcome = doc_service.extract_text_only(test_file)

    # Guard clause: report the failure and stop.
    if not outcome['success']:
        print(f"提取失败: {outcome['error']}")
        return

    doc = outcome['document']
    print(f"文件名: {doc['filename']}")
    print(f"文件类型: {doc['file_type']}")
    print(f"文件大小: {doc['file_size']} 字节")
    print(f"内容长度: {doc['content_length']} 字符")
    print(f"页数: {doc['page_count']}")
    print(f"提取时间: {doc['extracted_at']}")

    # Show at most the first 500 characters, marking truncation with "...".
    print("\n内容预览:")
    if len(doc['content']) > 500:
        print(doc['content'][:500] + "...")
    else:
        print(doc['content'])
def test_multiple_documents_processing(save_path):
    """Process several documents in one call and print the combined report.

    Only the candidate files that exist on disk are handed to
    ``DocumentService.process_multiple_documents`` (with
    ``output_format='summary'``). The processing summary, the per-document
    list, the integrated content and the metadata of the transformed content
    are printed in that order.

    Args:
        save_path: Destination file for the transformed content, written as
            UTF-8. A falsy value skips the write.
    """
    print("\n=== 多个文档处理测试 ===")

    doc_service = DocumentService()

    # Candidate inputs; replace these with real document paths as needed.
    test_files = [
        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
        "document/sample_documents/ai营销括客话术和QA整理.docx",
        "document/sample_documents/附件1-服务器租赁发票20250702.pdf",
    ]

    # Keep only the candidates that are actually present on disk.
    existing_files = list(filter(os.path.exists, test_files))
    if not existing_files:
        print("没有找到测试文件")
        print("请将测试文档放在当前目录下或修改test_files变量")
        return

    print(f"处理文档: {existing_files}")

    outcome = doc_service.process_multiple_documents(
        existing_files,
        output_format='summary',
    )

    # Guard clause: report the failure and stop.
    if not outcome['success']:
        print(f"处理失败: {outcome['error']}")
        return

    # Processing statistics.
    print("处理摘要:")
    stats = outcome['processing_summary']
    print(f" 总文件数: {stats['total_files']}")
    print(f" 成功提取: {stats['successful_extractions']}")
    print(f" 失败提取: {stats['failed_extractions']}")

    # One numbered line per document, counting from 1.
    print("\n文档列表:")
    for index, doc in enumerate(outcome['documents'], start=1):
        print(f" {index}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} 字符")

    # Integrated view across all documents.
    print("\n整合内容:")
    merged = outcome['integrated_content']
    print(f" 文档数量: {merged['document_count']}")
    print(f" 总内容长度: {merged['total_content_length']}")
    print(f" 文档类型: {merged['document_types']}")
    print(f" 关键主题: {merged['key_topics']}")

    print("\n内容摘要:")
    print(merged['content_summary'])

    # Transformed output: only its metadata is printed.
    print("\n转换后的内容:")
    converted = outcome['transformed_content']
    print(f" 格式类型: {converted['format_type']}")
    print(f" 转换时间: {converted['transformed_at']}")

    # The preview heading is emitted, but the content itself is not echoed to
    # the console; it is only written to ``save_path`` when one is given.
    print("\n转换内容预览:")
    content = converted['content']

    if save_path:
        with open(save_path, 'w', encoding='utf-8') as out_file:
            out_file.write(content)
        print(f"转换后的内容已保存到: {save_path}")
def test_component_usage():
    """Drive the extractor, integrator and transformer directly, one by one.

    A small UTF-8 sample file ``test_document.txt`` is created in the current
    working directory if it does not exist yet (it is not removed
    afterwards). The file is run through ``TextExtractor``, the extracted
    document through ``ContentIntegrator``, and the integrated content through
    ``ContentTransformer`` once per output format, printing a preview capped
    at 300 characters for each format.
    """
    print("\n=== 组件单独使用测试 ===")

    test_file = "test_document.txt"

    # Create the sample input on first use only.
    if not os.path.exists(test_file):
        sample_text = (
            "\n"
            "这是一个测试文档。\n"
            "主要内容包括:\n"
            "1. 文档提取功能测试\n"
            "2. 内容整合功能测试\n"
            "3. 内容转换功能测试\n"
            "测试文档包含中文内容,用于验证文本提取和处理功能。\n"
        )
        with open(test_file, 'w', encoding='utf-8') as sample_file:
            sample_file.write(sample_text)
        print(f"创建测试文件: {test_file}")

    # Step 1: text extraction.
    print("\n1. 文本提取器测试")
    extracted_doc = TextExtractor().extract(test_file)
    print("提取结果:")
    print(f" 文件名: {extracted_doc.filename}")
    print(f" 文件类型: {extracted_doc.file_type}")
    print(f" 内容长度: {len(extracted_doc.content)}")
    print(f" 内容: {extracted_doc.content}")

    # Step 2: content integration, fed a single-document list.
    print("\n2. 内容整合器测试")
    integrated_content = ContentIntegrator().integrate_documents([extracted_doc])
    print("整合结果:")
    print(f" 文档数量: {integrated_content.document_count}")
    print(f" 总内容长度: {integrated_content.total_content_length}")
    print(f" 文档类型: {integrated_content.document_types}")
    print(f" 关键主题: {integrated_content.key_topics}")
    print(f" 内容摘要: {integrated_content.content_summary}")

    # Step 3: content transformation, once per output format.
    print("\n3. 内容转换器测试")
    transformer = ContentTransformer()

    for format_type in (
        'summary',
        'attraction_standard',
        'product_sales',
        'travel_guide',
        'blog_post',
    ):
        print(f"\n--- {format_type} 格式 ---")
        transformed = transformer.transform_content(
            integrated_content,
            format_type=format_type,
        )
        print("转换结果预览:")

        # Show at most the first 300 characters, marking truncation with "...".
        text = transformed.transformed_text
        if len(text) > 300:
            print(text[:300] + "...")
        else:
            print(text)
def test_service_info():
    """Print what ``DocumentService`` reports about itself.

    Emits, in order: the supported file types, the supported output formats,
    and the service-status fields (name, status, components, timestamp).
    """
    print("\n=== 服务信息测试 ===")

    doc_service = DocumentService()

    # Each heading is printed before the corresponding service call is made.
    print("支持的文件类型:")
    print(f" {doc_service.get_supported_file_types()}")

    print("\n支持的输出格式:")
    print(f" {doc_service.get_supported_formats()}")

    print("\n服务状态:")
    info = doc_service.get_service_status()
    print(f" 服务名: {info['service_name']}")
    print(f" 状态: {info['status']}")
    print(f" 组件: {info['components']}")
    print(f" 时间戳: {info['timestamp']}")
def main():
    """Run every demo in sequence and report any failure.

    The demos run in a fixed order: service info, direct component usage,
    single-document extraction, then multi-document processing (whose
    transformed output is saved under ``document/sample_documents/test.txt``).
    Any ``Exception`` raised along the way is caught here, printed together
    with its traceback, and not re-raised.
    """
    separator = "=" * 50

    print("文档内容提取示例程序")
    print(separator)

    try:
        # Service capabilities and status.
        test_service_info()

        # Extractor / integrator / transformer used on their own.
        test_component_usage()

        # One document through the service.
        test_single_document_extraction()

        # Several documents through the service, saving the result to disk.
        test_multiple_documents_processing("document/sample_documents/test.txt")

        print("\n" + separator)
        print("测试完成!")
    except Exception as exc:
        # Top-level boundary of the script: show the message and the full
        # traceback rather than letting the exception propagate.
        print(f"测试过程中发生错误: {exc!s}")
        import traceback
        traceback.print_exc()
# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()