221 lines
7.5 KiB
Python
221 lines
7.5 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
文档内容提取示例
|
|||
|
|
专注于文档内容提取功能,不包含网络搜索
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
# Put the directory that contains this script at the front of the Python
# import path (the original note calls it the project root), presumably so
# the project-local `document` and `api` packages imported below resolve.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
|
|||
|
|
|
|||
|
|
from document import TextExtractor, ContentIntegrator, ContentTransformer
|
|||
|
|
from api.services.document_service import DocumentService
|
|||
|
|
|
|||
|
|
def test_single_document_extraction():
    """Extract text from a single document and print what came back.

    A ``DocumentService`` is created, then ``test_document.pdf`` is looked up
    relative to the current working directory. If the file is missing, a hint
    is printed and the function returns. Otherwise ``extract_text_only`` is
    called and either the document metadata plus at most the first 500
    characters of its content are printed, or the reported error.
    """
    print("=== 单个文档提取测试 ===")

    # Built before the existence check, so the service is constructed even
    # when the sample file is absent.
    doc_service = DocumentService()

    # Placeholder path: replace with a real document before running.
    test_file = "test_document.pdf"

    if not os.path.exists(test_file):
        print(f"测试文件不存在: {test_file}")
        print("请将一个测试文档放在当前目录下,或修改test_file变量")
        return

    # Text-only extraction.
    print(f"提取文档: {test_file}")
    outcome = doc_service.extract_text_only(test_file)

    if not outcome['success']:
        print(f"提取失败: {outcome['error']}")
        return

    info = outcome['document']
    print(f"文件名: {info['filename']}")
    print(f"文件类型: {info['file_type']}")
    print(f"文件大小: {info['file_size']} 字节")
    print(f"内容长度: {info['content_length']} 字符")
    print(f"页数: {info['page_count']}")
    print(f"提取时间: {info['extracted_at']}")

    print("\n内容预览:")
    # Long content is cut to 500 characters and marked with an ellipsis.
    if len(info['content']) > 500:
        print(info['content'][:500] + "...")
    else:
        print(info['content'])
|
|||
|
|
|
|||
|
|
def test_multiple_documents_processing(save_path=None):
    """Process several sample documents together and print the combined result.

    The hard-coded sample paths are filtered down to those that exist
    (relative to the current working directory). If none exist, a hint is
    printed and the function returns. Otherwise the existing files are passed
    to ``DocumentService.process_multiple_documents`` with
    ``output_format='summary'`` and the processing summary, per-document
    list, integrated content and transformed content are printed. On failure
    only the reported error is printed.

    Args:
        save_path: Optional path of a text file that receives the transformed
            content (UTF-8). Missing parent directories are created. When
            omitted or falsy, nothing is written and up to the first 1000
            characters of the transformed content are printed instead.
    """
    print("\n=== 多个文档处理测试 ===")

    service = DocumentService()

    # Sample inputs; replace with real file paths as needed.
    test_files = [
        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
        "document/sample_documents/ai营销括客话术和QA整理.docx",
        "document/sample_documents/附件1-服务器租赁发票20250702.pdf",
    ]

    # Only files that are actually present are handed to the service.
    existing_files = [f for f in test_files if os.path.exists(f)]

    if not existing_files:
        print("没有找到测试文件")
        print("请将测试文档放在当前目录下,或修改test_files变量")
        return

    print(f"处理文档: {existing_files}")

    result = service.process_multiple_documents(existing_files, output_format='summary')

    if not result['success']:
        print(f"处理失败: {result['error']}")
        return

    print("处理摘要:")
    summary = result['processing_summary']
    print(f" 总文件数: {summary['total_files']}")
    print(f" 成功提取: {summary['successful_extractions']}")
    print(f" 失败提取: {summary['failed_extractions']}")

    print("\n文档列表:")
    for i, doc in enumerate(result['documents'], 1):
        print(f" {i}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} 字符")

    print("\n整合内容:")
    integrated = result['integrated_content']
    print(f" 文档数量: {integrated['document_count']}")
    print(f" 总内容长度: {integrated['total_content_length']}")
    print(f" 文档类型: {integrated['document_types']}")
    print(f" 关键主题: {integrated['key_topics']}")

    print("\n内容摘要:")
    print(integrated['content_summary'])

    print("\n转换后的内容:")
    transformed = result['transformed_content']
    print(f" 格式类型: {transformed['format_type']}")
    print(f" 转换时间: {transformed['transformed_at']}")

    print("\n转换内容预览:")
    content = transformed['content']
    if save_path:
        # open(..., 'w') does not create directories, so make sure the
        # target directory exists before writing.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"转换后的内容已保存到: {save_path}")
    else:
        # Nothing is saved, so show a bounded preview on the console.
        print(content[:1000] + "..." if len(content) > 1000 else content)
|
|||
|
|
|
|||
|
|
def test_component_usage():
    """Exercise the extractor, integrator and transformer one after another.

    A small UTF-8 text file ``test_document.txt`` is created in the current
    working directory if it does not exist yet (an existing file is left
    untouched, and the file is not removed afterwards). The file is run
    through ``TextExtractor.extract``, the extracted document through
    ``ContentIntegrator.integrate_documents``, and the integrated content
    through ``ContentTransformer.transform_content`` once per output format.
    Each stage's result is printed; transformed text is cut to 300 characters.
    """
    print("\n=== 组件单独使用测试 ===")

    test_file = "test_document.txt"

    # Create the sample input on first use.
    if not os.path.exists(test_file):
        sample_text = """
这是一个测试文档。

主要内容包括:
1. 文档提取功能测试
2. 内容整合功能测试
3. 内容转换功能测试

测试文档包含中文内容,用于验证文本提取和处理功能。
"""
        with open(test_file, 'w', encoding='utf-8') as handle:
            handle.write(sample_text)
        print(f"创建测试文件: {test_file}")

    # 1. Text extractor
    print("\n1. 文本提取器测试")
    extracted = TextExtractor().extract(test_file)

    print("提取结果:")
    print(f" 文件名: {extracted.filename}")
    print(f" 文件类型: {extracted.file_type}")
    print(f" 内容长度: {len(extracted.content)}")
    print(f" 内容: {extracted.content}")

    # 2. Content integrator (fed a one-element list)
    print("\n2. 内容整合器测试")
    merged = ContentIntegrator().integrate_documents([extracted])

    print("整合结果:")
    print(f" 文档数量: {merged.document_count}")
    print(f" 总内容长度: {merged.total_content_length}")
    print(f" 文档类型: {merged.document_types}")
    print(f" 关键主题: {merged.key_topics}")
    print(f" 内容摘要: {merged.content_summary}")

    # 3. Content transformer, once per output format
    print("\n3. 内容转换器测试")
    transformer = ContentTransformer()

    for format_type in ('summary', 'attraction_standard', 'product_sales', 'travel_guide', 'blog_post'):
        print(f"\n--- {format_type} 格式 ---")
        converted = transformer.transform_content(merged, format_type=format_type)
        print("转换结果预览:")
        text = converted.transformed_text
        if len(text) > 300:
            print(text[:300] + "...")
        else:
            print(text)
|
|||
|
|
|
|||
|
|
def test_service_info():
    """Print what a ``DocumentService`` instance reports about itself.

    Shows, in order, the values returned by ``get_supported_file_types``,
    ``get_supported_formats`` and ``get_service_status``. From the status
    mapping the ``service_name``, ``status``, ``components`` and
    ``timestamp`` entries are printed.
    """
    print("\n=== 服务信息测试 ===")

    doc_service = DocumentService()

    # Each heading is printed before the matching service call is made.
    print("支持的文件类型:")
    print(f" {doc_service.get_supported_file_types()}")

    print("\n支持的输出格式:")
    print(f" {doc_service.get_supported_formats()}")

    print("\n服务状态:")
    state = doc_service.get_service_status()
    print(f" 服务名: {state['service_name']}")
    print(f" 状态: {state['status']}")
    print(f" 组件: {state['components']}")
    print(f" 时间戳: {state['timestamp']}")
|
|||
|
|
|
|||
|
|
def main():
    """Run all demo functions in sequence and report any failure.

    Order: service info, individual components, single-document extraction,
    multi-document processing (whose transformed output is saved to
    ``document/sample_documents/test.txt``). Any ``Exception`` raised along
    the way stops the remaining steps; its message and traceback are printed
    and the function returns normally.
    """
    banner = "=" * 50

    print("文档内容提取示例程序")
    print(banner)

    try:
        # Service information
        test_service_info()

        # Components used on their own
        test_component_usage()

        # Single-document extraction
        test_single_document_extraction()

        # Multi-document processing
        test_multiple_documents_processing("document/sample_documents/test.txt")

        print("\n" + banner)
        print("测试完成!")

    except Exception as exc:
        # Top-level boundary: report the failure instead of propagating it.
        print(f"测试过程中发生错误: {exc!s}")
        import traceback
        traceback.print_exc()
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
main()
|