#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Recovered from git patch 12aaf88 (jinye_huang, 2025-07-14), which adds this
# file as tests/document_processing_example_simple.py.

"""
Document content extraction example.

Focuses on the document content extraction features only; no web search
is involved.
"""

import os
import sys
import traceback
from pathlib import Path

# Add the project root to the Python path. This script lives in ``tests/``,
# so the root is one level above this file's directory. (Using only
# ``Path(__file__).parent`` would add ``tests/`` itself, which is already
# sys.path[0] when the script is run directly.)
# NOTE(review): assumes the ``document`` and ``api`` packages sit directly
# under the project root -- confirm against the repository layout.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))

from document import TextExtractor, ContentIntegrator, ContentTransformer  # noqa: E402
from api.services.document_service import DocumentService  # noqa: E402


def _preview(text, limit):
    """Return *text* cut to *limit* characters, with "..." appended if cut."""
    return text[:limit] + "..." if len(text) > limit else text


def test_single_document_extraction():
    """Extract a single document through DocumentService and print the result.

    Returns early with a hint when the sample file is missing.
    """
    print("=== 单个文档提取测试 ===")

    # Create the document service.
    service = DocumentService()

    # Path of the test file (replace with a real file path).
    test_file = "test_document.pdf"

    if not os.path.exists(test_file):
        print(f"测试文件不存在: {test_file}")
        print("请将一个测试文档放在当前目录下,或修改test_file变量")
        return

    # Extract text only.
    print(f"提取文档: {test_file}")
    result = service.extract_text_only(test_file)

    if result['success']:
        doc = result['document']
        print(f"文件名: {doc['filename']}")
        print(f"文件类型: {doc['file_type']}")
        print(f"文件大小: {doc['file_size']} 字节")
        print(f"内容长度: {doc['content_length']} 字符")
        print(f"页数: {doc['page_count']}")
        print(f"提取时间: {doc['extracted_at']}")
        print("\n内容预览:")
        print(_preview(doc['content'], 500))
    else:
        print(f"提取失败: {result['error']}")


def test_multiple_documents_processing(save_path=None):
    """Process several sample documents together and print the outcome.

    Args:
        save_path: Optional path; when truthy, the transformed content is
            written there as UTF-8 text. Defaults to None (do not save).
    """
    print("\n=== 多个文档处理测试 ===")

    # Create the document service.
    service = DocumentService()

    # Test file list (replace with real file paths). Paths are relative to
    # the current working directory.
    test_files = [
        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
        "document/sample_documents/ai营销括客话术和QA整理.docx",
        "document/sample_documents/附件1-服务器租赁发票20250702.pdf",
    ]

    # Keep only the files that actually exist.
    existing_files = [f for f in test_files if os.path.exists(f)]

    if not existing_files:
        print("没有找到测试文件")
        print("请将测试文档放在当前目录下,或修改test_files变量")
        return

    print(f"处理文档: {existing_files}")

    # Process all documents in one call.
    result = service.process_multiple_documents(existing_files, output_format='summary')

    if not result['success']:
        print(f"处理失败: {result['error']}")
        return

    print("处理摘要:")
    summary = result['processing_summary']
    print(f" 总文件数: {summary['total_files']}")
    print(f" 成功提取: {summary['successful_extractions']}")
    print(f" 失败提取: {summary['failed_extractions']}")

    print("\n文档列表:")
    for i, doc in enumerate(result['documents'], 1):
        print(f" {i}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} 字符")

    print("\n整合内容:")
    integrated = result['integrated_content']
    print(f" 文档数量: {integrated['document_count']}")
    print(f" 总内容长度: {integrated['total_content_length']}")
    print(f" 文档类型: {integrated['document_types']}")
    print(f" 关键主题: {integrated['key_topics']}")

    print("\n内容摘要:")
    print(integrated['content_summary'])

    print("\n转换后的内容:")
    transformed = result['transformed_content']
    print(f" 格式类型: {transformed['format_type']}")
    print(f" 转换时间: {transformed['transformed_at']}")
    print("\n转换内容预览:")
    content = transformed['content']
    # NOTE: the on-screen preview of ``content`` was disabled in the original
    # script; the content is only written to ``save_path`` when one is given.
    if save_path:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"转换后的内容已保存到: {save_path}")


def test_component_usage():
    """Use the extractor, integrator and transformer components directly.

    Creates ``test_document.txt`` in the current directory when it does not
    exist yet, then runs it through each component and prints the results.
    """
    print("\n=== 组件单独使用测试 ===")

    # Test file.
    test_file = "test_document.txt"

    # Create the test file if it is missing.
    if not os.path.exists(test_file):
        with open(test_file, 'w', encoding='utf-8') as f:
            f.write("""
这是一个测试文档。

主要内容包括:
1. 文档提取功能测试
2. 内容整合功能测试
3. 内容转换功能测试

测试文档包含中文内容,用于验证文本提取和处理功能。
            """)
        print(f"创建测试文件: {test_file}")

    # 1. Text extractor.
    print("\n1. 文本提取器测试")
    extractor = TextExtractor()
    extracted_doc = extractor.extract(test_file)

    print("提取结果:")
    print(f" 文件名: {extracted_doc.filename}")
    print(f" 文件类型: {extracted_doc.file_type}")
    print(f" 内容长度: {len(extracted_doc.content)}")
    print(f" 内容: {extracted_doc.content}")

    # 2. Content integrator.
    print("\n2. 内容整合器测试")
    integrator = ContentIntegrator()
    integrated_content = integrator.integrate_documents([extracted_doc])

    print("整合结果:")
    print(f" 文档数量: {integrated_content.document_count}")
    print(f" 总内容长度: {integrated_content.total_content_length}")
    print(f" 文档类型: {integrated_content.document_types}")
    print(f" 关键主题: {integrated_content.key_topics}")
    print(f" 内容摘要: {integrated_content.content_summary}")

    # 3. Content transformer.
    print("\n3. 内容转换器测试")
    transformer = ContentTransformer()

    # Try each output format in turn.
    formats = ['summary', 'attraction_standard', 'product_sales', 'travel_guide', 'blog_post']

    for format_type in formats:
        print(f"\n--- {format_type} 格式 ---")
        transformed_content = transformer.transform_content(integrated_content, format_type=format_type)
        print("转换结果预览:")
        print(_preview(transformed_content.transformed_text, 300))


def test_service_info():
    """Print the file types, output formats and status DocumentService reports."""
    print("\n=== 服务信息测试 ===")

    service = DocumentService()

    print("支持的文件类型:")
    file_types = service.get_supported_file_types()
    print(f" {file_types}")

    print("\n支持的输出格式:")
    output_formats = service.get_supported_formats()
    print(f" {output_formats}")

    print("\n服务状态:")
    status = service.get_service_status()
    print(f" 服务名: {status['service_name']}")
    print(f" 状态: {status['status']}")
    print(f" 组件: {status['components']}")
    print(f" 时间戳: {status['timestamp']}")


def main():
    """Run every demo in order.

    Returns:
        0 when all demos finish, 1 when any of them raises. The exception
        message and traceback are printed before returning.
    """
    print("文档内容提取示例程序")
    print("=" * 50)

    try:
        # Service information.
        test_service_info()

        # Stand-alone component usage.
        test_component_usage()

        # Single document extraction.
        test_single_document_extraction()

        # Multiple document processing.
        test_multiple_documents_processing("document/sample_documents/test.txt")
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the
        # return value so the process does not exit with status 0.
        print(f"测试过程中发生错误: {e}")
        traceback.print_exc()
        return 1

    print("\n" + "=" * 50)
    print("测试完成!")
    return 0


if __name__ == "__main__":
    sys.exit(main())