#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Document processing service layer.

Integrates text extraction, document integration and content transformation
behind a single service facade.
"""

import logging
from typing import List, Dict, Optional, Any
from pathlib import Path
from datetime import datetime

from core.document import (
    TextExtractor, ExtractedDocument,
    ContentIntegrator, IntegratedContent,
    ContentTransformer, TransformedContent
)

logger = logging.getLogger(__name__)


class DocumentService:
    """Facade that chains extraction, integration and transformation.

    Every public ``process_*`` / ``extract_*`` method catches exceptions and
    reports them in the returned dict (``'success': False`` plus ``'error'``)
    instead of propagating them to the caller.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the service and its three pipeline components.

        Args:
            config: Optional settings. Only the ``'transformer'`` entry is
                read here; it is handed to ``ContentTransformer`` (an empty
                dict is used when the entry is missing).
        """
        self.config = config or {}

        # Pipeline components.
        self.text_extractor = TextExtractor()
        self.content_integrator = ContentIntegrator()
        self.content_transformer = ContentTransformer(
            self.config.get('transformer', {})
        )

        logger.info("文档处理服务初始化完成")

    # ------------------------------------------------------------------
    # Private serialisation helpers shared by the process_* methods.
    # ------------------------------------------------------------------

    @staticmethod
    def _document_summary(doc: ExtractedDocument) -> Dict[str, Any]:
        """Return the per-document metadata dict (without the text itself)."""
        return {
            'filename': doc.filename,
            'file_type': doc.file_type,
            'file_size': doc.file_size,
            'content_length': len(doc.content),
            'extracted_at': doc.extracted_at.isoformat()
        }

    @staticmethod
    def _integrated_summary(integrated: IntegratedContent) -> Dict[str, Any]:
        """Return the serialisable view of an integration result."""
        return {
            'document_count': integrated.document_count,
            'total_content_length': integrated.total_content_length,
            'document_types': integrated.document_types,
            'content_summary': integrated.content_summary,
            'key_topics': integrated.key_topics
        }

    @staticmethod
    def _transformed_summary(transformed: TransformedContent) -> Dict[str, Any]:
        """Return the serialisable view of a transformation result."""
        return {
            'format_type': transformed.format_type,
            'content': transformed.transformed_text,
            'transformation_metadata': transformed.transformation_metadata,
            'transformed_at': transformed.transformed_at.isoformat()
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def process_single_document(self, file_path: str) -> Dict[str, Any]:
        """Extract, integrate and summarise one document.

        The transformation step always uses ``format_type='summary'``.

        Args:
            file_path: Path of the file to process.

        Returns:
            On success, a dict with ``'success': True`` and the keys
            ``'document'``, ``'integrated_content'`` and
            ``'transformed_content'``. On any exception, a dict with
            ``'success': False``, ``'error'`` (the exception text) and
            ``'file_path'``.
        """
        try:
            # 1. Extract text.
            extracted_doc = self.text_extractor.extract(file_path)

            # 2. Integrate (a one-element list for the single document).
            integrated_content = self.content_integrator.integrate_documents(
                [extracted_doc]
            )

            # 3. Transform into the summary format.
            transformed_content = self.content_transformer.transform_content(
                integrated_content,
                format_type='summary'
            )

            return {
                'success': True,
                'document': self._document_summary(extracted_doc),
                'integrated_content': self._integrated_summary(
                    integrated_content
                ),
                'transformed_content': self._transformed_summary(
                    transformed_content
                )
            }

        except Exception as e:
            # logger.exception keeps the same message and adds the traceback.
            logger.exception("处理单个文档失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_path': file_path
            }

    def process_multiple_documents(self, file_paths: List[str],
                                   output_format: str = 'summary') -> Dict[str, Any]:
        """Extract several documents, integrate them and transform the result.

        Extraction failures are collected per file and do not abort the run;
        the call only fails outright when no file could be extracted or when
        a later step raises.

        Args:
            file_paths: Paths of the files to process.
            output_format: Format name passed to the transformer as
                ``format_type``.

        Returns:
            On success, a dict with ``'success': True`` and the keys
            ``'processing_summary'``, ``'documents'``,
            ``'integrated_content'`` and ``'transformed_content'``.
            When nothing was extracted, ``'success': False`` with ``'error'``
            and ``'failed_extractions'``. On any other exception,
            ``'success': False`` with ``'error'`` and ``'file_paths'``.
        """
        try:
            # 1. Extract every document, remembering the ones that fail.
            extracted_docs = []
            failed_extractions = []

            for file_path in file_paths:
                try:
                    extracted_docs.append(
                        self.text_extractor.extract(file_path)
                    )
                except Exception as e:
                    logger.exception("提取文档失败 %s: %s", file_path, e)
                    failed_extractions.append({
                        'file_path': file_path,
                        'error': str(e)
                    })

            # Nothing usable (also the case for an empty input list).
            if not extracted_docs:
                return {
                    'success': False,
                    'error': '没有成功提取的文档',
                    'failed_extractions': failed_extractions
                }

            # 2. Integrate the extracted documents.
            integrated_content = self.content_integrator.integrate_documents(
                extracted_docs
            )

            # 3. Transform into the requested output format.
            transformed_content = self.content_transformer.transform_content(
                integrated_content,
                format_type=output_format
            )

            return {
                'success': True,
                'processing_summary': {
                    'total_files': len(file_paths),
                    'successful_extractions': len(extracted_docs),
                    'failed_extractions': len(failed_extractions),
                    'failed_files': failed_extractions
                },
                'documents': [
                    self._document_summary(doc) for doc in extracted_docs
                ],
                'integrated_content': self._integrated_summary(
                    integrated_content
                ),
                'transformed_content': self._transformed_summary(
                    transformed_content
                )
            }

        except Exception as e:
            logger.exception("处理多个文档失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_paths': file_paths
            }

    def extract_text_only(self, file_path: str) -> Dict[str, Any]:
        """Extract text from one file without integrating or transforming it.

        Args:
            file_path: Path of the file to extract.

        Returns:
            On success, ``'success': True`` and a ``'document'`` dict that
            includes the full ``'content'``, ``'page_count'`` and
            ``'metadata'``. On any exception, ``'success': False`` with
            ``'error'`` and ``'file_path'``.
        """
        try:
            extracted_doc = self.text_extractor.extract(file_path)

            return {
                'success': True,
                'document': {
                    'filename': extracted_doc.filename,
                    'file_type': extracted_doc.file_type,
                    'file_size': extracted_doc.file_size,
                    'content': extracted_doc.content,
                    'content_length': len(extracted_doc.content),
                    'page_count': extracted_doc.page_count,
                    'metadata': extracted_doc.metadata,
                    'extracted_at': extracted_doc.extracted_at.isoformat()
                }
            }

        except Exception as e:
            logger.exception("提取文本失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_path': file_path
            }

    def get_supported_formats(self) -> List[str]:
        """Return the output formats reported by the content transformer."""
        return self.content_transformer.get_supported_formats()

    def get_supported_file_types(self) -> List[str]:
        """Return the input file types reported by the text extractor."""
        return self.text_extractor.get_supported_formats()

    def validate_file_path(self, file_path: str) -> Dict[str, Any]:
        """Check that a path exists, is a file and has a supported extension.

        Args:
            file_path: Path to validate.

        Returns:
            ``{'valid': True, 'file_info': {...}}`` when all checks pass,
            otherwise ``{'valid': False, 'error': ...}`` (plus
            ``'supported_formats'`` for an unsupported extension). Exceptions
            raised during validation are reported the same way.
        """
        try:
            path = Path(file_path)

            if not path.exists():
                return {
                    'valid': False,
                    'error': '文件不存在'
                }

            if not path.is_file():
                return {
                    'valid': False,
                    'error': '路径不是文件'
                }

            # Path.suffix includes the leading dot (e.g. ".pdf").
            # NOTE(review): this assumes the extractor lists its types in the
            # same dotted, lower-case form - confirm against TextExtractor.
            file_extension = path.suffix.lower()
            supported_formats = self.get_supported_file_types()

            if file_extension not in supported_formats:
                return {
                    'valid': False,
                    'error': f'不支持的文件格式: {file_extension}',
                    'supported_formats': supported_formats
                }

            return {
                'valid': True,
                'file_info': {
                    'filename': path.name,
                    'file_extension': file_extension,
                    'file_size': path.stat().st_size,
                    'absolute_path': str(path.absolute())
                }
            }

        except Exception as e:
            return {
                'valid': False,
                'error': f'文件验证失败: {str(e)}'
            }

    def get_service_status(self) -> Dict[str, Any]:
        """Return a status snapshot of the service.

        The ``'status'`` and per-component values are fixed ``'active'``
        literals; no health check is performed here. ``'timestamp'`` is the
        local ``datetime.now()`` in ISO format.
        """
        return {
            'service_name': 'DocumentService',
            'status': 'active',
            'components': {
                'text_extractor': 'active',
                'content_integrator': 'active',
                'content_transformer': 'active'
            },
            'supported_file_types': self.get_supported_file_types(),
            'supported_output_formats': self.get_supported_formats(),
            'timestamp': datetime.now().isoformat()
        }