278 lines
9.9 KiB
Python
278 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
文档处理服务层
|
|
整合文本提取、文档解析和内容转换功能
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Dict, Optional, Any
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
from core.document import (
|
|
TextExtractor, ExtractedDocument,
|
|
ContentIntegrator, IntegratedContent,
|
|
ContentTransformer, TransformedContent
|
|
)
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentService:
    """Facade over the document-processing pipeline.

    Chains a ``TextExtractor``, a ``ContentIntegrator`` and a
    ``ContentTransformer`` (all imported from ``core.document``) and reports
    every outcome as a plain dictionary.

    ``process_single_document``, ``process_multiple_documents``,
    ``extract_text_only`` and ``validate_file_path`` catch any exception and
    return a failure dictionary instead of raising; the remaining methods
    let exceptions from the underlying components propagate.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the three pipeline components.

        Args:
            config: Optional settings mapping. Only its ``'transformer'``
                entry is read here; it is handed to ``ContentTransformer``
                (an empty dict is used when the entry is missing).
        """
        self.config = config or {}

        # Pipeline components.
        self.text_extractor = TextExtractor()
        self.content_integrator = ContentIntegrator()
        self.content_transformer = ContentTransformer(self.config.get('transformer', {}))

        logger.info("文档处理服务初始化完成")

    # ------------------------------------------------------------------
    # Private helpers shared by the processing methods
    # ------------------------------------------------------------------

    @staticmethod
    def _describe_document(doc: "ExtractedDocument") -> Dict[str, Any]:
        """Return the short per-document summary (without the text itself)."""
        return {
            'filename': doc.filename,
            'file_type': doc.file_type,
            'file_size': doc.file_size,
            'content_length': len(doc.content),
            'extracted_at': doc.extracted_at.isoformat(),
        }

    @staticmethod
    def _describe_integrated(integrated: "IntegratedContent") -> Dict[str, Any]:
        """Return the reportable fields of an integration result."""
        return {
            'document_count': integrated.document_count,
            'total_content_length': integrated.total_content_length,
            'document_types': integrated.document_types,
            'content_summary': integrated.content_summary,
            'key_topics': integrated.key_topics,
        }

    @staticmethod
    def _describe_transformed(transformed: "TransformedContent") -> Dict[str, Any]:
        """Return the reportable fields of a transformation result."""
        return {
            'format_type': transformed.format_type,
            'content': transformed.transformed_text,
            'transformation_metadata': transformed.transformation_metadata,
            'transformed_at': transformed.transformed_at.isoformat(),
        }

    def _integrate_and_transform(self, documents: List[Any], format_type: str) -> tuple:
        """Integrate ``documents`` and transform the result into ``format_type``.

        Returns:
            A ``(integrated_content, transformed_content)`` pair.
        """
        integrated = self.content_integrator.integrate_documents(documents)
        transformed = self.content_transformer.transform_content(
            integrated,
            format_type=format_type,
        )
        return integrated, transformed

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def process_single_document(self, file_path: str) -> Dict[str, Any]:
        """Extract, integrate and summarise one document.

        The transformation always uses the ``'summary'`` format.

        Args:
            file_path: Path of the file to process.

        Returns:
            On success, a dict with ``'success': True`` plus ``'document'``,
            ``'integrated_content'`` and ``'transformed_content'`` sections.
            On any exception, a dict with ``'success': False``, the error
            text under ``'error'`` and the original ``'file_path'``.
        """
        try:
            # 1. Extract text.
            extracted_doc = self.text_extractor.extract(file_path)

            # 2-3. Integrate the single document, then render it as a summary.
            integrated, transformed = self._integrate_and_transform(
                [extracted_doc], 'summary'
            )

            return {
                'success': True,
                'document': self._describe_document(extracted_doc),
                'integrated_content': self._describe_integrated(integrated),
                'transformed_content': self._describe_transformed(transformed),
            }

        except Exception as e:
            # Service boundary: report the failure to the caller as data, but
            # keep the traceback in the log.
            logger.exception("处理单个文档失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_path': file_path,
            }

    def process_multiple_documents(self, file_paths: List[str],
                                   output_format: str = 'summary') -> Dict[str, Any]:
        """Extract several documents, integrate them and transform the result.

        A file whose extraction fails is recorded and skipped; processing
        continues with the remaining files.

        Args:
            file_paths: Paths of the files to process.
            output_format: Format name passed to the content transformer.

        Returns:
            On success, a dict with ``'success': True`` plus
            ``'processing_summary'``, ``'documents'``,
            ``'integrated_content'`` and ``'transformed_content'`` sections.
            If no file could be extracted, a failure dict that lists the
            per-file errors under ``'failed_extractions'``.
            On any other exception, a failure dict with ``'error'`` and the
            original ``'file_paths'``.
        """
        try:
            # 1. Extract every document, collecting per-file failures.
            extracted_docs = []
            failed_extractions = []

            for file_path in file_paths:
                try:
                    extracted_docs.append(self.text_extractor.extract(file_path))
                except Exception as e:
                    logger.exception("提取文档失败 %s: %s", file_path, e)
                    failed_extractions.append({
                        'file_path': file_path,
                        'error': str(e),
                    })

            if not extracted_docs:
                return {
                    'success': False,
                    'error': '没有成功提取的文档',
                    'failed_extractions': failed_extractions,
                }

            # 2-3. Integrate everything that was extracted, then transform.
            integrated, transformed = self._integrate_and_transform(
                extracted_docs, output_format
            )

            return {
                'success': True,
                'processing_summary': {
                    'total_files': len(file_paths),
                    'successful_extractions': len(extracted_docs),
                    'failed_extractions': len(failed_extractions),
                    'failed_files': failed_extractions,
                },
                'documents': [
                    self._describe_document(doc) for doc in extracted_docs
                ],
                'integrated_content': self._describe_integrated(integrated),
                'transformed_content': self._describe_transformed(transformed),
            }

        except Exception as e:
            logger.exception("处理多个文档失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_paths': file_paths,
            }

    def extract_text_only(self, file_path: str) -> Dict[str, Any]:
        """Extract text from one file without integrating or transforming it.

        Args:
            file_path: Path of the file to extract.

        Returns:
            On success, a dict with ``'success': True`` and a ``'document'``
            section that includes the full extracted ``'content'``, its
            length, the page count and the extractor's metadata.
            On any exception, a dict with ``'success': False``, ``'error'``
            and the original ``'file_path'``.
        """
        try:
            extracted_doc = self.text_extractor.extract(file_path)

            return {
                'success': True,
                'document': {
                    'filename': extracted_doc.filename,
                    'file_type': extracted_doc.file_type,
                    'file_size': extracted_doc.file_size,
                    'content': extracted_doc.content,
                    'content_length': len(extracted_doc.content),
                    'page_count': extracted_doc.page_count,
                    'metadata': extracted_doc.metadata,
                    'extracted_at': extracted_doc.extracted_at.isoformat(),
                },
            }

        except Exception as e:
            logger.exception("提取文本失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_path': file_path,
            }

    def get_supported_formats(self) -> List[str]:
        """Return the output formats reported by the content transformer."""
        return self.content_transformer.get_supported_formats()

    def get_supported_file_types(self) -> List[str]:
        """Return the input file types reported by the text extractor."""
        return self.text_extractor.get_supported_formats()

    def validate_file_path(self, file_path: str) -> Dict[str, Any]:
        """Check that ``file_path`` is an existing file of a supported type.

        Args:
            file_path: Path to validate.

        Returns:
            ``{'valid': True, 'file_info': {...}}`` with the file name,
            lower-cased extension, size in bytes and absolute path, or
            ``{'valid': False, 'error': ...}`` describing the first failed
            check. The unsupported-format result also carries
            ``'supported_formats'``. Any exception raised during validation
            is reported as an invalid result rather than propagated.
        """
        try:
            path = Path(file_path)

            if not path.exists():
                return {
                    'valid': False,
                    'error': '文件不存在',
                }

            if not path.is_file():
                return {
                    'valid': False,
                    'error': '路径不是文件',
                }

            file_extension = path.suffix.lower()
            supported_formats = self.get_supported_file_types()

            # Compare case-insensitively and without the leading dot, so the
            # check works whether the extractor lists '.pdf', 'pdf' or 'PDF'
            # (the extractor's exact spelling is not visible from this file).
            known_extensions = {
                str(fmt).lower().lstrip('.') for fmt in supported_formats
            }

            if file_extension.lstrip('.') not in known_extensions:
                return {
                    'valid': False,
                    'error': f'不支持的文件格式: {file_extension}',
                    'supported_formats': supported_formats,
                }

            return {
                'valid': True,
                'file_info': {
                    'filename': path.name,
                    'file_extension': file_extension,
                    'file_size': path.stat().st_size,
                    'absolute_path': str(path.absolute()),
                },
            }

        except Exception as e:
            return {
                'valid': False,
                'error': f'文件验证失败: {str(e)}',
            }

    def get_service_status(self) -> Dict[str, Any]:
        """Return a status report for the service.

        The service and component states are fixed ``'active'`` strings (no
        health check is performed); the supported file types and output
        formats are queried from the components, and ``'timestamp'`` is the
        current local time in ISO 8601 form.
        """
        return {
            'service_name': 'DocumentService',
            'status': 'active',
            'components': {
                'text_extractor': 'active',
                'content_integrator': 'active',
                'content_transformer': 'active',
            },
            'supported_file_types': self.get_supported_file_types(),
            'supported_output_formats': self.get_supported_formats(),
            'timestamp': datetime.now().isoformat(),
        }