TravelContentCreator/api/services/document_service.py

278 lines
9.9 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
文档处理服务层
整合文本提取、文档解析和内容转换功能
"""
import logging
from typing import List, Dict, Optional, Any
from pathlib import Path
from datetime import datetime
from core.document import (
TextExtractor, ExtractedDocument,
ContentIntegrator, IntegratedContent,
ContentTransformer, TransformedContent
)
logger = logging.getLogger(__name__)
class DocumentService:
    """Service facade over the document pipeline.

    Wires together a ``TextExtractor``, a ``ContentIntegrator`` and a
    ``ContentTransformer``. The ``process_*``, ``extract_text_only`` and
    ``validate_file_path`` methods never raise for pipeline errors: they
    return a plain dict whose ``'success'`` / ``'valid'`` flag tells the
    caller whether the operation worked, with the error text under
    ``'error'``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the three pipeline components.

        Args:
            config: Optional settings dict. Only the ``'transformer'`` entry
                is read here; it is passed to ``ContentTransformer``
                (an empty dict is passed when the entry is absent).
        """
        self.config = config or {}
        self.text_extractor = TextExtractor()
        self.content_integrator = ContentIntegrator()
        self.content_transformer = ContentTransformer(self.config.get('transformer', {}))
        logger.info("文档处理服务初始化完成")

    # ------------------------------------------------------------------
    # Private serialization helpers (shared by the process_* methods so the
    # single- and multi-document results cannot drift apart).
    # ------------------------------------------------------------------
    @staticmethod
    def _document_summary(doc: "ExtractedDocument") -> Dict[str, Any]:
        """Return the metadata of an extracted document, without its text."""
        return {
            'filename': doc.filename,
            'file_type': doc.file_type,
            'file_size': doc.file_size,
            'content_length': len(doc.content),
            'extracted_at': doc.extracted_at.isoformat()
        }

    @staticmethod
    def _integrated_summary(integrated: "IntegratedContent") -> Dict[str, Any]:
        """Return the fields of an integration result exposed to callers."""
        return {
            'document_count': integrated.document_count,
            'total_content_length': integrated.total_content_length,
            'document_types': integrated.document_types,
            'content_summary': integrated.content_summary,
            'key_topics': integrated.key_topics
        }

    @staticmethod
    def _transformed_summary(transformed: "TransformedContent") -> Dict[str, Any]:
        """Return the fields of a transformation result exposed to callers."""
        return {
            'format_type': transformed.format_type,
            'content': transformed.transformed_text,
            'transformation_metadata': transformed.transformation_metadata,
            'transformed_at': transformed.transformed_at.isoformat()
        }

    def _integrate_and_transform(self, docs: List[Any], output_format: str) -> Dict[str, Any]:
        """Integrate ``docs``, transform the result, and serialize both steps.

        Returns a dict with the ``'integrated_content'`` and
        ``'transformed_content'`` sections, in that order.
        """
        integrated = self.content_integrator.integrate_documents(docs)
        transformed = self.content_transformer.transform_content(
            integrated,
            format_type=output_format
        )
        return {
            'integrated_content': self._integrated_summary(integrated),
            'transformed_content': self._transformed_summary(transformed)
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def process_single_document(self, file_path: str) -> Dict[str, Any]:
        """Extract one file, integrate it, and transform it to ``'summary'``.

        Args:
            file_path: Path of the file to process.

        Returns:
            On success, a dict with ``'success': True`` plus ``'document'``,
            ``'integrated_content'`` and ``'transformed_content'`` sections.
            On any exception, ``{'success': False, 'error': <message>,
            'file_path': file_path}``.
        """
        try:
            extracted_doc = self.text_extractor.extract(file_path)
            # The integrator takes a list, so a single document is wrapped.
            sections = self._integrate_and_transform([extracted_doc], 'summary')
            return {
                'success': True,
                'document': self._document_summary(extracted_doc),
                **sections
            }
        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception("处理单个文档失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_path': file_path
            }

    def process_multiple_documents(self, file_paths: List[str],
                                   output_format: str = 'summary') -> Dict[str, Any]:
        """Extract several files, integrate them together, and transform.

        Extraction is best-effort per file: a file that fails to extract is
        recorded and skipped, and the remaining files are still processed.

        Args:
            file_paths: Paths of the files to process.
            output_format: Format name handed to the content transformer.

        Returns:
            On success, a dict with ``'success': True``, a
            ``'processing_summary'`` (counts plus the per-file failures), a
            ``'documents'`` list, and the ``'integrated_content'`` /
            ``'transformed_content'`` sections. If no file could be
            extracted, ``{'success': False, 'error': ...,
            'failed_extractions': [...]}``. On any other exception,
            ``{'success': False, 'error': <message>, 'file_paths': file_paths}``.
        """
        try:
            extracted_docs = []
            failed_extractions = []
            for file_path in file_paths:
                try:
                    extracted_docs.append(self.text_extractor.extract(file_path))
                except Exception as e:
                    logger.exception("提取文档失败 %s: %s", file_path, e)
                    failed_extractions.append({
                        'file_path': file_path,
                        'error': str(e)
                    })

            # Nothing to integrate: report the per-file failures instead.
            if not extracted_docs:
                return {
                    'success': False,
                    'error': '没有成功提取的文档',
                    'failed_extractions': failed_extractions
                }

            sections = self._integrate_and_transform(extracted_docs, output_format)
            return {
                'success': True,
                'processing_summary': {
                    'total_files': len(file_paths),
                    'successful_extractions': len(extracted_docs),
                    'failed_extractions': len(failed_extractions),
                    'failed_files': failed_extractions
                },
                'documents': [self._document_summary(doc) for doc in extracted_docs],
                **sections
            }
        except Exception as e:
            logger.exception("处理多个文档失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_paths': file_paths
            }

    def extract_text_only(self, file_path: str) -> Dict[str, Any]:
        """Extract the text of one file without integrating or transforming.

        Args:
            file_path: Path of the file to extract.

        Returns:
            On success, ``{'success': True, 'document': {...}}`` where the
            document section includes the full ``'content'``, its length,
            ``'page_count'`` and ``'metadata'``. On any exception,
            ``{'success': False, 'error': <message>, 'file_path': file_path}``.
        """
        try:
            extracted_doc = self.text_extractor.extract(file_path)
            return {
                'success': True,
                'document': {
                    'filename': extracted_doc.filename,
                    'file_type': extracted_doc.file_type,
                    'file_size': extracted_doc.file_size,
                    'content': extracted_doc.content,
                    'content_length': len(extracted_doc.content),
                    'page_count': extracted_doc.page_count,
                    'metadata': extracted_doc.metadata,
                    'extracted_at': extracted_doc.extracted_at.isoformat()
                }
            }
        except Exception as e:
            logger.exception("提取文本失败: %s", e)
            return {
                'success': False,
                'error': str(e),
                'file_path': file_path
            }

    def get_supported_formats(self) -> List[str]:
        """Return the output formats reported by the content transformer."""
        return self.content_transformer.get_supported_formats()

    def get_supported_file_types(self) -> List[str]:
        """Return the input file types reported by the text extractor."""
        return self.text_extractor.get_supported_formats()

    def validate_file_path(self, file_path: str) -> Dict[str, Any]:
        """Check that ``file_path`` is an existing file of a supported type.

        The lower-cased file suffix (e.g. ``'.txt'``) is looked up in
        ``get_supported_file_types()``.

        Args:
            file_path: Path to validate.

        Returns:
            ``{'valid': True, 'file_info': {...}}`` with the file name,
            lower-cased extension, size in bytes and absolute path, or
            ``{'valid': False, 'error': <message>}`` (plus
            ``'supported_formats'`` when the extension is the problem).
        """
        try:
            path = Path(file_path)
            if not path.exists():
                return {
                    'valid': False,
                    'error': '文件不存在'
                }
            if not path.is_file():
                return {
                    'valid': False,
                    'error': '路径不是文件'
                }

            file_extension = path.suffix.lower()
            supported_formats = self.get_supported_file_types()
            if file_extension not in supported_formats:
                return {
                    'valid': False,
                    'error': f'不支持的文件格式: {file_extension}',
                    'supported_formats': supported_formats
                }

            return {
                'valid': True,
                'file_info': {
                    'filename': path.name,
                    'file_extension': file_extension,
                    'file_size': path.stat().st_size,
                    'absolute_path': str(path.absolute())
                }
            }
        except Exception as e:
            # Validation is a yes/no question for callers, so unexpected
            # errors are reported in the result rather than raised.
            return {
                'valid': False,
                'error': f'文件验证失败: {str(e)}'
            }

    def get_service_status(self) -> Dict[str, Any]:
        """Return a status snapshot of the service.

        The component states are fixed ``'active'`` strings (no health probe
        is performed); the supported file types and output formats are
        queried from the components, and ``'timestamp'`` is the current
        local time in ISO format.
        """
        return {
            'service_name': 'DocumentService',
            'status': 'active',
            'components': {
                'text_extractor': 'active',
                'content_integrator': 'active',
                'content_transformer': 'active'
            },
            'supported_file_types': self.get_supported_file_types(),
            'supported_output_formats': self.get_supported_formats(),
            'timestamp': datetime.now().isoformat()
        }