211 lines
7.6 KiB
Python
211 lines
7.6 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
Document Adapter
|
|||
|
|
文档处理适配器
|
|||
|
|
|
|||
|
|
作为core模块与document模块的桥梁,提供统一的接口
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
from typing import Dict, List, Optional, Any
|
|||
|
|
from pathlib import Path
|
|||
|
|
import logging
|
|||
|
|
|
|||
|
|
from .models import DocumentContent, IntegratedContent
|
|||
|
|
|
|||
|
|
logger = logging.getLogger(__name__)

# The document package is an optional dependency. Record whether it could be
# imported so DocumentAdapter can decide between real components and mock data.
try:
    from .document import ContentIntegrator, ContentTransformer, TextExtractor
except ImportError as e:
    DOCUMENT_AVAILABLE = False
    logger.warning(f"Document模块导入失败: {e}")
else:
    DOCUMENT_AVAILABLE = True
|
|||
|
|
|
|||
|
|
class DocumentAdapter:
    """Adapter between the core module and the optional document module.

    The adapter wraps a text extractor, a content integrator and a content
    transformer. Whenever the document module is unavailable, a component
    failed to initialise, or an operation raises, the public methods fall back
    to placeholder ("mock") data instead of propagating the error.
    """

    # File extensions reported by get_supported_formats().
    _INPUT_FORMATS = (".txt", ".md", ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".csv")

    # Output formats that are rendered as "<heading>\n\n<content>".
    # Insertion order is significant: get_supported_output_formats() relies on it.
    _HEADINGS = {
        "blog_post": "# 博客文章",
        "travel_guide": "# 旅游攻略",
        "product_sales": "# 产品销售",
        "attraction_standard": "# 景点标准",
    }

    def __init__(self):
        """Create the document components if the document module was imported."""
        self.available = DOCUMENT_AVAILABLE
        self.extractor = None
        self.integrator = None
        self.transformer = None

        if not self.available:
            logger.warning("Document模块不可用,将使用模拟数据")
            return

        try:
            self.extractor = TextExtractor()
            self.integrator = ContentIntegrator()
            self.transformer = ContentTransformer()
            logger.info("Document适配器初始化成功")
        except Exception as e:
            # Best effort: a failing component downgrades the adapter to mock
            # mode rather than breaking construction.
            logger.error(f"Document适配器初始化失败: {e}")
            self.available = False

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return ``text`` cut to ``limit`` characters, adding "..." only if cut."""
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    @staticmethod
    def _file_size(path: Path) -> int:
        """Return the size of ``path`` in bytes, or 0 when it cannot be stat'ed."""
        try:
            return path.stat().st_size
        except OSError:
            return 0

    @staticmethod
    def _collect_key_topics(documents, per_document: int = 10, limit: int = 5) -> List[str]:
        """Pick up to ``limit`` distinct leading words across ``documents``.

        The first ``per_document`` whitespace-separated words of each document's
        ``content`` are gathered in document order; duplicates are dropped while
        keeping first-seen order, so the result is deterministic across runs.
        """
        candidates = []
        for doc in documents:
            candidates.extend(doc.content.split()[:per_document])
        # dict.fromkeys de-duplicates while preserving insertion order
        # (a set would make the selected topics vary between runs).
        return list(dict.fromkeys(candidates))[:limit]

    def extract_text(self, file_path: str) -> "DocumentContent":
        """Extract the text of a single document.

        Args:
            file_path: Path of the document file.

        Returns:
            DocumentContent: The extracted content with ``length``,
            ``word_count`` and ``file_size`` metadata, or mock content when the
            adapter is unavailable or extraction fails.
        """
        if not self.available or not self.extractor:
            return self._get_mock_document(file_path)

        try:
            extracted = self.extractor.extract(file_path)
            # The extractor may return an object carrying ``content`` or
            # something that is only meaningful as a string.
            content = extracted.content if hasattr(extracted, "content") else str(extracted)
            path = Path(file_path)

            return DocumentContent(
                file_path=file_path,
                content=content,
                file_type=path.suffix.lower(),
                metadata={
                    "length": len(content),
                    "word_count": len(content.split()),
                    "file_size": self._file_size(path),
                },
            )
        except Exception as e:
            logger.error(f"文本提取失败: {e}")
            return self._get_mock_document(file_path)

    def integrate_documents(self, document_paths: List[str]) -> "IntegratedContent":
        """Extract several documents and merge them into one result.

        Args:
            document_paths: Paths of the documents to integrate.

        Returns:
            IntegratedContent: The documents, their texts joined by blank
            lines, a summary of at most 200 characters, up to five key topics
            and the total content length; mock data when the adapter is
            unavailable or integration fails.
        """
        # NOTE: the integrator only gates availability here; the merge below
        # does not call it.
        if not self.available or not self.integrator:
            return self._get_mock_integration(document_paths)

        try:
            documents = [self.extract_text(path) for path in document_paths]

            # Simple integration: concatenate every non-empty document text.
            integrated_text = "\n\n".join(doc.content for doc in documents if doc.content)

            return IntegratedContent(
                documents=documents,
                integrated_text=integrated_text,
                summary=self._truncate(integrated_text, 200),
                key_topics=self._collect_key_topics(documents),
                total_length=sum(len(doc.content) for doc in documents),
            )
        except Exception as e:
            logger.error(f"文档整合失败: {e}")
            return self._get_mock_integration(document_paths)

    def transform_content(self, content: str, output_format: str) -> str:
        """Render ``content`` in the requested output format.

        Args:
            content: The original content.
            output_format: One of the names returned by
                ``get_supported_output_formats()``; any other value returns
                ``content`` unchanged.

        Returns:
            str: The transformed content, or a mock transformation when the
            adapter is unavailable or the transformation fails.
        """
        # NOTE: the transformer only gates availability here; the formatting
        # below does not call it.
        if not self.available or not self.transformer:
            return self._get_mock_transformation(content, output_format)

        try:
            if output_format == "summary":
                # Ellipsis only when the content was actually shortened.
                return f"摘要:\n{self._truncate(content, 500)}"
            if output_format in self._HEADINGS:
                return f"{self._HEADINGS[output_format]}\n\n{content}"
            return content
        except Exception as e:
            logger.error(f"内容转换失败: {e}")
            return self._get_mock_transformation(content, output_format)

    def get_supported_formats(self) -> List[str]:
        """Return the supported document file extensions."""
        return list(self._INPUT_FORMATS)

    def get_supported_output_formats(self) -> List[str]:
        """Return the supported output format names."""
        return ["summary", *self._HEADINGS]

    def _get_mock_document(self, file_path: str) -> "DocumentContent":
        """Build placeholder document content for ``file_path``."""
        path = Path(file_path)
        return DocumentContent(
            file_path=file_path,
            content=f"这是{path.name}的模拟内容。包含了文档的主要信息和内容。",
            file_type=path.suffix.lower(),
            # Fixed placeholder figures, not measured from the mock text.
            metadata={
                "length": 50,
                "word_count": 15,
                "file_size": 1024,
            },
        )

    def _get_mock_integration(self, document_paths: List[str]) -> "IntegratedContent":
        """Build a placeholder integration result for ``document_paths``."""
        documents = [self._get_mock_document(path) for path in document_paths]

        return IntegratedContent(
            documents=documents,
            integrated_text="这是整合后的文档内容,包含了所有文档的主要信息。",
            summary="文档摘要:主要讨论了相关主题的重要内容。",
            key_topics=["主题1", "主题2", "主题3"],
            total_length=sum(len(doc.content) for doc in documents),
        )

    def _get_mock_transformation(self, content: str, output_format: str) -> str:
        """Build a placeholder transformation: format tag plus first 100 chars."""
        return f"[{output_format}格式] {content[:100]}..."

    def is_available(self) -> bool:
        """Return whether the real document components are in use."""
        return self.available

    def get_status(self) -> Dict[str, Any]:
        """Return availability, per-component initialisation flags and formats."""
        return {
            "available": self.available,
            "extractor_initialized": self.extractor is not None,
            "integrator_initialized": self.integrator is not None,
            "transformer_initialized": self.transformer is not None,
            "supported_formats": self.get_supported_formats(),
            "supported_output_formats": self.get_supported_output_formats(),
        }
|