# File: bangbang-aigc-server/core/document_adapter.py (211 lines, 7.6 KiB, Python)
# Timestamp shown by the repository web view: 2025-07-31 15:35:23 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document Adapter

Adapter for document processing. It acts as the bridge between the core
module and the document module and provides a unified interface.
"""
import sys
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging
from .models import DocumentContent, IntegratedContent
logger = logging.getLogger(__name__)
# The document package is optional: remember whether its classes could be
# imported so the rest of the module can fall back to placeholder data.
try:
    from .document import TextExtractor, ContentIntegrator, ContentTransformer
except ImportError as e:
    logger.warning(f"Document模块导入失败: {e}")
    DOCUMENT_AVAILABLE = False
else:
    DOCUMENT_AVAILABLE = True
class DocumentAdapter:
    """Bridge between the core module and the optional document module.

    If the document components cannot be imported or constructed, the adapter
    stays usable: every public operation then answers with placeholder (mock)
    data instead of raising.
    """

    # File extensions reported by get_supported_formats().
    _SUPPORTED_FORMATS = (
        ".txt", ".md", ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".csv",
    )

    # Heading that transform_content() prepends for each heading-style output
    # format. "summary" is handled separately because it truncates the content.
    # Insertion order defines the order of get_supported_output_formats().
    _FORMAT_HEADINGS = {
        "blog_post": "# 博客文章",
        "travel_guide": "# 旅游攻略",
        "product_sales": "# 产品销售",
        "attraction_standard": "# 景点标准",
    }

    _INTEGRATION_SUMMARY_CHARS = 200  # preview length used by integrate_documents()
    _TRANSFORM_SUMMARY_CHARS = 500    # preview length of the "summary" output format
    _WORDS_PER_DOCUMENT = 10          # leading words of each document considered as topics
    _MAX_KEY_TOPICS = 5               # upper bound on the number of key topics returned

    def __init__(self):
        """Create the document components, switching to mock mode on failure."""
        self.available = DOCUMENT_AVAILABLE
        self.extractor = None
        self.integrator = None
        self.transformer = None

        if not self.available:
            logger.warning("Document模块不可用将使用模拟数据")
            return

        try:
            self.extractor = TextExtractor()
            self.integrator = ContentIntegrator()
            self.transformer = ContentTransformer()
        except Exception as e:
            # Best effort by design: a broken document module must not prevent
            # the adapter from being constructed.
            logger.error(f"Document适配器初始化失败: {e}")
            self.available = False
        else:
            logger.info("Document适配器初始化成功")

    def extract_text(self, file_path: str) -> DocumentContent:
        """Extract the text of one document.

        Args:
            file_path: Path of the document to read.

        Returns:
            DocumentContent: The extracted content with length, word-count and
            file-size metadata. A mock document is returned when the adapter is
            unavailable or when extraction raises.
        """
        if not self.available or not self.extractor:
            return self._get_mock_document(file_path)

        try:
            extracted_doc = self.extractor.extract(file_path)
            # The extractor may hand back an object carrying `.content`;
            # anything else is stringified.
            if hasattr(extracted_doc, "content"):
                content = extracted_doc.content
            else:
                content = str(extracted_doc)

            path = Path(file_path)
            try:
                # A single stat() call instead of exists() + stat(): no second
                # filesystem round trip and no window between the two checks.
                file_size = path.stat().st_size
            except (OSError, ValueError):
                file_size = 0

            return DocumentContent(
                file_path=file_path,
                content=content,
                file_type=path.suffix.lower(),
                metadata={
                    "length": len(content),
                    "word_count": len(content.split()),
                    "file_size": file_size,
                },
            )
        except Exception as e:
            logger.error(f"文本提取失败: {e}")
            return self._get_mock_document(file_path)

    def integrate_documents(self, document_paths: List[str]) -> IntegratedContent:
        """Extract several documents and merge them into one content object.

        The merge is deliberately simple: non-empty contents are joined with a
        blank line, the summary is the leading characters of the joined text,
        and the key topics are the first distinct leading words.

        Args:
            document_paths: Paths of the documents to integrate.

        Returns:
            IntegratedContent: The merged content. Mock data is returned when
            the adapter is unavailable or when integration raises.
        """
        if not self.available or not self.integrator:
            return self._get_mock_integration(document_paths)

        try:
            documents = [self.extract_text(path) for path in document_paths]
            integrated_text = "\n\n".join(
                doc.content for doc in documents if doc.content
            )
            return IntegratedContent(
                documents=documents,
                integrated_text=integrated_text,
                summary=self._truncate(
                    integrated_text, self._INTEGRATION_SUMMARY_CHARS
                ),
                key_topics=self._collect_key_topics(
                    documents, self._WORDS_PER_DOCUMENT, self._MAX_KEY_TOPICS
                ),
                # `or ""` keeps an empty/None content from aborting the whole
                # integration, matching the filter used for the joined text.
                total_length=sum(len(doc.content or "") for doc in documents),
            )
        except Exception as e:
            logger.error(f"文档整合失败: {e}")
            return self._get_mock_integration(document_paths)

    def transform_content(self, content: str, output_format: str) -> str:
        """Render content in one of the supported output formats.

        Args:
            content: Original text.
            output_format: Target format name. "summary" yields a truncated
                preview, the heading formats prepend a title line, and any
                other value returns the content unchanged.

        Returns:
            str: The transformed text. A mock transformation is returned when
            the adapter is unavailable or when the transformation raises.
        """
        if not self.available or not self.transformer:
            return self._get_mock_transformation(content, output_format)

        try:
            if output_format == "summary":
                # The ellipsis is appended only when text was actually cut off.
                preview = self._truncate(content, self._TRANSFORM_SUMMARY_CHARS)
                return f"摘要:\n{preview}"

            heading = self._FORMAT_HEADINGS.get(output_format)
            if heading is None:
                return content
            return f"{heading}\n\n{content}"
        except Exception as e:
            logger.error(f"内容转换失败: {e}")
            return self._get_mock_transformation(content, output_format)

    def get_supported_formats(self) -> List[str]:
        """Return the supported input file extensions as a fresh list."""
        return list(self._SUPPORTED_FORMATS)

    def get_supported_output_formats(self) -> List[str]:
        """Return the output format names accepted by transform_content()."""
        return ["summary", *self._FORMAT_HEADINGS]

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return text cut to `limit` characters, adding "..." only if it was cut."""
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    @staticmethod
    def _collect_key_topics(
        documents: List[DocumentContent],
        words_per_document: int = 10,
        max_topics: int = 5,
    ) -> List[str]:
        """Return the first distinct leading words of the documents, in order.

        dict.fromkeys() de-duplicates while keeping first-seen order, so the
        result is the same on every run (a set would order the words by their
        randomized string hashes).
        """
        candidates = []
        for doc in documents:
            candidates.extend((doc.content or "").split()[:words_per_document])
        return list(dict.fromkeys(candidates))[:max_topics]

    def _get_mock_document(self, file_path: str) -> DocumentContent:
        """Build a placeholder document for `file_path`.

        The metadata values are fixed placeholders; they are not computed from
        the placeholder content.
        """
        path = Path(file_path)
        return DocumentContent(
            file_path=file_path,
            content=f"这是{path.name}的模拟内容。包含了文档的主要信息和内容。",
            file_type=path.suffix.lower(),
            metadata={
                "length": 50,
                "word_count": 15,
                "file_size": 1024,
            },
        )

    def _get_mock_integration(self, document_paths: List[str]) -> IntegratedContent:
        """Build placeholder integrated content for the given paths."""
        documents = [self._get_mock_document(path) for path in document_paths]
        return IntegratedContent(
            documents=documents,
            integrated_text="这是整合后的文档内容,包含了所有文档的主要信息。",
            summary="文档摘要:主要讨论了相关主题的重要内容。",
            key_topics=["主题1", "主题2", "主题3"],
            total_length=sum(len(doc.content) for doc in documents),
        )

    def _get_mock_transformation(self, content: str, output_format: str) -> str:
        """Build a placeholder transformation: a format tag plus a short preview."""
        return f"[{output_format}格式] {content[:100]}..."

    def is_available(self) -> bool:
        """Return True when the document components are usable."""
        return self.available

    def get_status(self) -> Dict[str, Any]:
        """Return availability flags and the supported format lists."""
        return {
            "available": self.available,
            "extractor_initialized": self.extractor is not None,
            "integrator_initialized": self.integrator is not None,
            "transformer_initialized": self.transformer is not None,
            "supported_formats": self.get_supported_formats(),
            "supported_output_formats": self.get_supported_output_formats(),
        }