bangbang-aigc-server/core/document_adapter.py
2025-07-31 15:35:23 +08:00

211 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document Adapter
文档处理适配器
作为core模块与document模块的桥梁提供统一的接口
"""
import sys
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging
from .models import DocumentContent, IntegratedContent
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# The document sub-package is treated as optional: a failed import is logged
# as a warning and recorded in DOCUMENT_AVAILABLE instead of propagating.
try:
    from .document import TextExtractor, ContentIntegrator, ContentTransformer
except ImportError as import_error:
    logger.warning(f"Document模块导入失败: {import_error}")
    DOCUMENT_AVAILABLE = False
else:
    # Only reached when the import above succeeded.
    DOCUMENT_AVAILABLE = True
class DocumentAdapter:
    """Adapter exposing text extraction, integration and format conversion.

    The adapter wraps the optional document components (``TextExtractor``,
    ``ContentIntegrator``, ``ContentTransformer``).  When those components
    cannot be imported or constructed, or when an operation raises, the
    public methods fall back to placeholder ("mock") results instead of
    propagating the error.
    """

    # Markdown heading prepended by transform_content for each
    # heading-style output format.
    _OUTPUT_HEADINGS = {
        "blog_post": "# 博客文章",
        "travel_guide": "# 旅游攻略",
        "product_sales": "# 产品销售",
        "attraction_standard": "# 景点标准",
    }

    def __init__(self):
        """Create the document components if the document module is usable.

        On any construction failure the adapter marks itself unavailable;
        components created before the failure stay assigned.
        """
        self.available = DOCUMENT_AVAILABLE
        self.extractor = None
        self.integrator = None
        self.transformer = None

        if not self.available:
            logger.warning("Document模块不可用将使用模拟数据")
            return

        try:
            self.extractor = TextExtractor()
            self.integrator = ContentIntegrator()
            self.transformer = ContentTransformer()
            logger.info("Document适配器初始化成功")
        except Exception as e:
            logger.error(f"Document适配器初始化失败: {e}")
            self.available = False

    def extract_text(self, file_path: str) -> DocumentContent:
        """Extract the text of a single document.

        Args:
            file_path: Path of the document to read.

        Returns:
            A ``DocumentContent`` holding the extracted text, the lower-cased
            file suffix and basic metadata (character count, whitespace word
            count, file size in bytes or 0 when the file does not exist).
            A placeholder document is returned when the adapter is
            unavailable or extraction raises.
        """
        if not self.available or not self.extractor:
            return self._get_mock_document(file_path)

        try:
            extracted_doc = self.extractor.extract(file_path)
            # The extractor may return an object with a ``content`` attribute
            # or something that is only meaningful as a string.
            if hasattr(extracted_doc, 'content'):
                content = extracted_doc.content
            else:
                content = str(extracted_doc)

            path = Path(file_path)
            return DocumentContent(
                file_path=file_path,
                content=content,
                file_type=path.suffix.lower(),
                metadata={
                    "length": len(content),
                    "word_count": len(content.split()),
                    "file_size": path.stat().st_size if path.exists() else 0,
                },
            )
        except Exception as e:
            logger.error(f"文本提取失败: {e}")
            return self._get_mock_document(file_path)

    @staticmethod
    def _collect_key_topics(
        contents: List[Optional[str]],
        per_document: int = 10,
        limit: int = 5,
    ) -> List[str]:
        """Pick naive key topics from the leading words of each text.

        Takes the first ``per_document`` whitespace-separated words of every
        non-empty text, removes duplicates while keeping first-seen order,
        and returns at most ``limit`` of them.  Order-preserving
        de-duplication keeps the result identical across interpreter runs
        (iterating a ``set`` of strings does not).
        """
        candidates: List[str] = []
        for text in contents:
            if text:
                candidates.extend(text.split()[:per_document])
        return list(dict.fromkeys(candidates))[:limit]

    def integrate_documents(self, document_paths: List[str]) -> IntegratedContent:
        """Extract several documents and merge them into one result.

        Args:
            document_paths: Paths of the documents to integrate.

        Returns:
            An ``IntegratedContent`` with the per-document results, the
            non-empty contents joined by blank lines, a summary made of the
            first 200 characters (with ``...`` appended only when truncated),
            up to five key topics and the total character count.  A
            placeholder result is returned when the adapter is unavailable
            or integration raises.
        """
        if not self.available or not self.integrator:
            return self._get_mock_integration(document_paths)

        try:
            documents = [self.extract_text(path) for path in document_paths]

            # Documents with empty or missing content contribute nothing to
            # the merged text, the key topics or the total length.
            contents = [doc.content for doc in documents if doc.content]
            integrated_text = "\n\n".join(contents)

            if len(integrated_text) > 200:
                summary = integrated_text[:200] + "..."
            else:
                summary = integrated_text

            return IntegratedContent(
                documents=documents,
                integrated_text=integrated_text,
                summary=summary,
                key_topics=self._collect_key_topics(contents),
                total_length=sum(len(text) for text in contents),
            )
        except Exception as e:
            logger.error(f"文档整合失败: {e}")
            return self._get_mock_integration(document_paths)

    def transform_content(self, content: str, output_format: str) -> str:
        """Convert content to one of the supported output formats.

        Args:
            content: Source text.
            output_format: Target format name; unknown names return the
                content unchanged.

        Returns:
            The converted text.  ``"summary"`` yields the first 500
            characters followed by ``...``; the heading-style formats prepend
            a Markdown heading.  A placeholder conversion is returned when
            the adapter is unavailable or conversion raises.
        """
        if not self.available or not self.transformer:
            return self._get_mock_transformation(content, output_format)

        try:
            if output_format == "summary":
                return f"摘要:\n{content[:500]}..."

            heading = self._OUTPUT_HEADINGS.get(output_format)
            if heading is None:
                return content
            return f"{heading}\n\n{content}"
        except Exception as e:
            logger.error(f"内容转换失败: {e}")
            return self._get_mock_transformation(content, output_format)

    def get_supported_formats(self) -> List[str]:
        """Return the input file extensions this adapter advertises."""
        return [".txt", ".md", ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".csv"]

    def get_supported_output_formats(self) -> List[str]:
        """Return the output format names accepted by transform_content."""
        return ["summary", "blog_post", "travel_guide", "product_sales", "attraction_standard"]

    def _get_mock_document(self, file_path: str) -> DocumentContent:
        """Build a placeholder document for the given path.

        The metadata values are fixed placeholders, not measurements of the
        placeholder text.
        """
        path = Path(file_path)
        return DocumentContent(
            file_path=file_path,
            content=f"这是{path.name}的模拟内容。包含了文档的主要信息和内容。",
            file_type=path.suffix.lower(),
            metadata={
                "length": 50,
                "word_count": 15,
                "file_size": 1024,
            },
        )

    def _get_mock_integration(self, document_paths: List[str]) -> IntegratedContent:
        """Build a placeholder integration result for the given paths."""
        documents = [self._get_mock_document(path) for path in document_paths]
        return IntegratedContent(
            documents=documents,
            integrated_text="这是整合后的文档内容,包含了所有文档的主要信息。",
            summary="文档摘要:主要讨论了相关主题的重要内容。",
            key_topics=["主题1", "主题2", "主题3"],
            total_length=sum(len(doc.content) for doc in documents),
        )

    def _get_mock_transformation(self, content: str, output_format: str) -> str:
        """Build a placeholder conversion: format tag plus first 100 characters."""
        return f"[{output_format}格式] {content[:100]}..."

    def is_available(self) -> bool:
        """Return whether the document components are usable."""
        return self.available

    def get_status(self) -> Dict[str, Any]:
        """Return a snapshot of availability, components and supported formats."""
        return {
            "available": self.available,
            "extractor_initialized": self.extractor is not None,
            "integrator_initialized": self.integrator is not None,
            "transformer_initialized": self.transformer is not None,
            "supported_formats": self.get_supported_formats(),
            "supported_output_formats": self.get_supported_output_formats(),
        }