#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Document Adapter.

Bridges the core module and the document module behind one uniform interface.
When the document module cannot be imported or initialised, every operation
falls back to simulated (mock) data instead of raising.
"""

import sys
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging

from .models import DocumentContent, IntegratedContent

logger = logging.getLogger(__name__)

# The document module is optional: a failed import is logged and recorded in
# DOCUMENT_AVAILABLE so the adapter can switch to mock data.
try:
    from .document import TextExtractor, ContentIntegrator, ContentTransformer
    DOCUMENT_AVAILABLE = True
except ImportError as e:
    logger.warning(f"Document模块导入失败: {e}")
    DOCUMENT_AVAILABLE = False


class DocumentAdapter:
    """Adapter offering text extraction, document integration and formatting.

    Attributes:
        available: True when the document module was imported and all three
            helper objects were constructed without error.
        extractor: ``TextExtractor`` instance, or None when unavailable.
        integrator: ``ContentIntegrator`` instance, or None when unavailable.
        transformer: ``ContentTransformer`` instance, or None when unavailable.
    """

    # Input file extensions reported by get_supported_formats().
    _SUPPORTED_FORMATS = (
        ".txt", ".md", ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".csv",
    )

    # Output format -> Markdown heading placed in front of the content.
    # "summary" is handled separately because it truncates the content.
    _HEADINGS = {
        "blog_post": "# 博客文章",
        "travel_guide": "# 旅游攻略",
        "product_sales": "# 产品销售",
        "attraction_standard": "# 景点标准",
    }

    def __init__(self):
        """Create the helper objects, or mark the adapter as unavailable."""
        self.available = DOCUMENT_AVAILABLE
        self.extractor = None
        self.integrator = None
        self.transformer = None

        if not self.available:
            logger.warning("Document模块不可用,将使用模拟数据")
            return

        try:
            self.extractor = TextExtractor()
            self.integrator = ContentIntegrator()
            self.transformer = ContentTransformer()
            logger.info("Document适配器初始化成功")
        except Exception as e:
            # Best effort: a helper that fails to construct disables the
            # adapter rather than propagating the error to the caller.
            logger.error(f"Document适配器初始化失败: {e}")
            self.available = False

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return *text* cut to *limit* characters, adding "..." only if cut."""
        return text[:limit] + "..." if len(text) > limit else text

    def extract_text(self, file_path: str) -> DocumentContent:
        """Extract the text of one document.

        Args:
            file_path: Path of the document file.

        Returns:
            DocumentContent holding the extracted text, the lower-cased file
            suffix, and metadata (character count, whitespace-separated word
            count, file size in bytes or 0 if the file does not exist).
            Mock content is returned when the adapter is unavailable or
            extraction raises.
        """
        if not self.available or not self.extractor:
            return self._get_mock_document(file_path)

        try:
            extracted_doc = self.extractor.extract(file_path)
            # The extractor may return an object with a ``content`` attribute
            # or something that is only meaningful via str().
            if hasattr(extracted_doc, 'content'):
                content = extracted_doc.content
            else:
                content = str(extracted_doc)

            path = Path(file_path)
            return DocumentContent(
                file_path=file_path,
                content=content,
                file_type=path.suffix.lower(),
                metadata={
                    "length": len(content),
                    "word_count": len(content.split()),
                    "file_size": path.stat().st_size if path.exists() else 0,
                },
            )
        except Exception as e:
            logger.error(f"文本提取失败: {e}")
            return self._get_mock_document(file_path)

    def integrate_documents(self, document_paths: List[str]) -> IntegratedContent:
        """Extract several documents and merge them into one result.

        Args:
            document_paths: Paths of the documents to merge.

        Returns:
            IntegratedContent with the per-document contents, their non-empty
            texts joined by blank lines, a summary of at most 200 characters
            (plus "..." when cut), up to five key topics and the total
            character count. Mock content is returned when the adapter is
            unavailable or any step raises.
        """
        if not self.available or not self.integrator:
            return self._get_mock_integration(document_paths)

        try:
            documents = [self.extract_text(path) for path in document_paths]

            # Simple integration: concatenate every non-empty document.
            integrated_text = "\n\n".join(
                doc.content for doc in documents if doc.content
            )

            # Naive keyword extraction: the first ten words of each document.
            candidates = []
            for doc in documents:
                candidates.extend(doc.content.split()[:10])
            # De-duplicate while keeping first-seen order. A set would make
            # the selected topics vary between runs because string hashing
            # is randomised per process.
            key_topics = list(dict.fromkeys(candidates))[:5]

            return IntegratedContent(
                documents=documents,
                integrated_text=integrated_text,
                summary=self._truncate(integrated_text, 200),
                key_topics=key_topics,
                total_length=sum(len(doc.content) for doc in documents),
            )
        except Exception as e:
            logger.error(f"文档整合失败: {e}")
            return self._get_mock_integration(document_paths)

    def transform_content(self, content: str, output_format: str) -> str:
        """Render *content* in the requested output format.

        The transformer object is only checked for presence here; the
        formatting itself is plain string templating.

        Args:
            content: Original content.
            output_format: One of the names returned by
                get_supported_output_formats(); any other value returns the
                content unchanged.

        Returns:
            The formatted content, or a mock transformation when the adapter
            is unavailable or formatting raises.
        """
        if not self.available or not self.transformer:
            return self._get_mock_transformation(content, output_format)

        try:
            if output_format == "summary":
                # The ellipsis is added only when text was actually dropped.
                return f"摘要:\n{self._truncate(content, 500)}"

            heading = self._HEADINGS.get(output_format)
            if heading is None:
                return content
            return f"{heading}\n\n{content}"
        except Exception as e:
            logger.error(f"内容转换失败: {e}")
            return self._get_mock_transformation(content, output_format)

    def get_supported_formats(self) -> List[str]:
        """Return the supported input file extensions."""
        return list(self._SUPPORTED_FORMATS)

    def get_supported_output_formats(self) -> List[str]:
        """Return the output format names accepted by transform_content()."""
        return ["summary", *self._HEADINGS]

    def _get_mock_document(self, file_path: str) -> DocumentContent:
        """Return simulated content for *file_path*.

        The metadata values are fixed placeholders; they are not computed
        from the mock text.
        """
        path = Path(file_path)
        return DocumentContent(
            file_path=file_path,
            content=f"这是{path.name}的模拟内容。包含了文档的主要信息和内容。",
            file_type=path.suffix.lower(),
            metadata={
                "length": 50,
                "word_count": 15,
                "file_size": 1024,
            },
        )

    def _get_mock_integration(self, document_paths: List[str]) -> IntegratedContent:
        """Return a simulated integration built from mock documents."""
        documents = [self._get_mock_document(path) for path in document_paths]
        return IntegratedContent(
            documents=documents,
            integrated_text="这是整合后的文档内容,包含了所有文档的主要信息。",
            summary="文档摘要:主要讨论了相关主题的重要内容。",
            key_topics=["主题1", "主题2", "主题3"],
            total_length=sum(len(doc.content) for doc in documents),
        )

    def _get_mock_transformation(self, content: str, output_format: str) -> str:
        """Return a simulated transformation: a format tag plus 100 characters."""
        return f"[{output_format}格式] {content[:100]}..."

    def is_available(self) -> bool:
        """Return whether the real document helpers are in use."""
        return self.available

    def get_status(self) -> Dict[str, Any]:
        """Return availability, helper initialisation flags and format lists."""
        return {
            "available": self.available,
            "extractor_initialized": self.extractor is not None,
            "integrator_initialized": self.integrator is not None,
            "transformer_initialized": self.transformer is not None,
            "supported_formats": self.get_supported_formats(),
            "supported_output_formats": self.get_supported_output_formats(),
        }