import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from datetime import datetime
from collections import Counter
from .text_extractor import ExtractedDocument
import re

logger = logging.getLogger(__name__)


@dataclass
class IntegratedContent:
    """Aggregated view over a batch of extracted documents.

    Holds the raw documents plus derived data: a per-extension histogram,
    the concatenated content, a human-readable summary, and key topics.
    """
    documents: List[ExtractedDocument]
    document_count: int
    total_content_length: int
    document_types: Dict[str, int]
    combined_content: str
    content_summary: str
    key_topics: List[str]

    def __post_init__(self) -> None:
        """Derive the file-type histogram when the caller did not supply one."""
        if not self.document_types:
            # Counter over lower-cased extensions; converted back to a plain
            # dict so the field type stays Dict[str, int].
            self.document_types = dict(
                Counter(doc.file_type.lower() for doc in self.documents)
            )


class ContentIntegrator:
    """Content integrator — merges information from multiple documents."""

    def __init__(self):
        pass

    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:
        """Integrate multiple documents into one aggregate.

        Args:
            documents: List of extracted documents.

        Returns:
            IntegratedContent: The integrated content. For an empty input
            a placeholder aggregate with zero counts is returned.
        """
        if not documents:
            return IntegratedContent(
                documents=[],
                document_count=0,
                total_content_length=0,
                document_types={},
                combined_content="",
                content_summary="没有提供文档内容",
                key_topics=[]
            )

        # Histogram of file types (lower-cased extension -> count).
        document_types = dict(Counter(doc.file_type.lower() for doc in documents))

        # Merge all document bodies into one annotated string.
        combined_content = self._combine_content(documents)
        total_length = len(combined_content)

        # Human-readable per-document summary.
        content_summary = self._generate_summary(documents)

        # Frequency-based key topics from the merged text.
        key_topics = self._extract_key_topics(combined_content)

        return IntegratedContent(
            documents=documents,
            document_count=len(documents),
            total_content_length=total_length,
            document_types=document_types,
            combined_content=combined_content,
            content_summary=content_summary,
            key_topics=key_topics
        )

    def _combine_content(self, documents: List[ExtractedDocument]) -> str:
        """Concatenate document bodies, each preceded by a metadata header."""
        combined = []
        for i, doc in enumerate(documents, 1):
            combined.append(f"=== 文档 {i}: {doc.filename} ===")
            combined.append(f"文件类型: {doc.file_type}")
            combined.append(f"文件大小: {doc.file_size} 字节")
            combined.append(f"提取时间: {doc.extracted_at}")
            combined.append("")
            combined.append("内容:")
            combined.append(doc.content)
            combined.append("")
            combined.append("=" * 50)
            combined.append("")
        return "\n".join(combined)

    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:
        """Build a one-line-per-document summary with a 100-char content preview."""
        if not documents:
            return "没有文档内容"

        summary_parts = []
        summary_parts.append(f"共处理了 {len(documents)} 个文档:")
        for i, doc in enumerate(documents, 1):
            # Truncate long bodies to the first 100 characters plus an ellipsis.
            content_preview = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content
            summary_parts.append(f"{i}. {doc.filename} ({doc.file_type}): {content_preview}")
        return "\n".join(summary_parts)

    def _extract_key_topics(self, content: str) -> List[str]:
        """Extract key topics via simple CJK keyword frequency.

        Finds runs of CJK ideographs, counts those of length >= 2, and
        returns up to the 10 most frequent words that occur more than once.
        A more sophisticated NLP method could be substituted here if needed.
        """
        if not content:
            return []

        # Runs of CJK unified ideographs serve as candidate "words".
        words = re.findall(r'[\u4e00-\u9fff]+', content)

        # Only words of length >= 2 are considered meaningful.
        word_count = Counter(w for w in words if len(w) >= 2)

        # Top 10 by frequency (stable for ties), dropping singletons.
        return [word for word, count in word_count.most_common(10) if count > 1]