# Content integration module: merges multiple extracted documents into one view.
|
|
import logging
import re
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional

from .text_extractor import ExtractedDocument
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
@dataclass
class IntegratedContent:
    """Aggregated view over a batch of extracted documents."""

    documents: List[ExtractedDocument]  # the source documents, in input order
    document_count: int                 # number of documents (len(documents))
    total_content_length: int           # length of combined_content in characters
    document_types: Dict[str, int]      # lowercased file extension -> occurrence count
    combined_content: str               # all document contents merged into one text
    content_summary: str                # short human-readable overview of the batch
    key_topics: List[str]               # most frequent CJK words across all content

    def __post_init__(self) -> None:
        """Derive ``document_types`` from ``documents`` when the caller
        passed an empty/falsy mapping; a pre-populated mapping is kept as-is.
        """
        if not self.document_types:
            # Counter does the tallying; convert back to a plain dict to
            # keep the declared Dict[str, int] field type. First-seen key
            # order is preserved, matching a manual counting loop.
            self.document_types = dict(
                Counter(doc.file_type.lower() for doc in self.documents)
            )
|
||
|
|
|
||
|
|
class ContentIntegrator:
    """Content integrator - merges several extracted documents into a
    single :class:`IntegratedContent` (combined text, summary, key topics).
    """

    def __init__(self):
        # Stateless; kept explicit for call-site compatibility.
        pass

    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:
        """Integrate multiple documents into one aggregated result.

        Args:
            documents: List of extracted documents (may be empty).

        Returns:
            IntegratedContent: the merged content; for an empty input an
            explicit empty result with a placeholder summary is returned.
        """
        if not documents:
            return IntegratedContent(
                documents=[],
                document_count=0,
                total_content_length=0,
                document_types={},
                combined_content="",
                content_summary="没有提供文档内容",
                key_topics=[],
            )

        # Tally documents per lowercased file extension.
        document_types = dict(Counter(doc.file_type.lower() for doc in documents))

        # Merge all contents, then derive the summary and key topics.
        combined_content = self._combine_content(documents)
        content_summary = self._generate_summary(documents)
        key_topics = self._extract_key_topics(combined_content)

        return IntegratedContent(
            documents=documents,
            document_count=len(documents),
            total_content_length=len(combined_content),
            document_types=document_types,
            combined_content=combined_content,
            content_summary=content_summary,
            key_topics=key_topics,
        )

    def _combine_content(self, documents: List[ExtractedDocument]) -> str:
        """Concatenate all documents, each with a metadata header and a
        ``=`` separator line, joined by newlines.
        """
        parts: List[str] = []
        for index, doc in enumerate(documents, 1):
            parts.extend((
                f"=== 文档 {index}: {doc.filename} ===",
                f"文件类型: {doc.file_type}",
                f"文件大小: {doc.file_size} 字节",
                f"提取时间: {doc.extracted_at}",
                "",
                "内容:",
                doc.content,
                "",
                "=" * 50,
                "",
            ))
        return "\n".join(parts)

    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:
        """Build a summary: a count header plus one line per document with
        filename, type, and a content preview capped at 100 characters.
        """
        if not documents:
            return "没有文档内容"

        lines = [f"共处理了 {len(documents)} 个文档:"]
        for index, doc in enumerate(documents, 1):
            # "..." marks truncation only when the content was actually cut.
            preview = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content
            lines.append(f"{index}. {doc.filename} ({doc.file_type}): {preview}")
        return "\n".join(lines)

    def _extract_key_topics(self, content: str) -> List[str]:
        """Extract key topics via simple CJK keyword frequency.

        Returns at most 10 words: CJK runs of length >= 2 that occur more
        than once, ordered by descending frequency (ties keep first-seen
        order, as Counter.most_common is stable).
        """
        if not content:
            return []

        # Naive extraction: contiguous CJK-ideograph runs; a real NLP
        # tokenizer could be swapped in here if needed.
        words = re.findall(r'[\u4e00-\u9fff]+', content)
        counts = Counter(word for word in words if len(word) >= 2)

        # Top 10 by frequency, dropping words that appear only once.
        return [word for word, count in counts.most_common(10) if count > 1]
|