# bangbang-aigc-server/core/document/content_integrator.py

import logging
import re
from dataclasses import dataclass
from typing import Dict, List

from .text_extractor import ExtractedDocument

logger = logging.getLogger(__name__)
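
# NOTE (assumption, not defined in this file): ExtractedDocument is expected to
# be a dataclass from .text_extractor exposing filename, file_type, file_size,
# content, and extracted_at -- the only attributes this module reads.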


@dataclass
class IntegratedContent:
    """Integrated content produced from one or more extracted documents."""

    documents: List[ExtractedDocument]
    document_count: int
    total_content_length: int
    document_types: Dict[str, int]
    combined_content: str
    content_summary: str
    key_topics: List[str]

    def __post_init__(self):
        """Post-initialization: derive document_types if it was not supplied."""
        if not self.document_types:
            self.document_types = {}
            for doc in self.documents:
                ext = doc.file_type.lower()
                self.document_types[ext] = self.document_types.get(ext, 0) + 1


class ContentIntegrator:
    """Content integrator: merges the information from multiple documents."""

    def __init__(self):
        pass
    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:
        """Integrate multiple documents.

        Args:
            documents: List of extracted documents.

        Returns:
            IntegratedContent: The integrated content.
        """
        if not documents:
            return IntegratedContent(
                documents=[],
                document_count=0,
                total_content_length=0,
                document_types={},
                combined_content="",
                content_summary="No document content was provided",
                key_topics=[],
            )

        # Count documents by file type (extension)
        document_types = {}
        for doc in documents:
            ext = doc.file_type.lower()
            document_types[ext] = document_types.get(ext, 0) + 1

        # Merge the document contents into a single string
        combined_content = self._combine_content(documents)
        total_length = len(combined_content)

        # Generate a per-document summary
        content_summary = self._generate_summary(documents)

        # Extract key topics from the combined content
        key_topics = self._extract_key_topics(combined_content)

        return IntegratedContent(
            documents=documents,
            document_count=len(documents),
            total_content_length=total_length,
            document_types=document_types,
            combined_content=combined_content,
            content_summary=content_summary,
            key_topics=key_topics,
        )
    def _combine_content(self, documents: List[ExtractedDocument]) -> str:
        """Merge document contents, prefixing each document with a metadata header."""
        combined = []
        for i, doc in enumerate(documents, 1):
            combined.append(f"=== Document {i}: {doc.filename} ===")
            combined.append(f"File type: {doc.file_type}")
            combined.append(f"File size: {doc.file_size} bytes")
            combined.append(f"Extracted at: {doc.extracted_at}")
            combined.append("")
            combined.append("Content:")
            combined.append(doc.content)
            combined.append("")
            combined.append("=" * 50)
            combined.append("")
        return "\n".join(combined)

    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:
        """Generate a content summary listing each document with a short preview."""
        if not documents:
            return "No document content"
        summary_parts = []
        summary_parts.append(f"Processed {len(documents)} document(s):")
        for i, doc in enumerate(documents, 1):
            # Truncate each document's content to a 100-character preview
            content_preview = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content
            summary_parts.append(f"{i}. {doc.filename} ({doc.file_type}): {content_preview}")
        return "\n".join(summary_parts)

    def _extract_key_topics(self, content: str) -> List[str]:
        """Extract key topics (simple frequency-based keyword extraction)."""
        if not content:
            return []
        # Naive Chinese keyword extraction: match runs of CJK characters.
        # A more sophisticated NLP method could be substituted here if needed.
        words = re.findall(r'[\u4e00-\u9fff]+', content)
        # Count word frequencies, considering only words of length >= 2
        word_count = {}
        for word in words:
            if len(word) >= 2:
                word_count[word] = word_count.get(word, 0) + 1
        # Return up to the 10 most frequent words that occur more than once
        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
        return [word for word, count in sorted_words[:10] if count > 1]
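

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of running the integrator. It assumes ExtractedDocument
# can be constructed with the keyword arguments shown below; the actual
# constructor lives in .text_extractor and may differ.
if __name__ == "__main__":
    from datetime import datetime

    docs = [
        ExtractedDocument(  # assumed constructor signature
            filename="report.pdf",
            file_type="pdf",
            file_size=2048,
            content="人工智能 可以 提升 文档 处理 效率。人工智能 正在 改变 文档 检索 方式。",
            extracted_at=datetime.now().isoformat(),
        ),
    ]
    integrator = ContentIntegrator()
    result = integrator.integrate_documents(docs)
    print(result.content_summary)
    # Repeated words surface as key topics, e.g. ['人工智能', '文档']
    print(result.key_topics)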