{
"file_path": "travel-algorithms/travel_algorithms/document_processing/content_integrator.py",
"file_size": 15711,
"line_count": 435,
"functions": [
{
"name": "__post_init__",
"line_start": 36,
"line_end": 45,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.integrated_at:\n self.integrated_at = datetime.now()\n \n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n file_type = doc.file_type\n self.document_types[file_type] = self.document_types.get(file_type, 0) + 1",
"code_hash": "dfa2a458566ec2698cc045a4f0be192e"
},
{
"name": "to_dict",
"line_start": 47,
"line_end": 59,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'document_count': self.document_count,\n 'total_content_length': self.total_content_length,\n 'document_types': self.document_types,\n 'combined_content': self.combined_content,\n 'content_summary': self.content_summary,\n 'key_topics': self.key_topics,\n 'integration_metadata': self.integration_metadata,\n 'integrated_at': self.integrated_at.isoformat(),\n 'documents': [doc.get_summary() for doc in self.documents]\n }",
"code_hash": "1d82123b5ec018a20433936b0df16320"
},
{
"name": "get_statistics",
"line_start": 61,
"line_end": 75,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取内容统计信息",
"is_async": false,
"decorators": [],
"code": " def get_statistics(self) -> Dict[str, Any]:\n \"\"\"获取内容统计信息\"\"\"\n total_words = sum(len(doc.content.split()) for doc in self.documents)\n successful_docs = [doc for doc in self.documents if not doc.error_info]\n \n return {\n 'total_documents': self.document_count,\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': self.document_count - len(successful_docs),\n 'total_words': total_words,\n 'total_content_length': self.total_content_length,\n 'average_content_length': self.total_content_length / max(1, len(successful_docs)),\n 'document_types': self.document_types,\n 'key_topics_count': len(self.key_topics)\n }",
"code_hash": "10c87ec40691e2b661e6007409a5634d"
},
{
"name": "__init__",
"line_start": 84,
"line_end": 101,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "DocumentProcessingConfig"
}
],
"return_type": null,
"docstring": "初始化内容整合器\n\nArgs:\n config: 文档处理配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化内容整合器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n \n # 停用词列表(中文)\n self.stop_words = {\n '的', '了', '是', '在', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', \n '你', '会', '着', '没', '看', '好', '自己', '这', '那', '来', '可以', '时候', '我', '他', '她', '它', '们',\n '之', '与', '及', '或', '但', '而', '因为', '所以', '如果', '虽然', '然而', '因此', '于是', '总之',\n '的话', '这样', '那样', '这里', '那里', '现在', '以前', '以后', '当时', '刚才', '马上', '立即'\n }\n \n logger.info(\"内容整合器初始化完成\")",
"code_hash": "4f448f1348426d8622feeae0c4f064cc"
},
{
"name": "integrate_documents",
"line_start": 103,
"line_end": 172,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "IntegratedContent",
"docstring": "整合多个文档\n\nArgs:\n documents: 提取的文档列表\n \nReturns:\n IntegratedContent: 整合后的内容\n \nRaises:\n DocumentProcessingError: 整合失败时抛出",
"is_async": false,
"decorators": [],
"code": " def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"\n 整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \n Raises:\n DocumentProcessingError: 整合失败时抛出\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有可处理的文档\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty'},\n integrated_at=datetime.now()\n )\n \n try:\n logger.info(f\"开始整合 {len(documents)} 个文档\")\n \n # 过滤成功提取的文档\n successful_docs = [doc for doc in documents if doc.content and not doc.error_info]\n \n if not successful_docs:\n logger.warning(\"没有成功提取的文档内容\")\n return self._create_empty_integration(documents, \"所有文档提取失败\")\n \n # 合并内容\n combined_content = self._combine_content(successful_docs)\n \n # 生成内容摘要\n content_summary = self._generate_summary(combined_content, successful_docs)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n # 收集统计信息\n integration_metadata = self._collect_metadata(documents, successful_docs)\n \n # 计算总内容长度\n total_content_length = sum(len(doc.content) for doc in successful_docs)\n \n integrated_content = IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_content_length,\n document_types=self._count_document_types(documents),\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics,\n integration_metadata=integration_metadata,\n integrated_at=datetime.now()\n )\n \n logger.info(f\"文档整合完成,合并内容长度: {len(combined_content)}\")\n return integrated_content\n \n except Exception as e:\n error_msg = f\"文档整合失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
"code_hash": "29f02e2616a26e6f18f5e472032c713b"
},
{
"name": "_combine_content",
"line_start": 174,
"line_end": 203,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "str",
"docstring": "合并文档内容",
"is_async": false,
"decorators": [],
"code": " def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n if not documents:\n return \"\"\n \n combined_parts = []\n \n for i, doc in enumerate(documents):\n if doc.content.strip():\n # 添加文档分隔符\n if i > 0:\n combined_parts.append(f\"\\n{'='*50}\\n\")\n \n # 添加文档头信息\n combined_parts.append(f\"文档: {doc.filename} ({doc.file_type})\\n\")\n if doc.page_count:\n combined_parts.append(f\"页数: {doc.page_count}\\n\")\n combined_parts.append(\"-\" * 30 + \"\\n\")\n \n # 添加文档内容\n content = doc.content.strip()\n \n # 应用内容长度限制\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n content = content[:self.config.max_content_length] + \"\\n[内容已截断...]\"\n \n combined_parts.append(content)\n combined_parts.append(\"\\n\")\n \n return \"\".join(combined_parts)",
"code_hash": "ae1f5bf801d40221bdc66126f230a2d6"
},
{
"name": "_generate_summary",
"line_start": 205,
"line_end": 244,
"args": [
{
"name": "self"
},
{
"name": "combined_content",
"type_hint": "str"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "str",
"docstring": "生成内容摘要",
"is_async": false,
"decorators": [],
"code": " def _generate_summary(self, combined_content: str, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not combined_content.strip():\n return \"无内容可摘要\"\n \n summary_parts = []\n \n # 基本统计\n word_count = len(combined_content.split())\n char_count = len(combined_content)\n \n summary_parts.append(f\"文档摘要:\")\n summary_parts.append(f\"- 成功处理文档数: {len(documents)}\")\n summary_parts.append(f\"- 总字符数: {char_count:,}\")\n summary_parts.append(f\"- 总词数: {word_count:,}\")\n \n # 文档类型统计\n doc_types = Counter(doc.file_type for doc in documents)\n summary_parts.append(f\"- 文档类型分布: {dict(doc_types)}\")\n \n # 内容长度分析\n content_lengths = [len(doc.content) for doc in documents if doc.content]\n if content_lengths:\n avg_length = sum(content_lengths) / len(content_lengths)\n summary_parts.append(f\"- 平均文档长度: {avg_length:.0f} 字符\")\n \n # 提取前几个段落作为内容预览\n paragraphs = [p.strip() for p in combined_content.split('\\n') if p.strip()]\n preview_paragraphs = []\n \n for para in paragraphs[:5]: # 最多取前5个段落\n if len(para) > 10 and not para.startswith('-') and not para.startswith('='):\n preview_paragraphs.append(para[:100] + \"...\" if len(para) > 100 else para)\n \n if preview_paragraphs:\n summary_parts.append(\"\\n内容预览:\")\n for i, para in enumerate(preview_paragraphs, 1):\n summary_parts.append(f\"{i}. {para}\")\n \n return \"\\n\".join(summary_parts)",
"code_hash": "317ec07d8e64f2cbf8ff75d02c585220"
},
{
"name": "_extract_key_topics",
"line_start": 246,
"line_end": 268,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "List[str]",
"docstring": "提取关键主题",
"is_async": false,
"decorators": [],
"code": " def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题\"\"\"\n if not content.strip():\n return []\n \n try:\n # 文本预处理\n text = self._preprocess_text(content)\n \n # 提取词频\n word_freq = self._calculate_word_frequency(text)\n \n # 筛选关键词\n keywords = self._filter_keywords(word_freq)\n \n # 主题聚类(简单版本)\n topics = self._cluster_topics(keywords, content)\n \n return topics[:self.config.max_topics]\n \n except Exception as e:\n logger.warning(f\"关键主题提取失败: {e}\")\n return []",
"code_hash": "e21880093361157107ba9fe51078854c"
},
{
"name": "_preprocess_text",
"line_start": 270,
"line_end": 278,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "str",
"docstring": "文本预处理",
"is_async": false,
"decorators": [],
"code": " def _preprocess_text(self, text: str) -> str:\n \"\"\"文本预处理\"\"\"\n # 移除特殊字符和数字\n text = re.sub(r'[^\\u4e00-\\u9fa5a-zA-Z\\s]', ' ', text)\n \n # 移除多余空白\n text = re.sub(r'\\s+', ' ', text).strip()\n \n return text",
"code_hash": "5d5dc7ad13e8caaaca8afb20038a80e2"
},
{
"name": "_calculate_word_frequency",
"line_start": 280,
"line_end": 292,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, int]",
"docstring": "计算词频",
"is_async": false,
"decorators": [],
"code": " def _calculate_word_frequency(self, text: str) -> Dict[str, int]:\n \"\"\"计算词频\"\"\"\n words = text.split()\n word_freq = Counter()\n \n for word in words:\n word = word.strip().lower()\n \n # 过滤短词和停用词\n if len(word) >= 2 and word not in self.stop_words:\n word_freq[word] += 1\n \n return dict(word_freq)",
"code_hash": "60a4dccdbc57da50b752d88025914303"
},
{
"name": "_filter_keywords",
"line_start": 294,
"line_end": 305,
"args": [
{
"name": "self"
},
{
"name": "word_freq",
"type_hint": "Dict[str, int]"
}
],
"return_type": "List[tuple]",
"docstring": "筛选关键词",
"is_async": false,
"decorators": [],
"code": " def _filter_keywords(self, word_freq: Dict[str, int]) -> List[tuple]:\n \"\"\"筛选关键词\"\"\"\n # 按频率排序\n sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)\n \n # 取前N个高频词作为关键词\n keywords = []\n for word, freq in sorted_words[:50]: # 最多50个关键词\n if freq >= 2: # 至少出现2次\n keywords.append((word, freq))\n \n return keywords",
"code_hash": "2c7fb426834120ae71702ca1858d29ee"
},
{
"name": "_cluster_topics",
"line_start": 307,
"line_end": 343,
"args": [
{
"name": "self"
},
{
"name": "keywords",
"type_hint": "List[tuple]"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "List[str]",
"docstring": "简单的主题聚类",
"is_async": false,
"decorators": [],
"code": " def _cluster_topics(self, keywords: List[tuple], content: str) -> List[str]:\n \"\"\"简单的主题聚类\"\"\"\n if not keywords:\n return []\n \n topics = []\n \n # 基于关键词生成主题\n keyword_words = [kw[0] for kw in keywords[:20]]\n \n # 预定义主题模式\n topic_patterns = {\n '景区介绍': ['景区', '景点', '景观', '风景', '旅游', '游览', '参观', '观光'],\n '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],\n '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],\n '交通指南': ['交通', '路线', '公交', '地铁', '自驾', '停车', '导航'],\n '服务设施': ['服务', '设施', '餐厅', '厕所', '休息', '购物', '商店'],\n '注意事项': ['注意', '提醒', '禁止', '安全', '建议', '须知'],\n '活动体验': ['活动', '体验', '表演', '娱乐', '项目', '节目']\n }\n \n for topic_name, patterns in topic_patterns.items():\n matches = 0\n for keyword in keyword_words:\n for pattern in patterns:\n if pattern in keyword or keyword in pattern:\n matches += 1\n break\n \n if matches >= 2: # 至少匹配2个模式\n topics.append(topic_name)\n \n # 如果没有匹配的预定义主题,使用高频关键词\n if not topics:\n topics = [kw[0] for kw in keywords[:5]]\n \n return topics",
"code_hash": "1da2bbe4375348bb212bb26829916281"
},
{
"name": "_count_document_types",
"line_start": 345,
"line_end": 350,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "Dict[str, int]",
"docstring": "统计文档类型",
"is_async": false,
"decorators": [],
"code": " def _count_document_types(self, documents: List[ExtractedDocument]) -> Dict[str, int]:\n \"\"\"统计文档类型\"\"\"\n type_counts = Counter()\n for doc in documents:\n type_counts[doc.file_type] += 1\n return dict(type_counts)",
"code_hash": "39f82f3afb3f45311de05455c401570c"
},
{
"name": "_collect_metadata",
"line_start": 352,
"line_end": 368,
"args": [
{
"name": "self"
},
{
"name": "all_documents",
"type_hint": "List[ExtractedDocument]"
},
{
"name": "successful_docs",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "收集整合元数据",
"is_async": false,
"decorators": [],
"code": " def _collect_metadata(self, all_documents: List[ExtractedDocument], successful_docs: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"收集整合元数据\"\"\"\n total_file_size = sum(doc.file_size for doc in all_documents)\n extraction_methods = Counter(doc.extraction_method for doc in successful_docs if doc.extraction_method)\n \n return {\n 'integration_method': 'content_integrator_v2',\n 'total_documents': len(all_documents),\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': len(all_documents) - len(successful_docs),\n 'total_file_size': total_file_size,\n 'extraction_methods_used': dict(extraction_methods),\n 'processing_timestamp': datetime.now().isoformat(),\n 'content_integration_enabled': self.config.enable_content_integration,\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics\n }",
"code_hash": "4b8f45cdbb97e153a6b3357b24e59555"
},
{
"name": "_create_empty_integration",
"line_start": 370,
"line_end": 382,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
},
{
"name": "reason",
"type_hint": "str"
}
],
"return_type": "IntegratedContent",
"docstring": "创建空的整合结果",
"is_async": false,
"decorators": [],
"code": " def _create_empty_integration(self, documents: List[ExtractedDocument], reason: str) -> IntegratedContent:\n \"\"\"创建空的整合结果\"\"\"\n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=0,\n document_types=self._count_document_types(documents),\n combined_content=\"\",\n content_summary=f\"整合失败: {reason}\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty', 'failure_reason': reason},\n integrated_at=datetime.now()\n )",
"code_hash": "dc78387417a3481b4e9de7a4bc8ee97f"
},
{
"name": "analyze_content_overlap",
"line_start": 384,
"line_end": 420,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "分析文档内容重叠情况",
"is_async": false,
"decorators": [],
"code": " def analyze_content_overlap(self, documents: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"分析文档内容重叠情况\"\"\"\n if len(documents) < 2:\n return {'overlap_analysis': '需要至少2个文档进行重叠分析'}\n \n # 简单的重叠分析\n doc_words = []\n for doc in documents:\n if doc.content and not doc.error_info:\n words = set(self._preprocess_text(doc.content).split())\n doc_words.append((doc.filename, words))\n \n if len(doc_words) < 2:\n return {'overlap_analysis': '没有足够的有效文档内容进行分析'}\n \n overlap_results = {}\n \n for i in range(len(doc_words)):\n for j in range(i + 1, len(doc_words)):\n doc1_name, words1 = doc_words[i]\n doc2_name, words2 = doc_words[j]\n \n common_words = words1.intersection(words2)\n total_words = len(words1.union(words2))\n \n if total_words > 0:\n overlap_ratio = len(common_words) / total_words\n overlap_results[f\"{doc1_name} vs {doc2_name}\"] = {\n 'common_words_count': len(common_words),\n 'overlap_ratio': round(overlap_ratio, 3),\n 'common_words_sample': list(common_words)[:10]\n }\n \n return {\n 'overlap_analysis': overlap_results,\n 'total_comparisons': len(overlap_results)\n }",
"code_hash": "29b905f15eb4c80bb9afc905a96b3249"
},
{
"name": "get_integration_stats",
"line_start": 422,
"line_end": 436,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取整合器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_integration_stats(self) -> Dict[str, Any]:\n \"\"\"获取整合器统计信息\"\"\"\n return {\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics,\n 'enable_content_integration': self.config.enable_content_integration,\n 'stop_words_count': len(self.stop_words),\n 'supported_analysis': [\n 'content_combination',\n 'summary_generation', \n 'key_topic_extraction',\n 'content_overlap_analysis',\n 'document_type_statistics'\n ]\n } ",
"code_hash": "3a9ed49f3fe653c211e8c2e0253d87e6"
}
],
"classes": [
{
"name": "IntegratedContent",
"line_start": 24,
"line_end": 75,
"bases": [],
"methods": [
{
"name": "__post_init__",
"line_start": 36,
"line_end": 45,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.integrated_at:\n self.integrated_at = datetime.now()\n \n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n file_type = doc.file_type\n self.document_types[file_type] = self.document_types.get(file_type, 0) + 1",
"code_hash": "dfa2a458566ec2698cc045a4f0be192e"
},
{
"name": "to_dict",
"line_start": 47,
"line_end": 59,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'document_count': self.document_count,\n 'total_content_length': self.total_content_length,\n 'document_types': self.document_types,\n 'combined_content': self.combined_content,\n 'content_summary': self.content_summary,\n 'key_topics': self.key_topics,\n 'integration_metadata': self.integration_metadata,\n 'integrated_at': self.integrated_at.isoformat(),\n 'documents': [doc.get_summary() for doc in self.documents]\n }",
"code_hash": "1d82123b5ec018a20433936b0df16320"
},
{
"name": "get_statistics",
"line_start": 61,
"line_end": 75,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取内容统计信息",
"is_async": false,
"decorators": [],
"code": " def get_statistics(self) -> Dict[str, Any]:\n \"\"\"获取内容统计信息\"\"\"\n total_words = sum(len(doc.content.split()) for doc in self.documents)\n successful_docs = [doc for doc in self.documents if not doc.error_info]\n \n return {\n 'total_documents': self.document_count,\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': self.document_count - len(successful_docs),\n 'total_words': total_words,\n 'total_content_length': self.total_content_length,\n 'average_content_length': self.total_content_length / max(1, len(successful_docs)),\n 'document_types': self.document_types,\n 'key_topics_count': len(self.key_topics)\n }",
"code_hash": "10c87ec40691e2b661e6007409a5634d"
}
],
"docstring": "整合后的内容",
"decorators": [
"dataclass"
],
"code": "class IntegratedContent:\n \"\"\"整合后的内容\"\"\"\n documents: List[ExtractedDocument]\n document_count: int\n total_content_length: int\n document_types: Dict[str, int]\n combined_content: str\n content_summary: str\n key_topics: List[str]\n integration_metadata: Dict[str, Any]\n integrated_at: datetime\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.integrated_at:\n self.integrated_at = datetime.now()\n \n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n file_type = doc.file_type\n self.document_types[file_type] = self.document_types.get(file_type, 0) + 1\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'document_count': self.document_count,\n 'total_content_length': self.total_content_length,\n 'document_types': self.document_types,\n 'combined_content': self.combined_content,\n 'content_summary': self.content_summary,\n 'key_topics': self.key_topics,\n 'integration_metadata': self.integration_metadata,\n 'integrated_at': self.integrated_at.isoformat(),\n 'documents': [doc.get_summary() for doc in self.documents]\n }\n \n def get_statistics(self) -> Dict[str, Any]:\n \"\"\"获取内容统计信息\"\"\"\n total_words = sum(len(doc.content.split()) for doc in self.documents)\n successful_docs = [doc for doc in self.documents if not doc.error_info]\n \n return {\n 'total_documents': self.document_count,\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': self.document_count - len(successful_docs),\n 'total_words': total_words,\n 'total_content_length': self.total_content_length,\n 'average_content_length': self.total_content_length / max(1, len(successful_docs)),\n 'document_types': self.document_types,\n 'key_topics_count': len(self.key_topics)\n }",
"code_hash": "20fc804660dc1b1d8ff4ccea9f20d21e"
},
{
"name": "ContentIntegrator",
"line_start": 78,
"line_end": 436,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 84,
"line_end": 101,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "DocumentProcessingConfig"
}
],
"return_type": null,
"docstring": "初始化内容整合器\n\nArgs:\n config: 文档处理配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化内容整合器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n \n # 停用词列表(中文)\n self.stop_words = {\n '的', '了', '是', '在', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', \n '你', '会', '着', '没', '看', '好', '自己', '这', '那', '来', '可以', '时候', '我', '他', '她', '它', '们',\n '之', '与', '及', '或', '但', '而', '因为', '所以', '如果', '虽然', '然而', '因此', '于是', '总之',\n '的话', '这样', '那样', '这里', '那里', '现在', '以前', '以后', '当时', '刚才', '马上', '立即'\n }\n \n logger.info(\"内容整合器初始化完成\")",
"code_hash": "4f448f1348426d8622feeae0c4f064cc"
},
{
"name": "integrate_documents",
"line_start": 103,
"line_end": 172,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "IntegratedContent",
"docstring": "整合多个文档\n\nArgs:\n documents: 提取的文档列表\n \nReturns:\n IntegratedContent: 整合后的内容\n \nRaises:\n DocumentProcessingError: 整合失败时抛出",
"is_async": false,
"decorators": [],
"code": " def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"\n 整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \n Raises:\n DocumentProcessingError: 整合失败时抛出\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有可处理的文档\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty'},\n integrated_at=datetime.now()\n )\n \n try:\n logger.info(f\"开始整合 {len(documents)} 个文档\")\n \n # 过滤成功提取的文档\n successful_docs = [doc for doc in documents if doc.content and not doc.error_info]\n \n if not successful_docs:\n logger.warning(\"没有成功提取的文档内容\")\n return self._create_empty_integration(documents, \"所有文档提取失败\")\n \n # 合并内容\n combined_content = self._combine_content(successful_docs)\n \n # 生成内容摘要\n content_summary = self._generate_summary(combined_content, successful_docs)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n # 收集统计信息\n integration_metadata = self._collect_metadata(documents, successful_docs)\n \n # 计算总内容长度\n total_content_length = sum(len(doc.content) for doc in successful_docs)\n \n integrated_content = IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_content_length,\n document_types=self._count_document_types(documents),\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics,\n integration_metadata=integration_metadata,\n integrated_at=datetime.now()\n )\n \n logger.info(f\"文档整合完成,合并内容长度: {len(combined_content)}\")\n return integrated_content\n \n except Exception as e:\n error_msg = f\"文档整合失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
"code_hash": "29f02e2616a26e6f18f5e472032c713b"
},
{
"name": "_combine_content",
"line_start": 174,
"line_end": 203,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "str",
"docstring": "合并文档内容",
"is_async": false,
"decorators": [],
"code": " def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n if not documents:\n return \"\"\n \n combined_parts = []\n \n for i, doc in enumerate(documents):\n if doc.content.strip():\n # 添加文档分隔符\n if i > 0:\n combined_parts.append(f\"\\n{'='*50}\\n\")\n \n # 添加文档头信息\n combined_parts.append(f\"文档: {doc.filename} ({doc.file_type})\\n\")\n if doc.page_count:\n combined_parts.append(f\"页数: {doc.page_count}\\n\")\n combined_parts.append(\"-\" * 30 + \"\\n\")\n \n # 添加文档内容\n content = doc.content.strip()\n \n # 应用内容长度限制\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n content = content[:self.config.max_content_length] + \"\\n[内容已截断...]\"\n \n combined_parts.append(content)\n combined_parts.append(\"\\n\")\n \n return \"\".join(combined_parts)",
"code_hash": "ae1f5bf801d40221bdc66126f230a2d6"
},
{
"name": "_generate_summary",
"line_start": 205,
"line_end": 244,
"args": [
{
"name": "self"
},
{
"name": "combined_content",
"type_hint": "str"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "str",
"docstring": "生成内容摘要",
"is_async": false,
"decorators": [],
"code": " def _generate_summary(self, combined_content: str, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not combined_content.strip():\n return \"无内容可摘要\"\n \n summary_parts = []\n \n # 基本统计\n word_count = len(combined_content.split())\n char_count = len(combined_content)\n \n summary_parts.append(f\"文档摘要:\")\n summary_parts.append(f\"- 成功处理文档数: {len(documents)}\")\n summary_parts.append(f\"- 总字符数: {char_count:,}\")\n summary_parts.append(f\"- 总词数: {word_count:,}\")\n \n # 文档类型统计\n doc_types = Counter(doc.file_type for doc in documents)\n summary_parts.append(f\"- 文档类型分布: {dict(doc_types)}\")\n \n # 内容长度分析\n content_lengths = [len(doc.content) for doc in documents if doc.content]\n if content_lengths:\n avg_length = sum(content_lengths) / len(content_lengths)\n summary_parts.append(f\"- 平均文档长度: {avg_length:.0f} 字符\")\n \n # 提取前几个段落作为内容预览\n paragraphs = [p.strip() for p in combined_content.split('\\n') if p.strip()]\n preview_paragraphs = []\n \n for para in paragraphs[:5]: # 最多取前5个段落\n if len(para) > 10 and not para.startswith('-') and not para.startswith('='):\n preview_paragraphs.append(para[:100] + \"...\" if len(para) > 100 else para)\n \n if preview_paragraphs:\n summary_parts.append(\"\\n内容预览:\")\n for i, para in enumerate(preview_paragraphs, 1):\n summary_parts.append(f\"{i}. {para}\")\n \n return \"\\n\".join(summary_parts)",
"code_hash": "317ec07d8e64f2cbf8ff75d02c585220"
},
{
"name": "_extract_key_topics",
"line_start": 246,
"line_end": 268,
"args": [
{
"name": "self"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "List[str]",
"docstring": "提取关键主题",
"is_async": false,
"decorators": [],
"code": " def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题\"\"\"\n if not content.strip():\n return []\n \n try:\n # 文本预处理\n text = self._preprocess_text(content)\n \n # 提取词频\n word_freq = self._calculate_word_frequency(text)\n \n # 筛选关键词\n keywords = self._filter_keywords(word_freq)\n \n # 主题聚类(简单版本)\n topics = self._cluster_topics(keywords, content)\n \n return topics[:self.config.max_topics]\n \n except Exception as e:\n logger.warning(f\"关键主题提取失败: {e}\")\n return []",
"code_hash": "e21880093361157107ba9fe51078854c"
},
{
"name": "_preprocess_text",
"line_start": 270,
"line_end": 278,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "str",
"docstring": "文本预处理",
"is_async": false,
"decorators": [],
"code": " def _preprocess_text(self, text: str) -> str:\n \"\"\"文本预处理\"\"\"\n # 移除特殊字符和数字\n text = re.sub(r'[^\\u4e00-\\u9fa5a-zA-Z\\s]', ' ', text)\n \n # 移除多余空白\n text = re.sub(r'\\s+', ' ', text).strip()\n \n return text",
"code_hash": "5d5dc7ad13e8caaaca8afb20038a80e2"
},
{
"name": "_calculate_word_frequency",
"line_start": 280,
"line_end": 292,
"args": [
{
"name": "self"
},
{
"name": "text",
"type_hint": "str"
}
],
"return_type": "Dict[str, int]",
"docstring": "计算词频",
"is_async": false,
"decorators": [],
"code": " def _calculate_word_frequency(self, text: str) -> Dict[str, int]:\n \"\"\"计算词频\"\"\"\n words = text.split()\n word_freq = Counter()\n \n for word in words:\n word = word.strip().lower()\n \n # 过滤短词和停用词\n if len(word) >= 2 and word not in self.stop_words:\n word_freq[word] += 1\n \n return dict(word_freq)",
"code_hash": "60a4dccdbc57da50b752d88025914303"
},
{
"name": "_filter_keywords",
"line_start": 294,
"line_end": 305,
"args": [
{
"name": "self"
},
{
"name": "word_freq",
"type_hint": "Dict[str, int]"
}
],
"return_type": "List[tuple]",
"docstring": "筛选关键词",
"is_async": false,
"decorators": [],
"code": " def _filter_keywords(self, word_freq: Dict[str, int]) -> List[tuple]:\n \"\"\"筛选关键词\"\"\"\n # 按频率排序\n sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)\n \n # 取前N个高频词作为关键词\n keywords = []\n for word, freq in sorted_words[:50]: # 最多50个关键词\n if freq >= 2: # 至少出现2次\n keywords.append((word, freq))\n \n return keywords",
"code_hash": "2c7fb426834120ae71702ca1858d29ee"
},
{
"name": "_cluster_topics",
"line_start": 307,
"line_end": 343,
"args": [
{
"name": "self"
},
{
"name": "keywords",
"type_hint": "List[tuple]"
},
{
"name": "content",
"type_hint": "str"
}
],
"return_type": "List[str]",
"docstring": "简单的主题聚类",
"is_async": false,
"decorators": [],
"code": " def _cluster_topics(self, keywords: List[tuple], content: str) -> List[str]:\n \"\"\"简单的主题聚类\"\"\"\n if not keywords:\n return []\n \n topics = []\n \n # 基于关键词生成主题\n keyword_words = [kw[0] for kw in keywords[:20]]\n \n # 预定义主题模式\n topic_patterns = {\n '景区介绍': ['景区', '景点', '景观', '风景', '旅游', '游览', '参观', '观光'],\n '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],\n '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],\n '交通指南': ['交通', '路线', '公交', '地铁', '自驾', '停车', '导航'],\n '服务设施': ['服务', '设施', '餐厅', '厕所', '休息', '购物', '商店'],\n '注意事项': ['注意', '提醒', '禁止', '安全', '建议', '须知'],\n '活动体验': ['活动', '体验', '表演', '娱乐', '项目', '节目']\n }\n \n for topic_name, patterns in topic_patterns.items():\n matches = 0\n for keyword in keyword_words:\n for pattern in patterns:\n if pattern in keyword or keyword in pattern:\n matches += 1\n break\n \n if matches >= 2: # 至少匹配2个模式\n topics.append(topic_name)\n \n # 如果没有匹配的预定义主题,使用高频关键词\n if not topics:\n topics = [kw[0] for kw in keywords[:5]]\n \n return topics",
"code_hash": "1da2bbe4375348bb212bb26829916281"
},
{
"name": "_count_document_types",
"line_start": 345,
"line_end": 350,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "Dict[str, int]",
"docstring": "统计文档类型",
"is_async": false,
"decorators": [],
"code": " def _count_document_types(self, documents: List[ExtractedDocument]) -> Dict[str, int]:\n \"\"\"统计文档类型\"\"\"\n type_counts = Counter()\n for doc in documents:\n type_counts[doc.file_type] += 1\n return dict(type_counts)",
"code_hash": "39f82f3afb3f45311de05455c401570c"
},
{
"name": "_collect_metadata",
"line_start": 352,
"line_end": 368,
"args": [
{
"name": "self"
},
{
"name": "all_documents",
"type_hint": "List[ExtractedDocument]"
},
{
"name": "successful_docs",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "收集整合元数据",
"is_async": false,
"decorators": [],
"code": " def _collect_metadata(self, all_documents: List[ExtractedDocument], successful_docs: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"收集整合元数据\"\"\"\n total_file_size = sum(doc.file_size for doc in all_documents)\n extraction_methods = Counter(doc.extraction_method for doc in successful_docs if doc.extraction_method)\n \n return {\n 'integration_method': 'content_integrator_v2',\n 'total_documents': len(all_documents),\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': len(all_documents) - len(successful_docs),\n 'total_file_size': total_file_size,\n 'extraction_methods_used': dict(extraction_methods),\n 'processing_timestamp': datetime.now().isoformat(),\n 'content_integration_enabled': self.config.enable_content_integration,\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics\n }",
"code_hash": "4b8f45cdbb97e153a6b3357b24e59555"
},
{
"name": "_create_empty_integration",
"line_start": 370,
"line_end": 382,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
},
{
"name": "reason",
"type_hint": "str"
}
],
"return_type": "IntegratedContent",
"docstring": "创建空的整合结果",
"is_async": false,
"decorators": [],
"code": " def _create_empty_integration(self, documents: List[ExtractedDocument], reason: str) -> IntegratedContent:\n \"\"\"创建空的整合结果\"\"\"\n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=0,\n document_types=self._count_document_types(documents),\n combined_content=\"\",\n content_summary=f\"整合失败: {reason}\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty', 'failure_reason': reason},\n integrated_at=datetime.now()\n )",
"code_hash": "dc78387417a3481b4e9de7a4bc8ee97f"
},
{
"name": "analyze_content_overlap",
"line_start": 384,
"line_end": 420,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "分析文档内容重叠情况",
"is_async": false,
"decorators": [],
"code": " def analyze_content_overlap(self, documents: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"分析文档内容重叠情况\"\"\"\n if len(documents) < 2:\n return {'overlap_analysis': '需要至少2个文档进行重叠分析'}\n \n # 简单的重叠分析\n doc_words = []\n for doc in documents:\n if doc.content and not doc.error_info:\n words = set(self._preprocess_text(doc.content).split())\n doc_words.append((doc.filename, words))\n \n if len(doc_words) < 2:\n return {'overlap_analysis': '没有足够的有效文档内容进行分析'}\n \n overlap_results = {}\n \n for i in range(len(doc_words)):\n for j in range(i + 1, len(doc_words)):\n doc1_name, words1 = doc_words[i]\n doc2_name, words2 = doc_words[j]\n \n common_words = words1.intersection(words2)\n total_words = len(words1.union(words2))\n \n if total_words > 0:\n overlap_ratio = len(common_words) / total_words\n overlap_results[f\"{doc1_name} vs {doc2_name}\"] = {\n 'common_words_count': len(common_words),\n 'overlap_ratio': round(overlap_ratio, 3),\n 'common_words_sample': list(common_words)[:10]\n }\n \n return {\n 'overlap_analysis': overlap_results,\n 'total_comparisons': len(overlap_results)\n }",
"code_hash": "29b905f15eb4c80bb9afc905a96b3249"
},
{
"name": "get_integration_stats",
"line_start": 422,
"line_end": 436,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取整合器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_integration_stats(self) -> Dict[str, Any]:\n \"\"\"获取整合器统计信息\"\"\"\n return {\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics,\n 'enable_content_integration': self.config.enable_content_integration,\n 'stop_words_count': len(self.stop_words),\n 'supported_analysis': [\n 'content_combination',\n 'summary_generation', \n 'key_topic_extraction',\n 'content_overlap_analysis',\n 'document_type_statistics'\n ]\n } ",
"code_hash": "3a9ed49f3fe653c211e8c2e0253d87e6"
}
],
"docstring": "内容整合器 - 重构版本\n负责整合多个文档的信息提取关键内容和主题",
"decorators": [],
"code": "class ContentIntegrator:\n \"\"\"\n 内容整合器 - 重构版本\n 负责整合多个文档的信息,提取关键内容和主题\n \"\"\"\n \n def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化内容整合器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n \n # 停用词列表(中文)\n self.stop_words = {\n '的', '了', '是', '在', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', \n '你', '会', '着', '没', '看', '好', '自己', '这', '那', '来', '可以', '时候', '我', '他', '她', '它', '们',\n '之', '与', '及', '或', '但', '而', '因为', '所以', '如果', '虽然', '然而', '因此', '于是', '总之',\n '的话', '这样', '那样', '这里', '那里', '现在', '以前', '以后', '当时', '刚才', '马上', '立即'\n }\n \n logger.info(\"内容整合器初始化完成\")\n \n def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"\n 整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \n Raises:\n DocumentProcessingError: 整合失败时抛出\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有可处理的文档\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty'},\n integrated_at=datetime.now()\n )\n \n try:\n logger.info(f\"开始整合 {len(documents)} 个文档\")\n \n # 过滤成功提取的文档\n successful_docs = [doc for doc in documents if doc.content and not doc.error_info]\n \n if not successful_docs:\n logger.warning(\"没有成功提取的文档内容\")\n return self._create_empty_integration(documents, \"所有文档提取失败\")\n \n # 合并内容\n combined_content = self._combine_content(successful_docs)\n \n # 生成内容摘要\n content_summary = self._generate_summary(combined_content, successful_docs)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n # 收集统计信息\n integration_metadata = self._collect_metadata(documents, successful_docs)\n \n # 计算总内容长度\n total_content_length = sum(len(doc.content) for doc in successful_docs)\n \n integrated_content = IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_content_length,\n document_types=self._count_document_types(documents),\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics,\n integration_metadata=integration_metadata,\n integrated_at=datetime.now()\n )\n \n logger.info(f\"文档整合完成,合并内容长度: {len(combined_content)}\")\n return integrated_content\n \n except Exception as e:\n error_msg = f\"文档整合失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)\n \n def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n if not documents:\n return \"\"\n \n combined_parts = []\n \n for i, doc in enumerate(documents):\n if doc.content.strip():\n # 添加文档分隔符\n if i > 0:\n combined_parts.append(f\"\\n{'='*50}\\n\")\n \n # 添加文档头信息\n combined_parts.append(f\"文档: {doc.filename} ({doc.file_type})\\n\")\n if doc.page_count:\n combined_parts.append(f\"页数: {doc.page_count}\\n\")\n combined_parts.append(\"-\" * 30 + \"\\n\")\n \n # 添加文档内容\n content = doc.content.strip()\n \n # 应用内容长度限制\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n content = content[:self.config.max_content_length] + \"\\n[内容已截断...]\"\n \n combined_parts.append(content)\n combined_parts.append(\"\\n\")\n \n return \"\".join(combined_parts)\n \n def _generate_summary(self, combined_content: str, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not combined_content.strip():\n return \"无内容可摘要\"\n \n summary_parts = []\n \n # 基本统计\n 
word_count = len(combined_content.split())\n char_count = len(combined_content)\n \n summary_parts.append(f\"文档摘要:\")\n summary_parts.append(f\"- 成功处理文档数: {len(documents)}\")\n summary_parts.append(f\"- 总字符数: {char_count:,}\")\n summary_parts.append(f\"- 总词数: {word_count:,}\")\n \n # 文档类型统计\n doc_types = Counter(doc.file_type for doc in documents)\n summary_parts.append(f\"- 文档类型分布: {dict(doc_types)}\")\n \n # 内容长度分析\n content_lengths = [len(doc.content) for doc in documents if doc.content]\n if content_lengths:\n avg_length = sum(content_lengths) / len(content_lengths)\n summary_parts.append(f\"- 平均文档长度: {avg_length:.0f} 字符\")\n \n # 提取前几个段落作为内容预览\n paragraphs = [p.strip() for p in combined_content.split('\\n') if p.strip()]\n preview_paragraphs = []\n \n for para in paragraphs[:5]: # 最多取前5个段落\n if len(para) > 10 and not para.startswith('-') and not para.startswith('='):\n preview_paragraphs.append(para[:100] + \"...\" if len(para) > 100 else para)\n \n if preview_paragraphs:\n summary_parts.append(\"\\n内容预览:\")\n for i, para in enumerate(preview_paragraphs, 1):\n summary_parts.append(f\"{i}. {para}\")\n \n return \"\\n\".join(summary_parts)\n \n def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题\"\"\"\n if not content.strip():\n return []\n \n try:\n # 文本预处理\n text = self._preprocess_text(content)\n \n # 提取词频\n word_freq = self._calculate_word_frequency(text)\n \n # 筛选关键词\n keywords = self._filter_keywords(word_freq)\n \n # 主题聚类(简单版本)\n topics = self._cluster_topics(keywords, content)\n \n return topics[:self.config.max_topics]\n \n except Exception as e:\n logger.warning(f\"关键主题提取失败: {e}\")\n return []\n \n def _preprocess_text(self, text: str) -> str:\n \"\"\"文本预处理\"\"\"\n # 移除特殊字符和数字\n text = re.sub(r'[^\\u4e00-\\u9fa5a-zA-Z\\s]', ' ', text)\n \n # 移除多余空白\n text = re.sub(r'\\s+', ' ', text).strip()\n \n return text\n \n def _calculate_word_frequency(self, text: str) -> Dict[str, int]:\n \"\"\"计算词频\"\"\"\n words = text.split()\n word_freq = Counter()\n \n for word in words:\n word = word.strip().lower()\n \n # 过滤短词和停用词\n if len(word) >= 2 and word not in self.stop_words:\n word_freq[word] += 1\n \n return dict(word_freq)\n \n def _filter_keywords(self, word_freq: Dict[str, int]) -> List[tuple]:\n \"\"\"筛选关键词\"\"\"\n # 按频率排序\n sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)\n \n # 取前N个高频词作为关键词\n keywords = []\n for word, freq in sorted_words[:50]: # 最多50个关键词\n if freq >= 2: # 至少出现2次\n keywords.append((word, freq))\n \n return keywords\n \n def _cluster_topics(self, keywords: List[tuple], content: str) -> List[str]:\n \"\"\"简单的主题聚类\"\"\"\n if not keywords:\n return []\n \n topics = []\n \n # 基于关键词生成主题\n keyword_words = [kw[0] for kw in keywords[:20]]\n \n # 预定义主题模式\n topic_patterns = {\n '景区介绍': ['景区', '景点', '景观', '风景', '旅游', '游览', '参观', '观光'],\n '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],\n '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],\n '交通指南': ['交通', '路线', '公交', '地铁', '自驾', '停车', '导航'],\n '服务设施': ['服务', '设施', '餐厅', '厕所', '休息', '购物', '商店'],\n '注意事项': ['注意', '提醒', '禁止', '安全', '建议', '须知'],\n '活动体验': ['活动', '体验', '表演', '娱乐', '项目', '节目']\n }\n \n for topic_name, patterns in topic_patterns.items():\n matches = 0\n for keyword in keyword_words:\n for pattern in patterns:\n if pattern in keyword or keyword in pattern:\n matches += 1\n break\n \n if matches >= 2: # 至少匹配2个模式\n topics.append(topic_name)\n \n # 如果没有匹配的预定义主题,使用高频关键词\n if not topics:\n topics = [kw[0] for kw in keywords[:5]]\n \n return topics\n \n def 
_count_document_types(self, documents: List[ExtractedDocument]) -> Dict[str, int]:\n \"\"\"统计文档类型\"\"\"\n type_counts = Counter()\n for doc in documents:\n type_counts[doc.file_type] += 1\n return dict(type_counts)\n \n def _collect_metadata(self, all_documents: List[ExtractedDocument], successful_docs: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"收集整合元数据\"\"\"\n total_file_size = sum(doc.file_size for doc in all_documents)\n extraction_methods = Counter(doc.extraction_method for doc in successful_docs if doc.extraction_method)\n \n return {\n 'integration_method': 'content_integrator_v2',\n 'total_documents': len(all_documents),\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': len(all_documents) - len(successful_docs),\n 'total_file_size': total_file_size,\n 'extraction_methods_used': dict(extraction_methods),\n 'processing_timestamp': datetime.now().isoformat(),\n 'content_integration_enabled': self.config.enable_content_integration,\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics\n }\n \n def _create_empty_integration(self, documents: List[ExtractedDocument], reason: str) -> IntegratedContent:\n \"\"\"创建空的整合结果\"\"\"\n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=0,\n document_types=self._count_document_types(documents),\n combined_content=\"\",\n content_summary=f\"整合失败: {reason}\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty', 'failure_reason': reason},\n integrated_at=datetime.now()\n )\n \n def analyze_content_overlap(self, documents: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"分析文档内容重叠情况\"\"\"\n if len(documents) < 2:\n return {'overlap_analysis': '需要至少2个文档进行重叠分析'}\n \n # 简单的重叠分析\n doc_words = []\n for doc in documents:\n if doc.content and not doc.error_info:\n words = set(self._preprocess_text(doc.content).split())\n doc_words.append((doc.filename, words))\n \n if len(doc_words) < 2:\n return {'overlap_analysis': '没有足够的有效文档内容进行分析'}\n \n overlap_results = {}\n \n for i in range(len(doc_words)):\n for j in range(i + 1, len(doc_words)):\n doc1_name, words1 = doc_words[i]\n doc2_name, words2 = doc_words[j]\n \n common_words = words1.intersection(words2)\n total_words = len(words1.union(words2))\n \n if total_words > 0:\n overlap_ratio = len(common_words) / total_words\n overlap_results[f\"{doc1_name} vs {doc2_name}\"] = {\n 'common_words_count': len(common_words),\n 'overlap_ratio': round(overlap_ratio, 3),\n 'common_words_sample': list(common_words)[:10]\n }\n \n return {\n 'overlap_analysis': overlap_results,\n 'total_comparisons': len(overlap_results)\n }\n \n def get_integration_stats(self) -> Dict[str, Any]:\n \"\"\"获取整合器统计信息\"\"\"\n return {\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics,\n 'enable_content_integration': self.config.enable_content_integration,\n 'stop_words_count': len(self.stop_words),\n 'supported_analysis': [\n 'content_combination',\n 'summary_generation', \n 'key_topic_extraction',\n 'content_overlap_analysis',\n 'document_type_statistics'\n ]\n } ",
"code_hash": "5105fa33ca6b0bb75b1bad5e693df5a5"
}
],
"imports": [
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "import",
"modules": [
"re"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"List",
"Dict",
"Any",
"Optional",
"Set"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "dataclasses",
"names": [
"dataclass"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "datetime",
"names": [
"datetime"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "collections",
"names": [
"Counter"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "text_extractor",
"names": [
"ExtractedDocument"
],
"aliases": [],
"level": 1
},
{
"type": "from_import",
"module": "config",
"names": [
"DocumentProcessingConfig"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "exceptions",
"names": [
"DocumentProcessingError"
],
"aliases": [],
"level": 2
}
],
"constants": [],
"docstring": "Content Integrator\n内容整合器 - 重构版本,将多个文档的内容进行整合和分析",
"content_hash": "0b20bd330220e4a3604ff7aef74d05ec"
}
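
Usage note (not part of the extracted JSON above): a minimal, self-contained sketch of the keyword-frequency and topic-matching pipeline that the extracted _preprocess_text, _calculate_word_frequency, _filter_keywords, and _cluster_topics implement. The thresholds (top 50 keywords, minimum frequency 2, top 20 words matched against predefined patterns, at least 2 pattern hits per topic) come from the code above; the helper names, the reduced stop-word set, the two topic patterns, and the sample text are illustrative stand-ins, and config, logging, and error handling are omitted.

import re
from collections import Counter
from typing import Dict, List, Tuple

# Illustrative stand-ins: the real stop-word set has ~60 entries and there are 7 topic patterns.
STOP_WORDS = {'的', '了', '是', '在'}
TOPIC_PATTERNS = {
    '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],
    '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],
}

def preprocess(text: str) -> str:
    # Keep CJK and Latin characters, collapse whitespace (as in _preprocess_text).
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def word_frequency(text: str) -> Dict[str, int]:
    # Count lowercased words of length >= 2 that are not stop words (as in _calculate_word_frequency).
    freq: Counter = Counter()
    for word in text.split():
        word = word.strip().lower()
        if len(word) >= 2 and word not in STOP_WORDS:
            freq[word] += 1
    return dict(freq)

def filter_keywords(freq: Dict[str, int]) -> List[Tuple[str, int]]:
    # Keep the top 50 words by frequency, each occurring at least twice (as in _filter_keywords).
    ranked = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    return [(w, f) for w, f in ranked[:50] if f >= 2]

def cluster_topics(keywords: List[Tuple[str, int]]) -> List[str]:
    # A predefined topic fires when at least 2 of the top-20 keywords match its patterns
    # by substring in either direction (as in _cluster_topics); with no hits, fall back
    # to the 5 highest-frequency keywords.
    top_words = [w for w, _ in keywords[:20]]
    topics = [name for name, patterns in TOPIC_PATTERNS.items()
              if sum(any(p in kw or kw in p for p in patterns) for kw in top_words) >= 2]
    return topics or [w for w, _ in keywords[:5]]

if __name__ == '__main__':
    sample = '门票 门票 价格 价格 开放 开放 时间 时间'
    keywords = filter_keywords(word_frequency(preprocess(sample)))
    print(cluster_topics(keywords))  # ['门票价格', '开放时间']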
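
The overlap_ratio reported by analyze_content_overlap is the Jaccard index of each document pair's preprocessed word sets. A small worked example with made-up inputs (the filenames and word sets below are hypothetical):

from itertools import combinations

def jaccard(a: set, b: set) -> float:
    # |A ∩ B| / |A ∪ B|, the ratio computed pairwise in analyze_content_overlap.
    union = a | b
    return len(a & b) / len(union) if union else 0.0

doc_words = {
    'a.txt': {'景区', '门票', '开放', '时间'},
    'b.txt': {'景区', '门票', '交通'},
}
for (name1, words1), (name2, words2) in combinations(doc_words.items(), 2):
    print(f'{name1} vs {name2}: {round(jaccard(words1, words2), 3)}')
# a.txt vs b.txt: 2 common words out of 5 total -> 0.4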