{
  "file_path": "travel-algorithms/travel_algorithms/document_processing/content_integrator.py",
  "file_size": 15711,
  "line_count": 435,
  "functions": [
    {
      "name": "__post_init__",
      "line_start": 36,
      "line_end": 45,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": null,
      "docstring": "初始化后处理",
      "is_async": false,
      "decorators": [],
      "code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.integrated_at:\n self.integrated_at = datetime.now()\n \n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n file_type = doc.file_type\n self.document_types[file_type] = self.document_types.get(file_type, 0) + 1",
      "code_hash": "dfa2a458566ec2698cc045a4f0be192e"
    },
    {
      "name": "to_dict",
      "line_start": 47,
      "line_end": 59,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": "Dict[str, Any]",
      "docstring": "转换为字典格式",
      "is_async": false,
      "decorators": [],
      "code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'document_count': self.document_count,\n 'total_content_length': self.total_content_length,\n 'document_types': self.document_types,\n 'combined_content': self.combined_content,\n 'content_summary': self.content_summary,\n 'key_topics': self.key_topics,\n 'integration_metadata': self.integration_metadata,\n 'integrated_at': self.integrated_at.isoformat(),\n 'documents': [doc.get_summary() for doc in self.documents]\n }",
      "code_hash": "1d82123b5ec018a20433936b0df16320"
    },
    {
      "name": "get_statistics",
      "line_start": 61,
      "line_end": 75,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": "Dict[str, Any]",
      "docstring": "获取内容统计信息",
      "is_async": false,
      "decorators": [],
      "code": " def get_statistics(self) -> Dict[str, Any]:\n \"\"\"获取内容统计信息\"\"\"\n total_words = sum(len(doc.content.split()) for doc in self.documents)\n successful_docs = [doc for doc in self.documents if not doc.error_info]\n \n return {\n 'total_documents': self.document_count,\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': self.document_count - len(successful_docs),\n 'total_words': total_words,\n 'total_content_length': self.total_content_length,\n 'average_content_length': self.total_content_length / max(1, len(successful_docs)),\n 'document_types': self.document_types,\n 'key_topics_count': len(self.key_topics)\n }",
      "code_hash": "10c87ec40691e2b661e6007409a5634d"
    },
    {
      "name": "__init__",
      "line_start": 84,
      "line_end": 101,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "config",
          "type_hint": "DocumentProcessingConfig"
        }
      ],
      "return_type": null,
      "docstring": "初始化内容整合器\n\nArgs:\n config: 文档处理配置",
      "is_async": false,
      "decorators": [],
      "code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化内容整合器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n \n # 停用词列表(中文)\n self.stop_words = {\n '的', '了', '是', '在', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', \n '你', '会', '着', '没', '看', '好', '自己', '这', '那', '来', '可以', '时候', '我', '他', '她', '它', '们',\n '之', '与', '及', '或', '但', '而', '因为', '所以', '如果', '虽然', '然而', '因此', '于是', '总之',\n '的话', '这样', '那样', '这里', '那里', '现在', '以前', '以后', '当时', '刚才', '马上', '立即'\n }\n \n logger.info(\"内容整合器初始化完成\")",
      "code_hash": "4f448f1348426d8622feeae0c4f064cc"
    },
    {
      "name": "integrate_documents",
      "line_start": 103,
      "line_end": 172,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "IntegratedContent",
      "docstring": "整合多个文档\n\nArgs:\n documents: 提取的文档列表\n \nReturns:\n IntegratedContent: 整合后的内容\n \nRaises:\n DocumentProcessingError: 整合失败时抛出",
      "is_async": false,
      "decorators": [],
      "code": " def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"\n 整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \n Raises:\n DocumentProcessingError: 整合失败时抛出\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有可处理的文档\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty'},\n integrated_at=datetime.now()\n )\n \n try:\n logger.info(f\"开始整合 {len(documents)} 个文档\")\n \n # 过滤成功提取的文档\n successful_docs = [doc for doc in documents if doc.content and not doc.error_info]\n \n if not successful_docs:\n logger.warning(\"没有成功提取的文档内容\")\n return self._create_empty_integration(documents, \"所有文档提取失败\")\n \n # 合并内容\n combined_content = self._combine_content(successful_docs)\n \n # 生成内容摘要\n content_summary = self._generate_summary(combined_content, successful_docs)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n # 收集统计信息\n integration_metadata = self._collect_metadata(documents, successful_docs)\n \n # 计算总内容长度\n total_content_length = sum(len(doc.content) for doc in successful_docs)\n \n integrated_content = IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_content_length,\n document_types=self._count_document_types(documents),\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics,\n integration_metadata=integration_metadata,\n integrated_at=datetime.now()\n )\n \n logger.info(f\"文档整合完成,合并内容长度: {len(combined_content)}\")\n return integrated_content\n \n except Exception as e:\n error_msg = f\"文档整合失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
      "code_hash": "29f02e2616a26e6f18f5e472032c713b"
    },
    {
      "name": "_combine_content",
      "line_start": 174,
      "line_end": 203,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "str",
      "docstring": "合并文档内容",
      "is_async": false,
      "decorators": [],
      "code": " def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n if not documents:\n return \"\"\n \n combined_parts = []\n \n for i, doc in enumerate(documents):\n if doc.content.strip():\n # 添加文档分隔符\n if i > 0:\n combined_parts.append(f\"\\n{'='*50}\\n\")\n \n # 添加文档头信息\n combined_parts.append(f\"文档: {doc.filename} ({doc.file_type})\\n\")\n if doc.page_count:\n combined_parts.append(f\"页数: {doc.page_count}\\n\")\n combined_parts.append(\"-\" * 30 + \"\\n\")\n \n # 添加文档内容\n content = doc.content.strip()\n \n # 应用内容长度限制\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n content = content[:self.config.max_content_length] + \"\\n[内容已截断...]\"\n \n combined_parts.append(content)\n combined_parts.append(\"\\n\")\n \n return \"\".join(combined_parts)",
      "code_hash": "ae1f5bf801d40221bdc66126f230a2d6"
    },
    {
      "name": "_generate_summary",
      "line_start": 205,
      "line_end": 244,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "combined_content",
          "type_hint": "str"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "str",
      "docstring": "生成内容摘要",
      "is_async": false,
      "decorators": [],
      "code": " def _generate_summary(self, combined_content: str, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not combined_content.strip():\n return \"无内容可摘要\"\n \n summary_parts = []\n \n # 基本统计\n word_count = len(combined_content.split())\n char_count = len(combined_content)\n \n summary_parts.append(f\"文档摘要:\")\n summary_parts.append(f\"- 成功处理文档数: {len(documents)}\")\n summary_parts.append(f\"- 总字符数: {char_count:,}\")\n summary_parts.append(f\"- 总词数: {word_count:,}\")\n \n # 文档类型统计\n doc_types = Counter(doc.file_type for doc in documents)\n summary_parts.append(f\"- 文档类型分布: {dict(doc_types)}\")\n \n # 内容长度分析\n content_lengths = [len(doc.content) for doc in documents if doc.content]\n if content_lengths:\n avg_length = sum(content_lengths) / len(content_lengths)\n summary_parts.append(f\"- 平均文档长度: {avg_length:.0f} 字符\")\n \n # 提取前几个段落作为内容预览\n paragraphs = [p.strip() for p in combined_content.split('\\n') if p.strip()]\n preview_paragraphs = []\n \n for para in paragraphs[:5]: # 最多取前5个段落\n if len(para) > 10 and not para.startswith('-') and not para.startswith('='):\n preview_paragraphs.append(para[:100] + \"...\" if len(para) > 100 else para)\n \n if preview_paragraphs:\n summary_parts.append(\"\\n内容预览:\")\n for i, para in enumerate(preview_paragraphs, 1):\n summary_parts.append(f\"{i}. {para}\")\n \n return \"\\n\".join(summary_parts)",
      "code_hash": "317ec07d8e64f2cbf8ff75d02c585220"
    },
    {
      "name": "_extract_key_topics",
      "line_start": 246,
      "line_end": 268,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "content",
          "type_hint": "str"
        }
      ],
      "return_type": "List[str]",
      "docstring": "提取关键主题",
      "is_async": false,
      "decorators": [],
      "code": " def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题\"\"\"\n if not content.strip():\n return []\n \n try:\n # 文本预处理\n text = self._preprocess_text(content)\n \n # 提取词频\n word_freq = self._calculate_word_frequency(text)\n \n # 筛选关键词\n keywords = self._filter_keywords(word_freq)\n \n # 主题聚类(简单版本)\n topics = self._cluster_topics(keywords, content)\n \n return topics[:self.config.max_topics]\n \n except Exception as e:\n logger.warning(f\"关键主题提取失败: {e}\")\n return []",
      "code_hash": "e21880093361157107ba9fe51078854c"
    },
    {
      "name": "_preprocess_text",
      "line_start": 270,
      "line_end": 278,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "text",
          "type_hint": "str"
        }
      ],
      "return_type": "str",
      "docstring": "文本预处理",
      "is_async": false,
      "decorators": [],
      "code": " def _preprocess_text(self, text: str) -> str:\n \"\"\"文本预处理\"\"\"\n # 移除特殊字符和数字\n text = re.sub(r'[^\\u4e00-\\u9fa5a-zA-Z\\s]', ' ', text)\n \n # 移除多余空白\n text = re.sub(r'\\s+', ' ', text).strip()\n \n return text",
      "code_hash": "5d5dc7ad13e8caaaca8afb20038a80e2"
    },
    {
      "name": "_calculate_word_frequency",
      "line_start": 280,
      "line_end": 292,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "text",
          "type_hint": "str"
        }
      ],
      "return_type": "Dict[str, int]",
      "docstring": "计算词频",
      "is_async": false,
      "decorators": [],
      "code": " def _calculate_word_frequency(self, text: str) -> Dict[str, int]:\n \"\"\"计算词频\"\"\"\n words = text.split()\n word_freq = Counter()\n \n for word in words:\n word = word.strip().lower()\n \n # 过滤短词和停用词\n if len(word) >= 2 and word not in self.stop_words:\n word_freq[word] += 1\n \n return dict(word_freq)",
      "code_hash": "60a4dccdbc57da50b752d88025914303"
    },
    {
      "name": "_filter_keywords",
      "line_start": 294,
      "line_end": 305,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "word_freq",
          "type_hint": "Dict[str, int]"
        }
      ],
      "return_type": "List[tuple]",
      "docstring": "筛选关键词",
      "is_async": false,
      "decorators": [],
      "code": " def _filter_keywords(self, word_freq: Dict[str, int]) -> List[tuple]:\n \"\"\"筛选关键词\"\"\"\n # 按频率排序\n sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)\n \n # 取前N个高频词作为关键词\n keywords = []\n for word, freq in sorted_words[:50]: # 最多50个关键词\n if freq >= 2: # 至少出现2次\n keywords.append((word, freq))\n \n return keywords",
      "code_hash": "2c7fb426834120ae71702ca1858d29ee"
    },
    {
      "name": "_cluster_topics",
      "line_start": 307,
      "line_end": 343,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "keywords",
          "type_hint": "List[tuple]"
        },
        {
          "name": "content",
          "type_hint": "str"
        }
      ],
      "return_type": "List[str]",
      "docstring": "简单的主题聚类",
      "is_async": false,
      "decorators": [],
      "code": " def _cluster_topics(self, keywords: List[tuple], content: str) -> List[str]:\n \"\"\"简单的主题聚类\"\"\"\n if not keywords:\n return []\n \n topics = []\n \n # 基于关键词生成主题\n keyword_words = [kw[0] for kw in keywords[:20]]\n \n # 预定义主题模式\n topic_patterns = {\n '景区介绍': ['景区', '景点', '景观', '风景', '旅游', '游览', '参观', '观光'],\n '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],\n '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],\n '交通指南': ['交通', '路线', '公交', '地铁', '自驾', '停车', '导航'],\n '服务设施': ['服务', '设施', '餐厅', '厕所', '休息', '购物', '商店'],\n '注意事项': ['注意', '提醒', '禁止', '安全', '建议', '须知'],\n '活动体验': ['活动', '体验', '表演', '娱乐', '项目', '节目']\n }\n \n for topic_name, patterns in topic_patterns.items():\n matches = 0\n for keyword in keyword_words:\n for pattern in patterns:\n if pattern in keyword or keyword in pattern:\n matches += 1\n break\n \n if matches >= 2: # 至少匹配2个模式\n topics.append(topic_name)\n \n # 如果没有匹配的预定义主题,使用高频关键词\n if not topics:\n topics = [kw[0] for kw in keywords[:5]]\n \n return topics",
      "code_hash": "1da2bbe4375348bb212bb26829916281"
    },
    {
      "name": "_count_document_types",
      "line_start": 345,
      "line_end": 350,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "Dict[str, int]",
      "docstring": "统计文档类型",
      "is_async": false,
      "decorators": [],
      "code": " def _count_document_types(self, documents: List[ExtractedDocument]) -> Dict[str, int]:\n \"\"\"统计文档类型\"\"\"\n type_counts = Counter()\n for doc in documents:\n type_counts[doc.file_type] += 1\n return dict(type_counts)",
      "code_hash": "39f82f3afb3f45311de05455c401570c"
    },
    {
      "name": "_collect_metadata",
      "line_start": 352,
      "line_end": 368,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "all_documents",
          "type_hint": "List[ExtractedDocument]"
        },
        {
          "name": "successful_docs",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "Dict[str, Any]",
      "docstring": "收集整合元数据",
      "is_async": false,
      "decorators": [],
      "code": " def _collect_metadata(self, all_documents: List[ExtractedDocument], successful_docs: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"收集整合元数据\"\"\"\n total_file_size = sum(doc.file_size for doc in all_documents)\n extraction_methods = Counter(doc.extraction_method for doc in successful_docs if doc.extraction_method)\n \n return {\n 'integration_method': 'content_integrator_v2',\n 'total_documents': len(all_documents),\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': len(all_documents) - len(successful_docs),\n 'total_file_size': total_file_size,\n 'extraction_methods_used': dict(extraction_methods),\n 'processing_timestamp': datetime.now().isoformat(),\n 'content_integration_enabled': self.config.enable_content_integration,\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics\n }",
      "code_hash": "4b8f45cdbb97e153a6b3357b24e59555"
    },
    {
      "name": "_create_empty_integration",
      "line_start": 370,
      "line_end": 382,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        },
        {
          "name": "reason",
          "type_hint": "str"
        }
      ],
      "return_type": "IntegratedContent",
      "docstring": "创建空的整合结果",
      "is_async": false,
      "decorators": [],
      "code": " def _create_empty_integration(self, documents: List[ExtractedDocument], reason: str) -> IntegratedContent:\n \"\"\"创建空的整合结果\"\"\"\n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=0,\n document_types=self._count_document_types(documents),\n combined_content=\"\",\n content_summary=f\"整合失败: {reason}\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty', 'failure_reason': reason},\n integrated_at=datetime.now()\n )",
      "code_hash": "dc78387417a3481b4e9de7a4bc8ee97f"
    },
    {
      "name": "analyze_content_overlap",
      "line_start": 384,
      "line_end": 420,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "Dict[str, Any]",
      "docstring": "分析文档内容重叠情况",
      "is_async": false,
      "decorators": [],
      "code": " def analyze_content_overlap(self, documents: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"分析文档内容重叠情况\"\"\"\n if len(documents) < 2:\n return {'overlap_analysis': '需要至少2个文档进行重叠分析'}\n \n # 简单的重叠分析\n doc_words = []\n for doc in documents:\n if doc.content and not doc.error_info:\n words = set(self._preprocess_text(doc.content).split())\n doc_words.append((doc.filename, words))\n \n if len(doc_words) < 2:\n return {'overlap_analysis': '没有足够的有效文档内容进行分析'}\n \n overlap_results = {}\n \n for i in range(len(doc_words)):\n for j in range(i + 1, len(doc_words)):\n doc1_name, words1 = doc_words[i]\n doc2_name, words2 = doc_words[j]\n \n common_words = words1.intersection(words2)\n total_words = len(words1.union(words2))\n \n if total_words > 0:\n overlap_ratio = len(common_words) / total_words\n overlap_results[f\"{doc1_name} vs {doc2_name}\"] = {\n 'common_words_count': len(common_words),\n 'overlap_ratio': round(overlap_ratio, 3),\n 'common_words_sample': list(common_words)[:10]\n }\n \n return {\n 'overlap_analysis': overlap_results,\n 'total_comparisons': len(overlap_results)\n }",
      "code_hash": "29b905f15eb4c80bb9afc905a96b3249"
    },
    {
      "name": "get_integration_stats",
      "line_start": 422,
      "line_end": 436,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": "Dict[str, Any]",
      "docstring": "获取整合器统计信息",
      "is_async": false,
      "decorators": [],
      "code": " def get_integration_stats(self) -> Dict[str, Any]:\n \"\"\"获取整合器统计信息\"\"\"\n return {\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics,\n 'enable_content_integration': self.config.enable_content_integration,\n 'stop_words_count': len(self.stop_words),\n 'supported_analysis': [\n 'content_combination',\n 'summary_generation', \n 'key_topic_extraction',\n 'content_overlap_analysis',\n 'document_type_statistics'\n ]\n } ",
      "code_hash": "3a9ed49f3fe653c211e8c2e0253d87e6"
    }
  ],
  "classes": [
    {
      "name": "IntegratedContent",
      "line_start": 24,
      "line_end": 75,
      "bases": [],
      "methods": [
        {
          "name": "__post_init__",
          "line_start": 36,
          "line_end": 45,
          "args": [
            {
              "name": "self"
            }
          ],
          "return_type": null,
          "docstring": "初始化后处理",
          "is_async": false,
          "decorators": [],
          "code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.integrated_at:\n self.integrated_at = datetime.now()\n \n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n file_type = doc.file_type\n self.document_types[file_type] = self.document_types.get(file_type, 0) + 1",
          "code_hash": "dfa2a458566ec2698cc045a4f0be192e"
        },
        {
          "name": "to_dict",
          "line_start": 47,
          "line_end": 59,
          "args": [
            {
              "name": "self"
            }
          ],
          "return_type": "Dict[str, Any]",
          "docstring": "转换为字典格式",
          "is_async": false,
          "decorators": [],
          "code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'document_count': self.document_count,\n 'total_content_length': self.total_content_length,\n 'document_types': self.document_types,\n 'combined_content': self.combined_content,\n 'content_summary': self.content_summary,\n 'key_topics': self.key_topics,\n 'integration_metadata': self.integration_metadata,\n 'integrated_at': self.integrated_at.isoformat(),\n 'documents': [doc.get_summary() for doc in self.documents]\n }",
          "code_hash": "1d82123b5ec018a20433936b0df16320"
        },
        {
          "name": "get_statistics",
          "line_start": 61,
          "line_end": 75,
          "args": [
            {
              "name": "self"
            }
          ],
          "return_type": "Dict[str, Any]",
          "docstring": "获取内容统计信息",
          "is_async": false,
          "decorators": [],
          "code": " def get_statistics(self) -> Dict[str, Any]:\n \"\"\"获取内容统计信息\"\"\"\n total_words = sum(len(doc.content.split()) for doc in self.documents)\n successful_docs = [doc for doc in self.documents if not doc.error_info]\n \n return {\n 'total_documents': self.document_count,\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': self.document_count - len(successful_docs),\n 'total_words': total_words,\n 'total_content_length': self.total_content_length,\n 'average_content_length': self.total_content_length / max(1, len(successful_docs)),\n 'document_types': self.document_types,\n 'key_topics_count': len(self.key_topics)\n }",
          "code_hash": "10c87ec40691e2b661e6007409a5634d"
        }
      ],
      "docstring": "整合后的内容",
      "decorators": [
        "dataclass"
      ],
      "code": "class IntegratedContent:\n \"\"\"整合后的内容\"\"\"\n documents: List[ExtractedDocument]\n document_count: int\n total_content_length: int\n document_types: Dict[str, int]\n combined_content: str\n content_summary: str\n key_topics: List[str]\n integration_metadata: Dict[str, Any]\n integrated_at: datetime\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.integrated_at:\n self.integrated_at = datetime.now()\n \n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n file_type = doc.file_type\n self.document_types[file_type] = self.document_types.get(file_type, 0) + 1\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'document_count': self.document_count,\n 'total_content_length': self.total_content_length,\n 'document_types': self.document_types,\n 'combined_content': self.combined_content,\n 'content_summary': self.content_summary,\n 'key_topics': self.key_topics,\n 'integration_metadata': self.integration_metadata,\n 'integrated_at': self.integrated_at.isoformat(),\n 'documents': [doc.get_summary() for doc in self.documents]\n }\n \n def get_statistics(self) -> Dict[str, Any]:\n \"\"\"获取内容统计信息\"\"\"\n total_words = sum(len(doc.content.split()) for doc in self.documents)\n successful_docs = [doc for doc in self.documents if not doc.error_info]\n \n return {\n 'total_documents': self.document_count,\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': self.document_count - len(successful_docs),\n 'total_words': total_words,\n 'total_content_length': self.total_content_length,\n 'average_content_length': self.total_content_length / max(1, len(successful_docs)),\n 'document_types': self.document_types,\n 'key_topics_count': len(self.key_topics)\n }",
      "code_hash": "20fc804660dc1b1d8ff4ccea9f20d21e"
    },
    {
      "name": "ContentIntegrator",
      "line_start": 78,
      "line_end": 436,
      "bases": [],
      "methods": [
        {
          "name": "__init__",
          "line_start": 84,
          "line_end": 101,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "config",
              "type_hint": "DocumentProcessingConfig"
            }
          ],
          "return_type": null,
          "docstring": "初始化内容整合器\n\nArgs:\n config: 文档处理配置",
          "is_async": false,
          "decorators": [],
          "code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化内容整合器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n \n # 停用词列表(中文)\n self.stop_words = {\n '的', '了', '是', '在', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', \n '你', '会', '着', '没', '看', '好', '自己', '这', '那', '来', '可以', '时候', '我', '他', '她', '它', '们',\n '之', '与', '及', '或', '但', '而', '因为', '所以', '如果', '虽然', '然而', '因此', '于是', '总之',\n '的话', '这样', '那样', '这里', '那里', '现在', '以前', '以后', '当时', '刚才', '马上', '立即'\n }\n \n logger.info(\"内容整合器初始化完成\")",
          "code_hash": "4f448f1348426d8622feeae0c4f064cc"
        },
        {
          "name": "integrate_documents",
          "line_start": 103,
          "line_end": 172,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "IntegratedContent",
          "docstring": "整合多个文档\n\nArgs:\n documents: 提取的文档列表\n \nReturns:\n IntegratedContent: 整合后的内容\n \nRaises:\n DocumentProcessingError: 整合失败时抛出",
          "is_async": false,
          "decorators": [],
          "code": " def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"\n 整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \n Raises:\n DocumentProcessingError: 整合失败时抛出\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有可处理的文档\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty'},\n integrated_at=datetime.now()\n )\n \n try:\n logger.info(f\"开始整合 {len(documents)} 个文档\")\n \n # 过滤成功提取的文档\n successful_docs = [doc for doc in documents if doc.content and not doc.error_info]\n \n if not successful_docs:\n logger.warning(\"没有成功提取的文档内容\")\n return self._create_empty_integration(documents, \"所有文档提取失败\")\n \n # 合并内容\n combined_content = self._combine_content(successful_docs)\n \n # 生成内容摘要\n content_summary = self._generate_summary(combined_content, successful_docs)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n # 收集统计信息\n integration_metadata = self._collect_metadata(documents, successful_docs)\n \n # 计算总内容长度\n total_content_length = sum(len(doc.content) for doc in successful_docs)\n \n integrated_content = IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_content_length,\n document_types=self._count_document_types(documents),\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics,\n integration_metadata=integration_metadata,\n integrated_at=datetime.now()\n )\n \n logger.info(f\"文档整合完成,合并内容长度: {len(combined_content)}\")\n return integrated_content\n \n except Exception as e:\n error_msg = f\"文档整合失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
          "code_hash": "29f02e2616a26e6f18f5e472032c713b"
        },
        {
          "name": "_combine_content",
          "line_start": 174,
          "line_end": 203,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "str",
          "docstring": "合并文档内容",
          "is_async": false,
          "decorators": [],
          "code": " def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n if not documents:\n return \"\"\n \n combined_parts = []\n \n for i, doc in enumerate(documents):\n if doc.content.strip():\n # 添加文档分隔符\n if i > 0:\n combined_parts.append(f\"\\n{'='*50}\\n\")\n \n # 添加文档头信息\n combined_parts.append(f\"文档: {doc.filename} ({doc.file_type})\\n\")\n if doc.page_count:\n combined_parts.append(f\"页数: {doc.page_count}\\n\")\n combined_parts.append(\"-\" * 30 + \"\\n\")\n \n # 添加文档内容\n content = doc.content.strip()\n \n # 应用内容长度限制\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n content = content[:self.config.max_content_length] + \"\\n[内容已截断...]\"\n \n combined_parts.append(content)\n combined_parts.append(\"\\n\")\n \n return \"\".join(combined_parts)",
          "code_hash": "ae1f5bf801d40221bdc66126f230a2d6"
        },
        {
          "name": "_generate_summary",
          "line_start": 205,
          "line_end": 244,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "combined_content",
              "type_hint": "str"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "str",
          "docstring": "生成内容摘要",
          "is_async": false,
          "decorators": [],
          "code": " def _generate_summary(self, combined_content: str, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not combined_content.strip():\n return \"无内容可摘要\"\n \n summary_parts = []\n \n # 基本统计\n word_count = len(combined_content.split())\n char_count = len(combined_content)\n \n summary_parts.append(f\"文档摘要:\")\n summary_parts.append(f\"- 成功处理文档数: {len(documents)}\")\n summary_parts.append(f\"- 总字符数: {char_count:,}\")\n summary_parts.append(f\"- 总词数: {word_count:,}\")\n \n # 文档类型统计\n doc_types = Counter(doc.file_type for doc in documents)\n summary_parts.append(f\"- 文档类型分布: {dict(doc_types)}\")\n \n # 内容长度分析\n content_lengths = [len(doc.content) for doc in documents if doc.content]\n if content_lengths:\n avg_length = sum(content_lengths) / len(content_lengths)\n summary_parts.append(f\"- 平均文档长度: {avg_length:.0f} 字符\")\n \n # 提取前几个段落作为内容预览\n paragraphs = [p.strip() for p in combined_content.split('\\n') if p.strip()]\n preview_paragraphs = []\n \n for para in paragraphs[:5]: # 最多取前5个段落\n if len(para) > 10 and not para.startswith('-') and not para.startswith('='):\n preview_paragraphs.append(para[:100] + \"...\" if len(para) > 100 else para)\n \n if preview_paragraphs:\n summary_parts.append(\"\\n内容预览:\")\n for i, para in enumerate(preview_paragraphs, 1):\n summary_parts.append(f\"{i}. {para}\")\n \n return \"\\n\".join(summary_parts)",
          "code_hash": "317ec07d8e64f2cbf8ff75d02c585220"
        },
        {
          "name": "_extract_key_topics",
          "line_start": 246,
          "line_end": 268,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "content",
              "type_hint": "str"
            }
          ],
          "return_type": "List[str]",
          "docstring": "提取关键主题",
          "is_async": false,
          "decorators": [],
          "code": " def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题\"\"\"\n if not content.strip():\n return []\n \n try:\n # 文本预处理\n text = self._preprocess_text(content)\n \n # 提取词频\n word_freq = self._calculate_word_frequency(text)\n \n # 筛选关键词\n keywords = self._filter_keywords(word_freq)\n \n # 主题聚类(简单版本)\n topics = self._cluster_topics(keywords, content)\n \n return topics[:self.config.max_topics]\n \n except Exception as e:\n logger.warning(f\"关键主题提取失败: {e}\")\n return []",
          "code_hash": "e21880093361157107ba9fe51078854c"
        },
        {
          "name": "_preprocess_text",
          "line_start": 270,
          "line_end": 278,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "text",
              "type_hint": "str"
            }
          ],
          "return_type": "str",
          "docstring": "文本预处理",
          "is_async": false,
          "decorators": [],
          "code": " def _preprocess_text(self, text: str) -> str:\n \"\"\"文本预处理\"\"\"\n # 移除特殊字符和数字\n text = re.sub(r'[^\\u4e00-\\u9fa5a-zA-Z\\s]', ' ', text)\n \n # 移除多余空白\n text = re.sub(r'\\s+', ' ', text).strip()\n \n return text",
          "code_hash": "5d5dc7ad13e8caaaca8afb20038a80e2"
        },
        {
          "name": "_calculate_word_frequency",
          "line_start": 280,
          "line_end": 292,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "text",
              "type_hint": "str"
            }
          ],
          "return_type": "Dict[str, int]",
          "docstring": "计算词频",
          "is_async": false,
          "decorators": [],
          "code": " def _calculate_word_frequency(self, text: str) -> Dict[str, int]:\n \"\"\"计算词频\"\"\"\n words = text.split()\n word_freq = Counter()\n \n for word in words:\n word = word.strip().lower()\n \n # 过滤短词和停用词\n if len(word) >= 2 and word not in self.stop_words:\n word_freq[word] += 1\n \n return dict(word_freq)",
          "code_hash": "60a4dccdbc57da50b752d88025914303"
        },
        {
          "name": "_filter_keywords",
          "line_start": 294,
          "line_end": 305,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "word_freq",
              "type_hint": "Dict[str, int]"
            }
          ],
          "return_type": "List[tuple]",
          "docstring": "筛选关键词",
          "is_async": false,
          "decorators": [],
          "code": " def _filter_keywords(self, word_freq: Dict[str, int]) -> List[tuple]:\n \"\"\"筛选关键词\"\"\"\n # 按频率排序\n sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)\n \n # 取前N个高频词作为关键词\n keywords = []\n for word, freq in sorted_words[:50]: # 最多50个关键词\n if freq >= 2: # 至少出现2次\n keywords.append((word, freq))\n \n return keywords",
          "code_hash": "2c7fb426834120ae71702ca1858d29ee"
        },
        {
          "name": "_cluster_topics",
          "line_start": 307,
          "line_end": 343,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "keywords",
              "type_hint": "List[tuple]"
            },
            {
              "name": "content",
              "type_hint": "str"
            }
          ],
          "return_type": "List[str]",
          "docstring": "简单的主题聚类",
          "is_async": false,
          "decorators": [],
          "code": " def _cluster_topics(self, keywords: List[tuple], content: str) -> List[str]:\n \"\"\"简单的主题聚类\"\"\"\n if not keywords:\n return []\n \n topics = []\n \n # 基于关键词生成主题\n keyword_words = [kw[0] for kw in keywords[:20]]\n \n # 预定义主题模式\n topic_patterns = {\n '景区介绍': ['景区', '景点', '景观', '风景', '旅游', '游览', '参观', '观光'],\n '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],\n '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],\n '交通指南': ['交通', '路线', '公交', '地铁', '自驾', '停车', '导航'],\n '服务设施': ['服务', '设施', '餐厅', '厕所', '休息', '购物', '商店'],\n '注意事项': ['注意', '提醒', '禁止', '安全', '建议', '须知'],\n '活动体验': ['活动', '体验', '表演', '娱乐', '项目', '节目']\n }\n \n for topic_name, patterns in topic_patterns.items():\n matches = 0\n for keyword in keyword_words:\n for pattern in patterns:\n if pattern in keyword or keyword in pattern:\n matches += 1\n break\n \n if matches >= 2: # 至少匹配2个模式\n topics.append(topic_name)\n \n # 如果没有匹配的预定义主题,使用高频关键词\n if not topics:\n topics = [kw[0] for kw in keywords[:5]]\n \n return topics",
          "code_hash": "1da2bbe4375348bb212bb26829916281"
        },
        {
          "name": "_count_document_types",
          "line_start": 345,
          "line_end": 350,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "Dict[str, int]",
          "docstring": "统计文档类型",
          "is_async": false,
          "decorators": [],
          "code": " def _count_document_types(self, documents: List[ExtractedDocument]) -> Dict[str, int]:\n \"\"\"统计文档类型\"\"\"\n type_counts = Counter()\n for doc in documents:\n type_counts[doc.file_type] += 1\n return dict(type_counts)",
          "code_hash": "39f82f3afb3f45311de05455c401570c"
        },
        {
          "name": "_collect_metadata",
          "line_start": 352,
          "line_end": 368,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "all_documents",
              "type_hint": "List[ExtractedDocument]"
            },
            {
              "name": "successful_docs",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "Dict[str, Any]",
          "docstring": "收集整合元数据",
          "is_async": false,
          "decorators": [],
          "code": " def _collect_metadata(self, all_documents: List[ExtractedDocument], successful_docs: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"收集整合元数据\"\"\"\n total_file_size = sum(doc.file_size for doc in all_documents)\n extraction_methods = Counter(doc.extraction_method for doc in successful_docs if doc.extraction_method)\n \n return {\n 'integration_method': 'content_integrator_v2',\n 'total_documents': len(all_documents),\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': len(all_documents) - len(successful_docs),\n 'total_file_size': total_file_size,\n 'extraction_methods_used': dict(extraction_methods),\n 'processing_timestamp': datetime.now().isoformat(),\n 'content_integration_enabled': self.config.enable_content_integration,\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics\n }",
          "code_hash": "4b8f45cdbb97e153a6b3357b24e59555"
        },
        {
          "name": "_create_empty_integration",
          "line_start": 370,
          "line_end": 382,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            },
            {
              "name": "reason",
              "type_hint": "str"
            }
          ],
          "return_type": "IntegratedContent",
          "docstring": "创建空的整合结果",
          "is_async": false,
          "decorators": [],
          "code": " def _create_empty_integration(self, documents: List[ExtractedDocument], reason: str) -> IntegratedContent:\n \"\"\"创建空的整合结果\"\"\"\n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=0,\n document_types=self._count_document_types(documents),\n combined_content=\"\",\n content_summary=f\"整合失败: {reason}\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty', 'failure_reason': reason},\n integrated_at=datetime.now()\n )",
          "code_hash": "dc78387417a3481b4e9de7a4bc8ee97f"
        },
        {
          "name": "analyze_content_overlap",
          "line_start": 384,
          "line_end": 420,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "Dict[str, Any]",
          "docstring": "分析文档内容重叠情况",
          "is_async": false,
          "decorators": [],
          "code": " def analyze_content_overlap(self, documents: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"分析文档内容重叠情况\"\"\"\n if len(documents) < 2:\n return {'overlap_analysis': '需要至少2个文档进行重叠分析'}\n \n # 简单的重叠分析\n doc_words = []\n for doc in documents:\n if doc.content and not doc.error_info:\n words = set(self._preprocess_text(doc.content).split())\n doc_words.append((doc.filename, words))\n \n if len(doc_words) < 2:\n return {'overlap_analysis': '没有足够的有效文档内容进行分析'}\n \n overlap_results = {}\n \n for i in range(len(doc_words)):\n for j in range(i + 1, len(doc_words)):\n doc1_name, words1 = doc_words[i]\n doc2_name, words2 = doc_words[j]\n \n common_words = words1.intersection(words2)\n total_words = len(words1.union(words2))\n \n if total_words > 0:\n overlap_ratio = len(common_words) / total_words\n overlap_results[f\"{doc1_name} vs {doc2_name}\"] = {\n 'common_words_count': len(common_words),\n 'overlap_ratio': round(overlap_ratio, 3),\n 'common_words_sample': list(common_words)[:10]\n }\n \n return {\n 'overlap_analysis': overlap_results,\n 'total_comparisons': len(overlap_results)\n }",
          "code_hash": "29b905f15eb4c80bb9afc905a96b3249"
        },
        {
          "name": "get_integration_stats",
          "line_start": 422,
          "line_end": 436,
          "args": [
            {
              "name": "self"
            }
          ],
          "return_type": "Dict[str, Any]",
          "docstring": "获取整合器统计信息",
          "is_async": false,
          "decorators": [],
          "code": " def get_integration_stats(self) -> Dict[str, Any]:\n \"\"\"获取整合器统计信息\"\"\"\n return {\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics,\n 'enable_content_integration': self.config.enable_content_integration,\n 'stop_words_count': len(self.stop_words),\n 'supported_analysis': [\n 'content_combination',\n 'summary_generation', \n 'key_topic_extraction',\n 'content_overlap_analysis',\n 'document_type_statistics'\n ]\n } ",
          "code_hash": "3a9ed49f3fe653c211e8c2e0253d87e6"
        }
      ],
      "docstring": "内容整合器 - 重构版本\n负责整合多个文档的信息,提取关键内容和主题",
      "decorators": [],
      "code": "class ContentIntegrator:\n \"\"\"\n 内容整合器 - 重构版本\n 负责整合多个文档的信息,提取关键内容和主题\n \"\"\"\n \n def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化内容整合器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n \n # 停用词列表(中文)\n self.stop_words = {\n '的', '了', '是', '在', '有', '和', '就', '不', '人', '都', '一', '个', '上', '也', '很', '到', '说', '要', '去', \n '你', '会', '着', '没', '看', '好', '自己', '这', '那', '来', '可以', '时候', '我', '他', '她', '它', '们',\n '之', '与', '及', '或', '但', '而', '因为', '所以', '如果', '虽然', '然而', '因此', '于是', '总之',\n '的话', '这样', '那样', '这里', '那里', '现在', '以前', '以后', '当时', '刚才', '马上', '立即'\n }\n \n logger.info(\"内容整合器初始化完成\")\n \n def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"\n 整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \n Raises:\n DocumentProcessingError: 整合失败时抛出\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有可处理的文档\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty'},\n integrated_at=datetime.now()\n )\n \n try:\n logger.info(f\"开始整合 {len(documents)} 个文档\")\n \n # 过滤成功提取的文档\n successful_docs = [doc for doc in documents if doc.content and not doc.error_info]\n \n if not successful_docs:\n logger.warning(\"没有成功提取的文档内容\")\n return self._create_empty_integration(documents, \"所有文档提取失败\")\n \n # 合并内容\n combined_content = self._combine_content(successful_docs)\n \n # 生成内容摘要\n content_summary = self._generate_summary(combined_content, successful_docs)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n # 收集统计信息\n integration_metadata = self._collect_metadata(documents, successful_docs)\n \n # 计算总内容长度\n total_content_length = sum(len(doc.content) for doc in successful_docs)\n \n integrated_content = IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_content_length,\n document_types=self._count_document_types(documents),\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics,\n integration_metadata=integration_metadata,\n integrated_at=datetime.now()\n )\n \n logger.info(f\"文档整合完成,合并内容长度: {len(combined_content)}\")\n return integrated_content\n \n except Exception as e:\n error_msg = f\"文档整合失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)\n \n def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n if not documents:\n return \"\"\n \n combined_parts = []\n \n for i, doc in enumerate(documents):\n if doc.content.strip():\n # 添加文档分隔符\n if i > 0:\n combined_parts.append(f\"\\n{'='*50}\\n\")\n \n # 添加文档头信息\n combined_parts.append(f\"文档: {doc.filename} ({doc.file_type})\\n\")\n if doc.page_count:\n combined_parts.append(f\"页数: {doc.page_count}\\n\")\n combined_parts.append(\"-\" * 30 + \"\\n\")\n \n # 添加文档内容\n content = doc.content.strip()\n \n # 应用内容长度限制\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n content = content[:self.config.max_content_length] + \"\\n[内容已截断...]\"\n \n combined_parts.append(content)\n combined_parts.append(\"\\n\")\n \n return \"\".join(combined_parts)\n \n def _generate_summary(self, combined_content: str, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not combined_content.strip():\n return \"无内容可摘要\"\n \n summary_parts = []\n \n # 基本统计\n word_count = len(combined_content.split())\n char_count = len(combined_content)\n \n summary_parts.append(f\"文档摘要:\")\n summary_parts.append(f\"- 成功处理文档数: {len(documents)}\")\n summary_parts.append(f\"- 总字符数: {char_count:,}\")\n summary_parts.append(f\"- 总词数: {word_count:,}\")\n \n # 文档类型统计\n doc_types = Counter(doc.file_type for doc in documents)\n summary_parts.append(f\"- 文档类型分布: {dict(doc_types)}\")\n \n # 内容长度分析\n content_lengths = [len(doc.content) for doc in documents if doc.content]\n if content_lengths:\n avg_length = sum(content_lengths) / len(content_lengths)\n summary_parts.append(f\"- 平均文档长度: {avg_length:.0f} 字符\")\n \n # 提取前几个段落作为内容预览\n paragraphs = [p.strip() for p in combined_content.split('\\n') if p.strip()]\n preview_paragraphs = []\n \n for para in paragraphs[:5]: # 最多取前5个段落\n if len(para) > 10 and not para.startswith('-') and not para.startswith('='):\n preview_paragraphs.append(para[:100] + \"...\" if len(para) > 100 else para)\n \n if preview_paragraphs:\n summary_parts.append(\"\\n内容预览:\")\n for i, para in enumerate(preview_paragraphs, 1):\n summary_parts.append(f\"{i}. {para}\")\n \n return \"\\n\".join(summary_parts)\n \n def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题\"\"\"\n if not content.strip():\n return []\n \n try:\n # 文本预处理\n text = self._preprocess_text(content)\n \n # 提取词频\n word_freq = self._calculate_word_frequency(text)\n \n # 筛选关键词\n keywords = self._filter_keywords(word_freq)\n \n # 主题聚类(简单版本)\n topics = self._cluster_topics(keywords, content)\n \n return topics[:self.config.max_topics]\n \n except Exception as e:\n logger.warning(f\"关键主题提取失败: {e}\")\n return []\n \n def _preprocess_text(self, text: str) -> str:\n \"\"\"文本预处理\"\"\"\n # 移除特殊字符和数字\n text = re.sub(r'[^\\u4e00-\\u9fa5a-zA-Z\\s]', ' ', text)\n \n # 移除多余空白\n text = re.sub(r'\\s+', ' ', text).strip()\n \n return text\n \n def _calculate_word_frequency(self, text: str) -> Dict[str, int]:\n \"\"\"计算词频\"\"\"\n words = text.split()\n word_freq = Counter()\n \n for word in words:\n word = word.strip().lower()\n \n # 过滤短词和停用词\n if len(word) >= 2 and word not in self.stop_words:\n word_freq[word] += 1\n \n return dict(word_freq)\n \n def _filter_keywords(self, word_freq: Dict[str, int]) -> List[tuple]:\n \"\"\"筛选关键词\"\"\"\n # 按频率排序\n sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)\n \n # 取前N个高频词作为关键词\n keywords = []\n for word, freq in sorted_words[:50]: # 最多50个关键词\n if freq >= 2: # 至少出现2次\n keywords.append((word, freq))\n \n return keywords\n \n def _cluster_topics(self, keywords: List[tuple], content: str) -> List[str]:\n \"\"\"简单的主题聚类\"\"\"\n if not keywords:\n return []\n \n topics = []\n \n # 基于关键词生成主题\n keyword_words = [kw[0] for kw in keywords[:20]]\n \n # 预定义主题模式\n topic_patterns = {\n '景区介绍': ['景区', '景点', '景观', '风景', '旅游', '游览', '参观', '观光'],\n '门票价格': ['门票', '价格', '费用', '收费', '票价', '优惠', '折扣'],\n '开放时间': ['时间', '开放', '营业', '闭园', '时段', '时刻'],\n '交通指南': ['交通', '路线', '公交', '地铁', '自驾', '停车', '导航'],\n '服务设施': ['服务', '设施', '餐厅', '厕所', '休息', '购物', '商店'],\n '注意事项': ['注意', '提醒', '禁止', '安全', '建议', '须知'],\n '活动体验': ['活动', '体验', '表演', '娱乐', '项目', '节目']\n }\n \n for topic_name, patterns in topic_patterns.items():\n matches = 0\n for keyword in keyword_words:\n for pattern in patterns:\n if pattern in keyword or keyword in pattern:\n matches += 1\n break\n \n if matches >= 2: # 至少匹配2个模式\n topics.append(topic_name)\n \n # 如果没有匹配的预定义主题,使用高频关键词\n if not topics:\n topics = [kw[0] for kw in keywords[:5]]\n \n return topics\n \n def _count_document_types(self, documents: List[ExtractedDocument]) -> Dict[str, int]:\n \"\"\"统计文档类型\"\"\"\n type_counts = Counter()\n for doc in documents:\n type_counts[doc.file_type] += 1\n return dict(type_counts)\n \n def _collect_metadata(self, all_documents: List[ExtractedDocument], successful_docs: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"收集整合元数据\"\"\"\n total_file_size = sum(doc.file_size for doc in all_documents)\n extraction_methods = Counter(doc.extraction_method for doc in successful_docs if doc.extraction_method)\n \n return {\n 'integration_method': 'content_integrator_v2',\n 'total_documents': len(all_documents),\n 'successful_extractions': len(successful_docs),\n 'failed_extractions': len(all_documents) - len(successful_docs),\n 'total_file_size': total_file_size,\n 'extraction_methods_used': dict(extraction_methods),\n 'processing_timestamp': datetime.now().isoformat(),\n 'content_integration_enabled': self.config.enable_content_integration,\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics\n }\n \n def _create_empty_integration(self, documents: List[ExtractedDocument], reason: str) -> IntegratedContent:\n \"\"\"创建空的整合结果\"\"\"\n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=0,\n document_types=self._count_document_types(documents),\n combined_content=\"\",\n content_summary=f\"整合失败: {reason}\",\n key_topics=[],\n integration_metadata={'integration_method': 'empty', 'failure_reason': reason},\n integrated_at=datetime.now()\n )\n \n def analyze_content_overlap(self, documents: List[ExtractedDocument]) -> Dict[str, Any]:\n \"\"\"分析文档内容重叠情况\"\"\"\n if len(documents) < 2:\n return {'overlap_analysis': '需要至少2个文档进行重叠分析'}\n \n # 简单的重叠分析\n doc_words = []\n for doc in documents:\n if doc.content and not doc.error_info:\n words = set(self._preprocess_text(doc.content).split())\n doc_words.append((doc.filename, words))\n \n if len(doc_words) < 2:\n return {'overlap_analysis': '没有足够的有效文档内容进行分析'}\n \n overlap_results = {}\n \n for i in range(len(doc_words)):\n for j in range(i + 1, len(doc_words)):\n doc1_name, words1 = doc_words[i]\n doc2_name, words2 = doc_words[j]\n \n common_words = words1.intersection(words2)\n total_words = len(words1.union(words2))\n \n if total_words > 0:\n overlap_ratio = len(common_words) / total_words\n overlap_results[f\"{doc1_name} vs {doc2_name}\"] = {\n 'common_words_count': len(common_words),\n 'overlap_ratio': round(overlap_ratio, 3),\n 'common_words_sample': list(common_words)[:10]\n }\n \n return {\n 'overlap_analysis': overlap_results,\n 'total_comparisons': len(overlap_results)\n }\n \n def get_integration_stats(self) -> Dict[str, Any]:\n \"\"\"获取整合器统计信息\"\"\"\n return {\n 'max_content_length': self.config.max_content_length,\n 'max_topics': self.config.max_topics,\n 'enable_content_integration': self.config.enable_content_integration,\n 'stop_words_count': len(self.stop_words),\n 'supported_analysis': [\n 'content_combination',\n 'summary_generation', \n 'key_topic_extraction',\n 'content_overlap_analysis',\n 'document_type_statistics'\n ]\n } ",
      "code_hash": "5105fa33ca6b0bb75b1bad5e693df5a5"
    }
  ],
  "imports": [
    {
      "type": "import",
      "modules": [
        "logging"
      ],
      "aliases": []
    },
    {
      "type": "import",
      "modules": [
        "re"
      ],
      "aliases": []
    },
    {
      "type": "from_import",
      "module": "typing",
      "names": [
        "List",
        "Dict",
        "Any",
        "Optional",
        "Set"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "dataclasses",
      "names": [
        "dataclass"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "datetime",
      "names": [
        "datetime"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "collections",
      "names": [
        "Counter"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "text_extractor",
      "names": [
        "ExtractedDocument"
      ],
      "aliases": [],
      "level": 1
    },
    {
      "type": "from_import",
      "module": "config",
      "names": [
        "DocumentProcessingConfig"
      ],
      "aliases": [],
      "level": 2
    },
    {
      "type": "from_import",
      "module": "exceptions",
      "names": [
        "DocumentProcessingError"
      ],
      "aliases": [],
      "level": 2
    }
  ],
  "constants": [],
  "docstring": "Content Integrator\n内容整合器 - 重构版本,将多个文档的内容进行整合和分析",
  "content_hash": "0b20bd330220e4a3604ff7aef74d05ec"
}