{ "file_path": "document/content_integrator.py", "file_size": 4125, "line_count": 129, "functions": [ { "name": "__post_init__", "line_start": 21, "line_end": 27, "args": [ { "name": "self" } ], "return_type": null, "docstring": "初始化后处理", "is_async": false, "decorators": [], "code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1", "code_hash": "c3ef16d786a9c2c23091f4f8516362d2" }, { "name": "__init__", "line_start": 32, "line_end": 33, "args": [ { "name": "self" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def __init__(self):\n pass", "code_hash": "daee209c03368e1f5e1bbd6536a5c59d" }, { "name": "integrate_documents", "line_start": 35, "line_end": 79, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "IntegratedContent", "docstring": "整合多个文档\n\nArgs:\n documents: 提取的文档列表\n \nReturns:\n IntegratedContent: 整合后的内容", "is_async": false, "decorators": [], "code": " def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有提供文档内容\",\n key_topics=[]\n )\n \n # 统计文档类型\n document_types = {}\n for doc in documents:\n ext = doc.file_type.lower()\n document_types[ext] = document_types.get(ext, 0) + 1\n \n # 合并内容\n combined_content = self._combine_content(documents)\n total_length = len(combined_content)\n \n # 生成摘要\n content_summary = self._generate_summary(documents)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_length,\n document_types=document_types,\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics\n )", "code_hash": "e20f5e22ceb80b12a11b34adc1bbfec0" }, { "name": "_combine_content", "line_start": 81, "line_end": 97, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "str", "docstring": "合并文档内容", "is_async": false, "decorators": [], "code": " def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n combined = []\n \n for i, doc in enumerate(documents, 1):\n combined.append(f\"=== 文档 {i}: {doc.filename} ===\")\n combined.append(f\"文件类型: {doc.file_type}\")\n combined.append(f\"文件大小: {doc.file_size} 字节\")\n combined.append(f\"提取时间: {doc.extracted_at}\")\n combined.append(\"\")\n combined.append(\"内容:\")\n combined.append(doc.content)\n combined.append(\"\")\n combined.append(\"=\" * 50)\n combined.append(\"\")\n \n return \"\\n\".join(combined)", "code_hash": "a8d95c650807d96892836fca700e3f54" }, { "name": "_generate_summary", "line_start": 99, "line_end": 111, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "str", "docstring": "生成内容摘要", "is_async": false, "decorators": [], "code": " def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not documents:\n return \"没有文档内容\"\n \n summary_parts = []\n summary_parts.append(f\"共处理了 
{len(documents)} 个文档:\")\n \n for i, doc in enumerate(documents, 1):\n content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n \n return \"\\n\".join(summary_parts)", "code_hash": "934fc6f97d5ccb683e0177edd66a0366" }, { "name": "_extract_key_topics", "line_start": 113, "line_end": 130, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "List[str]", "docstring": "提取关键主题(简单的关键词提取)", "is_async": false, "decorators": [], "code": " def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题(简单的关键词提取)\"\"\"\n if not content:\n return []\n \n # 简单的中文关键词提取\n # 这里可以根据需要使用更复杂的NLP方法\n words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n \n # 统计词频\n word_count = {}\n for word in words:\n if len(word) >= 2: # 只考虑长度>=2的词\n word_count[word] = word_count.get(word, 0) + 1\n \n # 返回出现频率最高的前10个词\n sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n return [word for word, count in sorted_words[:10] if count > 1] ", "code_hash": "66d45bfafce3b915b1837e1a633c45a4" } ], "classes": [ { "name": "IntegratedContent", "line_start": 11, "line_end": 27, "bases": [], "methods": [ { "name": "__post_init__", "line_start": 21, "line_end": 27, "args": [ { "name": "self" } ], "return_type": null, "docstring": "初始化后处理", "is_async": false, "decorators": [], "code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1", "code_hash": "c3ef16d786a9c2c23091f4f8516362d2" } ], "docstring": "整合后的内容", "decorators": [ "dataclass" ], "code": "class IntegratedContent:\n \"\"\"整合后的内容\"\"\"\n documents: List[ExtractedDocument]\n document_count: int\n total_content_length: int\n document_types: Dict[str, int]\n combined_content: str\n content_summary: str\n key_topics: List[str]\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1", "code_hash": "8b9786ed613765dcd7c6ed75bf0b69c8" }, { "name": "ContentIntegrator", "line_start": 29, "line_end": 130, "bases": [], "methods": [ { "name": "__init__", "line_start": 32, "line_end": 33, "args": [ { "name": "self" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def __init__(self):\n pass", "code_hash": "daee209c03368e1f5e1bbd6536a5c59d" }, { "name": "integrate_documents", "line_start": 35, "line_end": 79, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "IntegratedContent", "docstring": "整合多个文档\n\nArgs:\n documents: 提取的文档列表\n \nReturns:\n IntegratedContent: 整合后的内容", "is_async": false, "decorators": [], "code": " def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有提供文档内容\",\n key_topics=[]\n )\n \n # 统计文档类型\n document_types = {}\n for doc in documents:\n ext = doc.file_type.lower()\n document_types[ext] = document_types.get(ext, 0) + 1\n \n # 合并内容\n 
combined_content = self._combine_content(documents)\n total_length = len(combined_content)\n \n # 生成摘要\n content_summary = self._generate_summary(documents)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_length,\n document_types=document_types,\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics\n )", "code_hash": "e20f5e22ceb80b12a11b34adc1bbfec0" }, { "name": "_combine_content", "line_start": 81, "line_end": 97, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "str", "docstring": "合并文档内容", "is_async": false, "decorators": [], "code": " def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n combined = []\n \n for i, doc in enumerate(documents, 1):\n combined.append(f\"=== 文档 {i}: {doc.filename} ===\")\n combined.append(f\"文件类型: {doc.file_type}\")\n combined.append(f\"文件大小: {doc.file_size} 字节\")\n combined.append(f\"提取时间: {doc.extracted_at}\")\n combined.append(\"\")\n combined.append(\"内容:\")\n combined.append(doc.content)\n combined.append(\"\")\n combined.append(\"=\" * 50)\n combined.append(\"\")\n \n return \"\\n\".join(combined)", "code_hash": "a8d95c650807d96892836fca700e3f54" }, { "name": "_generate_summary", "line_start": 99, "line_end": 111, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "str", "docstring": "生成内容摘要", "is_async": false, "decorators": [], "code": " def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not documents:\n return \"没有文档内容\"\n \n summary_parts = []\n summary_parts.append(f\"共处理了 {len(documents)} 个文档:\")\n \n for i, doc in enumerate(documents, 1):\n content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n summary_parts.append(f\"{i}. 
{doc.filename} ({doc.file_type}): {content_preview}\")\n \n return \"\\n\".join(summary_parts)", "code_hash": "934fc6f97d5ccb683e0177edd66a0366" }, { "name": "_extract_key_topics", "line_start": 113, "line_end": 130, "args": [ { "name": "self" }, { "name": "content", "type_hint": "str" } ], "return_type": "List[str]", "docstring": "提取关键主题(简单的关键词提取)", "is_async": false, "decorators": [], "code": " def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题(简单的关键词提取)\"\"\"\n if not content:\n return []\n \n # 简单的中文关键词提取\n # 这里可以根据需要使用更复杂的NLP方法\n words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n \n # 统计词频\n word_count = {}\n for word in words:\n if len(word) >= 2: # 只考虑长度>=2的词\n word_count[word] = word_count.get(word, 0) + 1\n \n # 返回出现频率最高的前10个词\n sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n return [word for word, count in sorted_words[:10] if count > 1] ", "code_hash": "66d45bfafce3b915b1837e1a633c45a4" } ], "docstring": "内容整合器 - 整合多个文档的信息", "decorators": [], "code": "class ContentIntegrator:\n \"\"\"内容整合器 - 整合多个文档的信息\"\"\"\n \n def __init__(self):\n pass\n \n def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"整合多个文档\n \n Args:\n documents: 提取的文档列表\n \n Returns:\n IntegratedContent: 整合后的内容\n \"\"\"\n if not documents:\n return IntegratedContent(\n documents=[],\n document_count=0,\n total_content_length=0,\n document_types={},\n combined_content=\"\",\n content_summary=\"没有提供文档内容\",\n key_topics=[]\n )\n \n # 统计文档类型\n document_types = {}\n for doc in documents:\n ext = doc.file_type.lower()\n document_types[ext] = document_types.get(ext, 0) + 1\n \n # 合并内容\n combined_content = self._combine_content(documents)\n total_length = len(combined_content)\n \n # 生成摘要\n content_summary = self._generate_summary(documents)\n \n # 提取关键主题\n key_topics = self._extract_key_topics(combined_content)\n \n return IntegratedContent(\n documents=documents,\n document_count=len(documents),\n total_content_length=total_length,\n document_types=document_types,\n combined_content=combined_content,\n content_summary=content_summary,\n key_topics=key_topics\n )\n \n def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"合并文档内容\"\"\"\n combined = []\n \n for i, doc in enumerate(documents, 1):\n combined.append(f\"=== 文档 {i}: {doc.filename} ===\")\n combined.append(f\"文件类型: {doc.file_type}\")\n combined.append(f\"文件大小: {doc.file_size} 字节\")\n combined.append(f\"提取时间: {doc.extracted_at}\")\n combined.append(\"\")\n combined.append(\"内容:\")\n combined.append(doc.content)\n combined.append(\"\")\n combined.append(\"=\" * 50)\n combined.append(\"\")\n \n return \"\\n\".join(combined)\n \n def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n \"\"\"生成内容摘要\"\"\"\n if not documents:\n return \"没有文档内容\"\n \n summary_parts = []\n summary_parts.append(f\"共处理了 {len(documents)} 个文档:\")\n \n for i, doc in enumerate(documents, 1):\n content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n summary_parts.append(f\"{i}. 
{doc.filename} ({doc.file_type}): {content_preview}\")\n \n return \"\\n\".join(summary_parts)\n \n def _extract_key_topics(self, content: str) -> List[str]:\n \"\"\"提取关键主题(简单的关键词提取)\"\"\"\n if not content:\n return []\n \n # 简单的中文关键词提取\n # 这里可以根据需要使用更复杂的NLP方法\n words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n \n # 统计词频\n word_count = {}\n for word in words:\n if len(word) >= 2: # 只考虑长度>=2的词\n word_count[word] = word_count.get(word, 0) + 1\n \n # 返回出现频率最高的前10个词\n sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n return [word for word, count in sorted_words[:10] if count > 1] ", "code_hash": "dbbcdf46aa1b5c5eb28abc040390b4ef" } ], "imports": [ { "type": "import", "modules": [ "logging" ], "aliases": [] }, { "type": "from_import", "module": "typing", "names": [ "List", "Dict", "Any", "Optional" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "dataclasses", "names": [ "dataclass" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "datetime", "names": [ "datetime" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "text_extractor", "names": [ "ExtractedDocument" ], "aliases": [], "level": 1 }, { "type": "import", "modules": [ "re" ], "aliases": [] } ], "constants": [], "docstring": "", "content_hash": "d2d6195231df575ad4b88dfe689acd0a" }
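

# --- Usage sketch (illustrative, not part of the original module) ---
# Assumes ExtractedDocument accepts keyword arguments for the five fields
# this module reads: filename, file_type, file_size, extracted_at, content.
# The real class in text_extractor may have a different constructor.
if __name__ == "__main__":
    docs = [
        ExtractedDocument(
            filename="notes.txt",
            file_type="txt",
            file_size=66,  # metadata only; not validated against content
            extracted_at="2024-01-01T00:00:00",
            content="人工智能、机器学习。人工智能。",
        ),
    ]
    result = ContentIntegrator().integrate_documents(docs)
    print(result.document_count)   # 1
    print(result.document_types)   # {'txt': 1}
    # Only the run "人工智能" repeats, so it is the sole extracted topic:
    print(result.key_topics)       # ['人工智能']
    print(result.content_summary)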