{
  "file_path": "document/content_integrator.py",
  "file_size": 4125,
  "line_count": 129,
  "functions": [
    {
      "name": "__post_init__",
      "line_start": 21,
      "line_end": 27,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": null,
      "docstring": "Post-initialization processing",
      "is_async": false,
      "decorators": [],
      "code": "    def __post_init__(self):\n        \"\"\"Post-initialization processing\"\"\"\n        if not self.document_types:\n            self.document_types = {}\n            for doc in self.documents:\n                ext = doc.file_type.lower()\n                self.document_types[ext] = self.document_types.get(ext, 0) + 1",
      "code_hash": "c3ef16d786a9c2c23091f4f8516362d2"
    },
    {
      "name": "__init__",
      "line_start": 32,
      "line_end": 33,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": null,
      "docstring": "",
      "is_async": false,
      "decorators": [],
      "code": "    def __init__(self):\n        pass",
      "code_hash": "daee209c03368e1f5e1bbd6536a5c59d"
    },
    {
      "name": "integrate_documents",
      "line_start": 35,
      "line_end": 79,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "IntegratedContent",
      "docstring": "Integrate multiple documents\n\nArgs:\n    documents: list of extracted documents\n\nReturns:\n    IntegratedContent: the integrated content",
      "is_async": false,
      "decorators": [],
      "code": "    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n        \"\"\"Integrate multiple documents\n\n        Args:\n            documents: list of extracted documents\n\n        Returns:\n            IntegratedContent: the integrated content\n        \"\"\"\n        if not documents:\n            return IntegratedContent(\n                documents=[],\n                document_count=0,\n                total_content_length=0,\n                document_types={},\n                combined_content=\"\",\n                content_summary=\"No document content provided\",\n                key_topics=[]\n            )\n\n        # Count document types\n        document_types = {}\n        for doc in documents:\n            ext = doc.file_type.lower()\n            document_types[ext] = document_types.get(ext, 0) + 1\n\n        # Combine contents\n        combined_content = self._combine_content(documents)\n        total_length = len(combined_content)\n\n        # Generate summary\n        content_summary = self._generate_summary(documents)\n\n        # Extract key topics\n        key_topics = self._extract_key_topics(combined_content)\n\n        return IntegratedContent(\n            documents=documents,\n            document_count=len(documents),\n            total_content_length=total_length,\n            document_types=document_types,\n            combined_content=combined_content,\n            content_summary=content_summary,\n            key_topics=key_topics\n        )",
      "code_hash": "e20f5e22ceb80b12a11b34adc1bbfec0"
    },
    {
      "name": "_combine_content",
      "line_start": 81,
      "line_end": 97,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "str",
      "docstring": "Combine document contents",
      "is_async": false,
      "decorators": [],
      "code": "    def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Combine document contents\"\"\"\n        combined = []\n\n        for i, doc in enumerate(documents, 1):\n            combined.append(f\"=== Document {i}: {doc.filename} ===\")\n            combined.append(f\"File type: {doc.file_type}\")\n            combined.append(f\"File size: {doc.file_size} bytes\")\n            combined.append(f\"Extracted at: {doc.extracted_at}\")\n            combined.append(\"\")\n            combined.append(\"Content:\")\n            combined.append(doc.content)\n            combined.append(\"\")\n            combined.append(\"=\" * 50)\n            combined.append(\"\")\n\n        return \"\\n\".join(combined)",
      "code_hash": "a8d95c650807d96892836fca700e3f54"
    },
    {
      "name": "_generate_summary",
      "line_start": 99,
      "line_end": 111,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "str",
      "docstring": "Generate a content summary",
      "is_async": false,
      "decorators": [],
      "code": "    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Generate a content summary\"\"\"\n        if not documents:\n            return \"No document content\"\n\n        summary_parts = []\n        summary_parts.append(f\"Processed {len(documents)} document(s):\")\n\n        for i, doc in enumerate(documents, 1):\n            content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n            summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n\n        return \"\\n\".join(summary_parts)",
      "code_hash": "934fc6f97d5ccb683e0177edd66a0366"
    },
    {
      "name": "_extract_key_topics",
      "line_start": 113,
      "line_end": 130,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "content",
          "type_hint": "str"
        }
      ],
      "return_type": "List[str]",
      "docstring": "Extract key topics (simple keyword extraction)",
      "is_async": false,
      "decorators": [],
      "code": "    def _extract_key_topics(self, content: str) -> List[str]:\n        \"\"\"Extract key topics (simple keyword extraction)\"\"\"\n        if not content:\n            return []\n\n        # Simple Chinese keyword extraction; more sophisticated\n        # NLP methods can be substituted here if needed\n        words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n\n        # Count word frequencies\n        word_count = {}\n        for word in words:\n            if len(word) >= 2:  # only consider words of length >= 2\n                word_count[word] = word_count.get(word, 0) + 1\n\n        # Return the 10 most frequent words\n        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n        return [word for word, count in sorted_words[:10] if count > 1]",
      "code_hash": "66d45bfafce3b915b1837e1a633c45a4"
    }
  ],
"classes": [
|
|
{
|
|
"name": "IntegratedContent",
|
|
"line_start": 11,
|
|
"line_end": 27,
|
|
"bases": [],
|
|
"methods": [
|
|
{
|
|
"name": "__post_init__",
|
|
"line_start": 21,
|
|
"line_end": 27,
|
|
"args": [
|
|
{
|
|
"name": "self"
|
|
}
|
|
],
|
|
"return_type": null,
|
|
"docstring": "初始化后处理",
|
|
"is_async": false,
|
|
"decorators": [],
|
|
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1",
|
|
"code_hash": "c3ef16d786a9c2c23091f4f8516362d2"
|
|
}
|
|
],
|
|
"docstring": "整合后的内容",
|
|
"decorators": [
|
|
"dataclass"
|
|
],
|
|
"code": "class IntegratedContent:\n \"\"\"整合后的内容\"\"\"\n documents: List[ExtractedDocument]\n document_count: int\n total_content_length: int\n document_types: Dict[str, int]\n combined_content: str\n content_summary: str\n key_topics: List[str]\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1",
|
|
"code_hash": "8b9786ed613765dcd7c6ed75bf0b69c8"
|
|
},
|
|
    {
      "name": "ContentIntegrator",
      "line_start": 29,
      "line_end": 130,
      "bases": [],
      "methods": [
        {
          "name": "__init__",
          "line_start": 32,
          "line_end": 33,
          "args": [
            {
              "name": "self"
            }
          ],
          "return_type": null,
          "docstring": "",
          "is_async": false,
          "decorators": [],
          "code": "    def __init__(self):\n        pass",
          "code_hash": "daee209c03368e1f5e1bbd6536a5c59d"
        },
        {
          "name": "integrate_documents",
          "line_start": 35,
          "line_end": 79,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "IntegratedContent",
          "docstring": "Integrate multiple documents\n\nArgs:\n    documents: list of extracted documents\n\nReturns:\n    IntegratedContent: the integrated content",
          "is_async": false,
          "decorators": [],
          "code": "    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n        \"\"\"Integrate multiple documents\n\n        Args:\n            documents: list of extracted documents\n\n        Returns:\n            IntegratedContent: the integrated content\n        \"\"\"\n        if not documents:\n            return IntegratedContent(\n                documents=[],\n                document_count=0,\n                total_content_length=0,\n                document_types={},\n                combined_content=\"\",\n                content_summary=\"No document content provided\",\n                key_topics=[]\n            )\n\n        # Count document types\n        document_types = {}\n        for doc in documents:\n            ext = doc.file_type.lower()\n            document_types[ext] = document_types.get(ext, 0) + 1\n\n        # Combine contents\n        combined_content = self._combine_content(documents)\n        total_length = len(combined_content)\n\n        # Generate summary\n        content_summary = self._generate_summary(documents)\n\n        # Extract key topics\n        key_topics = self._extract_key_topics(combined_content)\n\n        return IntegratedContent(\n            documents=documents,\n            document_count=len(documents),\n            total_content_length=total_length,\n            document_types=document_types,\n            combined_content=combined_content,\n            content_summary=content_summary,\n            key_topics=key_topics\n        )",
          "code_hash": "e20f5e22ceb80b12a11b34adc1bbfec0"
        },
        {
          "name": "_combine_content",
          "line_start": 81,
          "line_end": 97,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "str",
          "docstring": "Combine document contents",
          "is_async": false,
          "decorators": [],
          "code": "    def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Combine document contents\"\"\"\n        combined = []\n\n        for i, doc in enumerate(documents, 1):\n            combined.append(f\"=== Document {i}: {doc.filename} ===\")\n            combined.append(f\"File type: {doc.file_type}\")\n            combined.append(f\"File size: {doc.file_size} bytes\")\n            combined.append(f\"Extracted at: {doc.extracted_at}\")\n            combined.append(\"\")\n            combined.append(\"Content:\")\n            combined.append(doc.content)\n            combined.append(\"\")\n            combined.append(\"=\" * 50)\n            combined.append(\"\")\n\n        return \"\\n\".join(combined)",
          "code_hash": "a8d95c650807d96892836fca700e3f54"
        },
        {
          "name": "_generate_summary",
          "line_start": 99,
          "line_end": 111,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "str",
          "docstring": "Generate a content summary",
          "is_async": false,
          "decorators": [],
          "code": "    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Generate a content summary\"\"\"\n        if not documents:\n            return \"No document content\"\n\n        summary_parts = []\n        summary_parts.append(f\"Processed {len(documents)} document(s):\")\n\n        for i, doc in enumerate(documents, 1):\n            content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n            summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n\n        return \"\\n\".join(summary_parts)",
          "code_hash": "934fc6f97d5ccb683e0177edd66a0366"
        },
        {
          "name": "_extract_key_topics",
          "line_start": 113,
          "line_end": 130,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "content",
              "type_hint": "str"
            }
          ],
          "return_type": "List[str]",
          "docstring": "Extract key topics (simple keyword extraction)",
          "is_async": false,
          "decorators": [],
          "code": "    def _extract_key_topics(self, content: str) -> List[str]:\n        \"\"\"Extract key topics (simple keyword extraction)\"\"\"\n        if not content:\n            return []\n\n        # Simple Chinese keyword extraction; more sophisticated\n        # NLP methods can be substituted here if needed\n        words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n\n        # Count word frequencies\n        word_count = {}\n        for word in words:\n            if len(word) >= 2:  # only consider words of length >= 2\n                word_count[word] = word_count.get(word, 0) + 1\n\n        # Return the 10 most frequent words\n        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n        return [word for word, count in sorted_words[:10] if count > 1]",
          "code_hash": "66d45bfafce3b915b1837e1a633c45a4"
        }
      ],
      "docstring": "Content integrator - integrates information from multiple documents",
      "decorators": [],
      "code": "class ContentIntegrator:\n    \"\"\"Content integrator - integrates information from multiple documents\"\"\"\n\n    def __init__(self):\n        pass\n\n    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n        \"\"\"Integrate multiple documents\n\n        Args:\n            documents: list of extracted documents\n\n        Returns:\n            IntegratedContent: the integrated content\n        \"\"\"\n        if not documents:\n            return IntegratedContent(\n                documents=[],\n                document_count=0,\n                total_content_length=0,\n                document_types={},\n                combined_content=\"\",\n                content_summary=\"No document content provided\",\n                key_topics=[]\n            )\n\n        # Count document types\n        document_types = {}\n        for doc in documents:\n            ext = doc.file_type.lower()\n            document_types[ext] = document_types.get(ext, 0) + 1\n\n        # Combine contents\n        combined_content = self._combine_content(documents)\n        total_length = len(combined_content)\n\n        # Generate summary\n        content_summary = self._generate_summary(documents)\n\n        # Extract key topics\n        key_topics = self._extract_key_topics(combined_content)\n\n        return IntegratedContent(\n            documents=documents,\n            document_count=len(documents),\n            total_content_length=total_length,\n            document_types=document_types,\n            combined_content=combined_content,\n            content_summary=content_summary,\n            key_topics=key_topics\n        )\n\n    def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Combine document contents\"\"\"\n        combined = []\n\n        for i, doc in enumerate(documents, 1):\n            combined.append(f\"=== Document {i}: {doc.filename} ===\")\n            combined.append(f\"File type: {doc.file_type}\")\n            combined.append(f\"File size: {doc.file_size} bytes\")\n            combined.append(f\"Extracted at: {doc.extracted_at}\")\n            combined.append(\"\")\n            combined.append(\"Content:\")\n            combined.append(doc.content)\n            combined.append(\"\")\n            combined.append(\"=\" * 50)\n            combined.append(\"\")\n\n        return \"\\n\".join(combined)\n\n    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Generate a content summary\"\"\"\n        if not documents:\n            return \"No document content\"\n\n        summary_parts = []\n        summary_parts.append(f\"Processed {len(documents)} document(s):\")\n\n        for i, doc in enumerate(documents, 1):\n            content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n            summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n\n        return \"\\n\".join(summary_parts)\n\n    def _extract_key_topics(self, content: str) -> List[str]:\n        \"\"\"Extract key topics (simple keyword extraction)\"\"\"\n        if not content:\n            return []\n\n        # Simple Chinese keyword extraction; more sophisticated\n        # NLP methods can be substituted here if needed\n        words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n\n        # Count word frequencies\n        word_count = {}\n        for word in words:\n            if len(word) >= 2:  # only consider words of length >= 2\n                word_count[word] = word_count.get(word, 0) + 1\n\n        # Return the 10 most frequent words\n        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n        return [word for word, count in sorted_words[:10] if count > 1]",
      "code_hash": "dbbcdf46aa1b5c5eb28abc040390b4ef"
    }
  ],
  "imports": [
    {
      "type": "import",
      "modules": [
        "logging"
      ],
      "aliases": []
    },
    {
      "type": "from_import",
      "module": "typing",
      "names": [
        "List",
        "Dict",
        "Any",
        "Optional"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "dataclasses",
      "names": [
        "dataclass"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "datetime",
      "names": [
        "datetime"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "text_extractor",
      "names": [
        "ExtractedDocument"
      ],
      "aliases": [],
      "level": 1
    },
    {
      "type": "import",
      "modules": [
        "re"
      ],
      "aliases": []
    }
  ],
  "constants": [],
  "docstring": "",
  "content_hash": "d2d6195231df575ad4b88dfe689acd0a"
}