{
  "file_path": "document/content_integrator.py",
  "file_size": 4125,
  "line_count": 129,
  "functions": [
    {
      "name": "__post_init__",
      "line_start": 21,
      "line_end": 27,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": null,
      "docstring": "Post-initialization processing",
      "is_async": false,
      "decorators": [],
      "code": "    def __post_init__(self):\n        \"\"\"Post-initialization processing\"\"\"\n        if not self.document_types:\n            self.document_types = {}\n            for doc in self.documents:\n                ext = doc.file_type.lower()\n                self.document_types[ext] = self.document_types.get(ext, 0) + 1",
      "code_hash": "c3ef16d786a9c2c23091f4f8516362d2"
    },
    {
      "name": "__init__",
      "line_start": 32,
      "line_end": 33,
      "args": [
        {
          "name": "self"
        }
      ],
      "return_type": null,
      "docstring": "",
      "is_async": false,
      "decorators": [],
      "code": "    def __init__(self):\n        pass",
      "code_hash": "daee209c03368e1f5e1bbd6536a5c59d"
    },
    {
      "name": "integrate_documents",
      "line_start": 35,
      "line_end": 79,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "IntegratedContent",
      "docstring": "Integrate multiple documents\n\nArgs:\n    documents: list of extracted documents\n\nReturns:\n    IntegratedContent: the integrated content",
      "is_async": false,
      "decorators": [],
      "code": "    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n        \"\"\"Integrate multiple documents\n\n        Args:\n            documents: list of extracted documents\n\n        Returns:\n            IntegratedContent: the integrated content\n        \"\"\"\n        if not documents:\n            return IntegratedContent(\n                documents=[],\n                document_count=0,\n                total_content_length=0,\n                document_types={},\n                combined_content=\"\",\n                content_summary=\"No document content provided\",\n                key_topics=[]\n            )\n\n        # Count document types\n        document_types = {}\n        for doc in documents:\n            ext = doc.file_type.lower()\n            document_types[ext] = document_types.get(ext, 0) + 1\n\n        # Combine contents\n        combined_content = self._combine_content(documents)\n        total_length = len(combined_content)\n\n        # Generate summary\n        content_summary = self._generate_summary(documents)\n\n        # Extract key topics\n        key_topics = self._extract_key_topics(combined_content)\n\n        return IntegratedContent(\n            documents=documents,\n            document_count=len(documents),\n            total_content_length=total_length,\n            document_types=document_types,\n            combined_content=combined_content,\n            content_summary=content_summary,\n            key_topics=key_topics\n        )",
      "code_hash": "e20f5e22ceb80b12a11b34adc1bbfec0"
    },
    {
      "name": "_combine_content",
      "line_start": 81,
      "line_end": 97,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "str",
      "docstring": "Combine document contents",
      "is_async": false,
      "decorators": [],
      "code": "    def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Combine document contents\"\"\"\n        combined = []\n\n        for i, doc in enumerate(documents, 1):\n            combined.append(f\"=== Document {i}: {doc.filename} ===\")\n            combined.append(f\"File type: {doc.file_type}\")\n            combined.append(f\"File size: {doc.file_size} bytes\")\n            combined.append(f\"Extracted at: {doc.extracted_at}\")\n            combined.append(\"\")\n            combined.append(\"Content:\")\n            combined.append(doc.content)\n            combined.append(\"\")\n            combined.append(\"=\" * 50)\n            combined.append(\"\")\n\n        return \"\\n\".join(combined)",
      "code_hash": "a8d95c650807d96892836fca700e3f54"
    },
    {
      "name": "_generate_summary",
      "line_start": 99,
      "line_end": 111,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "documents",
          "type_hint": "List[ExtractedDocument]"
        }
      ],
      "return_type": "str",
      "docstring": "Generate a content summary",
      "is_async": false,
      "decorators": [],
      "code": "    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Generate a content summary\"\"\"\n        if not documents:\n            return \"No document content\"\n\n        summary_parts = []\n        summary_parts.append(f\"Processed {len(documents)} document(s):\")\n\n        for i, doc in enumerate(documents, 1):\n            content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n            summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n\n        return \"\\n\".join(summary_parts)",
      "code_hash": "934fc6f97d5ccb683e0177edd66a0366"
    },
    {
      "name": "_extract_key_topics",
      "line_start": 113,
      "line_end": 130,
      "args": [
        {
          "name": "self"
        },
        {
          "name": "content",
          "type_hint": "str"
        }
      ],
      "return_type": "List[str]",
      "docstring": "Extract key topics (simple keyword extraction)",
      "is_async": false,
      "decorators": [],
      "code": "    def _extract_key_topics(self, content: str) -> List[str]:\n        \"\"\"Extract key topics (simple keyword extraction)\"\"\"\n        if not content:\n            return []\n\n        # Simple Chinese keyword extraction; more sophisticated\n        # NLP methods can be substituted here if needed\n        words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n\n        # Count word frequencies\n        word_count = {}\n        for word in words:\n            if len(word) >= 2:  # only consider words of length >= 2\n                word_count[word] = word_count.get(word, 0) + 1\n\n        # Return the 10 most frequent words\n        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n        return [word for word, count in sorted_words[:10] if count > 1]",
      "code_hash": "66d45bfafce3b915b1837e1a633c45a4"
    }
  ],
"classes": [
|
|
{
|
|
"name": "IntegratedContent",
|
|
"line_start": 11,
|
|
"line_end": 27,
|
|
"bases": [],
|
|
"methods": [
|
|
{
|
|
"name": "__post_init__",
|
|
"line_start": 21,
|
|
"line_end": 27,
|
|
"args": [
|
|
{
|
|
"name": "self"
|
|
}
|
|
],
|
|
"return_type": null,
|
|
"docstring": "初始化后处理",
|
|
"is_async": false,
|
|
"decorators": [],
|
|
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1",
|
|
"code_hash": "c3ef16d786a9c2c23091f4f8516362d2"
|
|
}
|
|
],
|
|
"docstring": "整合后的内容",
|
|
"decorators": [
|
|
"dataclass"
|
|
],
|
|
"code": "class IntegratedContent:\n \"\"\"整合后的内容\"\"\"\n documents: List[ExtractedDocument]\n document_count: int\n total_content_length: int\n document_types: Dict[str, int]\n combined_content: str\n content_summary: str\n key_topics: List[str]\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.document_types:\n self.document_types = {}\n for doc in self.documents:\n ext = doc.file_type.lower()\n self.document_types[ext] = self.document_types.get(ext, 0) + 1",
|
|
"code_hash": "8b9786ed613765dcd7c6ed75bf0b69c8"
|
|
},
|
|
    {
      "name": "ContentIntegrator",
      "line_start": 29,
      "line_end": 130,
      "bases": [],
      "methods": [
        {
          "name": "__init__",
          "line_start": 32,
          "line_end": 33,
          "args": [
            {
              "name": "self"
            }
          ],
          "return_type": null,
          "docstring": "",
          "is_async": false,
          "decorators": [],
          "code": "    def __init__(self):\n        pass",
          "code_hash": "daee209c03368e1f5e1bbd6536a5c59d"
        },
        {
          "name": "integrate_documents",
          "line_start": 35,
          "line_end": 79,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "IntegratedContent",
          "docstring": "Integrate multiple documents\n\nArgs:\n    documents: list of extracted documents\n\nReturns:\n    IntegratedContent: the integrated content",
          "is_async": false,
          "decorators": [],
          "code": "    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n        \"\"\"Integrate multiple documents\n\n        Args:\n            documents: list of extracted documents\n\n        Returns:\n            IntegratedContent: the integrated content\n        \"\"\"\n        if not documents:\n            return IntegratedContent(\n                documents=[],\n                document_count=0,\n                total_content_length=0,\n                document_types={},\n                combined_content=\"\",\n                content_summary=\"No document content provided\",\n                key_topics=[]\n            )\n\n        # Count document types\n        document_types = {}\n        for doc in documents:\n            ext = doc.file_type.lower()\n            document_types[ext] = document_types.get(ext, 0) + 1\n\n        # Combine contents\n        combined_content = self._combine_content(documents)\n        total_length = len(combined_content)\n\n        # Generate summary\n        content_summary = self._generate_summary(documents)\n\n        # Extract key topics\n        key_topics = self._extract_key_topics(combined_content)\n\n        return IntegratedContent(\n            documents=documents,\n            document_count=len(documents),\n            total_content_length=total_length,\n            document_types=document_types,\n            combined_content=combined_content,\n            content_summary=content_summary,\n            key_topics=key_topics\n        )",
          "code_hash": "e20f5e22ceb80b12a11b34adc1bbfec0"
        },
        {
          "name": "_combine_content",
          "line_start": 81,
          "line_end": 97,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "str",
          "docstring": "Combine document contents",
          "is_async": false,
          "decorators": [],
          "code": "    def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Combine document contents\"\"\"\n        combined = []\n\n        for i, doc in enumerate(documents, 1):\n            combined.append(f\"=== Document {i}: {doc.filename} ===\")\n            combined.append(f\"File type: {doc.file_type}\")\n            combined.append(f\"File size: {doc.file_size} bytes\")\n            combined.append(f\"Extracted at: {doc.extracted_at}\")\n            combined.append(\"\")\n            combined.append(\"Content:\")\n            combined.append(doc.content)\n            combined.append(\"\")\n            combined.append(\"=\" * 50)\n            combined.append(\"\")\n\n        return \"\\n\".join(combined)",
          "code_hash": "a8d95c650807d96892836fca700e3f54"
        },
        {
          "name": "_generate_summary",
          "line_start": 99,
          "line_end": 111,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "documents",
              "type_hint": "List[ExtractedDocument]"
            }
          ],
          "return_type": "str",
          "docstring": "Generate a content summary",
          "is_async": false,
          "decorators": [],
          "code": "    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Generate a content summary\"\"\"\n        if not documents:\n            return \"No document content\"\n\n        summary_parts = []\n        summary_parts.append(f\"Processed {len(documents)} document(s):\")\n\n        for i, doc in enumerate(documents, 1):\n            content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n            summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n\n        return \"\\n\".join(summary_parts)",
          "code_hash": "934fc6f97d5ccb683e0177edd66a0366"
        },
        {
          "name": "_extract_key_topics",
          "line_start": 113,
          "line_end": 130,
          "args": [
            {
              "name": "self"
            },
            {
              "name": "content",
              "type_hint": "str"
            }
          ],
          "return_type": "List[str]",
          "docstring": "Extract key topics (simple keyword extraction)",
          "is_async": false,
          "decorators": [],
          "code": "    def _extract_key_topics(self, content: str) -> List[str]:\n        \"\"\"Extract key topics (simple keyword extraction)\"\"\"\n        if not content:\n            return []\n\n        # Simple Chinese keyword extraction; more sophisticated\n        # NLP methods can be substituted here if needed\n        words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n\n        # Count word frequencies\n        word_count = {}\n        for word in words:\n            if len(word) >= 2:  # only consider words of length >= 2\n                word_count[word] = word_count.get(word, 0) + 1\n\n        # Return the 10 most frequent words\n        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n        return [word for word, count in sorted_words[:10] if count > 1]",
          "code_hash": "66d45bfafce3b915b1837e1a633c45a4"
        }
      ],
      "docstring": "Content integrator - integrates information from multiple documents",
      "decorators": [],
      "code": "class ContentIntegrator:\n    \"\"\"Content integrator - integrates information from multiple documents\"\"\"\n\n    def __init__(self):\n        pass\n\n    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n        \"\"\"Integrate multiple documents\n\n        Args:\n            documents: list of extracted documents\n\n        Returns:\n            IntegratedContent: the integrated content\n        \"\"\"\n        if not documents:\n            return IntegratedContent(\n                documents=[],\n                document_count=0,\n                total_content_length=0,\n                document_types={},\n                combined_content=\"\",\n                content_summary=\"No document content provided\",\n                key_topics=[]\n            )\n\n        # Count document types\n        document_types = {}\n        for doc in documents:\n            ext = doc.file_type.lower()\n            document_types[ext] = document_types.get(ext, 0) + 1\n\n        # Combine contents\n        combined_content = self._combine_content(documents)\n        total_length = len(combined_content)\n\n        # Generate summary\n        content_summary = self._generate_summary(documents)\n\n        # Extract key topics\n        key_topics = self._extract_key_topics(combined_content)\n\n        return IntegratedContent(\n            documents=documents,\n            document_count=len(documents),\n            total_content_length=total_length,\n            document_types=document_types,\n            combined_content=combined_content,\n            content_summary=content_summary,\n            key_topics=key_topics\n        )\n\n    def _combine_content(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Combine document contents\"\"\"\n        combined = []\n\n        for i, doc in enumerate(documents, 1):\n            combined.append(f\"=== Document {i}: {doc.filename} ===\")\n            combined.append(f\"File type: {doc.file_type}\")\n            combined.append(f\"File size: {doc.file_size} bytes\")\n            combined.append(f\"Extracted at: {doc.extracted_at}\")\n            combined.append(\"\")\n            combined.append(\"Content:\")\n            combined.append(doc.content)\n            combined.append(\"\")\n            combined.append(\"=\" * 50)\n            combined.append(\"\")\n\n        return \"\\n\".join(combined)\n\n    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:\n        \"\"\"Generate a content summary\"\"\"\n        if not documents:\n            return \"No document content\"\n\n        summary_parts = []\n        summary_parts.append(f\"Processed {len(documents)} document(s):\")\n\n        for i, doc in enumerate(documents, 1):\n            content_preview = doc.content[:100] + \"...\" if len(doc.content) > 100 else doc.content\n            summary_parts.append(f\"{i}. {doc.filename} ({doc.file_type}): {content_preview}\")\n\n        return \"\\n\".join(summary_parts)\n\n    def _extract_key_topics(self, content: str) -> List[str]:\n        \"\"\"Extract key topics (simple keyword extraction)\"\"\"\n        if not content:\n            return []\n\n        # Simple Chinese keyword extraction; more sophisticated\n        # NLP methods can be substituted here if needed\n        words = re.findall(r'[\\u4e00-\\u9fff]+', content)\n\n        # Count word frequencies\n        word_count = {}\n        for word in words:\n            if len(word) >= 2:  # only consider words of length >= 2\n                word_count[word] = word_count.get(word, 0) + 1\n\n        # Return the 10 most frequent words\n        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)\n        return [word for word, count in sorted_words[:10] if count > 1]",
      "code_hash": "dbbcdf46aa1b5c5eb28abc040390b4ef"
    }
  ],
  "imports": [
    {
      "type": "import",
      "modules": [
        "logging"
      ],
      "aliases": []
    },
    {
      "type": "from_import",
      "module": "typing",
      "names": [
        "List",
        "Dict",
        "Any",
        "Optional"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "dataclasses",
      "names": [
        "dataclass"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "datetime",
      "names": [
        "datetime"
      ],
      "aliases": [],
      "level": 0
    },
    {
      "type": "from_import",
      "module": "text_extractor",
      "names": [
        "ExtractedDocument"
      ],
      "aliases": [],
      "level": 1
    },
    {
      "type": "import",
      "modules": [
        "re"
      ],
      "aliases": []
    }
  ],
  "constants": [],
  "docstring": "",
  "content_hash": "d2d6195231df575ad4b88dfe689acd0a"
}