2025-07-31 15:35:23 +08:00

395 lines
27 KiB
JSON
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"file_path": "travel-algorithms/travel_algorithms/document_processing/document_processor.py",
"file_size": 7229,
"line_count": 200,
"functions": [
{
"name": "__init__",
"line_start": 28,
"line_end": 40,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "AlgorithmConfig"
}
],
"return_type": null,
"docstring": "初始化文档处理器\n\nArgs:\n config: 算法配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化文档处理器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.text_extractor = TextExtractor(config.document_processing)\n self.content_integrator = ContentIntegrator(config.document_processing)\n self.content_transformer = ContentTransformer(config)\n \n logger.info(\"统一文档处理器初始化完成\")",
"code_hash": "4b9ee82331745ab38dca2eda7d5ab72e"
},
{
"name": "_extract_documents",
"line_start": 108,
"line_end": 134,
"args": [
{
"name": "self"
},
{
"name": "sources",
"type_hint": "Union[str, Path, List[Union[str, Path]]]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "提取文档内容",
"is_async": false,
"decorators": [],
"code": " def _extract_documents(self, sources: Union[str, Path, List[Union[str, Path]]]) -> List[ExtractedDocument]:\n \"\"\"提取文档内容\"\"\"\n if isinstance(sources, (str, Path)):\n sources = [sources]\n \n extracted_docs = []\n \n for source in sources:\n source_path = Path(source)\n \n try:\n if source_path.is_file():\n # 处理单个文件\n doc = self.text_extractor.extract_from_file(source_path)\n extracted_docs.append(doc)\n elif source_path.is_dir():\n # 处理目录中的所有文件\n dir_docs = self.text_extractor.extract_from_directory(source_path)\n extracted_docs.extend(dir_docs)\n else:\n logger.warning(f\"无效的文档源: {source_path}\")\n \n except Exception as e:\n logger.error(f\"处理文档源失败 {source_path}: {e}\")\n continue\n \n return extracted_docs",
"code_hash": "b4ce9ee5159b47d86dd213744fc8f6df"
},
{
"name": "integrate_only",
"line_start": 143,
"line_end": 145,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "IntegratedContent",
"docstring": "仅执行内容整合",
"is_async": false,
"decorators": [],
"code": " def integrate_only(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"仅执行内容整合\"\"\"\n return self.content_integrator.integrate_documents(documents)",
"code_hash": "368f39a7d8dab90bd6e5235af61765d6"
},
{
"name": "get_supported_formats",
"line_start": 162,
"line_end": 167,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的文件格式和转换格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式和转换格式\"\"\"\n return {\n 'input_formats': self.text_extractor.get_supported_formats(),\n 'output_formats': self.content_transformer.get_supported_formats()\n }",
"code_hash": "fba6eb3a4149d4d2f972168ce616591b"
},
{
"name": "get_processing_stats",
"line_start": 169,
"line_end": 176,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取处理器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_processing_stats(self) -> Dict[str, Any]:\n \"\"\"获取处理器统计信息\"\"\"\n return {\n 'extractor_stats': self.text_extractor.get_extraction_stats(),\n 'integrator_stats': self.content_integrator.get_integration_stats(),\n 'transformer_stats': self.content_transformer.get_transformation_stats(),\n 'supported_formats': self.get_supported_formats()\n }",
"code_hash": "a854e400e8958fa3168b34474aeee365"
}
],
"classes": [
{
"name": "DocumentProcessor",
"line_start": 22,
"line_end": 201,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 28,
"line_end": 40,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "AlgorithmConfig"
}
],
"return_type": null,
"docstring": "初始化文档处理器\n\nArgs:\n config: 算法配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化文档处理器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.text_extractor = TextExtractor(config.document_processing)\n self.content_integrator = ContentIntegrator(config.document_processing)\n self.content_transformer = ContentTransformer(config)\n \n logger.info(\"统一文档处理器初始化完成\")",
"code_hash": "4b9ee82331745ab38dca2eda7d5ab72e"
},
{
"name": "process_documents",
"line_start": 42,
"line_end": 106,
"args": [
{
"name": "self"
},
{
"name": "sources",
"type_hint": "Union[str, Path, List[Union[str, Path]]]"
},
{
"name": "target_format",
"type_hint": "str"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "Dict[str, Any]",
"docstring": "完整的文档处理流水线\n\nArgs:\n sources: 文档源(文件路径、目录路径或路径列表)\n target_format: 目标转换格式\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \nReturns:\n 包含所有处理结果的字典",
"is_async": true,
"decorators": [],
"code": " async def process_documents(\n self,\n sources: Union[str, Path, List[Union[str, Path]]],\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> Dict[str, Any]:\n \"\"\"\n 完整的文档处理流水线\n \n Args:\n sources: 文档源(文件路径、目录路径或路径列表)\n target_format: 目标转换格式\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n 包含所有处理结果的字典\n \"\"\"\n try:\n logger.info(f\"开始文档处理流水线,目标格式: {target_format}\")\n \n # 第一步:提取文档内容\n extracted_docs = self._extract_documents(sources)\n \n if not extracted_docs:\n raise DocumentProcessingError(\"没有成功提取的文档\")\n \n # 第二步:整合文档内容\n integrated_content = self.content_integrator.integrate_documents(extracted_docs)\n \n # 第三步:转换为目标格式\n transformed_content = await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )\n \n # 整理最终结果\n result = {\n 'processing_summary': {\n 'total_documents': len(extracted_docs),\n 'successful_extractions': len([d for d in extracted_docs if not d.error_info]),\n 'target_format': target_format,\n 'processing_completed': True\n },\n 'extracted_documents': [doc.get_summary() for doc in extracted_docs],\n 'integrated_content': integrated_content.to_dict(),\n 'transformed_content': transformed_content.to_dict(),\n 'final_output': transformed_content.transformed_text,\n 'quality_metrics': {\n 'extraction_success_rate': len([d for d in extracted_docs if not d.error_info]) / len(extracted_docs),\n 'content_integration_quality': len(integrated_content.key_topics) / max(1, integrated_content.document_count),\n 'transformation_quality': transformed_content.quality_score\n }\n }\n \n logger.info(f\"文档处理流水线完成,最终输出长度: {len(transformed_content.transformed_text)}\")\n return result\n \n except Exception as e:\n error_msg = f\"文档处理流水线失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)",
"code_hash": "f7fb77852390521c0eb54badeae1b5a1"
},
{
"name": "_extract_documents",
"line_start": 108,
"line_end": 134,
"args": [
{
"name": "self"
},
{
"name": "sources",
"type_hint": "Union[str, Path, List[Union[str, Path]]]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "提取文档内容",
"is_async": false,
"decorators": [],
"code": " def _extract_documents(self, sources: Union[str, Path, List[Union[str, Path]]]) -> List[ExtractedDocument]:\n \"\"\"提取文档内容\"\"\"\n if isinstance(sources, (str, Path)):\n sources = [sources]\n \n extracted_docs = []\n \n for source in sources:\n source_path = Path(source)\n \n try:\n if source_path.is_file():\n # 处理单个文件\n doc = self.text_extractor.extract_from_file(source_path)\n extracted_docs.append(doc)\n elif source_path.is_dir():\n # 处理目录中的所有文件\n dir_docs = self.text_extractor.extract_from_directory(source_path)\n extracted_docs.extend(dir_docs)\n else:\n logger.warning(f\"无效的文档源: {source_path}\")\n \n except Exception as e:\n logger.error(f\"处理文档源失败 {source_path}: {e}\")\n continue\n \n return extracted_docs",
"code_hash": "b4ce9ee5159b47d86dd213744fc8f6df"
},
{
"name": "extract_only",
"line_start": 136,
"line_end": 141,
"args": [
{
"name": "self"
},
{
"name": "sources",
"type_hint": "Union[str, Path, List[Union[str, Path]]]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "仅执行文档提取",
"is_async": true,
"decorators": [],
"code": " async def extract_only(\n self, \n sources: Union[str, Path, List[Union[str, Path]]]\n ) -> List[ExtractedDocument]:\n \"\"\"仅执行文档提取\"\"\"\n return self._extract_documents(sources)",
"code_hash": "4d805849738623cdd16c37152c1e3371"
},
{
"name": "integrate_only",
"line_start": 143,
"line_end": 145,
"args": [
{
"name": "self"
},
{
"name": "documents",
"type_hint": "List[ExtractedDocument]"
}
],
"return_type": "IntegratedContent",
"docstring": "仅执行内容整合",
"is_async": false,
"decorators": [],
"code": " def integrate_only(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"仅执行内容整合\"\"\"\n return self.content_integrator.integrate_documents(documents)",
"code_hash": "368f39a7d8dab90bd6e5235af61765d6"
},
{
"name": "transform_only",
"line_start": 147,
"line_end": 160,
"args": [
{
"name": "self"
},
{
"name": "integrated_content",
"type_hint": "IntegratedContent"
},
{
"name": "target_format",
"type_hint": "str"
},
{
"name": "custom_prompt",
"type_hint": "Optional[str]"
},
{
"name": "additional_requirements",
"type_hint": "Optional[str]"
}
],
"return_type": "TransformedContent",
"docstring": "仅执行内容转换",
"is_async": true,
"decorators": [],
"code": " async def transform_only(\n self,\n integrated_content: IntegratedContent,\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"仅执行内容转换\"\"\"\n return await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )",
"code_hash": "ef7e84b598ade4399922e642f056fbe8"
},
{
"name": "get_supported_formats",
"line_start": 162,
"line_end": 167,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的文件格式和转换格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式和转换格式\"\"\"\n return {\n 'input_formats': self.text_extractor.get_supported_formats(),\n 'output_formats': self.content_transformer.get_supported_formats()\n }",
"code_hash": "fba6eb3a4149d4d2f972168ce616591b"
},
{
"name": "get_processing_stats",
"line_start": 169,
"line_end": 176,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取处理器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_processing_stats(self) -> Dict[str, Any]:\n \"\"\"获取处理器统计信息\"\"\"\n return {\n 'extractor_stats': self.text_extractor.get_extraction_stats(),\n 'integrator_stats': self.content_integrator.get_integration_stats(),\n 'transformer_stats': self.content_transformer.get_transformation_stats(),\n 'supported_formats': self.get_supported_formats()\n }",
"code_hash": "a854e400e8958fa3168b34474aeee365"
},
{
"name": "batch_process_directories",
"line_start": 178,
"line_end": 201,
"args": [
{
"name": "self"
},
{
"name": "directories",
"type_hint": "List[Union[str, Path]]"
},
{
"name": "target_format",
"type_hint": "str"
}
],
"return_type": "Dict[str, Dict[str, Any]]",
"docstring": "批量处理多个目录",
"is_async": true,
"decorators": [],
"code": " async def batch_process_directories(\n self,\n directories: List[Union[str, Path]],\n target_format: str = 'summary'\n ) -> Dict[str, Dict[str, Any]]:\n \"\"\"批量处理多个目录\"\"\"\n results = {}\n \n for i, directory in enumerate(directories):\n try:\n logger.info(f\"批量处理目录 {i+1}/{len(directories)}: {directory}\")\n \n result = await self.process_documents(\n sources=directory,\n target_format=target_format\n )\n \n results[str(directory)] = result\n \n except Exception as e:\n logger.error(f\"批量处理目录失败 {directory}: {e}\")\n results[str(directory)] = {'error': str(e)}\n \n return results ",
"code_hash": "0c58d9c4d0f4d546cb77a89d2eb374ec"
}
],
"docstring": "统一文档处理器\n提供完整的文档处理流水线提取 -> 整合 -> 转换",
"decorators": [],
"code": "class DocumentProcessor:\n \"\"\"\n 统一文档处理器\n 提供完整的文档处理流水线:提取 -> 整合 -> 转换\n \"\"\"\n \n def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化文档处理器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.text_extractor = TextExtractor(config.document_processing)\n self.content_integrator = ContentIntegrator(config.document_processing)\n self.content_transformer = ContentTransformer(config)\n \n logger.info(\"统一文档处理器初始化完成\")\n \n async def process_documents(\n self,\n sources: Union[str, Path, List[Union[str, Path]]],\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> Dict[str, Any]:\n \"\"\"\n 完整的文档处理流水线\n \n Args:\n sources: 文档源(文件路径、目录路径或路径列表)\n target_format: 目标转换格式\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n 包含所有处理结果的字典\n \"\"\"\n try:\n logger.info(f\"开始文档处理流水线,目标格式: {target_format}\")\n \n # 第一步:提取文档内容\n extracted_docs = self._extract_documents(sources)\n \n if not extracted_docs:\n raise DocumentProcessingError(\"没有成功提取的文档\")\n \n # 第二步:整合文档内容\n integrated_content = self.content_integrator.integrate_documents(extracted_docs)\n \n # 第三步:转换为目标格式\n transformed_content = await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )\n \n # 整理最终结果\n result = {\n 'processing_summary': {\n 'total_documents': len(extracted_docs),\n 'successful_extractions': len([d for d in extracted_docs if not d.error_info]),\n 'target_format': target_format,\n 'processing_completed': True\n },\n 'extracted_documents': [doc.get_summary() for doc in extracted_docs],\n 'integrated_content': integrated_content.to_dict(),\n 'transformed_content': transformed_content.to_dict(),\n 'final_output': transformed_content.transformed_text,\n 'quality_metrics': {\n 'extraction_success_rate': len([d for d in extracted_docs if not d.error_info]) / len(extracted_docs),\n 'content_integration_quality': len(integrated_content.key_topics) / max(1, integrated_content.document_count),\n 'transformation_quality': transformed_content.quality_score\n }\n }\n \n logger.info(f\"文档处理流水线完成,最终输出长度: {len(transformed_content.transformed_text)}\")\n return result\n \n except Exception as e:\n error_msg = f\"文档处理流水线失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)\n \n def _extract_documents(self, sources: Union[str, Path, List[Union[str, Path]]]) -> List[ExtractedDocument]:\n \"\"\"提取文档内容\"\"\"\n if isinstance(sources, (str, Path)):\n sources = [sources]\n \n extracted_docs = []\n \n for source in sources:\n source_path = Path(source)\n \n try:\n if source_path.is_file():\n # 处理单个文件\n doc = self.text_extractor.extract_from_file(source_path)\n extracted_docs.append(doc)\n elif source_path.is_dir():\n # 处理目录中的所有文件\n dir_docs = self.text_extractor.extract_from_directory(source_path)\n extracted_docs.extend(dir_docs)\n else:\n logger.warning(f\"无效的文档源: {source_path}\")\n \n except Exception as e:\n logger.error(f\"处理文档源失败 {source_path}: {e}\")\n continue\n \n return extracted_docs\n \n async def extract_only(\n self, \n sources: Union[str, Path, List[Union[str, Path]]]\n ) -> List[ExtractedDocument]:\n \"\"\"仅执行文档提取\"\"\"\n return self._extract_documents(sources)\n \n def integrate_only(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"仅执行内容整合\"\"\"\n return self.content_integrator.integrate_documents(documents)\n \n async def transform_only(\n self,\n integrated_content: IntegratedContent,\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"仅执行内容转换\"\"\"\n return await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )\n \n def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式和转换格式\"\"\"\n return {\n 'input_formats': self.text_extractor.get_supported_formats(),\n 'output_formats': self.content_transformer.get_supported_formats()\n }\n \n def get_processing_stats(self) -> Dict[str, Any]:\n \"\"\"获取处理器统计信息\"\"\"\n return {\n 'extractor_stats': self.text_extractor.get_extraction_stats(),\n 'integrator_stats': self.content_integrator.get_integration_stats(),\n 'transformer_stats': self.content_transformer.get_transformation_stats(),\n 'supported_formats': self.get_supported_formats()\n }\n \n async def batch_process_directories(\n self,\n directories: List[Union[str, Path]],\n target_format: str = 'summary'\n ) -> Dict[str, Dict[str, Any]]:\n \"\"\"批量处理多个目录\"\"\"\n results = {}\n \n for i, directory in enumerate(directories):\n try:\n logger.info(f\"批量处理目录 {i+1}/{len(directories)}: {directory}\")\n \n result = await self.process_documents(\n sources=directory,\n target_format=target_format\n )\n \n results[str(directory)] = result\n \n except Exception as e:\n logger.error(f\"批量处理目录失败 {directory}: {e}\")\n results[str(directory)] = {'error': str(e)}\n \n return results ",
"code_hash": "cc7099758d3cfc656cba79490f676eae"
}
],
"imports": [
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"List",
"Dict",
"Any",
"Optional",
"Union"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "pathlib",
"names": [
"Path"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "text_extractor",
"names": [
"TextExtractor",
"ExtractedDocument"
],
"aliases": [],
"level": 1
},
{
"type": "from_import",
"module": "content_integrator",
"names": [
"ContentIntegrator",
"IntegratedContent"
],
"aliases": [],
"level": 1
},
{
"type": "from_import",
"module": "content_transformer",
"names": [
"ContentTransformer",
"TransformedContent"
],
"aliases": [],
"level": 1
},
{
"type": "from_import",
"module": "config",
"names": [
"AlgorithmConfig"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "exceptions",
"names": [
"DocumentProcessingError"
],
"aliases": [],
"level": 2
}
],
"constants": [],
"docstring": "Document Processor\n统一文档处理器 - 集成文本提取、内容整合和格式转换功能",
"content_hash": "ce51a3f63e5798e85ac277283280b7dd"
}