{ "file_path": "travel-algorithms/travel_algorithms/document_processing/document_processor.py", "file_size": 7229, "line_count": 200, "functions": [ { "name": "__init__", "line_start": 28, "line_end": 40, "args": [ { "name": "self" }, { "name": "config", "type_hint": "AlgorithmConfig" } ], "return_type": null, "docstring": "初始化文档处理器\n\nArgs:\n config: 算法配置", "is_async": false, "decorators": [], "code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化文档处理器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.text_extractor = TextExtractor(config.document_processing)\n self.content_integrator = ContentIntegrator(config.document_processing)\n self.content_transformer = ContentTransformer(config)\n \n logger.info(\"统一文档处理器初始化完成\")", "code_hash": "4b9ee82331745ab38dca2eda7d5ab72e" }, { "name": "_extract_documents", "line_start": 108, "line_end": 134, "args": [ { "name": "self" }, { "name": "sources", "type_hint": "Union[str, Path, List[Union[str, Path]]]" } ], "return_type": "List[ExtractedDocument]", "docstring": "提取文档内容", "is_async": false, "decorators": [], "code": " def _extract_documents(self, sources: Union[str, Path, List[Union[str, Path]]]) -> List[ExtractedDocument]:\n \"\"\"提取文档内容\"\"\"\n if isinstance(sources, (str, Path)):\n sources = [sources]\n \n extracted_docs = []\n \n for source in sources:\n source_path = Path(source)\n \n try:\n if source_path.is_file():\n # 处理单个文件\n doc = self.text_extractor.extract_from_file(source_path)\n extracted_docs.append(doc)\n elif source_path.is_dir():\n # 处理目录中的所有文件\n dir_docs = self.text_extractor.extract_from_directory(source_path)\n extracted_docs.extend(dir_docs)\n else:\n logger.warning(f\"无效的文档源: {source_path}\")\n \n except Exception as e:\n logger.error(f\"处理文档源失败 {source_path}: {e}\")\n continue\n \n return extracted_docs", "code_hash": "b4ce9ee5159b47d86dd213744fc8f6df" }, { "name": "integrate_only", "line_start": 143, "line_end": 145, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "IntegratedContent", "docstring": "仅执行内容整合", "is_async": false, "decorators": [], "code": " def integrate_only(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"仅执行内容整合\"\"\"\n return self.content_integrator.integrate_documents(documents)", "code_hash": "368f39a7d8dab90bd6e5235af61765d6" }, { "name": "get_supported_formats", "line_start": 162, "line_end": 167, "args": [ { "name": "self" } ], "return_type": "Dict[str, str]", "docstring": "获取支持的文件格式和转换格式", "is_async": false, "decorators": [], "code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式和转换格式\"\"\"\n return {\n 'input_formats': self.text_extractor.get_supported_formats(),\n 'output_formats': self.content_transformer.get_supported_formats()\n }", "code_hash": "fba6eb3a4149d4d2f972168ce616591b" }, { "name": "get_processing_stats", "line_start": 169, "line_end": 176, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "获取处理器统计信息", "is_async": false, "decorators": [], "code": " def get_processing_stats(self) -> Dict[str, Any]:\n \"\"\"获取处理器统计信息\"\"\"\n return {\n 'extractor_stats': self.text_extractor.get_extraction_stats(),\n 'integrator_stats': self.content_integrator.get_integration_stats(),\n 'transformer_stats': self.content_transformer.get_transformation_stats(),\n 'supported_formats': self.get_supported_formats()\n }", "code_hash": "a854e400e8958fa3168b34474aeee365" } ], "classes": [ { "name": "DocumentProcessor", "line_start": 22, "line_end": 201, "bases": [], "methods": [ { "name": "__init__", "line_start": 28, "line_end": 40, "args": [ { "name": "self" }, { "name": "config", "type_hint": "AlgorithmConfig" } ], "return_type": null, "docstring": "初始化文档处理器\n\nArgs:\n config: 算法配置", "is_async": false, "decorators": [], "code": " def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化文档处理器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.text_extractor = TextExtractor(config.document_processing)\n self.content_integrator = ContentIntegrator(config.document_processing)\n self.content_transformer = ContentTransformer(config)\n \n logger.info(\"统一文档处理器初始化完成\")", "code_hash": "4b9ee82331745ab38dca2eda7d5ab72e" }, { "name": "process_documents", "line_start": 42, "line_end": 106, "args": [ { "name": "self" }, { "name": "sources", "type_hint": "Union[str, Path, List[Union[str, Path]]]" }, { "name": "target_format", "type_hint": "str" }, { "name": "custom_prompt", "type_hint": "Optional[str]" }, { "name": "additional_requirements", "type_hint": "Optional[str]" } ], "return_type": "Dict[str, Any]", "docstring": "完整的文档处理流水线\n\nArgs:\n sources: 文档源(文件路径、目录路径或路径列表)\n target_format: 目标转换格式\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \nReturns:\n 包含所有处理结果的字典", "is_async": true, "decorators": [], "code": " async def process_documents(\n self,\n sources: Union[str, Path, List[Union[str, Path]]],\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> Dict[str, Any]:\n \"\"\"\n 完整的文档处理流水线\n \n Args:\n sources: 文档源(文件路径、目录路径或路径列表)\n target_format: 目标转换格式\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n 包含所有处理结果的字典\n \"\"\"\n try:\n logger.info(f\"开始文档处理流水线,目标格式: {target_format}\")\n \n # 第一步:提取文档内容\n extracted_docs = self._extract_documents(sources)\n \n if not extracted_docs:\n raise DocumentProcessingError(\"没有成功提取的文档\")\n \n # 第二步:整合文档内容\n integrated_content = self.content_integrator.integrate_documents(extracted_docs)\n \n # 第三步:转换为目标格式\n transformed_content = await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )\n \n # 整理最终结果\n result = {\n 'processing_summary': {\n 'total_documents': len(extracted_docs),\n 'successful_extractions': len([d for d in extracted_docs if not d.error_info]),\n 'target_format': target_format,\n 'processing_completed': True\n },\n 'extracted_documents': [doc.get_summary() for doc in extracted_docs],\n 'integrated_content': integrated_content.to_dict(),\n 'transformed_content': transformed_content.to_dict(),\n 'final_output': transformed_content.transformed_text,\n 'quality_metrics': {\n 'extraction_success_rate': len([d for d in extracted_docs if not d.error_info]) / len(extracted_docs),\n 'content_integration_quality': len(integrated_content.key_topics) / max(1, integrated_content.document_count),\n 'transformation_quality': transformed_content.quality_score\n }\n }\n \n logger.info(f\"文档处理流水线完成,最终输出长度: {len(transformed_content.transformed_text)}\")\n return result\n \n except Exception as e:\n error_msg = f\"文档处理流水线失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)", "code_hash": "f7fb77852390521c0eb54badeae1b5a1" }, { "name": "_extract_documents", "line_start": 108, "line_end": 134, "args": [ { "name": "self" }, { "name": "sources", "type_hint": "Union[str, Path, List[Union[str, Path]]]" } ], "return_type": "List[ExtractedDocument]", "docstring": "提取文档内容", "is_async": false, "decorators": [], "code": " def _extract_documents(self, sources: Union[str, Path, List[Union[str, Path]]]) -> List[ExtractedDocument]:\n \"\"\"提取文档内容\"\"\"\n if isinstance(sources, (str, Path)):\n sources = [sources]\n \n extracted_docs = []\n \n for source in sources:\n source_path = Path(source)\n \n try:\n if source_path.is_file():\n # 处理单个文件\n doc = self.text_extractor.extract_from_file(source_path)\n extracted_docs.append(doc)\n elif source_path.is_dir():\n # 处理目录中的所有文件\n dir_docs = self.text_extractor.extract_from_directory(source_path)\n extracted_docs.extend(dir_docs)\n else:\n logger.warning(f\"无效的文档源: {source_path}\")\n \n except Exception as e:\n logger.error(f\"处理文档源失败 {source_path}: {e}\")\n continue\n \n return extracted_docs", "code_hash": "b4ce9ee5159b47d86dd213744fc8f6df" }, { "name": "extract_only", "line_start": 136, "line_end": 141, "args": [ { "name": "self" }, { "name": "sources", "type_hint": "Union[str, Path, List[Union[str, Path]]]" } ], "return_type": "List[ExtractedDocument]", "docstring": "仅执行文档提取", "is_async": true, "decorators": [], "code": " async def extract_only(\n self, \n sources: Union[str, Path, List[Union[str, Path]]]\n ) -> List[ExtractedDocument]:\n \"\"\"仅执行文档提取\"\"\"\n return self._extract_documents(sources)", "code_hash": "4d805849738623cdd16c37152c1e3371" }, { "name": "integrate_only", "line_start": 143, "line_end": 145, "args": [ { "name": "self" }, { "name": "documents", "type_hint": "List[ExtractedDocument]" } ], "return_type": "IntegratedContent", "docstring": "仅执行内容整合", "is_async": false, "decorators": [], "code": " def integrate_only(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"仅执行内容整合\"\"\"\n return self.content_integrator.integrate_documents(documents)", "code_hash": "368f39a7d8dab90bd6e5235af61765d6" }, { "name": "transform_only", "line_start": 147, "line_end": 160, "args": [ { "name": "self" }, { "name": "integrated_content", "type_hint": "IntegratedContent" }, { "name": "target_format", "type_hint": "str" }, { "name": "custom_prompt", "type_hint": "Optional[str]" }, { "name": "additional_requirements", "type_hint": "Optional[str]" } ], "return_type": "TransformedContent", "docstring": "仅执行内容转换", "is_async": true, "decorators": [], "code": " async def transform_only(\n self,\n integrated_content: IntegratedContent,\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"仅执行内容转换\"\"\"\n return await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )", "code_hash": "ef7e84b598ade4399922e642f056fbe8" }, { "name": "get_supported_formats", "line_start": 162, "line_end": 167, "args": [ { "name": "self" } ], "return_type": "Dict[str, str]", "docstring": "获取支持的文件格式和转换格式", "is_async": false, "decorators": [], "code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式和转换格式\"\"\"\n return {\n 'input_formats': self.text_extractor.get_supported_formats(),\n 'output_formats': self.content_transformer.get_supported_formats()\n }", "code_hash": "fba6eb3a4149d4d2f972168ce616591b" }, { "name": "get_processing_stats", "line_start": 169, "line_end": 176, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "获取处理器统计信息", "is_async": false, "decorators": [], "code": " def get_processing_stats(self) -> Dict[str, Any]:\n \"\"\"获取处理器统计信息\"\"\"\n return {\n 'extractor_stats': self.text_extractor.get_extraction_stats(),\n 'integrator_stats': self.content_integrator.get_integration_stats(),\n 'transformer_stats': self.content_transformer.get_transformation_stats(),\n 'supported_formats': self.get_supported_formats()\n }", "code_hash": "a854e400e8958fa3168b34474aeee365" }, { "name": "batch_process_directories", "line_start": 178, "line_end": 201, "args": [ { "name": "self" }, { "name": "directories", "type_hint": "List[Union[str, Path]]" }, { "name": "target_format", "type_hint": "str" } ], "return_type": "Dict[str, Dict[str, Any]]", "docstring": "批量处理多个目录", "is_async": true, "decorators": [], "code": " async def batch_process_directories(\n self,\n directories: List[Union[str, Path]],\n target_format: str = 'summary'\n ) -> Dict[str, Dict[str, Any]]:\n \"\"\"批量处理多个目录\"\"\"\n results = {}\n \n for i, directory in enumerate(directories):\n try:\n logger.info(f\"批量处理目录 {i+1}/{len(directories)}: {directory}\")\n \n result = await self.process_documents(\n sources=directory,\n target_format=target_format\n )\n \n results[str(directory)] = result\n \n except Exception as e:\n logger.error(f\"批量处理目录失败 {directory}: {e}\")\n results[str(directory)] = {'error': str(e)}\n \n return results ", "code_hash": "0c58d9c4d0f4d546cb77a89d2eb374ec" } ], "docstring": "统一文档处理器\n提供完整的文档处理流水线:提取 -> 整合 -> 转换", "decorators": [], "code": "class DocumentProcessor:\n \"\"\"\n 统一文档处理器\n 提供完整的文档处理流水线:提取 -> 整合 -> 转换\n \"\"\"\n \n def __init__(self, config: AlgorithmConfig):\n \"\"\"\n 初始化文档处理器\n \n Args:\n config: 算法配置\n \"\"\"\n self.config = config\n self.text_extractor = TextExtractor(config.document_processing)\n self.content_integrator = ContentIntegrator(config.document_processing)\n self.content_transformer = ContentTransformer(config)\n \n logger.info(\"统一文档处理器初始化完成\")\n \n async def process_documents(\n self,\n sources: Union[str, Path, List[Union[str, Path]]],\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> Dict[str, Any]:\n \"\"\"\n 完整的文档处理流水线\n \n Args:\n sources: 文档源(文件路径、目录路径或路径列表)\n target_format: 目标转换格式\n custom_prompt: 自定义提示词\n additional_requirements: 额外要求\n \n Returns:\n 包含所有处理结果的字典\n \"\"\"\n try:\n logger.info(f\"开始文档处理流水线,目标格式: {target_format}\")\n \n # 第一步:提取文档内容\n extracted_docs = self._extract_documents(sources)\n \n if not extracted_docs:\n raise DocumentProcessingError(\"没有成功提取的文档\")\n \n # 第二步:整合文档内容\n integrated_content = self.content_integrator.integrate_documents(extracted_docs)\n \n # 第三步:转换为目标格式\n transformed_content = await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )\n \n # 整理最终结果\n result = {\n 'processing_summary': {\n 'total_documents': len(extracted_docs),\n 'successful_extractions': len([d for d in extracted_docs if not d.error_info]),\n 'target_format': target_format,\n 'processing_completed': True\n },\n 'extracted_documents': [doc.get_summary() for doc in extracted_docs],\n 'integrated_content': integrated_content.to_dict(),\n 'transformed_content': transformed_content.to_dict(),\n 'final_output': transformed_content.transformed_text,\n 'quality_metrics': {\n 'extraction_success_rate': len([d for d in extracted_docs if not d.error_info]) / len(extracted_docs),\n 'content_integration_quality': len(integrated_content.key_topics) / max(1, integrated_content.document_count),\n 'transformation_quality': transformed_content.quality_score\n }\n }\n \n logger.info(f\"文档处理流水线完成,最终输出长度: {len(transformed_content.transformed_text)}\")\n return result\n \n except Exception as e:\n error_msg = f\"文档处理流水线失败: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n raise DocumentProcessingError(error_msg)\n \n def _extract_documents(self, sources: Union[str, Path, List[Union[str, Path]]]) -> List[ExtractedDocument]:\n \"\"\"提取文档内容\"\"\"\n if isinstance(sources, (str, Path)):\n sources = [sources]\n \n extracted_docs = []\n \n for source in sources:\n source_path = Path(source)\n \n try:\n if source_path.is_file():\n # 处理单个文件\n doc = self.text_extractor.extract_from_file(source_path)\n extracted_docs.append(doc)\n elif source_path.is_dir():\n # 处理目录中的所有文件\n dir_docs = self.text_extractor.extract_from_directory(source_path)\n extracted_docs.extend(dir_docs)\n else:\n logger.warning(f\"无效的文档源: {source_path}\")\n \n except Exception as e:\n logger.error(f\"处理文档源失败 {source_path}: {e}\")\n continue\n \n return extracted_docs\n \n async def extract_only(\n self, \n sources: Union[str, Path, List[Union[str, Path]]]\n ) -> List[ExtractedDocument]:\n \"\"\"仅执行文档提取\"\"\"\n return self._extract_documents(sources)\n \n def integrate_only(self, documents: List[ExtractedDocument]) -> IntegratedContent:\n \"\"\"仅执行内容整合\"\"\"\n return self.content_integrator.integrate_documents(documents)\n \n async def transform_only(\n self,\n integrated_content: IntegratedContent,\n target_format: str = 'summary',\n custom_prompt: Optional[str] = None,\n additional_requirements: Optional[str] = None\n ) -> TransformedContent:\n \"\"\"仅执行内容转换\"\"\"\n return await self.content_transformer.transform_content(\n integrated_content=integrated_content,\n format_type=target_format,\n custom_prompt=custom_prompt,\n additional_requirements=additional_requirements\n )\n \n def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式和转换格式\"\"\"\n return {\n 'input_formats': self.text_extractor.get_supported_formats(),\n 'output_formats': self.content_transformer.get_supported_formats()\n }\n \n def get_processing_stats(self) -> Dict[str, Any]:\n \"\"\"获取处理器统计信息\"\"\"\n return {\n 'extractor_stats': self.text_extractor.get_extraction_stats(),\n 'integrator_stats': self.content_integrator.get_integration_stats(),\n 'transformer_stats': self.content_transformer.get_transformation_stats(),\n 'supported_formats': self.get_supported_formats()\n }\n \n async def batch_process_directories(\n self,\n directories: List[Union[str, Path]],\n target_format: str = 'summary'\n ) -> Dict[str, Dict[str, Any]]:\n \"\"\"批量处理多个目录\"\"\"\n results = {}\n \n for i, directory in enumerate(directories):\n try:\n logger.info(f\"批量处理目录 {i+1}/{len(directories)}: {directory}\")\n \n result = await self.process_documents(\n sources=directory,\n target_format=target_format\n )\n \n results[str(directory)] = result\n \n except Exception as e:\n logger.error(f\"批量处理目录失败 {directory}: {e}\")\n results[str(directory)] = {'error': str(e)}\n \n return results ", "code_hash": "cc7099758d3cfc656cba79490f676eae" } ], "imports": [ { "type": "import", "modules": [ "logging" ], "aliases": [] }, { "type": "from_import", "module": "typing", "names": [ "List", "Dict", "Any", "Optional", "Union" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "pathlib", "names": [ "Path" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "text_extractor", "names": [ "TextExtractor", "ExtractedDocument" ], "aliases": [], "level": 1 }, { "type": "from_import", "module": "content_integrator", "names": [ "ContentIntegrator", "IntegratedContent" ], "aliases": [], "level": 1 }, { "type": "from_import", "module": "content_transformer", "names": [ "ContentTransformer", "TransformedContent" ], "aliases": [], "level": 1 }, { "type": "from_import", "module": "config", "names": [ "AlgorithmConfig" ], "aliases": [], "level": 2 }, { "type": "from_import", "module": "exceptions", "names": [ "DocumentProcessingError" ], "aliases": [], "level": 2 } ], "constants": [], "docstring": "Document Processor\n统一文档处理器 - 集成文本提取、内容整合和格式转换功能", "content_hash": "ce51a3f63e5798e85ac277283280b7dd" }