{ "file_path": "travel-algorithms/travel_algorithms/document_processing/text_extractor.py", "file_size": 20206, "line_count": 579, "functions": [ { "name": "__post_init__", "line_start": 63, "line_end": 71, "args": [ { "name": "self" } ], "return_type": null, "docstring": "初始化后处理", "is_async": false, "decorators": [], "code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))", "code_hash": "e3b7d9c15591b8f58f4f499fcf8ecac8" }, { "name": "to_dict", "line_start": 73, "line_end": 85, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "转换为字典格式", "is_async": false, "decorators": [], "code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }", "code_hash": "9aec5f6566c91613207a14430fd3a65f" }, { "name": "get_summary", "line_start": 87, "line_end": 99, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "获取文档摘要信息", "is_async": false, "decorators": [], "code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }", "code_hash": "3b4aeb4ce342f694cf6f50ec911f85a4" }, { "name": "__init__", "line_start": 123, "line_end": 146, "args": [ { "name": "self" }, { "name": "config", "type_hint": "DocumentProcessingConfig" } ], "return_type": null, "docstring": "初始化文本提取器\n\nArgs:\n config: 文档处理配置", "is_async": false, "decorators": [], "code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")", "code_hash": "b80795161396567c1a4afe0cbede261e" }, { "name": "extract_from_file", "line_start": 148, "line_end": 230, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Union[str, Path]" } ], "return_type": "ExtractedDocument", "docstring": "从文件中提取文本\n\nArgs:\n file_path: 文件路径\n \nReturns:\n ExtractedDocument: 提取的文档数据\n \nRaises:\n DocumentProcessingError: 提取失败时抛出", "is_async": false, "decorators": [], "code": " def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )", "code_hash": "ede6a7b109cac815a9ad476f06a9c1c3" }, { "name": "extract_from_directory", "line_start": 232, "line_end": 275, "args": [ { "name": "self" }, { "name": "directory_path", "type_hint": "Union[str, Path]" } ], "return_type": "List[ExtractedDocument]", "docstring": "从目录中提取所有支持的文档\n\nArgs:\n directory_path: 目录路径\n \nReturns:\n List[ExtractedDocument]: 提取的文档列表", "is_async": false, "decorators": [], "code": " def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs", "code_hash": "040078b945a9aaae7216ebb69c026e23" }, { "name": "_extract_from_pdf", "line_start": 277, "line_end": 329, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从PDF文件中提取文本", "is_async": false, "decorators": [], "code": " def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装,请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber,回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count", "code_hash": "595de0393b9c007a029197da48307407" }, { "name": "_extract_from_docx", "line_start": 331, "line_end": 365, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从Word文档中提取文本", "is_async": false, "decorators": [], "code": " def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装,请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")", "code_hash": "c2d01e8cfa286d1633d191be12390b28" }, { "name": "_extract_from_excel", "line_start": 367, "line_end": 401, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从Excel文件中提取文本", "is_async": false, "decorators": [], "code": " def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装,请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")", "code_hash": "9f91b9c5beed4befce153fadc76f5618" }, { "name": "_extract_from_text", "line_start": 403, "line_end": 431, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从文本文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")", "code_hash": "b7adae74706c1052b8607a2a0a92039f" }, { "name": "_extract_from_csv", "line_start": 433, "line_end": 466, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从CSV文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")", "code_hash": "8e55541a37dfa2e36110638c22153ebc" }, { "name": "_extract_from_json", "line_start": 468, "line_end": 488, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从JSON文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")", "code_hash": "95a6d0ec0374f263879bc25a36d92b74" }, { "name": "_extract_from_xml", "line_start": 490, "line_end": 522, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从XML文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")", "code_hash": "fdc37ab01be4414d3094d8b13150b4c2" }, { "name": "_extract_from_html", "line_start": 524, "line_end": 550, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从HTML文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取(移除标签)\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")", "code_hash": "c7a170bb6866f195eb9065aa21caf871" }, { "name": "get_supported_formats", "line_start": 552, "line_end": 554, "args": [ { "name": "self" } ], "return_type": "Dict[str, str]", "docstring": "获取支持的文件格式", "is_async": false, "decorators": [], "code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()", "code_hash": "b190b58a832baa0f901258b0ed23a4e2" }, { "name": "is_supported_format", "line_start": 556, "line_end": 559, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Union[str, Path]" } ], "return_type": "bool", "docstring": "检查文件格式是否支持", "is_async": false, "decorators": [], "code": " def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS", "code_hash": "d104465d277e576d465debd1d495da97" }, { "name": "get_extraction_stats", "line_start": 561, "line_end": 580, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "获取提取器统计信息", "is_async": false, "decorators": [], "code": " def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ", "code_hash": "2c261c6f8ea522a7987b3c28e03a0807" }, { "name": "extract_text", "line_start": 500, "line_end": 509, "args": [ { "name": "element" }, { "name": "level" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)", "code_hash": "17aee15b3d955d5237e6253e3c4f6f5b" } ], "classes": [ { "name": "ExtractedDocument", "line_start": 51, "line_end": 99, "bases": [], "methods": [ { "name": "__post_init__", "line_start": 63, "line_end": 71, "args": [ { "name": "self" } ], "return_type": null, "docstring": "初始化后处理", "is_async": false, "decorators": [], "code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))", "code_hash": "e3b7d9c15591b8f58f4f499fcf8ecac8" }, { "name": "to_dict", "line_start": 73, "line_end": 85, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "转换为字典格式", "is_async": false, "decorators": [], "code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }", "code_hash": "9aec5f6566c91613207a14430fd3a65f" }, { "name": "get_summary", "line_start": 87, "line_end": 99, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "获取文档摘要信息", "is_async": false, "decorators": [], "code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }", "code_hash": "3b4aeb4ce342f694cf6f50ec911f85a4" } ], "docstring": "提取的文档数据", "decorators": [ "dataclass" ], "code": "class ExtractedDocument:\n \"\"\"提取的文档数据\"\"\"\n filename: str\n file_type: str\n content: str # 纯文本内容\n metadata: Dict[str, Any] # 文档元数据\n extracted_at: datetime\n file_size: int\n page_count: Optional[int] = None\n extraction_method: Optional[str] = None\n error_info: Optional[str] = None\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }\n \n def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }", "code_hash": "48e79f6e59935d971c73087b22b67bad" }, { "name": "TextExtractor", "line_start": 102, "line_end": 580, "bases": [], "methods": [ { "name": "__init__", "line_start": 123, "line_end": 146, "args": [ { "name": "self" }, { "name": "config", "type_hint": "DocumentProcessingConfig" } ], "return_type": null, "docstring": "初始化文本提取器\n\nArgs:\n config: 文档处理配置", "is_async": false, "decorators": [], "code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")", "code_hash": "b80795161396567c1a4afe0cbede261e" }, { "name": "extract_from_file", "line_start": 148, "line_end": 230, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Union[str, Path]" } ], "return_type": "ExtractedDocument", "docstring": "从文件中提取文本\n\nArgs:\n file_path: 文件路径\n \nReturns:\n ExtractedDocument: 提取的文档数据\n \nRaises:\n DocumentProcessingError: 提取失败时抛出", "is_async": false, "decorators": [], "code": " def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )", "code_hash": "ede6a7b109cac815a9ad476f06a9c1c3" }, { "name": "extract_from_directory", "line_start": 232, "line_end": 275, "args": [ { "name": "self" }, { "name": "directory_path", "type_hint": "Union[str, Path]" } ], "return_type": "List[ExtractedDocument]", "docstring": "从目录中提取所有支持的文档\n\nArgs:\n directory_path: 目录路径\n \nReturns:\n List[ExtractedDocument]: 提取的文档列表", "is_async": false, "decorators": [], "code": " def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs", "code_hash": "040078b945a9aaae7216ebb69c026e23" }, { "name": "_extract_from_pdf", "line_start": 277, "line_end": 329, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从PDF文件中提取文本", "is_async": false, "decorators": [], "code": " def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装,请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber,回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count", "code_hash": "595de0393b9c007a029197da48307407" }, { "name": "_extract_from_docx", "line_start": 331, "line_end": 365, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从Word文档中提取文本", "is_async": false, "decorators": [], "code": " def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装,请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")", "code_hash": "c2d01e8cfa286d1633d191be12390b28" }, { "name": "_extract_from_excel", "line_start": 367, "line_end": 401, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从Excel文件中提取文本", "is_async": false, "decorators": [], "code": " def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装,请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")", "code_hash": "9f91b9c5beed4befce153fadc76f5618" }, { "name": "_extract_from_text", "line_start": 403, "line_end": 431, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从文本文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")", "code_hash": "b7adae74706c1052b8607a2a0a92039f" }, { "name": "_extract_from_csv", "line_start": 433, "line_end": 466, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从CSV文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")", "code_hash": "8e55541a37dfa2e36110638c22153ebc" }, { "name": "_extract_from_json", "line_start": 468, "line_end": 488, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从JSON文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")", "code_hash": "95a6d0ec0374f263879bc25a36d92b74" }, { "name": "_extract_from_xml", "line_start": 490, "line_end": 522, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从XML文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")", "code_hash": "fdc37ab01be4414d3094d8b13150b4c2" }, { "name": "_extract_from_html", "line_start": 524, "line_end": 550, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any], Optional[int]]", "docstring": "从HTML文件中提取内容", "is_async": false, "decorators": [], "code": " def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取(移除标签)\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")", "code_hash": "c7a170bb6866f195eb9065aa21caf871" }, { "name": "get_supported_formats", "line_start": 552, "line_end": 554, "args": [ { "name": "self" } ], "return_type": "Dict[str, str]", "docstring": "获取支持的文件格式", "is_async": false, "decorators": [], "code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()", "code_hash": "b190b58a832baa0f901258b0ed23a4e2" }, { "name": "is_supported_format", "line_start": 556, "line_end": 559, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Union[str, Path]" } ], "return_type": "bool", "docstring": "检查文件格式是否支持", "is_async": false, "decorators": [], "code": " def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS", "code_hash": "d104465d277e576d465debd1d495da97" }, { "name": "get_extraction_stats", "line_start": 561, "line_end": 580, "args": [ { "name": "self" } ], "return_type": "Dict[str, Any]", "docstring": "获取提取器统计信息", "is_async": false, "decorators": [], "code": " def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ", "code_hash": "2c261c6f8ea522a7987b3c28e03a0807" } ], "docstring": "文本提取器 - 重构版本\n支持从多种文档格式中提取文本内容", "decorators": [], "code": "class TextExtractor:\n \"\"\"\n 文本提取器 - 重构版本\n 支持从多种文档格式中提取文本内容\n \"\"\"\n \n SUPPORTED_EXTENSIONS = {\n '.pdf': 'PDF Document',\n '.docx': 'Microsoft Word Document',\n '.doc': 'Microsoft Word Document (Legacy)',\n '.xlsx': 'Microsoft Excel Spreadsheet',\n '.xls': 'Microsoft Excel Spreadsheet (Legacy)',\n '.txt': 'Plain Text',\n '.md': 'Markdown',\n '.csv': 'Comma Separated Values',\n '.json': 'JSON Document',\n '.xml': 'XML Document',\n '.html': 'HTML Document',\n '.htm': 'HTML Document'\n }\n \n def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")\n \n def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )\n \n def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs\n \n def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装,请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber,回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count\n \n def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装,请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")\n \n def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装,请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")\n \n def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")\n \n def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")\n \n def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")\n \n def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")\n \n def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取(移除标签)\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")\n \n def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()\n \n def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS\n \n def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ", "code_hash": "e17c94489b3b534c4c770787252fda8f" } ], "imports": [ { "type": "import", "modules": [ "os" ], "aliases": [] }, { "type": "import", "modules": [ "logging" ], "aliases": [] }, { "type": "import", "modules": [ "mimetypes" ], "aliases": [] }, { "type": "from_import", "module": "typing", "names": [ "List", "Dict", "Any", "Optional", "Union" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "pathlib", "names": [ "Path" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "dataclasses", "names": [ "dataclass" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "datetime", "names": [ "datetime" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "config", "names": [ "DocumentProcessingConfig" ], "aliases": [], "level": 2 }, { "type": "from_import", "module": "exceptions", "names": [ "DocumentProcessingError", "ResourceNotFoundError" ], "aliases": [], "level": 2 }, { "type": "import", "modules": [ "PyPDF2" ], "aliases": [] }, { "type": "import", "modules": [ "pdfplumber" ], "aliases": [] }, { "type": "from_import", "module": "docx", "names": [ "Document" ], "aliases": [], "level": 0 }, { "type": "import", "modules": [ "openpyxl" ], "aliases": [] }, { "type": "from_import", "module": "openpyxl", "names": [ "load_workbook" ], "aliases": [], "level": 0 }, { "type": "import", "modules": [ "pandas" ], "aliases": [ "pd" ] }, { "type": "import", "modules": [ "json" ], "aliases": [] }, { "type": "import", "modules": [ "xml.etree.ElementTree" ], "aliases": [ "ET" ] }, { "type": "import", "modules": [ "re" ], "aliases": [] }, { "type": "import", "modules": [ "csv" ], "aliases": [] } ], "constants": [ { "name": "PDF_AVAILABLE", "value": true, "type": "bool", "line": 21 }, { "name": "DOCX_AVAILABLE", "value": true, "type": "bool", "line": 27 }, { "name": "EXCEL_AVAILABLE", "value": true, "type": "bool", "line": 34 }, { "name": "PANDAS_AVAILABLE", "value": true, "type": "bool", "line": 40 }, { "name": "SUPPORTED_EXTENSIONS", "value": { ".pdf": "PDF Document", ".docx": "Microsoft Word Document", ".doc": "Microsoft Word Document (Legacy)", ".xlsx": "Microsoft Excel Spreadsheet", ".xls": "Microsoft Excel Spreadsheet (Legacy)", ".txt": "Plain Text", ".md": "Markdown", ".csv": "Comma Separated Values", ".json": "JSON Document", ".xml": "XML Document", ".html": "HTML Document", ".htm": "HTML Document" }, "type": "dict", "line": 108 }, { "name": "PDF_AVAILABLE", "value": false, "type": "bool", "line": 23 }, { "name": "DOCX_AVAILABLE", "value": false, "type": "bool", "line": 29 }, { "name": "EXCEL_AVAILABLE", "value": false, "type": "bool", "line": 36 }, { "name": "PANDAS_AVAILABLE", "value": false, "type": "bool", "line": 42 } ], "docstring": "Text Extractor\n文本提取器 - 重构版本,支持从多种格式文档中提取文本内容", "content_hash": "2e8ea1b0b3f987f64faa46d44b0f73f0" }