{
"file_path": "travel-algorithms/travel_algorithms/document_processing/text_extractor.py",
"file_size": 20206,
"line_count": 579,
"functions": [
{
"name": "__post_init__",
"line_start": 63,
"line_end": 71,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))",
"code_hash": "e3b7d9c15591b8f58f4f499fcf8ecac8"
},
{
"name": "to_dict",
"line_start": 73,
"line_end": 85,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }",
"code_hash": "9aec5f6566c91613207a14430fd3a65f"
},
{
"name": "get_summary",
"line_start": 87,
"line_end": 99,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取文档摘要信息",
"is_async": false,
"decorators": [],
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }",
"code_hash": "3b4aeb4ce342f694cf6f50ec911f85a4"
},
{
"name": "__init__",
"line_start": 123,
"line_end": 146,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "DocumentProcessingConfig"
}
],
"return_type": null,
"docstring": "初始化文本提取器\n\nArgs:\n config: 文档处理配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")",
"code_hash": "b80795161396567c1a4afe0cbede261e"
},
{
"name": "extract_from_file",
"line_start": 148,
"line_end": 230,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "ExtractedDocument",
"docstring": "从文件中提取文本\n\nArgs:\n file_path: 文件路径\n \nReturns:\n ExtractedDocument: 提取的文档数据\n \nRaises:\n DocumentProcessingError: 提取失败时抛出",
"is_async": false,
"decorators": [],
"code": " def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )",
"code_hash": "ede6a7b109cac815a9ad476f06a9c1c3"
},
{
"name": "extract_from_directory",
"line_start": 232,
"line_end": 275,
"args": [
{
"name": "self"
},
{
"name": "directory_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "从目录中提取所有支持的文档\n\nArgs:\n directory_path: 目录路径\n \nReturns:\n List[ExtractedDocument]: 提取的文档列表",
"is_async": false,
"decorators": [],
"code": " def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs",
"code_hash": "040078b945a9aaae7216ebb69c026e23"
},
{
"name": "_extract_from_pdf",
"line_start": 277,
"line_end": 329,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从PDF文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count",
"code_hash": "595de0393b9c007a029197da48307407"
},
{
"name": "_extract_from_docx",
"line_start": 331,
"line_end": 365,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Word文档中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")",
"code_hash": "c2d01e8cfa286d1633d191be12390b28"
},
{
"name": "_extract_from_excel",
"line_start": 367,
"line_end": 401,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Excel文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")",
"code_hash": "9f91b9c5beed4befce153fadc76f5618"
},
{
"name": "_extract_from_text",
"line_start": 403,
"line_end": 431,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从文本文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")",
"code_hash": "b7adae74706c1052b8607a2a0a92039f"
},
{
"name": "_extract_from_csv",
"line_start": 433,
"line_end": 466,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从CSV文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")",
"code_hash": "8e55541a37dfa2e36110638c22153ebc"
},
{
"name": "_extract_from_json",
"line_start": 468,
"line_end": 488,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从JSON文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")",
"code_hash": "95a6d0ec0374f263879bc25a36d92b74"
},
{
"name": "_extract_from_xml",
"line_start": 490,
"line_end": 522,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从XML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")",
"code_hash": "fdc37ab01be4414d3094d8b13150b4c2"
},
{
"name": "_extract_from_html",
"line_start": 524,
"line_end": 550,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从HTML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取移除标签\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'<style.*?</style>', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")",
"code_hash": "c7a170bb6866f195eb9065aa21caf871"
},
{
"name": "get_supported_formats",
"line_start": 552,
"line_end": 554,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的文件格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()",
"code_hash": "b190b58a832baa0f901258b0ed23a4e2"
},
{
"name": "is_supported_format",
"line_start": 556,
"line_end": 559,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "bool",
"docstring": "检查文件格式是否支持",
"is_async": false,
"decorators": [],
"code": " def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS",
"code_hash": "d104465d277e576d465debd1d495da97"
},
{
"name": "get_extraction_stats",
"line_start": 561,
"line_end": 580,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取提取器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ",
"code_hash": "2c261c6f8ea522a7987b3c28e03a0807"
},
{
"name": "extract_text",
"line_start": 500,
"line_end": 509,
"args": [
{
"name": "element"
},
{
"name": "level"
}
],
"return_type": null,
"docstring": "",
"is_async": false,
"decorators": [],
"code": " def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)",
"code_hash": "17aee15b3d955d5237e6253e3c4f6f5b"
}
],
"classes": [
{
"name": "ExtractedDocument",
"line_start": 51,
"line_end": 99,
"bases": [],
"methods": [
{
"name": "__post_init__",
"line_start": 63,
"line_end": 71,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))",
"code_hash": "e3b7d9c15591b8f58f4f499fcf8ecac8"
},
{
"name": "to_dict",
"line_start": 73,
"line_end": 85,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }",
"code_hash": "9aec5f6566c91613207a14430fd3a65f"
},
{
"name": "get_summary",
"line_start": 87,
"line_end": 99,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取文档摘要信息",
"is_async": false,
"decorators": [],
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }",
"code_hash": "3b4aeb4ce342f694cf6f50ec911f85a4"
}
],
"docstring": "提取的文档数据",
"decorators": [
"dataclass"
],
"code": "class ExtractedDocument:\n \"\"\"提取的文档数据\"\"\"\n filename: str\n file_type: str\n content: str # 纯文本内容\n metadata: Dict[str, Any] # 文档元数据\n extracted_at: datetime\n file_size: int\n page_count: Optional[int] = None\n extraction_method: Optional[str] = None\n error_info: Optional[str] = None\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }\n \n def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }",
"code_hash": "48e79f6e59935d971c73087b22b67bad"
},
{
"name": "TextExtractor",
"line_start": 102,
"line_end": 580,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 123,
"line_end": 146,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "DocumentProcessingConfig"
}
],
"return_type": null,
"docstring": "初始化文本提取器\n\nArgs:\n config: 文档处理配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")",
"code_hash": "b80795161396567c1a4afe0cbede261e"
},
{
"name": "extract_from_file",
"line_start": 148,
"line_end": 230,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "ExtractedDocument",
"docstring": "从文件中提取文本\n\nArgs:\n file_path: 文件路径\n \nReturns:\n ExtractedDocument: 提取的文档数据\n \nRaises:\n DocumentProcessingError: 提取失败时抛出",
"is_async": false,
"decorators": [],
"code": " def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )",
"code_hash": "ede6a7b109cac815a9ad476f06a9c1c3"
},
{
"name": "extract_from_directory",
"line_start": 232,
"line_end": 275,
"args": [
{
"name": "self"
},
{
"name": "directory_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "从目录中提取所有支持的文档\n\nArgs:\n directory_path: 目录路径\n \nReturns:\n List[ExtractedDocument]: 提取的文档列表",
"is_async": false,
"decorators": [],
"code": " def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs",
"code_hash": "040078b945a9aaae7216ebb69c026e23"
},
{
"name": "_extract_from_pdf",
"line_start": 277,
"line_end": 329,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从PDF文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count",
"code_hash": "595de0393b9c007a029197da48307407"
},
{
"name": "_extract_from_docx",
"line_start": 331,
"line_end": 365,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Word文档中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")",
"code_hash": "c2d01e8cfa286d1633d191be12390b28"
},
{
"name": "_extract_from_excel",
"line_start": 367,
"line_end": 401,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Excel文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")",
"code_hash": "9f91b9c5beed4befce153fadc76f5618"
},
{
"name": "_extract_from_text",
"line_start": 403,
"line_end": 431,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从文本文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")",
"code_hash": "b7adae74706c1052b8607a2a0a92039f"
},
{
"name": "_extract_from_csv",
"line_start": 433,
"line_end": 466,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从CSV文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")",
"code_hash": "8e55541a37dfa2e36110638c22153ebc"
},
{
"name": "_extract_from_json",
"line_start": 468,
"line_end": 488,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从JSON文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")",
"code_hash": "95a6d0ec0374f263879bc25a36d92b74"
},
{
"name": "_extract_from_xml",
"line_start": 490,
"line_end": 522,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从XML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")",
"code_hash": "fdc37ab01be4414d3094d8b13150b4c2"
},
{
"name": "_extract_from_html",
"line_start": 524,
"line_end": 550,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从HTML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取移除标签\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'<style.*?</style>', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")",
"code_hash": "c7a170bb6866f195eb9065aa21caf871"
},
{
"name": "get_supported_formats",
"line_start": 552,
"line_end": 554,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的文件格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()",
"code_hash": "b190b58a832baa0f901258b0ed23a4e2"
},
{
"name": "is_supported_format",
"line_start": 556,
"line_end": 559,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "bool",
"docstring": "检查文件格式是否支持",
"is_async": false,
"decorators": [],
"code": " def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS",
"code_hash": "d104465d277e576d465debd1d495da97"
},
{
"name": "get_extraction_stats",
"line_start": 561,
"line_end": 580,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取提取器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ",
"code_hash": "2c261c6f8ea522a7987b3c28e03a0807"
}
],
"docstring": "文本提取器 - 重构版本\n支持从多种文档格式中提取文本内容",
"decorators": [],
"code": "class TextExtractor:\n \"\"\"\n 文本提取器 - 重构版本\n 支持从多种文档格式中提取文本内容\n \"\"\"\n \n SUPPORTED_EXTENSIONS = {\n '.pdf': 'PDF Document',\n '.docx': 'Microsoft Word Document',\n '.doc': 'Microsoft Word Document (Legacy)',\n '.xlsx': 'Microsoft Excel Spreadsheet',\n '.xls': 'Microsoft Excel Spreadsheet (Legacy)',\n '.txt': 'Plain Text',\n '.md': 'Markdown',\n '.csv': 'Comma Separated Values',\n '.json': 'JSON Document',\n '.xml': 'XML Document',\n '.html': 'HTML Document',\n '.htm': 'HTML Document'\n }\n \n def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")\n \n def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )\n \n def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs\n \n def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count\n \n def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")\n \n def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")\n \n def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")\n \n def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")\n \n def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")\n \n def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")\n \n def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取移除标签\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'<style.*?</style>', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")\n \n def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()\n \n def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS\n \n def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ",
"code_hash": "e17c94489b3b534c4c770787252fda8f"
}
],
"imports": [
{
"type": "import",
"modules": [
"os"
],
"aliases": []
},
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "import",
"modules": [
"mimetypes"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"List",
"Dict",
"Any",
"Optional",
"Union"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "pathlib",
"names": [
"Path"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "dataclasses",
"names": [
"dataclass"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "datetime",
"names": [
"datetime"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "config",
"names": [
"DocumentProcessingConfig"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "exceptions",
"names": [
"DocumentProcessingError",
"ResourceNotFoundError"
],
"aliases": [],
"level": 2
},
{
"type": "import",
"modules": [
"PyPDF2"
],
"aliases": []
},
{
"type": "import",
"modules": [
"pdfplumber"
],
"aliases": []
},
{
"type": "from_import",
"module": "docx",
"names": [
"Document"
],
"aliases": [],
"level": 0
},
{
"type": "import",
"modules": [
"openpyxl"
],
"aliases": []
},
{
"type": "from_import",
"module": "openpyxl",
"names": [
"load_workbook"
],
"aliases": [],
"level": 0
},
{
"type": "import",
"modules": [
"pandas"
],
"aliases": [
"pd"
]
},
{
"type": "import",
"modules": [
"json"
],
"aliases": []
},
{
"type": "import",
"modules": [
"xml.etree.ElementTree"
],
"aliases": [
"ET"
]
},
{
"type": "import",
"modules": [
"re"
],
"aliases": []
},
{
"type": "import",
"modules": [
"csv"
],
"aliases": []
}
],
"constants": [
{
"name": "PDF_AVAILABLE",
"value": true,
"type": "bool",
"line": 21
},
{
"name": "DOCX_AVAILABLE",
"value": true,
"type": "bool",
"line": 27
},
{
"name": "EXCEL_AVAILABLE",
"value": true,
"type": "bool",
"line": 34
},
{
"name": "PANDAS_AVAILABLE",
"value": true,
"type": "bool",
"line": 40
},
{
"name": "SUPPORTED_EXTENSIONS",
"value": {
".pdf": "PDF Document",
".docx": "Microsoft Word Document",
".doc": "Microsoft Word Document (Legacy)",
".xlsx": "Microsoft Excel Spreadsheet",
".xls": "Microsoft Excel Spreadsheet (Legacy)",
".txt": "Plain Text",
".md": "Markdown",
".csv": "Comma Separated Values",
".json": "JSON Document",
".xml": "XML Document",
".html": "HTML Document",
".htm": "HTML Document"
},
"type": "dict",
"line": 108
},
{
"name": "PDF_AVAILABLE",
"value": false,
"type": "bool",
"line": 23
},
{
"name": "DOCX_AVAILABLE",
"value": false,
"type": "bool",
"line": 29
},
{
"name": "EXCEL_AVAILABLE",
"value": false,
"type": "bool",
"line": 36
},
{
"name": "PANDAS_AVAILABLE",
"value": false,
"type": "bool",
"line": 42
}
],
"docstring": "Text Extractor\n文本提取器 - 重构版本,支持从多种格式文档中提取文本内容",
"content_hash": "2e8ea1b0b3f987f64faa46d44b0f73f0"
}