673 lines
53 KiB
JSON
673 lines
53 KiB
JSON
|
|
{
|
|||
|
|
"file_path": "document/text_extractor.py",
|
|||
|
|
"file_size": 12661,
|
|||
|
|
"line_count": 355,
|
|||
|
|
"functions": [
|
|||
|
|
{
|
|||
|
|
"name": "__post_init__",
|
|||
|
|
"line_start": 50,
|
|||
|
|
"line_end": 53,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": null,
|
|||
|
|
"docstring": "",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def __post_init__(self):\n # 确保content是字符串\n if not isinstance(self.content, str):\n self.content = str(self.content)",
|
|||
|
|
"code_hash": "e461fc43783cbe62b22d1a9349785191"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "__init__",
|
|||
|
|
"line_start": 58,
|
|||
|
|
"line_end": 69,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "config",
|
|||
|
|
"type_hint": "Optional[Dict[str, Any]]"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": null,
|
|||
|
|
"docstring": "",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def __init__(self, config: Optional[Dict[str, Any]] = None):\n self.config = config or {}\n self.supported_formats = {\n '.pdf': self._extract_pdf,\n '.docx': self._extract_docx,\n '.doc': self._extract_doc,\n '.txt': self._extract_txt,\n '.md': self._extract_txt,\n '.xlsx': self._extract_xlsx,\n '.xls': self._extract_xls,\n '.csv': self._extract_csv\n }",
|
|||
|
|
"code_hash": "4477c71180d28cd290e553df70152277"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "extract",
|
|||
|
|
"line_start": 71,
|
|||
|
|
"line_end": 102,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "str"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "ExtractedDocument",
|
|||
|
|
"docstring": "提取单个文件的文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def extract(self, file_path: str) -> ExtractedDocument:\n \"\"\"提取单个文件的文本内容\"\"\"\n path_obj = Path(file_path)\n \n if not path_obj.exists():\n raise FileNotFoundError(f\"文件不存在: {file_path}\")\n \n file_ext = path_obj.suffix.lower()\n if file_ext not in self.supported_formats:\n raise ValueError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n # 获取文件信息\n file_size = path_obj.stat().st_size\n \n # 提取文本内容\n extractor = self.supported_formats[file_ext]\n content, metadata = extractor(path_obj)\n \n return ExtractedDocument(\n filename=path_obj.name,\n file_type=file_ext,\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=metadata.get('page_count')\n )\n \n except Exception as e:\n logger.error(f\"提取文件 {file_path} 时出错: {str(e)}\")\n raise",
|
|||
|
|
"code_hash": "5436f5f2cacc8d554f042f4bf89de554"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "extract_batch",
|
|||
|
|
"line_start": 104,
|
|||
|
|
"line_end": 126,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_paths",
|
|||
|
|
"type_hint": "List[str]"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "List[ExtractedDocument]",
|
|||
|
|
"docstring": "批量提取多个文件的文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:\n \"\"\"批量提取多个文件的文本内容\"\"\"\n results = []\n \n for file_path in file_paths:\n try:\n result = self.extract(file_path)\n results.append(result)\n logger.info(f\"成功提取文件: {file_path}\")\n except Exception as e:\n logger.error(f\"提取文件 {file_path} 失败: {str(e)}\")\n # 创建错误记录\n error_doc = ExtractedDocument(\n filename=Path(file_path).name,\n file_type=Path(file_path).suffix.lower(),\n content=f\"提取失败: {str(e)}\",\n metadata={\"error\": str(e)},\n extracted_at=datetime.now(),\n file_size=0\n )\n results.append(error_doc)\n \n return results",
|
|||
|
|
"code_hash": "f800214463cb7786ec16c690d19d451e"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_pdf",
|
|||
|
|
"line_start": 128,
|
|||
|
|
"line_end": 184,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取PDF文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取PDF文件的纯文本内容\"\"\"\n if not PDF_AVAILABLE:\n raise ImportError(\"需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber\")\n \n content_parts = []\n metadata = {}\n \n try:\n # 使用pdfplumber提取文本(更好的文本提取)\n with pdfplumber.open(file_path) as pdf:\n metadata['page_count'] = len(pdf.pages)\n \n for page_num, page in enumerate(pdf.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf.metadata:\n metadata.update({\n 'title': pdf.metadata.get('Title', ''),\n 'author': pdf.metadata.get('Author', ''),\n 'subject': pdf.metadata.get('Subject', ''),\n 'creator': pdf.metadata.get('Creator', ''),\n 'producer': pdf.metadata.get('Producer', ''),\n 'creation_date': pdf.metadata.get('CreationDate', ''),\n 'modification_date': pdf.metadata.get('ModDate', '')\n })\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试使用PyPDF2: {str(e)}\")\n \n # 备用方案:使用PyPDF2\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n metadata['page_count'] = len(pdf_reader.pages)\n \n for page_num, page in enumerate(pdf_reader.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf_reader.metadata:\n metadata.update({\n 'title': pdf_reader.metadata.get('/Title', ''),\n 'author': pdf_reader.metadata.get('/Author', ''),\n 'subject': pdf_reader.metadata.get('/Subject', ''),\n 'creator': pdf_reader.metadata.get('/Creator', ''),\n 'producer': pdf_reader.metadata.get('/Producer', ''),\n 'creation_date': pdf_reader.metadata.get('/CreationDate', ''),\n 'modification_date': pdf_reader.metadata.get('/ModDate', '')\n })\n \n content = '\\n'.join(content_parts) if content_parts else \"\"\n return content, metadata",
|
|||
|
|
"code_hash": "92ff0fefec17a73a1c8224b15b5c70ea"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_docx",
|
|||
|
|
"line_start": 186,
|
|||
|
|
"line_end": 227,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取DOCX文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOCX文件的纯文本内容\"\"\"\n if not DOCX_AVAILABLE:\n raise ImportError(\"需要安装 python-docx: pip install python-docx\")\n \n doc = Document(str(file_path))\n content_parts = []\n metadata = {}\n \n # 提取所有段落文本\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content_parts.append(paragraph.text)\n \n # 提取表格内容\n for table in doc.tables:\n table_content = []\n for row in table.rows:\n row_content = []\n for cell in row.cells:\n row_content.append(cell.text.strip())\n table_content.append('\\t'.join(row_content))\n if table_content:\n content_parts.append('\\n=== 表格 ===\\n' + '\\n'.join(table_content) + '\\n')\n \n # 获取文档属性\n core_props = doc.core_properties\n metadata.update({\n 'title': core_props.title or '',\n 'author': core_props.author or '',\n 'subject': core_props.subject or '',\n 'keywords': core_props.keywords or '',\n 'comments': core_props.comments or '',\n 'created': str(core_props.created) if core_props.created else '',\n 'modified': str(core_props.modified) if core_props.modified else '',\n 'last_modified_by': core_props.last_modified_by or '',\n 'paragraph_count': len(doc.paragraphs),\n 'table_count': len(doc.tables)\n })\n \n content = '\\n'.join(content_parts)\n return content, metadata",
|
|||
|
|
"code_hash": "c7b53be194f6c57266ac47d0f4191b08"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_doc",
|
|||
|
|
"line_start": 229,
|
|||
|
|
"line_end": 243,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取DOC文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOC文件的纯文本内容\"\"\"\n # DOC格式较复杂,建议转换为DOCX或使用专门的库\n logger.warning(\"DOC格式支持有限,建议转换为DOCX格式\")\n \n # 尝试读取为文本文件\n try:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n except:\n with open(file_path, 'r', encoding='gbk', errors='ignore') as file:\n content = file.read()\n \n metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}\n return content, metadata",
|
|||
|
|
"code_hash": "a9f6fc06bae7709495db7817d022c1ca"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_txt",
|
|||
|
|
"line_start": 245,
|
|||
|
|
"line_end": 272,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取TXT/MD文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取TXT/MD文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n # 最后尝试忽略错误\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines()),\n 'char_count': len(content)\n }\n \n return content, metadata",
|
|||
|
|
"code_hash": "a531b26825b63c4db57e234dc159a0d7"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_xlsx",
|
|||
|
|
"line_start": 274,
|
|||
|
|
"line_end": 301,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取XLSX文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLSX文件的纯文本内容\"\"\"\n if not EXCEL_AVAILABLE:\n raise ImportError(\"需要安装 openpyxl: pip install openpyxl\")\n \n workbook = load_workbook(file_path, read_only=True)\n content_parts = []\n metadata = {\n 'sheet_count': len(workbook.sheetnames),\n 'sheet_names': workbook.sheetnames\n }\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content_parts.append(f\"\\n=== 工作表: {sheet_name} ===\\n\")\n \n for row in sheet.iter_rows(values_only=True):\n row_content = []\n for cell in row:\n if cell is not None:\n row_content.append(str(cell))\n else:\n row_content.append(\"\")\n if any(cell.strip() for cell in row_content): # 跳过空行\n content_parts.append('\\t'.join(row_content))\n \n content = '\\n'.join(content_parts)\n return content, metadata",
|
|||
|
|
"code_hash": "4b90a3f26bbb6b4e5da3fbd78a7ea278"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_xls",
|
|||
|
|
"line_start": 303,
|
|||
|
|
"line_end": 315,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取XLS文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLS文件的纯文本内容\"\"\"\n logger.warning(\"XLS格式支持有限,建议转换为XLSX格式\")\n \n # 简单的文本提取\n try:\n with open(file_path, 'rb') as file:\n content = file.read().decode('utf-8', errors='ignore')\n except:\n content = f\"无法读取XLS文件: {file_path}\"\n \n metadata = {'format': 'xls', 'note': '可能存在格式问题'}\n return content, metadata",
|
|||
|
|
"code_hash": "4a81d1dc3865abd2faa49705e5e8f2e9"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_csv",
|
|||
|
|
"line_start": 317,
|
|||
|
|
"line_end": 348,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取CSV文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取CSV文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n # 计算行数和列数\n lines = content.splitlines()\n row_count = len(lines)\n col_count = len(lines[0].split(',')) if lines else 0\n \n metadata = {\n 'encoding': used_encoding,\n 'row_count': row_count,\n 'estimated_col_count': col_count\n }\n \n return content, metadata",
|
|||
|
|
"code_hash": "d34952e59e05f67baf52a96ce6f99d9d"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "get_supported_formats",
|
|||
|
|
"line_start": 350,
|
|||
|
|
"line_end": 352,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "List[str]",
|
|||
|
|
"docstring": "获取支持的文件格式列表",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def get_supported_formats(self) -> List[str]:\n \"\"\"获取支持的文件格式列表\"\"\"\n return list(self.supported_formats.keys())",
|
|||
|
|
"code_hash": "657931bc9cac8245425c24c3fb872456"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "is_supported",
|
|||
|
|
"line_start": 354,
|
|||
|
|
"line_end": 356,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "str"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "bool",
|
|||
|
|
"docstring": "检查文件格式是否支持",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def is_supported(self, file_path: str) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n return Path(file_path).suffix.lower() in self.supported_formats ",
|
|||
|
|
"code_hash": "5b7ac2ce0b4a7b9d115bef46ab04c2dd"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"classes": [
|
|||
|
|
{
|
|||
|
|
"name": "ExtractedDocument",
|
|||
|
|
"line_start": 40,
|
|||
|
|
"line_end": 53,
|
|||
|
|
"bases": [],
|
|||
|
|
"methods": [
|
|||
|
|
{
|
|||
|
|
"name": "__post_init__",
|
|||
|
|
"line_start": 50,
|
|||
|
|
"line_end": 53,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": null,
|
|||
|
|
"docstring": "",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def __post_init__(self):\n # 确保content是字符串\n if not isinstance(self.content, str):\n self.content = str(self.content)",
|
|||
|
|
"code_hash": "e461fc43783cbe62b22d1a9349785191"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"docstring": "提取的文档数据",
|
|||
|
|
"decorators": [
|
|||
|
|
"dataclass"
|
|||
|
|
],
|
|||
|
|
"code": "class ExtractedDocument:\n \"\"\"提取的文档数据\"\"\"\n filename: str\n file_type: str\n content: str # 纯文本内容\n metadata: Dict[str, Any] # 文档元数据\n extracted_at: datetime\n file_size: int\n page_count: Optional[int] = None\n \n def __post_init__(self):\n # 确保content是字符串\n if not isinstance(self.content, str):\n self.content = str(self.content)",
|
|||
|
|
"code_hash": "8b9d1fc812b3ea4aebc4e64cf53eaba6"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "TextExtractor",
|
|||
|
|
"line_start": 55,
|
|||
|
|
"line_end": 356,
|
|||
|
|
"bases": [],
|
|||
|
|
"methods": [
|
|||
|
|
{
|
|||
|
|
"name": "__init__",
|
|||
|
|
"line_start": 58,
|
|||
|
|
"line_end": 69,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "config",
|
|||
|
|
"type_hint": "Optional[Dict[str, Any]]"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": null,
|
|||
|
|
"docstring": "",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def __init__(self, config: Optional[Dict[str, Any]] = None):\n self.config = config or {}\n self.supported_formats = {\n '.pdf': self._extract_pdf,\n '.docx': self._extract_docx,\n '.doc': self._extract_doc,\n '.txt': self._extract_txt,\n '.md': self._extract_txt,\n '.xlsx': self._extract_xlsx,\n '.xls': self._extract_xls,\n '.csv': self._extract_csv\n }",
|
|||
|
|
"code_hash": "4477c71180d28cd290e553df70152277"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "extract",
|
|||
|
|
"line_start": 71,
|
|||
|
|
"line_end": 102,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "str"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "ExtractedDocument",
|
|||
|
|
"docstring": "提取单个文件的文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def extract(self, file_path: str) -> ExtractedDocument:\n \"\"\"提取单个文件的文本内容\"\"\"\n path_obj = Path(file_path)\n \n if not path_obj.exists():\n raise FileNotFoundError(f\"文件不存在: {file_path}\")\n \n file_ext = path_obj.suffix.lower()\n if file_ext not in self.supported_formats:\n raise ValueError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n # 获取文件信息\n file_size = path_obj.stat().st_size\n \n # 提取文本内容\n extractor = self.supported_formats[file_ext]\n content, metadata = extractor(path_obj)\n \n return ExtractedDocument(\n filename=path_obj.name,\n file_type=file_ext,\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=metadata.get('page_count')\n )\n \n except Exception as e:\n logger.error(f\"提取文件 {file_path} 时出错: {str(e)}\")\n raise",
|
|||
|
|
"code_hash": "5436f5f2cacc8d554f042f4bf89de554"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "extract_batch",
|
|||
|
|
"line_start": 104,
|
|||
|
|
"line_end": 126,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_paths",
|
|||
|
|
"type_hint": "List[str]"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "List[ExtractedDocument]",
|
|||
|
|
"docstring": "批量提取多个文件的文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:\n \"\"\"批量提取多个文件的文本内容\"\"\"\n results = []\n \n for file_path in file_paths:\n try:\n result = self.extract(file_path)\n results.append(result)\n logger.info(f\"成功提取文件: {file_path}\")\n except Exception as e:\n logger.error(f\"提取文件 {file_path} 失败: {str(e)}\")\n # 创建错误记录\n error_doc = ExtractedDocument(\n filename=Path(file_path).name,\n file_type=Path(file_path).suffix.lower(),\n content=f\"提取失败: {str(e)}\",\n metadata={\"error\": str(e)},\n extracted_at=datetime.now(),\n file_size=0\n )\n results.append(error_doc)\n \n return results",
|
|||
|
|
"code_hash": "f800214463cb7786ec16c690d19d451e"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_pdf",
|
|||
|
|
"line_start": 128,
|
|||
|
|
"line_end": 184,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取PDF文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取PDF文件的纯文本内容\"\"\"\n if not PDF_AVAILABLE:\n raise ImportError(\"需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber\")\n \n content_parts = []\n metadata = {}\n \n try:\n # 使用pdfplumber提取文本(更好的文本提取)\n with pdfplumber.open(file_path) as pdf:\n metadata['page_count'] = len(pdf.pages)\n \n for page_num, page in enumerate(pdf.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf.metadata:\n metadata.update({\n 'title': pdf.metadata.get('Title', ''),\n 'author': pdf.metadata.get('Author', ''),\n 'subject': pdf.metadata.get('Subject', ''),\n 'creator': pdf.metadata.get('Creator', ''),\n 'producer': pdf.metadata.get('Producer', ''),\n 'creation_date': pdf.metadata.get('CreationDate', ''),\n 'modification_date': pdf.metadata.get('ModDate', '')\n })\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试使用PyPDF2: {str(e)}\")\n \n # 备用方案:使用PyPDF2\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n metadata['page_count'] = len(pdf_reader.pages)\n \n for page_num, page in enumerate(pdf_reader.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf_reader.metadata:\n metadata.update({\n 'title': pdf_reader.metadata.get('/Title', ''),\n 'author': pdf_reader.metadata.get('/Author', ''),\n 'subject': pdf_reader.metadata.get('/Subject', ''),\n 'creator': pdf_reader.metadata.get('/Creator', ''),\n 'producer': pdf_reader.metadata.get('/Producer', ''),\n 'creation_date': pdf_reader.metadata.get('/CreationDate', ''),\n 'modification_date': pdf_reader.metadata.get('/ModDate', '')\n })\n \n content = '\\n'.join(content_parts) if content_parts else \"\"\n return content, metadata",
|
|||
|
|
"code_hash": "92ff0fefec17a73a1c8224b15b5c70ea"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_docx",
|
|||
|
|
"line_start": 186,
|
|||
|
|
"line_end": 227,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取DOCX文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOCX文件的纯文本内容\"\"\"\n if not DOCX_AVAILABLE:\n raise ImportError(\"需要安装 python-docx: pip install python-docx\")\n \n doc = Document(str(file_path))\n content_parts = []\n metadata = {}\n \n # 提取所有段落文本\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content_parts.append(paragraph.text)\n \n # 提取表格内容\n for table in doc.tables:\n table_content = []\n for row in table.rows:\n row_content = []\n for cell in row.cells:\n row_content.append(cell.text.strip())\n table_content.append('\\t'.join(row_content))\n if table_content:\n content_parts.append('\\n=== 表格 ===\\n' + '\\n'.join(table_content) + '\\n')\n \n # 获取文档属性\n core_props = doc.core_properties\n metadata.update({\n 'title': core_props.title or '',\n 'author': core_props.author or '',\n 'subject': core_props.subject or '',\n 'keywords': core_props.keywords or '',\n 'comments': core_props.comments or '',\n 'created': str(core_props.created) if core_props.created else '',\n 'modified': str(core_props.modified) if core_props.modified else '',\n 'last_modified_by': core_props.last_modified_by or '',\n 'paragraph_count': len(doc.paragraphs),\n 'table_count': len(doc.tables)\n })\n \n content = '\\n'.join(content_parts)\n return content, metadata",
|
|||
|
|
"code_hash": "c7b53be194f6c57266ac47d0f4191b08"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_doc",
|
|||
|
|
"line_start": 229,
|
|||
|
|
"line_end": 243,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取DOC文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOC文件的纯文本内容\"\"\"\n # DOC格式较复杂,建议转换为DOCX或使用专门的库\n logger.warning(\"DOC格式支持有限,建议转换为DOCX格式\")\n \n # 尝试读取为文本文件\n try:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n except:\n with open(file_path, 'r', encoding='gbk', errors='ignore') as file:\n content = file.read()\n \n metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}\n return content, metadata",
|
|||
|
|
"code_hash": "a9f6fc06bae7709495db7817d022c1ca"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_txt",
|
|||
|
|
"line_start": 245,
|
|||
|
|
"line_end": 272,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取TXT/MD文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取TXT/MD文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n # 最后尝试忽略错误\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines()),\n 'char_count': len(content)\n }\n \n return content, metadata",
|
|||
|
|
"code_hash": "a531b26825b63c4db57e234dc159a0d7"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_xlsx",
|
|||
|
|
"line_start": 274,
|
|||
|
|
"line_end": 301,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取XLSX文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLSX文件的纯文本内容\"\"\"\n if not EXCEL_AVAILABLE:\n raise ImportError(\"需要安装 openpyxl: pip install openpyxl\")\n \n workbook = load_workbook(file_path, read_only=True)\n content_parts = []\n metadata = {\n 'sheet_count': len(workbook.sheetnames),\n 'sheet_names': workbook.sheetnames\n }\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content_parts.append(f\"\\n=== 工作表: {sheet_name} ===\\n\")\n \n for row in sheet.iter_rows(values_only=True):\n row_content = []\n for cell in row:\n if cell is not None:\n row_content.append(str(cell))\n else:\n row_content.append(\"\")\n if any(cell.strip() for cell in row_content): # 跳过空行\n content_parts.append('\\t'.join(row_content))\n \n content = '\\n'.join(content_parts)\n return content, metadata",
|
|||
|
|
"code_hash": "4b90a3f26bbb6b4e5da3fbd78a7ea278"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_xls",
|
|||
|
|
"line_start": 303,
|
|||
|
|
"line_end": 315,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取XLS文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLS文件的纯文本内容\"\"\"\n logger.warning(\"XLS格式支持有限,建议转换为XLSX格式\")\n \n # 简单的文本提取\n try:\n with open(file_path, 'rb') as file:\n content = file.read().decode('utf-8', errors='ignore')\n except:\n content = f\"无法读取XLS文件: {file_path}\"\n \n metadata = {'format': 'xls', 'note': '可能存在格式问题'}\n return content, metadata",
|
|||
|
|
"code_hash": "4a81d1dc3865abd2faa49705e5e8f2e9"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "_extract_csv",
|
|||
|
|
"line_start": 317,
|
|||
|
|
"line_end": 348,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "Path"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "tuple[str, Dict[str, Any]]",
|
|||
|
|
"docstring": "提取CSV文件的纯文本内容",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取CSV文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n # 计算行数和列数\n lines = content.splitlines()\n row_count = len(lines)\n col_count = len(lines[0].split(',')) if lines else 0\n \n metadata = {\n 'encoding': used_encoding,\n 'row_count': row_count,\n 'estimated_col_count': col_count\n }\n \n return content, metadata",
|
|||
|
|
"code_hash": "d34952e59e05f67baf52a96ce6f99d9d"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "get_supported_formats",
|
|||
|
|
"line_start": 350,
|
|||
|
|
"line_end": 352,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "List[str]",
|
|||
|
|
"docstring": "获取支持的文件格式列表",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def get_supported_formats(self) -> List[str]:\n \"\"\"获取支持的文件格式列表\"\"\"\n return list(self.supported_formats.keys())",
|
|||
|
|
"code_hash": "657931bc9cac8245425c24c3fb872456"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "is_supported",
|
|||
|
|
"line_start": 354,
|
|||
|
|
"line_end": 356,
|
|||
|
|
"args": [
|
|||
|
|
{
|
|||
|
|
"name": "self"
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "file_path",
|
|||
|
|
"type_hint": "str"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"return_type": "bool",
|
|||
|
|
"docstring": "检查文件格式是否支持",
|
|||
|
|
"is_async": false,
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": " def is_supported(self, file_path: str) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n return Path(file_path).suffix.lower() in self.supported_formats ",
|
|||
|
|
"code_hash": "5b7ac2ce0b4a7b9d115bef46ab04c2dd"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"docstring": "文本提取器 - 只做纯文本提取,保留所有原始内容",
|
|||
|
|
"decorators": [],
|
|||
|
|
"code": "class TextExtractor:\n \"\"\"文本提取器 - 只做纯文本提取,保留所有原始内容\"\"\"\n \n def __init__(self, config: Optional[Dict[str, Any]] = None):\n self.config = config or {}\n self.supported_formats = {\n '.pdf': self._extract_pdf,\n '.docx': self._extract_docx,\n '.doc': self._extract_doc,\n '.txt': self._extract_txt,\n '.md': self._extract_txt,\n '.xlsx': self._extract_xlsx,\n '.xls': self._extract_xls,\n '.csv': self._extract_csv\n }\n \n def extract(self, file_path: str) -> ExtractedDocument:\n \"\"\"提取单个文件的文本内容\"\"\"\n path_obj = Path(file_path)\n \n if not path_obj.exists():\n raise FileNotFoundError(f\"文件不存在: {file_path}\")\n \n file_ext = path_obj.suffix.lower()\n if file_ext not in self.supported_formats:\n raise ValueError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n # 获取文件信息\n file_size = path_obj.stat().st_size\n \n # 提取文本内容\n extractor = self.supported_formats[file_ext]\n content, metadata = extractor(path_obj)\n \n return ExtractedDocument(\n filename=path_obj.name,\n file_type=file_ext,\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=metadata.get('page_count')\n )\n \n except Exception as e:\n logger.error(f\"提取文件 {file_path} 时出错: {str(e)}\")\n raise\n \n def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:\n \"\"\"批量提取多个文件的文本内容\"\"\"\n results = []\n \n for file_path in file_paths:\n try:\n result = self.extract(file_path)\n results.append(result)\n logger.info(f\"成功提取文件: {file_path}\")\n except Exception as e:\n logger.error(f\"提取文件 {file_path} 失败: {str(e)}\")\n # 创建错误记录\n error_doc = ExtractedDocument(\n filename=Path(file_path).name,\n file_type=Path(file_path).suffix.lower(),\n content=f\"提取失败: {str(e)}\",\n metadata={\"error\": str(e)},\n extracted_at=datetime.now(),\n file_size=0\n )\n results.append(error_doc)\n \n return results\n \n def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取PDF文件的纯文本内容\"\"\"\n if not PDF_AVAILABLE:\n raise ImportError(\"需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber\")\n \n content_parts = []\n metadata = {}\n \n try:\n # 使用pdfplumber提取文本(更好的文本提取)\n with pdfplumber.open(file_path) as pdf:\n metadata['page_count'] = len(pdf.pages)\n \n for page_num, page in enumerate(pdf.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf.metadata:\n metadata.update({\n 'title': pdf.metadata.get('Title', ''),\n 'author': pdf.metadata.get('Author', ''),\n 'subject': pdf.metadata.get('Subject', ''),\n 'creator': pdf.metadata.get('Creator', ''),\n 'producer': p
|
|||
|
|
"code_hash": "37d892d5ebf6d743d758c52421e08e2b"
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"imports": [
|
|||
|
|
{
|
|||
|
|
"type": "import",
|
|||
|
|
"modules": [
|
|||
|
|
"os"
|
|||
|
|
],
|
|||
|
|
"aliases": []
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "import",
|
|||
|
|
"modules": [
|
|||
|
|
"logging"
|
|||
|
|
],
|
|||
|
|
"aliases": []
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "from_import",
|
|||
|
|
"module": "typing",
|
|||
|
|
"names": [
|
|||
|
|
"List",
|
|||
|
|
"Dict",
|
|||
|
|
"Any",
|
|||
|
|
"Optional"
|
|||
|
|
],
|
|||
|
|
"aliases": [],
|
|||
|
|
"level": 0
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "from_import",
|
|||
|
|
"module": "pathlib",
|
|||
|
|
"names": [
|
|||
|
|
"Path"
|
|||
|
|
],
|
|||
|
|
"aliases": [],
|
|||
|
|
"level": 0
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "from_import",
|
|||
|
|
"module": "dataclasses",
|
|||
|
|
"names": [
|
|||
|
|
"dataclass"
|
|||
|
|
],
|
|||
|
|
"aliases": [],
|
|||
|
|
"level": 0
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "from_import",
|
|||
|
|
"module": "datetime",
|
|||
|
|
"names": [
|
|||
|
|
"datetime"
|
|||
|
|
],
|
|||
|
|
"aliases": [],
|
|||
|
|
"level": 0
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "import",
|
|||
|
|
"modules": [
|
|||
|
|
"PyPDF2"
|
|||
|
|
],
|
|||
|
|
"aliases": []
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "import",
|
|||
|
|
"modules": [
|
|||
|
|
"pdfplumber"
|
|||
|
|
],
|
|||
|
|
"aliases": []
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "from_import",
|
|||
|
|
"module": "docx",
|
|||
|
|
"names": [
|
|||
|
|
"Document"
|
|||
|
|
],
|
|||
|
|
"aliases": [],
|
|||
|
|
"level": 0
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "import",
|
|||
|
|
"modules": [
|
|||
|
|
"openpyxl"
|
|||
|
|
],
|
|||
|
|
"aliases": []
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"type": "from_import",
|
|||
|
|
"module": "openpyxl",
|
|||
|
|
"names": [
|
|||
|
|
"load_workbook"
|
|||
|
|
],
|
|||
|
|
"aliases": [],
|
|||
|
|
"level": 0
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"constants": [
|
|||
|
|
{
|
|||
|
|
"name": "PDF_AVAILABLE",
|
|||
|
|
"value": true,
|
|||
|
|
"type": "bool",
|
|||
|
|
"line": 20
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "DOCX_AVAILABLE",
|
|||
|
|
"value": true,
|
|||
|
|
"type": "bool",
|
|||
|
|
"line": 26
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "EXCEL_AVAILABLE",
|
|||
|
|
"value": true,
|
|||
|
|
"type": "bool",
|
|||
|
|
"line": 33
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "PDF_AVAILABLE",
|
|||
|
|
"value": false,
|
|||
|
|
"type": "bool",
|
|||
|
|
"line": 22
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "DOCX_AVAILABLE",
|
|||
|
|
"value": false,
|
|||
|
|
"type": "bool",
|
|||
|
|
"line": 28
|
|||
|
|
},
|
|||
|
|
{
|
|||
|
|
"name": "EXCEL_AVAILABLE",
|
|||
|
|
"value": false,
|
|||
|
|
"type": "bool",
|
|||
|
|
"line": 35
|
|||
|
|
}
|
|||
|
|
],
|
|||
|
|
"docstring": "文本提取器模块\n支持从PDF、Word、TXT等格式的文档中提取文本内容",
|
|||
|
|
"content_hash": "99faa0b695aa0a84886242bcb430ff10"
|
|||
|
|
}
|