{
"file_path": "travel-algorithms/travel_algorithms/document_processing/text_extractor.py",
"file_size": 20206,
"line_count": 579,
"functions": [
{
"name": "__post_init__",
"line_start": 63,
"line_end": 71,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))",
"code_hash": "e3b7d9c15591b8f58f4f499fcf8ecac8"
},
{
"name": "to_dict",
"line_start": 73,
"line_end": 85,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }",
"code_hash": "9aec5f6566c91613207a14430fd3a65f"
},
{
"name": "get_summary",
"line_start": 87,
"line_end": 99,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取文档摘要信息",
"is_async": false,
"decorators": [],
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }",
"code_hash": "3b4aeb4ce342f694cf6f50ec911f85a4"
},
{
"name": "__init__",
"line_start": 123,
"line_end": 146,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "DocumentProcessingConfig"
}
],
"return_type": null,
"docstring": "初始化文本提取器\n\nArgs:\n    config: 文档处理配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")",
"code_hash": "b80795161396567c1a4afe0cbede261e"
},
{
"name": "extract_from_file",
"line_start": 148,
"line_end": 230,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "ExtractedDocument",
"docstring": "从文件中提取文本\n\nArgs:\n    file_path: 文件路径\n    \nReturns:\n    ExtractedDocument: 提取的文档数据\n    \nRaises:\n    DocumentProcessingError: 提取失败时抛出",
"is_async": false,
"decorators": [],
"code": " def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )",
"code_hash": "ede6a7b109cac815a9ad476f06a9c1c3"
},
{
"name": "extract_from_directory",
"line_start": 232,
"line_end": 275,
"args": [
{
"name": "self"
},
{
"name": "directory_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "从目录中提取所有支持的文档\n\nArgs:\n    directory_path: 目录路径\n    \nReturns:\n    List[ExtractedDocument]: 提取的文档列表",
"is_async": false,
"decorators": [],
"code": " def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs",
"code_hash": "040078b945a9aaae7216ebb69c026e23"
},
{
"name": "_extract_from_pdf",
"line_start": 277,
"line_end": 329,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从PDF文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装,请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber,回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count",
"code_hash": "595de0393b9c007a029197da48307407"
},
{
"name": "_extract_from_docx",
"line_start": 331,
"line_end": 365,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Word文档中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装,请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")",
"code_hash": "c2d01e8cfa286d1633d191be12390b28"
},
{
"name": "_extract_from_excel",
"line_start": 367,
"line_end": 401,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Excel文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装,请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")",
"code_hash": "9f91b9c5beed4befce153fadc76f5618"
},
{
"name": "_extract_from_text",
"line_start": 403,
"line_end": 431,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从文本文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")",
"code_hash": "b7adae74706c1052b8607a2a0a92039f"
},
{
"name": "_extract_from_csv",
"line_start": 433,
"line_end": 466,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从CSV文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")",
"code_hash": "8e55541a37dfa2e36110638c22153ebc"
},
{
"name": "_extract_from_json",
"line_start": 468,
"line_end": 488,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从JSON文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")",
"code_hash": "95a6d0ec0374f263879bc25a36d92b74"
},
{
"name": "_extract_from_xml",
"line_start": 490,
"line_end": 522,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从XML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")",
"code_hash": "fdc37ab01be4414d3094d8b13150b4c2"
},
{
"name": "_extract_from_html",
"line_start": 524,
"line_end": 550,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从HTML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取(移除标签)\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'<style.*?</style>', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")",
"code_hash": "c7a170bb6866f195eb9065aa21caf871"
},
{
"name": "get_supported_formats",
"line_start": 552,
"line_end": 554,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的文件格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()",
"code_hash": "b190b58a832baa0f901258b0ed23a4e2"
},
{
"name": "is_supported_format",
"line_start": 556,
"line_end": 559,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "bool",
"docstring": "检查文件格式是否支持",
"is_async": false,
"decorators": [],
"code": " def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS",
"code_hash": "d104465d277e576d465debd1d495da97"
},
{
"name": "get_extraction_stats",
"line_start": 561,
"line_end": 580,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取提取器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ",
"code_hash": "2c261c6f8ea522a7987b3c28e03a0807"
},
{
"name": "extract_text",
"line_start": 500,
"line_end": 509,
"args": [
{
"name": "element"
},
{
"name": "level"
}
],
"return_type": null,
"docstring": "",
"is_async": false,
"decorators": [],
"code": " def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)",
"code_hash": "17aee15b3d955d5237e6253e3c4f6f5b"
}
],
"classes": [
{
"name": "ExtractedDocument",
"line_start": 51,
"line_end": 99,
"bases": [],
"methods": [
{
"name": "__post_init__",
"line_start": 63,
"line_end": 71,
"args": [
{
"name": "self"
}
],
"return_type": null,
"docstring": "初始化后处理",
"is_async": false,
"decorators": [],
"code": " def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))",
"code_hash": "e3b7d9c15591b8f58f4f499fcf8ecac8"
},
{
"name": "to_dict",
"line_start": 73,
"line_end": 85,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "转换为字典格式",
"is_async": false,
"decorators": [],
"code": " def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }",
"code_hash": "9aec5f6566c91613207a14430fd3a65f"
},
{
"name": "get_summary",
"line_start": 87,
"line_end": 99,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取文档摘要信息",
"is_async": false,
"decorators": [],
"code": " def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }",
"code_hash": "3b4aeb4ce342f694cf6f50ec911f85a4"
}
],
"docstring": "提取的文档数据",
"decorators": [
"dataclass"
],
"code": "class ExtractedDocument:\n \"\"\"提取的文档数据\"\"\"\n filename: str\n file_type: str\n content: str # 纯文本内容\n metadata: Dict[str, Any] # 文档元数据\n extracted_at: datetime\n file_size: int\n page_count: Optional[int] = None\n extraction_method: Optional[str] = None\n error_info: Optional[str] = None\n \n def __post_init__(self):\n \"\"\"初始化后处理\"\"\"\n if not self.extracted_at:\n self.extracted_at = datetime.now()\n \n # 计算内容统计\n self.metadata.setdefault('content_length', len(self.content))\n self.metadata.setdefault('word_count', len(self.content.split()))\n self.metadata.setdefault('line_count', len(self.content.splitlines()))\n \n def to_dict(self) -> Dict[str, Any]:\n \"\"\"转换为字典格式\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'content': self.content,\n 'metadata': self.metadata,\n 'extracted_at': self.extracted_at.isoformat(),\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'extraction_method': self.extraction_method,\n 'error_info': self.error_info\n }\n \n def get_summary(self) -> Dict[str, Any]:\n \"\"\"获取文档摘要信息\"\"\"\n return {\n 'filename': self.filename,\n 'file_type': self.file_type,\n 'file_size': self.file_size,\n 'page_count': self.page_count,\n 'content_length': len(self.content),\n 'word_count': len(self.content.split()),\n 'extracted_at': self.extracted_at.isoformat(),\n 'extraction_method': self.extraction_method,\n 'has_error': bool(self.error_info)\n }",
"code_hash": "48e79f6e59935d971c73087b22b67bad"
},
{
"name": "TextExtractor",
"line_start": 102,
"line_end": 580,
"bases": [],
"methods": [
{
"name": "__init__",
"line_start": 123,
"line_end": 146,
"args": [
{
"name": "self"
},
{
"name": "config",
"type_hint": "DocumentProcessingConfig"
}
],
"return_type": null,
"docstring": "初始化文本提取器\n\nArgs:\n    config: 文档处理配置",
"is_async": false,
"decorators": [],
"code": " def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")",
"code_hash": "b80795161396567c1a4afe0cbede261e"
},
{
"name": "extract_from_file",
"line_start": 148,
"line_end": 230,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "ExtractedDocument",
"docstring": "从文件中提取文本\n\nArgs:\n    file_path: 文件路径\n    \nReturns:\n    ExtractedDocument: 提取的文档数据\n    \nRaises:\n    DocumentProcessingError: 提取失败时抛出",
"is_async": false,
"decorators": [],
"code": " def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=page_count,\n extraction_method=extraction_method.__name__\n )\n \n logger.info(f\"文件提取完成: {file_path}, 内容长度: {len(content)}\")\n return extracted_doc\n \n except Exception as e:\n error_msg = f\"文件提取失败 {file_path}: {str(e)}\"\n logger.error(error_msg, exc_info=True)\n \n # 创建错误文档\n return ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS.get(file_ext, \"Unknown\"),\n content=\"\",\n metadata={'extraction_error': str(e)},\n extracted_at=datetime.now(),\n file_size=file_path.stat().st_size if file_path.exists() else 0,\n page_count=None,\n extraction_method=None,\n error_info=error_msg\n )",
"code_hash": "ede6a7b109cac815a9ad476f06a9c1c3"
},
{
"name": "extract_from_directory",
"line_start": 232,
"line_end": 275,
"args": [
{
"name": "self"
},
{
"name": "directory_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "List[ExtractedDocument]",
"docstring": "从目录中提取所有支持的文档\n\nArgs:\n    directory_path: 目录路径\n    \nReturns:\n    List[ExtractedDocument]: 提取的文档列表",
"is_async": false,
"decorators": [],
"code": " def extract_from_directory(self, directory_path: Union[str, Path]) -> List[ExtractedDocument]:\n \"\"\"\n 从目录中提取所有支持的文档\n \n Args:\n directory_path: 目录路径\n \n Returns:\n List[ExtractedDocument]: 提取的文档列表\n \"\"\"\n directory_path = Path(directory_path)\n \n if not directory_path.exists():\n raise ResourceNotFoundError(f\"目录不存在: {directory_path}\")\n \n if not directory_path.is_dir():\n raise DocumentProcessingError(f\"路径不是目录: {directory_path}\")\n \n extracted_docs = []\n processed_count = 0\n \n # 递归搜索文件\n for file_path in directory_path.rglob(\"*\"):\n if file_path.is_file():\n file_ext = file_path.suffix.lower()\n \n if file_ext in self.SUPPORTED_EXTENSIONS:\n try:\n doc = self.extract_from_file(file_path)\n extracted_docs.append(doc)\n processed_count += 1\n \n # 检查处理数量限制\n if (self.config.max_documents > 0 and \n processed_count >= self.config.max_documents):\n logger.warning(f\"达到文档数量限制 {self.config.max_documents}\")\n break\n \n except Exception as e:\n logger.error(f\"处理文件失败 {file_path}: {e}\")\n continue\n \n logger.info(f\"目录提取完成: {directory_path}, 处理文件数: {len(extracted_docs)}\")\n return extracted_docs",
"code_hash": "040078b945a9aaae7216ebb69c026e23"
},
{
"name": "_extract_from_pdf",
"line_start": 277,
"line_end": 329,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从PDF文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从PDF文件中提取文本\"\"\"\n if not PDF_AVAILABLE:\n raise DocumentProcessingError(\"PDF支持库未安装,请安装: pip install PyPDF2 pdfplumber\")\n \n content = \"\"\n metadata = {}\n page_count = 0\n \n try:\n # 优先使用pdfplumber,回退到PyPDF2\n with pdfplumber.open(file_path) as pdf:\n page_count = len(pdf.pages)\n metadata['page_count'] = page_count\n \n for page in pdf.pages:\n text = page.extract_text()\n if text:\n content += text + \"\\n\"\n \n # 提取表格数据\n tables = []\n for page in pdf.pages:\n page_tables = page.extract_tables()\n if page_tables:\n tables.extend(page_tables)\n \n if tables:\n metadata['table_count'] = len(tables)\n # 将表格转换为文本\n for i, table in enumerate(tables):\n content += f\"\\n\\n表格 {i+1}:\\n\"\n for row in table:\n if row:\n content += \"\\t\".join(str(cell) if cell else \"\" for cell in row) + \"\\n\"\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试PyPDF2: {e}\")\n \n try:\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n page_count = len(pdf_reader.pages)\n metadata['page_count'] = page_count\n \n for page in pdf_reader.pages:\n content += page.extract_text() + \"\\n\"\n \n except Exception as e2:\n raise DocumentProcessingError(f\"PDF提取失败: {e2}\")\n \n metadata['extraction_library'] = 'pdfplumber' if 'pdfplumber' in str(type(pdf)) else 'PyPDF2'\n return content.strip(), metadata, page_count",
"code_hash": "595de0393b9c007a029197da48307407"
},
{
"name": "_extract_from_docx",
"line_start": 331,
"line_end": 365,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Word文档中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_docx(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Word文档中提取文本\"\"\"\n if not DOCX_AVAILABLE:\n raise DocumentProcessingError(\"Word文档支持库未安装,请安装: pip install python-docx\")\n \n try:\n doc = Document(file_path)\n content = \"\"\n metadata = {}\n \n # 提取段落文本\n paragraph_count = 0\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content += paragraph.text + \"\\n\"\n paragraph_count += 1\n \n # 提取表格文本\n table_count = len(doc.tables)\n for table in doc.tables:\n content += \"\\n\\n表格:\\n\"\n for row in table.rows:\n row_text = \"\\t\".join(cell.text.strip() for cell in row.cells)\n content += row_text + \"\\n\"\n \n metadata.update({\n 'paragraph_count': paragraph_count,\n 'table_count': table_count,\n 'extraction_library': 'python-docx'\n })\n \n return content.strip(), metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"Word文档提取失败: {e}\")",
"code_hash": "c2d01e8cfa286d1633d191be12390b28"
},
{
"name": "_extract_from_excel",
"line_start": 367,
"line_end": 401,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从Excel文件中提取文本",
"is_async": false,
"decorators": [],
"code": " def _extract_from_excel(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从Excel文件中提取文本\"\"\"\n if not EXCEL_AVAILABLE:\n raise DocumentProcessingError(\"Excel支持库未安装,请安装: pip install openpyxl\")\n \n try:\n workbook = load_workbook(file_path, data_only=True)\n content = \"\"\n metadata = {}\n \n sheet_count = len(workbook.sheetnames)\n total_rows = 0\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content += f\"\\n\\n工作表: {sheet_name}\\n\"\n \n sheet_rows = 0\n for row in sheet.iter_rows(values_only=True):\n if any(cell is not None for cell in row):\n row_text = \"\\t\".join(str(cell) if cell is not None else \"\" for cell in row)\n content += row_text + \"\\n\"\n sheet_rows += 1\n total_rows += 1\n \n metadata.update({\n 'sheet_count': sheet_count,\n 'total_rows': total_rows,\n 'extraction_library': 'openpyxl'\n })\n \n return content.strip(), metadata, sheet_count\n \n except Exception as e:\n raise DocumentProcessingError(f\"Excel文件提取失败: {e}\")",
"code_hash": "9f91b9c5beed4befce153fadc76f5618"
},
{
"name": "_extract_from_text",
"line_start": 403,
"line_end": 431,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从文本文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_text(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从文本文件中提取内容\"\"\"\n try:\n # 尝试多种编码\n encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content and not used_encoding:\n raise DocumentProcessingError(\"无法解码文本文件,尝试了多种编码方式\")\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines())\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"文本文件提取失败: {e}\")",
"code_hash": "b7adae74706c1052b8607a2a0a92039f"
},
{
"name": "_extract_from_csv",
"line_start": 433,
"line_end": 466,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从CSV文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_csv(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从CSV文件中提取内容\"\"\"\n try:\n if PANDAS_AVAILABLE:\n # 使用pandas处理CSV\n df = pd.read_csv(file_path)\n content = df.to_string(index=False)\n metadata = {\n 'rows': len(df),\n 'columns': len(df.columns),\n 'column_names': list(df.columns),\n 'extraction_library': 'pandas'\n }\n else:\n # 使用内置csv模块\n import csv\n content = \"\"\n with open(file_path, 'r', encoding='utf-8') as file:\n csv_reader = csv.reader(file)\n rows = list(csv_reader)\n \n for row in rows:\n content += \"\\t\".join(row) + \"\\n\"\n \n metadata = {\n 'rows': len(rows),\n 'columns': len(rows[0]) if rows else 0,\n 'extraction_library': 'csv'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"CSV文件提取失败: {e}\")",
"code_hash": "8e55541a37dfa2e36110638c22153ebc"
},
{
"name": "_extract_from_json",
"line_start": 468,
"line_end": 488,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从JSON文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_json(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从JSON文件中提取内容\"\"\"\n try:\n import json\n \n with open(file_path, 'r', encoding='utf-8') as file:\n data = json.load(file)\n \n # 将JSON转换为格式化的文本\n content = json.dumps(data, ensure_ascii=False, indent=2)\n \n metadata = {\n 'json_keys': list(data.keys()) if isinstance(data, dict) else [],\n 'json_type': type(data).__name__,\n 'extraction_library': 'json'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"JSON文件提取失败: {e}\")",
"code_hash": "95a6d0ec0374f263879bc25a36d92b74"
},
{
"name": "_extract_from_xml",
"line_start": 490,
"line_end": 522,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从XML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_xml(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从XML文件中提取内容\"\"\"\n try:\n import xml.etree.ElementTree as ET\n \n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # 提取所有文本内容\n content = \"\"\n def extract_text(element, level=0):\n nonlocal content\n indent = \" \" * level\n if element.text and element.text.strip():\n content += f\"{indent}{element.tag}: {element.text.strip()}\\n\"\n else:\n content += f\"{indent}{element.tag}\\n\"\n \n for child in element:\n extract_text(child, level + 1)\n \n extract_text(root)\n \n metadata = {\n 'root_tag': root.tag,\n 'element_count': len(list(root.iter())),\n 'extraction_library': 'xml.etree.ElementTree'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"XML文件提取失败: {e}\")",
"code_hash": "fdc37ab01be4414d3094d8b13150b4c2"
},
{
"name": "_extract_from_html",
"line_start": 524,
"line_end": 550,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Path"
}
],
"return_type": "tuple[str, Dict[str, Any], Optional[int]]",
"docstring": "从HTML文件中提取内容",
"is_async": false,
"decorators": [],
"code": " def _extract_from_html(self, file_path: Path) -> tuple[str, Dict[str, Any], Optional[int]]:\n \"\"\"从HTML文件中提取内容\"\"\"\n try:\n with open(file_path, 'r', encoding='utf-8') as file:\n html_content = file.read()\n \n # 简单的HTML文本提取(移除标签)\n import re\n # 移除script和style标签及其内容\n html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)\n html_content = re.sub(r'<style.*?</style>', '', html_content, flags=re.DOTALL)\n \n # 移除HTML标签\n content = re.sub(r'<[^>]+>', '', html_content)\n \n # 清理多余的空白\n content = re.sub(r'\\s+', ' ', content).strip()\n \n metadata = {\n 'original_size': len(html_content),\n 'extraction_library': 'regex'\n }\n \n return content, metadata, None\n \n except Exception as e:\n raise DocumentProcessingError(f\"HTML文件提取失败: {e}\")",
"code_hash": "c7a170bb6866f195eb9065aa21caf871"
},
{
"name": "get_supported_formats",
"line_start": 552,
"line_end": 554,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, str]",
"docstring": "获取支持的文件格式",
"is_async": false,
"decorators": [],
"code": " def get_supported_formats(self) -> Dict[str, str]:\n \"\"\"获取支持的文件格式\"\"\"\n return self.SUPPORTED_EXTENSIONS.copy()",
"code_hash": "b190b58a832baa0f901258b0ed23a4e2"
},
{
"name": "is_supported_format",
"line_start": 556,
"line_end": 559,
"args": [
{
"name": "self"
},
{
"name": "file_path",
"type_hint": "Union[str, Path]"
}
],
"return_type": "bool",
"docstring": "检查文件格式是否支持",
"is_async": false,
"decorators": [],
"code": " def is_supported_format(self, file_path: Union[str, Path]) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n file_path = Path(file_path)\n return file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS",
"code_hash": "d104465d277e576d465debd1d495da97"
},
{
"name": "get_extraction_stats",
"line_start": 561,
"line_end": 580,
"args": [
{
"name": "self"
}
],
"return_type": "Dict[str, Any]",
"docstring": "获取提取器统计信息",
"is_async": false,
"decorators": [],
"code": " def get_extraction_stats(self) -> Dict[str, Any]:\n \"\"\"获取提取器统计信息\"\"\"\n available_libraries = []\n \n if PDF_AVAILABLE:\n available_libraries.append(\"PDF (PyPDF2, pdfplumber)\")\n if DOCX_AVAILABLE:\n available_libraries.append(\"Word (python-docx)\")\n if EXCEL_AVAILABLE:\n available_libraries.append(\"Excel (openpyxl)\")\n if PANDAS_AVAILABLE:\n available_libraries.append(\"CSV (pandas)\")\n \n return {\n 'supported_formats': list(self.SUPPORTED_EXTENSIONS.keys()),\n 'available_libraries': available_libraries,\n 'max_file_size': self.config.max_file_size,\n 'max_content_length': self.config.max_content_length,\n 'max_documents': self.config.max_documents\n } ",
"code_hash": "2c261c6f8ea522a7987b3c28e03a0807"
}
],
"docstring": "文本提取器 - 重构版本\n支持从多种文档格式中提取文本内容",
"decorators": [],
"code": "class TextExtractor:\n \"\"\"\n 文本提取器 - 重构版本\n 支持从多种文档格式中提取文本内容\n \"\"\"\n \n SUPPORTED_EXTENSIONS = {\n '.pdf': 'PDF Document',\n '.docx': 'Microsoft Word Document',\n '.doc': 'Microsoft Word Document (Legacy)',\n '.xlsx': 'Microsoft Excel Spreadsheet',\n '.xls': 'Microsoft Excel Spreadsheet (Legacy)',\n '.txt': 'Plain Text',\n '.md': 'Markdown',\n '.csv': 'Comma Separated Values',\n '.json': 'JSON Document',\n '.xml': 'XML Document',\n '.html': 'HTML Document',\n '.htm': 'HTML Document'\n }\n \n def __init__(self, config: DocumentProcessingConfig):\n \"\"\"\n 初始化文本提取器\n \n Args:\n config: 文档处理配置\n \"\"\"\n self.config = config\n self.extraction_methods = {\n '.pdf': self._extract_from_pdf,\n '.docx': self._extract_from_docx,\n '.doc': self._extract_from_docx,\n '.xlsx': self._extract_from_excel,\n '.xls': self._extract_from_excel,\n '.txt': self._extract_from_text,\n '.md': self._extract_from_text,\n '.csv': self._extract_from_csv,\n '.json': self._extract_from_json,\n '.xml': self._extract_from_xml,\n '.html': self._extract_from_html,\n '.htm': self._extract_from_html\n }\n \n logger.info(f\"文本提取器初始化完成,支持格式: {list(self.SUPPORTED_EXTENSIONS.keys())}\")\n \n def extract_from_file(self, file_path: Union[str, Path]) -> ExtractedDocument:\n \"\"\"\n 从文件中提取文本\n \n Args:\n file_path: 文件路径\n \n Returns:\n ExtractedDocument: 提取的文档数据\n \n Raises:\n DocumentProcessingError: 提取失败时抛出\n \"\"\"\n file_path = Path(file_path)\n \n if not file_path.exists():\n raise ResourceNotFoundError(f\"文件不存在: {file_path}\")\n \n if not file_path.is_file():\n raise DocumentProcessingError(f\"路径不是文件: {file_path}\")\n \n file_ext = file_path.suffix.lower()\n \n if file_ext not in self.SUPPORTED_EXTENSIONS:\n raise DocumentProcessingError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n logger.info(f\"开始提取文件: {file_path}\")\n \n # 获取文件基本信息\n file_size = file_path.stat().st_size\n \n # 检查文件大小限制\n if self.config.max_file_size > 0 and file_size > self.config.max_file_size:\n raise DocumentProcessingError(\n f\"文件大小 {file_size} 超过限制 {self.config.max_file_size}\"\n )\n \n # 根据文件扩展名选择提取方法\n extraction_method = self.extraction_methods.get(file_ext)\n if not extraction_method:\n raise DocumentProcessingError(f\"未找到对应的提取方法: {file_ext}\")\n \n # 执行文本提取\n content, metadata, page_count = extraction_method(file_path)\n \n # 内容长度检查\n if self.config.max_content_length > 0 and len(content) > self.config.max_content_length:\n logger.warning(f\"内容长度 {len(content)} 超过限制,将被截断\")\n content = content[:self.config.max_content_length]\n metadata['content_truncated'] = True\n \n # 创建提取结果\n extracted_doc = ExtractedDocument(\n filename=file_path.name,\n file_type=self.SUPPORTED_EXTENSIONS[file_ext],\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n
"code_hash": "e17c94489b3b534c4c770787252fda8f"
}
],
"imports": [
{
"type": "import",
"modules": [
"os"
],
"aliases": []
},
{
"type": "import",
"modules": [
"logging"
],
"aliases": []
},
{
"type": "import",
"modules": [
"mimetypes"
],
"aliases": []
},
{
"type": "from_import",
"module": "typing",
"names": [
"List",
"Dict",
"Any",
"Optional",
"Union"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "pathlib",
"names": [
"Path"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "dataclasses",
"names": [
"dataclass"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "datetime",
"names": [
"datetime"
],
"aliases": [],
"level": 0
},
{
"type": "from_import",
"module": "config",
"names": [
"DocumentProcessingConfig"
],
"aliases": [],
"level": 2
},
{
"type": "from_import",
"module": "exceptions",
"names": [
"DocumentProcessingError",
"ResourceNotFoundError"
],
"aliases": [],
"level": 2
},
{
"type": "import",
"modules": [
"PyPDF2"
],
"aliases": []
},
{
"type": "import",
"modules": [
"pdfplumber"
],
"aliases": []
},
{
"type": "from_import",
"module": "docx",
"names": [
"Document"
],
"aliases": [],
"level": 0
},
{
"type": "import",
"modules": [
"openpyxl"
],
"aliases": []
},
{
"type": "from_import",
"module": "openpyxl",
"names": [
"load_workbook"
],
"aliases": [],
"level": 0
},
{
"type": "import",
"modules": [
"pandas"
],
"aliases": [
"pd"
]
},
{
"type": "import",
"modules": [
"json"
],
"aliases": []
},
{
"type": "import",
"modules": [
"xml.etree.ElementTree"
],
"aliases": [
"ET"
]
},
{
"type": "import",
"modules": [
"re"
],
"aliases": []
},
{
"type": "import",
"modules": [
"csv"
],
"aliases": []
}
],
"constants": [
{
"name": "PDF_AVAILABLE",
"value": true,
"type": "bool",
"line": 21
},
{
"name": "DOCX_AVAILABLE",
"value": true,
"type": "bool",
"line": 27
},
{
"name": "EXCEL_AVAILABLE",
"value": true,
"type": "bool",
"line": 34
},
{
"name": "PANDAS_AVAILABLE",
"value": true,
"type": "bool",
"line": 40
},
{
"name": "SUPPORTED_EXTENSIONS",
"value": {
".pdf": "PDF Document",
".docx": "Microsoft Word Document",
".doc": "Microsoft Word Document (Legacy)",
".xlsx": "Microsoft Excel Spreadsheet",
".xls": "Microsoft Excel Spreadsheet (Legacy)",
".txt": "Plain Text",
".md": "Markdown",
".csv": "Comma Separated Values",
".json": "JSON Document",
".xml": "XML Document",
".html": "HTML Document",
".htm": "HTML Document"
},
"type": "dict",
"line": 108
},
{
"name": "PDF_AVAILABLE",
"value": false,
"type": "bool",
"line": 23
},
{
"name": "DOCX_AVAILABLE",
"value": false,
"type": "bool",
"line": 29
},
{
"name": "EXCEL_AVAILABLE",
"value": false,
"type": "bool",
"line": 36
},
{
"name": "PANDAS_AVAILABLE",
"value": false,
"type": "bool",
"line": 42
}
],
"docstring": "Text Extractor\n文本提取器 - 重构版本,支持从多种格式文档中提取文本内容",
"content_hash": "2e8ea1b0b3f987f64faa46d44b0f73f0"
}