{ "file_path": "document/text_extractor.py", "file_size": 12661, "line_count": 355, "functions": [ { "name": "__post_init__", "line_start": 50, "line_end": 53, "args": [ { "name": "self" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def __post_init__(self):\n # 确保content是字符串\n if not isinstance(self.content, str):\n self.content = str(self.content)", "code_hash": "e461fc43783cbe62b22d1a9349785191" }, { "name": "__init__", "line_start": 58, "line_end": 69, "args": [ { "name": "self" }, { "name": "config", "type_hint": "Optional[Dict[str, Any]]" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def __init__(self, config: Optional[Dict[str, Any]] = None):\n self.config = config or {}\n self.supported_formats = {\n '.pdf': self._extract_pdf,\n '.docx': self._extract_docx,\n '.doc': self._extract_doc,\n '.txt': self._extract_txt,\n '.md': self._extract_txt,\n '.xlsx': self._extract_xlsx,\n '.xls': self._extract_xls,\n '.csv': self._extract_csv\n }", "code_hash": "4477c71180d28cd290e553df70152277" }, { "name": "extract", "line_start": 71, "line_end": 102, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "str" } ], "return_type": "ExtractedDocument", "docstring": "提取单个文件的文本内容", "is_async": false, "decorators": [], "code": " def extract(self, file_path: str) -> ExtractedDocument:\n \"\"\"提取单个文件的文本内容\"\"\"\n path_obj = Path(file_path)\n \n if not path_obj.exists():\n raise FileNotFoundError(f\"文件不存在: {file_path}\")\n \n file_ext = path_obj.suffix.lower()\n if file_ext not in self.supported_formats:\n raise ValueError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n # 获取文件信息\n file_size = path_obj.stat().st_size\n \n # 提取文本内容\n extractor = self.supported_formats[file_ext]\n content, metadata = extractor(path_obj)\n \n return ExtractedDocument(\n filename=path_obj.name,\n file_type=file_ext,\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=metadata.get('page_count')\n )\n \n except Exception as e:\n logger.error(f\"提取文件 {file_path} 时出错: {str(e)}\")\n raise", "code_hash": "5436f5f2cacc8d554f042f4bf89de554" }, { "name": "extract_batch", "line_start": 104, "line_end": 126, "args": [ { "name": "self" }, { "name": "file_paths", "type_hint": "List[str]" } ], "return_type": "List[ExtractedDocument]", "docstring": "批量提取多个文件的文本内容", "is_async": false, "decorators": [], "code": " def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:\n \"\"\"批量提取多个文件的文本内容\"\"\"\n results = []\n \n for file_path in file_paths:\n try:\n result = self.extract(file_path)\n results.append(result)\n logger.info(f\"成功提取文件: {file_path}\")\n except Exception as e:\n logger.error(f\"提取文件 {file_path} 失败: {str(e)}\")\n # 创建错误记录\n error_doc = ExtractedDocument(\n filename=Path(file_path).name,\n file_type=Path(file_path).suffix.lower(),\n content=f\"提取失败: {str(e)}\",\n metadata={\"error\": str(e)},\n extracted_at=datetime.now(),\n file_size=0\n )\n results.append(error_doc)\n \n return results", "code_hash": "f800214463cb7786ec16c690d19d451e" }, { "name": "_extract_pdf", "line_start": 128, "line_end": 184, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取PDF文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取PDF文件的纯文本内容\"\"\"\n if not PDF_AVAILABLE:\n raise ImportError(\"需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber\")\n \n content_parts = []\n metadata = {}\n \n try:\n # 使用pdfplumber提取文本(更好的文本提取)\n with pdfplumber.open(file_path) as pdf:\n metadata['page_count'] = len(pdf.pages)\n \n for page_num, page in enumerate(pdf.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf.metadata:\n metadata.update({\n 'title': pdf.metadata.get('Title', ''),\n 'author': pdf.metadata.get('Author', ''),\n 'subject': pdf.metadata.get('Subject', ''),\n 'creator': pdf.metadata.get('Creator', ''),\n 'producer': pdf.metadata.get('Producer', ''),\n 'creation_date': pdf.metadata.get('CreationDate', ''),\n 'modification_date': pdf.metadata.get('ModDate', '')\n })\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试使用PyPDF2: {str(e)}\")\n \n # 备用方案:使用PyPDF2\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n metadata['page_count'] = len(pdf_reader.pages)\n \n for page_num, page in enumerate(pdf_reader.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf_reader.metadata:\n metadata.update({\n 'title': pdf_reader.metadata.get('/Title', ''),\n 'author': pdf_reader.metadata.get('/Author', ''),\n 'subject': pdf_reader.metadata.get('/Subject', ''),\n 'creator': pdf_reader.metadata.get('/Creator', ''),\n 'producer': pdf_reader.metadata.get('/Producer', ''),\n 'creation_date': pdf_reader.metadata.get('/CreationDate', ''),\n 'modification_date': pdf_reader.metadata.get('/ModDate', '')\n })\n \n content = '\\n'.join(content_parts) if content_parts else \"\"\n return content, metadata", "code_hash": "92ff0fefec17a73a1c8224b15b5c70ea" }, { "name": "_extract_docx", "line_start": 186, "line_end": 227, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取DOCX文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOCX文件的纯文本内容\"\"\"\n if not DOCX_AVAILABLE:\n raise ImportError(\"需要安装 python-docx: pip install python-docx\")\n \n doc = Document(str(file_path))\n content_parts = []\n metadata = {}\n \n # 提取所有段落文本\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content_parts.append(paragraph.text)\n \n # 提取表格内容\n for table in doc.tables:\n table_content = []\n for row in table.rows:\n row_content = []\n for cell in row.cells:\n row_content.append(cell.text.strip())\n table_content.append('\\t'.join(row_content))\n if table_content:\n content_parts.append('\\n=== 表格 ===\\n' + '\\n'.join(table_content) + '\\n')\n \n # 获取文档属性\n core_props = doc.core_properties\n metadata.update({\n 'title': core_props.title or '',\n 'author': core_props.author or '',\n 'subject': core_props.subject or '',\n 'keywords': core_props.keywords or '',\n 'comments': core_props.comments or '',\n 'created': str(core_props.created) if core_props.created else '',\n 'modified': str(core_props.modified) if core_props.modified else '',\n 'last_modified_by': core_props.last_modified_by or '',\n 'paragraph_count': len(doc.paragraphs),\n 'table_count': len(doc.tables)\n })\n \n content = '\\n'.join(content_parts)\n return content, metadata", "code_hash": "c7b53be194f6c57266ac47d0f4191b08" }, { "name": "_extract_doc", "line_start": 229, "line_end": 243, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取DOC文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOC文件的纯文本内容\"\"\"\n # DOC格式较复杂,建议转换为DOCX或使用专门的库\n logger.warning(\"DOC格式支持有限,建议转换为DOCX格式\")\n \n # 尝试读取为文本文件\n try:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n except:\n with open(file_path, 'r', encoding='gbk', errors='ignore') as file:\n content = file.read()\n \n metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}\n return content, metadata", "code_hash": "a9f6fc06bae7709495db7817d022c1ca" }, { "name": "_extract_txt", "line_start": 245, "line_end": 272, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取TXT/MD文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取TXT/MD文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n # 最后尝试忽略错误\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines()),\n 'char_count': len(content)\n }\n \n return content, metadata", "code_hash": "a531b26825b63c4db57e234dc159a0d7" }, { "name": "_extract_xlsx", "line_start": 274, "line_end": 301, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取XLSX文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLSX文件的纯文本内容\"\"\"\n if not EXCEL_AVAILABLE:\n raise ImportError(\"需要安装 openpyxl: pip install openpyxl\")\n \n workbook = load_workbook(file_path, read_only=True)\n content_parts = []\n metadata = {\n 'sheet_count': len(workbook.sheetnames),\n 'sheet_names': workbook.sheetnames\n }\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content_parts.append(f\"\\n=== 工作表: {sheet_name} ===\\n\")\n \n for row in sheet.iter_rows(values_only=True):\n row_content = []\n for cell in row:\n if cell is not None:\n row_content.append(str(cell))\n else:\n row_content.append(\"\")\n if any(cell.strip() for cell in row_content): # 跳过空行\n content_parts.append('\\t'.join(row_content))\n \n content = '\\n'.join(content_parts)\n return content, metadata", "code_hash": "4b90a3f26bbb6b4e5da3fbd78a7ea278" }, { "name": "_extract_xls", "line_start": 303, "line_end": 315, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取XLS文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLS文件的纯文本内容\"\"\"\n logger.warning(\"XLS格式支持有限,建议转换为XLSX格式\")\n \n # 简单的文本提取\n try:\n with open(file_path, 'rb') as file:\n content = file.read().decode('utf-8', errors='ignore')\n except:\n content = f\"无法读取XLS文件: {file_path}\"\n \n metadata = {'format': 'xls', 'note': '可能存在格式问题'}\n return content, metadata", "code_hash": "4a81d1dc3865abd2faa49705e5e8f2e9" }, { "name": "_extract_csv", "line_start": 317, "line_end": 348, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取CSV文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取CSV文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n # 计算行数和列数\n lines = content.splitlines()\n row_count = len(lines)\n col_count = len(lines[0].split(',')) if lines else 0\n \n metadata = {\n 'encoding': used_encoding,\n 'row_count': row_count,\n 'estimated_col_count': col_count\n }\n \n return content, metadata", "code_hash": "d34952e59e05f67baf52a96ce6f99d9d" }, { "name": "get_supported_formats", "line_start": 350, "line_end": 352, "args": [ { "name": "self" } ], "return_type": "List[str]", "docstring": "获取支持的文件格式列表", "is_async": false, "decorators": [], "code": " def get_supported_formats(self) -> List[str]:\n \"\"\"获取支持的文件格式列表\"\"\"\n return list(self.supported_formats.keys())", "code_hash": "657931bc9cac8245425c24c3fb872456" }, { "name": "is_supported", "line_start": 354, "line_end": 356, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "str" } ], "return_type": "bool", "docstring": "检查文件格式是否支持", "is_async": false, "decorators": [], "code": " def is_supported(self, file_path: str) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n return Path(file_path).suffix.lower() in self.supported_formats ", "code_hash": "5b7ac2ce0b4a7b9d115bef46ab04c2dd" } ], "classes": [ { "name": "ExtractedDocument", "line_start": 40, "line_end": 53, "bases": [], "methods": [ { "name": "__post_init__", "line_start": 50, "line_end": 53, "args": [ { "name": "self" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def __post_init__(self):\n # 确保content是字符串\n if not isinstance(self.content, str):\n self.content = str(self.content)", "code_hash": "e461fc43783cbe62b22d1a9349785191" } ], "docstring": "提取的文档数据", "decorators": [ "dataclass" ], "code": "class ExtractedDocument:\n \"\"\"提取的文档数据\"\"\"\n filename: str\n file_type: str\n content: str # 纯文本内容\n metadata: Dict[str, Any] # 文档元数据\n extracted_at: datetime\n file_size: int\n page_count: Optional[int] = None\n \n def __post_init__(self):\n # 确保content是字符串\n if not isinstance(self.content, str):\n self.content = str(self.content)", "code_hash": "8b9d1fc812b3ea4aebc4e64cf53eaba6" }, { "name": "TextExtractor", "line_start": 55, "line_end": 356, "bases": [], "methods": [ { "name": "__init__", "line_start": 58, "line_end": 69, "args": [ { "name": "self" }, { "name": "config", "type_hint": "Optional[Dict[str, Any]]" } ], "return_type": null, "docstring": "", "is_async": false, "decorators": [], "code": " def __init__(self, config: Optional[Dict[str, Any]] = None):\n self.config = config or {}\n self.supported_formats = {\n '.pdf': self._extract_pdf,\n '.docx': self._extract_docx,\n '.doc': self._extract_doc,\n '.txt': self._extract_txt,\n '.md': self._extract_txt,\n '.xlsx': self._extract_xlsx,\n '.xls': self._extract_xls,\n '.csv': self._extract_csv\n }", "code_hash": "4477c71180d28cd290e553df70152277" }, { "name": "extract", "line_start": 71, "line_end": 102, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "str" } ], "return_type": "ExtractedDocument", "docstring": "提取单个文件的文本内容", "is_async": false, "decorators": [], "code": " def extract(self, file_path: str) -> ExtractedDocument:\n \"\"\"提取单个文件的文本内容\"\"\"\n path_obj = Path(file_path)\n \n if not path_obj.exists():\n raise FileNotFoundError(f\"文件不存在: {file_path}\")\n \n file_ext = path_obj.suffix.lower()\n if file_ext not in self.supported_formats:\n raise ValueError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n # 获取文件信息\n file_size = path_obj.stat().st_size\n \n # 提取文本内容\n extractor = self.supported_formats[file_ext]\n content, metadata = extractor(path_obj)\n \n return ExtractedDocument(\n filename=path_obj.name,\n file_type=file_ext,\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=metadata.get('page_count')\n )\n \n except Exception as e:\n logger.error(f\"提取文件 {file_path} 时出错: {str(e)}\")\n raise", "code_hash": "5436f5f2cacc8d554f042f4bf89de554" }, { "name": "extract_batch", "line_start": 104, "line_end": 126, "args": [ { "name": "self" }, { "name": "file_paths", "type_hint": "List[str]" } ], "return_type": "List[ExtractedDocument]", "docstring": "批量提取多个文件的文本内容", "is_async": false, "decorators": [], "code": " def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:\n \"\"\"批量提取多个文件的文本内容\"\"\"\n results = []\n \n for file_path in file_paths:\n try:\n result = self.extract(file_path)\n results.append(result)\n logger.info(f\"成功提取文件: {file_path}\")\n except Exception as e:\n logger.error(f\"提取文件 {file_path} 失败: {str(e)}\")\n # 创建错误记录\n error_doc = ExtractedDocument(\n filename=Path(file_path).name,\n file_type=Path(file_path).suffix.lower(),\n content=f\"提取失败: {str(e)}\",\n metadata={\"error\": str(e)},\n extracted_at=datetime.now(),\n file_size=0\n )\n results.append(error_doc)\n \n return results", "code_hash": "f800214463cb7786ec16c690d19d451e" }, { "name": "_extract_pdf", "line_start": 128, "line_end": 184, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取PDF文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取PDF文件的纯文本内容\"\"\"\n if not PDF_AVAILABLE:\n raise ImportError(\"需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber\")\n \n content_parts = []\n metadata = {}\n \n try:\n # 使用pdfplumber提取文本(更好的文本提取)\n with pdfplumber.open(file_path) as pdf:\n metadata['page_count'] = len(pdf.pages)\n \n for page_num, page in enumerate(pdf.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf.metadata:\n metadata.update({\n 'title': pdf.metadata.get('Title', ''),\n 'author': pdf.metadata.get('Author', ''),\n 'subject': pdf.metadata.get('Subject', ''),\n 'creator': pdf.metadata.get('Creator', ''),\n 'producer': pdf.metadata.get('Producer', ''),\n 'creation_date': pdf.metadata.get('CreationDate', ''),\n 'modification_date': pdf.metadata.get('ModDate', '')\n })\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试使用PyPDF2: {str(e)}\")\n \n # 备用方案:使用PyPDF2\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n metadata['page_count'] = len(pdf_reader.pages)\n \n for page_num, page in enumerate(pdf_reader.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf_reader.metadata:\n metadata.update({\n 'title': pdf_reader.metadata.get('/Title', ''),\n 'author': pdf_reader.metadata.get('/Author', ''),\n 'subject': pdf_reader.metadata.get('/Subject', ''),\n 'creator': pdf_reader.metadata.get('/Creator', ''),\n 'producer': pdf_reader.metadata.get('/Producer', ''),\n 'creation_date': pdf_reader.metadata.get('/CreationDate', ''),\n 'modification_date': pdf_reader.metadata.get('/ModDate', '')\n })\n \n content = '\\n'.join(content_parts) if content_parts else \"\"\n return content, metadata", "code_hash": "92ff0fefec17a73a1c8224b15b5c70ea" }, { "name": "_extract_docx", "line_start": 186, "line_end": 227, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取DOCX文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOCX文件的纯文本内容\"\"\"\n if not DOCX_AVAILABLE:\n raise ImportError(\"需要安装 python-docx: pip install python-docx\")\n \n doc = Document(str(file_path))\n content_parts = []\n metadata = {}\n \n # 提取所有段落文本\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content_parts.append(paragraph.text)\n \n # 提取表格内容\n for table in doc.tables:\n table_content = []\n for row in table.rows:\n row_content = []\n for cell in row.cells:\n row_content.append(cell.text.strip())\n table_content.append('\\t'.join(row_content))\n if table_content:\n content_parts.append('\\n=== 表格 ===\\n' + '\\n'.join(table_content) + '\\n')\n \n # 获取文档属性\n core_props = doc.core_properties\n metadata.update({\n 'title': core_props.title or '',\n 'author': core_props.author or '',\n 'subject': core_props.subject or '',\n 'keywords': core_props.keywords or '',\n 'comments': core_props.comments or '',\n 'created': str(core_props.created) if core_props.created else '',\n 'modified': str(core_props.modified) if core_props.modified else '',\n 'last_modified_by': core_props.last_modified_by or '',\n 'paragraph_count': len(doc.paragraphs),\n 'table_count': len(doc.tables)\n })\n \n content = '\\n'.join(content_parts)\n return content, metadata", "code_hash": "c7b53be194f6c57266ac47d0f4191b08" }, { "name": "_extract_doc", "line_start": 229, "line_end": 243, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取DOC文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOC文件的纯文本内容\"\"\"\n # DOC格式较复杂,建议转换为DOCX或使用专门的库\n logger.warning(\"DOC格式支持有限,建议转换为DOCX格式\")\n \n # 尝试读取为文本文件\n try:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n except:\n with open(file_path, 'r', encoding='gbk', errors='ignore') as file:\n content = file.read()\n \n metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}\n return content, metadata", "code_hash": "a9f6fc06bae7709495db7817d022c1ca" }, { "name": "_extract_txt", "line_start": 245, "line_end": 272, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取TXT/MD文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取TXT/MD文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n # 最后尝试忽略错误\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines()),\n 'char_count': len(content)\n }\n \n return content, metadata", "code_hash": "a531b26825b63c4db57e234dc159a0d7" }, { "name": "_extract_xlsx", "line_start": 274, "line_end": 301, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取XLSX文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLSX文件的纯文本内容\"\"\"\n if not EXCEL_AVAILABLE:\n raise ImportError(\"需要安装 openpyxl: pip install openpyxl\")\n \n workbook = load_workbook(file_path, read_only=True)\n content_parts = []\n metadata = {\n 'sheet_count': len(workbook.sheetnames),\n 'sheet_names': workbook.sheetnames\n }\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content_parts.append(f\"\\n=== 工作表: {sheet_name} ===\\n\")\n \n for row in sheet.iter_rows(values_only=True):\n row_content = []\n for cell in row:\n if cell is not None:\n row_content.append(str(cell))\n else:\n row_content.append(\"\")\n if any(cell.strip() for cell in row_content): # 跳过空行\n content_parts.append('\\t'.join(row_content))\n \n content = '\\n'.join(content_parts)\n return content, metadata", "code_hash": "4b90a3f26bbb6b4e5da3fbd78a7ea278" }, { "name": "_extract_xls", "line_start": 303, "line_end": 315, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取XLS文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLS文件的纯文本内容\"\"\"\n logger.warning(\"XLS格式支持有限,建议转换为XLSX格式\")\n \n # 简单的文本提取\n try:\n with open(file_path, 'rb') as file:\n content = file.read().decode('utf-8', errors='ignore')\n except:\n content = f\"无法读取XLS文件: {file_path}\"\n \n metadata = {'format': 'xls', 'note': '可能存在格式问题'}\n return content, metadata", "code_hash": "4a81d1dc3865abd2faa49705e5e8f2e9" }, { "name": "_extract_csv", "line_start": 317, "line_end": 348, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "Path" } ], "return_type": "tuple[str, Dict[str, Any]]", "docstring": "提取CSV文件的纯文本内容", "is_async": false, "decorators": [], "code": " def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取CSV文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n # 计算行数和列数\n lines = content.splitlines()\n row_count = len(lines)\n col_count = len(lines[0].split(',')) if lines else 0\n \n metadata = {\n 'encoding': used_encoding,\n 'row_count': row_count,\n 'estimated_col_count': col_count\n }\n \n return content, metadata", "code_hash": "d34952e59e05f67baf52a96ce6f99d9d" }, { "name": "get_supported_formats", "line_start": 350, "line_end": 352, "args": [ { "name": "self" } ], "return_type": "List[str]", "docstring": "获取支持的文件格式列表", "is_async": false, "decorators": [], "code": " def get_supported_formats(self) -> List[str]:\n \"\"\"获取支持的文件格式列表\"\"\"\n return list(self.supported_formats.keys())", "code_hash": "657931bc9cac8245425c24c3fb872456" }, { "name": "is_supported", "line_start": 354, "line_end": 356, "args": [ { "name": "self" }, { "name": "file_path", "type_hint": "str" } ], "return_type": "bool", "docstring": "检查文件格式是否支持", "is_async": false, "decorators": [], "code": " def is_supported(self, file_path: str) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n return Path(file_path).suffix.lower() in self.supported_formats ", "code_hash": "5b7ac2ce0b4a7b9d115bef46ab04c2dd" } ], "docstring": "文本提取器 - 只做纯文本提取,保留所有原始内容", "decorators": [], "code": "class TextExtractor:\n \"\"\"文本提取器 - 只做纯文本提取,保留所有原始内容\"\"\"\n \n def __init__(self, config: Optional[Dict[str, Any]] = None):\n self.config = config or {}\n self.supported_formats = {\n '.pdf': self._extract_pdf,\n '.docx': self._extract_docx,\n '.doc': self._extract_doc,\n '.txt': self._extract_txt,\n '.md': self._extract_txt,\n '.xlsx': self._extract_xlsx,\n '.xls': self._extract_xls,\n '.csv': self._extract_csv\n }\n \n def extract(self, file_path: str) -> ExtractedDocument:\n \"\"\"提取单个文件的文本内容\"\"\"\n path_obj = Path(file_path)\n \n if not path_obj.exists():\n raise FileNotFoundError(f\"文件不存在: {file_path}\")\n \n file_ext = path_obj.suffix.lower()\n if file_ext not in self.supported_formats:\n raise ValueError(f\"不支持的文件格式: {file_ext}\")\n \n try:\n # 获取文件信息\n file_size = path_obj.stat().st_size\n \n # 提取文本内容\n extractor = self.supported_formats[file_ext]\n content, metadata = extractor(path_obj)\n \n return ExtractedDocument(\n filename=path_obj.name,\n file_type=file_ext,\n content=content,\n metadata=metadata,\n extracted_at=datetime.now(),\n file_size=file_size,\n page_count=metadata.get('page_count')\n )\n \n except Exception as e:\n logger.error(f\"提取文件 {file_path} 时出错: {str(e)}\")\n raise\n \n def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:\n \"\"\"批量提取多个文件的文本内容\"\"\"\n results = []\n \n for file_path in file_paths:\n try:\n result = self.extract(file_path)\n results.append(result)\n logger.info(f\"成功提取文件: {file_path}\")\n except Exception as e:\n logger.error(f\"提取文件 {file_path} 失败: {str(e)}\")\n # 创建错误记录\n error_doc = ExtractedDocument(\n filename=Path(file_path).name,\n file_type=Path(file_path).suffix.lower(),\n content=f\"提取失败: {str(e)}\",\n metadata={\"error\": str(e)},\n extracted_at=datetime.now(),\n file_size=0\n )\n results.append(error_doc)\n \n return results\n \n def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取PDF文件的纯文本内容\"\"\"\n if not PDF_AVAILABLE:\n raise ImportError(\"需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber\")\n \n content_parts = []\n metadata = {}\n \n try:\n # 使用pdfplumber提取文本(更好的文本提取)\n with pdfplumber.open(file_path) as pdf:\n metadata['page_count'] = len(pdf.pages)\n \n for page_num, page in enumerate(pdf.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf.metadata:\n metadata.update({\n 'title': pdf.metadata.get('Title', ''),\n 'author': pdf.metadata.get('Author', ''),\n 'subject': pdf.metadata.get('Subject', ''),\n 'creator': pdf.metadata.get('Creator', ''),\n 'producer': pdf.metadata.get('Producer', ''),\n 'creation_date': pdf.metadata.get('CreationDate', ''),\n 'modification_date': pdf.metadata.get('ModDate', '')\n })\n \n except Exception as e:\n logger.warning(f\"pdfplumber提取失败,尝试使用PyPDF2: {str(e)}\")\n \n # 备用方案:使用PyPDF2\n with open(file_path, 'rb') as file:\n pdf_reader = PyPDF2.PdfReader(file)\n metadata['page_count'] = len(pdf_reader.pages)\n \n for page_num, page in enumerate(pdf_reader.pages, 1):\n page_text = page.extract_text()\n if page_text:\n content_parts.append(f\"=== 第 {page_num} 页 ===\\n{page_text}\\n\")\n \n # 获取文档元数据\n if pdf_reader.metadata:\n metadata.update({\n 'title': pdf_reader.metadata.get('/Title', ''),\n 'author': pdf_reader.metadata.get('/Author', ''),\n 'subject': pdf_reader.metadata.get('/Subject', ''),\n 'creator': pdf_reader.metadata.get('/Creator', ''),\n 'producer': pdf_reader.metadata.get('/Producer', ''),\n 'creation_date': pdf_reader.metadata.get('/CreationDate', ''),\n 'modification_date': pdf_reader.metadata.get('/ModDate', '')\n })\n \n content = '\\n'.join(content_parts) if content_parts else \"\"\n return content, metadata\n \n def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOCX文件的纯文本内容\"\"\"\n if not DOCX_AVAILABLE:\n raise ImportError(\"需要安装 python-docx: pip install python-docx\")\n \n doc = Document(str(file_path))\n content_parts = []\n metadata = {}\n \n # 提取所有段落文本\n for paragraph in doc.paragraphs:\n if paragraph.text.strip():\n content_parts.append(paragraph.text)\n \n # 提取表格内容\n for table in doc.tables:\n table_content = []\n for row in table.rows:\n row_content = []\n for cell in row.cells:\n row_content.append(cell.text.strip())\n table_content.append('\\t'.join(row_content))\n if table_content:\n content_parts.append('\\n=== 表格 ===\\n' + '\\n'.join(table_content) + '\\n')\n \n # 获取文档属性\n core_props = doc.core_properties\n metadata.update({\n 'title': core_props.title or '',\n 'author': core_props.author or '',\n 'subject': core_props.subject or '',\n 'keywords': core_props.keywords or '',\n 'comments': core_props.comments or '',\n 'created': str(core_props.created) if core_props.created else '',\n 'modified': str(core_props.modified) if core_props.modified else '',\n 'last_modified_by': core_props.last_modified_by or '',\n 'paragraph_count': len(doc.paragraphs),\n 'table_count': len(doc.tables)\n })\n \n content = '\\n'.join(content_parts)\n return content, metadata\n \n def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取DOC文件的纯文本内容\"\"\"\n # DOC格式较复杂,建议转换为DOCX或使用专门的库\n logger.warning(\"DOC格式支持有限,建议转换为DOCX格式\")\n \n # 尝试读取为文本文件\n try:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n except:\n with open(file_path, 'r', encoding='gbk', errors='ignore') as file:\n content = file.read()\n \n metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}\n return content, metadata\n \n def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取TXT/MD文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n # 最后尝试忽略错误\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n metadata = {\n 'encoding': used_encoding,\n 'line_count': len(content.splitlines()),\n 'char_count': len(content)\n }\n \n return content, metadata\n \n def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLSX文件的纯文本内容\"\"\"\n if not EXCEL_AVAILABLE:\n raise ImportError(\"需要安装 openpyxl: pip install openpyxl\")\n \n workbook = load_workbook(file_path, read_only=True)\n content_parts = []\n metadata = {\n 'sheet_count': len(workbook.sheetnames),\n 'sheet_names': workbook.sheetnames\n }\n \n for sheet_name in workbook.sheetnames:\n sheet = workbook[sheet_name]\n content_parts.append(f\"\\n=== 工作表: {sheet_name} ===\\n\")\n \n for row in sheet.iter_rows(values_only=True):\n row_content = []\n for cell in row:\n if cell is not None:\n row_content.append(str(cell))\n else:\n row_content.append(\"\")\n if any(cell.strip() for cell in row_content): # 跳过空行\n content_parts.append('\\t'.join(row_content))\n \n content = '\\n'.join(content_parts)\n return content, metadata\n \n def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取XLS文件的纯文本内容\"\"\"\n logger.warning(\"XLS格式支持有限,建议转换为XLSX格式\")\n \n # 简单的文本提取\n try:\n with open(file_path, 'rb') as file:\n content = file.read().decode('utf-8', errors='ignore')\n except:\n content = f\"无法读取XLS文件: {file_path}\"\n \n metadata = {'format': 'xls', 'note': '可能存在格式问题'}\n return content, metadata\n \n def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:\n \"\"\"提取CSV文件的纯文本内容\"\"\"\n encodings = ['utf-8', 'gbk', 'gb2312']\n content = \"\"\n used_encoding = \"\"\n \n for encoding in encodings:\n try:\n with open(file_path, 'r', encoding=encoding) as file:\n content = file.read()\n used_encoding = encoding\n break\n except UnicodeDecodeError:\n continue\n \n if not content:\n with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:\n content = file.read()\n used_encoding = 'utf-8 (with errors ignored)'\n \n # 计算行数和列数\n lines = content.splitlines()\n row_count = len(lines)\n col_count = len(lines[0].split(',')) if lines else 0\n \n metadata = {\n 'encoding': used_encoding,\n 'row_count': row_count,\n 'estimated_col_count': col_count\n }\n \n return content, metadata\n \n def get_supported_formats(self) -> List[str]:\n \"\"\"获取支持的文件格式列表\"\"\"\n return list(self.supported_formats.keys())\n \n def is_supported(self, file_path: str) -> bool:\n \"\"\"检查文件格式是否支持\"\"\"\n return Path(file_path).suffix.lower() in self.supported_formats ", "code_hash": "37d892d5ebf6d743d758c52421e08e2b" } ], "imports": [ { "type": "import", "modules": [ "os" ], "aliases": [] }, { "type": "import", "modules": [ "logging" ], "aliases": [] }, { "type": "from_import", "module": "typing", "names": [ "List", "Dict", "Any", "Optional" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "pathlib", "names": [ "Path" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "dataclasses", "names": [ "dataclass" ], "aliases": [], "level": 0 }, { "type": "from_import", "module": "datetime", "names": [ "datetime" ], "aliases": [], "level": 0 }, { "type": "import", "modules": [ "PyPDF2" ], "aliases": [] }, { "type": "import", "modules": [ "pdfplumber" ], "aliases": [] }, { "type": "from_import", "module": "docx", "names": [ "Document" ], "aliases": [], "level": 0 }, { "type": "import", "modules": [ "openpyxl" ], "aliases": [] }, { "type": "from_import", "module": "openpyxl", "names": [ "load_workbook" ], "aliases": [], "level": 0 } ], "constants": [ { "name": "PDF_AVAILABLE", "value": true, "type": "bool", "line": 20 }, { "name": "DOCX_AVAILABLE", "value": true, "type": "bool", "line": 26 }, { "name": "EXCEL_AVAILABLE", "value": true, "type": "bool", "line": 33 }, { "name": "PDF_AVAILABLE", "value": false, "type": "bool", "line": 22 }, { "name": "DOCX_AVAILABLE", "value": false, "type": "bool", "line": 28 }, { "name": "EXCEL_AVAILABLE", "value": false, "type": "bool", "line": 35 } ], "docstring": "文本提取器模块\n支持从PDF、Word、TXT等格式的文档中提取文本内容", "content_hash": "99faa0b695aa0a84886242bcb430ff10" }