64 lines
2.0 KiB
Python
64 lines
2.0 KiB
Python
|
|
import os
|
||
|
|
from typing import List, Optional
|
||
|
|
from PyPDF2 import PdfReader
|
||
|
|
from docx import Document
|
||
|
|
import pdfplumber
|
||
|
|
|
||
|
|
|
||
|
|
class DocumentService:
    """Extract plain text from PDF, Word and TXT files and save the result.

    Attributes:
        raw_dir: Directory path given at construction. It is only stored;
            no method of this class reads it.
        parsed_dir: Directory in which ``save_parsed_text`` writes its
            ``<doc_id>.txt`` files.
    """

    def __init__(self, raw_dir: str, parsed_dir: str):
        """Store the raw and parsed directory paths."""
        self.raw_dir = raw_dir
        self.parsed_dir = parsed_dir

    def parse_pdf(self, file_path: str) -> str:
        """Parse a PDF file and return its text.

        pdfplumber is tried first; if it raises for any reason, the file is
        re-read from scratch with PyPDF2's ``PdfReader``.

        Args:
            file_path: Path of the PDF file.

        Returns:
            The page texts joined with newlines, with leading and trailing
            whitespace stripped.

        Raises:
            Exception: Whatever ``PdfReader`` raises when the fallback fails
                as well.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                # Pages without extractable text are skipped.
                page_texts = [
                    page_text
                    for page_text in (page.extract_text() for page in pdf.pages)
                    if page_text
                ]
        except Exception:
            # Deliberate best-effort fallback. The list is rebuilt rather than
            # extended so that pages pdfplumber extracted before failing are
            # not emitted twice.
            reader = PdfReader(file_path)
            # "or ''" guards against an extractor that yields None.
            page_texts = [page.extract_text() or "" for page in reader.pages]

        return "\n".join(page_texts).strip()

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word file and return its paragraph text.

        Args:
            file_path: Path of the Word document.

        Returns:
            The text of every paragraph in ``doc.paragraphs`` joined with
            newlines, with leading and trailing whitespace stripped.
        """
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()

    def parse_txt(self, file_path: str) -> str:
        """Parse a TXT file.

        Args:
            file_path: Path of a UTF-8 encoded text file.

        Returns:
            The file content with leading and trailing whitespace stripped.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()

    def parse_document(self, file_path: str) -> str:
        """Parse a document according to its file extension.

        Args:
            file_path: Path of the document; the extension is matched
                case-insensitively.

        Returns:
            The extracted text.

        Raises:
            ValueError: If the extension is not .pdf, .docx, .doc or .txt.
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == ".pdf":
            return self.parse_pdf(file_path)
        # NOTE(review): python-docx documents support for .docx packages
        # only, so a legacy binary .doc is expected to fail inside
        # Document(); confirm whether .doc should be rejected here instead.
        if ext in (".docx", ".doc"):
            return self.parse_docx(file_path)
        if ext == ".txt":
            return self.parse_txt(file_path)
        raise ValueError(f"Unsupported file format: {ext}")

    def save_parsed_text(self, doc_id: str, text: str) -> str:
        """Save parsed text as ``<doc_id>.txt`` inside ``parsed_dir``.

        Args:
            doc_id: Identifier used as the file name stem.
            text: Text to write, encoded as UTF-8.

        Returns:
            The path of the written file.
        """
        # Create the output directory on first use. An empty parsed_dir means
        # the current directory, which os.makedirs would reject.
        if self.parsed_dir:
            os.makedirs(self.parsed_dir, exist_ok=True)
        parsed_path = os.path.join(self.parsed_dir, f"{doc_id}.txt")
        with open(parsed_path, "w", encoding="utf-8") as f:
            f.write(text)
        return parsed_path
|
||
|
|
|
||
|
|
|
||
|
|
def get_document_service(raw_dir: str, parsed_dir: str) -> DocumentService:
    """Build a DocumentService for the given raw and parsed directories."""
    service = DocumentService(raw_dir=raw_dir, parsed_dir=parsed_dir)
    return service
|