Files
2026-05-11 11:22:55 +08:00

64 lines
2.0 KiB
Python

import os
from typing import List, Optional
from PyPDF2 import PdfReader
from docx import Document
import pdfplumber
class DocumentService:
    """Extract plain text from PDF, Word and TXT files and persist the result.

    Attributes:
        raw_dir: Directory path supplied by the caller for raw documents. It is
            only stored here; none of the methods in this class read it.
        parsed_dir: Directory where ``save_parsed_text`` writes its output.
    """

    def __init__(self, raw_dir: str, parsed_dir: str):
        """Store the raw and parsed directory paths without touching the disk."""
        self.raw_dir = raw_dir
        self.parsed_dir = parsed_dir

    def parse_pdf(self, file_path: str) -> str:
        """Return the text of a PDF file, one extracted page per line group.

        pdfplumber is tried first; pages for which it yields no text are
        skipped. If pdfplumber raises for any reason, the file is re-read with
        PyPDF2 instead.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The page texts joined by newlines, with leading and trailing
            whitespace removed.

        Raises:
            Exception: Whatever PyPDF2 raises if the fallback fails as well.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                extracted = [page.extract_text() for page in pdf.pages]
            chunks = [page_text for page_text in extracted if page_text]
        except Exception:
            # Deliberate best-effort fallback. ``chunks`` is rebuilt from
            # scratch so pages pdfplumber managed to read before failing are
            # not emitted twice.
            reader = PdfReader(file_path)
            # ``or ""`` guards against a page that yields no text object.
            chunks = [page.extract_text() or "" for page in reader.pages]
        return "".join(f"{chunk}\n" for chunk in chunks).strip()

    def parse_docx(self, file_path: str) -> str:
        """Return the text of all paragraphs in a Word document.

        Args:
            file_path: Path of the document to open with python-docx.

        Returns:
            The paragraph texts joined by newlines, with leading and trailing
            whitespace removed.
        """
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()

    def parse_txt(self, file_path: str) -> str:
        """Return the stripped contents of a UTF-8 encoded text file.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The file contents with leading and trailing whitespace removed.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()

    def parse_document(self, file_path: str) -> str:
        """Parse a document with the parser that matches its file extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path of the document to parse.

        Returns:
            The extracted text.

        Raises:
            ValueError: If the extension is not .pdf, .docx, .doc or .txt.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".pdf":
            return self.parse_pdf(file_path)
        # NOTE(review): legacy binary ".doc" files are presumably not readable
        # by python-docx; the routing is kept as-is for backward compatibility
        # -- confirm whether ".doc" should be rejected or converted first.
        if ext in (".docx", ".doc"):
            return self.parse_docx(file_path)
        if ext == ".txt":
            return self.parse_txt(file_path)
        raise ValueError(f"Unsupported file format: {ext}")

    def save_parsed_text(self, doc_id: str, text: str) -> str:
        """Write parsed text to ``<parsed_dir>/<doc_id>.txt`` as UTF-8.

        The parsed directory is created first if it does not exist yet.

        Args:
            doc_id: Identifier used as the output file's base name.
            text: Text to write.

        Returns:
            The path of the written file.
        """
        # An empty parsed_dir means "current directory"; makedirs("") would raise.
        if self.parsed_dir:
            os.makedirs(self.parsed_dir, exist_ok=True)
        parsed_path = os.path.join(self.parsed_dir, f"{doc_id}.txt")
        with open(parsed_path, "w", encoding="utf-8") as f:
            f.write(text)
        return parsed_path
def get_document_service(raw_dir: str, parsed_dir: str) -> DocumentService:
    """Build a new DocumentService for the given raw and parsed directories."""
    service = DocumentService(raw_dir=raw_dir, parsed_dir=parsed_dir)
    return service