import logging
import os
from typing import List, Optional

import pdfplumber
from docx import Document
from PyPDF2 import PdfReader

logger = logging.getLogger(__name__)


class DocumentService:
    """Extract plain text from PDF, Word and TXT files and persist the result.

    Attributes:
        raw_dir: Directory path given at construction; stored but not read by
            any method of this class.
        parsed_dir: Directory into which ``save_parsed_text`` writes its
            ``<doc_id>.txt`` files.
    """

    def __init__(self, raw_dir: str, parsed_dir: str):
        self.raw_dir = raw_dir
        self.parsed_dir = parsed_dir

    def parse_pdf(self, file_path: str) -> str:
        """Parse a PDF file and return its text.

        pdfplumber is tried first; if it raises for any reason, the whole
        file is re-read with PyPDF2 instead.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The extracted page texts separated by newlines, with leading and
            trailing whitespace stripped.

        Raises:
            Exception: Whatever PyPDF2 raises if the fallback also fails.
        """
        try:
            pages = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    # Pages without extractable text are skipped entirely.
                    if page_text:
                        pages.append(page_text)
        except Exception:
            # Deliberate best-effort: any pdfplumber failure triggers the
            # fallback. The failure is logged instead of silently swallowed.
            logger.warning(
                "pdfplumber failed on %s; falling back to PyPDF2",
                file_path,
                exc_info=True,
            )
            reader = PdfReader(file_path)
            # Start from scratch: text gathered before pdfplumber failed is
            # discarded so the output never contains duplicated pages.
            # `or ""` guards against a page yielding None instead of a string.
            pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages).strip()

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word file and return its paragraph text.

        Args:
            file_path: Path of the Word document to read.

        Returns:
            The text of every paragraph in ``doc.paragraphs`` joined by
            newlines, with leading and trailing whitespace stripped.
        """
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()

    def parse_txt(self, file_path: str) -> str:
        """Parse a UTF-8 text file and return its stripped content.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The file content with leading and trailing whitespace stripped.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # "utf-8-sig" reads plain UTF-8 as well, and additionally drops a
        # leading byte-order mark that would otherwise survive strip().
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read().strip()

    def parse_document(self, file_path: str) -> str:
        """Parse a document, choosing the parser from the file extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path of the document to read.

        Returns:
            The extracted text.

        Raises:
            ValueError: If the extension is not .pdf, .docx, .doc or .txt.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".pdf":
            return self.parse_pdf(file_path)
        # NOTE(review): ".doc" is routed to the python-docx based parser;
        # legacy binary .doc files presumably fail there - confirm whether
        # they need a conversion step or should be rejected.
        if ext in (".docx", ".doc"):
            return self.parse_docx(file_path)
        if ext == ".txt":
            return self.parse_txt(file_path)
        raise ValueError(f"Unsupported file format: {ext}")

    def save_parsed_text(self, doc_id: str, text: str) -> str:
        """Save parsed text as ``<parsed_dir>/<doc_id>.txt``.

        Args:
            doc_id: Identifier used as the output file's base name.
            text: Text to write, encoded as UTF-8.

        Returns:
            The path of the written file.
        """
        # Create the output directory on demand so the first save does not
        # fail; an empty parsed_dir means the current working directory.
        if self.parsed_dir:
            os.makedirs(self.parsed_dir, exist_ok=True)
        parsed_path = os.path.join(self.parsed_dir, f"{doc_id}.txt")
        with open(parsed_path, "w", encoding="utf-8") as f:
            f.write(text)
        return parsed_path


def get_document_service(raw_dir: str, parsed_dir: str) -> DocumentService:
    """Build a ``DocumentService`` for the given raw and parsed directories."""
    return DocumentService(raw_dir, parsed_dir)