初始化
This commit is contained in:
64
app/services/document.py
Normal file
64
app/services/document.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import os
|
||||
from typing import List, Optional
|
||||
from PyPDF2 import PdfReader
|
||||
from docx import Document
|
||||
import pdfplumber
|
||||
|
||||
|
||||
class DocumentService:
    """Extract plain text from PDF, Word and TXT files and save the result.

    Parsing methods take a file path and return the extracted text with
    leading/trailing whitespace stripped. ``save_parsed_text`` writes text to
    ``<parsed_dir>/<doc_id>.txt``.
    """

    def __init__(self, raw_dir: str, parsed_dir: str):
        # Stored for callers; no method of this class reads it.
        self.raw_dir = raw_dir
        # Target directory used by save_parsed_text.
        self.parsed_dir = parsed_dir

    def parse_pdf(self, file_path: str) -> str:
        """Parse a PDF file.

        pdfplumber is tried first. If it raises for any reason, the whole
        file is re-read with PyPDF2's ``PdfReader`` instead.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The extracted text, one page per line group, stripped.

        Raises:
            Exception: whatever ``PdfReader`` raises when the fallback also
                fails.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                extracted = (page.extract_text() for page in pdf.pages)
                # Pages without text are skipped entirely on this path.
                text = "".join(
                    f"{page_text}\n" for page_text in extracted if page_text
                )
        except Exception:
            # Deliberate best-effort fallback. `text` is assigned from scratch
            # here, so pages pdfplumber managed to read before failing are not
            # emitted twice.
            reader = PdfReader(file_path)
            # `or ""` keeps a page with no extractable text from raising
            # TypeError on concatenation.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

        return text.strip()

    def parse_docx(self, file_path: str) -> str:
        """Parse a Word file.

        Args:
            file_path: Path of the document to open with ``docx.Document``.

        Returns:
            The text of all paragraphs joined by newlines, stripped.
        """
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()

    def parse_txt(self, file_path: str) -> str:
        """Parse a TXT file.

        Args:
            file_path: Path of a UTF-8 encoded text file.

        Returns:
            The file content, stripped.

        Raises:
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        # "utf-8-sig" reads plain UTF-8 unchanged and additionally drops a
        # leading byte-order mark, which str.strip() would otherwise keep.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            return f.read().strip()

    def parse_document(self, file_path: str) -> str:
        """Parse a document according to its file extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The extracted text.

        Raises:
            ValueError: if the extension is not .pdf, .docx, .doc or .txt.
        """
        ext = os.path.splitext(file_path)[1].lower()

        if ext == ".pdf":
            return self.parse_pdf(file_path)
        # NOTE(review): .doc is routed to the same parser as .docx; confirm
        # that the docx library can actually open legacy .doc files.
        if ext in (".docx", ".doc"):
            return self.parse_docx(file_path)
        if ext == ".txt":
            return self.parse_txt(file_path)
        raise ValueError(f"Unsupported file format: {ext}")

    def save_parsed_text(self, doc_id: str, text: str) -> str:
        """Save parsed text as ``<parsed_dir>/<doc_id>.txt`` (UTF-8).

        Args:
            doc_id: Identifier used as the file name stem.
            text: Content to write.

        Returns:
            The path of the written file.
        """
        # Create the target directory on first use; an empty parsed_dir means
        # "current directory" to os.path.join, so there is nothing to create.
        if self.parsed_dir:
            os.makedirs(self.parsed_dir, exist_ok=True)
        parsed_path = os.path.join(self.parsed_dir, f"{doc_id}.txt")
        with open(parsed_path, "w", encoding="utf-8") as f:
            f.write(text)
        return parsed_path
|
||||
|
||||
|
||||
def get_document_service(raw_dir: str, parsed_dir: str) -> DocumentService:
    """Build a DocumentService for the given raw and parsed directories."""
    service = DocumentService(raw_dir, parsed_dir)
    return service
|
||||
Reference in New Issue
Block a user