v0.21.1-fastapi

This commit is contained in:
2025-11-04 16:06:36 +08:00
parent 3e58c3d0e9
commit d57b5d76ae
218 changed files with 19617 additions and 72339 deletions

View File

@@ -17,15 +17,30 @@ import json
import os.path
import pathlib
import re
import traceback
from pathlib import Path
from typing import List, Optional
from typing import Optional, List
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, Query
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPAuthorizationCredentials
from api.utils.api_utils import security
from fastapi import APIRouter, Depends, Query, UploadFile, File, Form
from fastapi.responses import Response
from api.apps.models.auth_dependencies import get_current_user
from api.apps.models.document_models import (
CreateDocumentRequest,
WebCrawlRequest,
ListDocumentsQuery,
ListDocumentsBody,
FilterDocumentsRequest,
GetDocumentInfosRequest,
ChangeStatusRequest,
DeleteDocumentRequest,
RunDocumentRequest,
RenameDocumentRequest,
ChangeParserRequest,
ChangeParserSimpleRequest,
UploadAndParseRequest,
ParseRequest,
SetMetaRequest,
)
from api import settings
from api.common.check_team_permission import check_kb_team_permission
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
@@ -43,159 +58,29 @@ from api.utils.api_utils import (
get_data_error_result,
get_json_result,
server_error_response,
validate_request,
)
from api.utils.file_utils import filename_type, get_project_base_directory, thumbnail
from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from rag.nlp import search, rag_tokenizer
from rag.utils.storage_factory import STORAGE_IMPL
from pydantic import BaseModel
from api.db.db_models import User
# Security
# Pydantic models for request/response
class WebCrawlRequest(BaseModel):
    """JSON body accepted by the ``/web_crawl`` endpoint."""

    # Identifier of the knowledge base the crawled page is stored in.
    kb_id: str
    # Name given to the document created from the crawled page.
    name: str
    # Address of the page to crawl; the handler validates it with is_valid_url().
    url: str
class CreateDocumentRequest(BaseModel):
    """JSON body accepted by the ``/create`` endpoint."""

    # Document name; the handler strips it and enforces FILE_NAME_LEN_LIMIT (in UTF-8 bytes).
    name: str
    # Identifier of the knowledge base that will own the document.
    kb_id: str
class DocumentListRequest(BaseModel):
    """Optional JSON body of the ``/list`` endpoint carrying the list filters.

    Every filter defaults to an empty list, which the handler treats as
    "no restriction".
    """

    # Run states to keep; the handler checks each entry against VALID_TASK_STATUS.
    run_status: List[str] = []
    # File types to keep; the handler checks each entry against VALID_FILE_TYPES.
    types: List[str] = []
    # File suffixes to keep; passed through to the document query unchanged.
    suffix: List[str] = []
class DocumentFilterRequest(BaseModel):
    """JSON body accepted by the ``/filter`` endpoint."""

    # Identifier of the knowledge base whose documents are inspected.
    kb_id: str
    # Free-text keywords; empty string by default.
    keywords: str = ""
    # Run states to keep; the handler checks each entry against VALID_TASK_STATUS.
    run_status: List[str] = []
    # File types to keep; the handler checks each entry against VALID_FILE_TYPES.
    types: List[str] = []
    # File suffixes to keep.
    suffix: List[str] = []
class DocumentInfosRequest(BaseModel):
    """JSON body accepted by the ``/infos`` endpoint."""

    # Documents to look up; the handler checks access for every id before answering.
    doc_ids: List[str]
class ChangeStatusRequest(BaseModel):
    """JSON body accepted by the ``/change_status`` endpoint."""

    # Documents whose status is to be changed.
    doc_ids: List[str]
    # Target status; the handler only accepts the strings "0" and "1".
    status: str
class RemoveDocumentRequest(BaseModel):
    """JSON body accepted by the ``/rm`` endpoint."""

    # Documents to delete. Despite the singular name this is a list.
    # NOTE(review): the handler still wraps a bare string into a list, but a
    # plain string cannot pass validation against List[str] — confirm whether
    # a single-id payload is meant to be supported.
    doc_id: List[str]
class RunDocumentRequest(BaseModel):
    """JSON body accepted by the ``/run`` endpoint."""

    # Documents to act on.
    doc_ids: List[str]
    # Requested run state; the handler compares it with TaskStatus values
    # (RUNNING and CANCEL are the ones it branches on).
    run: int
    # When true, the handler also removes the document's tasks and indexed data.
    delete: bool = False
class RenameDocumentRequest(BaseModel):
    """JSON body accepted by the ``/rename`` endpoint."""

    # Document to rename.
    doc_id: str
    # New name; the handler requires the same file extension as the current
    # name and enforces FILE_NAME_LEN_LIMIT (in UTF-8 bytes).
    name: str
class ChangeParserRequest(BaseModel):
    """JSON body accepted by the ``/change_parser`` endpoint."""

    # Document whose parsing setup is changed.
    doc_id: str
    # Identifier of the parser to switch to.
    parser_id: str
    # Optional pipeline to switch to; when set, the handler handles the
    # pipeline change and returns before looking at parser_id.
    pipeline_id: Optional[str] = None
    # Optional parser configuration to store alongside the parser change.
    parser_config: Optional[dict] = None
class UploadAndParseRequest(BaseModel):
    """Request model holding the conversation an upload-and-parse call targets."""

    # Identifier of the conversation the uploaded files belong to.
    conversation_id: str
class ParseRequest(BaseModel):
    """Optional JSON body of the ``/parse`` endpoint."""

    # Page to fetch and parse; when absent the handler falls back to uploaded files.
    url: Optional[str] = None
class SetMetaRequest(BaseModel):
    """JSON body accepted by the ``/set_meta`` endpoint."""

    # Document whose metadata is replaced.
    doc_id: str
    # Metadata encoded as a JSON string; the handler decodes it with json.loads().
    meta: str
# Dependency injection
async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Resolve the bearer credentials of the request to the matching valid user.

    The credential string is decoded with an ``itsdangerous``
    ``URLSafeTimedSerializer`` keyed by ``settings.SECRET_KEY``. The decoded
    value is the user's access token, which is then looked up among users
    whose status is ``StatusEnum.VALID``.

    Args:
        credentials: Credentials extracted by the ``security`` dependency.

    Returns:
        The first user record whose ``access_token`` matches the decoded token.

    Raises:
        HTTPException: 401 with detail "Authorization header required" when no
            credential string is present, or 401 with detail
            "Invalid access token" for every other failure. The precise reason
            is only written to the log so it is not disclosed to the client.
    """
    import logging

    from fastapi import HTTPException, status
    # The previous fallback bound the ``jwt`` module to ``Serializer`` and then
    # called it, which can never work (a module is not callable and PyJWT has
    # no ``loads``). itsdangerous is therefore imported unconditionally.
    from itsdangerous.url_safe import URLSafeTimedSerializer

    from api.db import StatusEnum
    from api.db.services.user_service import UserService

    def _unauthorized(detail: str = "Invalid access token") -> HTTPException:
        """Build the 401 response used for every rejection path."""
        return HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail=detail)

    serializer = URLSafeTimedSerializer(secret_key=settings.SECRET_KEY)

    authorization = credentials.credentials
    if not authorization:
        raise _unauthorized("Authorization header required")

    try:
        access_token = str(serializer.loads(authorization))
    except Exception as e:
        logging.warning(f"load_user got exception {e}")
        raise _unauthorized() from e

    token = access_token.strip()
    if not token:
        logging.warning("Authentication attempt with empty access token")
        raise _unauthorized()

    # Access tokens should be UUIDs (32 hex characters).
    if len(token) < 32:
        logging.warning(f"Authentication attempt with invalid token format: {len(access_token)} chars")
        raise _unauthorized()

    try:
        users = UserService.query(access_token=access_token, status=StatusEnum.VALID.value)
    except Exception as e:
        logging.warning(f"load_user got exception {e}")
        raise _unauthorized() from e

    if not users:
        raise _unauthorized()

    user = users[0]
    if not user.access_token or not user.access_token.strip():
        # Logged only: the e-mail address must not be echoed back to the caller.
        logging.warning(f"User {user.email} has empty access_token in database")
        raise _unauthorized()

    return user
# Create router
# 创建路由器
router = APIRouter()
@router.post("/upload")
async def upload(
kb_id: str = Form(...),
file: List[UploadFile] = File(...),
files: List[UploadFile] = File(...),
current_user = Depends(get_current_user)
):
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if not file:
"""上传文档"""
if not files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
# Use UploadFile directly - file is already a list from multiple file fields
file_objs = file
for file_obj in file_objs:
if file_obj.filename == "":
for file_obj in files:
if not file_obj.filename or file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
if len(file_obj.filename.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=settings.RetCode.ARGUMENT_ERROR)
@@ -206,29 +91,30 @@ async def upload(
if not check_kb_team_permission(kb, current_user.id):
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
err, files = await FileService.upload_document(kb, file_objs, current_user.id)
err, uploaded_files = FileService.upload_document(kb, files, current_user.id)
if err:
return get_json_result(data=files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
return get_json_result(data=uploaded_files, message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
if not files:
return get_json_result(data=files, message="There seems to be an issue with your file format. Please verify it is correct and not corrupted.", code=settings.RetCode.DATA_ERROR)
files = [f[0] for f in files] # remove the blob
if not uploaded_files:
return get_json_result(data=uploaded_files, message="There seems to be an issue with your file format. Please verify it is correct and not corrupted.", code=settings.RetCode.DATA_ERROR)
files_result = [f[0] for f in uploaded_files] # remove the blob
return get_json_result(data=files)
return get_json_result(data=files_result)
@router.post("/web_crawl")
async def web_crawl(
req: WebCrawlRequest,
request: WebCrawlRequest,
current_user = Depends(get_current_user)
):
kb_id = req.kb_id
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
name = req.name
url = req.url
"""网页爬取"""
kb_id = request.kb_id
name = request.name
url = request.url
if not is_valid_url(url):
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
@@ -259,6 +145,7 @@ async def web_crawl(
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": filetype,
@@ -285,25 +172,26 @@ async def web_crawl(
@router.post("/create")
async def create(
req: CreateDocumentRequest,
request: CreateDocumentRequest,
current_user = Depends(get_current_user)
):
kb_id = req.kb_id
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
if len(req.name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
"""创建文档"""
req = request.model_dump(exclude_unset=True)
kb_id = req["kb_id"]
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=settings.RetCode.ARGUMENT_ERROR)
if req.name.strip() == "":
if req["name"].strip() == "":
return get_json_result(data=False, message="File name can't be empty.", code=settings.RetCode.ARGUMENT_ERROR)
req.name = req.name.strip()
req["name"] = req["name"].strip()
try:
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_data_error_result(message="Can't find this knowledgebase!")
if DocumentService.query(name=req.name, kb_id=kb_id):
if DocumentService.query(name=req["name"], kb_id=kb_id):
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
kb_root_folder = FileService.get_kb_folder(kb.tenant_id)
@@ -326,8 +214,8 @@ async def create(
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": req.name,
"suffix": Path(req.name).suffix.lstrip("."),
"name": req["name"],
"suffix": Path(req["name"]).suffix.lstrip("."),
"location": "",
"size": 0,
}
@@ -342,47 +230,46 @@ async def create(
@router.post("/list")
async def list_docs(
kb_id: str = Query(...),
keywords: str = Query(""),
page: int = Query(0),
page_size: int = Query(0),
orderby: str = Query("create_time"),
desc: str = Query("true"),
create_time_from: int = Query(0),
create_time_to: int = Query(0),
req: DocumentListRequest = None,
query: ListDocumentsQuery = Depends(),
body: Optional[ListDocumentsBody] = None,
current_user = Depends(get_current_user)
):
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
"""列出文档"""
if body is None:
body = ListDocumentsBody()
kb_id = query.kb_id
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)
keywords = query.keywords or ""
page_number = int(query.page or 0)
items_per_page = int(query.page_size or 0)
orderby = query.orderby or "create_time"
desc = query.desc.lower() == "true" if query.desc else True
create_time_from = int(query.create_time_from or 0)
create_time_to = int(query.create_time_to or 0)
if desc.lower() == "false":
desc_bool = False
else:
desc_bool = True
run_status = req.run_status if req else []
run_status = body.run_status or []
if run_status:
invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
if invalid_status:
return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(invalid_status)}")
types = req.types if req else []
types = body.types or []
if types:
invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
if invalid_types:
return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")
suffix = req.suffix if req else []
suffix = body.suffix or []
try:
docs, tol = DocumentService.get_by_kb_id(kb_id, page, page_size, orderby, desc_bool, keywords, run_status, types, suffix)
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix)
if create_time_from or create_time_to:
filtered_docs = []
@@ -403,12 +290,11 @@ async def list_docs(
@router.post("/filter")
async def get_filter(
req: DocumentFilterRequest,
request: FilterDocumentsRequest,
current_user = Depends(get_current_user)
):
kb_id = req.kb_id
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
"""过滤文档"""
kb_id = request.kb_id
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id):
@@ -416,15 +302,16 @@ async def get_filter(
else:
return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)
keywords = req.keywords
suffix = req.suffix
run_status = req.run_status
keywords = request.keywords or ""
suffix = request.suffix or []
run_status = request.run_status or []
if run_status:
invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
if invalid_status:
return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(invalid_status)}")
types = req.types
types = request.types or []
if types:
invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
if invalid_types:
@@ -439,10 +326,11 @@ async def get_filter(
@router.post("/infos")
async def docinfos(
req: DocumentInfosRequest,
request: GetDocumentInfosRequest,
current_user = Depends(get_current_user)
):
doc_ids = req.doc_ids
"""获取文档信息"""
doc_ids = request.doc_ids
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
@@ -452,8 +340,9 @@ async def docinfos(
@router.get("/thumbnails")
async def thumbnails(
doc_ids: List[str] = Query(...)
doc_ids: List[str] = Query(..., description="文档ID列表"),
):
"""获取文档缩略图"""
if not doc_ids:
return get_json_result(data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)
@@ -471,14 +360,12 @@ async def thumbnails(
@router.post("/change_status")
async def change_status(
req: ChangeStatusRequest,
request: ChangeStatusRequest,
current_user = Depends(get_current_user)
):
doc_ids = req.doc_ids
status = str(req.status)
if status not in ["0", "1"]:
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=settings.RetCode.ARGUMENT_ERROR)
"""修改文档状态"""
doc_ids = request.doc_ids
status = request.status
result = {}
for doc_id in doc_ids:
@@ -511,10 +398,11 @@ async def change_status(
@router.post("/rm")
async def rm(
req: RemoveDocumentRequest,
request: DeleteDocumentRequest,
current_user = Depends(get_current_user)
):
doc_ids = req.doc_id
"""删除文档"""
doc_ids = request.doc_id
if isinstance(doc_ids, str):
doc_ids = [doc_ids]
@@ -570,17 +458,19 @@ async def rm(
@router.post("/run")
async def run(
req: RunDocumentRequest,
request: RunDocumentRequest,
current_user = Depends(get_current_user)
):
for doc_id in req.doc_ids:
"""运行文档解析"""
for doc_id in request.doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
kb_table_num_map = {}
for id in req.doc_ids:
info = {"run": str(req.run), "progress": 0}
if req.run == int(TaskStatus.RUNNING.value) and req.delete:
for id in request.doc_ids:
info = {"run": str(request.run), "progress": 0}
if str(request.run) == TaskStatus.RUNNING.value and request.delete:
info["progress_msg"] = ""
info["chunk_num"] = 0
info["token_num"] = 0
@@ -592,21 +482,21 @@ async def run(
if not e:
return get_data_error_result(message="Document not found!")
if req.run == int(TaskStatus.CANCEL.value):
if str(request.run) == TaskStatus.CANCEL.value:
if str(doc.run) == TaskStatus.RUNNING.value:
cancel_all_task_of(id)
else:
return get_data_error_result(message="Cannot cancel a task that is not in RUNNING status")
if all([req.delete, req.run == int(TaskStatus.RUNNING.value), str(doc.run) == TaskStatus.DONE.value]):
if all([not request.delete, str(request.run) == TaskStatus.RUNNING.value, str(doc.run) == TaskStatus.DONE.value]):
DocumentService.clear_chunk_num_when_rerun(doc.id)
DocumentService.update_by_id(id, info)
if req.delete:
if request.delete:
TaskService.filter_delete([Task.doc_id == id])
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
if req.run == int(TaskStatus.RUNNING.value):
if str(request.run) == TaskStatus.RUNNING.value:
doc = doc.to_dict()
doc["tenant_id"] = tenant_id
@@ -628,37 +518,53 @@ async def run(
return get_json_result(data=True)
except Exception as e:
traceback.print_exc()
return server_error_response(e)
@router.post("/rename")
async def rename(
req: RenameDocumentRequest,
request: RenameDocumentRequest,
current_user = Depends(get_current_user)
):
if not DocumentService.accessible(req.doc_id, current_user.id):
"""重命名文档"""
if not DocumentService.accessible(request.doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
e, doc = DocumentService.get_by_id(req.doc_id)
e, doc = DocumentService.get_by_id(request.doc_id)
if not e:
return get_data_error_result(message="Document not found!")
if pathlib.Path(req.name.lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
if pathlib.Path(request.name.lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=settings.RetCode.ARGUMENT_ERROR)
if len(req.name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
if len(request.name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=settings.RetCode.ARGUMENT_ERROR)
for d in DocumentService.query(name=req.name, kb_id=doc.kb_id):
if d.name == req.name:
for d in DocumentService.query(name=request.name, kb_id=doc.kb_id):
if d.name == request.name:
return get_data_error_result(message="Duplicated document name in the same knowledgebase.")
if not DocumentService.update_by_id(req.doc_id, {"name": req.name}):
if not DocumentService.update_by_id(request.doc_id, {"name": request.name}):
return get_data_error_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(req.doc_id)
informs = File2DocumentService.get_by_document_id(request.doc_id)
if informs:
e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req.name})
FileService.update_by_id(file.id, {"name": request.name})
tenant_id = DocumentService.get_tenant_id(request.doc_id)
title_tks = rag_tokenizer.tokenize(request.name)
es_body = {
"docnm_kwd": request.name,
"title_tks": title_tks,
"title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks),
}
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.update(
{"doc_id": request.doc_id},
es_body,
search.index_name(tenant_id),
doc.kb_id,
)
return get_json_result(data=True)
except Exception as e:
@@ -667,6 +573,7 @@ async def rename(
@router.get("/get/{doc_id}")
async def get(doc_id: str):
"""获取文档文件"""
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
@@ -677,70 +584,79 @@ async def get(doc_id: str):
ext = re.search(r"\.([^.]+)$", doc.name.lower())
ext = ext.group(1) if ext else None
content_type = "application/octet-stream"
if ext:
if doc.type == FileType.VISUAL.value:
media_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
content_type = CONTENT_TYPE_MAP.get(ext, f"image/{ext}")
else:
media_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
else:
media_type = "application/octet-stream"
return StreamingResponse(
iter([content]),
media_type=media_type,
headers={"Content-Disposition": f"attachment; filename={doc.name}"}
)
content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
return Response(content=content, media_type=content_type)
except Exception as e:
return server_error_response(e)
@router.post("/change_parser")
async def change_parser(
req: ChangeParserRequest,
request: ChangeParserSimpleRequest,
current_user = Depends(get_current_user)
):
if not DocumentService.accessible(req.doc_id, current_user.id):
"""修改文档解析器"""
if not DocumentService.accessible(request.doc_id, current_user.id):
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
e, doc = DocumentService.get_by_id(req.doc_id)
e, doc = DocumentService.get_by_id(request.doc_id)
if not e:
return get_data_error_result(message="Document not found!")
def reset_doc():
def reset_doc(update_data_override=None):
nonlocal doc
e = DocumentService.update_by_id(doc.id, {"parser_id": req.parser_id, "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
update_data = update_data_override or {}
if request.pipeline_id is not None:
update_data["pipeline_id"] = request.pipeline_id
if request.parser_id is not None:
update_data["parser_id"] = request.parser_id
update_data.update({
"progress": 0,
"progress_msg": "",
"run": TaskStatus.UNSTART.value
})
e = DocumentService.update_by_id(doc.id, update_data)
if not e:
return get_data_error_result(message="Document not found!")
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(req.doc_id)
tenant_id = DocumentService.get_tenant_id(request.doc_id)
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
try:
if req.pipeline_id:
if doc.pipeline_id == req.pipeline_id:
if request.pipeline_id is not None and request.pipeline_id != "":
if doc.pipeline_id == request.pipeline_id:
return get_json_result(data=True)
DocumentService.update_by_id(doc.id, {"pipeline_id": req.pipeline_id})
reset_doc()
reset_doc({"pipeline_id": request.pipeline_id})
return get_json_result(data=True)
if doc.parser_id.lower() == req.parser_id.lower():
if req.parser_config:
if req.parser_config == doc.parser_config:
if request.parser_id is None:
return get_json_result(data=False, message="缺少 parser_id 或 pipeline_id", code=settings.RetCode.ARGUMENT_ERROR)
if doc.parser_id.lower() == request.parser_id.lower():
if request.parser_config is not None:
if request.parser_config == doc.parser_config:
return get_json_result(data=True)
else:
return get_json_result(data=True)
if (doc.type == FileType.VISUAL and req.parser_id != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req.parser_id != "presentation"):
if (doc.type == FileType.VISUAL and request.parser_id != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and request.parser_id != "presentation"):
return get_data_error_result(message="Not supported yet!")
if req.parser_config:
DocumentService.update_parser_config(doc.id, req.parser_config)
if request.parser_config is not None:
DocumentService.update_parser_config(doc.id, request.parser_config)
reset_doc()
return get_json_result(data=True)
except Exception as e:
@@ -749,16 +665,14 @@ async def change_parser(
@router.get("/image/{image_id}")
async def get_image(image_id: str):
    """Return the stored image addressed by ``image_id``.

    ``image_id`` must consist of exactly two parts joined by a single ``-``;
    the two parts are handed to ``STORAGE_IMPL.get`` in that order.

    Args:
        image_id: Identifier of the form ``<first>-<second>``.

    Returns:
        A ``Response`` with media type ``image/JPEG`` carrying the stored
        bytes, a data-error result when the identifier is malformed or
        nothing is stored, or a server-error response if storage access raises.
    """
    try:
        parts = image_id.split("-")
        if len(parts) != 2:
            return get_data_error_result(message="Image not found.")

        # Reuse the already split parts instead of splitting a second time.
        bkt, nm = parts
        content = STORAGE_IMPL.get(bkt, nm)

        # An empty/None payload cannot be a valid image; report it as missing
        # rather than answering 200 with an empty body.
        if not content:
            return get_data_error_result(message="Image not found.")

        return Response(content=content, media_type="image/JPEG")
    except Exception as e:
        return server_error_response(e)
@@ -769,28 +683,28 @@ async def upload_and_parse(
files: List[UploadFile] = File(...),
current_user = Depends(get_current_user)
):
"""上传并解析"""
if not files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
# Use UploadFile directly
file_objs = files
for file_obj in file_objs:
if file_obj.filename == "":
for file_obj in files:
if not file_obj.filename or file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)
doc_ids = await doc_upload_and_parse(conversation_id, file_objs, current_user.id)
doc_ids = doc_upload_and_parse(conversation_id, files, current_user.id)
return get_json_result(data=doc_ids)
@router.post("/parse")
async def parse(
req: ParseRequest = None,
files: List[UploadFile] = File(None),
request: Optional[ParseRequest] = None,
files: Optional[List[UploadFile]] = File(None),
current_user = Depends(get_current_user)
):
url = req.url if req else ""
"""解析文档"""
url = request.url if request else None
if url:
if not is_valid_url(url):
return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
@@ -828,46 +742,37 @@ async def parse(
if not r or not r.group(1):
return get_json_result(data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
f = File(r.group(1), os.path.join(download_path, r.group(1)))
txt = await FileService.parse_docs([f], current_user.id)
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)
if not files:
return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)
# Use UploadFile directly
file_objs = files
txt = await FileService.parse_docs(file_objs, current_user.id)
txt = FileService.parse_docs(files, current_user.id)
return get_json_result(data=txt)
@router.post("/set_meta")
async def set_meta(
    request: SetMetaRequest,
    current_user = Depends(get_current_user)
):
    """Replace the ``meta_fields`` of a document with the supplied metadata.

    ``request.meta`` is decoded with ``json.loads`` and must yield a dictionary
    whose values are strings, integers or floats. Validation failures are
    reported as argument errors before the database is touched.

    Args:
        request: Body carrying ``doc_id`` and the JSON-encoded ``meta``.
        current_user: Authenticated user provided by ``get_current_user``.

    Returns:
        ``get_json_result(data=True)`` on success, otherwise an
        authentication, argument, data or server error result.
    """
    if not DocumentService.accessible(request.doc_id, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    # Decode outside the database try-block so malformed input is reported as
    # the caller's mistake instead of a server error.
    try:
        meta = json.loads(request.meta)
    except (TypeError, ValueError) as e:
        return get_json_result(data=False, message=f"Json syntax error: {e}", code=settings.RetCode.ARGUMENT_ERROR)

    if not isinstance(meta, dict):
        return get_json_result(data=False, message="Only dictionary type supported.", code=settings.RetCode.ARGUMENT_ERROR)

    for v in meta.values():
        if not isinstance(v, (str, int, float)):
            return get_json_result(data=False, message=f"The type is not supported: {v}", code=settings.RetCode.ARGUMENT_ERROR)

    try:
        found, _ = DocumentService.get_by_id(request.doc_id)
        if not found:
            return get_data_error_result(message="Document not found!")

        if not DocumentService.update_by_id(request.doc_id, {"meta_fields": meta}):
            return get_data_error_result(message="Database error (meta updates)!")

        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)