diff --git a/Dockerfile b/Dockerfile index 5961286..9904837 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,15 +13,17 @@ USER root WORKDIR /ragflow -# install dependencies from uv.lock file -COPY pyproject.toml uv.lock ./ - +# install dependencies from pyproject.toml +COPY pyproject.toml ./ # https://github.com/astral-sh/uv/issues/10462 # uv records index url into uv.lock but doesn't failover among multiple indexes +# Generate uv.lock from pyproject.toml and install dependencies with cache RUN --mount=type=cache,id=ragflow_uv,target=/root/.cache/uv,sharing=locked \ if [ "$NEED_MIRROR" == "1" ]; then \ + uv lock --index-url https://mirrors.aliyun.com/pypi/simple; \ sed -i 's|pypi.org|mirrors.aliyun.com/pypi|g' uv.lock; \ else \ + uv lock; \ sed -i 's|mirrors.aliyun.com/pypi|pypi.org|g' uv.lock; \ fi; \ if [ "$LIGHTEN" == "1" ]; then \ @@ -68,13 +70,13 @@ COPY rag rag COPY agent agent COPY graphrag graphrag COPY agentic_reasoning agentic_reasoning -COPY pyproject.toml uv.lock ./ +COPY pyproject.toml ./ COPY mcp mcp COPY plugin plugin COPY docker/service_conf.yaml.template ./conf/service_conf.yaml.template -COPY docker/entrypoint.sh ./ -RUN chmod +x ./entrypoint*.sh +COPY --chmod=+x docker/entrypoint.sh ./entrypoint.sh + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/api/apps/__init___fastapi.py b/api/apps/__init___fastapi.py index f60062e..afc01c7 100644 --- a/api/apps/__init___fastapi.py +++ b/api/apps/__init___fastapi.py @@ -162,6 +162,9 @@ def setup_routes(app: FastAPI): from api.apps.chunk_app import router as chunk_router from api.apps.mcp_server_app import router as mcp_router from api.apps.canvas_app import router as canvas_router + from api.apps.tenant_app import router as tenant_router + from api.apps.dialog_app import router as dialog_router + from api.apps.system_app import router as system_router app.include_router(user_router, prefix=f"/{API_VERSION}/user", tags=["User"]) app.include_router(kb_router, 
prefix=f"/{API_VERSION}/kb", tags=["KnowledgeBase"]) @@ -170,6 +173,9 @@ def setup_routes(app: FastAPI): app.include_router(chunk_router, prefix=f"/{API_VERSION}/chunk", tags=["Chunk"]) app.include_router(mcp_router, prefix=f"/{API_VERSION}/mcp", tags=["MCP"]) app.include_router(canvas_router, prefix=f"/{API_VERSION}/canvas", tags=["Canvas"]) + app.include_router(tenant_router, prefix=f"/{API_VERSION}/tenant", tags=["Tenant"]) + app.include_router(dialog_router, prefix=f"/{API_VERSION}/dialog", tags=["Dialog"]) + app.include_router(system_router, prefix=f"/{API_VERSION}/system", tags=["System"]) diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py index e7f1e06..c1c9b0b 100644 --- a/api/apps/dialog_app.py +++ b/api/apps/dialog_app.py @@ -14,8 +14,17 @@ # limitations under the License. # -from flask import request -from flask_login import login_required, current_user +from typing import Optional + +from fastapi import APIRouter, Depends, Query + +from api.apps.models.auth_dependencies import get_current_user +from api.apps.models.dialog_models import ( + SetDialogRequest, + ListDialogsNextQuery, + ListDialogsNextBody, + DeleteDialogRequest, +) from api.db.services import duplicate_name from api.db.services.dialog_service import DialogService from api.db import StatusEnum @@ -23,16 +32,21 @@ from api.db.services.tenant_llm_service import TenantLLMService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.user_service import TenantService, UserTenantService from api import settings -from api.utils.api_utils import server_error_response, get_data_error_result, validate_request +from api.utils.api_utils import server_error_response, get_data_error_result from api.utils import get_uuid from api.utils.api_utils import get_json_result +# 创建路由器 +router = APIRouter() -@manager.route('/set', methods=['POST']) # noqa: F821 -@validate_request("prompt_config") -@login_required -def set_dialog(): - req = request.json + 
+@router.post('/set') +async def set_dialog( + request: SetDialogRequest, + current_user = Depends(get_current_user) +): + """设置/创建对话框""" + req = request.model_dump(exclude_unset=True) dialog_id = req.get("dialog_id", "") is_create = not dialog_id name = req.get("name", "New Dialog") @@ -124,10 +138,12 @@ def set_dialog(): return server_error_response(e) -@manager.route('/get', methods=['GET']) # noqa: F821 -@login_required -def get(): - dialog_id = request.args["dialog_id"] +@router.get('/get') +async def get( + dialog_id: str = Query(..., description="对话框ID"), + current_user = Depends(get_current_user) +): + """获取对话框详情""" try: e, dia = DialogService.get_by_id(dialog_id) if not e: @@ -150,9 +166,11 @@ def get_kb_names(kb_ids): return ids, nms -@manager.route('/list', methods=['GET']) # noqa: F821 -@login_required -def list_dialogs(): +@router.get('/list') +async def list_dialogs( + current_user = Depends(get_current_user) +): + """列出对话框""" try: diags = DialogService.query( tenant_id=current_user.id, @@ -167,21 +185,24 @@ def list_dialogs(): return server_error_response(e) -@manager.route('/next', methods=['POST']) # noqa: F821 -@login_required -def list_dialogs_next(): - keywords = request.args.get("keywords", "") - page_number = int(request.args.get("page", 0)) - items_per_page = int(request.args.get("page_size", 0)) - parser_id = request.args.get("parser_id") - orderby = request.args.get("orderby", "create_time") - if request.args.get("desc", "true").lower() == "false": - desc = False - else: - desc = True +@router.post('/next') +async def list_dialogs_next( + query: ListDialogsNextQuery = Depends(), + body: Optional[ListDialogsNextBody] = None, + current_user = Depends(get_current_user) +): + """列出对话框(分页)""" + if body is None: + body = ListDialogsNextBody() + + keywords = query.keywords or "" + page_number = int(query.page or 0) + items_per_page = int(query.page_size or 0) + parser_id = query.parser_id + orderby = query.orderby or "create_time" + desc = 
query.desc.lower() == "true" if query.desc else True - req = request.get_json() - owner_ids = req.get("owner_ids", []) + owner_ids = body.owner_ids or [] try: if not owner_ids: # tenants = TenantService.get_joined_tenants_by_user_id(current_user.id) @@ -204,15 +225,16 @@ def list_dialogs_next(): return server_error_response(e) -@manager.route('/rm', methods=['POST']) # noqa: F821 -@login_required -@validate_request("dialog_ids") -def rm(): - req = request.json - dialog_list=[] +@router.post('/rm') +async def rm( + request: DeleteDialogRequest, + current_user = Depends(get_current_user) +): + """删除对话框""" + dialog_list = [] tenants = UserTenantService.query(user_id=current_user.id) try: - for id in req["dialog_ids"]: + for id in request.dialog_ids: for tenant in tenants: if DialogService.query(tenant_id=tenant.tenant_id, id=id): break @@ -220,7 +242,7 @@ def rm(): return get_json_result( data=False, message='Only owner of dialog authorized for this operation.', code=settings.RetCode.OPERATING_ERROR) - dialog_list.append({"id": id,"status":StatusEnum.INVALID.value}) + dialog_list.append({"id": id, "status": StatusEnum.INVALID.value}) DialogService.update_many_by_id(dialog_list) return get_json_result(data=True) except Exception as e: diff --git a/api/apps/models/auth_dependencies.py b/api/apps/models/auth_dependencies.py index 67931a6..83436db 100644 --- a/api/apps/models/auth_dependencies.py +++ b/api/apps/models/auth_dependencies.py @@ -24,25 +24,43 @@ from api.utils.api_utils import get_json_result http_bearer = HTTPBearer(auto_error=False) -def get_current_user(credentials: Optional[HTTPAuthorizationCredentials] = Security(http_bearer)): +def get_current_user( + authorization: Optional[str] = Header(None, alias="Authorization"), + credentials: Optional[HTTPAuthorizationCredentials] = Security(http_bearer) +): """FastAPI 依赖注入:获取当前用户(替代 Flask 的 login_required 和 current_user) + 支持两种格式的 Authorization 头: + 1. 标准格式:Bearer + 2. 
简化格式:(不带 Bearer 前缀) + 使用 Security(http_bearer) 可以让 FastAPI 自动在 OpenAPI schema 中添加安全要求, 这样 Swagger UI 就会显示授权输入框并自动在请求中添加 Authorization 头。 """ # 延迟导入以避免循环导入 from api.apps.__init___fastapi import get_current_user_from_token - if not credentials: + token = None + + # 优先从 HTTPBearer 获取(标准格式:Bearer ) + if credentials: + token = credentials.credentials + # 如果 HTTPBearer 没有获取到,尝试直接从 Header 获取(可能是简化格式) + elif authorization: + # 如果包含 "Bearer " 前缀,则去除它 + if authorization.startswith("Bearer "): + token = authorization[7:] # 去除 "Bearer " 前缀(7个字符) + else: + # 不带 Bearer 前缀,直接使用 + token = authorization + + if not token: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Authorization header is required" ) - # HTTPBearer 已经提取了 Bearer token,credentials.credentials 就是 token 本身 - authorization = credentials.credentials - - user = get_current_user_from_token(authorization) + user = get_current_user_from_token(token) if not user: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, diff --git a/api/apps/models/dialog_models.py b/api/apps/models/dialog_models.py new file mode 100644 index 0000000..d778bee --- /dev/null +++ b/api/apps/models/dialog_models.py @@ -0,0 +1,57 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional, List, Dict, Any +from pydantic import BaseModel, Field + + +class SetDialogRequest(BaseModel): + """设置/创建对话框请求""" + dialog_id: Optional[str] = Field(default="", description="对话框ID,为空时创建新对话框") + name: Optional[str] = Field(default="New Dialog", description="对话框名称") + description: Optional[str] = Field(default="A helpful dialog", description="对话框描述") + icon: Optional[str] = Field(default="", description="图标") + top_n: Optional[int] = Field(default=6, description="Top N") + top_k: Optional[int] = Field(default=1024, description="Top K") + rerank_id: Optional[str] = Field(default="", description="重排序模型ID") + similarity_threshold: Optional[float] = Field(default=0.1, description="相似度阈值") + vector_similarity_weight: Optional[float] = Field(default=0.3, description="向量相似度权重") + llm_setting: Optional[Dict[str, Any]] = Field(default={}, description="LLM设置") + meta_data_filter: Optional[Dict[str, Any]] = Field(default={}, description="元数据过滤器") + prompt_config: Dict[str, Any] = Field(..., description="提示配置") + kb_ids: Optional[List[str]] = Field(default=[], description="知识库ID列表") + llm_id: Optional[str] = Field(default=None, description="LLM ID") + + +class ListDialogsNextQuery(BaseModel): + """列出对话框查询参数""" + keywords: Optional[str] = "" + page: Optional[int] = 0 + page_size: Optional[int] = 0 + parser_id: Optional[str] = None + orderby: Optional[str] = "create_time" + desc: Optional[str] = "true" + + +class ListDialogsNextBody(BaseModel): + """列出对话框请求体""" + owner_ids: Optional[List[str]] = [] + + +class DeleteDialogRequest(BaseModel): + """删除对话框请求""" + dialog_ids: List[str] = Field(..., description="要删除的对话框ID列表") + diff --git a/api/apps/models/document_models.py b/api/apps/models/document_models.py index a9fc161..c5ecd4a 100644 --- a/api/apps/models/document_models.py +++ b/api/apps/models/document_models.py @@ -138,11 +138,11 @@ class GetDocumentInfosRequest(BaseModel): class ChangeStatusRequest(BaseModel): """修改文档状态请求""" doc_ids: 
List[str] - status: str # "0" 或 "1" + status: int @model_validator(mode='after') def validate_status(self): - if self.status not in ["0", "1"]: + if self.status not in [0, 1]: raise ValueError('Status must be either 0 or 1!') return self @@ -155,7 +155,7 @@ class DeleteDocumentRequest(BaseModel): class RunDocumentRequest(BaseModel): """运行文档解析请求""" doc_ids: List[str] - run: str # TaskStatus 值 + run: int # TaskStatus 值 delete: Optional[bool] = False diff --git a/api/apps/models/tenant_models.py b/api/apps/models/tenant_models.py new file mode 100644 index 0000000..f7c6ec8 --- /dev/null +++ b/api/apps/models/tenant_models.py @@ -0,0 +1,23 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pydantic import BaseModel, Field, EmailStr + + +class InviteUserRequest(BaseModel): + """邀请用户请求""" + email: EmailStr = Field(..., description="要邀请的用户邮箱") + diff --git a/api/apps/system_app.py b/api/apps/system_app.py index 4302813..7cd689d 100644 --- a/api/apps/system_app.py +++ b/api/apps/system_app.py @@ -17,7 +17,8 @@ import logging from datetime import datetime import json -from flask_login import login_required, current_user +from fastapi import APIRouter, Depends +from fastapi.responses import JSONResponse from api.db.db_models import APIToken from api.db.services.api_service import APITokenService @@ -36,67 +37,26 @@ from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE from timeit import default_timer as timer from rag.utils.redis_conn import REDIS_CONN -from flask import jsonify from api.utils.health_utils import run_health_checks +from api.apps.models.auth_dependencies import get_current_user + +# 创建路由器 +router = APIRouter() -@manager.route("/version", methods=["GET"]) # noqa: F821 -@login_required -def version(): - """ - Get the current version of the application. - --- - tags: - - System - security: - - ApiKeyAuth: [] - responses: - 200: - description: Version retrieved successfully. - schema: - type: object - properties: - version: - type: string - description: Version number. - """ +@router.get("/version") +async def version( + current_user = Depends(get_current_user) +): + """获取应用程序当前版本""" return get_json_result(data=get_ragflow_version()) -@manager.route("/status", methods=["GET"]) # noqa: F821 -@login_required -def status(): - """ - Get the system status. - --- - tags: - - System - security: - - ApiKeyAuth: [] - responses: - 200: - description: System is operational. - schema: - type: object - properties: - es: - type: object - description: Elasticsearch status. - storage: - type: object - description: Storage status. - database: - type: object - description: Database status. - 503: - description: Service unavailable. 
- schema: - type: object - properties: - error: - type: string - description: Error message. - """ +@router.get("/status") +async def status( + current_user = Depends(get_current_user) +): + """获取系统状态""" res = {} st = timer() try: @@ -172,43 +132,24 @@ def status(): return get_json_result(data=res) -@manager.route("/healthz", methods=["GET"]) # noqa: F821 -def healthz(): +@router.get("/healthz") +async def healthz(): + """健康检查""" result, all_ok = run_health_checks() - return jsonify(result), (200 if all_ok else 500) + return JSONResponse(content=result, status_code=200 if all_ok else 500) -@manager.route("/ping", methods=["GET"]) # noqa: F821 -def ping(): - return "pong", 200 +@router.get("/ping") +async def ping(): + """心跳检测""" + return "pong" -@manager.route("/new_token", methods=["POST"]) # noqa: F821 -@login_required -def new_token(): - """ - Generate a new API token. - --- - tags: - - API Tokens - security: - - ApiKeyAuth: [] - parameters: - - in: query - name: name - type: string - required: false - description: Name of the token. - responses: - 200: - description: Token generated successfully. - schema: - type: object - properties: - token: - type: string - description: The generated API token. - """ +@router.post("/new_token") +async def new_token( + current_user = Depends(get_current_user) +): + """生成新的 API 令牌""" try: tenants = UserTenantService.query(user_id=current_user.id) if not tenants: @@ -233,37 +174,11 @@ def new_token(): return server_error_response(e) -@manager.route("/token_list", methods=["GET"]) # noqa: F821 -@login_required -def token_list(): - """ - List all API tokens for the current user. - --- - tags: - - API Tokens - security: - - ApiKeyAuth: [] - responses: - 200: - description: List of API tokens. - schema: - type: object - properties: - tokens: - type: array - items: - type: object - properties: - token: - type: string - description: The API token. - name: - type: string - description: Name of the token. 
- create_time: - type: string - description: Token creation time. - """ +@router.get("/token_list") +async def token_list( + current_user = Depends(get_current_user) +): + """列出当前用户的所有 API 令牌""" try: tenants = UserTenantService.query(user_id=current_user.id) if not tenants: @@ -282,55 +197,21 @@ def token_list(): return server_error_response(e) -@manager.route("/token/", methods=["DELETE"]) # noqa: F821 -@login_required -def rm(token): - """ - Remove an API token. - --- - tags: - - API Tokens - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: token - type: string - required: true - description: The API token to remove. - responses: - 200: - description: Token removed successfully. - schema: - type: object - properties: - success: - type: boolean - description: Deletion status. - """ +@router.delete("/token/{token}") +async def rm( + token: str, + current_user = Depends(get_current_user) +): + """删除 API 令牌""" APITokenService.filter_delete( [APIToken.tenant_id == current_user.id, APIToken.token == token] ) return get_json_result(data=True) -@manager.route('/config', methods=['GET']) # noqa: F821 -def get_config(): - """ - Get system configuration. - --- - tags: - - System - responses: - 200: - description: Return system configuration - schema: - type: object - properties: - registerEnable: - type: integer 0 means disabled, 1 means enabled - description: Whether user registration is enabled - """ +@router.get('/config') +async def get_config(): + """获取系统配置""" return get_json_result(data={ "registerEnabled": settings.REGISTER_ENABLED }) diff --git a/api/apps/tenant_app.py b/api/apps/tenant_app.py index 1066849..cc65326 100644 --- a/api/apps/tenant_app.py +++ b/api/apps/tenant_app.py @@ -14,9 +14,10 @@ # limitations under the License. 
# -from flask import request -from flask_login import login_required, current_user +from fastapi import APIRouter, Depends, Path +from api.apps.models.auth_dependencies import get_current_user +from api.apps.models.tenant_models import InviteUserRequest from api import settings from api.apps import smtp_mail_server from api.db import UserTenantRole, StatusEnum @@ -24,13 +25,19 @@ from api.db.db_models import UserTenant from api.db.services.user_service import UserTenantService, UserService from api.utils import get_uuid, delta_seconds -from api.utils.api_utils import get_json_result, validate_request, server_error_response, get_data_error_result +from api.utils.api_utils import get_json_result, server_error_response, get_data_error_result from api.utils.web_utils import send_invite_email +# 创建路由器 +router = APIRouter() -@manager.route("//user/list", methods=["GET"]) # noqa: F821 -@login_required -def user_list(tenant_id): + +@router.get("/{tenant_id}/user/list") +async def user_list( + tenant_id: str = Path(..., description="租户ID"), + current_user = Depends(get_current_user) +): + """获取租户用户列表""" if current_user.id != tenant_id: return get_json_result( data=False, @@ -46,18 +53,20 @@ def user_list(tenant_id): return server_error_response(e) -@manager.route('//user', methods=['POST']) # noqa: F821 -@login_required -@validate_request("email") -def create(tenant_id): +@router.post('/{tenant_id}/user') +async def create( + tenant_id: str, + request: InviteUserRequest, + current_user = Depends(get_current_user) +): + """邀请用户加入租户""" if current_user.id != tenant_id: return get_json_result( data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR) - req = request.json - invite_user_email = req["email"] + invite_user_email = request.email invite_users = UserService.query(email=invite_user_email) if not invite_users: return get_data_error_result(message="User not found.") @@ -101,9 +110,13 @@ def create(tenant_id): return get_json_result(data=usr) 
-@manager.route('//user/', methods=['DELETE']) # noqa: F821 -@login_required -def rm(tenant_id, user_id): +@router.delete('/{tenant_id}/user/{user_id}') +async def rm( + tenant_id: str = Path(..., description="租户ID"), + user_id: str = Path(..., description="用户ID"), + current_user = Depends(get_current_user) +): + """从租户中删除用户""" if current_user.id != tenant_id and current_user.id != user_id: return get_json_result( data=False, @@ -117,9 +130,11 @@ def rm(tenant_id, user_id): return server_error_response(e) -@manager.route("/list", methods=["GET"]) # noqa: F821 -@login_required -def tenant_list(): +@router.get("/list") +async def tenant_list( + current_user = Depends(get_current_user) +): + """获取租户列表""" try: users = UserTenantService.get_tenants_by_user_id(current_user.id) for u in users: @@ -129,9 +144,12 @@ def tenant_list(): return server_error_response(e) -@manager.route("/agree/", methods=["PUT"]) # noqa: F821 -@login_required -def agree(tenant_id): +@router.put("/agree/{tenant_id}") +async def agree( + tenant_id: str = Path(..., description="租户ID"), + current_user = Depends(get_current_user) +): + """同意加入租户邀请""" try: UserTenantService.filter_update([UserTenant.tenant_id == tenant_id, UserTenant.user_id == current_user.id], {"role": UserTenantRole.NORMAL}) diff --git a/deepdoc/README.md b/deepdoc/README.md new file mode 100644 index 0000000..14c7947 --- /dev/null +++ b/deepdoc/README.md @@ -0,0 +1,122 @@ +English | [简体中文](./README_zh.md) + +# *Deep*Doc + +- [1. Introduction](#1) +- [2. Vision](#2) +- [3. Parser](#3) + + +## 1. Introduction + +With a bunch of documents from various domains with various formats and along with diverse retrieval requirements, +an accurate analysis becomes a very challenge task. *Deep*Doc is born for that purpose. +There are 2 parts in *Deep*Doc so far: vision and parser. +You can run the flowing test programs if you're interested in our results of OCR, layout recognition and TSR. 
+```bash +python deepdoc/vision/t_ocr.py -h +usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] + +options: + -h, --help show this help message and exit + --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF + --output_dir OUTPUT_DIR + Directory where to store the output images. Default: './ocr_outputs' +``` +```bash +python deepdoc/vision/t_recognizer.py -h +usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}] + +options: + -h, --help show this help message and exit + --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF + --output_dir OUTPUT_DIR + Directory where to store the output images. Default: './layouts_outputs' + --threshold THRESHOLD + A threshold to filter out detections. Default: 0.5 + --mode {layout,tsr} Task mode: layout recognition or table structure recognition +``` + +Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!! +```bash +export HF_ENDPOINT=https://hf-mirror.com +``` + + +## 2. Vision + +We use vision information to resolve problems as human being. + - OCR. Since a lot of documents presented as images or at least be able to transform to image, + OCR is a very essential and fundamental or even universal solution for text extraction. + ```bash + python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result + ``` + The inputs could be directory to images or PDF, or a image or PDF. + You can look into the folder 'path_to_store_result' where has images which demonstrate the positions of results, + txt files which contain the OCR text. +
+ +
+ + - Layout recognition. Documents from different domain may have various layouts, + like, newspaper, magazine, book and résumé are distinct in terms of layout. + Only when machine have an accurate layout analysis, it can decide if these text parts are successive or not, + or this part needs Table Structure Recognition(TSR) to process, or this part is a figure and described with this caption. + We have 10 basic layout components which covers most cases: + - Text + - Title + - Figure + - Figure caption + - Table + - Table caption + - Header + - Footer + - Reference + - Equation + + Have a try on the following command to see the layout detection results. + ```bash + python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result + ``` + The inputs could be directory to images or PDF, or a image or PDF. + You can look into the folder 'path_to_store_result' where has images which demonstrate the detection results as following: +
+ +
+ + - Table Structure Recognition(TSR). Data table is a frequently used structure to present data including numbers or text. + And the structure of a table might be very complex, like hierarchy headers, spanning cells and projected row headers. + Along with TSR, we also reassemble the content into sentences which could be well comprehended by LLM. + We have five labels for TSR task: + - Column + - Row + - Column header + - Projected row header + - Spanning cell + + Have a try on the following command to see the layout detection results. + ```bash + python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result + ``` + The inputs could be directory to images or PDF, or a image or PDF. + You can look into the folder 'path_to_store_result' where has both images and html pages which demonstrate the detection results as following: +
+ +
+ + +## 3. Parser + +Four kinds of document formats as PDF, DOCX, EXCEL and PPT have their corresponding parser. +The most complex one is PDF parser since PDF's flexibility. The output of PDF parser includes: + - Text chunks with their own positions in PDF(page number and rectangular positions). + - Tables with cropped image from the PDF, and contents which has already translated into natural language sentences. + - Figures with caption and text in the figures. + +### Résumé + +The résumé is a very complicated kind of document. A résumé which is composed of unstructured text +with various layouts could be resolved into structured data composed of nearly a hundred of fields. +We haven't opened the parser yet, as we open the processing method after parsing procedure. + + \ No newline at end of file diff --git a/deepdoc/README_zh.md b/deepdoc/README_zh.md new file mode 100644 index 0000000..4ada7ed --- /dev/null +++ b/deepdoc/README_zh.md @@ -0,0 +1,116 @@ +[English](./README.md) | 简体中文 + +# *Deep*Doc + +- [*Deep*Doc](#deepdoc) + - [1. 介绍](#1-介绍) + - [2. 视觉处理](#2-视觉处理) + - [3. 解析器](#3-解析器) + - [简历](#简历) + + +## 1. 介绍 + +对于来自不同领域、具有不同格式和不同检索要求的大量文档,准确的分析成为一项极具挑战性的任务。*Deep*Doc 就是为了这个目的而诞生的。到目前为止,*Deep*Doc 中有两个组成部分:视觉处理和解析器。如果您对我们的OCR、布局识别和TSR结果感兴趣,您可以运行下面的测试程序。 + +```bash +python deepdoc/vision/t_ocr.py -h +usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] + +options: + -h, --help show this help message and exit + --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF + --output_dir OUTPUT_DIR + Directory where to store the output images. 
Default: './ocr_outputs' +``` + +```bash +python deepdoc/vision/t_recognizer.py -h +usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}] + +options: + -h, --help show this help message and exit + --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF + --output_dir OUTPUT_DIR + Directory where to store the output images. Default: './layouts_outputs' + --threshold THRESHOLD + A threshold to filter out detections. Default: 0.5 + --mode {layout,tsr} Task mode: layout recognition or table structure recognition +``` + +HuggingFace为我们的模型提供服务。如果你在下载HuggingFace模型时遇到问题,这可能会有所帮助!! + +```bash +export HF_ENDPOINT=https://hf-mirror.com +``` + + +## 2. 视觉处理 + +作为人类,我们使用视觉信息来解决问题。 + + - **OCR(Optical Character Recognition,光学字符识别)**。由于许多文档都是以图像形式呈现的,或者至少能够转换为图像,因此OCR是文本提取的一个非常重要、基本,甚至通用的解决方案。 + + ```bash + python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result + ``` + + 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有演示结果位置的图像,以及包含OCR文本的txt文件。 + +
+ +
+ + - 布局识别(Layout recognition)。来自不同领域的文件可能有不同的布局,如报纸、杂志、书籍和简历在布局方面是不同的。只有当机器有准确的布局分析时,它才能决定这些文本部分是连续的还是不连续的,或者这个部分需要表结构识别(Table Structure Recognition,TSR)来处理,或者这个部件是一个图形并用这个标题来描述。我们有10个基本布局组件,涵盖了大多数情况: + - 文本 + - 标题 + - 配图 + - 配图标题 + - 表格 + - 表格标题 + - 页头 + - 页尾 + - 参考引用 + - 公式 + + 请尝试以下命令以查看布局检测结果。 + + ```bash + python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result + ``` + + 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有显示检测结果的图像,如下所示: +
+ +
+ + - **TSR(Table Structure Recognition,表结构识别)**。数据表是一种常用的结构,用于表示包括数字或文本在内的数据。表的结构可能非常复杂,比如层次结构标题、跨单元格和投影行标题。除了TSR,我们还将内容重新组合成LLM可以很好理解的句子。TSR任务有五个标签: + - 列 + - 行 + - 列标题 + - 行标题 + - 合并单元格 + + 请尝试以下命令以查看布局检测结果。 + + ```bash + python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result + ``` + + 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中包含图像和html页面,这些页面展示了以下检测结果: + +
+ +
+ + +## 3. 解析器 + +PDF、DOCX、EXCEL和PPT四种文档格式都有相应的解析器。最复杂的是PDF解析器,因为PDF具有灵活性。PDF解析器的输出包括: + - 在PDF中有自己位置的文本块(页码和矩形位置)。 + - 带有PDF裁剪图像的表格,以及已经翻译成自然语言句子的内容。 + - 图中带标题和文字的图。 + +### 简历 + +简历是一种非常复杂的文档。由各种格式的非结构化文本构成的简历可以被解析为包含近百个字段的结构化数据。我们还没有启用解析器,因为在解析过程之后才会启动处理方法。 diff --git a/deepdoc/__init__.py b/deepdoc/__init__.py new file mode 100644 index 0000000..643f797 --- /dev/null +++ b/deepdoc/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from beartype.claw import beartype_this_package +beartype_this_package() diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py new file mode 100644 index 0000000..809a56e --- /dev/null +++ b/deepdoc/parser/__init__.py @@ -0,0 +1,40 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from .docx_parser import RAGFlowDocxParser as DocxParser +from .excel_parser import RAGFlowExcelParser as ExcelParser +from .html_parser import RAGFlowHtmlParser as HtmlParser +from .json_parser import RAGFlowJsonParser as JsonParser +from .markdown_parser import MarkdownElementExtractor +from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser +from .pdf_parser import PlainParser +from .pdf_parser import RAGFlowPdfParser as PdfParser +from .ppt_parser import RAGFlowPptParser as PptParser +from .txt_parser import RAGFlowTxtParser as TxtParser + +__all__ = [ + "PdfParser", + "PlainParser", + "DocxParser", + "ExcelParser", + "PptParser", + "HtmlParser", + "JsonParser", + "MarkdownParser", + "TxtParser", + "MarkdownElementExtractor", +] + diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py new file mode 100644 index 0000000..2a65841 --- /dev/null +++ b/deepdoc/parser/docx_parser.py @@ -0,0 +1,139 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from docx import Document
import re
import pandas as pd
from collections import Counter
from rag.nlp import rag_tokenizer
from io import BytesIO


class RAGFlowDocxParser:
    """Parse a .docx file into paragraph sections plus linearized tables.

    Calling an instance returns ``(secs, tbls)``:
      - ``secs``: list of ``(paragraph_text, style_name)`` tuples for
        paragraphs inside the requested page range.
      - ``tbls``: one list of row strings per table, each row rendered as
        ``header: value`` pairs joined by ``;``.
    """

    def __extract_table_content(self, tb):
        # Flatten a python-docx table into a DataFrame of cell texts,
        # then linearize it into sentences.
        df = []
        for row in tb.rows:
            df.append([c.text for c in row.cells])
        return self.__compose_table_content(pd.DataFrame(df))

    def __compose_table_content(self, df):
        """Turn a table DataFrame into natural-language row sentences."""

        def blockType(b):
            # Classify a cell's text:
            #   Dt=date, Nu=number, Ca=code/ID, En=English words,
            #   NE=number+unit, Sg=single char, Tx=short text,
            #   Lx=long text, Nr=person name, Ot=other.
            pattern = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                # FIX: label was "DT" — normalized to "Dt" so this date
                # pattern groups with the other date labels (consistent with
                # TableStructureRecognizer.blockType in this package).
                (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
                ("^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            for p, n in pattern:
                if re.search(p, b):
                    return n
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        if len(df) < 2:
            # A table needs at least one header row and one data row.
            return []
        # Dominant cell type over the data rows (row 0 assumed to be header).
        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
            1, len(df)) for j in range(len(df.iloc[i, :]))])
        max_type = max(max_type.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # header does not necessarily appear only in the first line
        if max_type == "Nu":
            # For numeric tables, any later row whose dominant type is not
            # numeric is treated as an extra header row.
            for r in range(1, len(df)):
                tys = Counter([blockType(str(df.iloc[r, j]))
                               for j in range(len(df.iloc[r, :]))])
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            # Negative offsets of the header rows above row i; keep only the
            # most recent contiguous run of header rows.
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1
            headers = []
            for j in range(len(df.iloc[i, :])):
                # Join the (deduplicated) header texts stacked above cell (i, j).
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)
            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))

        if colnm > 3:
            return lines
        # Narrow tables read better as a single concatenated chunk.
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse *fnm* (path or raw bytes); return (sections, tables)."""
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # current page, advanced on rendered page breaks
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            runs_within_single_paragraph = []  # runs within the page range
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    runs_within_single_paragraph.append(run.text)

                # 'lastRenderedPageBreak' in the run XML marks a page boundary.
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            # Concatenate the kept runs as the paragraph text.
            secs.append(("".join(runs_within_single_paragraph),
                         p.style.name if hasattr(p.style, 'name') else ''))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re
import sys
from io import BytesIO

import pandas as pd
from openpyxl import Workbook, load_workbook

from rag.nlp import find_codec

# copied from `/openpyxl/cell/cell.py`
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")


class RAGFlowExcelParser:
    """Parse spreadsheet files (xlsx/xls/csv) into rows, HTML chunks or markdown."""

    @staticmethod
    def _load_excel_to_workbook(file_like_object):
        """Return an openpyxl Workbook for bytes/stream input.

        Falls back to CSV parsing when the magic bytes are not Excel's, and
        to pandas (default engine, then calamine) when openpyxl fails.
        """
        if isinstance(file_like_object, bytes):
            file_like_object = BytesIO(file_like_object)

        # Read first 4 bytes to determine file type:
        # xlsx is a ZIP archive ("PK\x03\x04"), legacy xls is OLE2 (D0 CF 11 E0).
        file_like_object.seek(0)
        file_head = file_like_object.read(4)
        file_like_object.seek(0)

        if not (file_head.startswith(b"PK\x03\x04") or file_head.startswith(b"\xd0\xcf\x11\xe0")):
            logging.info("Not an Excel file, converting CSV to Excel Workbook")

            try:
                file_like_object.seek(0)
                df = pd.read_csv(file_like_object)
                return RAGFlowExcelParser._dataframe_to_workbook(df)

            except Exception as e_csv:
                raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")

        try:
            return load_workbook(file_like_object, data_only=True)
        except Exception as e:
            logging.info(f"openpyxl load error: {e}, try pandas instead")
            try:
                file_like_object.seek(0)
                try:
                    dfs = pd.read_excel(file_like_object, sheet_name=None)
                    return RAGFlowExcelParser._dataframe_to_workbook(dfs)
                except Exception as ex:
                    logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
                    file_like_object.seek(0)
                    df = pd.read_excel(file_like_object, engine="calamine")
                    return RAGFlowExcelParser._dataframe_to_workbook(df)
            except Exception as e_pandas:
                raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")

    @staticmethod
    def _clean_dataframe(df: pd.DataFrame):
        """Replace control characters openpyxl refuses to store with spaces."""
        def clean_string(s):
            if isinstance(s, str):
                return ILLEGAL_CHARACTERS_RE.sub(" ", s)
            return s

        return df.apply(lambda col: col.map(clean_string))

    @staticmethod
    def _dataframe_to_workbook(df):
        """Convert a DataFrame (or a dict of DataFrames) into a Workbook."""
        # if contains multiple sheets use _dataframes_to_workbook
        if isinstance(df, dict):
            if len(df) > 1:
                return RAGFlowExcelParser._dataframes_to_workbook(df)
            # FIX: pd.read_excel(sheet_name=None) always returns a dict, so a
            # single-sheet dict reached _clean_dataframe (which expects a
            # DataFrame) and crashed. Unwrap the lone sheet instead.
            df = next(iter(df.values()))

        df = RAGFlowExcelParser._clean_dataframe(df)
        wb = Workbook()
        ws = wb.active
        ws.title = "Data"

        for col_num, column_name in enumerate(df.columns, 1):
            ws.cell(row=1, column=col_num, value=column_name)

        for row_num, row in enumerate(df.values, 2):
            for col_num, value in enumerate(row, 1):
                ws.cell(row=row_num, column=col_num, value=value)

        return wb

    @staticmethod
    def _dataframes_to_workbook(dfs: dict):
        """Convert a dict of sheet_name -> DataFrame into a multi-sheet Workbook."""
        wb = Workbook()
        default_sheet = wb.active
        wb.remove(default_sheet)

        for sheet_name, df in dfs.items():
            df = RAGFlowExcelParser._clean_dataframe(df)
            ws = wb.create_sheet(title=sheet_name)
            for col_num, column_name in enumerate(df.columns, 1):
                ws.cell(row=1, column=col_num, value=column_name)
            for row_num, row in enumerate(df.values, 2):
                for col_num, value in enumerate(row, 1):
                    ws.cell(row=row_num, column=col_num, value=value)
        return wb

    def html(self, fnm, chunk_rows=256):
        """Render each sheet as HTML <table> chunks of at most *chunk_rows* data rows.

        FIX: the HTML tag string literals (<table>, <caption>, <tr>, <th>,
        <td>) had been stripped from this method, so it emitted tagless text;
        restored them around the original control flow.
        """
        from html import escape

        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
        tb_chunks = []

        def _fmt(v):
            if v is None:
                return ""
            return str(v).strip()

        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue

            # Header row, repeated at the top of every chunk.
            tb_rows_0 = "<tr>"
            for t in list(rows[0]):
                tb_rows_0 += f"<th>{escape(_fmt(t.value))}</th>"
            tb_rows_0 += "</tr>"

            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
                tb = ""
                tb += f"<table><caption>{sheetname}</caption>"
                tb += tb_rows_0
                for r in list(rows[1 + chunk_i * chunk_rows : min(1 + (chunk_i + 1) * chunk_rows, len(rows))]):
                    tb += "<tr>"
                    for i, c in enumerate(r):
                        if c.value is None:
                            tb += "<td></td>"
                        else:
                            tb += f"<td>{escape(_fmt(c.value))}</td>"
                    tb += "</tr>"
                tb += "</table>\n"
                tb_chunks.append(tb)

        return tb_chunks

    def markdown(self, fnm):
        """Render the first sheet (or a CSV) as a markdown table."""
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        try:
            if not isinstance(file_like_object, str):
                file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
        except Exception as e:
            logging.warning(f"Parse spreadsheet error: {e}, trying to interpret as CSV file")
            # FIX: seek(0) on a str path raised an uncaught AttributeError here;
            # only rewind real file-like objects.
            if not isinstance(file_like_object, str):
                file_like_object.seek(0)
            df = pd.read_csv(file_like_object)
        df = df.replace(r"^\s*$", "", regex=True)
        return df.to_markdown(index=False)

    def __call__(self, fnm):
        """Return each data row as 'Header: value; ...' text, tagged with sheet name."""
        file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
        wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)

        res = []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue
            ti = list(rows[0])  # header cells
            for r in list(rows[1:]):
                fields = []
                for i, c in enumerate(r):
                    if not c.value:
                        continue
                    t = str(ti[i].value) if i < len(ti) else ""
                    t += (":" if t else "") + str(c.value)
                    fields.append(t)
                line = "; ".join(fields)
                # Append the sheet name unless it is a default "Sheet..." name.
                if sheetname.lower().find("sheet") < 0:
                    line += " ——" + sheetname
                res.append(line)
        return res

    @staticmethod
    def row_number(fnm, binary):
        """Count rows in *binary* based on the extension of *fnm*.

        Returns None for extensions that are neither Excel-like nor csv/txt.
        """
        if fnm.split(".")[-1].lower().find("xls") >= 0:
            wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
            total = 0
            for sheetname in wb.sheetnames:
                ws = wb[sheetname]
                total += len(list(ws.rows))
            return total

        if fnm.split(".")[-1].lower() in ["csv", "txt"]:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
            return len(txt.split("\n"))


if __name__ == "__main__":
    psr = RAGFlowExcelParser()
    psr(sys.argv[1])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image

from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from api.utils.api_utils import timeout
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt


def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
    # Re-shape position-less figure entries into the positioned form consumed
    # by VisionFigureParser: ((image, [caption]), [(0, 0, 0, 0, 0)]) with a
    # dummy 5-tuple position. Entries whose second element is not a PIL image
    # are dropped.
    # NOTE(review): assumes each entry is (caption, image) — confirm against
    # the callers that build figures_data_without_positions.
    return [
        (
            (figure_data[1], [figure_data[0]]),
            [(0, 0, 0, 0, 0)],
        )
        for figure_data in figures_data_without_positions
        if isinstance(figure_data[1], Image.Image)
    ]

def vision_figure_parser_docx_wrapper(sections,tbls,callback=None,**kwargs):
    """Best-effort enrichment of DOCX figures with vision-LLM descriptions.

    Builds figure data from *sections*, runs VisionFigureParser, and appends
    the boosted figures to *tbls*; returns *tbls*. When no vision model is
    available for kwargs["tenant_id"] (LLMBundle raises), *tbls* is returned
    unchanged. *callback(progress, msg)* receives progress updates.
    """
    try:
        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
        callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        # No vision model configured (or the callback itself failed) — skip.
        vision_model = None
    if vision_model:
        figures_data = vision_figure_parser_figure_data_wrapper(sections)
        try:
            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            boosted_figures = docx_vision_parser(callback=callback)
            tbls.extend(boosted_figures)
        except Exception as e:
            # Best-effort: report the failure and keep the un-boosted tables.
            callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
    return tbls

def vision_figure_parser_pdf_wrapper(tbls,callback=None,**kwargs):
    """Best-effort enrichment of PDF figure items inside *tbls*.

    Figure items — ((PIL.Image, [...]), positions) — are filtered out of
    *tbls*, described by the vision model, and re-appended in boosted form;
    non-figure items are preserved. Returns *tbls* (unchanged when no vision
    model is available or on error).
    """
    try:
        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
        callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        vision_model = None
    if vision_model:
        def is_figure_item(item):
            # A figure item carries a PIL image plus a description list.
            return (
                isinstance(item[0][0], Image.Image) and
                isinstance(item[0][1], list)
            )
        figures_data = [item for item in tbls if is_figure_item(item)]
        try:
            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
            boosted_figures = docx_vision_parser(callback=callback)
            # Replace the original figure items with their boosted versions.
            tbls = [item for item in tbls if not is_figure_item(item)]
            tbls.extend(boosted_figures)
        except Exception as e:
            callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
    return tbls

# Module-wide pool shared by all VisionFigureParser instances.
shared_executor = ThreadPoolExecutor(max_workers=10)


class VisionFigureParser:
    """Describe figures with a vision LLM and re-attach the descriptions.

    figures_data items come in two shapes:
      - positioned: ((image, [description]), [(pn, x0, x1, top, bottom), ...])
      - position-less: (image, [description])
    The assertions below require all figures to pair 1:1 with descriptions
    (and with positions, when any are present).
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    def _extract_figures_info(self, figures_data):
        # Split figures_data into parallel figures/descriptions/positions lists.
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            # position
            # Positioned form: item[1] is a list whose first element is a 5-tuple.
            if len(item) == 2 and isinstance(item[0], tuple) and len(item[0]) == 2 and isinstance(item[1], list) and isinstance(item[1][0], tuple) and len(item[1][0]) == 5:
                img_desc = item[0]
                assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                # Position-less form: (image, [description]).
                assert len(item) == 2 and isinstance(item[0], Image.Image) and isinstance(item[1], list), f"Unexpected form of figure data: get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        # Zip figures/descriptions (and positions, if any were given) back
        # into the output tuple shapes described on the class.
        self.assembled = []
        self.has_positions = len(self.positions) != 0
        for i in range(len(self.figures)):
            figure = self.figures[i]
            desc = self.descriptions[i]
            pos = self.positions[i] if self.has_positions else None

            figure_desc = (figure, desc)

            if pos is not None:
                self.assembled.append((figure_desc, pos))
            else:
                self.assembled.append((figure_desc,))

        return self.assembled

    def __call__(self, **kwargs):
        """Describe every figure concurrently and prepend the LLM text to its description."""
        callback = kwargs.get("callback", lambda prog, msg: None)

        # Bound each LLM call — presumably a 30s limit with retries; see
        # api.utils.api_utils.timeout for the exact semantics.
        @timeout(30, 3)
        def process(figure_idx, figure_binary):
            description_text = picture_vision_llm_chunk(
                binary=figure_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )
            return figure_idx, description_text

        futures = []
        for idx, img_binary in enumerate(self.figures or []):
            futures.append(shared_executor.submit(process, idx, img_binary))

        for future in as_completed(futures):
            figure_num, txt = future.result()
            if txt:
                # Prepend the generated text to the joined existing captions.
                self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])

        self._assemble()

        return self.assembled
def get_encoding(file):
    """Detect the character encoding of *file* by sniffing its full contents with chardet."""
    with open(file, 'rb') as f:
        tmp = chardet.detect(f.read())
        return tmp['encoding']

# HTML elements treated as block-level boundaries when chunking parsed text.
BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption"
]
# Heading tag -> markdown heading prefix.
# FIX: "h4" was mapped to "#####" (5 hashes, duplicating h5); it must be
# "####" so heading levels stay strictly increasing from h1 to h6.
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}
Default: 0.5", + default=0.5) + parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"], + default="layout") + args = parser.parse_args() + main(args) diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py new file mode 100644 index 0000000..7f4736c --- /dev/null +++ b/deepdoc/vision/table_structure_recognizer.py @@ -0,0 +1,612 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import logging +import os +import re +from collections import Counter + +import numpy as np +from huggingface_hub import snapshot_download + +from api.utils.file_utils import get_project_base_directory +from rag.nlp import rag_tokenizer + +from .recognizer import Recognizer + + +class TableStructureRecognizer(Recognizer): + labels = [ + "table", + "table column", + "table row", + "table column header", + "table projected row header", + "table spanning cell", + ] + + def __init__(self): + try: + super().__init__(self.labels, "tsr", os.path.join(get_project_base_directory(), "rag/res/deepdoc")) + except Exception: + super().__init__( + self.labels, + "tsr", + snapshot_download( + repo_id="InfiniFlow/deepdoc", + local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), + local_dir_use_symlinks=False, + ), + ) + + def __call__(self, images, thr=0.2): + table_structure_recognizer_type = os.getenv("TABLE_STRUCTURE_RECOGNIZER_TYPE", "onnx").lower() + if table_structure_recognizer_type not in ["onnx", "ascend"]: + raise RuntimeError("Unsupported table structure recognizer type.") + + if table_structure_recognizer_type == "onnx": + logging.debug("Using Onnx table structure recognizer", flush=True) + tbls = super().__call__(images, thr) + else: # ascend + logging.debug("Using Ascend table structure recognizer", flush=True) + tbls = self._run_ascend_tsr(images, thr) + + res = [] + # align left&right for rows, align top&bottom for columns + for tbl in tbls: + lts = [ + { + "label": b["type"], + "score": b["score"], + "x0": b["bbox"][0], + "x1": b["bbox"][2], + "top": b["bbox"][1], + "bottom": b["bbox"][-1], + } + for b in tbl + ] + if not lts: + continue + + left = [b["x0"] for b in lts if b["label"].find("row") > 0 or b["label"].find("header") > 0] + right = [b["x1"] for b in lts if b["label"].find("row") > 0 or b["label"].find("header") > 0] + if not left: + continue + left = np.mean(left) if len(left) > 4 else np.min(left) + right = np.mean(right) if 
len(right) > 4 else np.max(right) + for b in lts: + if b["label"].find("row") > 0 or b["label"].find("header") > 0: + if b["x0"] > left: + b["x0"] = left + if b["x1"] < right: + b["x1"] = right + + top = [b["top"] for b in lts if b["label"] == "table column"] + bottom = [b["bottom"] for b in lts if b["label"] == "table column"] + if not top: + res.append(lts) + continue + top = np.median(top) if len(top) > 4 else np.min(top) + bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom) + for b in lts: + if b["label"] == "table column": + if b["top"] > top: + b["top"] = top + if b["bottom"] < bottom: + b["bottom"] = bottom + + res.append(lts) + return res + + @staticmethod + def is_caption(bx): + patt = [r"[图表]+[ 0-9::]{2,}"] + if any([re.match(p, bx["text"].strip()) for p in patt]) or bx.get("layout_type", "").find("caption") >= 0: + return True + return False + + @staticmethod + def blockType(b): + patt = [ + ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"), + (r"^(20|19)[0-9]{2}年$", "Dt"), + (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"), + ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"), + (r"^第*[一二三四1-4]季度$", "Dt"), + (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"), + (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"), + ("^[0-9.,+%/ -]+$", "Nu"), + (r"^[0-9A-Z/\._~-]+$", "Ca"), + (r"^[A-Z]*[a-z' -]+$", "En"), + (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"), + (r"^.{1}$", "Sg"), + ] + for p, n in patt: + if re.search(p, b["text"].strip()): + return n + tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1] + if len(tks) > 3: + if len(tks) < 12: + return "Tx" + else: + return "Lx" + + if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr": + return "Nr" + + return "Ot" + + @staticmethod + def construct_table(boxes, is_english=False, html=True, **kwargs): + cap = "" + i = 0 + while i < len(boxes): + if TableStructureRecognizer.is_caption(boxes[i]): + if is_english: + cap + " " + cap += boxes[i]["text"] + boxes.pop(i) + i -= 1 + i += 1 + + if not boxes: + 
return [] + for b in boxes: + b["btype"] = TableStructureRecognizer.blockType(b) + max_type = Counter([b["btype"] for b in boxes]).items() + max_type = max(max_type, key=lambda x: x[1])[0] if max_type else "" + logging.debug("MAXTYPE: " + max_type) + + rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b] + rowh = np.min(rowh) if rowh else 0 + boxes = Recognizer.sort_R_firstly(boxes, rowh / 2) + # for b in boxes:print(b) + boxes[0]["rn"] = 0 + rows = [[boxes[0]]] + btm = boxes[0]["bottom"] + for b in boxes[1:]: + b["rn"] = len(rows) - 1 + lst_r = rows[-1] + if lst_r[-1].get("R", "") != b.get("R", "") or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")): # new row + btm = b["bottom"] + b["rn"] += 1 + rows.append([b]) + continue + btm = (btm + b["bottom"]) / 2.0 + rows[-1].append(b) + + colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b] + colwm = np.min(colwm) if colwm else 0 + crosspage = len(set([b["page_number"] for b in boxes])) > 1 + if crosspage: + boxes = Recognizer.sort_X_firstly(boxes, colwm / 2) + else: + boxes = Recognizer.sort_C_firstly(boxes, colwm / 2) + boxes[0]["cn"] = 0 + cols = [[boxes[0]]] + right = boxes[0]["x1"] + for b in boxes[1:]: + b["cn"] = len(cols) - 1 + lst_c = cols[-1] + if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1]["page_number"]) or ( + b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2") + ): # new col + right = b["x1"] + b["cn"] += 1 + cols.append([b]) + continue + right = (right + b["x1"]) / 2.0 + cols[-1].append(b) + + tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))] + for b in boxes: + tbl[b["rn"]][b["cn"]].append(b) + + if len(rows) >= 4: + # remove single in column + j = 0 + while j < len(tbl[0]): + e, ii = 0, 0 + for i in range(len(tbl)): + if tbl[i][j]: + e += 1 + ii = i + if e > 1: + break + if e > 1: + j += 1 + continue + f = (j > 0 and tbl[ii][j - 1] and tbl[ii][j - 1][0].get("text")) or j == 0 + ff = (j + 
1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii][j + 1][0].get("text")) or j + 1 >= len(tbl[ii]) + if f and ff: + j += 1 + continue + bx = tbl[ii][j][0] + logging.debug("Relocate column single: " + bx["text"]) + # j column only has one value + left, right = 100000, 100000 + if j > 0 and not f: + for i in range(len(tbl)): + if tbl[i][j - 1]: + left = min(left, np.min([bx["x0"] - a["x1"] for a in tbl[i][j - 1]])) + if j + 1 < len(tbl[0]) and not ff: + for i in range(len(tbl)): + if tbl[i][j + 1]: + right = min(right, np.min([a["x0"] - bx["x1"] for a in tbl[i][j + 1]])) + assert left < 100000 or right < 100000 + if left < right: + for jj in range(j, len(tbl[0])): + for i in range(len(tbl)): + for a in tbl[i][jj]: + a["cn"] -= 1 + if tbl[ii][j - 1]: + tbl[ii][j - 1].extend(tbl[ii][j]) + else: + tbl[ii][j - 1] = tbl[ii][j] + for i in range(len(tbl)): + tbl[i].pop(j) + + else: + for jj in range(j + 1, len(tbl[0])): + for i in range(len(tbl)): + for a in tbl[i][jj]: + a["cn"] -= 1 + if tbl[ii][j + 1]: + tbl[ii][j + 1].extend(tbl[ii][j]) + else: + tbl[ii][j + 1] = tbl[ii][j] + for i in range(len(tbl)): + tbl[i].pop(j) + cols.pop(j) + assert len(cols) == len(tbl[0]), "Column NO. 
miss matched: %d vs %d" % (len(cols), len(tbl[0])) + + if len(cols) >= 4: + # remove single in row + i = 0 + while i < len(tbl): + e, jj = 0, 0 + for j in range(len(tbl[i])): + if tbl[i][j]: + e += 1 + jj = j + if e > 1: + break + if e > 1: + i += 1 + continue + f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1][jj][0].get("text")) or i == 0 + ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1][jj][0].get("text")) or i + 1 >= len(tbl) + if f and ff: + i += 1 + continue + + bx = tbl[i][jj][0] + logging.debug("Relocate row single: " + bx["text"]) + # i row only has one value + up, down = 100000, 100000 + if i > 0 and not f: + for j in range(len(tbl[i - 1])): + if tbl[i - 1][j]: + up = min(up, np.min([bx["top"] - a["bottom"] for a in tbl[i - 1][j]])) + if i + 1 < len(tbl) and not ff: + for j in range(len(tbl[i + 1])): + if tbl[i + 1][j]: + down = min(down, np.min([a["top"] - bx["bottom"] for a in tbl[i + 1][j]])) + assert up < 100000 or down < 100000 + if up < down: + for ii in range(i, len(tbl)): + for j in range(len(tbl[ii])): + for a in tbl[ii][j]: + a["rn"] -= 1 + if tbl[i - 1][jj]: + tbl[i - 1][jj].extend(tbl[i][jj]) + else: + tbl[i - 1][jj] = tbl[i][jj] + tbl.pop(i) + + else: + for ii in range(i + 1, len(tbl)): + for j in range(len(tbl[ii])): + for a in tbl[ii][j]: + a["rn"] -= 1 + if tbl[i + 1][jj]: + tbl[i + 1][jj].extend(tbl[i][jj]) + else: + tbl[i + 1][jj] = tbl[i][jj] + tbl.pop(i) + rows.pop(i) + + # which rows are headers + hdset = set([]) + for i in range(len(tbl)): + cnt, h = 0, 0 + for j, arr in enumerate(tbl[i]): + if not arr: + continue + cnt += 1 + if max_type == "Nu" and arr[0]["btype"] == "Nu": + continue + if any([a.get("H") for a in arr]) or (max_type == "Nu" and arr[0]["btype"] != "Nu"): + h += 1 + if h / cnt > 0.5: + hdset.add(i) + + if html: + return TableStructureRecognizer.__html_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, True)) + + return TableStructureRecognizer.__desc_table(cap, hdset, 
TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, False), is_english) + + @staticmethod + def __html_table(cap, hdset, tbl): + # constrcut HTML + html = "" + if cap: + html += f"" + for i in range(len(tbl)): + row = "" + txts = [] + for j, arr in enumerate(tbl[i]): + if arr is None: + continue + if not arr: + row += "" if i not in hdset else "" + continue + txt = "" + if arr: + h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10) + txt = " ".join([c["text"] for c in Recognizer.sort_Y_firstly(arr, h)]) + txts.append(txt) + sp = "" + if arr[0].get("colspan"): + sp = "colspan={}".format(arr[0]["colspan"]) + if arr[0].get("rowspan"): + sp += " rowspan={}".format(arr[0]["rowspan"]) + if i in hdset: + row += f"" + else: + row += f"" + + if i in hdset: + if all([t in hdset for t in txts]): + continue + for t in txts: + hdset.add(t) + + if row != "": + row += "" + else: + row = "" + html += "\n" + row + html += "\n
{cap}
" + txt + "" + txt + "
" + return html + + @staticmethod + def __desc_table(cap, hdr_rowno, tbl, is_english): + # get text of every colomn in header row to become header text + clmno = len(tbl[0]) + rowno = len(tbl) + headers = {} + hdrset = set() + lst_hdr = [] + de = "的" if not is_english else " for " + for r in sorted(list(hdr_rowno)): + headers[r] = ["" for _ in range(clmno)] + for i in range(clmno): + if not tbl[r][i]: + continue + txt = " ".join([a["text"].strip() for a in tbl[r][i]]) + headers[r][i] = txt + hdrset.add(txt) + if all([not t for t in headers[r]]): + del headers[r] + hdr_rowno.remove(r) + continue + for j in range(clmno): + if headers[r][j]: + continue + if j >= len(lst_hdr): + break + headers[r][j] = lst_hdr[j] + lst_hdr = headers[r] + for i in range(rowno): + if i not in hdr_rowno: + continue + for j in range(i + 1, rowno): + if j not in hdr_rowno: + break + for k in range(clmno): + if not headers[j - 1][k]: + continue + if headers[j][k].find(headers[j - 1][k]) >= 0: + continue + if len(headers[j][k]) > len(headers[j - 1][k]): + headers[j][k] += (de if headers[j][k] else "") + headers[j - 1][k] + else: + headers[j][k] = headers[j - 1][k] + (de if headers[j - 1][k] else "") + headers[j][k] + + logging.debug(f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}") + row_txt = [] + for i in range(rowno): + if i in hdr_rowno: + continue + rtxt = [] + + def append(delimer): + nonlocal rtxt, row_txt + rtxt = delimer.join(rtxt) + if row_txt and len(row_txt[-1]) + len(rtxt) < 64: + row_txt[-1] += "\n" + rtxt + else: + row_txt.append(rtxt) + + r = 0 + if len(headers.items()): + _arr = [(i - r, r) for r, _ in headers.items() if r < i] + if _arr: + _, r = min(_arr, key=lambda x: x[0]) + + if r not in headers and clmno <= 2: + for j in range(clmno): + if not tbl[i][j]: + continue + txt = "".join([a["text"].strip() for a in tbl[i][j]]) + if txt: + rtxt.append(txt) + if rtxt: + append(":") + continue + + for j in range(clmno): + if not tbl[i][j]: + continue + txt = 
"".join([a["text"].strip() for a in tbl[i][j]]) + if not txt: + continue + ctt = headers[r][j] if r in headers else "" + if ctt: + ctt += ":" + ctt += txt + if ctt: + rtxt.append(ctt) + + if rtxt: + row_txt.append("; ".join(rtxt)) + + if cap: + if is_english: + from_ = " in " + else: + from_ = "来自" + row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt] + return row_txt + + @staticmethod + def __cal_spans(boxes, rows, cols, tbl, html=True): + # caculate span + clft = [np.mean([c.get("C_left", c["x0"]) for c in cln]) for cln in cols] + crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln]) for cln in cols] + rtop = [np.mean([c.get("R_top", c["top"]) for c in row]) for row in rows] + rbtm = [np.mean([c.get("R_btm", c["bottom"]) for c in row]) for row in rows] + for b in boxes: + if "SP" not in b: + continue + b["colspan"] = [b["cn"]] + b["rowspan"] = [b["rn"]] + # col span + for j in range(0, len(clft)): + if j == b["cn"]: + continue + if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]: + continue + if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]: + continue + b["colspan"].append(j) + # row span + for j in range(0, len(rtop)): + if j == b["rn"]: + continue + if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]: + continue + if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]: + continue + b["rowspan"].append(j) + + def join(arr): + if not arr: + return "" + return "".join([t["text"] for t in arr]) + + # rm the spaning cells + for i in range(len(tbl)): + for j, arr in enumerate(tbl[i]): + if not arr: + continue + if all(["rowspan" not in a and "colspan" not in a for a in arr]): + continue + rowspan, colspan = [], [] + for a in arr: + if isinstance(a.get("rowspan", 0), list): + rowspan.extend(a["rowspan"]) + if isinstance(a.get("colspan", 0), list): + colspan.extend(a["colspan"]) + rowspan, colspan = set(rowspan), set(colspan) + if len(rowspan) < 2 and len(colspan) < 2: + for a in arr: + if "rowspan" in a: + del a["rowspan"] + if "colspan" in a: + del a["colspan"] 
+ continue + rowspan, colspan = sorted(rowspan), sorted(colspan) + rowspan = list(range(rowspan[0], rowspan[-1] + 1)) + colspan = list(range(colspan[0], colspan[-1] + 1)) + assert i in rowspan, rowspan + assert j in colspan, colspan + arr = [] + for r in rowspan: + for c in colspan: + arr_txt = join(arr) + if tbl[r][c] and join(tbl[r][c]) != arr_txt: + arr.extend(tbl[r][c]) + tbl[r][c] = None if html else arr + for a in arr: + if len(rowspan) > 1: + a["rowspan"] = len(rowspan) + elif "rowspan" in a: + del a["rowspan"] + if len(colspan) > 1: + a["colspan"] = len(colspan) + elif "colspan" in a: + del a["colspan"] + tbl[rowspan[0]][colspan[0]] = arr + + return tbl + + def _run_ascend_tsr(self, image_list, thr=0.2, batch_size=16): + import math + + from ais_bench.infer.interface import InferSession + + model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc") + model_file_path = os.path.join(model_dir, "tsr.om") + + if not os.path.exists(model_file_path): + raise ValueError(f"Model file not found: {model_file_path}") + + device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0)) + session = InferSession(device_id=device_id, model_path=model_file_path) + + images = [np.array(im) if not isinstance(im, np.ndarray) else im for im in image_list] + results = [] + + conf_thr = max(thr, 0.08) + + batch_loop_cnt = math.ceil(float(len(images)) / batch_size) + for bi in range(batch_loop_cnt): + s = bi * batch_size + e = min((bi + 1) * batch_size, len(images)) + batch_images = images[s:e] + + inputs_list = self.preprocess(batch_images) + for ins in inputs_list: + feeds = [] + if "image" in ins: + feeds.append(ins["image"]) + else: + feeds.append(ins[self.input_names[0]]) + output_list = session.infer(feeds=feeds, mode="static") + bb = self.postprocess(output_list, ins, conf_thr) + results.append(bb) + return results diff --git a/docker/.env b/docker/.env index db3a4cc..d4f3bc2 100644 --- a/docker/.env +++ b/docker/.env @@ -98,7 +98,7 @@ 
ADMIN_SVR_HTTP_PORT=9381 # The RAGFlow Docker image to download. # Defaults to the v0.21.1-slim edition, which is the RAGFlow Docker image without embedding models. -RAGFLOW_IMAGE=infiniflow/ragflow:v0.21.1-slim +RAGFLOW_IMAGE=infiniflow/ragflow:v0.21.1-fastapi # # To download the RAGFlow Docker image with embedding models, uncomment the following line instead: # RAGFLOW_IMAGE=infiniflow/ragflow:v0.21.1 diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh old mode 100644 new mode 100755 index 7f660d0..2b68439 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -188,7 +188,7 @@ if [[ "${ENABLE_WEBSERVER}" -eq 1 ]]; then echo "Starting ragflow_server..." while true; do - "$PY" api/ragflow_server.py + "$PY" api/ragflow_server_fastapi.py done & fi diff --git a/pyproject.toml b/pyproject.toml index e618534..4d1c0f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,14 @@ dependencies = [ "elastic-transport==8.12.0", "elasticsearch==8.12.1", "elasticsearch-dsl==8.12.0", + "email-validator>=2.0.0", "extract-msg>=0.39.0", "filelock==3.15.4", "flask==3.0.3", "flask-cors==5.0.0", "flask-login==0.6.3", "flask-session==0.8.0", + "fastapi==0.118.2", "google-search-results==2.4.2", "groq==0.9.0", "hanziconv==0.3.2", diff --git a/rag/res/deepdoc/README.md b/rag/res/deepdoc/README.md new file mode 100644 index 0000000..14f1992 --- /dev/null +++ b/rag/res/deepdoc/README.md @@ -0,0 +1,19 @@ +--- +license: apache-2.0 +--- + +### Model Loading +```python +import xgboost as xgb +import torch + +model = xgb.Booster() +if torch.cuda.is_available(): +model.set_param({"device": "cuda"}) +model.load_model('InfiniFlow/text_concat_xgb_v1.0') +``` + +### Prediction +```python +model.predict(xgb.DMatrix([feature]))[0] +``` \ No newline at end of file diff --git a/rag/res/deepdoc/det.onnx b/rag/res/deepdoc/det.onnx new file mode 100644 index 0000000..1394b95 Binary files /dev/null and b/rag/res/deepdoc/det.onnx differ diff --git a/rag/res/deepdoc/ocr.res 
b/rag/res/deepdoc/ocr.res new file mode 100644 index 0000000..84b885d --- /dev/null +++ b/rag/res/deepdoc/ocr.res @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! +姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. 
+鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 
+表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 
+昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 
+磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 
+昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 
+桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 
+怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 
+視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. 
+f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/rag/res/deepdoc/rec.onnx b/rag/res/deepdoc/rec.onnx new file mode 100644 index 0000000..9da312a Binary files /dev/null and b/rag/res/deepdoc/rec.onnx differ diff --git a/rag/res/deepdoc/updown_concat_xgb.model b/rag/res/deepdoc/updown_concat_xgb.model new file mode 100644 index 0000000..49c015a Binary files /dev/null and b/rag/res/deepdoc/updown_concat_xgb.model differ