v0.21.1-fastapi

commit d57b5d76ae (parent 3e58c3d0e9)
Date: 2025-11-04 16:06:36 +08:00
218 changed files with 19617 additions and 72339 deletions


@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import concurrent
# from beartype import BeartypeConf
# from beartype.claw import beartype_all # <-- you didn't sign up for this
# beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code
@@ -32,7 +32,7 @@ from api.utils.log_utils import init_root_logger, get_project_base_directory
from graphrag.general.index import run_graphrag_for_kb
from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
from rag.flow.pipeline import Pipeline
-from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging
+from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging, run_toc_from_text
import logging
import os
from datetime import datetime
@@ -228,9 +228,10 @@ async def collect():
canceled = False
if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
task = msg
if task["task_type"] in ["graphrag", "raptor", "mindmap"] and msg.get("doc_ids", []):
if task["task_type"] in ["graphrag", "raptor", "mindmap"]:
task = TaskService.get_task(msg["id"], msg["doc_ids"])
task["doc_ids"] = msg["doc_ids"]
task["doc_id"] = msg["doc_id"]
task["doc_ids"] = msg.get("doc_ids", []) or []
else:
task = TaskService.get_task(msg["id"])
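
The collect() change above stops requiring doc_ids on the message and instead records both doc_id and a normalized doc_ids list on the task. A minimal sketch of the normalization, using a hypothetical msg dict:

    def normalized_doc_ids(msg: dict) -> list:
        # .get() covers a missing "doc_ids" key; the trailing `or []` also
        # covers an explicit None (or any other falsy value) stored there.
        return msg.get("doc_ids", []) or []

    assert normalized_doc_ids({}) == []
    assert normalized_doc_ids({"doc_ids": None}) == []
    assert normalized_doc_ids({"doc_ids": ["d1", "d2"]}) == ["d1", "d2"]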
@@ -317,7 +318,7 @@ async def build_chunks(task, progress_callback):
d["img_id"] = ""
docs.append(d)
return
-await image2id(d, partial(STORAGE_IMPL.put), d["id"], task["kb_id"])
+await image2id(d, partial(STORAGE_IMPL.put, tenant_id=task["tenant_id"]), d["id"], task["kb_id"])
docs.append(d)
except Exception:
logging.exception(
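
The image2id change pre-binds tenant_id via functools.partial so the storage callback keeps its original positional call shape. A minimal sketch, assuming a put(bucket, name, binary) style signature; the real STORAGE_IMPL API may differ:

    from functools import partial

    class FakeStorage:
        # Hypothetical stand-in for STORAGE_IMPL; only the call shape matters.
        def put(self, bucket, name, binary, tenant_id=None):
            print(f"put {bucket}/{name} ({len(binary)} bytes) for tenant {tenant_id}")

    put = partial(FakeStorage().put, tenant_id="tenant-42")
    put("kb-1", "figure.png", b"\x89PNG")  # tenant_id is supplied implicitly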
@@ -380,7 +381,7 @@ async def build_chunks(task, progress_callback):
examples = []
all_tags = get_tags_from_cache(kb_ids)
if not all_tags:
-all_tags = settings.retrievaler.all_tags_in_portion(tenant_id, kb_ids, S)
+all_tags = settings.retriever.all_tags_in_portion(tenant_id, kb_ids, S)
set_tags_to_cache(kb_ids, all_tags)
else:
all_tags = json.loads(all_tags)
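
The hunk above is a read-through cache: get_tags_from_cache returns a serialized blob on a hit (hence the json.loads in the else branch), while a miss computes the tags and writes them back. The same shape in miniature, with an in-memory dict standing in for the real cache:

    import json

    _cache: dict = {}  # stand-in for the real tag cache

    def get_or_compute_tags(kb_ids, compute_all_tags):
        key = ",".join(sorted(kb_ids))
        cached = _cache.get(key)
        if not cached:
            all_tags = compute_all_tags()       # e.g. retriever.all_tags_in_portion(...)
            _cache[key] = json.dumps(all_tags)  # miss: compute, store serialized
            return all_tags
        return json.loads(cached)               # hit: deserialize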
@@ -393,7 +394,7 @@ async def build_chunks(task, progress_callback):
if task_canceled:
progress_callback(-1, msg="Task has been canceled.")
return
-if settings.retrievaler.tag_content(tenant_id, kb_ids, d, all_tags, topn_tags=topn_tags, S=S) and len(d[TAG_FLD]) > 0:
+if settings.retriever.tag_content(tenant_id, kb_ids, d, all_tags, topn_tags=topn_tags, S=S) and len(d[TAG_FLD]) > 0:
examples.append({"content": d["content_with_weight"], TAG_FLD: d[TAG_FLD]})
else:
docs_to_tag.append(d)
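
The tagging hunk partitions chunks into two buckets: chunks that tag_content can label cheaply from cached statistics become few-shot examples, and the rest are deferred to an LLM pass. The shape of that split, with a hypothetical cheap_tag predicate standing in for settings.retriever.tag_content:

    def cheap_tag(d):
        # Hypothetical: assign tags from statistics when possible, report success.
        d["tags"] = {"demo": 1} if "demo" in d["content_with_weight"] else {}
        return bool(d["tags"])

    chunks = [{"content_with_weight": "a demo paragraph"}, {"content_with_weight": "plain text"}]
    examples, docs_to_tag = [], []
    for d in chunks:
        if cheap_tag(d):
            examples.append({"content": d["content_with_weight"], "tags": d["tags"]})
        else:
            docs_to_tag.append(d)  # goes to the LLM pass, with `examples` as few-shots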
@@ -419,6 +420,39 @@ async def build_chunks(task, progress_callback):
return docs
+def build_TOC(task, docs, progress_callback):
+    progress_callback(msg="Start to generate table of contents ...")
+    chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
+    docs = sorted(docs, key=lambda d: (
+        d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
+        d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
+    ))
+    toc: list[dict] = trio.run(run_toc_from_text, [d["content_with_weight"] for d in docs], chat_mdl, progress_callback)
+    logging.info("------------ T O C -------------\n" + json.dumps(toc, ensure_ascii=False, indent=' '))
+    ii = 0
+    while ii < len(toc):
+        try:
+            idx = int(toc[ii]["chunk_id"])
+            del toc[ii]["chunk_id"]
+            toc[ii]["ids"] = [docs[idx]["id"]]
+            if ii == len(toc) - 1:
+                break
+            for jj in range(idx + 1, int(toc[ii + 1]["chunk_id"]) + 1):
+                toc[ii]["ids"].append(docs[jj]["id"])
+        except Exception as e:
+            logging.exception(e)
+        ii += 1
+    if toc:
+        d = copy.deepcopy(docs[-1])
+        d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
+        d["toc_kwd"] = "toc"
+        d["available_int"] = 0
+        d["page_num_int"] = [100000000]
+        d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
+        return d
def init_kb(row, vector_size: int):
idxnm = search.index_name(row["tenant_id"])
return settings.docStoreConn.createIdx(idxnm, row.get("kb_id", ""), vector_size)
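
The while loop in build_TOC above turns each TOC entry's chunk_id (an index into the sorted docs list) into a list of concrete chunk ids spanning up to and including the next entry's index; the final entry keeps only its own chunk. A toy run of the same expansion:

    docs = [{"id": f"c{i}"} for i in range(5)]
    toc = [{"title": "Intro", "chunk_id": "0"},
           {"title": "Method", "chunk_id": "2"},
           {"title": "Results", "chunk_id": "4"}]

    for ii, entry in enumerate(toc):
        start = int(entry.pop("chunk_id"))
        nxt = int(toc[ii + 1]["chunk_id"]) if ii + 1 < len(toc) else start
        entry["ids"] = [docs[j]["id"] for j in range(start, nxt + 1)]

    # Intro -> [c0, c1, c2], Method -> [c2, c3, c4], Results -> [c4].
    # Boundary chunks are shared, mirroring range(idx + 1, next_idx + 1) above.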
@@ -645,7 +679,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
chunks = []
vctr_nm = "q_%d_vec"%vector_size
for doc_id in doc_ids:
-for d in settings.retrievaler.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
+for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
fields=["content_with_weight", vctr_nm],
sort_by_position=True):
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
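
In the RAPTOR input gathered above, each stored chunk carries its embedding under a dimension-suffixed field, and vctr_nm = "q_%d_vec" % vector_size reconstructs that field name. A sketch with made-up data:

    import numpy as np

    vector_size = 4                      # real deployments use e.g. 768 or 1024
    vctr_nm = "q_%d_vec" % vector_size   # -> "q_4_vec", the stored vector field
    d = {"content_with_weight": "chunk text", vctr_nm: [0.1, 0.2, 0.3, 0.4]}
    chunks = [(d["content_with_weight"], np.array(d[vctr_nm]))]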
@@ -659,7 +693,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
raptor_config["threshold"],
)
original_length = len(chunks)
-chunks = await raptor(chunks, row["kb_parser_config"]["raptor"]["random_seed"], callback)
+chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback)
doc = {
"doc_id": fake_doc_id,
"kb_id": [str(row["kb_id"])],
@@ -721,7 +755,7 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
return True
-@timeout(60*60*2, 1)
+@timeout(60*60*3, 1)
async def do_handle_task(task):
task_type = task.get("task_type", "")
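
The decorator bump above raises the per-task ceiling from two hours to three. For orientation, a minimal sketch of what a trio-based @timeout(seconds, ...) wrapper could look like; the project's actual helper, including the meaning of its second argument, may differ:

    import functools
    import trio

    def timeout(seconds, _retries=1):  # second argument's semantics are assumed
        def decorator(fn):
            @functools.wraps(fn)
            async def wrapper(*args, **kwargs):
                with trio.fail_after(seconds):  # raises trio.TooSlowError on expiry
                    return await fn(*args, **kwargs)
            return wrapper
        return decorator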
@@ -741,6 +775,8 @@ async def do_handle_task(task):
task_document_name = task["name"]
task_parser_config = task["parser_config"]
task_start_ts = timer()
+toc_thread = None
+executor = concurrent.futures.ThreadPoolExecutor()
# prepare the progress callback function
progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
@@ -782,8 +818,22 @@ async def do_handle_task(task):
kb_parser_config = kb.parser_config
if not kb_parser_config.get("raptor", {}).get("use_raptor", False):
progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
-return
+kb_parser_config.update(
+    {
+        "raptor": {
+            "use_raptor": True,
+            "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.",
+            "max_token": 256,
+            "threshold": 0.1,
+            "max_cluster": 64,
+            "random_seed": 0,
+        },
+    }
+)
+if not KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config}):
+    progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
+    return
# bind LLM for raptor
chat_model = LLMBundle(task_tenant_id, LLMType.CHAT, llm_name=task_llm_id, lang=task_language)
# run RAPTOR
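
One subtlety in the default-seeding above: dict.update replaces the entire nested "raptor" value rather than merging keys. That is safe here because this branch only runs when no usable raptor config exists, but it is worth keeping in mind:

    cfg = {"raptor": {"use_raptor": False, "prompt": "custom"}}
    cfg.update({"raptor": {"use_raptor": True, "max_token": 256}})
    # The whole nested dict is swapped out; the custom prompt is gone.
    assert cfg["raptor"] == {"use_raptor": True, "max_token": 256}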
@@ -806,8 +856,25 @@ async def do_handle_task(task):
kb_parser_config = kb.parser_config
if not kb_parser_config.get("graphrag", {}).get("use_graphrag", False):
progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
-return
+kb_parser_config.update(
+    {
+        "graphrag": {
+            "use_graphrag": True,
+            "entity_types": [
+                "organization",
+                "person",
+                "geo",
+                "event",
+                "category",
+            ],
+            "method": "light",
+        }
+    }
+)
+if not KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config}):
+    progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
+    return
graphrag_conf = kb_parser_config.get("graphrag", {})
start_ts = timer()
@@ -842,8 +909,6 @@ async def do_handle_task(task):
if not chunks:
progress_callback(1., msg=f"No chunk built from {task_document_name}")
return
-# TODO: exception handler
-## set_progress(task["did"], -1, "ERROR: ")
progress_callback(msg="Generate {} chunks".format(len(chunks)))
start_ts = timer()
try:
@@ -857,6 +922,8 @@ async def do_handle_task(task):
progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
logging.info(progress_message)
progress_callback(msg=progress_message)
if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False):
toc_thread = executor.submit(build_TOC,task, chunks, progress_callback)
chunk_count = len(set([chunk["id"] for chunk in chunks]))
start_ts = timer()
@@ -871,8 +938,17 @@ async def do_handle_task(task):
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
time_cost = timer() - start_ts
progress_callback(msg="Indexing done ({:.2f}s).".format(time_cost))
+if toc_thread:
+    d = toc_thread.result()
+    if d:
+        e = await insert_es(task_id, task_tenant_id, task_dataset_id, [d], progress_callback)
+        if not e:
+            return
+        DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, 0, 1, 0)
task_time_cost = timer() - task_start_ts
progress_callback(prog=1.0, msg="Indexing done ({:.2f}s). Task done ({:.2f}s)".format(time_cost, task_time_cost))
progress_callback(prog=1.0, msg="Task done ({:.2f}s)".format(task_time_cost))
logging.info(
"Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page,
task_to_page, len(chunks),
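
Taken together with the toc_thread = executor.submit(...) hunk earlier, the TOC pass runs on a worker thread while the regular chunks are embedded and indexed, and toc_thread.result() is the join point before the extra TOC chunk is inserted. The round trip in miniature, with a stub in place of build_TOC:

    import concurrent.futures
    import time

    def build_toc_stub(chunks):
        time.sleep(0.1)  # stands in for the LLM-driven TOC pass
        return {"toc_kwd": "toc", "ids": [c["id"] for c in chunks]}

    executor = concurrent.futures.ThreadPoolExecutor()
    future = executor.submit(build_toc_stub, [{"id": "c0"}, {"id": "c1"}])
    # ... embedding and indexing of the regular chunks happens here ...
    toc_doc = future.result()  # blocks only if the TOC pass is still running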
@@ -977,13 +1053,14 @@ async def task_manager():
async def main():
logging.info(r"""
-______ __ ______ __
-/_ __/___ ______/ /__ / ____/ _____ _______ __/ /_____ _____
-/ / / __ `/ ___/ //_/ / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/
-/ / / /_/ (__ ) ,< / /____> </ __/ /__/ /_/ / /_/ /_/ / /
-/_/ \__,_/____/_/|_| /_____/_/|_|\___/\___/\__,_/\__/\____/_/
+____ __ _
+/ _/___ ____ ____ _____/ /_(_)___ ____ ________ ______ _____ _____
+/ // __ \/ __ `/ _ \/ ___/ __/ / __ \/ __ \ / ___/ _ \/ ___/ | / / _ \/ ___/
+_/ // / / / /_/ / __(__ ) /_/ / /_/ / / / / (__ ) __/ / | |/ / __/ /
+/___/_/ /_/\__, /\___/____/\__/_/\____/_/ /_/ /____/\___/_/ |___/\___/_/
+/____/
""")
-logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
+logging.info(f'RAGFlow version: {get_ragflow_version()}')
settings.init_settings()
print_rag_settings()
if sys.platform != "win32":