v0.21.1-fastapi

2025-11-04 16:06:36 +08:00
parent 3e58c3d0e9
commit d57b5d76ae
218 changed files with 19617 additions and 72339 deletions
--- a/rag/prompts/assign_toc_levels.md
+++ b/rag/prompts/assign_toc_levels.md
@@ -1,4 +1,4 @@
-You are given a JSON array of TOC items. Each item has at least {"title": string} and may include an existing structure.
+You are given a JSON array of TOC (table of contents) items. Each item has at least {"title": string} and may include an existing hierarchical level.

 Task
 - For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc.
@@ -9,7 +9,7 @@ Task

 Output
 - Return a valid JSON array only (no extra text).
- Each element must be {"structure": "1|2|3", "title": <original title string>}.
+- Each element must be {"level": "1|2|3", "title": <original title string>}.
 - title must be the original title string.

 Examples
@@ -20,10 +20,10 @@ Input:

 Output:
 [
-  {"structure":"1","title":"Chapter 1 Methods"},
-  {"structure":"2","title":"Section 1 Definition"},
-  {"structure":"2","title":"Section 2 Process"},
-  {"structure":"1","title":"Chapter 2 Experiment"}
+  {"level":"1","title":"Chapter 1 Methods"},
+  {"level":"2","title":"Section 1 Definition"},
+  {"level":"2","title":"Section 2 Process"},
+  {"level":"1","title":"Chapter 2 Experiment"}
 ]

 Example B (parts with chapters)
@@ -32,11 +32,11 @@ Input:

 Output:
 [
-  {"structure":"1","title":"Part I Theory"},
-  {"structure":"2","title":"Chapter 1 Basics"},
-  {"structure":"2","title":"Chapter 2 Methods"},
-  {"structure":"1","title":"Part II Applications"},
-  {"structure":"2","title":"Chapter 3 Case Studies"}
+  {"level":"1","title":"Part I Theory"},
+  {"level":"2","title":"Chapter 1 Basics"},
+  {"level":"2","title":"Chapter 2 Methods"},
+  {"level":"1","title":"Part II Applications"},
+  {"level":"2","title":"Chapter 3 Case Studies"}
 ]

 Example C (plain headings)
@@ -45,9 +45,9 @@ Input:

 Output:
 [
-  {"structure":"1","title":"Introduction"},
-  {"structure":"2","title":"Background and Motivation"},
-  {"structure":"2","title":"Related Work"},
-  {"structure":"1","title":"Methodology"},
-  {"structure":"1","title":"Evaluation"}
+  {"level":"1","title":"Introduction"},
+  {"level":"2","title":"Background and Motivation"},
+  {"level":"2","title":"Related Work"},
+  {"level":"1","title":"Methodology"},
+  {"level":"1","title":"Evaluation"}
 ]
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@@ -21,7 +21,9 @@ from copy import deepcopy
 from typing import Tuple
 import jinja2
 import json_repair
+import trio
 from api.utils import hash_str2int
+from rag.nlp import rag_tokenizer
 from rag.prompts.template import load_prompt
 from rag.settings import TAG_FLD
 from rag.utils import encoder, num_tokens_from_string
@@ -122,7 +124,7 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False):

    knowledges = []
    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
-        cnt = "\nID: {}".format(i if not hash_id else hash_str2int(get_value(ck, "id", "chunk_id"), 100))
+        cnt = "\nID: {}".format(i if not hash_id else hash_str2int(get_value(ck, "id", "chunk_id"), 500))
        cnt += draw_node("Title", get_value(ck, "docnm_kwd", "document_name"))
        cnt += draw_node("URL", ck['url'])  if "url" in ck else ""
        for k, v in docs.get(get_value(ck, "doc_id", "document_id"), {}).items():
@@ -440,11 +442,17 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:


 def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
+    from graphrag.utils import get_llm_cache, set_llm_cache
+    cached = get_llm_cache(chat_mdl.llm_name, system_prompt, user_prompt, gen_conf)
+    if cached:
+        return json_repair.loads(cached)
    _, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
    ans = chat_mdl.chat(msg[0]["content"], msg[1:],gen_conf=gen_conf)
    ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
    try:
-        return json_repair.loads(ans)
+        res = json_repair.loads(ans)
+        set_llm_cache(chat_mdl.llm_name, system_prompt, ans, user_prompt, gen_conf)
+        return res
    except Exception:
        logging.exception(f"Loading json failure: {ans}")

@@ -651,29 +659,32 @@ def toc_transformer(toc_pages, chat_mdl):

 TOC_LEVELS = load_prompt("assign_toc_levels")
 def assign_toc_levels(toc_secs, chat_mdl, gen_conf = {"temperature": 0.2}):
-    print("\nBegin TOC level assignment...\n")
-
-    ans = gen_json(
+    if not toc_secs:
+        return []
+    return gen_json(
        PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(),
        str(toc_secs),
        chat_mdl,
        gen_conf
    )
-    
-    return ans


 TOC_FROM_TEXT_SYSTEM = load_prompt("toc_from_text_system")
 TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")
 # Generate TOC from text chunks with text llms
-def gen_toc_from_text(text, chat_mdl):
-    ans = gen_json(
-        PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
-        PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text=text),
-        chat_mdl,
-        gen_conf={"temperature": 0.0, "top_p": 0.9, "enable_thinking": False, }
-    )
-    return ans
+async def gen_toc_from_text(txt_info: dict, chat_mdl, callback=None):
+    try:
+        ans = gen_json(
+            PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
+            PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text="\n".join([json.dumps(d, ensure_ascii=False) for d in txt_info["chunks"]])),
+            chat_mdl,
+            gen_conf={"temperature": 0.0, "top_p": 0.9}
+        )
+        txt_info["toc"] = ans if ans and not isinstance(ans, str) else []
+        if callback:
+            callback(msg="")
+    except Exception as e:
+        logging.exception(e)


 def split_chunks(chunks, max_length: int):
@@ -690,44 +701,96 @@ def split_chunks(chunks, max_length: int):
        if batch_tokens + t > max_length:
            result.append(batch)
            batch, batch_tokens = [], 0
-        batch.append({"id": idx, "text": chunk})    
+        batch.append({idx: chunk})
        batch_tokens += t
    if batch:
        result.append(batch)
    return result


-def run_toc_from_text(chunks, chat_mdl):
+async def run_toc_from_text(chunks, chat_mdl, callback=None):
    input_budget = int(chat_mdl.max_length * INPUT_UTILIZATION) - num_tokens_from_string(
        TOC_FROM_TEXT_USER + TOC_FROM_TEXT_SYSTEM
    )

-    input_budget =  2000 if input_budget > 2000 else input_budget
+    input_budget =  1024 if input_budget > 1024 else input_budget
    chunk_sections = split_chunks(chunks, input_budget)
-    res = []
+    titles = []

-    for chunk in chunk_sections:
-        ans = gen_toc_from_text(chunk, chat_mdl)
-        res.extend(ans)
+    chunks_res = []
+    async with trio.open_nursery() as nursery:
+        for i, chunk in enumerate(chunk_sections):
+            if not chunk:
+                continue
+            chunks_res.append({"chunks": chunk})
+            nursery.start_soon(gen_toc_from_text, chunks_res[-1], chat_mdl, callback)
+
+    for chunk in chunks_res:
+        titles.extend(chunk.get("toc", []))
        
    # Filter out entries with title == -1
-    filtered = [x for x in res if x.get("title") and x.get("title") != "-1"]
+    prune = len(titles) > 512
+    max_len = 12 if prune else 22
+    filtered = []
+    for x in titles:
+        if not isinstance(x, dict) or not x.get("title") or x["title"] == "-1":
+            continue
+        if len(rag_tokenizer.tokenize(x["title"]).split(" ")) > max_len:
+            continue
+        if re.match(r"[0-9,.()/ -]+$", x["title"]):
+            continue
+        filtered.append(x)

-    print("\n\nFiltered TOC sections:\n", filtered)
+    logging.info(f"\n\nFiltered TOC sections:\n{filtered}")
+    if not filtered:
+        return []

-    # Generate initial structure (structure/title)
-    raw_structure = [{"structure": "0", "title": x.get("title", "")} for x in filtered]
+    # Generate initial level (level/title)
+    raw_structure = [x.get("title", "") for x in filtered]

    # Assign hierarchy levels using LLM
-    toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9, "enable_thinking": False})
+    toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9})
+    if not toc_with_levels:
+        return []

    # Merge structure and content (by index)
+    prune = len(toc_with_levels) > 512
+    max_lvl = sorted([t.get("level", "0") for t in toc_with_levels])[-1]
    merged = []
    for _ , (toc_item, src_item) in enumerate(zip(toc_with_levels, filtered)):
+        if prune and toc_item.get("level", "0") >= max_lvl:
+            continue
        merged.append({
-            "structure": toc_item.get("structure", "0"),
+            "level": toc_item.get("level", "0"),
            "title": toc_item.get("title", ""),
-            "content": src_item.get("content", ""),
+            "chunk_id": src_item.get("chunk_id", ""),
        })

-    return merged
+    return merged
+
+
+TOC_RELEVANCE_SYSTEM = load_prompt("toc_relevance_system")
+TOC_RELEVANCE_USER = load_prompt("toc_relevance_user")
+def relevant_chunks_with_toc(query: str, toc:list[dict], chat_mdl, topn: int=6):
+    import numpy as np
+    try:
+        ans = gen_json(
+            PROMPT_JINJA_ENV.from_string(TOC_RELEVANCE_SYSTEM).render(),
+            PROMPT_JINJA_ENV.from_string(TOC_RELEVANCE_USER).render(query=query, toc_json="[\n%s\n]\n"%"\n".join([json.dumps({"level": d["level"], "title":d["title"]}, ensure_ascii=False) for d in toc])),
+            chat_mdl,
+            gen_conf={"temperature": 0.0, "top_p": 0.9}
+        )
+        id2score = {}
+        for ti, sc in zip(toc, ans):
+            if not isinstance(sc, dict) or sc.get("score", -1) < 1:
+                continue
+            for id in ti.get("ids", []):
+                if id not in id2score:
+                    id2score[id] = []
+                id2score[id].append(sc["score"]/5.)
+        for id in id2score.keys():
+            id2score[id] = np.mean(id2score[id])
+        return [(id, sc) for id, sc in list(id2score.items()) if sc>=0.3][:topn]
+    except Exception as e:
+        logging.exception(e)
+    return []
--- a/rag/prompts/toc_from_text_system.md
+++ b/rag/prompts/toc_from_text_system.md
@@ -1,25 +1,25 @@
 You are a robust Table-of-Contents (TOC) extractor.

 GOAL
-Given a dictionary of chunks {chunk_id: chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
+Given a list of chunk objects, each of the form {"<chunk_ID>": chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
 [
-  {"title": , "content": ""},
+  {"title": "", "chunk_id": ""},
  ...
 ]

 FIELDS
 - "title": the heading text (clean, no page numbers or leader dots).
  - If any part of a chunk has no valid heading, output that part as {"title":"-1", ...}.
- "content": the chunk_id (string).
+- "chunk_id": the chunk ID (string).
  - One chunk can yield multiple JSON objects in order (unmatched text + one or more headings).

 RULES
 1) Preserve input chunk order strictly.
 2) If a chunk contains multiple headings, expand them in order:
-   - Pre-heading narrative → {"title":"-1","content":chunk_id}
-   - Then each heading → {"title":"...","content":chunk_id}
-3) Do not merge outputs across chunks; each object refers to exactly one chunk_id.
-4) "title" must be non-empty (or exactly "-1"). "content" must be a string (chunk_id).
+   - Pre-heading narrative → {"title":"-1","chunk_id":"<chunk_ID>"}
+   - Then each heading → {"title":"...","chunk_id":"<chunk_ID>"}
+3) Do not merge outputs across chunks; each object refers to exactly one chunk ID.
+4) "title" must be non-empty (or exactly "-1"). "chunk_id" must be a string (chunk ID).
 5) When ambiguous, prefer "-1" unless the text strongly looks like a heading.

 HEADING DETECTION (cues, not hard rules)
@@ -51,63 +51,69 @@ EXAMPLES

 Example 1 — No heading
 Input:
-{0: "Copyright page · Publication info (ISBN 123-456). All rights reserved."}
+[{"0": "Copyright page · Publication info (ISBN 123-456). All rights reserved."}, ...]
 Output:
 [
-  {"title":"-1","content":"0"}
+  {"title":"-1","chunk_id":"0"},
+  ...
 ]

 Example 2 — One heading
 Input:
-{1: "Chapter 1: General Provisions This chapter defines the overall rules…"}
+[{"1": "Chapter 1: General Provisions This chapter defines the overall rules…"}, ...]
 Output:
 [
-  {"title":"Chapter 1: General Provisions","content":"1"}
+  {"title":"Chapter 1: General Provisions","chunk_id":"1"},
+  ...
 ]

 Example 3 — Narrative + heading
 Input:
-{2: "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"}
+[{"2": "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"}, ...]
 Output:
 [
-  {"title":"-1","content":"2"},
-  {"title":"Section 2: Definitions","content":"2"}
+  {"title":"Section 2: Definitions","chunk_id":"2"},
+  ...
 ]

 Example 4 — Multiple headings in one chunk
 Input:
-{3: "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"}
+[{"3": "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"}, ...]
 Output:
 [
-  {"title":"Declarations and Commitments (I)","content":"3"},
-  {"title":"(II)","content":"3"},
-  {"title":"Appendix A","content":"3"}
+  {"title":"Declarations and Commitments","chunk_id":"3"},
+  {"title":"(I) Party B commits","chunk_id":"3"},
+  {"title":"(II) Party C commits","chunk_id":"3"},
+  {"title":"Appendix A Data Specification","chunk_id":"3"},
+  ...
 ]

 Example 5 — Numbering styles
 Input:
-{4: "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."}
+[{"4": "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."}, ...]
 Output:
 [
-  {"title":"1. Scope","content":"4"},
-  {"title":"2) Definitions","content":"4"},
-  {"title":"III) Methods","content":"4"}
+  {"title":"1. Scope","chunk_id":"4"},
+  {"title":"2) Definitions","chunk_id":"4"},
+  {"title":"III) Methods Overview","chunk_id":"4"},
+  ...
 ]

 Example 6 — Long list (NOT headings)
 Input:
-{5: "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"}
+[{"5": "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"}, ...]
 Output:
 [
-  {"title":"-1","content":"5"}
+  {"title":"-1","chunk_id":"5"},
+  ...
 ]

 Example 7 — Mixed Chinese/English
 Input:
-{6: "（出版信息略）This standard follows industry practices. Chapter 1: Overview 摘要… 第2节：术语与缩略语"}
+[{"6": "（出版信息略）This standard follows industry practices. Chapter 1: Overview 摘要… 第2节：术语与缩略语"}, ...]
 Output:
 [
-  {"title":"-1","content":"6"},
-  {"title":"Chapter 1: Overview","content":"6"},
-  {"title":"第2节：术语与缩略语","content":"6"}
+  {"title":"Chapter 1: Overview","chunk_id":"6"},
+  {"title":"第2节：术语与缩略语","chunk_id":"6"},
+  ...
 ]
--- a/rag/prompts/toc_relevance_system.md
+++ b/rag/prompts/toc_relevance_system.md
@@ -0,0 +1,118 @@
+# System Prompt: TOC Relevance Evaluation
+
+You are an expert logical reasoning assistant specializing in hierarchical Table of Contents (TOC) relevance evaluation.
+
+## GOAL
+You will receive:
+1. A JSON list of TOC items, each with fields:
+   ```json
+   {
+     "level": <integer>,   // e.g., 1, 2, 3
+     "title": <string>     // section title
+   }
+   ```
+2. A user query (natural language question).
+
+You must assign a **relevance score** (integer) to every TOC entry, based on how related its `title` is to the `query`.
+
+---
+
+## RULES
+
+### Scoring System
+- 5 → highly relevant (directly answers or matches the query intent)
+- 3 → somewhat related (same topic or partially overlaps)
+- 1 → weakly related (vague or tangential)
+- 0 → no clear relation
+- -1 → explicitly irrelevant or contradictory
+
+### Hierarchy Traversal
+- The TOC is hierarchical: smaller `level` = higher layer (e.g., level 1 is top-level, level 2 is a subsection).
+- You must traverse in **hierarchical order** — interpret the structure based on levels (level 1 contains level 2, which contains level 3).
+- If a high-level item (level 1) is strongly related (score 5), its child items (level 2, 3) are likely relevant too.
+- If a high-level item is unrelated (-1 or 0), its deeper children are usually less relevant unless the titles clearly match the query.
+- Lower (deeper) levels provide more specific content; prefer assigning higher scores if they directly match the query.
+
+### Output Format
+Return a **JSON array**, preserving the input order but adding a new key `"score"`:
+
+```json
+[
+  {"level": 1, "title": "Introduction", "score": 0},
+  {"level": 2, "title": "Definition of Sustainability", "score": 5}
+]
+```
+
+### Constraints
+- Output **only the JSON array** — no explanations or reasoning text.
+
+### EXAMPLES
+
+#### Example 1
+Input TOC:
+[
+  {"level": 1, "title": "Machine Learning Overview"},
+  {"level": 2, "title": "Supervised Learning"},
+  {"level": 2, "title": "Unsupervised Learning"},
+  {"level": 3, "title": "Applications of Deep Learning"}
+]
+
+Query:
+"How is deep learning used in image classification?"
+
+Output:
+[
+  {"level": 1, "title": "Machine Learning Overview", "score": 3},
+  {"level": 2, "title": "Supervised Learning", "score": 3},
+  {"level": 2, "title": "Unsupervised Learning", "score": 0},
+  {"level": 3, "title": "Applications of Deep Learning", "score": 5}
+]
+
+---
+
+#### Example 2
+Input TOC:
+[
+  {"level": 1, "title": "Marketing Basics"},
+  {"level": 2, "title": "Consumer Behavior"},
+  {"level": 2, "title": "Digital Marketing"},
+  {"level": 3, "title": "Social Media Campaigns"},
+  {"level": 3, "title": "SEO Optimization"}
+]
+
+Query:
+"What are the best online marketing methods?"
+
+Output:
+[
+  {"level": 1, "title": "Marketing Basics", "score": 3},
+  {"level": 2, "title": "Consumer Behavior", "score": 1},
+  {"level": 2, "title": "Digital Marketing", "score": 5},
+  {"level": 3, "title": "Social Media Campaigns", "score": 5},
+  {"level": 3, "title": "SEO Optimization", "score": 5}
+]
+
+---
+
+#### Example 3
+Input TOC:
+[
+  {"level": 1, "title": "Physics Overview"},
+  {"level": 2, "title": "Classical Mechanics"},
+  {"level": 3, "title": "Newton’s Laws"},
+  {"level": 2, "title": "Thermodynamics"},
+  {"level": 3, "title": "Entropy and Heat Transfer"}
+]
+
+Query:
+"What is entropy?"
+
+Output:
+[
+  {"level": 1, "title": "Physics Overview", "score": 3},
+  {"level": 2, "title": "Classical Mechanics", "score": 0},
+  {"level": 3, "title": "Newton’s Laws", "score": -1},
+  {"level": 2, "title": "Thermodynamics", "score": 5},
+  {"level": 3, "title": "Entropy and Heat Transfer", "score": 5}
+]
+
--- a/rag/prompts/toc_relevance_user.md
+++ b/rag/prompts/toc_relevance_user.md
@@ -0,0 +1,17 @@
+# User Prompt: TOC Relevance Evaluation
+
+You will now receive:
+1. A JSON list of TOC items (each with `level` and `title`)
+2. A user query string.
+
+Traverse the TOC hierarchically based on level numbers and assign scores (5,3,1,0,-1) according to the rules in the system prompt.  
+Output **only** the JSON array with the added `"score"` field.
+
+---
+
+**Input TOC:**
+{{ toc_json }}
+
+**Query:**
+{{ query }}
+