v0.21.1-fastapi
This commit is contained in:
@@ -12,7 +12,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import concurrent
|
||||
# from beartype import BeartypeConf
|
||||
# from beartype.claw import beartype_all # <-- you didn't sign up for this
|
||||
# beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code
|
||||
@@ -32,7 +32,7 @@ from api.utils.log_utils import init_root_logger, get_project_base_directory
|
||||
from graphrag.general.index import run_graphrag_for_kb
|
||||
from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
|
||||
from rag.flow.pipeline import Pipeline
|
||||
from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging
|
||||
from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging, run_toc_from_text
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
@@ -228,9 +228,10 @@ async def collect():
|
||||
canceled = False
|
||||
if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
|
||||
task = msg
|
||||
if task["task_type"] in ["graphrag", "raptor", "mindmap"] and msg.get("doc_ids", []):
|
||||
if task["task_type"] in ["graphrag", "raptor", "mindmap"]:
|
||||
task = TaskService.get_task(msg["id"], msg["doc_ids"])
|
||||
task["doc_ids"] = msg["doc_ids"]
|
||||
task["doc_id"] = msg["doc_id"]
|
||||
task["doc_ids"] = msg.get("doc_ids", []) or []
|
||||
else:
|
||||
task = TaskService.get_task(msg["id"])
|
||||
|
||||
@@ -317,7 +318,7 @@ async def build_chunks(task, progress_callback):
|
||||
d["img_id"] = ""
|
||||
docs.append(d)
|
||||
return
|
||||
await image2id(d, partial(STORAGE_IMPL.put), d["id"], task["kb_id"])
|
||||
await image2id(d, partial(STORAGE_IMPL.put, tenant_id=task["tenant_id"]), d["id"], task["kb_id"])
|
||||
docs.append(d)
|
||||
except Exception:
|
||||
logging.exception(
|
||||
@@ -380,7 +381,7 @@ async def build_chunks(task, progress_callback):
|
||||
examples = []
|
||||
all_tags = get_tags_from_cache(kb_ids)
|
||||
if not all_tags:
|
||||
all_tags = settings.retrievaler.all_tags_in_portion(tenant_id, kb_ids, S)
|
||||
all_tags = settings.retriever.all_tags_in_portion(tenant_id, kb_ids, S)
|
||||
set_tags_to_cache(kb_ids, all_tags)
|
||||
else:
|
||||
all_tags = json.loads(all_tags)
|
||||
@@ -393,7 +394,7 @@ async def build_chunks(task, progress_callback):
|
||||
if task_canceled:
|
||||
progress_callback(-1, msg="Task has been canceled.")
|
||||
return
|
||||
if settings.retrievaler.tag_content(tenant_id, kb_ids, d, all_tags, topn_tags=topn_tags, S=S) and len(d[TAG_FLD]) > 0:
|
||||
if settings.retriever.tag_content(tenant_id, kb_ids, d, all_tags, topn_tags=topn_tags, S=S) and len(d[TAG_FLD]) > 0:
|
||||
examples.append({"content": d["content_with_weight"], TAG_FLD: d[TAG_FLD]})
|
||||
else:
|
||||
docs_to_tag.append(d)
|
||||
@@ -419,6 +420,39 @@ async def build_chunks(task, progress_callback):
|
||||
return docs
|
||||
|
||||
|
||||
def build_TOC(task, docs, progress_callback):
|
||||
progress_callback(msg="Start to generate table of content ...")
|
||||
chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
|
||||
docs = sorted(docs, key=lambda d:(
|
||||
d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
|
||||
d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
|
||||
))
|
||||
toc: list[dict] = trio.run(run_toc_from_text, [d["content_with_weight"] for d in docs], chat_mdl, progress_callback)
|
||||
logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' '))
|
||||
ii = 0
|
||||
while ii < len(toc):
|
||||
try:
|
||||
idx = int(toc[ii]["chunk_id"])
|
||||
del toc[ii]["chunk_id"]
|
||||
toc[ii]["ids"] = [docs[idx]["id"]]
|
||||
if ii == len(toc) -1:
|
||||
break
|
||||
for jj in range(idx+1, int(toc[ii+1]["chunk_id"])+1):
|
||||
toc[ii]["ids"].append(docs[jj]["id"])
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
ii += 1
|
||||
|
||||
if toc:
|
||||
d = copy.deepcopy(docs[-1])
|
||||
d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
|
||||
d["toc_kwd"] = "toc"
|
||||
d["available_int"] = 0
|
||||
d["page_num_int"] = [100000000]
|
||||
d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
|
||||
return d
|
||||
|
||||
|
||||
def init_kb(row, vector_size: int):
|
||||
idxnm = search.index_name(row["tenant_id"])
|
||||
return settings.docStoreConn.createIdx(idxnm, row.get("kb_id", ""), vector_size)
|
||||
@@ -645,7 +679,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
||||
chunks = []
|
||||
vctr_nm = "q_%d_vec"%vector_size
|
||||
for doc_id in doc_ids:
|
||||
for d in settings.retrievaler.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
|
||||
for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
|
||||
fields=["content_with_weight", vctr_nm],
|
||||
sort_by_position=True):
|
||||
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
|
||||
@@ -659,7 +693,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
||||
raptor_config["threshold"],
|
||||
)
|
||||
original_length = len(chunks)
|
||||
chunks = await raptor(chunks, row["kb_parser_config"]["raptor"]["random_seed"], callback)
|
||||
chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback)
|
||||
doc = {
|
||||
"doc_id": fake_doc_id,
|
||||
"kb_id": [str(row["kb_id"])],
|
||||
@@ -721,7 +755,7 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
|
||||
return True
|
||||
|
||||
|
||||
@timeout(60*60*2, 1)
|
||||
@timeout(60*60*3, 1)
|
||||
async def do_handle_task(task):
|
||||
task_type = task.get("task_type", "")
|
||||
|
||||
@@ -741,6 +775,8 @@ async def do_handle_task(task):
|
||||
task_document_name = task["name"]
|
||||
task_parser_config = task["parser_config"]
|
||||
task_start_ts = timer()
|
||||
toc_thread = None
|
||||
executor = concurrent.futures.ThreadPoolExecutor()
|
||||
|
||||
# prepare the progress callback function
|
||||
progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
|
||||
@@ -782,8 +818,22 @@ async def do_handle_task(task):
|
||||
|
||||
kb_parser_config = kb.parser_config
|
||||
if not kb_parser_config.get("raptor", {}).get("use_raptor", False):
|
||||
progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
|
||||
return
|
||||
kb_parser_config.update(
|
||||
{
|
||||
"raptor": {
|
||||
"use_raptor": True,
|
||||
"prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.",
|
||||
"max_token": 256,
|
||||
"threshold": 0.1,
|
||||
"max_cluster": 64,
|
||||
"random_seed": 0,
|
||||
},
|
||||
}
|
||||
)
|
||||
if not KnowledgebaseService.update_by_id(kb.id, {"parser_config":kb_parser_config}):
|
||||
progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
|
||||
return
|
||||
|
||||
# bind LLM for raptor
|
||||
chat_model = LLMBundle(task_tenant_id, LLMType.CHAT, llm_name=task_llm_id, lang=task_language)
|
||||
# run RAPTOR
|
||||
@@ -806,8 +856,25 @@ async def do_handle_task(task):
|
||||
|
||||
kb_parser_config = kb.parser_config
|
||||
if not kb_parser_config.get("graphrag", {}).get("use_graphrag", False):
|
||||
progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
|
||||
return
|
||||
kb_parser_config.update(
|
||||
{
|
||||
"graphrag": {
|
||||
"use_graphrag": True,
|
||||
"entity_types": [
|
||||
"organization",
|
||||
"person",
|
||||
"geo",
|
||||
"event",
|
||||
"category",
|
||||
],
|
||||
"method": "light",
|
||||
}
|
||||
}
|
||||
)
|
||||
if not KnowledgebaseService.update_by_id(kb.id, {"parser_config":kb_parser_config}):
|
||||
progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
|
||||
return
|
||||
|
||||
|
||||
graphrag_conf = kb_parser_config.get("graphrag", {})
|
||||
start_ts = timer()
|
||||
@@ -842,8 +909,6 @@ async def do_handle_task(task):
|
||||
if not chunks:
|
||||
progress_callback(1., msg=f"No chunk built from {task_document_name}")
|
||||
return
|
||||
# TODO: exception handler
|
||||
## set_progress(task["did"], -1, "ERROR: ")
|
||||
progress_callback(msg="Generate {} chunks".format(len(chunks)))
|
||||
start_ts = timer()
|
||||
try:
|
||||
@@ -857,6 +922,8 @@ async def do_handle_task(task):
|
||||
progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
|
||||
logging.info(progress_message)
|
||||
progress_callback(msg=progress_message)
|
||||
if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False):
|
||||
toc_thread = executor.submit(build_TOC,task, chunks, progress_callback)
|
||||
|
||||
chunk_count = len(set([chunk["id"] for chunk in chunks]))
|
||||
start_ts = timer()
|
||||
@@ -871,8 +938,17 @@ async def do_handle_task(task):
|
||||
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
|
||||
|
||||
time_cost = timer() - start_ts
|
||||
progress_callback(msg="Indexing done ({:.2f}s).".format(time_cost))
|
||||
if toc_thread:
|
||||
d = toc_thread.result()
|
||||
if d:
|
||||
e = await insert_es(task_id, task_tenant_id, task_dataset_id, [d], progress_callback)
|
||||
if not e:
|
||||
return
|
||||
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, 0, 1, 0)
|
||||
|
||||
task_time_cost = timer() - task_start_ts
|
||||
progress_callback(prog=1.0, msg="Indexing done ({:.2f}s). Task done ({:.2f}s)".format(time_cost, task_time_cost))
|
||||
progress_callback(prog=1.0, msg="Task done ({:.2f}s)".format(task_time_cost))
|
||||
logging.info(
|
||||
"Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page,
|
||||
task_to_page, len(chunks),
|
||||
@@ -977,13 +1053,14 @@ async def task_manager():
|
||||
|
||||
async def main():
|
||||
logging.info(r"""
|
||||
______ __ ______ __
|
||||
/_ __/___ ______/ /__ / ____/ _____ _______ __/ /_____ _____
|
||||
/ / / __ `/ ___/ //_/ / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/
|
||||
/ / / /_/ (__ ) ,< / /____> </ __/ /__/ /_/ / /_/ /_/ / /
|
||||
/_/ \__,_/____/_/|_| /_____/_/|_|\___/\___/\__,_/\__/\____/_/
|
||||
____ __ _
|
||||
/ _/___ ____ ____ _____/ /_(_)___ ____ ________ ______ _____ _____
|
||||
/ // __ \/ __ `/ _ \/ ___/ __/ / __ \/ __ \ / ___/ _ \/ ___/ | / / _ \/ ___/
|
||||
_/ // / / / /_/ / __(__ ) /_/ / /_/ / / / / (__ ) __/ / | |/ / __/ /
|
||||
/___/_/ /_/\__, /\___/____/\__/_/\____/_/ /_/ /____/\___/_/ |___/\___/_/
|
||||
/____/
|
||||
""")
|
||||
logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
|
||||
logging.info(f'RAGFlow version: {get_ragflow_version()}')
|
||||
settings.init_settings()
|
||||
print_rag_settings()
|
||||
if sys.platform != "win32":
|
||||
|
||||
Reference in New Issue
Block a user