v0.21.1-fastapi
@@ -459,12 +459,10 @@ def tree_merge(bull, sections, depth):
            return len(BULLET_PATTERN[bull])+1, text
        else:
            return len(BULLET_PATTERN[bull])+2, text

    level_set = set()
    lines = []
    for section in sections:
        level, text = get_level(bull, section)

        if not text.strip("\n"):
            continue

@@ -578,8 +576,7 @@ def hierarchical_merge(bull, sections, depth):


def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
    from ocr.service import get_ocr_service
    ocr_service = get_ocr_service()
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
    if not sections:
        return []
    if isinstance(sections, str):
@@ -599,7 +596,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
            if cks:
                overlapped = ocr_service.remove_tag_sync(cks[-1])
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
            if t.find(pos) < 0:
                t += pos
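The overlap logic above carries the tail of the previous chunk into the next one: it strips positional tags from the last chunk and prepends its trailing overlapped_percent share to the incoming text. A minimal standalone sketch of that slice arithmetic, using a plain string in place of the tag-stripped chunk (values are illustrative only):

# Sketch of the overlap carry-over used above; not the project API.
def carry_overlap(prev_chunk: str, new_text: str, overlapped_percent: int) -> str:
    # Keep the trailing `overlapped_percent` share of the previous chunk...
    overlapped = prev_chunk[int(len(prev_chunk) * (100 - overlapped_percent) / 100.):]
    # ...and prepend it so adjacent chunks share a little context.
    return overlapped + new_text

print(carry_overlap("ABCDEFGHIJ", "-next", 20))  # "IJ-next": the last 20% of the old chunk is reused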
@@ -614,20 +611,19 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
    dels = get_delimiters(delimiter)
    for sec, pos in sections:
        if num_tokens_from_string(sec) < chunk_token_num:
            add_chunk(sec, pos)
            add_chunk("\n"+sec, pos)
            continue
        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
        for sub_sec in split_sec:
            if re.match(f"^{dels}$", sub_sec):
                continue
            add_chunk(sub_sec, pos)
            add_chunk("\n"+sub_sec, pos)

    return cks
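The splitting step relies on re.split with a capturing group, which keeps the delimiters as separate list items so that pure-delimiter fragments can be dropped by the re.match(f"^{dels}$", ...) check. A self-contained sketch with a hand-built alternation standing in for get_delimiters(delimiter), which is not shown in this diff:

import re

# Assumed stand-in for get_delimiters(): an escaped alternation over a few sentence enders.
dels = "|".join(re.escape(d) for d in ["。", "！", "？", "\n"])

sec = "第一句。第二句！第三句？"
split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
# The capturing group keeps the delimiters as their own list items:
# ['第一句', '。', '第二句', '！', '第三句', '？', '']
parts = [s for s in split_sec if s and not re.match(f"^{dels}$", s)]
print(parts)  # ['第一句', '第二句', '第三句']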


def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
    from ocr.service import get_ocr_service
    ocr_service = get_ocr_service()
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
    if not texts or len(texts) != len(images):
        return [], []
    cks = [""]
@@ -644,7 +640,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
            if cks:
                overlapped = ocr_service.remove_tag_sync(cks[-1])
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
            if t.find(pos) < 0:
                t += pos
@@ -671,13 +667,13 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
            for sub_sec in split_sec:
                if re.match(f"^{dels}$", sub_sec):
                    continue
                add_chunk(sub_sec, image, text_pos)
                add_chunk("\n"+sub_sec, image, text_pos)
        else:
            split_sec = re.split(r"(%s)" % dels, text)
            for sub_sec in split_sec:
                if re.match(f"^{dels}$", sub_sec):
                    continue
                add_chunk(sub_sec, image)
                add_chunk("\n"+sub_sec, image)

    return cks, result_images

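naive_merge_with_images walks texts and images in lockstep and returns two parallel lists, so chunk i in cks corresponds to entry i in result_images. A rough, self-contained sketch of that pairing idea only; the real add_chunk also handles token counting and image merging, which is not reproduced here:

# Illustrative only: keep produced chunks and their source images in parallel lists.
def merge_with_images(texts, images, max_len=20):
    cks, result_images = [""], [None]          # leading empty chunk mirrors the cks = [""] initialization
    for text, image in zip(texts, images):
        if cks[-1] == "" or len(cks[-1]) > max_len:   # start a new chunk (length stands in for tokens)
            cks.append(text)
            result_images.append(image)
        else:                                         # extend the current chunk, keep its image slot
            cks[-1] += "\n" + text
            result_images[-1] = result_images[-1] or image
    return cks, result_images

print(merge_with_images(["a" * 25, "short", "tail"], ["img1", "img2", "img3"]))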
@@ -759,7 +755,7 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
            for sub_sec in split_sec:
                if re.match(f"^{dels}$", sub_sec):
                    continue
                add_chunk(sub_sec, image, "")
                add_chunk("\n"+sub_sec, image, "")
            line = ""

    if line:
@@ -767,7 +763,7 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
        for sub_sec in split_sec:
            if re.match(f"^{dels}$", sub_sec):
                continue
            add_chunk(sub_sec, image, "")
            add_chunk("\n"+sub_sec, image, "")

    return cks, images

@@ -799,8 +795,8 @@ class Node:
    def __init__(self, level, depth=-1, texts=None):
        self.level = level
        self.depth = depth
        self.texts = texts if texts is not None else []  # holds this node's text content
        self.children = []  # child nodes
        self.texts = texts or []
        self.children = []

    def add_child(self, child_node):
        self.children.append(child_node)
@@ -827,35 +823,51 @@ class Node:
        return f"Node(level={self.level}, texts={self.texts}, children={len(self.children)})"

    def build_tree(self, lines):
        stack = [self]
        for line in lines:
            level, text = line
            node = Node(level=level, texts=[text])

            if level <= self.depth or self.depth == -1:
                while stack and level <= stack[-1].get_level():
                    stack.pop()

                stack[-1].add_child(node)
                stack.append(node)
            else:
        stack = [self]
        for level, text in lines:
            if self.depth != -1 and level > self.depth:
                # Beyond target depth: merge content into the current leaf instead of creating deeper nodes
                stack[-1].add_text(text)
        return self
                continue

            # Move up until we find the proper parent whose level is strictly smaller than current
            while len(stack) > 1 and level <= stack[-1].get_level():
                stack.pop()

            node = Node(level=level, texts=[text])
            # Attach as child of current parent and descend
            stack[-1].add_child(node)
            stack.append(node)

        return self

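Both versions of build_tree above use the same stack discipline: pop until the top of the stack sits at a strictly smaller level, then attach the new node there. A minimal standalone sketch of that discipline on (level, text) pairs, using tuples instead of the Node class:

# Minimal sketch of the stack-based outline build; parent[i] records where each line attaches.
def outline_parents(lines):
    stack = [(-1, -1)]          # each entry is (level, index); index -1 is the virtual root
    parent = []
    for i, (level, _text) in enumerate(lines):
        # Pop headings that are at the same or a deeper level than the new one.
        while len(stack) > 1 and level <= stack[-1][0]:
            stack.pop()
        parent.append(stack[-1][1])   # attach under the closest shallower heading
        stack.append((level, i))
    return parent

lines = [(1, "Chapter 1"), (2, "Section 1.1"), (3, "Body"), (2, "Section 1.2"), (1, "Chapter 2")]
print(outline_parents(lines))  # [-1, 0, 1, 0, -1]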
    def get_tree(self):
        tree_list = []
        self._dfs(self, tree_list, 0, [])
        self._dfs(self, tree_list, [])
        return tree_list

    def _dfs(self, node, tree_list, current_depth, titles):
    def _dfs(self, node, tree_list, titles):
        level = node.get_level()
        texts = node.get_texts()
        child = node.get_children()

        if node.get_texts():
            if 0 < node.get_level() < self.depth:
                titles.extend(node.get_texts())
            else:
                combined_text = ["\n".join(titles + node.get_texts())]
                tree_list.append(combined_text)
        if level == 0 and texts:
            tree_list.append("\n".join(titles+texts))

        # Titles within configured depth are accumulated into the current path
        if 1 <= level <= self.depth:
            path_titles = titles + texts
        else:
            path_titles = titles

        for child in node.get_children():
            self._dfs(child, tree_list, current_depth + 1, titles.copy())
        # Body outside the depth limit becomes its own chunk under the current title path
        if level > self.depth and texts:
            tree_list.append("\n".join(path_titles + texts))

        # A leaf title within depth emits its title path as a chunk (header-only section)
        elif not child and (1 <= level <= self.depth):
            tree_list.append("\n".join(path_titles))

        # Recurse into children with the updated title path
        for c in child:
            self._dfs(c, tree_list, path_titles)
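The reworked _dfs walks the tree once, carrying the accumulated title path, and emits one string per body section, the title path joined with the body by newlines. A compact sketch of the same flattening on a plain nested dict, assuming a depth limit of 2 for titles:

# Standalone sketch: flatten an outline tree into "title path + body" chunks.
def flatten(node, titles, depth_limit=2, out=None):
    out = [] if out is None else out
    level, texts, children = node["level"], node["texts"], node["children"]
    # Titles within the depth limit extend the current path; deeper text becomes a chunk.
    path = titles + texts if 1 <= level <= depth_limit else titles
    if level > depth_limit and texts:
        out.append("\n".join(path + texts))
    elif not children and 1 <= level <= depth_limit:
        out.append("\n".join(path))          # header-only section keeps its title path
    for c in children:
        flatten(c, path, depth_limit, out)
    return out

tree = {"level": 0, "texts": [], "children": [
    {"level": 1, "texts": ["Chapter 1"], "children": [
        {"level": 3, "texts": ["Body paragraph"], "children": []},
        {"level": 2, "texts": ["Section 1.1"], "children": []},
    ]},
]}
print(flatten(tree, []))
# ['Chapter 1\nBody paragraph', 'Chapter 1\nSection 1.1']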
@@ -13,12 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import logging
import re
import math
import os
from collections import OrderedDict
from dataclasses import dataclass

from rag.prompts.generator import relevant_chunks_with_toc
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import rmSpace, get_float
from rag.nlp import rag_tokenizer, query
@@ -69,7 +72,7 @@ class Dealer:
    def search(self, req, idx_names: str | list[str],
               kb_ids: list[str],
               emb_mdl=None,
               highlight=False,
               highlight: bool | list = False,
               rank_feature: dict | None = None
               ):
        filters = self.get_filters(req)
@@ -98,7 +101,11 @@ class Dealer:
            total = self.dataStore.getTotal(res)
            logging.debug("Dealer.search TOTAL: {}".format(total))
        else:
            highlightFields = ["content_ltks", "title_tks"] if highlight else []
            highlightFields = ["content_ltks", "title_tks"]
            if not highlight:
                highlightFields = []
            elif isinstance(highlight, list):
                highlightFields = highlight
            matchText, keywords = self.qryr.question(qst, min_match=0.3)
            if emb_mdl is None:
                matchExprs = [matchText]
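With the widened highlight: bool | list parameter, callers can either toggle the default highlight fields or pass an explicit field list. A small sketch of that normalization in isolation; the field names are the defaults used above:

# Normalize a bool-or-list highlight argument into the list of fields to highlight.
def highlight_fields(highlight) -> list[str]:
    if not highlight:
        return []                                  # highlighting disabled
    if isinstance(highlight, list):
        return highlight                           # caller-specified fields
    return ["content_ltks", "title_tks"]           # default fields when highlight is True

print(highlight_fields(True))              # ['content_ltks', 'title_tks']
print(highlight_fields(["title_tks"]))     # ['title_tks']
print(highlight_fields(False))             # []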
@@ -152,7 +159,7 @@ class Dealer:
                query_vector=q_vec,
                aggregation=aggs,
                highlight=highlight,
                field=self.dataStore.getFields(res, src),
                field=self.dataStore.getFields(res, src + ["_score"]),
                keywords=keywords
            )

@@ -352,10 +359,8 @@ class Dealer:
        if not question:
            return ranks

        RERANK_LIMIT = 64
        RERANK_LIMIT = int(RERANK_LIMIT//page_size + ((RERANK_LIMIT%page_size)/(page_size*1.) + 0.5)) * page_size if page_size>1 else 1
        if RERANK_LIMIT < 1:  # when page_size is very large, RERANK_LIMIT would be 0
            RERANK_LIMIT = 1
        # Ensure RERANK_LIMIT is a multiple of page_size
        RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1
        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
               "question": question, "vector": True, "topk": top,
               "similarity": similarity_threshold,
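The replacement formula simply rounds the 64-candidate rerank window up to the nearest multiple of page_size, so the window always holds whole pages. A quick trace with a few assumed page sizes:

import math

# Round the 64-candidate rerank window up to a whole number of pages.
for page_size in (8, 10, 30, 100):
    rerank_limit = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
    print(page_size, rerank_limit)
# 8 -> 64, 10 -> 70, 30 -> 90, 100 -> 100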
@@ -374,15 +379,26 @@ class Dealer:
                                                          vector_similarity_weight,
                                                          rank_feature=rank_feature)
        else:
            sim, tsim, vsim = self.rerank(
                sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
                rank_feature=rank_feature)
            lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
            if lower_case_doc_engine == "elasticsearch":
                # ElasticSearch doesn't normalize each retrieval way's score before fusion.
                sim, tsim, vsim = self.rerank(
                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
                    rank_feature=rank_feature)
            else:
                # No rerank needed here since Infinity normalizes each retrieval way's score before fusion.
                sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
                sim = [s if s is not None else 0. for s in sim]
                tsim = sim
                vsim = sim
        # Already paginated in the search function
        idx = np.argsort(sim * -1)[(page - 1) * page_size:page * page_size]
        begin = ((page % (RERANK_LIMIT//page_size)) - 1) * page_size
        sim = sim[begin : begin + page_size]
        sim_np = np.array(sim)
        idx = np.argsort(sim_np * -1)
        dim = len(sres.query_vector)
        vector_column = f"q_{dim}_vec"
        zero_vector = [0.0] * dim
        sim_np = np.array(sim)
        filtered_count = (sim_np >= similarity_threshold).sum()
        ranks["total"] = int(filtered_count)  # convert np.int64 to Python int, otherwise JSON serialization fails
        for i in idx:
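Once the per-engine scores are collected, the ranking step is plain numpy: negate the scores so argsort yields a descending order, and count how many survive the similarity threshold for the reported total. A tiny standalone sketch with made-up scores:

import numpy as np

sim = [0.42, 0.91, 0.10, 0.77]           # made-up fused similarity scores
sim_np = np.array(sim)

idx = np.argsort(sim_np * -1)            # negate so argsort sorts high-to-low
print(idx.tolist())                      # [1, 3, 0, 2]

similarity_threshold = 0.5
filtered_count = (sim_np >= similarity_threshold).sum()
print(int(filtered_count))               # 2 (cast to int so it stays JSON-serializable)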
@@ -514,3 +530,63 @@ class Dealer:
        tag_fea = sorted([(a, round(0.1*(c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
                         key=lambda x: x[1] * -1)[:topn_tags]
        return {a.replace(".", "_"): max(1, c) for a, c in tag_fea}

    def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6):
        if not chunks:
            return []
        idx_nms = [index_name(tid) for tid in tenant_ids]
        ranks, doc_id2kb_id = {}, {}
        for ck in chunks:
            if ck["doc_id"] not in ranks:
                ranks[ck["doc_id"]] = 0
            ranks[ck["doc_id"]] += ck["similarity"]
            doc_id2kb_id[ck["doc_id"]] = ck["kb_id"]
        doc_id = sorted(ranks.items(), key=lambda x: x[1]*-1.)[0][0]
        kb_ids = [doc_id2kb_id[doc_id]]
        es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [], OrderByExpr(), 0, 128, idx_nms,
                                       kb_ids)
        toc = []
        dict_chunks = self.dataStore.getFields(es_res, ["content_with_weight"])
        for _, doc in dict_chunks.items():
            try:
                toc.extend(json.loads(doc["content_with_weight"]))
            except Exception as e:
                logging.exception(e)
        if not toc:
            return chunks

        ids = relevant_chunks_with_toc(query, toc, chat_mdl, topn*2)
        if not ids:
            return chunks

        vector_size = 1024
        id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
        for cid, sim in ids:
            if cid in id2idx:
                chunks[id2idx[cid]]["similarity"] += sim
                continue
            chunk = self.dataStore.get(cid, idx_nms, kb_ids)
            d = {
                "chunk_id": cid,
                "content_ltks": chunk["content_ltks"],
                "content_with_weight": chunk["content_with_weight"],
                "doc_id": doc_id,
                "docnm_kwd": chunk.get("docnm_kwd", ""),
                "kb_id": chunk["kb_id"],
                "important_kwd": chunk.get("important_kwd", []),
                "image_id": chunk.get("img_id", ""),
                "similarity": sim,
                "vector_similarity": sim,
                "term_similarity": sim,
                "vector": [0.0] * vector_size,
                "positions": chunk.get("position_int", []),
                "doc_type_kwd": chunk.get("doc_type_kwd", "")
            }
            for k in chunk.keys():
                if k[-4:] == "_vec":
                    d["vector"] = chunk[k]
                    vector_size = len(chunk[k])
                    break
            chunks.append(d)

        return sorted(chunks, key=lambda x: x["similarity"]*-1)[:topn]
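The core of retrieval_by_toc is the final merge: chunk ids that the TOC step re-confirms get their similarity boosted, unseen ids are appended as new candidates, and everything is re-sorted before cutting to topn. A self-contained sketch of just that merge, with toy chunk dicts standing in for the doc-store records:

# Sketch of the similarity-boosting merge; toc_hits mimics relevant_chunks_with_toc output.
def merge_toc_hits(chunks, toc_hits, topn=3):
    id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
    for cid, sim in toc_hits:
        if cid in id2idx:
            chunks[id2idx[cid]]["similarity"] += sim             # boost chunks already retrieved
        else:
            chunks.append({"chunk_id": cid, "similarity": sim})  # add TOC-only candidates
    return sorted(chunks, key=lambda x: x["similarity"] * -1)[:topn]

chunks = [{"chunk_id": "a", "similarity": 0.6}, {"chunk_id": "b", "similarity": 0.4}]
toc_hits = [("b", 0.5), ("c", 0.7)]
print(merge_toc_hits(chunks, toc_hits))
# 'b' is boosted to the top, 'c' enters as a new candidate, 'a' keeps its original score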