Refactor document handling and update Milvus collection settings
- Removed multiple failed document entries from `documents.json`.
- Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`.
- Updated the architecture documentation to reflect the new Milvus collection name.
- Adjusted requirements by removing the `sqlalchemy` dependency.
- Modified test cases to align with the new document structure and naming conventions.
- Introduced a new test file covering Milvus vector index runtime recovery and error handling.
- Updated assertions in various test files to ensure compatibility with the new schema.
This commit is contained in:
@@ -8,18 +8,91 @@ from typing import Any
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class AnswerSource:
|
||||
"""Represent answer source data."""
|
||||
"""Represent answer source data with legacy aliases."""
|
||||
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
doc_title: str
|
||||
chunk_id: str
|
||||
chunk_type: str
|
||||
section_title: str
|
||||
page_number: int
|
||||
page_start: int
|
||||
page_end: int
|
||||
section_level: int
|
||||
chunk_index: int
|
||||
piece_index: int
|
||||
score: float
|
||||
content: str
|
||||
text: str
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
doc_id: str,
|
||||
doc_title: str | None = None,
|
||||
chunk_id: str,
|
||||
chunk_type: str = "",
|
||||
section_title: str = "",
|
||||
page_start: int = 0,
|
||||
page_end: int = 0,
|
||||
section_level: int = 0,
|
||||
chunk_index: int = 0,
|
||||
piece_index: int = 0,
|
||||
score: float = 0.0,
|
||||
text: str | None = None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
doc_name: str | None = None,
|
||||
content: str | None = None,
|
||||
page_number: int | None = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
"""Initialize the answer source while accepting legacy field names."""
|
||||
self.doc_id = doc_id
|
||||
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
|
||||
self.chunk_id = chunk_id
|
||||
self.chunk_type = chunk_type
|
||||
self.section_title = section_title
|
||||
self.page_start = int(page_start or page_number or 0)
|
||||
self.page_end = int(page_end or self.page_start)
|
||||
self.section_level = int(section_level or 0)
|
||||
self.chunk_index = int(chunk_index or 0)
|
||||
self.piece_index = int(piece_index or 0)
|
||||
self.score = float(score)
|
||||
self.text = text if text is not None else (content or "")
|
||||
self.metadata = dict(metadata or {})
|
||||
|
||||
@property
|
||||
def doc_name(self) -> str:
|
||||
"""Return the legacy document name alias."""
|
||||
return self.doc_title
|
||||
|
||||
@doc_name.setter
|
||||
def doc_name(self, value: str) -> None:
|
||||
"""Update the legacy document name alias."""
|
||||
self.doc_title = value
|
||||
|
||||
@property
|
||||
def content(self) -> str:
|
||||
"""Return the legacy content alias."""
|
||||
return self.text
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str) -> None:
|
||||
"""Update the legacy content alias."""
|
||||
self.text = value
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
"""Return the legacy page number alias."""
|
||||
return self.page_start
|
||||
|
||||
@page_number.setter
|
||||
def page_number(self, value: int) -> None:
|
||||
"""Update the legacy page number alias."""
|
||||
self.page_start = value
|
||||
self.page_end = max(self.page_end, value)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConversationMessage:
|
||||
|
||||
@@ -60,23 +60,117 @@ class ParsedDocument:
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class Chunk:
|
||||
"""Represent the Chunk type."""
|
||||
"""Represent one retrieval chunk with backward-compatible aliases."""
|
||||
|
||||
chunk_id: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
content: str
|
||||
doc_title: str
|
||||
text: str
|
||||
embedding_text: str
|
||||
chunk_type: str = ""
|
||||
chunk_index: int = 0
|
||||
piece_index: int = 0
|
||||
page_start: int = 0
|
||||
page_end: int = 0
|
||||
section_title: str = ""
|
||||
section_path: list[str] = field(default_factory=list)
|
||||
page_number: int = 0
|
||||
section_level: int = 0
|
||||
source_ids: list[str] = field(default_factory=list)
|
||||
regulation_type: str = ""
|
||||
version: str = ""
|
||||
semantic_id: str = ""
|
||||
block_type: str = ""
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
chunk_id: str,
|
||||
doc_id: str,
|
||||
doc_title: str | None = None,
|
||||
text: str | None = None,
|
||||
embedding_text: str = "",
|
||||
chunk_type: str = "",
|
||||
chunk_index: int = 0,
|
||||
piece_index: int = 0,
|
||||
page_start: int = 0,
|
||||
page_end: int = 0,
|
||||
section_title: str = "",
|
||||
section_path: list[str] | None = None,
|
||||
section_level: int = 0,
|
||||
source_ids: list[str] | None = None,
|
||||
regulation_type: str = "",
|
||||
version: str = "",
|
||||
semantic_id: str = "",
|
||||
metadata: dict[str, Any] | None = None,
|
||||
doc_name: str | None = None,
|
||||
content: str | None = None,
|
||||
page_number: int | None = None,
|
||||
block_type: str | None = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
"""Initialize the chunk while accepting legacy field names."""
|
||||
self.chunk_id = chunk_id
|
||||
self.doc_id = doc_id
|
||||
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
|
||||
self.text = text if text is not None else (content or "")
|
||||
self.embedding_text = embedding_text or self.text
|
||||
self.chunk_type = chunk_type or (block_type or "")
|
||||
self.chunk_index = int(chunk_index or 0)
|
||||
self.piece_index = int(piece_index or 0)
|
||||
self.page_start = int(page_start or page_number or 0)
|
||||
self.page_end = int(page_end or self.page_start)
|
||||
self.section_title = section_title
|
||||
self.section_path = list(section_path or [])
|
||||
self.section_level = int(section_level or 0)
|
||||
self.source_ids = list(source_ids or [])
|
||||
self.regulation_type = regulation_type
|
||||
self.version = version
|
||||
self.semantic_id = semantic_id
|
||||
self.metadata = dict(metadata or {})
|
||||
|
||||
@property
|
||||
def doc_name(self) -> str:
|
||||
"""Return the legacy document name alias."""
|
||||
return self.doc_title
|
||||
|
||||
@doc_name.setter
|
||||
def doc_name(self, value: str) -> None:
|
||||
"""Update the legacy document name alias."""
|
||||
self.doc_title = value
|
||||
|
||||
@property
|
||||
def content(self) -> str:
|
||||
"""Return the legacy content alias."""
|
||||
return self.text
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str) -> None:
|
||||
"""Update the legacy content alias."""
|
||||
self.text = value
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
"""Return the legacy page number alias."""
|
||||
return self.page_start
|
||||
|
||||
@page_number.setter
|
||||
def page_number(self, value: int) -> None:
|
||||
"""Update the legacy page number alias."""
|
||||
self.page_start = value
|
||||
self.page_end = max(self.page_end, value)
|
||||
|
||||
@property
|
||||
def block_type(self) -> str:
|
||||
"""Return the legacy block type alias."""
|
||||
return self.chunk_type
|
||||
|
||||
@block_type.setter
|
||||
def block_type(self, value: str) -> None:
|
||||
"""Update the legacy block type alias."""
|
||||
self.chunk_type = value
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocumentProcessingRun:
|
||||
|
||||
@@ -16,14 +16,88 @@ class RetrievalQuery:
|
||||
filters: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class RetrievedChunk:
|
||||
"""Represent the Retrieved Chunk type."""
|
||||
"""Represent the retrieved chunk payload with legacy aliases."""
|
||||
|
||||
chunk_id: str
|
||||
doc_id: str
|
||||
doc_name: str
|
||||
content: str
|
||||
doc_title: str
|
||||
text: str
|
||||
score: float
|
||||
chunk_type: str = ""
|
||||
section_title: str = ""
|
||||
page_number: int = 0
|
||||
page_start: int = 0
|
||||
page_end: int = 0
|
||||
section_level: int = 0
|
||||
chunk_index: int = 0
|
||||
piece_index: int = 0
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
chunk_id: str,
|
||||
doc_id: str,
|
||||
doc_title: str | None = None,
|
||||
text: str | None = None,
|
||||
score: float = 0.0,
|
||||
chunk_type: str = "",
|
||||
section_title: str = "",
|
||||
page_start: int = 0,
|
||||
page_end: int = 0,
|
||||
section_level: int = 0,
|
||||
chunk_index: int = 0,
|
||||
piece_index: int = 0,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
doc_name: str | None = None,
|
||||
content: str | None = None,
|
||||
page_number: int | None = None,
|
||||
block_type: str | None = None,
|
||||
**_: Any,
|
||||
) -> None:
|
||||
"""Initialize the retrieved chunk while accepting legacy field names."""
|
||||
self.chunk_id = chunk_id
|
||||
self.doc_id = doc_id
|
||||
self.doc_title = doc_title if doc_title is not None else (doc_name or "")
|
||||
self.text = text if text is not None else (content or "")
|
||||
self.score = float(score)
|
||||
self.chunk_type = chunk_type or (block_type or "")
|
||||
self.section_title = section_title
|
||||
self.page_start = int(page_start or page_number or 0)
|
||||
self.page_end = int(page_end or self.page_start)
|
||||
self.section_level = int(section_level or 0)
|
||||
self.chunk_index = int(chunk_index or 0)
|
||||
self.piece_index = int(piece_index or 0)
|
||||
self.metadata = dict(metadata or {})
|
||||
|
||||
@property
|
||||
def doc_name(self) -> str:
|
||||
"""Return the legacy document name alias."""
|
||||
return self.doc_title
|
||||
|
||||
@doc_name.setter
|
||||
def doc_name(self, value: str) -> None:
|
||||
"""Update the legacy document name alias."""
|
||||
self.doc_title = value
|
||||
|
||||
@property
|
||||
def content(self) -> str:
|
||||
"""Return the legacy content alias."""
|
||||
return self.text
|
||||
|
||||
@content.setter
|
||||
def content(self, value: str) -> None:
|
||||
"""Update the legacy content alias."""
|
||||
self.text = value
|
||||
|
||||
@property
|
||||
def page_number(self) -> int:
|
||||
"""Return the legacy page number alias."""
|
||||
return self.page_start
|
||||
|
||||
@page_number.setter
|
||||
def page_number(self, value: int) -> None:
|
||||
"""Update the legacy page number alias."""
|
||||
self.page_start = value
|
||||
self.page_end = max(self.page_end, value)
|
||||
|
||||
Reference in New Issue
Block a user