- Removed multiple failed document entries from `documents.json`. - Added a new document entry with updated metadata and changed the index name to `regulations_dense_1024_v2`. - Updated architecture documentation to reflect changes in the Milvus collection name. - Adjusted requirements by removing the sqlalchemy dependency. - Modified test cases to align with new document structure and naming conventions. - Introduced a new test file for Milvus vector index runtime recovery and error handling. - Updated assertions in various test files to ensure compatibility with the new schema.
122 lines
6.1 KiB
PL/PgSQL
122 lines
6.1 KiB
PL/PgSQL
-- 法规文档向量检索系统数据库表结构
|
||
-- PostgreSQL
|
||
|
||
-- ==================== 文档表 ====================
|
||
CREATE TABLE documents (
|
||
id SERIAL PRIMARY KEY,
|
||
doc_id VARCHAR(128) UNIQUE NOT NULL, -- 文档唯一标识,如 "GB14747-2006"
|
||
title VARCHAR(512) NOT NULL, -- 文档标题
|
||
doc_type VARCHAR(32), -- 文档类型:标准/法规/规范
|
||
standard_number VARCHAR(64), -- 标准编号:如 "GB 14747-2006"
|
||
publish_date DATE, -- 发布日期
|
||
implement_date DATE, -- 实施日期
|
||
status VARCHAR(32), -- 状态:现行/废止/修订
|
||
source_url VARCHAR(512), -- 来源 URL
|
||
file_path VARCHAR(512), -- 本地 PDF 文件路径
|
||
file_size INT, -- 文件大小(字节)
|
||
upload_time TIMESTAMP DEFAULT NOW(), -- 上传时间
|
||
created_at TIMESTAMP DEFAULT NOW(),
|
||
updated_at TIMESTAMP DEFAULT NOW()
|
||
);
|
||
|
||
COMMENT ON TABLE documents IS '文档元数据表';
|
||
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
|
||
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
|
||
|
||
-- ==================== 章节结构表 ====================
|
||
CREATE TABLE sections (
|
||
id SERIAL PRIMARY KEY,
|
||
doc_id VARCHAR(128) NOT NULL,
|
||
unique_id VARCHAR(64) NOT NULL, -- 阿里云返回的唯一标识
|
||
level INT NOT NULL, -- 层级:1, 2, 3...
|
||
title VARCHAR(512) NOT NULL, -- 章节标题
|
||
page INT, -- 所在页码
|
||
index INT, -- 页内顺序
|
||
parent_id INT, -- 父章节 ID(树形结构)
|
||
created_at TIMESTAMP DEFAULT NOW(),
|
||
|
||
CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
|
||
CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
|
||
CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
|
||
);
|
||
|
||
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
|
||
COMMENT ON COLUMN sections.parent_id IS '父章节 ID,构建树形结构';
|
||
COMMENT ON COLUMN sections.level IS '层级深度,1 为最顶层';
|
||
|
||
-- ==================== 语义块表 ====================
|
||
CREATE TABLE semantic_blocks (
|
||
id SERIAL PRIMARY KEY,
|
||
doc_id VARCHAR(128) NOT NULL,
|
||
semantic_id VARCHAR(64) NOT NULL, -- 语义块唯一标识
|
||
block_type VARCHAR(32) NOT NULL, -- 类型:section_text/table/figure
|
||
page_start INT NOT NULL, -- 起始页码
|
||
page_end INT NOT NULL, -- 结束页码
|
||
section_id INT, -- 所属章节
|
||
section_title VARCHAR(512), -- 章节标题(冗余,方便查询)
|
||
section_level INT, -- 章节层级
|
||
source_ids JSONB, -- 原始 layout IDs(JSON 数组)
|
||
text TEXT NOT NULL, -- 完整内容(未被切分)
|
||
created_at TIMESTAMP DEFAULT NOW(),
|
||
|
||
CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
|
||
CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
|
||
CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
|
||
);
|
||
|
||
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
|
||
COMMENT ON COLUMN semantic_blocks.block_type IS '类型:section_text(正文)、table(表格)、figure(图示)';
|
||
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
|
||
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
|
||
|
||
-- ==================== 向量块元数据表 ====================
|
||
CREATE TABLE vector_chunks (
|
||
id SERIAL PRIMARY KEY,
|
||
doc_id VARCHAR(128) NOT NULL,
|
||
chunk_id VARCHAR(64) NOT NULL, -- Milvus 主键
|
||
semantic_id VARCHAR(64) NOT NULL, -- 关联语义块
|
||
chunk_index INT NOT NULL, -- 切片序号(全局)
|
||
piece_index INT, -- 同语义块内的切片序号
|
||
page_start INT,
|
||
page_end INT,
|
||
section_title VARCHAR(512),
|
||
text VARCHAR(2048), -- 切片文本(可选,缩短版用于展示)
|
||
source_ids JSONB, -- 原始 layout IDs(JSON 数组)
|
||
created_at TIMESTAMP DEFAULT NOW(),
|
||
|
||
CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
|
||
CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
|
||
REFERENCES semantic_blocks(doc_id, semantic_id),
|
||
CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
|
||
);
|
||
|
||
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
|
||
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
|
||
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
|
||
|
||
-- ==================== 索引 ====================
|
||
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
|
||
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
|
||
CREATE INDEX idx_sections_level ON sections(level);
|
||
|
||
CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
|
||
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
|
||
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
|
||
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
|
||
|
||
CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
|
||
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
|
||
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
|
||
|
||
-- ==================== 触发器:自动更新 updated_at ====================
|
||
CREATE OR REPLACE FUNCTION update_updated_at()
|
||
RETURNS TRIGGER AS $$
|
||
BEGIN
|
||
NEW.updated_at = NOW();
|
||
RETURN NEW;
|
||
END;
|
||
$$ LANGUAGE plpgsql;
|
||
|
||
CREATE TRIGGER tr_documents_updated_at
|
||
BEFORE UPDATE ON documents
|
||
FOR EACH ROW EXECUTE FUNCTION update_updated_at(); |