-- 法规文档向量检索系统数据库表结构 -- PostgreSQL -- ==================== 文档表 ==================== CREATE TABLE documents ( id SERIAL PRIMARY KEY, doc_id VARCHAR(128) UNIQUE NOT NULL, -- 文档唯一标识,如 "GB14747-2006" title VARCHAR(512) NOT NULL, -- 文档标题 doc_type VARCHAR(32), -- 文档类型:标准/法规/规范 standard_number VARCHAR(64), -- 标准编号:如 "GB 14747-2006" publish_date DATE, -- 发布日期 implement_date DATE, -- 实施日期 status VARCHAR(32), -- 状态:现行/废止/修订 source_url VARCHAR(512), -- 来源 URL file_path VARCHAR(512), -- 本地 PDF 文件路径 file_size INT, -- 文件大小(字节) upload_time TIMESTAMP DEFAULT NOW(), -- 上传时间 created_at TIMESTAMP DEFAULT NOW(), updated_at TIMESTAMP DEFAULT NOW() ); COMMENT ON TABLE documents IS '文档元数据表'; COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表'; COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006'; -- ==================== 章节结构表 ==================== CREATE TABLE sections ( id SERIAL PRIMARY KEY, doc_id VARCHAR(128) NOT NULL, unique_id VARCHAR(64) NOT NULL, -- 阿里云返回的唯一标识 level INT NOT NULL, -- 层级:1, 2, 3... title VARCHAR(512) NOT NULL, -- 章节标题 page INT, -- 所在页码 index INT, -- 页内顺序 parent_id INT, -- 父章节 ID(树形结构) created_at TIMESTAMP DEFAULT NOW(), CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id), CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id), CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id) ); COMMENT ON TABLE sections IS '章节结构表,用于目录导航'; COMMENT ON COLUMN sections.parent_id IS '父章节 ID,构建树形结构'; COMMENT ON COLUMN sections.level IS '层级深度,1 为最顶层'; -- ==================== 语义块表 ==================== CREATE TABLE semantic_blocks ( id SERIAL PRIMARY KEY, doc_id VARCHAR(128) NOT NULL, semantic_id VARCHAR(64) NOT NULL, -- 语义块唯一标识 block_type VARCHAR(32) NOT NULL, -- 类型:section_text/table/figure page_start INT NOT NULL, -- 起始页码 page_end INT NOT NULL, -- 结束页码 section_id INT, -- 所属章节 section_title VARCHAR(512), -- 章节标题(冗余,方便查询) section_level INT, -- 章节层级 source_ids JSONB, -- 原始 layout IDs(JSON 数组) text TEXT NOT NULL, -- 完整内容(未被切分) created_at TIMESTAMP DEFAULT NOW(), CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id), CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id), CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id) ); COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容'; COMMENT ON COLUMN semantic_blocks.block_type IS '类型:section_text(正文)、table(表格)、figure(图示)'; COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组'; COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分'; -- ==================== 向量块元数据表 ==================== CREATE TABLE vector_chunks ( id SERIAL PRIMARY KEY, doc_id VARCHAR(128) NOT NULL, chunk_id VARCHAR(64) NOT NULL, -- Milvus 主键 semantic_id VARCHAR(64) NOT NULL, -- 关联语义块 chunk_index INT NOT NULL, -- 切片序号(全局) piece_index INT, -- 同语义块内的切片序号 page_start INT, page_end INT, section_title VARCHAR(512), text VARCHAR(2048), -- 切片文本(可选,缩短版用于展示) source_ids JSONB, -- 原始 layout IDs(JSON 数组) created_at TIMESTAMP DEFAULT NOW(), CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id), CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id) REFERENCES semantic_blocks(doc_id, semantic_id), CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id) ); COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询'; COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键'; COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接'; -- ==================== 索引 ==================== CREATE INDEX idx_sections_doc_id ON sections(doc_id); CREATE INDEX idx_sections_parent_id ON sections(parent_id); CREATE INDEX idx_sections_level ON sections(level); CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id); CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id); CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type); CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id); CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id); CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id); CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id); -- ==================== 触发器:自动更新 updated_at ==================== CREATE OR REPLACE FUNCTION update_updated_at() RETURNS TRIGGER AS $$ BEGIN NEW.updated_at = NOW(); RETURN NEW; END; $$ LANGUAGE plpgsql; CREATE TRIGGER tr_documents_updated_at BEFORE UPDATE ON documents FOR EACH ROW EXECUTE FUNCTION update_updated_at();