Files
AIRegulation-DocAnalysis/aliyun_parser/schema.sql
wangwei dcda7e0423 @
chore: delete old layout/common/tabs components before redesign
@
2026-06-03 16:58:35 +08:00

122 lines
6.1 KiB
PL/PgSQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- 法规文档向量检索系统数据库表结构
-- PostgreSQL
-- ==================== 文档表 ====================
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) UNIQUE NOT NULL, -- 文档唯一标识,如 "GB14747-2006"
title VARCHAR(512) NOT NULL, -- 文档标题
doc_type VARCHAR(32), -- 文档类型:标准/法规/规范
standard_number VARCHAR(64), -- 标准编号:如 "GB 14747-2006"
publish_date DATE, -- 发布日期
implement_date DATE, -- 实施日期
status VARCHAR(32), -- 状态:现行/废止/修订
source_url VARCHAR(512), -- 来源 URL
file_path VARCHAR(512), -- 本地 PDF 文件路径
file_size INT, -- 文件大小(字节)
upload_time TIMESTAMP DEFAULT NOW(), -- 上传时间
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
COMMENT ON TABLE documents IS '文档元数据表';
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
-- ==================== 章节结构表 ====================
CREATE TABLE sections (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
unique_id VARCHAR(64) NOT NULL, -- 阿里云返回的唯一标识
level INT NOT NULL, -- 层级1, 2, 3...
title VARCHAR(512) NOT NULL, -- 章节标题
page INT, -- 所在页码
index INT, -- 页内顺序
parent_id INT, -- 父章节 ID树形结构
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
);
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
COMMENT ON COLUMN sections.parent_id IS '父章节 ID构建树形结构';
COMMENT ON COLUMN sections.level IS '层级深度1 为最顶层';
-- ==================== 语义块表 ====================
CREATE TABLE semantic_blocks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
semantic_id VARCHAR(64) NOT NULL, -- 语义块唯一标识
block_type VARCHAR(32) NOT NULL, -- 类型section_text/table/figure
page_start INT NOT NULL, -- 起始页码
page_end INT NOT NULL, -- 结束页码
section_id INT, -- 所属章节
section_title VARCHAR(512), -- 章节标题(冗余,方便查询)
section_level INT, -- 章节层级
source_ids JSONB, -- 原始 layout IDsJSON 数组)
text TEXT NOT NULL, -- 完整内容(未被切分)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
);
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
COMMENT ON COLUMN semantic_blocks.block_type IS '类型section_text正文、table表格、figure图示';
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
-- ==================== 向量块元数据表 ====================
CREATE TABLE vector_chunks (
id SERIAL PRIMARY KEY,
doc_id VARCHAR(128) NOT NULL,
chunk_id VARCHAR(64) NOT NULL, -- Milvus 主键
semantic_id VARCHAR(64) NOT NULL, -- 关联语义块
chunk_index INT NOT NULL, -- 切片序号(全局)
piece_index INT, -- 同语义块内的切片序号
page_start INT,
page_end INT,
section_title VARCHAR(512),
text VARCHAR(2048), -- 切片文本(可选,缩短版用于展示)
source_ids JSONB, -- 原始 layout IDsJSON 数组)
created_at TIMESTAMP DEFAULT NOW(),
CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
REFERENCES semantic_blocks(doc_id, semantic_id),
CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
);
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
-- ==================== 索引 ====================
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
CREATE INDEX idx_sections_level ON sections(level);
CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
-- ==================== 触发器:自动更新 updated_at ====================
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER tr_documents_updated_at
BEFORE UPDATE ON documents
FOR EACH ROW EXECUTE FUNCTION update_updated_at();