Delete log file for May 14, 2026, to clean up unnecessary data and maintain log management.
This commit is contained in:
122
backend/app/aliyun_parser/schema.sql
Normal file
122
backend/app/aliyun_parser/schema.sql
Normal file
@@ -0,0 +1,122 @@
|
||||
-- 法规文档向量检索系统数据库表结构
|
||||
-- PostgreSQL
|
||||
|
||||
-- ==================== 文档表 ====================
|
||||
CREATE TABLE documents (
|
||||
id SERIAL PRIMARY KEY,
|
||||
doc_id VARCHAR(128) UNIQUE NOT NULL, -- 文档唯一标识,如 "GB14747-2006"
|
||||
title VARCHAR(512) NOT NULL, -- 文档标题
|
||||
doc_type VARCHAR(32), -- 文档类型:标准/法规/规范
|
||||
standard_number VARCHAR(64), -- 标准编号:如 "GB 14747-2006"
|
||||
publish_date DATE, -- 发布日期
|
||||
implement_date DATE, -- 实施日期
|
||||
status VARCHAR(32), -- 状态:现行/废止/修订
|
||||
source_url VARCHAR(512), -- 来源 URL
|
||||
file_path VARCHAR(512), -- 本地 PDF 文件路径
|
||||
file_size INT, -- 文件大小(字节)
|
||||
upload_time TIMESTAMP DEFAULT NOW(), -- 上传时间
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
updated_at TIMESTAMP DEFAULT NOW()
|
||||
);
|
||||
|
||||
COMMENT ON TABLE documents IS '文档元数据表';
|
||||
COMMENT ON COLUMN documents.doc_id IS '文档唯一标识,用于关联 Milvus 和其他表';
|
||||
COMMENT ON COLUMN documents.standard_number IS '标准编号,如 GB 14747-2006';
|
||||
|
||||
-- ==================== 章节结构表 ====================
|
||||
CREATE TABLE sections (
|
||||
id SERIAL PRIMARY KEY,
|
||||
doc_id VARCHAR(128) NOT NULL,
|
||||
unique_id VARCHAR(64) NOT NULL, -- 阿里云返回的唯一标识
|
||||
level INT NOT NULL, -- 层级:1, 2, 3...
|
||||
title VARCHAR(512) NOT NULL, -- 章节标题
|
||||
page INT, -- 所在页码
|
||||
index INT, -- 页内顺序
|
||||
parent_id INT, -- 父章节 ID(树形结构)
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT fk_sections_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
|
||||
CONSTRAINT fk_sections_parent_id FOREIGN KEY (parent_id) REFERENCES sections(id),
|
||||
CONSTRAINT uq_sections_doc_unique UNIQUE (doc_id, unique_id)
|
||||
);
|
||||
|
||||
COMMENT ON TABLE sections IS '章节结构表,用于目录导航';
|
||||
COMMENT ON COLUMN sections.parent_id IS '父章节 ID,构建树形结构';
|
||||
COMMENT ON COLUMN sections.level IS '层级深度,1 为最顶层';
|
||||
|
||||
-- ==================== 语义块表 ====================
|
||||
CREATE TABLE semantic_blocks (
|
||||
id SERIAL PRIMARY KEY,
|
||||
doc_id VARCHAR(128) NOT NULL,
|
||||
semantic_id VARCHAR(64) NOT NULL, -- 语义块唯一标识
|
||||
block_type VARCHAR(32) NOT NULL, -- 类型:section_text/table/figure
|
||||
page_start INT NOT NULL, -- 起始页码
|
||||
page_end INT NOT NULL, -- 结束页码
|
||||
section_id INT, -- 所属章节
|
||||
section_title VARCHAR(512), -- 章节标题(冗余,方便查询)
|
||||
section_level INT, -- 章节层级
|
||||
source_ids JSONB, -- 原始 layout IDs(JSON 数组)
|
||||
text TEXT NOT NULL, -- 完整内容(未被切分)
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT fk_semantic_blocks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
|
||||
CONSTRAINT fk_semantic_blocks_section_id FOREIGN KEY (section_id) REFERENCES sections(id),
|
||||
CONSTRAINT uq_semantic_blocks_doc_semantic UNIQUE (doc_id, semantic_id)
|
||||
);
|
||||
|
||||
COMMENT ON TABLE semantic_blocks IS '语义块表,用于邻域扩展,恢复完整内容';
|
||||
COMMENT ON COLUMN semantic_blocks.block_type IS '类型:section_text(正文)、table(表格)、figure(图示)';
|
||||
COMMENT ON COLUMN semantic_blocks.source_ids IS '原始阿里云 layout 的 uniqueId 数组';
|
||||
COMMENT ON COLUMN semantic_blocks.text IS '完整语义内容,未被切分';
|
||||
|
||||
-- ==================== 向量块元数据表 ====================
|
||||
CREATE TABLE vector_chunks (
|
||||
id SERIAL PRIMARY KEY,
|
||||
doc_id VARCHAR(128) NOT NULL,
|
||||
chunk_id VARCHAR(64) NOT NULL, -- Milvus 主键
|
||||
semantic_id VARCHAR(64) NOT NULL, -- 关联语义块
|
||||
chunk_index INT NOT NULL, -- 切片序号(全局)
|
||||
piece_index INT, -- 同语义块内的切片序号
|
||||
page_start INT,
|
||||
page_end INT,
|
||||
section_title VARCHAR(512),
|
||||
text VARCHAR(2048), -- 切片文本(可选,缩短版用于展示)
|
||||
source_ids JSONB, -- 原始 layout IDs(JSON 数组)
|
||||
created_at TIMESTAMP DEFAULT NOW(),
|
||||
|
||||
CONSTRAINT fk_vector_chunks_doc_id FOREIGN KEY (doc_id) REFERENCES documents(doc_id),
|
||||
CONSTRAINT fk_vector_chunks_semantic_id FOREIGN KEY (doc_id, semantic_id)
|
||||
REFERENCES semantic_blocks(doc_id, semantic_id),
|
||||
CONSTRAINT uq_vector_chunks_doc_chunk UNIQUE (doc_id, chunk_id)
|
||||
);
|
||||
|
||||
COMMENT ON TABLE vector_chunks IS '向量块元数据表,用于快速关联查询';
|
||||
COMMENT ON COLUMN vector_chunks.chunk_id IS 'Milvus 向量库主键';
|
||||
COMMENT ON COLUMN vector_chunks.piece_index IS '同语义块内的切片序号,用于按序拼接';
|
||||
|
||||
-- ==================== 索引 ====================
|
||||
CREATE INDEX idx_sections_doc_id ON sections(doc_id);
|
||||
CREATE INDEX idx_sections_parent_id ON sections(parent_id);
|
||||
CREATE INDEX idx_sections_level ON sections(level);
|
||||
|
||||
CREATE INDEX idx_semantic_blocks_doc_id ON semantic_blocks(doc_id);
|
||||
CREATE INDEX idx_semantic_blocks_section_id ON semantic_blocks(section_id);
|
||||
CREATE INDEX idx_semantic_blocks_block_type ON semantic_blocks(block_type);
|
||||
CREATE INDEX idx_semantic_blocks_semantic_id ON semantic_blocks(semantic_id);
|
||||
|
||||
CREATE INDEX idx_vector_chunks_doc_id ON vector_chunks(doc_id);
|
||||
CREATE INDEX idx_vector_chunks_semantic_id ON vector_chunks(semantic_id);
|
||||
CREATE INDEX idx_vector_chunks_chunk_id ON vector_chunks(chunk_id);
|
||||
|
||||
-- ==================== 触发器:自动更新 updated_at ====================
|
||||
CREATE OR REPLACE FUNCTION update_updated_at()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
NEW.updated_at = NOW();
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE TRIGGER tr_documents_updated_at
|
||||
BEFORE UPDATE ON documents
|
||||
FOR EACH ROW EXECUTE FUNCTION update_updated_at();
|
||||
Reference in New Issue
Block a user