- Updated LocalDocumentParser to include raw_layouts and artifact_prefix from settings. - Added new documents with failure reasons and metadata to documents.json for better error tracking. - Created a new documentation file detailing the Aliyun ingest implementation process. - Updated RFC to reflect changes in the parsing backend and embedding dimensions. - Modified tests to accommodate the new embedding dimension of 1024 and updated parser and chunk builder assertions. - Verified migration configurations to ensure correct settings for embedding model and backend.
217 lines
7.1 KiB
Python
217 lines
7.1 KiB
Python
"""
|
|
Post-migration backend smoke checks.
|
|
|
|
Purpose:
|
|
1. Verify the new architecture modules can be imported
|
|
2. Verify migration-critical config matches the RFC
|
|
3. Verify external dependencies when they are available
|
|
4. Optionally verify the real ingest path with a sample document
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(PROJECT_ROOT / "backend"))
|
|
|
|
from loguru import logger
|
|
|
|
from app.config.logging import setup_logging
|
|
from app.config.settings import settings
|
|
from app.shared.bootstrap import (
|
|
get_document_command_service,
|
|
get_retrieval_service,
|
|
get_vector_index,
|
|
)
|
|
|
|
setup_logging(level="INFO")
|
|
|
|
|
|
def verify_service_wiring() -> bool:
    """Check that the API app and the application service classes import cleanly.

    Returns:
        True when every import succeeds and each imported name is not None;
        False when anything raises along the way (the error is logged).
    """
    banner = "=" * 60
    for text in (banner, "Step 1: verify module wiring", banner):
        logger.info(text)

    try:
        # A failing import here is caught below and reported as a failed check.
        from app.api.main import app
        from app.application.agent import AgentConversationService
        from app.application.documents import DocumentCommandService, DocumentQueryService
        from app.application.knowledge import KnowledgeRetrievalService
        from app.shared import bootstrap

        imported_names = (
            app,
            DocumentCommandService,
            DocumentQueryService,
            KnowledgeRetrievalService,
            AgentConversationService,
            bootstrap,
        )
        for imported in imported_names:
            assert imported is not None

        logger.success("module wiring ok")
        return True
    except Exception as err:
        logger.error(f"module wiring failed: {err}")
        return False
|
|
|
|
|
|
def verify_migration_config() -> bool:
    """Verify migration-critical config values.

    Each checked setting is compared against the value this script expects.
    Every mismatch is logged by name together with the expected and the actual
    value, so a failing run says exactly which setting is wrong.

    Returns:
        True when all checked settings match; False on any mismatch or when
        reading a setting raises.
    """
    logger.info("=" * 60)
    logger.info("Step 2: verify migration config")
    logger.info("=" * 60)

    expected_values = {
        "embedding_model": "text-embedding-v3",
        "embedding_dim": 1024,
        "milvus_collection": "regulations_dense_1024_v1",
        "parser_backend": "aliyun",
        "chunk_backend": "aliyun",
    }

    try:
        # Explicit comparisons instead of ``assert``: assert statements are
        # removed under ``python -O``, and a bare AssertionError has an empty
        # message, which left the failure log without the offending setting.
        mismatches = []
        for name, expected in expected_values.items():
            actual = getattr(settings, name)
            if actual != expected:
                mismatches.append(f"{name}: expected {expected!r}, got {actual!r}")

        if mismatches:
            for detail in mismatches:
                logger.error(f"migration config mismatch: {detail}")
            return False

        logger.info(f"embedding_model={settings.embedding_model}")
        logger.info(f"embedding_base_url={settings.embedding_base_url}")
        logger.info(f"embedding_dim={settings.embedding_dim}")
        logger.info(f"milvus_collection={settings.milvus_collection}")
        logger.info(f"parser_backend={settings.parser_backend}")
        logger.info(f"chunk_backend={settings.chunk_backend}")
        logger.success("migration config ok")
        return True
    except Exception as exc:
        # Reached e.g. when a setting attribute is missing altogether.
        logger.error(f"migration config mismatch: {exc}")
        return False
|
|
|
|
|
|
def verify_minio_connection() -> bool:
    """Check that the document command service exposes a binary store.

    NOTE(review): this only confirms that ``binary_store`` can be obtained and
    is not None; whether building the service actually contacts MinIO is not
    visible from this function -- confirm against the bootstrap code.

    Returns:
        True when a binary store object is available; False when obtaining it
        raises or it is None (the reason is logged).
    """
    logger.info("=" * 60)
    logger.info("Step 3: verify MinIO connection")
    logger.info("=" * 60)

    try:
        binary_store = get_document_command_service().binary_store
        # Explicit check instead of ``assert``: asserts are removed under
        # ``python -O`` and would otherwise log an empty failure reason.
        if binary_store is None:
            raise RuntimeError("document command service has no binary store")
        logger.success("MinIO connection ok")
        return True
    except Exception as exc:
        logger.error(f"MinIO connection failed: {exc}")
        logger.info("start MinIO first or update .env storage settings")
        return False
|
|
|
|
|
|
def verify_milvus_connection() -> bool:
    """Call ``health()`` on the vector index and report whether it succeeded.

    Returns:
        True when the health call returns without raising (its result is
        logged); False otherwise, with a hint about starting Milvus.
    """
    separator = "=" * 60
    for text in (separator, "Step 4: verify Milvus connection", separator):
        logger.info(text)

    try:
        vector_index = get_vector_index()
        status = vector_index.health()
        logger.info(f"Milvus health: {status}")
        logger.success("Milvus connection ok")
    except Exception as err:
        logger.error(f"Milvus connection failed: {err}")
        logger.info("start Milvus first or update .env vector settings")
        return False
    return True
|
|
|
|
|
|
def verify_ingest_pipeline(sample_file: Path) -> bool:
    """Push a real file through upload-and-process, then run one retrieval.

    Args:
        sample_file: Path to a PDF, DOC, or DOCX file to ingest.

    Returns:
        True when processing reports status "indexed" and the follow-up
        retrieval call completes; False when a precondition fails (missing
        file, unsupported extension, missing Aliyun credentials), when the
        status is anything else, or when any step raises.
    """
    rule = "=" * 60
    for text in (rule, "Step 5: verify real ingest pipeline", rule):
        logger.info(text)

    # Preconditions: bail out before touching any external service.
    if not sample_file.exists():
        logger.error(f"sample file not found: {sample_file}")
        return False

    extension = sample_file.suffix.lower()
    if extension not in (".pdf", ".doc", ".docx"):
        logger.error("sample file must be PDF, DOC, or DOCX")
        return False

    if not (settings.alibaba_access_key_id and settings.alibaba_access_key_secret):
        logger.error("missing Aliyun parser credentials")
        return False

    try:
        upload_and_process = get_document_command_service().upload_and_process
        outcome = upload_and_process(
            file_name=sample_file.name,
            content=sample_file.read_bytes(),
            content_type=_guess_content_type(sample_file),
            doc_name=sample_file.stem,
            regulation_type="smoke-test",
            version="migration",
            generate_summary=False,
        )
        logger.info(f"process result: doc_id={outcome.doc_id}, status={outcome.status}, chunks={outcome.num_chunks}")

        if outcome.status != "indexed":
            logger.error(f"ingest failed: {outcome.message}")
            return False

        # Retrieval is restricted to the document that was just ingested; only
        # the number of hits is logged, it does not affect the verdict.
        retrieve = get_retrieval_service().retrieve
        hits = retrieve(
            query=sample_file.stem,
            top_k=3,
            filters=f'doc_id == "{outcome.doc_id}"',
        )
        logger.info(f"retrieval count: {len(hits)}")
        logger.success("real ingest pipeline ok")
        return True
    except Exception as err:
        logger.error(f"real ingest pipeline failed: {err}")
        return False
|
|
|
|
|
|
def _guess_content_type(sample_file: Path) -> str:
    """Map the file's extension (case-insensitive) to a MIME type.

    Args:
        sample_file: Path whose suffix selects the content type.

    Returns:
        The MIME type for .pdf, .doc, or .docx; "application/octet-stream"
        for any other extension.
    """
    mime_by_extension = {
        ".pdf": "application/pdf",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    return mime_by_extension.get(sample_file.suffix.lower(), "application/octet-stream")
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line of this script.

    Returns:
        A namespace whose ``sample_file`` is a ``Path`` when ``--sample-file``
        was given and ``None`` otherwise.
    """
    cli = argparse.ArgumentParser(description="Verify the migrated backend path")
    cli.add_argument(
        "--sample-file",
        type=Path,
        help="Optional PDF/DOC/DOCX for real ingest verification",
    )
    namespace = cli.parse_args()
    return namespace
|
|
|
|
|
|
def main() -> bool:
    """Run every check, log a PASS/FAIL summary, and report overall success.

    The four environment checks always run, in a fixed order; the real ingest
    check runs only when ``--sample-file`` was supplied.

    Returns:
        True when every executed check passed, False otherwise.
    """
    cli_args = parse_args()

    always_run = (
        ("module_wiring", verify_service_wiring),
        ("migration_config", verify_migration_config),
        ("minio_connection", verify_minio_connection),
        ("milvus_connection", verify_milvus_connection),
    )
    outcomes = [(label, check()) for label, check in always_run]

    sample = cli_args.sample_file
    if not sample:
        logger.info("no sample file provided; skip real ingest check")
    else:
        outcomes.append(("real_ingest_pipeline", verify_ingest_pipeline(sample)))

    logger.info("\n" + "=" * 60)
    logger.info("check summary")
    logger.info("=" * 60)

    for label, ok in outcomes:
        logger.info(f"{label}: {'PASS' if ok else 'FAIL'}")

    everything_ok = all(ok for _, ok in outcomes)
    if not everything_ok:
        logger.warning("some checks failed; inspect environment dependencies")
    else:
        logger.success("all executed checks passed")

    return everything_ok
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(0 if main() else 1)
|