# File: AIRegulation-DocAnalysis/tests/verify_mvp.py (Python, 212 lines, 6.8 KiB)

"""
Post-migration backend smoke checks.
Purpose:
1. Verify the new architecture modules can be imported
2. Verify migration-critical config matches the RFC
3. Verify external dependencies when they are available
4. Optionally verify the real ingest path with a sample document
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Directory two levels above this file (the parent of the directory that
# contains this script).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put <PROJECT_ROOT>/backend at the front of sys.path BEFORE the `app.*`
# imports below; presumably that is where the `app` package lives — confirm.
sys.path.insert(0, str(PROJECT_ROOT / "backend"))
from loguru import logger
from app.config.logging import setup_logging
from app.config.settings import settings
from app.shared.bootstrap import (
get_document_command_service,
get_retrieval_service,
get_vector_index,
)
# Configure logging at INFO level at import time, before any check runs.
setup_logging(level="INFO")
def verify_service_wiring() -> bool:
    """Verify the new module layout and service entrypoints can be imported.

    The imports are performed inside the function so that an import failure
    is caught and reported as a failed check instead of propagating.

    Returns:
        True when every entrypoint imports and resolves to a non-None
        object, False otherwise (the reason is logged).
    """
    logger.info("=" * 60)
    logger.info("Step 1: verify module wiring")
    logger.info("=" * 60)
    try:
        from app.api.main import app
        from app.application.agent import AgentConversationService
        from app.application.documents import DocumentCommandService, DocumentQueryService
        from app.application.knowledge import KnowledgeRetrievalService
        from app.shared import bootstrap
    except Exception as exc:
        logger.error(f"module wiring failed: {exc}")
        return False

    # Explicit checks instead of `assert`: asserts are removed under
    # `python -O` and a bare AssertionError carries no message to log.
    entrypoints = {
        "app": app,
        "DocumentCommandService": DocumentCommandService,
        "DocumentQueryService": DocumentQueryService,
        "KnowledgeRetrievalService": KnowledgeRetrievalService,
        "AgentConversationService": AgentConversationService,
        "bootstrap": bootstrap,
    }
    unresolved = [name for name, obj in entrypoints.items() if obj is None]
    if unresolved:
        detail = ", ".join(unresolved)
        logger.error(f"module wiring failed: entrypoints resolved to None: {detail}")
        return False

    logger.success("module wiring ok")
    return True
def verify_migration_config() -> bool:
    """Verify migration-critical config values.

    Compares ``settings.embedding_model``, ``settings.embedding_dim`` and
    ``settings.milvus_collection`` against the values hard-coded below.

    Returns:
        True when all three settings match, False otherwise. On mismatch the
        log names each offending setting with its actual and expected value.
    """
    logger.info("=" * 60)
    logger.info("Step 2: verify migration config")
    logger.info("=" * 60)
    expected = {
        "embedding_model": "text-embedding-v3",
        "embedding_dim": 1536,
        "milvus_collection": "regulations_dense_1536",
    }
    try:
        actual = {name: getattr(settings, name) for name in expected}
    except Exception as exc:
        logger.error(f"migration config mismatch: {exc}")
        return False

    for name, value in actual.items():
        logger.info(f"{name}={value}")

    # Explicit comparison instead of `assert`: asserts vanish under
    # `python -O` (the check would always pass) and a bare AssertionError
    # has an empty message, so the log could not say which value was wrong.
    mismatches = [
        f"{name}={actual[name]!r} (expected {wanted!r})"
        for name, wanted in expected.items()
        if actual[name] != wanted
    ]
    if mismatches:
        detail = "; ".join(mismatches)
        logger.error(f"migration config mismatch: {detail}")
        return False

    logger.success("migration config ok")
    return True
def verify_minio_connection() -> bool:
    """Verify MinIO connectivity for the binary store path.

    Builds the document command service and checks that its ``binary_store``
    attribute is present and not None.

    NOTE(review): no request is issued against the store here; whether
    constructing the service actually contacts MinIO is not visible from
    this file — confirm before relying on this as a connectivity probe.

    Returns:
        True when the binary store is available, False otherwise.
    """
    logger.info("=" * 60)
    logger.info("Step 3: verify MinIO connection")
    logger.info("=" * 60)
    failure = None
    try:
        binary_store = get_document_command_service().binary_store
    except Exception as exc:
        failure = str(exc)
    else:
        # Explicit check instead of `assert`, which is stripped under
        # `python -O` and would log an empty failure reason.
        if binary_store is None:
            failure = "binary store is None"

    # `is not None` rather than truthiness: an exception with an empty
    # message must still count as a failure.
    if failure is not None:
        logger.error(f"MinIO connection failed: {failure}")
        logger.info("start MinIO first or update .env storage settings")
        return False

    logger.success("MinIO connection ok")
    return True
def verify_milvus_connection() -> bool:
    """Check that the vector index answers a health query.

    The value returned by ``health()`` is logged but not inspected; the
    check passes whenever the call completes without raising.

    Returns:
        True when ``health()`` returns, False when anything raises.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Step 4: verify Milvus connection")
    logger.info(banner)
    try:
        vector_index = get_vector_index()
        status = vector_index.health()
        logger.info(f"Milvus health: {status}")
        logger.success("Milvus connection ok")
    except Exception as exc:
        logger.error(f"Milvus connection failed: {exc}")
        logger.info("start Milvus first or update .env vector settings")
        return False
    return True
def verify_ingest_pipeline(sample_file: Path) -> bool:
    """Run a real file through upload-and-process, then query it back.

    Args:
        sample_file: Path to an existing PDF, DOC or DOCX document.

    Returns:
        True when processing reports status ``"indexed"`` and the follow-up
        retrieval call completes; False on any precondition failure,
        non-indexed status, or exception. The number of retrieval hits is
        logged only and does not affect the result.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Step 5: verify real ingest pipeline")
    logger.info(banner)

    # Preconditions, checked in order: file present, supported extension,
    # parser credentials configured.
    if not sample_file.exists():
        logger.error(f"sample file not found: {sample_file}")
        return False

    supported_suffixes = (".pdf", ".doc", ".docx")
    if sample_file.suffix.lower() not in supported_suffixes:
        logger.error("sample file must be PDF, DOC, or DOCX")
        return False

    if not (settings.alibaba_access_key_id and settings.alibaba_access_key_secret):
        logger.error("missing Aliyun parser credentials")
        return False

    try:
        command_service = get_document_command_service()
        outcome = command_service.upload_and_process(
            file_name=sample_file.name,
            content=sample_file.read_bytes(),
            content_type=_guess_content_type(sample_file),
            doc_name=sample_file.stem,
            regulation_type="smoke-test",
            version="migration",
            generate_summary=False,
        )
        logger.info(
            f"process result: doc_id={outcome.doc_id}, "
            f"status={outcome.status}, chunks={outcome.num_chunks}"
        )
        if outcome.status != "indexed":
            logger.error(f"ingest failed: {outcome.message}")
            return False

        # Query restricted to the document that was just ingested.
        doc_filter = f'doc_id == "{outcome.doc_id}"'
        hits = get_retrieval_service().retrieve(
            query=sample_file.stem,
            top_k=3,
            filters=doc_filter,
        )
        logger.info(f"retrieval count: {len(hits)}")
        logger.success("real ingest pipeline ok")
    except Exception as exc:
        logger.error(f"real ingest pipeline failed: {exc}")
        return False
    return True
def _guess_content_type(sample_file: Path) -> str:
    """Map the file's extension (case-insensitive) to a MIME type.

    Returns ``application/octet-stream`` for any extension other than
    ``.pdf``, ``.doc`` or ``.docx``.
    """
    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    return mime_by_suffix.get(sample_file.suffix.lower(), "application/octet-stream")
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Verify the migrated backend path")
parser.add_argument("--sample-file", type=Path, help="Optional PDF/DOC/DOCX for real ingest verification")
return parser.parse_args()
def main() -> bool:
    """Run every smoke check and log a PASS/FAIL summary.

    The four base checks always run, regardless of earlier failures; the
    real ingest check runs only when ``--sample-file`` was supplied.

    Returns:
        True when every executed check passed, False otherwise.
    """
    cli = parse_args()

    outcomes: list[tuple[str, bool]] = []
    outcomes.append(("module_wiring", verify_service_wiring()))
    outcomes.append(("migration_config", verify_migration_config()))
    outcomes.append(("minio_connection", verify_minio_connection()))
    outcomes.append(("milvus_connection", verify_milvus_connection()))

    if args_sample := cli.sample_file:
        outcomes.append(("real_ingest_pipeline", verify_ingest_pipeline(args_sample)))
    else:
        logger.info("no sample file provided; skip real ingest check")

    banner = "=" * 60
    logger.info("\n" + banner)
    logger.info("check summary")
    logger.info(banner)
    for check_name, ok in outcomes:
        verdict = "PASS" if ok else "FAIL"
        logger.info(f"{check_name}: {verdict}")

    everything_ok = all(ok for _, ok in outcomes)
    if everything_ok:
        logger.success("all executed checks passed")
    else:
        logger.warning("some checks failed; inspect environment dependencies")
    return everything_ok
if __name__ == "__main__":
    # Process exit status: 0 when every executed check passed, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)