diff --git a/backend/app/api/main.py b/backend/app/api/main.py
index 7f7e48f..b5fec60 100644
--- a/backend/app/api/main.py
+++ b/backend/app/api/main.py
@@ -11,7 +11,7 @@ from app.api.models import ErrorResponse
 from app.api.routes import api_router
 from app.config.logging import setup_logging
 from app.config.settings import settings
-from app.services.llm.llm_factory import LLMFactory
+from app.shared.bootstrap import cleanup_runtime_dependencies, preload_runtime_dependencies
 # Keep module behavior explicit so the backend flow stays easy to audit.
 
 
@@ -24,12 +24,12 @@ async def lifespan(app: FastAPI):
     logger.info(f"启动 {settings.app_name} v{settings.app_version}")
     logger.info(f"调试模式: {settings.debug}")
     logger.info("预加载LLM客户端...")
-    LLMFactory.preload_clients(["qwen", "deepseek"])
+    preload_runtime_dependencies()
 
     yield
 
     logger.info("应用关闭，执行清理...")
-    LLMFactory.cleanup()
+    cleanup_runtime_dependencies()
 
 
 app = FastAPI(
diff --git a/backend/app/api/routes/agent.py b/backend/app/api/routes/agent.py
index c0349fc..651aedf 100644
--- a/backend/app/api/routes/agent.py
+++ b/backend/app/api/routes/agent.py
@@ -20,7 +20,7 @@ from app.api.models import (
 )
 from app.config.settings import settings
 from app.shared.async_utils import iter_in_thread
-from app.shared.bootstrap import get_agent_conversation_service, get_conversation_store
+from app.shared.bootstrap import get_agent_conversation_service, get_agent_session_service
 # Keep route handlers close to their transport-layer wiring for easier auditing.
 
 
diff --git a/backend/app/infrastructure/llm/openai_compatible_answer_generator.py b/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
index 2c296bb..1eee7c7 100644
--- a/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
+++ b/backend/app/infrastructure/llm/openai_compatible_answer_generator.py
@@ -67,6 +67,21 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
         )
         return messages, context_tokens
 
+    def _is_context_truncated(self, *, retrieved_chunks: list[RetrievedChunk], context_tokens: int) -> bool:
+        """Return whether the prompt context had to omit retrieved chunks to fit the token budget."""
+        if not retrieved_chunks:  # nothing was retrieved, so nothing can have been omitted
+            return False
+        estimated_total_tokens = sum(  # estimated cost of rendering every retrieved chunk in full
+            self._estimate_tokens(  # NOTE(review): presumably the same per-chunk layout the prompt builder uses — confirm
+                f"[{idx}] 文档: {chunk.doc_name}\n"
+                f"章节: {chunk.section_title or '未标注'}\n"
+                f"页码: {chunk.page_number}\n"
+                f"内容: {chunk.content}"
+            )
+            for idx, chunk in enumerate(retrieved_chunks, start=1)  # 1-based index feeds the "[n]" label
+        )
+        return estimated_total_tokens > context_tokens  # a full rendering larger than the reported count is treated as truncation
+
     def _sources(self, chunks: list[RetrievedChunk]) -> list[AnswerSource]:
         """Handle sources for this module for the Open A I Compatible Answer Generator instance."""
         return [
@@ -111,7 +126,10 @@ class OpenAICompatibleAnswerGenerator(AnswerGenerator):
             latency_ms=latency_ms,
             retrieved_count=len(retrieved_chunks),
             context_tokens=context_tokens,
-            truncated=len(retrieved_chunks) > len(messages),
+            truncated=self._is_context_truncated(
+                retrieved_chunks=retrieved_chunks,
+                context_tokens=context_tokens,
+            ),
             error=response.error,
         )
 
diff --git a/backend/app/shared/bootstrap.py b/backend/app/shared/bootstrap.py
index 47947f1..e11aba3 100644
--- a/backend/app/shared/bootstrap.py
+++ b/backend/app/shared/bootstrap.py
@@ -3,31 +3,134 @@
 from __future__ import annotations
 
 from functools import lru_cache
+from typing import Callable
 
 from app.application.agent import AgentConversationService, AgentSessionService
 from app.application.documents import DocumentCommandService, DocumentQueryService
 from app.application.knowledge import KnowledgeRetrievalService
+from app.application.perception.services import PerceptionService
 from app.config.settings import settings
+from app.domain.documents import DocumentBinaryStore
+from app.domain.retrieval import VectorIndex
 from app.infrastructure.embedding.openai_compatible_embedding_provider import OpenAICompatibleEmbeddingProvider
 from app.infrastructure.llm.openai_compatible_answer_generator import OpenAICompatibleAnswerGenerator
 from app.infrastructure.parser.aliyun_document_parser import AliyunDocumentParser
 from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBuilder
 from app.infrastructure.parser.local_document_parser import LocalDocumentParser
 from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
+from app.infrastructure.perception.mock_event_store import MockEventStore
 from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
 from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
 from app.infrastructure.storage.minio_binary_store import MinioDocumentBinaryStore
 from app.infrastructure.storage.postgres_document_repository import PostgresDocumentRepository
 from app.infrastructure.storage.postgres_parse_artifact_store import PostgresParseArtifactStore
 from app.infrastructure.vectorstore.bm25_retriever import BM25Retriever
+from app.infrastructure.vectorstore.cross_encoder_reranker import OpenAICompatibleReranker
 from app.infrastructure.vectorstore.dense_retriever import DenseRetriever
 from app.infrastructure.vectorstore.milvus_vector_index import MilvusVectorIndex
-from app.infrastructure.vectorstore.cross_encoder_reranker import OpenAICompatibleReranker
-from app.infrastructure.perception.mock_event_store import MockEventStore
-from app.application.perception.services import PerceptionService
+from app.services.llm.llm_factory import LLMFactory
 # Keep shared wiring centralized so dependency construction remains consistent.
 
 
+class LazyBinaryStore(DocumentBinaryStore):
+    """Delay MinIO connection work until binary storage is actually needed."""
+
+    def __init__(self, factory: Callable[[], DocumentBinaryStore]) -> None:
+        """Initialize the lazy binary store wrapper."""
+        self._factory = factory  # zero-argument callable that builds the real store
+        self._store: DocumentBinaryStore | None = None  # stays None until an operation first needs the store
+
+    def _get_store(self) -> DocumentBinaryStore:
+        """Create the underlying store on first use and reuse it afterwards."""
+        if self._store is None:  # NOTE(review): no lock here — concurrent first calls could each invoke the factory
+            self._store = self._factory()
+        return self._store
+
+    @property
+    def client(self):
+        """Expose the underlying client for compatibility with health endpoints."""
+        return self._get_store().client  # reading this property forces creation of the underlying store
+
+    def save(
+        self,
+        *,
+        object_name: str,
+        data: bytes,
+        content_type: str,
+        metadata: dict[str, str] | None = None,
+    ) -> None:
+        """Save data through the underlying binary store implementation."""
+        self._get_store().save(  # arguments are forwarded unchanged; any return value is discarded
+            object_name=object_name,
+            data=data,
+            content_type=content_type,
+            metadata=metadata,
+        )
+
+    def read(self, object_name: str) -> bytes:
+        """Read data through the underlying binary store implementation."""
+        return self._get_store().read(object_name)  # result of the underlying read() is returned as-is
+
+    def delete(self, object_name: str) -> None:
+        """Delete data through the underlying binary store implementation."""
+        self._get_store().delete(object_name)  # any return value of the underlying delete() is discarded
+
+
+class LazyVectorIndex(VectorIndex):
+    """Delay Milvus connection work until vector operations are actually needed."""
+
+    def __init__(self, factory: Callable[[], VectorIndex]) -> None:
+        """Initialize the lazy vector index wrapper."""
+        self._factory = factory  # zero-argument callable that builds the real index
+        self._index: VectorIndex | None = None  # stays None until an operation first needs the index
+
+    def _get_index(self) -> VectorIndex:
+        """Create the underlying index on first use and reuse it afterwards."""
+        if self._index is None:  # NOTE(review): no lock here — concurrent first calls could each invoke the factory
+            self._index = self._factory()
+        return self._index
+
+    @property
+    def collection(self):
+        """Expose the underlying Milvus collection for compatibility adapters."""
+        return self._get_index().collection  # reading this property forces creation of the underlying index
+
+    def upsert(self, chunks, vectors) -> int:
+        """Insert or update vectors through the underlying vector index implementation."""
+        return self._get_index().upsert(chunks, vectors)  # both arguments are forwarded positionally, unchanged
+
+    def delete_by_document(self, doc_id: str) -> int:
+        """Delete vectors through the underlying vector index implementation."""
+        return self._get_index().delete_by_document(doc_id)  # the underlying int result is returned as-is
+
+    def search(self, query_vector: list[float], top_k: int, filters: str | None = None):
+        """Search vectors through the underlying vector index implementation."""
+        return self._get_index().search(query_vector, top_k, filters)  # filters is passed through even when None
+
+    def count_by_document(self) -> dict[str, int]:
+        """Count document vectors through the underlying vector index implementation."""
+        return self._get_index().count_by_document()  # pure delegation; no post-processing here
+
+    def list_document_metadata(self) -> list[dict]:
+        """List document metadata through the underlying vector index implementation."""
+        return self._get_index().list_document_metadata()  # pure delegation; no post-processing here
+
+    def health(self) -> dict:
+        """Return vector index health through the underlying vector index implementation."""
+        return self._get_index().health()  # calling this forces creation of the underlying index if not yet built
+
+
+@lru_cache  # no arguments, so the first result is cached and every later call returns that same instance
+def _build_binary_store() -> MinioDocumentBinaryStore:
+    """Return the concrete binary store implementation."""
+    return MinioDocumentBinaryStore()  # constructed with no arguments; this is the factory handed to LazyBinaryStore
+
+
+@lru_cache  # no arguments, so the first result is cached and every later call returns that same instance
+def _build_vector_index() -> MilvusVectorIndex:
+    """Return the concrete vector index implementation."""
+    return MilvusVectorIndex()  # constructed with no arguments; this is the factory handed to LazyVectorIndex
+
 
 @lru_cache
 def get_document_repository():
@@ -46,9 +149,9 @@ def get_parse_artifact_store():
 
 
 @lru_cache
-def get_binary_store() -> MinioDocumentBinaryStore:
+def get_binary_store() -> DocumentBinaryStore:
     """Return binary store."""
-    return MinioDocumentBinaryStore()
+    return LazyBinaryStore(_build_binary_store)
 
 
 @lru_cache
@@ -77,9 +180,9 @@ def get_embedding_provider() -> OpenAICompatibleEmbeddingProvider:
 
 
 @lru_cache
-def get_vector_index() -> MilvusVectorIndex:
+def get_vector_index() -> VectorIndex:
     """Return vector index."""
-    return MilvusVectorIndex()
+    return LazyVectorIndex(_build_vector_index)
 
 
 @lru_cache
@@ -162,6 +265,19 @@ def get_perception_service() -> PerceptionService:
         event_store=MockEventStore(),
         retrieval_service=get_retrieval_service(),
     )
+
+
+@lru_cache
 def get_agent_session_service() -> AgentSessionService:
     """Return agent session service."""
     return AgentSessionService(conversation_store=get_conversation_store())
+
+
+def preload_runtime_dependencies() -> None:
+    """Warm dependencies that are safe and useful to preload during startup."""
+    LLMFactory.preload_clients(["qwen", "deepseek"])  # only these two LLM clients are warmed; nothing else is touched here
+
+
+def cleanup_runtime_dependencies() -> None:
+    """Release runtime dependencies that expose explicit cleanup hooks."""
+    LLMFactory.cleanup()  # the only cleanup hook invoked by this function
diff --git a/frontend/.env b/frontend/.env
new file mode 100644
index 0000000..d60765d
--- /dev/null
+++ b/frontend/.env
@@ -0,0 +1,2 @@
+VITE_API_PROXY_TARGET=http://6.86.80.8:8000
+FRONTEND_PORT=5173
diff --git a/frontend/.env.development b/frontend/.env.development
new file mode 100644
index 0000000..191794b
--- /dev/null
+++ b/frontend/.env.development
@@ -0,0 +1,2 @@
+VITE_API_PROXY_TARGET=http://127.0.0.1:8000
+FRONTEND_PORT=5173
diff --git a/frontend/.env.example b/frontend/.env.example
new file mode 100644
index 0000000..191794b
--- /dev/null
+++ b/frontend/.env.example
@@ -0,0 +1,2 @@
+VITE_API_PROXY_TARGET=http://127.0.0.1:8000
+FRONTEND_PORT=5173
diff --git a/frontend/README.md b/frontend/README.md
index cfb6d1a..7032377 100644
--- a/frontend/README.md
+++ b/frontend/README.md
@@ -49,6 +49,12 @@ npm run dev
 
 启动本地开发服务器，默认访问 `http://localhost:5173`
 
+前端环境文件约定如下：
+
+- `frontend/.env.development`：本地开发，默认代理到 `http://127.0.0.1:8000`
+- `frontend/.env`：所有模式共用的默认配置（生产构建使用），默认代理到 `http://6.86.80.8:8000`
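+- `frontend/.env.local`：临时覆盖本机配置，优先级高于 `frontend/.env`，但低于按模式加载的 `.env.[mode]` 文件（如 `.env.development`）；如需覆盖开发配置，请使用 `frontend/.env.development.local`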
+
 ### 构建生产版本
 
 ```bash
diff --git a/frontend/src/pages/RagChat/CitedAnswer.tsx b/frontend/src/pages/RagChat/CitedAnswer.tsx
index 42ccfef..1f33b5e 100644
--- a/frontend/src/pages/RagChat/CitedAnswer.tsx
+++ b/frontend/src/pages/RagChat/CitedAnswer.tsx
@@ -1,4 +1,4 @@
-import React, { useRef } from 'react';
+import React from 'react';
 import { useTheme } from '../../contexts';
 import type { RetrievalData } from '../../types';
 
diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts
index 1e02afe..1f0da05 100644
--- a/frontend/vite.config.ts
+++ b/frontend/vite.config.ts
@@ -4,7 +4,8 @@ import react from '@vitejs/plugin-react'
 // https://vite.dev/config/
 export default defineConfig(({ mode }) => {
   const env = loadEnv(mode, process.cwd(), '')
-  const apiHost = env.API_HOST || '6.86.80.8'
+  // Default local frontend development to the local backend unless explicitly overridden.
+  const apiHost = env.API_HOST || '127.0.0.1'
   const apiPort = env.API_PORT || '8000'
   const proxyTarget = env.VITE_API_PROXY_TARGET || `http://${apiHost}:${apiPort}`
 
diff --git a/pyproject.toml b/pyproject.toml
index 375c01e..7b69071 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
     "celery>=5.3.0",
     "redis>=4.5.0",
     "minio>=7.1.0",
-    "psycopg2-binary>=2.9.0"
+    "psycopg2-binary>=2.9.0",
 ]
 
 [dependency-groups]