"""Streamlit entry point for the Rail Operations & Safety Assistant.

Flow: render the Prompt Lab sidebar, validate the required environment
variables, collect the question and retrieval settings, retrieve matching
chunks through ``MongoAtlasRetriever`` and hand prompt building, model calls
and rendering over to ``run_generation``.
"""
import os
import streamlit as st

from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings
from langchain.docstore.document import Document

from rail_rag.config import (
    MONGODB_URI,
    EMBED_MODEL,
    CHAT_MODEL,
)
from rail_rag.index_utils import get_mongo_collection
from rail_rag.retriever import MongoAtlasRetriever
from rail_rag.ui import render_prompt_lab
from rail_rag.generation import run_generation
from rail_rag.classifier import classify_text

# --- Streamlit UI chrome ---
st.set_page_config(page_title="Rail Ops & Safety Assistant", page_icon="🚆", layout="wide")
st.title("🚆 Rail Operations & Safety Assistant (MongoDB + LangChain + Mistral)")


@st.cache_resource(show_spinner=False)
def _load_collection():
    """Return the MongoDB collection, connecting only once.

    Streamlit re-executes this script on every widget interaction; without
    caching, each rerun would open another MongoClient that is never closed.
    """
    return get_mongo_collection()


@st.cache_resource(show_spinner=False)
def _load_embedder():
    """Return the shared Mistral embeddings client (built once, then reused)."""
    return MistralAIEmbeddings(model=EMBED_MODEL)


# Sidebar: Prompt Lab (returns all user choices + composed system prompt)
lab = render_prompt_lab()

if not os.getenv("MISTRAL_API_KEY"):
    st.error("Missing `MISTRAL_API_KEY` in environment.")
    st.stop()

if not MONGODB_URI:
    st.error("Missing `MONGODB_URI` in environment.")
    st.stop()

colA, colB, colC = st.columns([2, 1, 1])
with colA:
    q = st.text_input(
        "Ask a question (e.g., 'What must a signaller do when going off duty?')",
        "",
    )
with colB:
    top_k = st.slider("Top-K chunks", 1, 10, 4, 1)
with colC:
    show_debug = st.toggle("Show debug", value=False)

# Connect resources (MongoDB collection + embeddings + retriever)
try:
    collection = _load_collection()
except Exception as e:
    st.exception(e)
    st.stop()

embedder = _load_embedder()
# The retriever depends on the current slider value, so it is rebuilt per run.
retriever = MongoAtlasRetriever(collection=collection, embedder=embedder, k=top_k)

# Ignore whitespace-only input so it does not trigger an embeddings call.
question = q.strip()

if question:
    try:
        retrieved = retriever.invoke(question)

        if show_debug:
            with st.expander("🔎 Retrieved docs (debug)"):
                for i, d in enumerate(retrieved, 1):
                    st.write(f"{i}. meta = {d.metadata}")
                    st.write((d.page_content or "")[:300] + "…")

        if not retrieved:
            st.warning(
                "No documents retrieved. "
                "Check MongoDB URI / DB / collection / vector index / field names."
            )
            st.stop()

        # Full prompt-building + A/B + rendering (answers + sources)
        run_generation(
            question=question,
            retrieved=retrieved,
            chat_model_name=CHAT_MODEL,
            lab=lab,
        )

    except Exception as e:
        st.exception(e)
        st.stop()
GERT8000_HB10.pdf differ diff --git a/apps/railwaycopilot/backend/corpus/Rules on walking on or near the line.pdf b/apps/railwaycopilot/backend/corpus/Rules on walking on or near the line.pdf new file mode 100644 index 0000000..5091510 Binary files /dev/null and b/apps/railwaycopilot/backend/corpus/Rules on walking on or near the line.pdf differ diff --git a/apps/railwaycopilot/backend/corpus/September 2024 Standards Update.pdf b/apps/railwaycopilot/backend/corpus/September 2024 Standards Update.pdf new file mode 100644 index 0000000..2dccb92 Binary files /dev/null and b/apps/railwaycopilot/backend/corpus/September 2024 Standards Update.pdf differ diff --git a/apps/railwaycopilot/backend/corpus/TS1_-_General_signalling_regulations_v18.pdf b/apps/railwaycopilot/backend/corpus/TS1_-_General_signalling_regulations_v18.pdf new file mode 100644 index 0000000..e2283e5 Binary files /dev/null and b/apps/railwaycopilot/backend/corpus/TS1_-_General_signalling_regulations_v18.pdf differ diff --git a/apps/railwaycopilot/backend/ingest_rulebook.py b/apps/railwaycopilot/backend/ingest_rulebook.py new file mode 100644 index 0000000..feab459 --- /dev/null +++ b/apps/railwaycopilot/backend/ingest_rulebook.py @@ -0,0 +1,214 @@ +import os +from glob import glob +from typing import List, Dict +import numpy as np +import requests +import certifi + +from pymongo import MongoClient +from pymongo.errors import OperationFailure +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFLoader +from pymongo.errors import OperationFailure + +MONGODB_URI = os.getenv("MONGODB_URI") +DB_NAME = os.getenv("DB_NAME", "rail_ops") +COLL_NAME = os.getenv("COLLECTION_NAME", "rulebook_chunks") + +EMBED_MODEL = os.getenv("MISTRAL_EMBED_MODEL", "mistral-embed") +EMBED_DIM = 1024 +MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") + +CORPUS_DIR = os.getenv("CORPUS_DIR", "corpus") + +# Field names +TEXT_KEY = "content" +VEC_KEY = "content_vector" 
class SimpleMistralEmbedder:
    """Minimal client for the Mistral embeddings REST endpoint.

    Posts texts to ``https://api.mistral.ai/v1/embeddings`` over a shared
    ``requests.Session`` and returns one embedding vector per input text.
    """

    def __init__(self, model: str, api_key: str):
        """Store the model name and prepare the authenticated session.

        Raises:
            RuntimeError: if ``api_key`` is empty.
        """
        if not api_key:
            raise RuntimeError("Missing MISTRAL_API_KEY in environment.")
        self.model = model
        self.api_key = api_key
        self.url = "https://api.mistral.ai/v1/embeddings"
        self.session = requests.Session()
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def embed_documents(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Embed ``texts`` and return the vectors in input order.

        The texts are sent in slices of ``batch_size`` rather than in one
        request, so a large corpus does not produce a single oversized call.
        An empty ``texts`` returns ``[]`` without contacting the API.

        Args:
            texts: strings to embed.
            batch_size: maximum number of texts per HTTP request. The default
                is a conservative guess; tune it to the provider's limits.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
            RuntimeError: on a non-JSON reply, an API error payload, an
                unrecognised response shape or a vector-count mismatch.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        vectors: List[List[float]] = []
        for start in range(0, len(texts), batch_size):
            vectors.extend(self._embed_batch(texts[start:start + batch_size]))
        return vectors

    def _embed_batch(self, batch: List[str]) -> List[List[float]]:
        """Send one request for ``batch`` and extract its embedding vectors."""
        payload = {"model": self.model, "input": batch}
        r = self.session.post(self.url, headers=self.headers, json=payload, timeout=60)
        try:
            j = r.json()
        except ValueError as exc:
            # Keep the decode error as the cause so the traceback shows both.
            raise RuntimeError(f"Embeddings HTTP {r.status_code}: {r.text[:500]}") from exc

        # Accept multiple shapes to be version-tolerant
        vectors = None
        if isinstance(j, dict):
            if isinstance(j.get("data"), list):
                vectors = [item["embedding"] for item in j["data"]]
            elif isinstance(j.get("embeddings"), list):
                vectors = j["embeddings"]
            elif "error" in j:
                raise RuntimeError(f"Mistral embeddings error: {j['error']}")
        if vectors is None:
            raise RuntimeError(f"Unexpected embeddings response shape: {str(j)[:500]}")
        if len(vectors) != len(batch):
            # A short or long reply would silently misalign texts and vectors.
            raise RuntimeError(
                f"Embeddings count mismatch: sent {len(batch)} texts, got {len(vectors)} vectors"
            )
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        return self.embed_documents([text])[0]
def build_records(chunks, embedder: "SimpleMistralEmbedder") -> List[Dict]:
    """Turn text chunks into MongoDB-ready documents with their embeddings.

    Each record holds the chunk text, its embedding as a list of floats, the
    source value from the chunk metadata and, when present, the page number
    (``-1`` if the stored value cannot be converted to ``int``).

    Args:
        chunks: objects exposing ``page_content`` and a ``metadata`` mapping.
        embedder: object whose ``embed_documents`` returns one vector per text.

    Returns:
        One dict per chunk, in input order; ``[]`` when ``chunks`` is empty.

    Raises:
        ValueError: if the embedder returns a different number of vectors
            than chunks, or a vector whose length is not ``EMBED_DIM``.
    """
    # Nothing to embed: avoid sending an empty request to the embeddings API.
    if not chunks:
        return []

    texts = [c.page_content or "" for c in chunks]
    vectors = embedder.embed_documents(texts)
    if len(vectors) != len(texts):
        # Pairing by position is only meaningful when the counts agree.
        raise ValueError(
            f"Embedder returned {len(vectors)} vectors for {len(texts)} chunks"
        )

    recs = []
    for chunk, text, vec in zip(chunks, texts, vectors):
        # Ensure correct dim for safety (must match the vector index definition)
        if len(vec) != EMBED_DIM:
            raise ValueError(f"Unexpected embedding dim {len(vec)} (expected {EMBED_DIM})")

        rec = {
            TEXT_KEY: text,
            # Mongo expects an array of plain numbers
            VEC_KEY: [float(x) for x in vec],
            SRC_KEY: chunk.metadata.get(SRC_KEY),
        }
        page_val = chunk.metadata.get(PAGE_KEY)
        if page_val is not None:
            try:
                rec[PAGE_KEY] = int(page_val)
            except (TypeError, ValueError, OverflowError):
                rec[PAGE_KEY] = -1
        recs.append(rec)
    return recs
def main():
    """Ingest the PDF corpus into MongoDB.

    Steps: validate the required environment variables, load and chunk the
    PDFs in ``CORPUS_DIR``, embed the chunks, connect to MongoDB, try to
    ensure the vector search index exists, optionally wipe the collection
    (``FRESH_LOAD``, enabled by default) and insert the records in batches.

    Raises:
        SystemExit: if ``MISTRAL_API_KEY`` or ``MONGODB_URI`` is missing, or
            if no PDF pages were loaded from ``CORPUS_DIR``.
    """
    if not MISTRAL_API_KEY:
        raise SystemExit("Missing MISTRAL_API_KEY in environment!")
    if not MONGODB_URI:
        raise SystemExit("Missing MONGODB_URI in environment!")

    # 1) Load & chunk PDFs
    docs = load_docs(CORPUS_DIR)
    if not docs:
        raise SystemExit(f"No PDFs found in '{CORPUS_DIR}'")
    chunks = chunk_docs(docs)

    # 2) Embed
    embedder = SimpleMistralEmbedder(model=EMBED_MODEL, api_key=MISTRAL_API_KEY)
    records = build_records(chunks, embedder)

    # 3) Connect to MongoDB Atlas; the context manager closes the client
    #    (and its connection pool) even if a later step raises.
    with MongoClient(MONGODB_URI, tlsCAFile=certifi.where()) as client:
        coll = client[DB_NAME][COLL_NAME]

        # 4) Create / ensure vector index
        try:
            ensure_vector_index(coll, index_name="vector_index")
        except Exception as e:
            # If running locally (no Atlas) or on an older server this may fail;
            # ingestion can still proceed.
            print(f"[warn] Could not ensure vector index now: {e}")

        # 5) Fresh load: optional cleanup for a clean re-ingest
        if os.getenv("FRESH_LOAD", "true").lower() in ("1", "true", "yes"):
            coll.delete_many({})

        # 6) Insert records in batches
        if records:
            batch_size = 500
            for start in range(0, len(records), batch_size):
                coll.insert_many(records[start:start + batch_size])
            print(f"[✅] Ingested {len(records)} chunks into '{DB_NAME}.{COLL_NAME}'")
        else:
            print("[ℹ️] No records to insert.")


if __name__ == "__main__":
    main()
def parse_intent_response(raw: str) -> dict:
    """Extract the classifier's JSON object from a raw model reply.

    Tolerates the deviations a chat model commonly produces: a Markdown code
    fence around the JSON, text before or after the object (for example a
    trailing period) and single-quoted pseudo-JSON.

    Args:
        raw: the reply text; ``None`` is treated as an empty string.

    Returns:
        The parsed dict when a JSON object is found; ``{"intent": str(value)}``
        when the reply is some other JSON value; otherwise
        ``{"intent": <cleaned reply text>}``.
    """
    text = (raw or "").strip()

    # Drop a surrounding ``` / ```json fence if present.
    if text.startswith("```"):
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:].strip()

    # Try the whole reply first, then just the outermost {...} span.
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        # Second attempt repairs single-quoted pseudo-JSON.
        for attempt in (candidate, candidate.replace("'", '"')):
            try:
                parsed = json.loads(attempt)
            except ValueError:
                continue
            if isinstance(parsed, dict):
                return parsed
            return {"intent": str(parsed)}

    return {"intent": text}


def classify_text(text: str) -> dict:
    """Classify ``text`` into one of the intents named in the system prompt.

    Sends the classifier system prompt plus ``text`` to ``llm_classifier`` and
    parses the reply with ``parse_intent_response``.

    Returns:
        A dict that normally carries an ``"intent"`` key; when the reply is
        not valid JSON the cleaned reply text is returned under ``"intent"``.
    """
    messages = [
        ("system", CLASSIFIER_SYSTEM_PROMPT),
        ("human", text),
    ]
    result = llm_classifier.invoke(messages)
    content = result.content
    # Guard against non-string content, which has no .strip().
    raw = content if isinstance(content, str) else str(content)
    return parse_intent_response(raw)
+""" diff --git a/apps/railwaycopilot/backend/rail_rag/generation.py b/apps/railwaycopilot/backend/rail_rag/generation.py new file mode 100644 index 0000000..e0fb92d --- /dev/null +++ b/apps/railwaycopilot/backend/rail_rag/generation.py @@ -0,0 +1,102 @@ +import streamlit as st +from typing import List, Tuple +from langchain_mistralai import ChatMistralAI +from langchain.docstore.document import Document +from rail_rag.prompt_utils import build_context, build_messages, build_system_prompt +from rail_rag.classifier import classify_text + + +def _human_preview(question_text: str, context_text: str, limit: int = 1200) -> str: + snippet = context_text if len(context_text) <= limit else context_text[:limit] + "…" + return f"Question: {question_text}\n\nContext:\n{snippet}" + +def run_generation( + question: str, + retrieved: List[Document], + chat_model_name: str, + lab: dict, +): + active_system_prompt = lab["active_system_prompt"] + temperature = lab["temperature"] + max_tokens = lab["max_tokens"] + fewshot_pairs: List[Tuple[str, str]] = lab["fewshot_pairs"] + ab_test = lab["ab_test"] + base_prompt = lab["base_prompt"] + refuse_if_ooc = lab["refuse_if_ooc"] + extra_instructions = lab["extra_instructions"] + + ctx = build_context(retrieved) + messages_a = build_messages( + question=question, + context=ctx, + system_prompt=active_system_prompt, + few_shots=fewshot_pairs, + ) + + llm = ChatMistralAI(model=chat_model_name, temperature=temperature, max_tokens=max_tokens) + + # classification + classification = classify_text(question) + intent = classification.get("intent", "Unknown").capitalize() + + st.subheader("🧭 Query Classification") + st.markdown(f"**Intent:** {intent}") + st.divider() + + with st.expander("🧾 Prompts used (A/B Preview)", expanded=False): + st.markdown("### Prompt A — System") + st.code(active_system_prompt, language="markdown") + st.markdown("**Prompt A — Human**") + st.code(_human_preview(question, ctx), language="markdown") + + messages_b = None + 
alt_system_prompt = None + if ab_test: + alt_system_prompt = build_system_prompt( + base_prompt, + { + "force_citations": True, + "refuse_if_ooc": refuse_if_ooc, + "bulleted_style": True, + "structured_style": True, + }, + extra_instructions, + ) + messages_b = build_messages( + question=question, + context=ctx, + system_prompt=alt_system_prompt, + few_shots=fewshot_pairs, + ) + + st.markdown("---") + st.markdown("### Prompt B — System") + st.code(alt_system_prompt, language="markdown") + st.markdown("**Prompt B — Human**") + st.code(_human_preview(question, ctx), language="markdown") + + if not ab_test: + with st.spinner("Thinking…"): + ans_a = llm.invoke(messages_a) + st.subheader("Answer") + st.write(ans_a.content) + else: + col1, col2 = st.columns(2) + with col1: + st.markdown("### Prompt A") + with st.spinner("Running A…"): + ans_a = llm.invoke(messages_a) + st.write(ans_a.content) + + with col2: + st.markdown("### Prompt B") + with st.spinner("Running B…"): + ans_b = llm.invoke(messages_b) + st.write(ans_b.content) + + st.subheader("Sources") + for i, d in enumerate(retrieved, 1): + src = d.metadata.get("source", "document") + page = d.metadata.get("page", "n/a") + st.markdown(f"**{i}. 
# One MongoClient per URI, reused across calls. A client owns a connection
# pool, so building a new one on every call (e.g. on each Streamlit rerun)
# leaks connections that are never closed.
_MONGO_CLIENTS: dict = {}


def get_mongo_collection(
    uri: Optional[str] = None,
    db_name: Optional[str] = None,
    coll_name: Optional[str] = None,
) -> Collection:
    """Return the MongoDB collection that holds the rulebook chunks.

    Any argument left as ``None`` (or empty) falls back to the matching value
    from ``rail_rag.config``. The underlying client is created on first use
    for a given URI and reused afterwards.

    Args:
        uri: MongoDB connection string.
        db_name: database name.
        coll_name: collection name.

    Raises:
        RuntimeError: if no URI is given and ``MONGODB_URI`` is not set.
    """
    uri = uri or MONGODB_URI
    db_name = db_name or MONGO_DB_NAME
    coll_name = coll_name or MONGO_COLLECTION_NAME

    if not uri:
        raise RuntimeError("Missing MONGODB_URI in environment.")

    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        client = MongoClient(uri)
        _MONGO_CLIENTS[uri] = client

    return client[db_name][coll_name]
def build_system_prompt(base: str, options: Dict[str, bool], extra_instructions: str) -> str:
    """Compose the system prompt from a base text, toggles and free text.

    Starts with the stripped ``base``, appends one fixed instruction line per
    enabled option (in the order citations, refusal, bullets, structure) and
    finally the stripped ``extra_instructions``.

    Args:
        base: preset prompt text; ``None`` is treated as empty.
        options: flags keyed by ``force_citations``, ``refuse_if_ooc``,
            ``bulleted_style`` and ``structured_style``; missing keys and a
            ``None`` mapping count as disabled.
        extra_instructions: optional free text; ``None`` or whitespace-only
            input adds nothing, so no empty trailing line is emitted.

    Returns:
        The prompt lines joined with newlines.
    """
    options = options or {}
    option_lines = (
        ("force_citations", "Always cite sources as (filename p.page)."),
        (
            "refuse_if_ooc",
            "If the answer is not fully supported by the context, reply: ‘I don’t have that in the documents.’",
        ),
        ("bulleted_style", "Use concise bullet points."),
        ("structured_style", "Structure output with headings as appropriate."),
    )

    lines = [(base or "").strip()]
    lines.extend(text for flag, text in option_lines if options.get(flag))

    # Strip before testing so a stray space or newline is not appended.
    extra = (extra_instructions or "").strip()
    if extra:
        lines.append(extra)

    return "\n".join(lines)
class MongoAtlasRetriever(BaseRetriever):
    """LangChain retriever backed by MongoDB Atlas Vector Search.

    Wraps a ``MongoDBAtlasVectorSearch`` store built over the given
    collection and embedder, and returns the ``k`` most similar chunks for
    a query.
    """

    # Public, validated field: number of chunks returned per query.
    k: int = 4

    # Private attrs, assigned in __init__.
    _collection: Collection = PrivateAttr()
    _embedder: MistralAIEmbeddings = PrivateAttr()
    _vectorstore: MongoDBAtlasVectorSearch = PrivateAttr()

    class Config:
        arbitrary_types_allowed = True
        underscore_attrs_are_private = True

    def __init__(
        self,
        collection: Collection,
        embedder: MistralAIEmbeddings,
        k: int = 4,
        index_name: str = VECTOR_INDEX_NAME,
        **data,
    ):
        """Build the retriever and its underlying vector store.

        Args:
            collection: MongoDB collection holding the chunk documents.
            embedder: embeddings client used to embed queries.
            k: number of chunks to return per query.
            index_name: name of the Atlas vector search index.
            **data: forwarded to ``BaseRetriever``.
        """
        super().__init__(k=k, **data)
        object.__setattr__(self, "_collection", collection)
        object.__setattr__(self, "_embedder", embedder)

        # LangChain vector store that wraps Atlas Vector Search
        vs = MongoDBAtlasVectorSearch(
            collection=collection,
            embedding=embedder,
            index_name=index_name,
            text_key=TEXT_KEY,
            embedding_key=VEC_KEY,
        )
        object.__setattr__(self, "_vectorstore", vs)

    def __repr__(self) -> str:
        # An empty repr is useless in logs and debuggers; show the one
        # public setting instead.
        return f"MongoAtlasRetriever(k={self.k})"

    def __getstate__(self):
        # Only the public field is kept; the private attributes are omitted.
        return {"k": self.k}

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: Optional[CallbackManagerForRetrieverRun] = None,
    ) -> List[Document]:
        """
        Use MongoDB Atlas Vector Search via LangChain's MongoDBAtlasVectorSearch.

        Returns the ``k`` most similar documents, with the stored source and
        page values copied to the ``"source"`` / ``"page"`` metadata keys
        when those are missing.
        """
        docs = self._vectorstore.similarity_search(query, k=self.k)

        for d in docs:
            md = d.metadata or {}
            for alias, stored_key in (("source", SRC_KEY), ("page", PAGE_KEY)):
                value = md.get(stored_key)
                # Never insert None: a present-but-None key would defeat the
                # .get(key, default) fallbacks used when rendering sources.
                if value is not None and md.get(alias) is None:
                    md[alias] = value
            d.metadata = md

        return docs
+ ) + + st.divider() + st.caption("Generation controls") + temperature = st.slider("Temperature", 0.0, 1.5, 0.2, 0.05) + max_tokens = st.slider("Max tokens", 128, 2048, 512, 32) + + st.divider() + st.caption("Few-shot examples (optional)") + use_fewshot = st.checkbox("Enable few-shot examples", value=False) + fewshot_pairs = [] + if use_fewshot: + with st.expander("Add examples"): + ex_user = st.text_area("User example", value="What must a signaller do when going off duty?") + ex_assistant = st.text_area("Assistant example", value=( + "• Notify relief signaller and transfer any ongoing movements.\n" + "• Record handover in logbook.\nSources: (Rulebook.pdf p.12)" + )) + if st.button("Add example"): + st.session_state.setdefault("fewshots", []) + st.session_state["fewshots"].append(("human", ex_user)) + st.session_state["fewshots"].append(("assistant", ex_assistant)) + fewshot_pairs = st.session_state.get("fewshots", []) + + st.divider() + ab_test = st.checkbox( + "Run A/B prompt experiment", + value=False, + help=( + "Compare two prompts side-by-side.\n\n" + "🅰️ Prompt A — uses your current sidebar settings (preset + toggles + extra instructions + few-shots).\n" + "🅱️ Prompt B — same base preset but forces: citations, bullet + structured format, and same 'Refuse if not in context'." 
+ ), + ) + + active_system_prompt = build_system_prompt( + base_prompt, + { + "force_citations": force_citations, + "refuse_if_ooc": refuse_if_ooc, + "bulleted_style": bulleted_style, + "structured_style": structured_style, + }, + extra_instructions, + ) + + return { + "preset_name": preset_name, + "base_prompt": base_prompt, + "force_citations": force_citations, + "refuse_if_ooc": refuse_if_ooc, + "bulleted_style": bulleted_style, + "structured_style": structured_style, + "extra_instructions": extra_instructions, + "temperature": temperature, + "max_tokens": max_tokens, + "use_fewshot": use_fewshot, + "fewshot_pairs": few_shots if (few_shots := fewshot_pairs) else [], + "ab_test": ab_test, + "active_system_prompt": active_system_prompt, + } diff --git a/apps/railwaycopilot/backend/requirements.txt b/apps/railwaycopilot/backend/requirements.txt new file mode 100644 index 0000000..c2ddd1e --- /dev/null +++ b/apps/railwaycopilot/backend/requirements.txt @@ -0,0 +1,16 @@ +streamlit +pypdf + +# MongoDB Atlas client + TLS CA bundle +pymongo>=4.6.1 +certifi + +# LangChain stack (current split) +langchain==0.2.16 +langchain-core==0.2.38 +langchain-community==0.2.11 +langchain-mistralai==0.1.10 +langchain-text-splitters==0.2.2 + +# Embeddings — keep if you still use sentence-transformers elsewhere +sentence-transformers diff --git a/apps/railwaycopilot/docker-compose.yml b/apps/railwaycopilot/docker-compose.yml new file mode 100644 index 0000000..53c9281 --- /dev/null +++ b/apps/railwaycopilot/docker-compose.yml @@ -0,0 +1,19 @@ +# docker-compose.yml + +services: + rag_app: + build: ./backend + container_name: rag_app_mistral + ports: + - "8501:8501" + environment: + - INDEX_NAME=rail_rulebook + - MISTRAL_API_KEY=${MISTRAL_API_KEY} + - MISTRAL_CHAT_MODEL=${MISTRAL_CHAT_MODEL:-mistral-small-latest} + - MISTRAL_EMBED_MODEL=${MISTRAL_EMBED_MODEL:-mistral-embed} + - MONGODB_URI=${MONGODB_URI} + - NORMALIZE_EMBEDDINGS=true + volumes: + - ./backend:/app + - 
./backend/corpus:/app/corpus + restart: unless-stopped diff --git a/apps/railwaycopilot/readme.md b/apps/railwaycopilot/readme.md new file mode 100644 index 0000000..4bb0293 --- /dev/null +++ b/apps/railwaycopilot/readme.md @@ -0,0 +1,70 @@ +# Railway Operations & Safety Procedures Assistant + +This is a RAG application which ingests the following sources : + +- TS1 – General signalling regulations, Issue 18 (in force 07 Dec 2024) – PDF from tectraining.co.uk +- HB10 – Duties of the COSS and person in charge when using a hand trolley (Issue 5, Sept 2023) – PDF from consultations.rssb.co.uk +- “Rules on walking on or near the line” (overview page, new rules from 07 Dec 2024) – pdf article from rssb.co.uk +- RSSB standards Updates from September 2024 from rssb.co.uk + +Once up and running, you can ask the chatbot questions like : +- What should a signaller do when going off-duty? +- What should a driver do if a signal is defective? + +# Stack + +- Mistral’s Embeddings API (mistral-embed) and Chat Completions (mistral-small-latest). +- Langchain +- MongoDB Atlas as Vector Database + +# Build +Create an .env file at the root of the folder, with the following configuration: + +MISTRAL_API_KEY="" +MISTRAL_CHAT_MODEL=mistral-small-latest +MISTRAL_EMBED_MODEL=mistral-embed +MONGODB_URI="" + +```sh +docker compose up --build -d + +#Chunk the source data, embed the chunks, store and index them in MongoDB : +docker compose exec rag_app python ingest_rulebook.py + +#Open the app +open http://localhost:8501 + +#Rebuild after a change +docker compose build rag_app +docker compose up -d +``` + +# Features + +- Change the prompt structure : + - Start with the preset text (base_prompt). + - For each ticked checkbox, append an extra line of instructions. + - Append any free-text “Extra instructions”. + - Note: some presets already mention bullets / structure / refusal. The checkboxes can add additional lines that reinforce or duplicate that behavior.
This is fine: the model just sees stronger guidance. +- Visualize the used prompt +- A/B Test different prompts : + - Prompt A is your defined prompt + - Prompt B uses the same base preset but forces : citations, bullet + structured format, and 'Refuse if not in context'. +- Change Temperature (how deterministic vs creative the model can be) +- Change Max Tokens (how long the model's response can be) +- Define Top-K Chunks (how many document chunks your retriever returns for the LLM to use as context) +- Add extra instructions +- Use Few Shots inference (add your own example Q&A for the model to understand what it needs to do) +- Visualize a classification of the intent behind the question asked : informational, procedural, compliance, safety_critical, other +- Show Debug (see retrieved documents) + + +# Going further - potential improvements + +- Make the app agentic, leveraging function calling (calling 911, triggering OCR if the input is a picture of the problem) +- Tune the chunking and data preparation +- Incident classification and analytics +- Hybrid search + + +