knowledge-rag/config.example.yaml at master · lyonzin/knowledge-rag · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# ╔══════════════════════════════════════════════════════════════════════════════╗
# ║                                                                              ║
# ║                    KNOWLEDGE RAG — Configuration File                        ║
# ║                                                                              ║
# ╚══════════════════════════════════════════════════════════════════════════════╝
#
# This file controls how Knowledge RAG indexes, searches, and organizes
# your documents. Every field is optional — sensible defaults are used
# when omitted.
#
# Quick start:
#   1. Copy this file → config.yaml
#   2. Set documents_dir to your folder
#   3. Run the server — everything else works out of the box
#
# Want a head start? Check the presets/ folder for domain-specific configs:
#   presets/cybersecurity.yaml  — Offensive/defensive security, CTFs, threat hunting
#   presets/developer.yaml      — Software engineering, APIs, DevOps
#   presets/research.yaml       — Academic research, papers, studies
#   presets/general.yaml        — Blank slate, zero domain logic
#
# Usage:  cp presets/developer.yaml config.yaml
#
# Need help? https://github.com/lyonzin/knowledge-rag


# ============================================================================
# PATHS
# ============================================================================
# Where your data lives. Relative paths resolve from this file's location.
# Absolute paths work too (e.g., /home/user/my-docs or C:\Users\you\docs).

paths:
  # Folder containing your documents (scanned recursively).
  # This is the only path most users need to change.
  documents_dir: "./documents"

  # Internal storage for indexes and embeddings.
  # You generally don't need to touch this.
  data_dir: "./data"


# ============================================================================
# DOCUMENTS
# ============================================================================
# Control which files get indexed and how they're processed.

documents:
  # File types to index. Only files with these extensions are processed.
  # Remove or comment out formats you don't need.
  supported_formats:
    - .md        # Markdown
    - .txt       # Plain text
    - .pdf       # PDF documents
    - .docx      # Word documents
    # - .xlsx    # Excel spreadsheets
    # - .pptx    # PowerPoint presentations
    # - .csv     # CSV data files
    # - .py      # Python source code
    # - .json    # JSON files

  # How documents are split into searchable chunks.
  #
  #   chunk_size    → Max characters per chunk. Larger = more context per
  #                   result, but less precise. Smaller = more precise,
  #                   but may lose context.
  #
  #   chunk_overlap → Characters shared between consecutive chunks.
  #                   Prevents information from being cut at chunk boundaries.
  #
  # Recommended presets:
  #   Short notes/snippets:  chunk_size: 500,  chunk_overlap: 100
  #   General use (default): chunk_size: 1000, chunk_overlap: 200
  #   Long technical docs:   chunk_size: 1500, chunk_overlap: 300
  chunking:
    chunk_size: 1000
    chunk_overlap: 200


# ============================================================================
# MODELS
# ============================================================================
# AI models used for understanding and ranking your documents.
# Defaults work well for most cases — only change if you know what you're doing.

models:
  # Embedding model — converts text into vectors for semantic search.
  # Runs locally via ONNX (no API key needed, no data leaves your machine).
  #
  # Options (from fastest to most accurate):
  #   "BAAI/bge-small-en-v1.5"    → 384 dims, ~33MB, fast    (default)
  #   "BAAI/bge-base-en-v1.5"     → 768 dims, ~130MB, balanced
  #   "BAAI/bge-large-en-v1.5"    → 1024 dims, ~335MB, most accurate
  #
  # Multilingual alternatives:
  #   "BAAI/bge-small-zh-v1.5"         → Chinese
  #   "intfloat/multilingual-e5-small"  → 100+ languages
  #
  # WARNING: Changing this after indexing requires a full reindex.
  embedding:
    model: "BAAI/bge-small-en-v1.5"
    dimensions: 384

  # Reranker — re-scores search results for better accuracy.
  # Adds ~50-100ms per query but significantly improves result quality.
  # Set enabled: false on low-resource machines if search feels slow.
  reranker:
    enabled: true
    model: "Xenova/ms-marco-MiniLM-L-6-v2"
    # How many extra candidates to fetch before reranking.
    # Higher = better accuracy but slower. 3 is a good balance.
    top_k_multiplier: 3


# ============================================================================
# SEARCH
# ============================================================================
# Control search behavior and result limits.

search:
  # Number of results returned by default when no limit is specified.
  default_results: 5

  # Hard maximum — even if a client requests more, cap at this number.
  max_results: 20

  # ChromaDB collection name. Change this to maintain separate
  # knowledge bases in the same data directory.
  # Example: "work_kb", "research_kb", "personal_kb"
  collection_name: "knowledge_base"


# ============================================================================
# CATEGORIES
# ============================================================================
# Organize documents into categories based on their folder path.
# When a document's path contains a key below, it gets tagged with
# that category — enabling filtered searches.
#
# How it works:
#   A file at documents/recipes/italian/pasta.md
#   matches the pattern "recipes/italian" → category "italian"
#   and also matches "recipes" → category "cooking"
#   (most specific match wins)
#
# Set to empty {} to disable auto-categorization entirely.
# Documents will still be searchable — just without category filters.
#
# ┌─────────────────────────────────────────────────────────────────────┐
# │  path pattern (in documents_dir)    │    category name              │
# ├─────────────────────────────────────┼───────────────────────────────┤
# │  "recipes/italian"                  │    "italian"                  │
# │  "recipes"                          │    "cooking"                  │
# │  "work/projects"                    │    "projects"                 │
# │  "journal"                          │    "personal"                 │
# └─────────────────────────────────────┴───────────────────────────────┘

category_mappings: {}


# ============================================================================
# KEYWORD ROUTING
# ============================================================================
# Route search queries to specific categories based on keywords.
# When a query contains any of these keywords, results from that
# category are prioritized.
#
# This is a PERFORMANCE optimization, not a filter — results from
# other categories still appear, just ranked lower.
#
# Set to empty {} for pure semantic search with no keyword bias.

keyword_routes: {}


# ============================================================================
# QUERY EXPANSION
# ============================================================================
# Expand search terms with synonyms and abbreviations.
# When someone searches for "JS", also search for "JavaScript".
# This improves BM25 (keyword) recall without affecting semantic search.
#
# Format:
#   term:
#     - synonym1
#     - synonym2
#
# Set to empty {} for no expansion (search terms used as-is).

query_expansions: {}


# ============================================================================
# ADVANCED
# ============================================================================
# Settings for power users. Most people never need to touch these.

# advanced:
#   # File watcher — auto-reindex when documents change.
#   # Disable if you prefer manual reindexing only.
#   watch_for_changes: true
#   watch_debounce_seconds: 5
#
#   # BM25 parameters (keyword search tuning).
#   # k1: term frequency saturation (1.2-2.0). Higher = more weight on repeated terms.
#   # b:  length normalization (0.0-1.0). Higher = penalizes long documents more.
#   # bm25_k1: 1.5
#   # bm25_b: 0.75
#
#   # Logging verbosity: DEBUG, INFO, WARNING, ERROR
#   # log_level: "INFO"