-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path8762395.html
More file actions
535 lines (423 loc) · 63.4 KB
/
8762395.html
File metadata and controls
535 lines (423 loc) · 63.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222"><meta name="generator" content="Hexo 6.3.0">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon.png">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon.png">
<link rel="mask-icon" href="/images/logo.svg" color="#222">
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" integrity="sha256-HtsXJanqjKTc8vVQjO4YMhiqFoXkfBsjBWcX91T1jr8=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
<script class="next-config" data-name="main" type="application/json">{"hostname":"tallate.github.io","root":"/","images":"/images","scheme":"Gemini","darkmode":false,"version":"8.18.0","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":{"enable":false,"style":null},"fold":{"enable":false,"height":500},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"menu_item":"fadeInDown","post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"prism":false,"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果:${query}","hits_time":"找到 ${hits} 个搜索结果(用时 ${time} 毫秒)","hits":"找到 ${hits} 个搜索结果"}}</script><script src="/js/config.js"></script>
<meta name="description" content="基于 Lucene,ES 实现了分布式的索引管理,这篇文档分析单机视角下的索引原理。 [x] ES如何保证搜索的近实时(1秒后被搜到)[x] 为什么删除文档,不会立刻释放空间">
<meta property="og:type" content="article">
<meta property="og:title" content="ES2_1索引原理">
<meta property="og:url" content="https://tallate.github.io/8762395.html">
<meta property="og:site_name" content="Tallate">
<meta property="og:description" content="基于 Lucene,ES 实现了分布式的索引管理,这篇文档分析单机视角下的索引原理。 [x] ES如何保证搜索的近实时(1秒后被搜到)[x] 为什么删除文档,不会立刻释放空间">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E4%B8%80%E4%B8%AALucene%E7%B4%A2%E5%BC%95%E5%8C%85%E5%90%AB%E4%B8%80%E4%B8%AA%E6%8F%90%E4%BA%A4%E7%82%B9%E5%92%8C%E4%B8%89%E4%B8%AA%E6%AE%B5.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E6%96%B0%E6%96%87%E6%A1%A3%E8%A2%AB%E6%B7%BB%E5%8A%A0%E5%88%B0%E7%BC%93%E5%AD%98.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E7%BC%93%E5%86%B2%E5%8C%BA%E8%A2%AB%E5%86%99%E5%85%A5%E6%AE%B5%E4%BD%86%E6%9C%AA%E5%AE%8C%E6%88%90%E6%8F%90%E4%BA%A4.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E6%96%B0%E7%9A%84%E6%96%87%E6%A1%A3%E8%A2%AB%E6%B7%BB%E5%8A%A0%E5%88%B0%E5%86%85%E5%AD%98%E7%BC%93%E5%86%B2%E5%8C%BA%E5%B9%B6%E4%B8%94%E8%A2%AB%E8%BF%BD%E5%8A%A0%E5%88%B0%E4%BA%86%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/Refresh%E5%AE%8C%E6%88%90%E5%90%8E%E7%BC%93%E5%AD%98%E8%A2%AB%E6%B8%85%E7%A9%BA%E4%BD%86%E6%98%AF%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E4%B8%8D%E4%BC%9A.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E4%B8%8D%E6%96%AD%E7%A7%AF%E7%B4%AF%E6%96%87%E6%A1%A3.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/Flush%E4%B9%8B%E5%90%8E%E6%AE%B5%E8%A2%AB%E5%85%A8%E9%87%8F%E6%8F%90%E4%BA%A4%E5%B9%B6%E4%B8%94%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E8%A2%AB%E6%B8%85%E7%A9%BA.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E6%8F%90%E4%BA%A4%E5%90%8E%E7%94%9F%E6%88%90%E6%96%B0%E6%AE%B5%E4%B8%94%E7%BC%93%E5%AD%98%E8%A2%AB%E6%B8%85%E7%A9%BA.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E4%B8%A4%E4%B8%AA%E6%8F%90%E4%BA%A4%E4%BA%86%E7%9A%84%E6%AE%B5%E5%92%8C%E4%B8%80%E4%B8%AA%E6%9C%AA%E6%8F%90%E4%BA%A4%E7%9A%84%E6%AE%B5%E8%A2%AB%E5%90%88%E5%B9%B6%E5%88%B0%E4%B8%80%E4%B8%AA%E6%9B%B4%E5%A4%A7%E7%9A%84%E6%AE%B5.png">
<meta property="og:image" content="https://tallate.github.io/imgs/ES/%E5%90%88%E5%B9%B6%E7%BB%93%E6%9D%9F%E5%90%8E%E8%80%81%E7%9A%84%E6%AE%B5%E8%A2%AB%E5%88%A0%E9%99%A4.png">
<meta property="article:published_time" content="2019-07-29T12:14:29.000Z">
<meta property="article:modified_time" content="2025-07-06T17:56:20.858Z">
<meta property="article:author" content="tallate">
<meta property="article:tag" content="ElasticSearch">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://tallate.github.io/imgs/ES/%E4%B8%80%E4%B8%AALucene%E7%B4%A2%E5%BC%95%E5%8C%85%E5%90%AB%E4%B8%80%E4%B8%AA%E6%8F%90%E4%BA%A4%E7%82%B9%E5%92%8C%E4%B8%89%E4%B8%AA%E6%AE%B5.png">
<link rel="canonical" href="https://tallate.github.io/8762395.html">
<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"zh-CN","comments":true,"permalink":"https://tallate.github.io/8762395.html","path":"/8762395.html","title":"ES2_1索引原理"}</script>
<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>ES2_1索引原理 | Tallate</title>
<noscript>
<link rel="stylesheet" href="/css/noscript.css">
</noscript>
</head>
<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
<div class="headband"></div>
<main class="main">
<div class="column">
<header class="header" itemscope itemtype="http://schema.org/WPHeader"><div class="site-brand-container">
<div class="site-nav-toggle">
<div class="toggle" aria-label="切换导航栏" role="button">
<span class="toggle-line"></span>
<span class="toggle-line"></span>
<span class="toggle-line"></span>
</div>
</div>
<div class="site-meta">
<a href="/" class="brand" rel="start">
<i class="logo-line"></i>
<p class="site-title">Tallate</p>
<i class="logo-line"></i>
</a>
<p class="site-subtitle" itemprop="description">该吃吃该喝喝 啥事别往心里搁</p>
</div>
<div class="site-nav-right">
<div class="toggle popup-trigger" aria-label="搜索" role="button">
<i class="fa fa-search fa-fw fa-lg"></i>
</div>
</div>
</div>
<nav class="site-nav">
<ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="home fa-fw"></i>首页</a></li><li class="menu-item menu-item-about"><a href="/about/" rel="section"><i class="user fa-fw"></i>关于</a></li><li class="menu-item menu-item-tags"><a href="/tags/" rel="section"><i class="tags fa-fw"></i>标签<span class="badge">84</span></a></li><li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="th fa-fw"></i>分类<span class="badge">25</span></a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="archive fa-fw"></i>归档<span class="badge">192</span></a></li>
<li class="menu-item menu-item-search">
<a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
</a>
</li>
</ul>
</nav>
<div class="search-pop-overlay">
<div class="popup search-popup"><div class="search-header">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<div class="search-input-container">
<input autocomplete="off" autocapitalize="off" maxlength="80"
placeholder="搜索..." spellcheck="false"
type="search" class="search-input">
</div>
<span class="popup-btn-close" role="button">
<i class="fa fa-times-circle"></i>
</span>
</div>
<div class="search-result-container no-result">
<div class="search-result-icon">
<i class="fa fa-spinner fa-pulse fa-5x"></i>
</div>
</div>
</div>
</div>
</header>
<aside class="sidebar">
<div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
<ul class="sidebar-nav">
<li class="sidebar-nav-toc">
文章目录
</li>
<li class="sidebar-nav-overview">
站点概览
</li>
</ul>
<div class="sidebar-panel-container">
<!--noindex-->
<div class="post-toc-wrap sidebar-panel">
<div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E7%B4%A2%E5%BC%95%E7%AD%96%E7%95%A5"><span class="nav-number">1.</span> <span class="nav-text">索引策略</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%9C%A8-Elasticsearch-%E4%B8%AD%EF%BC%8C-%E6%AF%8F%E4%B8%AA%E5%AD%97%E6%AE%B5%E7%9A%84%E6%89%80%E6%9C%89%E6%95%B0%E6%8D%AE-%E9%83%BD%E6%98%AF-%E9%BB%98%E8%AE%A4%E8%A2%AB%E7%B4%A2%E5%BC%95%E7%9A%84-%E3%80%82-%E5%8D%B3%E6%AF%8F%E4%B8%AA%E5%AD%97%E6%AE%B5%E9%83%BD%E6%9C%89%E4%B8%BA%E4%BA%86%E5%BF%AB%E9%80%9F%E6%A3%80%E7%B4%A2%E8%AE%BE%E7%BD%AE%E7%9A%84%E4%B8%93%E7%94%A8%E5%80%92%E6%8E%92%E7%B4%A2%E5%BC%95%E3%80%82%E5%80%92%E6%8E%92%E7%B4%A2%E5%BC%95%E7%94%B1%E4%B8%80%E4%BA%9B%E8%AF%8D%E9%A1%B9%E7%BB%84%E6%88%90%EF%BC%8C%E6%AF%8F%E4%B8%AA%E8%AF%8D%E9%A1%B9%E5%8C%85%E5%90%AB%E4%BA%86%E5%AE%83%E6%89%80%E6%9C%89%E6%9B%BE%E5%87%BA%E7%8E%B0%E8%BF%87%E7%9A%84%E6%96%87%E6%A1%A3%E7%9A%84%E5%88%97%E8%A1%A8%E3%80%82Term-Doc-1-Doc-2-Doc-3"><span class="nav-number">2.</span> <span class="nav-text">在 Elasticsearch 中, 每个字段的所有数据 都是 默认被索引的 。 即每个字段都有为了快速检索设置的专用倒排索引。倒排索引由一些词项组成,每个词项包含了它所有曾出现过的文档的列表。Term | Doc 1 | Doc 2 | Doc 3 | </span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%80%92%E6%8E%92%E7%B4%A2%E5%BC%95%E7%9A%84%E4%B8%8D%E5%8F%98%E6%80%A7"><span class="nav-number">3.</span> <span class="nav-text">倒排索引的不变性</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Segment%EF%BC%88%E6%AE%B5%EF%BC%89%E5%92%8CCommit-Point%EF%BC%88%E6%8F%90%E4%BA%A4%E7%82%B9%EF%BC%89"><span class="nav-number">4.</span> <span class="nav-text">Segment(段)和Commit Point(提交点)</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E6%96%B0%E6%96%87%E6%A1%A3%E7%9A%84%E7%B4%A2%E5%BC%95%E6%B5%81%E7%A8%8B"><span class="nav-number"></span> <span class="nav-text">新文档的索引流程</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" 
href="#%E5%87%86%E5%AE%9E%E6%97%B6%E6%90%9C%E7%B4%A2%E4%B8%8E%E5%88%B7%E6%96%B0%E7%AD%96%E7%95%A5"><span class="nav-number">1.</span> <span class="nav-text">准实时搜索与刷新策略</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Refresh-%E5%8A%A8%E6%80%81%E6%9B%B4%E6%96%B0%E7%B4%A2%E5%BC%95"><span class="nav-number">2.</span> <span class="nav-text">Refresh - 动态更新索引</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#Searcher"><span class="nav-number">2.1.</span> <span class="nav-text">Searcher</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E5%BC%BA%E5%88%B6%E5%88%B7%E6%96%B0"><span class="nav-number">2.2.</span> <span class="nav-text">强制刷新</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E9%85%8D%E7%BD%AE%E5%88%B7%E6%96%B0%E7%AD%96%E7%95%A5"><span class="nav-number">2.3.</span> <span class="nav-text">配置刷新策略</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Transaction-Log%EF%BC%88%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%EF%BC%89-Refresh%E4%BF%9D%E8%AF%81%E5%87%86%E5%AE%9E%E6%97%B6%E6%90%9C%E7%B4%A2%E7%9A%84%E5%90%8C%E6%97%B6%E4%BF%9D%E8%AF%81%E6%95%B0%E6%8D%AE%E4%B8%8D%E4%B8%A2"><span class="nav-number">3.</span> <span class="nav-text">Transaction Log(事务日志) - Refresh保证准实时搜索的同时保证数据不丢</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#Lucene-%E4%B8%8D%E8%83%BD%E4%BF%9D%E8%AF%81%E7%B4%A2%E5%BC%95%E6%95%B0%E6%8D%AE%E4%B8%8D%E4%B8%A2%E5%A4%B1"><span class="nav-number">3.1.</span> <span class="nav-text">Lucene 不能保证索引数据不丢失</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Transaction-Log%E5%86%99%E5%85%A5%E6%B5%81%E7%A8%8B"><span class="nav-number">3.2.</span> <span class="nav-text">Transaction Log写入流程</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" 
href="#%E4%BD%BF%E7%94%A8%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E8%AE%B0%E5%BD%95%E6%9C%AA%E6%8F%90%E4%BA%A4%E4%BA%8B%E5%8A%A1"><span class="nav-number">3.3.</span> <span class="nav-text">使用事务日志记录未提交事务</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E6%89%8B%E5%8A%A8%E6%89%A7%E8%A1%8C%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E5%88%B7%E6%96%B0"><span class="nav-number">3.4.</span> <span class="nav-text">手动执行事务日志刷新</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E5%BC%82%E6%AD%A5-fsync"><span class="nav-number">3.5.</span> <span class="nav-text">异步 fsync</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E9%85%8D%E7%BD%AE"><span class="nav-number">3.6.</span> <span class="nav-text">配置</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Flush-Commit"><span class="nav-number">4.</span> <span class="nav-text">Flush - Commit</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Merge-%E6%AE%B5%E5%90%88%E5%B9%B6"><span class="nav-number">5.</span> <span class="nav-text">Merge - 段合并</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#%E6%AE%B5%E5%90%88%E5%B9%B6%E6%B5%81%E7%A8%8B"><span class="nav-number">5.1.</span> <span class="nav-text">段合并流程</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#optimize-API"><span class="nav-number">5.2.</span> <span class="nav-text">optimize API</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%88%A0%E9%99%A4%E5%92%8C%E6%9B%B4%E6%96%B0%E7%B4%A2%E5%BC%95"><span class="nav-number">6.</span> <span class="nav-text">删除和更新索引</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E9%87%8D%E5%BB%BA%E7%B4%A2%E5%BC%95"><span class="nav-number">7.</span> <span class="nav-text">重建索引</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#Update-By-Query"><span 
class="nav-number">7.1.</span> <span class="nav-text">Update By Query</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Reindex"><span class="nav-number">7.2.</span> <span class="nav-text">Reindex</span></a></li></ol></li></ol></div>
</div>
<!--/noindex-->
<div class="site-overview-wrap sidebar-panel">
<div class="site-author animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
<p class="site-author-name" itemprop="name">tallate</p>
<div class="site-description" itemprop="description"></div>
</div>
<div class="site-state-wrap animated">
<nav class="site-state">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">192</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/">
<span class="site-state-item-count">25</span>
<span class="site-state-item-name">分类</span></a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/">
<span class="site-state-item-count">84</span>
<span class="site-state-item-name">标签</span></a>
</div>
</nav>
</div>
</div>
</div>
</div>
</aside>
</div>
<div class="main-inner post posts-expand">
<div class="post-block">
<article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="https://tallate.github.io/8762395.html">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar.gif">
<meta itemprop="name" content="tallate">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Tallate">
<meta itemprop="description" content="">
</span>
<span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
<meta itemprop="name" content="ES2_1索引原理 | Tallate">
<meta itemprop="description" content="">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
ES2_1索引原理
</h1>
<div class="post-meta-container">
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2019-07-29 20:14:29" itemprop="dateCreated datePublished" datetime="2019-07-29T20:14:29+08:00">2019-07-29</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar-check"></i>
</span>
<span class="post-meta-item-text">更新于</span>
<time title="修改时间:2025-07-07 01:56:20" itemprop="dateModified" datetime="2025-07-07T01:56:20+08:00">2025-07-07</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-folder"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/ElasticSearch/" itemprop="url" rel="index"><span itemprop="name">ElasticSearch</span></a>
</span>
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody"><p>基于 Lucene,ES 实现了分布式的索引管理,这篇文档分析单机视角下的索引原理。</p>
<p>[x] ES如何保证搜索的近实时(1秒后被搜到)<br>[x] 为什么删除文档,不会立刻释放空间</p>
<span id="more"></span>
<h2 id="索引策略"><a href="#索引策略" class="headerlink" title="索引策略"></a>索引策略</h2><h2 id="在-Elasticsearch-中,-每个字段的所有数据-都是-默认被索引的-。-即每个字段都有为了快速检索设置的专用倒排索引。倒排索引由一些词项组成,每个词项包含了它所有曾出现过的文档的列表。Term-Doc-1-Doc-2-Doc-3"><a href="#在-Elasticsearch-中,-每个字段的所有数据-都是-默认被索引的-。-即每个字段都有为了快速检索设置的专用倒排索引。倒排索引由一些词项组成,每个词项包含了它所有曾出现过的文档的列表。Term-Doc-1-Doc-2-Doc-3" class="headerlink" title="在 Elasticsearch 中, 每个字段的所有数据 都是 默认被索引的 。 即每个字段都有为了快速检索设置的专用倒排索引。倒排索引由一些词项组成,每个词项包含了它所有曾出现过的文档的列表。Term | Doc 1 | Doc 2 | Doc 3 | "></a>在 Elasticsearch 中, <strong>每个字段的所有数据 都是 默认被索引的</strong> 。 即每个字段都有为了快速检索设置的专用<strong>倒排索引</strong>。<br>倒排索引由一些词项组成,每个词项包含了它所有曾出现过的文档的列表。<br>Term | Doc 1 | Doc 2 | Doc 3 | </h2><p>brown | X | | X | …<br>fox | X | X | X | …<br>quick | X | X | | …<br>the | X | | X | …</p>
<p>另外,这个倒排索引相比特定词项出现过的文档列表,会包含更多其它信息。它会保存每一个词项出现过的文档总数, 在对应的文档中一个具体词项出现的总次数,词项在文档中的顺序,每个文档的长度,所有文档的平均长度,等等。这些统计信息允许 Elasticsearch 决定哪些词比其它词更重要,哪些文档比其它文档更重要,用于搜索时计算文档的相关性。</p>
<h2 id="倒排索引的不变性"><a href="#倒排索引的不变性" class="headerlink" title="倒排索引的不变性"></a>倒排索引的不变性</h2><p>倒排索引被写入磁盘后是 <strong>不可改变</strong> 的:它永远不会被修改。 不变性有重要的价值:</p>
<ul>
<li>无需考虑并发写文件问题,不需要锁,因此也避免了锁机制带来的性能问题。</li>
<li>由于其不变性,索引一旦被读入内核的文件系统缓存,便会一直留在那里。只要文件系统缓存中还有足够的空间,那么大部分读请求会直接请求内存,而不会命中磁盘。这提供了很大的性能提升。</li>
<li>其它缓存(像 filter 缓存),在索引的生命周期内始终有效。它们不需要在每次数据改变时被重建,因为数据不会变化。</li>
<li>写入单个大的倒排索引允许数据被压缩,减少磁盘 I/O 和 需要被缓存到内存的索引的使用量。</li>
</ul>
<p>当然,一个不变的索引也有不好的地方:</p>
<ul>
<li>由于不变性,你不能修改它。如果你需要让一个新的文档可被搜索,你<strong>需要重建整个索引</strong>。这对一个索引能包含的数据量和可被更新的频率都造成很大限制。</li>
</ul>
<h2 id="Segment(段)和Commit-Point(提交点)"><a href="#Segment(段)和Commit-Point(提交点)" class="headerlink" title="Segment(段)和Commit Point(提交点)"></a>Segment(段)和Commit Point(提交点)</h2><p><img src="/imgs/ES/%E4%B8%80%E4%B8%AALucene%E7%B4%A2%E5%BC%95%E5%8C%85%E5%90%AB%E4%B8%80%E4%B8%AA%E6%8F%90%E4%BA%A4%E7%82%B9%E5%92%8C%E4%B8%89%E4%B8%AA%E6%AE%B5.png" alt="一个Lucene索引包含一个提交点和三个段" title="一个Lucene索引包含一个提交点和三个段"></p>
<ul>
<li>每一个Segment本身都是一个倒排索引,索引在 Lucene 中表示所有段的集合。<blockquote>
<p>一个 Lucene 索引在 Elasticsearch 中被称作分片,一个 Elasticsearch 索引是分片的集合,当 Elasticsearch 在索引中搜索的时候,会发送查询请求到每一个属于该索引的分片,然后合并每个分片的结果到一个全局的结果集中。</p>
</blockquote>
</li>
<li>当有新文档写入时,会生成新Segment,查询时会同时查询所有Segments,并对结果汇总。</li>
<li>Commit Point是一个列出了所有已知Segments的文件<br>Elasticsearch 在启动或重新打开一个索引的过程中使用这个Commit Point来判断哪些段隶属于当前分片。</li>
<li>删除的文档信息,保存在“.del”文件中。</li>
</ul>
<h1 id="新文档的索引流程"><a href="#新文档的索引流程" class="headerlink" title="新文档的索引流程"></a>新文档的索引流程</h1><h2 id="准实时搜索与刷新策略"><a href="#准实时搜索与刷新策略" class="headerlink" title="准实时搜索与刷新策略"></a>准实时搜索与刷新策略</h2><p>对一个文档进行更新操作后可能会发现属性还是旧的值,我们称之为<strong>准实时现象</strong>:更新的数据还存在于内存中、还未刷新到磁盘上。</p>
<ul>
<li>在索引期新文档会写入<strong>Segment</strong>,这些Segment是独立的,这意味着查询是可以与索引并行的,只是不时会有新增的索引段被添加到可被搜索的索引段集合之中。</li>
<li>Lucene 通过创建后续的(基于索引只写一次的特性)segments_N 文件来实现此功能,且该文件列举了索引中的索引段。这个过程称为<strong>提交(Commit)</strong>,Lucene 以一种安全的方式来执行该操作,能确保索引更改以原子操作方式写入索引,即便有错误发生,也能保证索引数据的一致性。</li>
</ul>
<p>随着按段搜索(per-segment)的发展,一个新的文档从索引到可被搜索的延迟显著降低,新文档在几分钟内即可被检索,但是这个速度还是不够快。磁盘在这里成为了瓶颈,提交(Committing)一个新的段到磁盘需要一个 <code>fsync</code> 来确保段被物理性地写入磁盘,这样在断电的时候就不会丢失数据。 但是 <code>fsync</code> 操作代价很大,如果每次索引一个文档都去执行一次的话会造成很大的性能问题。<br>在 Lucene 中提交后,内存索引缓冲区中的文档会被写入到一个新的段中,但是这里新段会被先写入到文件系统缓存——这一步代价会比较低,稍后再被刷新到磁盘——这一步代价比较高,不过只要文件已经在缓存中就可以像其他文件一样被打开和读取了。</p>
<h2 id="Refresh-动态更新索引"><a href="#Refresh-动态更新索引" class="headerlink" title="Refresh - 动态更新索引"></a>Refresh - 动态更新索引</h2><p><img src="/imgs/ES/%E6%96%B0%E6%96%87%E6%A1%A3%E8%A2%AB%E6%B7%BB%E5%8A%A0%E5%88%B0%E7%BC%93%E5%AD%98.png" alt="新文档被添加到缓存" title="新文档被添加到缓存"><br>ES中新建的文档会先被写入到Index Buffer。<br><img src="/imgs/ES/%E7%BC%93%E5%86%B2%E5%8C%BA%E8%A2%AB%E5%86%99%E5%85%A5%E6%AE%B5%E4%BD%86%E6%9C%AA%E5%AE%8C%E6%88%90%E6%8F%90%E4%BA%A4.png" alt="缓冲区被写入段但未完成提交" title="缓冲区被写入段但未完成提交"><br>将Index Buffer写入Segment的过程叫<strong>Refresh</strong>,注意<strong>Refresh不执行fsync操作</strong>,此时还未被刷到磁盘。</p>
<ul>
<li>Refresh<strong>默认1次/秒</strong>,可通过<strong>index.refresh_interval</strong>配置。</li>
<li>Index Buffer被占满时,也会触发Refresh,Index Buffer大小的默认值是JVM堆内存的10%。</li>
<li>Refresh后,数据就可以被搜索到了,这也是为什么Elasticsearch被称为<strong>近实时搜索</strong>。</li>
<li>如果系统有大量的数据写入,就会产生很多的Segment。</li>
</ul>
<h3 id="Searcher"><a href="#Searcher" class="headerlink" title="Searcher"></a>Searcher</h3><p>Lucene 使用了一个叫作<strong>Searcher</strong>的抽象类来执行索引的读取,如果索引更新提交了,但 Searcher 实例并没有重新打开,那么它觉察不到新索引段的加入。写入和 Searcher 重新打开新段的过程叫作<strong>刷新(Refresh)</strong>。出于性能考虑,Lucene 推迟了耗时的刷新,因此它不会在每次新增一个文档(或批量增加文档)的时候刷新,但 Searcher 会<strong>默认每秒刷新一次</strong>。这就是为什么我们说 Elasticsearch 是 近 实时搜索: 文档的变化并不是立即对搜索可见,但会在一秒之内变为可见。<br>因此新索引的数据找不到可能有以下两个原因:</p>
<ol>
<li>可能还未执行提交 commit 操作</li>
<li>Searcher 未重新打开执行刷新</li>
</ol>
<h3 id="强制刷新"><a href="#强制刷新" class="headerlink" title="强制刷新"></a>强制刷新</h3><p>如果有必要执行强制刷新,可以使用下面的命令:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line"># 刷新所有索引</span><br><span class="line">POST /_refresh</span><br><span class="line"># 只刷新一个索引</span><br><span class="line">POST /my_index/_refresh</span><br></pre></td></tr></table></figure>
<h3 id="配置刷新策略"><a href="#配置刷新策略" class="headerlink" title="配置刷新策略"></a>配置刷新策略</h3><p>可以更改 ElasticSearch 配置文件中的 index.refresh_interval,或者使用下面的命令来修改自动刷新时间:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br></pre></td><td class="code"><pre><span class="line">PUT /my_index/_settings</span><br><span class="line">{</span><br><span class="line"> "index": {</span><br><span class="line"> "refresh_interval": "5m"</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line">PUT /my_index</span><br><span class="line">{</span><br><span class="line"> "settings": {</span><br><span class="line"> "refresh_interval": "30s" </span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>刷新操作是很耗资源的,因此刷新间隔时间越长,索引速度越快。如果需要长时间高速建索引、或建一个比较大的新索引,并且在建索引结束之前暂不执行查询,那么可以考虑将 index.refresh_interval 参数值设置为-1,然后在建索引结束以后再将该参数恢复为初始值。</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line"># 关闭自动刷新</span><br><span class="line">PUT /my_logs/_settings</span><br><span class="line">{</span><br><span class="line"> "refresh_interval": -1</span><br><span class="line">}</span><br><span class="line"># 每秒自动刷新</span><br><span class="line">PUT /my_logs/_settings</span><br><span class="line">{</span><br><span class="line"> "refresh_interval": "1s"</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<blockquote>
<p>注意 refresh_interval 的单位,设置为 1 实际上表示的是 1 毫秒,这显然会导致集群陷入瘫痪。<br>尽管刷新是比提交轻量很多的操作,它还是会有性能开销。 当写测试的时候, 手动刷新很有用,但是不要在生产环境下每次索引一个文档都去手动刷新。 相反,你的应用需要意识到 Elasticsearch 的近实时的性质,并接受它的不足。</p>
</blockquote>
<h2 id="Transaction-Log(事务日志)-Refresh保证准实时搜索的同时保证数据不丢"><a href="#Transaction-Log(事务日志)-Refresh保证准实时搜索的同时保证数据不丢" class="headerlink" title="Transaction Log(事务日志) - Refresh保证准实时搜索的同时保证数据不丢"></a>Transaction Log(事务日志) - Refresh保证准实时搜索的同时保证数据不丢</h2><h3 id="Lucene-不能保证索引数据不丢失"><a href="#Lucene-不能保证索引数据不丢失" class="headerlink" title="Lucene 不能保证索引数据不丢失"></a>Lucene 不能保证索引数据不丢失</h3><p>Lucene 能保证索引的一致性,但是这并不能保证当往索引中写数据(fsync)失败时不会损失数据(如磁盘空间不足、设备损坏,或没有足够的文件句柄供索引文件使用)。<br>另外,频繁提交操作会导致严重的性能问题(因为每提交一次就会触发一个索引段的创建操作,同时也可能触发索引段的合并)。<br>即使通过每秒刷新(Refresh)实现了近实时搜索,我们仍然需要经常进行完整提交来确保能从失败中恢复。但在两次提交之间发生变化的文档怎么办?我们也不希望丢失掉这些数据。</p>
<h3 id="Transaction-Log写入流程"><a href="#Transaction-Log写入流程" class="headerlink" title="Transaction Log写入流程"></a>Transaction Log写入流程</h3><p>如Refresh流程所述,Refresh过程并不会立刻将Segment刷新到磁盘,而是先写入缓存并开放查询。<br>为了保证数据不丢,在Index文档时,ES会同时写<strong>Transaction Log</strong>。</p>
<ul>
<li>高版本开始,Transaction Log默认落盘;</li>
<li>每个分片有一个Transaction Log;</li>
<li>在ES Refresh时,Index Buffer会被清空,但是Transaction Log不会清空</li>
<li>如果发生断电等情况,未落盘的Segment数据会被清空,ES会使用TransactionLog中的数据恢复。</li>
</ul>
<h3 id="使用事务日志记录未提交事务"><a href="#使用事务日志记录未提交事务" class="headerlink" title="使用事务日志记录未提交事务"></a>使用事务日志记录未提交事务</h3><p>Elasticsearch 增加了一个 <code>translog</code> ,或者叫事务日志,在每一次对 Elasticsearch 进行操作时均进行了日志记录。<br>ElasticSearch 通过使用<code>translog</code>保存所有的未提交的事务,而 ElasticSearch 会不时创建一个新的日志文件用于记录每个事务的后续操作。当有错误发生时,就会检查事务日志,必要时会再次执行某些操作,以确保没有丢失任何更改信息。而且,事务日志的相关操作都是自动完成的,用户并不会意识到某个特定时刻触发的更新提交。事务日志中的信息与存储介质之间的同步(同时清空事务日志)称为事务日志刷新(<code>Flush</code>),Flush 操作会截断 translog。<br>注意事务日志刷新与 Searcher 刷新的区别。大多数情况下,Searcher 刷新是你所期望的,即搜索到最新的文档。而事务日志刷新用来确保数据正确写入了索引并清空了事务日志。</p>
<p>通过<code>translog</code>,整个流程看起来是下面这样:</p>
<ol>
<li>一个文档被索引之后,就会被添加到内存缓冲区,并且 追加到了 translog;<br><img src="/imgs/ES/%E6%96%B0%E7%9A%84%E6%96%87%E6%A1%A3%E8%A2%AB%E6%B7%BB%E5%8A%A0%E5%88%B0%E5%86%85%E5%AD%98%E7%BC%93%E5%86%B2%E5%8C%BA%E5%B9%B6%E4%B8%94%E8%A2%AB%E8%BF%BD%E5%8A%A0%E5%88%B0%E4%BA%86%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97.png" alt="新的文档被添加到内存缓冲区并且被追加到了事务日志" title="新的文档被添加到内存缓冲区并且被追加到了事务日志"></li>
<li>分片每秒被刷新(refresh)一次:<ul>
<li>这些在内存缓冲区的文档被写入到一个新的段中,且没有进行 fsync 操作。</li>
<li>这个段被打开,使其可被搜索。</li>
<li>内存缓冲区被清空。<br><img src="/imgs/ES/Refresh%E5%AE%8C%E6%88%90%E5%90%8E%E7%BC%93%E5%AD%98%E8%A2%AB%E6%B8%85%E7%A9%BA%E4%BD%86%E6%98%AF%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E4%B8%8D%E4%BC%9A.png" alt="Refresh完成后缓存被清空但是事务日志不会" title="Refresh完成后缓存被清空但是事务日志不会"></li>
</ul>
</li>
<li>这个进程继续工作,更多的文档被添加到内存缓冲区和追加到事务日志;<br><img src="/imgs/ES/%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E4%B8%8D%E6%96%AD%E7%A7%AF%E7%B4%AF%E6%96%87%E6%A1%A3.png" alt="事务日志不断积累文档" title="事务日志不断积累文档"></li>
<li>每隔一段时间,索引会被刷新(Flush),一个新的 translog 被创建,并且一个全量提交被执行。<ul>
<li>所有在内存缓冲区的文档都被写入一个新的段。</li>
<li>缓冲区被清空。</li>
<li>一个提交点被写入硬盘。</li>
<li>文件系统缓存通过 fsync 被刷新(flush)。</li>
<li>老的 translog 被删除。<br>translog 提供所有还没有被刷到磁盘的操作的一个持久化记录。当 Elasticsearch 启动的时候, 它会从磁盘中使用最后一个提交点去恢复已知的段,并且会重放 translog 中所有在最后一次提交后发生的变更操作。<br>translog 也被用来提供实时 CRUD 。当你试着通过 ID 查询、更新、删除一个文档,它会在尝试从相应的段中检索之前, 首先检查 translog 中任何最近的变更。这意味着它总是能够实时地获取到文档的最新版本。<br><img src="/imgs/ES/Flush%E4%B9%8B%E5%90%8E%E6%AE%B5%E8%A2%AB%E5%85%A8%E9%87%8F%E6%8F%90%E4%BA%A4%E5%B9%B6%E4%B8%94%E4%BA%8B%E5%8A%A1%E6%97%A5%E5%BF%97%E8%A2%AB%E6%B8%85%E7%A9%BA.png" alt="Flush之后段被全量提交并且事务日志被清空" title="Flush之后段被全量提交并且事务日志被清空"></li>
</ul>
</li>
</ol>
<h3 id="手动执行事务日志刷新"><a href="#手动执行事务日志刷新" class="headerlink" title="手动执行事务日志刷新"></a>手动执行事务日志刷新</h3><p>分片每 30 分钟被自动刷新(flush),或者在 translog 太大的时候也会刷新。</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">POST /_flush</span><br><span class="line">POST /my_index/_flush</span><br><span class="line"># Flush所有的索引并且等待所有刷新在返回前完成。 </span><br><span class="line">POST /_flush?wait_for_ongoing</span><br><span class="line"># 在事务日志刷新之后,调用Searcher刷新操作,打开一个新的Searcher实例</span><br><span class="line">POST /my_index/_refresh</span><br></pre></td></tr></table></figure>
<p>一般不需要自己手动执行<code>Flush</code>操作,自动刷新就足够了。不过在重启节点或关闭索引之前,最好手动执行一次<code>Flush</code>。<br>当 Elasticsearch 尝试恢复或重新打开一个索引, 它需要重放 translog 中所有的操作,因此日志越短,恢复越快。</p>
<h3 id="异步-fsync"><a href="#异步-fsync" class="headerlink" title="异步 fsync"></a>异步 fsync</h3><p>默认 translog 是每 5 秒被 fsync 刷新到硬盘,或者在每次写请求(index, delete, update, bulk)完成之后执行。这个过程在主分片和复制分片都会发生,这意味着在整个请求被 fsync 到主分片和复制分片的 translog 之前,客户端不会得到一个 200 OK 响应。<br>对于一些大容量的偶尔丢失几秒数据问题也不严重的集群,使用异步的 fsync 相对来说更好,比如,写入的数据被缓存到内存中,再每 5 秒执行一次 fsync ,可以使用如下命令配置:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">PUT /my_index/_settings</span><br><span class="line">{</span><br><span class="line"> "index.translog.durability": "async",</span><br><span class="line"> "index.translog.sync_interval": "5s"</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>当然,如果不确定丢失几秒数据的后果能否接受,最好还是使用默认的参数:<code>"index.translog.durability": "request"</code>。</p>
<h3 id="配置"><a href="#配置" class="headerlink" title="配置"></a>配置</h3><p>以下参数既可以通过修改 elasticsearch.yml 文件来配置,也可以通过索引配置更新 API 来更改。</p>
<ul>
<li>index.translog.flush_threshold_period:该参数的默认值为 30 分钟,它控制了强制自动事务日志刷新的时间间隔,即便是没有新数据写入。强制进行事务日志刷新通常会导致大量的 I/O 操作,因此当事务日志涉及少量数据时,才更适合进行这项操作。</li>
<li>index.translog.flush_threshold_ops:该参数确定了一个最大操作数,即在上次事务日志刷新以后,当索引更改操作次数超过该参数值时,强制进行事务日志刷新操作,默认值为 5000。</li>
<li>index.translog.flush_threshold_size:该参数确定了事务日志的最大容量,当容量大于等于该参数值,就强制进行事务日志刷新操作,默认值为 200MB(较新版本的 Elasticsearch 中默认值为 512MB)。</li>
<li>index.translog.disable_flush:禁用事务日志刷新。尽管默认情况下事务日志刷新是可用的,但对它临时性地禁用能带来其他方面的便利。例如,向索引中导入大量文档的时候。</li>
</ul>
<p>或者调用 API 动态修改配置:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">PUT /my_index/_settings</span><br><span class="line">{</span><br><span class="line"> "index": {</span><br><span class="line"> "translog.disable_flush": true</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>前述命令在向索引导入大量数据之前执行,可以大幅提高索引的速度。但是请记住,当数据导入完毕之后,要重新设置事务日志刷新相关参数(例如将 translog.disable_flush 恢复为 false)。</p>
<h2 id="Flush-Commit"><a href="#Flush-Commit" class="headerlink" title="Flush - Commit"></a>Flush - Commit</h2><p><img src="/imgs/ES/%E6%8F%90%E4%BA%A4%E5%90%8E%E7%94%9F%E6%88%90%E6%96%B0%E6%AE%B5%E4%B8%94%E7%BC%93%E5%AD%98%E8%A2%AB%E6%B8%85%E7%A9%BA.png" alt="提交后生成新段且缓存被清空" title="提交后生成新段且缓存被清空"><br>Flush是ES的持久化操作,对应Lucene的Commit操作,流程如下:</p>
<ol>
<li>调用Refresh,此时清空Index Buffer,将文档保存到缓存的Segments中</li>
<li>调用fsync,将缓存中Segments写入磁盘</li>
<li>清空(删除)Transaction Log</li>
</ol>
<p>Flush是一个非常重的操作,需要将文档数据刷新到磁盘,因此其执行间隔也是非常的长:</p>
<ul>
<li>Flush操作默认30分钟调用一次;</li>
<li>Transaction Log满(默认512MB)时强制执行一次。</li>
</ul>
<h2 id="Merge-段合并"><a href="#Merge-段合并" class="headerlink" title="Merge - 段合并"></a>Merge - 段合并</h2><p>由于自动刷新流程每秒会创建一个新的段,这样会导致短时间内的段数量暴增,而段数目太多会带来较大的麻烦:</p>
<ul>
<li>每一个段都会消耗文件句柄、内存和 cpu 运行周期;</li>
<li>更重要的是,每个搜索请求都必须轮流检查每个段,所以段越多,搜索也就越慢。</li>
<li>段中已经被删除的文档占用了大量空间,需要清除</li>
</ul>
<p>Elasticsearch 通过在后台进行段合并来解决这个问题:</p>
<ul>
<li>小的段被合并到大的段,然后这些大的段再被合并到更大的段。</li>
<li>段合并的时候会将那些旧的已删除文档从文件系统中清除。被删除的文档(或被更新文档的旧版本)不会被拷贝到新的大段中。</li>
</ul>
<p>Merge的触发方式有2种:</p>
<ul>
<li>ES和Lucene会自动进行Merge操作</li>
<li>手动执行<code>POST my_index/_forcemerge</code></li>
</ul>
<h3 id="段合并流程"><a href="#段合并流程" class="headerlink" title="段合并流程"></a>段合并流程</h3><p>进行索引和搜索时会自动进行段合并:</p>
<ol>
<li>当索引的时候,刷新(refresh)操作会创建新的段并将段打开以供搜索使用。</li>
<li>合并进程选择一小部分大小相似的段,并且在后台将它们合并到更大的段中。这并不会中断索引和搜索。<br><img src="/imgs/ES/%E4%B8%A4%E4%B8%AA%E6%8F%90%E4%BA%A4%E4%BA%86%E7%9A%84%E6%AE%B5%E5%92%8C%E4%B8%80%E4%B8%AA%E6%9C%AA%E6%8F%90%E4%BA%A4%E7%9A%84%E6%AE%B5%E8%A2%AB%E5%90%88%E5%B9%B6%E5%88%B0%E4%B8%80%E4%B8%AA%E6%9B%B4%E5%A4%A7%E7%9A%84%E6%AE%B5.png" alt="两个提交了的段和一个未提交的段被合并到一个更大的段" title="两个提交了的段和一个未提交的段被合并到一个更大的段"></li>
<li>合并完成后:<ul>
<li>新的段被刷新(flush)到了磁盘。写入一个包含新段且排除旧的和较小的段的新提交点。</li>
<li>新的段被打开用来搜索。</li>
<li>老的段被删除。<br><img src="/imgs/ES/%E5%90%88%E5%B9%B6%E7%BB%93%E6%9D%9F%E5%90%8E%E8%80%81%E7%9A%84%E6%AE%B5%E8%A2%AB%E5%88%A0%E9%99%A4.png" alt="合并结束后老的段被删除" title="合并结束后老的段被删除"></li>
</ul>
</li>
</ol>
<h3 id="optimize-API"><a href="#optimize-API" class="headerlink" title="optimize API"></a>optimize API</h3><p>optimize API 用于手动触发段合并(在较新版本的 Elasticsearch 中,该 API 已更名为 force merge,即上文提到的 <code>POST my_index/_forcemerge</code>)。<br>它将一个分片强制合并到 max_num_segments 参数指定的段数目。这样做的意图是减少段的数量(通常减少到一个),来提升搜索性能。<br>optimize API 不应该被用在一个活跃的索引上,Elasticsearch 后台会自动触发合并。<br>在特定情况下,使用 optimize API 颇有益处。例如在日志这种用例下,每天、每周、每月的日志被存储在一个索引中。老的索引实质上是只读的,它们也并不太可能会发生变化,将历史段合并成一个单独的段就很有用了。</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"># 合并索引中的每个分片为一个单独的段 </span><br><span class="line">POST /logstash-2014-10/_optimize?max_num_segments=1</span><br></pre></td></tr></table></figure>
<blockquote>
<p>使用 optimize API 触发段合并的操作不会受到任何资源上的限制。这可能会消耗掉你节点上全部的 I/O 资源, 使其没有余裕来处理搜索请求,从而有可能使集群失去响应。 如果你想要对索引执行 <code>optimize</code>,你需要先使用分片分配把索引移到一个安全的节点,再执行。</p>
</blockquote>
<h2 id="删除和更新索引"><a href="#删除和更新索引" class="headerlink" title="删除和更新索引"></a>删除和更新索引</h2><p>段是不可改变的,所以既不能把文档从旧的段中移除,也不能修改旧的段来反映文档的更新。取而代之的是,每个提交点会包含一个 <code>.del</code> 文件,文件中会列出这些被删除文档的段信息。<br>当一个文档被 <strong>删除</strong> 时,它实际上只是在 .del 文件中被<strong>标记删除</strong>。一个被标记删除的文档仍然可以被查询匹配到,但它会在最终结果被返回前从结果集中移除。<br>文档<strong>更新</strong>也是类似的操作方式:当一个文档被更新时,旧版本文档被标记删除,文档的新版本被索引到一个新的段中。可能两个版本的文档都会被一个查询匹配到,但被删除的那个旧版本文档在结果集返回前就已经被移除。</p>
<h2 id="重建索引"><a href="#重建索引" class="headerlink" title="重建索引"></a>重建索引</h2><p>重建索引一般是索引的元数据发生变更了,但是文档还没更新,此时需要重建索引,让新数据结构可以被搜索到。</p>
<p>需要重建索引的情况:</p>
<ul>
<li>索引的Mappings发生变更,字段类型更改,分词器及字典更新</li>
<li>索引的Settings发生变更:索引的主分片数发生改变</li>
<li>集群内、集群间需要做数据迁移</li>
</ul>
<p>ES内部有2种方法重建索引:</p>
<ol>
<li>Update By Query,在现有索引上重建</li>
<li>Reindex,在其他索引上重建索引</li>
</ol>
<h3 id="Update-By-Query"><a href="#Update-By-Query" class="headerlink" title="Update By Query"></a>Update By Query</h3><figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br></pre></td><td class="code"><pre><span class="line">PUT blogs/doc_update/1</span><br><span class="line">{</span><br><span class="line"> "content":"Hadoop is cool",</span><br><span class="line"> "keyword":"hadoop"</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"># 修改 Mapping,增加子字段,使用英文分词器</span><br><span class="line">PUT blogs/_mapping/doc_update</span><br><span class="line">{</span><br><span class="line"> "properties" : {</span><br><span class="line"> "content" : {</span><br><span class="line"> "type" : "text",</span><br><span class="line"> "fields" : {</span><br><span class="line"> "english" : {</span><br><span class="line"> "type" : "text",</span><br><span class="line"> "analyzer":"english"</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span 
class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line">POST blogs/_search</span><br><span class="line">{</span><br><span class="line"> "query": {</span><br><span class="line"> "match": {</span><br><span class="line"> "content.english": "Hadoop"</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<ul>
<li>改变Mapping,增加子字段,使用英文分词器</li>
<li>虽然数据存在,无法查到结果</li>
<li>此时重新插入一条数据,是可以被查到的</li>
</ul>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line"># Update所有文档</span><br><span class="line">POST blogs/_update_by_query</span><br><span class="line">{</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"># 查询之前写入的文档</span><br><span class="line">POST blogs/_search</span><br><span class="line">{</span><br><span class="line"> "query": {</span><br><span class="line"> "match": {</span><br><span class="line"> "content.english": "Hadoop"</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<h3 id="Reindex"><a href="#Reindex" class="headerlink" title="Reindex"></a>Reindex</h3><p>Reindex将老索引数据重建到新索引</p>
<ul>
<li><strong>新索引是可以新增、修改字段声明的</strong>,而ES本身是不支持对mapping中的字段进行修改的,这也是Reindex的主要意义</li>
<li>Reindex要求_source字段是enabled的</li>
<li>重建索引后,可以通过<strong>Index Alias</strong>在不停机的情况下取代原来的索引</li>
</ul>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre></td><td class="code"><pre><span class="line"># 创建新的索引并且设定新的Mapping</span><br><span class="line">PUT blogs_fix</span><br><span class="line">{</span><br><span class="line"> "mappings": {</span><br><span class="line"> "doc": {</span><br><span class="line"> "properties" : {</span><br><span class="line"> "content" : {</span><br><span class="line"> "type" : "text",</span><br><span class="line"> "fields" : {</span><br><span class="line"> "english" : {</span><br><span class="line"> "type" : "text",</span><br><span class="line"> "analyzer" : "english"</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> },</span><br><span class="line"> "a": {</span><br><span class="line"> "type": "keyword"</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span 
class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"># 利用Reindex API,将老索引的数据重建到新索引</span><br><span class="line">POST _reindex</span><br><span class="line">{</span><br><span class="line"> "source": {</span><br><span class="line"> "index": "blogs"</span><br><span class="line"> },</span><br><span class="line"> "dest": {</span><br><span class="line"> "index": "blogs_fix"</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line">GET blogs_fix/doc_update/1</span><br></pre></td></tr></table></figure>
<ul>
<li>上面的Reindex将老的blogs索引下的文档重建到新的blogs_fix</li>
</ul>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br></pre></td><td class="code"><pre><span class="line"># 重建索引时使用内部版本号</span><br><span class="line">POST _reindex</span><br><span class="line">{</span><br><span class="line"> "source": {</span><br><span class="line"> "index": "blogs"</span><br><span class="line"> },</span><br><span class="line"> "dest": {</span><br><span class="line"> "index": "blogs_fix",</span><br><span class="line"> "version_type": "internal"</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"># 重建索引时使用外部版本号</span><br><span class="line">POST _reindex</span><br><span class="line">{</span><br><span class="line"> "source": {</span><br><span class="line"> "index": "blogs"</span><br><span class="line"> },</span><br><span class="line"> "dest": 
{</span><br><span class="line"> "index": "blogs_fix",</span><br><span class="line"> "version_type": "external"</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br><span class="line"># 只创建不存在的文档,文档已存在的情况下,会导致版本冲突</span><br><span class="line">POST _reindex</span><br><span class="line">{</span><br><span class="line"> "source": {</span><br><span class="line"> "index": "blogs"</span><br><span class="line"> },</span><br><span class="line"> "dest": {</span><br><span class="line"> "index": "blogs_fix",</span><br><span class="line"> "op_type": "create"</span><br><span class="line"> }</span><br><span class="line">}</span><br><span class="line"></span><br></pre></td></tr></table></figure>
<p>查看Reindex执行进度:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">GET _tasks?detailed=true&actions=*reindex</span><br></pre></td></tr></table></figure>
<p>异步操作,执行只返回Task ID:</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">POST _reindex?wait_for_completion=false</span><br></pre></td></tr></table></figure>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/[email protected]/dist/kity.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/[email protected]/dist/kityminder.core.min.js"></script><script defer="true" type="text/javascript" src="https://cdn.jsdelivr.net/npm/[email protected]/dist/mindmap.min.js"></script><link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/mindmap.min.css">
</div>
<footer class="post-footer">
<div class="post-tags">
<a href="/tags/ElasticSearch/" rel="tag"># ElasticSearch</a>
</div>
<div class="post-nav">
<div class="post-nav-item">
<a href="/4ea5471b.html" rel="prev" title="ZooKeeper 的使用">
<i class="fa fa-angle-left"></i> ZooKeeper 的使用
</a>
</div>
<div class="post-nav-item">
<a href="/c395b48b.html" rel="next" title="ES1_1使用ES">
ES1_1使用ES <i class="fa fa-angle-right"></i>
</a>
</div>
</div>
</footer>
</article>
</div>
</div>
</main>
<footer class="footer">
<div class="footer-inner">
<div class="copyright">
©
<span itemprop="copyrightYear">2025</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">tallate</span>
</div>
<div class="powered-by">由 <a href="https://hexo.io/" rel="noopener" target="_blank">Hexo</a> & <a href="https://theme-next.js.org/" rel="noopener" target="_blank">NexT.Gemini</a> 强力驱动
</div>
</div>
</footer>
<div class="back-to-top" role="button" aria-label="返回顶部">
<i class="fa fa-arrow-up fa-lg"></i>
<span>0%</span>
</div>
<a href="https://github.com/tallate" class="github-corner" title="在 GitHub 上关注我" aria-label="在 GitHub 上关注我" rel="noopener" target="_blank"><svg width="80" height="80" viewBox="0 0 250 250" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a>
<noscript>
<div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>
<script src="https://cdnjs.cloudflare.com/ajax/libs/animejs/3.2.1/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
<script src="/js/comments.js"></script><script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/next-boot.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/hexo-generator-searchdb/1.4.1/search.js" integrity="sha256-1kfA5uHPf65M5cphT2dvymhkuyHPQp5A53EGZOnOLmc=" crossorigin="anonymous"></script>
<script src="/js/third-party/search/local-search.js"></script>
<script class="next-config" data-name="mermaid" type="application/json">{"enable":true,"version":"7.1.2","options":null,"js":{"url":"https://cdnjs.cloudflare.com/ajax/libs/mermaid/10.3.0/mermaid.min.js","integrity":"sha256-9y71g5Lz/KLsHjB8uXwnkuWDtAMDSzD/HdIbqhJfTAI="}}</script>
<script src="/js/third-party/tags/mermaid.js"></script>
</body>
</html>