Skip to content

Commit e4f77ce

Browse files
author
liuminjian
committed
Feature: When the cluster capacity is almost full, make the cluster read only
Signed-off-by: liuminjian <[email protected]>
1 parent b184f47 commit e4f77ce

31 files changed

+359
-103
lines changed

conf/chunkserver.conf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ copyset.sync_chunk_limits=2097152
128128
copyset.sync_threshold=65536
129129
# check syncing interval
130130
copyset.check_syncing_interval_ms=500
131+
# interval (in ms) to wait before retrying when disk space is insufficient
132+
copyset.wait_for_disk_freed_interval_ms=60000
131133

132134
#
133135
# Clone settings
@@ -215,6 +217,8 @@ chunkfilepool.allocate_percent=80
215217
chunkfilepool.chunk_file_pool_size=1GB
216218
# The thread num for format chunks
217219
chunkfilepool.thread_num=1
220+
# When the chunkserver disk usage exceeds this percentage, the heartbeat sets the disk state
221+
chunkfilepool.diskUsagePercentLimit=95
218222

219223
#
220224
# WAL file pool

proto/chunk.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ enum CHUNK_OP_STATUS {
8585
CHUNK_OP_STATUS_BACKWARD = 10; // 请求的版本落后当前chunk的版本
8686
CHUNK_OP_STATUS_CHUNK_EXIST = 11; // chunk已存在
8787
CHUNK_OP_STATUS_EPOCH_TOO_OLD = 12; // request epoch too old
88+
CHUNK_OP_STATUS_READONLY = 13; // If there is insufficient disk space, set the chunkserver to read-only
8889
};
8990

9091
message ChunkResponse {

proto/heartbeat.proto

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,13 @@ message CopysetStatistics {
7171
required uint32 writeIOPS = 4;
7272
}
7373

74+
enum ErrorType {
75+
NORMAL = 0;
76+
DISKFULL = 1;
77+
}
78+
7479
message DiskState {
75-
required uint32 errType = 1;
80+
required ErrorType errType = 1;
7681
required string errMsg = 2;
7782
}
7883

proto/topology.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ enum ChunkServerStatus {
4848
enum DiskState {
4949
DISKNORMAL = 0;
5050
DISKERROR = 1;
51+
DISKFULL = 2;
5152
}
5253

5354
enum OnlineState {

src/chunkserver/chunkserver.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,9 @@ void ChunkServer::InitCopysetNodeOptions(
710710
LOG_IF(FATAL, !conf->GetUInt32Value("copyset.sync_trigger_seconds",
711711
&copysetNodeOptions->syncTriggerSeconds));
712712
}
713+
LOG_IF(FATAL, !conf->GetUInt32Value(
714+
"copyset.wait_for_disk_freed_interval_ms",
715+
&copysetNodeOptions->waitForDiskFreedIntervalMs));
713716
}
714717

715718
void ChunkServer::InitCopyerOptions(
@@ -781,6 +784,8 @@ void ChunkServer::InitHeartbeatOptions(
781784
&heartbeatOptions->intervalSec));
782785
LOG_IF(FATAL, !conf->GetUInt32Value("mds.heartbeat_timeout",
783786
&heartbeatOptions->timeout));
787+
LOG_IF(FATAL, !conf->GetUInt32Value("chunkfilepool.diskUsagePercentLimit",
788+
&heartbeatOptions->chunkserverDiskLimit));
784789
}
785790

786791
void ChunkServer::InitRegisterOptions(

src/chunkserver/config_info.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ struct CopysetNodeOptions {
140140
uint64_t syncThreshold = 64 * 1024;
141141
// check syncing interval
142142
uint32_t checkSyncingIntervalMs = 500u;
143+
// interval (in ms) to wait before retrying when disk space is insufficient
144+
uint32_t waitForDiskFreedIntervalMs = 60000;
143145

144146
CopysetNodeOptions();
145147
};

src/chunkserver/copyset_node.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ CopysetNode::CopysetNode(const LogicPoolID &logicPoolId,
8585
lastScanSec_(0),
8686
enableOdsyncWhenOpenChunkFile_(false),
8787
isSyncing_(false),
88-
checkSyncingIntervalMs_(500) {
88+
checkSyncingIntervalMs_(500),
89+
readOnly_(false) {
8990
}
9091

9192
CopysetNode::~CopysetNode() {
@@ -135,6 +136,8 @@ int CopysetNode::Init(const CopysetNodeOptions &options) {
135136
dsOptions.locationLimit = options.locationLimit;
136137
dsOptions.enableOdsyncWhenOpenChunkFile =
137138
options.enableOdsyncWhenOpenChunkFile;
139+
dsOptions.waitForDiskFreedIntervalMs =
140+
options.waitForDiskFreedIntervalMs;
138141
dataStore_ = std::make_shared<CSDataStore>(options.localFileSystem,
139142
options.chunkFilePool,
140143
dsOptions);
@@ -345,6 +348,14 @@ void CopysetNode::WaitSnapshotDone() {
345348
}
346349
}
347350

351+
bool CopysetNode::ReadOnly() const {
352+
return readOnly_.load(std::memory_order_acquire);
353+
}
354+
355+
void CopysetNode::SetReadOnly(bool readOnly) {
356+
readOnly_.store(readOnly, std::memory_order_release);
357+
}
358+
348359
void CopysetNode::save_snapshot_background(::braft::SnapshotWriter *writer,
349360
::braft::Closure *done) {
350361
brpc::ClosureGuard doneGuard(done);

src/chunkserver/copyset_node.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,10 @@ class CopysetNode : public braft::StateMachine,
469469

470470
void WaitSnapshotDone();
471471

472+
bool ReadOnly() const;
473+
474+
void SetReadOnly(bool readOnly);
475+
472476
private:
473477
inline std::string GroupId() {
474478
return ToGroupId(logicPoolId_, copysetId_);
@@ -545,6 +549,8 @@ class CopysetNode : public braft::StateMachine,
545549
uint32_t checkSyncingIntervalMs_;
546550
// async snapshot future object
547551
std::future<void> snapshotFuture_;
552+
// copyset readonly flag
553+
std::atomic<bool> readOnly_;
548554
};
549555

550556
} // namespace chunkserver

src/chunkserver/datastore/chunkserver_chunkfile.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <fcntl.h>
2323
#include <algorithm>
2424
#include <memory>
25+
#include <errno.h>
2526

2627
#include "src/chunkserver/datastore/chunkserver_datastore.h"
2728
#include "src/chunkserver/datastore/chunkserver_chunkfile.h"
@@ -207,7 +208,7 @@ CSErrorCode CSChunkFile::Open(bool createFile) {
207208
if (rc != 0 && rc != -EEXIST) {
208209
LOG(ERROR) << "Error occured when create file."
209210
<< " filepath = " << chunkFilePath;
210-
return CSErrorCode::InternalError;
211+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError : CSErrorCode::InternalError;
211212
}
212213
}
213214
int rc = -1;
@@ -400,7 +401,7 @@ CSErrorCode CSChunkFile::Write(SequenceNum sn,
400401
<< "ChunkID: " << chunkId_
401402
<< ",request sn: " << sn
402403
<< ",chunk sn: " << metaPage_.sn;
403-
return CSErrorCode::InternalError;
404+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError : CSErrorCode::InternalError;
404405
}
405406
// If it is a clone chunk, the bitmap will be updated
406407
CSErrorCode errorCode = flush();
@@ -478,7 +479,7 @@ CSErrorCode CSChunkFile::Paste(const char * buf, off_t offset, size_t length) {
478479
<< "ChunkID: " << chunkId_
479480
<< ", offset: " << offset
480481
<< ", length: " << length;
481-
return CSErrorCode::InternalError;
482+
return rc == -ENOSPC ? CSErrorCode::NoSpaceError : CSErrorCode::InternalError;
482483
}
483484
}
484485

src/chunkserver/datastore/chunkserver_datastore.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ CSDataStore::CSDataStore(std::shared_ptr<LocalFileSystem> lfs,
4444
baseDir_(options.baseDir),
4545
chunkFilePool_(chunkFilePool),
4646
lfs_(lfs),
47-
enableOdsyncWhenOpenChunkFile_(options.enableOdsyncWhenOpenChunkFile) {
47+
enableOdsyncWhenOpenChunkFile_(options.enableOdsyncWhenOpenChunkFile),
48+
waitForDiskFreedIntervalMs_(options.waitForDiskFreedIntervalMs) {
4849
CHECK(!baseDir_.empty()) << "Create datastore failed";
4950
CHECK(lfs_ != nullptr) << "Create datastore failed";
5051
CHECK(chunkFilePool_ != nullptr) << "Create datastore failed";

0 commit comments

Comments
 (0)