Skip to content

Commit b9219d6

Browse files
author
liuminjian
committed
1. The heartbeat reports the disk-full error and the MDS sets the copyset availflag to false.
2. The copyset node leader is set to read-only when it receives copyset availflag false from the heartbeat. 3. If the disk becomes full while writing to the chunk file, the server returns a no-space error and the client hangs until space is freed up manually. Signed-off-by: liuminjian <[email protected]>
1 parent d1de1f7 commit b9219d6

31 files changed

+258
-901
lines changed

WORKSPACE

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ git_repository(
3737
commit = "d12de388c97998f5ccd5cb97ed0da728815ef438",
3838
patches = [
3939
"//:thirdparties/braft/0001-fix-change-set_error-to-set_errorv.patch",
40-
"//:thirdparties/braft/add-iterator-has_error.patch",
4140
],
4241
patch_args = [
4342
"-p1"

proto/chunk.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ enum CHUNK_OP_STATUS {
8686
CHUNK_OP_STATUS_CHUNK_EXIST = 11; // chunk已存在
8787
CHUNK_OP_STATUS_EPOCH_TOO_OLD = 12; // request epoch too old
8888
CHUNK_OP_STATUS_READONLY = 13; // copyset其他节点故障,设为只读
89+
CHUNK_OP_STATUS_ENOSPC = 14; // 空间不足错误
8990
};
9091

9192
message ChunkResponse {

proto/copyset.proto

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,4 @@ service CopysetService {
100100
rpc DeleteBrokenCopyset(CopysetRequest) returns (CopysetResponse);
101101

102102
rpc GetCopysetStatus (CopysetStatusRequest) returns (CopysetStatusResponse);
103-
104-
rpc DeleteBrokenCopysetNode (CopysetRequest2) returns (CopysetResponse2);
105103
};

proto/topology.proto

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -565,14 +565,6 @@ message ListUnAvailCopySetsResponse {
565565
repeated common.CopysetInfo copysets = 2;
566566
}
567567

568-
message DeleteBrokenCopysetInChunkServerRequest {
569-
required uint32 chunkServerID = 1;
570-
}
571-
572-
message DeleteBrokenCopysetInChunkServerResponse {
573-
required sint32 statusCode = 1;
574-
}
575-
576568
//TODO(hzsunjianliang): update userPolicy and so on
577569
service TopologyService {
578570
rpc RegistChunkServer(ChunkServerRegistRequest) returns (ChunkServerRegistResponse);
@@ -618,6 +610,4 @@ service TopologyService {
618610
rpc SetCopysetsAvailFlag(SetCopysetsAvailFlagRequest) returns (SetCopysetsAvailFlagResponse);
619611
rpc ListUnAvailCopySets(ListUnAvailCopySetsRequest) returns (ListUnAvailCopySetsResponse);
620612
rpc ListChunkFormatStatus(ListChunkFormatStatusRequest) returns (ListChunkFormatStatusResponse);
621-
rpc DeleteBrokenCopysetInChunkServer(DeleteBrokenCopysetInChunkServerRequest) returns (DeleteBrokenCopysetInChunkServerResponse);
622-
623613
}

src/chunkserver/copyset_node.cpp

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,6 @@ void CopysetNode::on_apply(::braft::Iterator &iter) {
293293
*/
294294
braft::Closure *closure = iter.done();
295295

296-
std::shared_ptr<IteratorWrapper> wrapperPtr = std::make_shared<IteratorWrapper>(&iter);
297296
if (nullptr != closure) {
298297
/**
299298
* 1.closure不是null,那么说明当前节点正常,直接从内存中拿到Op
@@ -306,7 +305,7 @@ void CopysetNode::on_apply(::braft::Iterator &iter) {
306305
std::shared_ptr<ChunkOpRequest>& opRequest = chunkClosure->request_;
307306
concurrentapply_->Push(opRequest->ChunkId(), ChunkOpRequest::Schedule(opRequest->OpType()), // NOLINT
308307
&ChunkOpRequest::OnApply, opRequest,
309-
iter.index(), doneGuard.release(), wrapperPtr);
308+
iter.index(), doneGuard.release());
310309
} else {
311310
// 获取log entry
312311
butil::IOBuf log = iter.data();
@@ -323,11 +322,9 @@ void CopysetNode::on_apply(::braft::Iterator &iter) {
323322
auto chunkId = request.chunkid();
324323
concurrentapply_->Push(chunkId, ChunkOpRequest::Schedule(request.optype()), // NOLINT
325324
&ChunkOpRequest::OnApplyFromLog, opReq,
326-
dataStore_, std::move(request), data, wrapperPtr);
325+
dataStore_, std::move(request), data);
327326
}
328327
}
329-
// 等待写操作完成,否则on_apply结束后,异步有写错误无法调用set_error_and_rollback()
330-
concurrentapply_->Flush();
331328
}
332329

333330
void CopysetNode::on_shutdown() {
@@ -556,7 +553,7 @@ void CopysetNode::on_leader_stop(const butil::Status &status) {
556553
}
557554

558555
void CopysetNode::on_error(const ::braft::Error &e) {
559-
LOG(ERROR) << "Copyset: " << GroupIdString()
556+
LOG(FATAL) << "Copyset: " << GroupIdString()
560557
<< ", peer id: " << peerId_.to_string()
561558
<< " meet raft error: " << e;
562559
}
@@ -1126,13 +1123,5 @@ SyncChunkThread::~SyncChunkThread() {
11261123
Stop();
11271124
}
11281125

1129-
void IteratorWrapper::set_error_and_rollback(size_t ntail, const butil::Status* st) {
1130-
iter_->set_error_and_rollback(ntail, st);
1131-
}
1132-
1133-
bool IteratorWrapper::has_error() const{
1134-
return iter_->has_error();
1135-
}
1136-
11371126
} // namespace chunkserver
11381127
} // namespace curve

src/chunkserver/copyset_node.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,18 +124,6 @@ class SyncChunkThread : public curve::common::Uncopyable {
124124
CopysetNode* node_;
125125
};
126126

127-
// 用于unitest mock braft::Iterator
128-
class IteratorWrapper {
129-
public:
130-
IteratorWrapper() {}
131-
IteratorWrapper(braft::Iterator *iter): iter_(iter) {}
132-
~IteratorWrapper() {}
133-
virtual void set_error_and_rollback(size_t ntail = 1, const butil::Status* st = NULL);
134-
virtual bool has_error() const;
135-
private:
136-
braft::Iterator *iter_;
137-
};
138-
139127
/**
140128
* 一个Copyset Node就是一个复制组的副本
141129
*/

src/chunkserver/copyset_service.cpp

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -232,45 +232,5 @@ void CopysetServiceImpl::GetCopysetStatus(RpcController *controller,
232232
request->copysetid());
233233
}
234234

235-
void CopysetServiceImpl::DeleteBrokenCopysetNode(RpcController *controller,
236-
const CopysetRequest2 *request,
237-
CopysetResponse2 *response,
238-
Closure *done) {
239-
(void)controller;
240-
brpc::ClosureGuard doneGuard(done);
241-
242-
Copyset copyset;
243-
244-
LOG(INFO) << "Received DeleteBrokenCopysetNode request";
245-
246-
for (int i = 0; i < request->copysets_size(); ++i) {
247-
copyset = request->copysets(i);
248-
249-
// 判断copyset是否存在
250-
auto nodePtr = copysetNodeManager_->GetCopysetNode(copyset.logicpoolid(),
251-
copyset.copysetid());
252-
if (nullptr == nodePtr) {
253-
continue;
254-
}
255-
256-
NodeStatus status;
257-
nodePtr->GetStatus(&status);
258-
// 只删除状态有问题的copyset node
259-
if (status.state != braft::State::STATE_ERROR) {
260-
continue;
261-
}
262-
263-
copysetNodeManager_->DeleteCopysetNode(copyset.logicpoolid(), copyset.copysetid());
264-
265-
LOG(INFO) << "Delete copyset node"
266-
<< ToGroupIdString(copyset.logicpoolid(),
267-
copyset.copysetid())
268-
<< " success.";
269-
}
270-
271-
response->set_status(COPYSET_OP_STATUS::COPYSET_OP_STATUS_SUCCESS);
272-
LOG(INFO) << "DeleteBrokenCopysetNode " << request->copysets().size() << " copysets success";
273-
}
274-
275235
} // namespace chunkserver
276236
} // namespace curve

src/chunkserver/copyset_service.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,6 @@ class CopysetServiceImpl : public CopysetService {
7171
CopysetStatusResponse *response,
7272
Closure *done);
7373

74-
/**
75-
* 删除状态ERROR的copyset node
76-
*/
77-
void DeleteBrokenCopysetNode(RpcController *controller,
78-
const CopysetRequest2 *request,
79-
CopysetResponse2 *response,
80-
Closure *done);
81-
8274
private:
8375
// 复制组管理者
8476
CopysetNodeManager* copysetNodeManager_;

src/chunkserver/datastore/chunkserver_chunkfile.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <fcntl.h>
2323
#include <algorithm>
2424
#include <memory>
25+
#include <errno.h>
2526

2627
#include "src/chunkserver/datastore/chunkserver_datastore.h"
2728
#include "src/chunkserver/datastore/chunkserver_chunkfile.h"
@@ -400,6 +401,9 @@ CSErrorCode CSChunkFile::Write(SequenceNum sn,
400401
<< "ChunkID: " << chunkId_
401402
<< ",request sn: " << sn
402403
<< ",chunk sn: " << metaPage_.sn;
404+
if (rc == -ENOSPC) {
405+
return CSErrorCode::NoSpaceError;
406+
}
403407
return CSErrorCode::InternalError;
404408
}
405409
// If it is a clone chunk, the bitmap will be updated
@@ -478,6 +482,9 @@ CSErrorCode CSChunkFile::Paste(const char * buf, off_t offset, size_t length) {
478482
<< "ChunkID: " << chunkId_
479483
<< ", offset: " << offset
480484
<< ", length: " << length;
485+
if (rc == -ENOSPC) {
486+
return CSErrorCode::NoSpaceError;
487+
}
481488
return CSErrorCode::InternalError;
482489
}
483490
}

src/chunkserver/datastore/chunkserver_snapshot.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
*/
2222

2323
#include <memory>
24+
#include <errno.h>
2425
#include "src/chunkserver/datastore/chunkserver_datastore.h"
2526
#include "src/chunkserver/datastore/chunkserver_snapshot.h"
2627

@@ -216,6 +217,9 @@ CSErrorCode CSSnapshot::Write(const char * buf, off_t offset, size_t length) {
216217
LOG(ERROR) << "Write snapshot failed."
217218
<< "ChunkID: " << chunkId_
218219
<< ",snapshot sn: " << metaPage_.sn;
220+
if (rc == -ENOSPC) {
221+
return CSErrorCode::NoSpaceError;
222+
}
219223
return CSErrorCode::InternalError;
220224
}
221225
uint32_t pageBeginIndex = offset / blockSize_;

0 commit comments

Comments
 (0)