Skip to content
This repository was archived by the owner on Dec 20, 2024. It is now read-only.

Commit 4053d7a

Browse files
committed
add peer label
Remove some labels that could cause cardinality problems. Signed-off-by: yeya24 <[email protected]>
1 parent aba48ed commit 4053d7a

File tree

16 files changed

+72
-72
lines changed

16 files changed

+72
-72
lines changed

dfdaemon/constant/constant.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ const (
4646
)
4747

4848
const (
49-
// Namespace is the prefix of the metricsutils' name of dragonfly
49+
// Namespace is the namespace used as the prefix of dragonfly's metric names
5050
Namespace = "dragonfly"
51-
// Subsystem represents metricsutils for dfdaemon
51+
// Subsystem represents metrics for dfdaemon
5252
Subsystem = "dfdaemon"
5353
)

dfdaemon/handler/root_handler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,6 @@ func New() *http.ServeMux {
3232
s.HandleFunc("/args", getArgs)
3333
s.HandleFunc("/env", getEnv)
3434
s.HandleFunc("/debug/version", version.Handler)
35-
s.HandleFunc("/metricsutils", promhttp.Handler().ServeHTTP)
35+
s.HandleFunc("/metrics", promhttp.Handler().ServeHTTP)
3636
return s
3737
}

docs/user_guide/metrics.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ This doc contains all the metrics that Dragonfly components currently support. N
99
- dragonfly_supernode_http_request_duration_seconds{code, handler, method} - http request latency in seconds
1010
- dragonfly_supernode_http_request_size_bytes{code, handler, method} - http request size in bytes
1111
- dragonfly_supernode_http_response_size_bytes{code, handler, method} - http response size in bytes
12-
- dragonfly_supernode_peers{hostname} - dragonfly peers
13-
- dragonfly_supernode_tasks{taskid, cdnstatus} - dragonfly tasks
14-
- dragonfly_supernode_dfgettasks{taskid, callsystem} - dragonfly dfget tasks
15-
- dragonfly_supernode_schedule_duration_milliseconds{taskid} - duration for task scheduling in milliseconds
12+
- dragonfly_supernode_peers{peer} - dragonfly peers; the peer label consists of the hostname and IP address of a peer, joined by a hyphen.
13+
- dragonfly_supernode_tasks{cdnstatus} - dragonfly tasks
14+
- dragonfly_supernode_dfgettasks{callsystem} - dragonfly dfget tasks
15+
- dragonfly_supernode_schedule_duration_milliseconds{peer} - duration for task scheduling in milliseconds
1616
- dragonfly_supernode_trigger_cdn_total{} - total times of triggering cdn.
1717
- dragonfly_supernode_trigger_cdn_failed_total{} - total failed times of triggering cdn.
1818

supernode/config/constants.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ const (
7070
)
7171

7272
const (
73-
// SubsystemSupernode represents metricsutils from supernode
73+
// SubsystemSupernode represents metrics from supernode
7474
SubsystemSupernode = "supernode"
75-
// SubsystemDfget represents metricsutils from dfget
75+
// SubsystemDfget represents metrics from dfget
7676
SubsystemDfget = "dfget"
7777
)

supernode/daemon/mgr/dfgettask/manager.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ type metrics struct {
4242
func newMetrics(register prometheus.Registerer) *metrics {
4343
return &metrics{
4444
dfgetTasks: metricsutils.NewGauge(config.SubsystemSupernode, "dfgettasks",
45-
"The number of dfget tasks", []string{"taskid", "callsystem"}, register),
45+
"The number of dfget tasks", []string{"callsystem"}, register),
4646
}
4747
}
4848

@@ -89,7 +89,7 @@ func (dtm *Manager) Add(ctx context.Context, dfgetTask *types.DfGetTask) error {
8989

9090
dtm.ptoc.Add(generatePeerKey(dfgetTask.PeerID, dfgetTask.TaskID), dfgetTask.CID)
9191
dtm.dfgetTaskStore.Put(key, dfgetTask)
92-
dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.TaskID, dfgetTask.CallSystem).Inc()
92+
dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem).Inc()
9393

9494
return nil
9595
}
@@ -121,7 +121,7 @@ func (dtm *Manager) Delete(ctx context.Context, clientID, taskID string) error {
121121
return err
122122
}
123123
dtm.ptoc.Delete(generatePeerKey(dfgetTask.PeerID, dfgetTask.TaskID))
124-
dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.TaskID, dfgetTask.CallSystem).Dec()
124+
dtm.metrics.dfgetTasks.WithLabelValues(dfgetTask.CallSystem).Dec()
125125
return dtm.dfgetTaskStore.Delete(key)
126126
}
127127

supernode/daemon/mgr/dfgettask/manager_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func (s *DfgetTaskMgrTestSuite) TestDfgetTaskAdd(c *check.C) {
9696
c.Check(err, check.IsNil)
9797
c.Assert(1, check.Equals,
9898
int(prom_testutil.ToFloat64(
99-
dfgetTasks.WithLabelValues(tc.dfgetTask.TaskID, tc.dfgetTask.CallSystem))))
99+
dfgetTasks.WithLabelValues(tc.dfgetTask.CallSystem))))
100100
dt, err := manager.Get(context.Background(), tc.dfgetTask.CID, tc.dfgetTask.TaskID)
101101
c.Check(err, check.IsNil)
102102
c.Check(dt, check.DeepEquals, tc.Expect)
@@ -207,7 +207,7 @@ func (s *DfgetTaskMgrTestSuite) TestDfgetTaskDelete(c *check.C) {
207207
c.Check(err, check.IsNil)
208208
c.Assert(0, check.Equals,
209209
int(prom_testutil.ToFloat64(
210-
dfgetTasks.WithLabelValues(tc.dfgetTask.TaskID, tc.dfgetTask.CallSystem))))
210+
dfgetTasks.WithLabelValues(tc.dfgetTask.CallSystem))))
211211

212212
_, err = manager.Get(context.Background(), tc.dfgetTask.CID, tc.dfgetTask.TaskID)
213213
c.Check(errortypes.IsDataNotFound(err), check.Equals, true)

supernode/daemon/mgr/peer/manager.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ type metrics struct {
4444
func newMetrics(register prometheus.Registerer) *metrics {
4545
return &metrics{
4646
peers: metricsutils.NewGauge(config.SubsystemSupernode, "peers",
47-
"The number of supernode peers", []string{"hostname"}, register),
47+
"The number of supernode peers", []string{"peer"}, register),
4848
}
4949
}
5050

@@ -83,7 +83,7 @@ func (pm *Manager) Register(ctx context.Context, peerCreateRequest *types.PeerCr
8383
Created: strfmt.DateTime(time.Now()),
8484
}
8585
pm.peerStore.Put(id, peerInfo)
86-
pm.metrics.peers.WithLabelValues(peerInfo.HostName.String()).Inc()
86+
pm.metrics.peers.WithLabelValues(GeneratePeerName(peerInfo)).Inc()
8787

8888
return &types.PeerCreateResponse{
8989
ID: id,
@@ -98,7 +98,7 @@ func (pm *Manager) DeRegister(ctx context.Context, peerID string) error {
9898
}
9999

100100
pm.peerStore.Delete(peerID)
101-
pm.metrics.peers.WithLabelValues(peerInfo.HostName.String()).Dec()
101+
pm.metrics.peers.WithLabelValues(GeneratePeerName(peerInfo)).Dec()
102102
return nil
103103
}
104104

@@ -200,3 +200,8 @@ func getLessFunc(listResult []interface{}, desc bool) (less func(i, j int) bool)
200200
func generatePeerID(peerInfo *types.PeerCreateRequest) string {
201201
return fmt.Sprintf("%s-%s-%d", peerInfo.HostName.String(), peerInfo.IP.String(), time.Now().UnixNano())
202202
}
203+
204+
// GeneratePeerName builds a peer name of the form "<hostname>-<ip>" from peerInfo.
205+
func GeneratePeerName(info *types.PeerInfo) string {
206+
return info.HostName.String() + "-" + info.IP.String()
207+
}

supernode/daemon/mgr/peer/manager_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ func (s *PeerMgrTestSuite) TestPeerMgr(c *check.C) {
5656
c.Check(err, check.IsNil)
5757

5858
c.Assert(1, check.Equals,
59-
int(prom_testutil.ToFloat64(peers.WithLabelValues("foo"))))
59+
int(prom_testutil.ToFloat64(peers.WithLabelValues("foo-192.168.10.11"))))
6060

6161
// get
6262
id := resp.ID
@@ -82,7 +82,7 @@ func (s *PeerMgrTestSuite) TestPeerMgr(c *check.C) {
8282
c.Check(err, check.IsNil)
8383

8484
c.Assert(0, check.Equals,
85-
int(prom_testutil.ToFloat64(peers.WithLabelValues("foo"))))
85+
int(prom_testutil.ToFloat64(peers.WithLabelValues("foo-192.168.10.11"))))
8686

8787
// get
8888
info, err = manager.Get(context.Background(), id)

supernode/daemon/mgr/task/manager.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ type metrics struct {
5353
func newMetrics(register prometheus.Registerer) *metrics {
5454
return &metrics{
5555
tasks: metricsutils.NewGauge(config.SubsystemSupernode, "tasks",
56-
"The status of Supernode tasks", []string{"taskid", "cdnstatus"}, register),
56+
"The status of Supernode tasks", []string{"cdnstatus"}, register),
5757

5858
triggerCdnCount: metricsutils.NewCounter(config.SubsystemSupernode, "trigger_cdn_total",
5959
"The number of triggering cdn", []string{}, register),
@@ -62,8 +62,8 @@ func newMetrics(register prometheus.Registerer) *metrics {
6262
"The number of triggering cdn failure", []string{}, register),
6363

6464
scheduleDurationMilliSeconds: metricsutils.NewHistogram(config.SubsystemSupernode, "schedule_duration_milliseconds",
65-
"duration for task scheduling in milliseconds", []string{"taskid"},
66-
prometheus.ExponentialBuckets(0.02, 2, 7), register),
65+
"Duration for task scheduling in milliseconds", []string{"peer"},
66+
prometheus.ExponentialBuckets(0.02, 2, 6), register),
6767
}
6868
}
6969

supernode/daemon/mgr/task/manager_util.go

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"github.com/dragonflyoss/Dragonfly/pkg/timeutils"
3131
"github.com/dragonflyoss/Dragonfly/supernode/config"
3232
"github.com/dragonflyoss/Dragonfly/supernode/daemon/mgr"
33+
"github.com/dragonflyoss/Dragonfly/supernode/daemon/mgr/peer"
3334
"github.com/dragonflyoss/Dragonfly/supernode/util"
3435

3536
"github.com/pkg/errors"
@@ -110,7 +111,7 @@ func (tm *Manager) addOrUpdateTask(ctx context.Context, req *types.TaskCreateReq
110111
task.PieceTotal = int32((fileLength + (int64(pieceSize) - 1)) / int64(pieceSize))
111112

112113
tm.taskStore.Put(taskID, task)
113-
tm.metrics.tasks.WithLabelValues(taskID, task.CdnStatus).Inc()
114+
tm.metrics.tasks.WithLabelValues(task.CdnStatus).Inc()
114115
return task, nil
115116
}
116117

@@ -162,8 +163,8 @@ func (tm *Manager) updateTask(taskID string, updateTaskInfo *types.TaskInfo) err
162163

163164
// only update the task CdnStatus when the new CDNStatus and
164165
// the original CDNStatus are both not equal to success
165-
tm.metrics.tasks.WithLabelValues(taskID, task.CdnStatus).Dec()
166-
tm.metrics.tasks.WithLabelValues(taskID, updateTaskInfo.CdnStatus).Inc()
166+
tm.metrics.tasks.WithLabelValues(task.CdnStatus).Dec()
167+
tm.metrics.tasks.WithLabelValues(updateTaskInfo.CdnStatus).Inc()
167168
task.CdnStatus = updateTaskInfo.CdnStatus
168169
return nil
169170
}
@@ -178,27 +179,15 @@ func (tm *Manager) updateTask(taskID string, updateTaskInfo *types.TaskInfo) err
178179
task.RealMd5 = updateTaskInfo.RealMd5
179180
}
180181

181-
// only update the task info when the new CDNStatus equals success
182-
// and the origin CDNStatus not equals success.
183-
if isSuccessCDN(updateTaskInfo.CdnStatus) {
184-
if updateTaskInfo.FileLength != 0 {
185-
task.FileLength = updateTaskInfo.FileLength
186-
}
187-
188-
if !stringutils.IsEmptyStr(updateTaskInfo.RealMd5) {
189-
task.RealMd5 = updateTaskInfo.RealMd5
190-
}
191-
192-
var pieceTotal int32
193-
if updateTaskInfo.FileLength > 0 {
194-
pieceTotal = int32((updateTaskInfo.FileLength + int64(task.PieceSize-1)) / int64(task.PieceSize))
195-
}
196-
if pieceTotal != 0 {
197-
task.PieceTotal = pieceTotal
198-
}
182+
var pieceTotal int32
183+
if updateTaskInfo.FileLength > 0 {
184+
pieceTotal = int32((updateTaskInfo.FileLength + int64(task.PieceSize-1)) / int64(task.PieceSize))
199185
}
200-
tm.metrics.tasks.WithLabelValues(taskID, task.CdnStatus).Dec()
201-
tm.metrics.tasks.WithLabelValues(taskID, updateTaskInfo.CdnStatus).Inc()
186+
if pieceTotal != 0 {
187+
task.PieceTotal = pieceTotal
188+
}
189+
tm.metrics.tasks.WithLabelValues(task.CdnStatus).Dec()
190+
tm.metrics.tasks.WithLabelValues(updateTaskInfo.CdnStatus).Inc()
202191
task.CdnStatus = updateTaskInfo.CdnStatus
203192

204193
return nil
@@ -342,14 +331,18 @@ func (tm *Manager) parseAvailablePeers(ctx context.Context, clientID string, tas
342331
return true, finishInfo, nil
343332
}
344333

334+
// Get peerName to represent peer in metrics.
335+
p, _ := tm.peerMgr.Get(context.Background(), dfgetTask.PeerID)
336+
peerName := peer.GeneratePeerName(p)
337+
345338
// get scheduler pieceResult
346339
logrus.Debugf("start scheduler for taskID: %s clientID: %s", task.ID, clientID)
347340
startTime := time.Now()
348341
pieceResult, err := tm.schedulerMgr.Schedule(ctx, task.ID, clientID, dfgetTask.PeerID)
349342
if err != nil {
350343
return false, nil, err
351344
}
352-
tm.metrics.scheduleDurationMilliSeconds.WithLabelValues(task.ID).Observe(timeutils.SinceInMilliseconds(startTime))
345+
tm.metrics.scheduleDurationMilliSeconds.WithLabelValues(peerName).Observe(timeutils.SinceInMilliseconds(startTime))
353346
logrus.Debugf("get scheduler result length(%d) with taskID(%s) and clientID(%s)", len(pieceResult), task.ID, clientID)
354347

355348
var pieceInfos []*types.PieceInfo

0 commit comments

Comments
 (0)