diff --git a/node.go b/node.go index bf2f759c..c6b949ea 100644 --- a/node.go +++ b/node.go @@ -18,6 +18,7 @@ package main import ( "log" "os/exec" + "regexp" "sort" "strconv" "strings" @@ -27,12 +28,13 @@ import ( // NodeMetrics stores metrics for each node type NodeMetrics struct { - memAlloc uint64 - memTotal uint64 - cpuAlloc uint64 - cpuIdle uint64 - cpuOther uint64 - cpuTotal uint64 + memAlloc uint64 + memTotal uint64 + cpuAlloc uint64 + cpuIdle uint64 + cpuOther uint64 + cpuTotal uint64 + gpuTotal uint64 nodeStatus string } @@ -53,26 +55,31 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { for _, line := range linesUniq { node := strings.Fields(line) nodeName := node[0] - nodeStatus := node[4] // mixed, allocated, etc. + nodeStatus := node[5] // mixed, allocated, etc. - nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, ""} + nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, 0, ""} memAlloc, _ := strconv.ParseUint(node[1], 10, 64) memTotal, _ := strconv.ParseUint(node[2], 10, 64) - cpuInfo := strings.Split(node[3], "/") cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64) cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64) cpuOther, _ := strconv.ParseUint(cpuInfo[2], 10, 64) cpuTotal, _ := strconv.ParseUint(cpuInfo[3], 10, 64) + var gpuTotal uint64 + gpuRegex := regexp.MustCompile("gpu:(.*)") + if len(gpuRegex.FindStringSubmatch(node[4])) == 2 { + gpuTotal, _ = strconv.ParseUint(gpuRegex.FindStringSubmatch(node[4])[1], 10, 64) + } nodes[nodeName].memAlloc = memAlloc nodes[nodeName].memTotal = memTotal nodes[nodeName].cpuAlloc = cpuAlloc nodes[nodeName].cpuIdle = cpuIdle nodes[nodeName].cpuOther = cpuOther nodes[nodeName].cpuTotal = cpuTotal + nodes[nodeName].gpuTotal = gpuTotal nodes[nodeName].nodeStatus = nodeStatus } @@ -82,7 +89,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { // NodeData executes the sinfo command to get data for each node // It returns the output of the sinfo command func NodeData() []byte { - cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong") + cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,Gres,StateLong") out, err := cmd.Output() if err != nil { log.Fatal(err) @@ -97,12 +104,13 @@ type NodeCollector struct { cpuTotal *prometheus.Desc memAlloc *prometheus.Desc memTotal *prometheus.Desc + gpuTotal *prometheus.Desc } // NewNodeCollector creates a Prometheus collector to keep all our stats in // It returns a set of collections for consumption func NewNodeCollector() *NodeCollector { - labels := []string{"node","status"} + labels := []string{"node", "status"} return &NodeCollector{ cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil), @@ -111,6 +119,7 @@ func NewNodeCollector() *NodeCollector { cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil), memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil), memTotal: prometheus.NewDesc("slurm_node_mem_total", "Total memory per node", labels, nil), + gpuTotal: prometheus.NewDesc("slurm_node_gpu_total", "Total GPUs per node", labels, nil), } } @@ -122,16 +131,18 @@ func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { ch <- nc.cpuTotal ch <- nc.memAlloc ch <- nc.memTotal + ch <- nc.gpuTotal } func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { nodes := NodeGetMetrics() for node := range nodes { ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus) - ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nodes[node].memTotal), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.gpuTotal, prometheus.GaugeValue, float64(nodes[node].gpuTotal), node, nodes[node].nodeStatus) } } diff --git a/users.go b/users.go index 2b0e85e5..d37cee0e 100644 --- a/users.go +++ b/users.go @@ -16,18 +16,18 @@ along with this program. If not, see . */ package main import ( - "io/ioutil" - "os/exec" - "log" - "strings" - "strconv" - "regexp" - "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus" + "io/ioutil" + "log" + "os/exec" + "regexp" + "strconv" + "strings" ) func UsersData() []byte { - cmd := exec.Command("squeue","-a","-r","-h","-o %A|%u|%T|%C") - stdout, err := cmd.StdoutPipe() + cmd := exec.Command("squeue", "-a", "-r", "-h", "-o %A|%u|%T|%C|%m") + stdout, err := cmd.StdoutPipe() if err != nil { log.Fatal(err) } @@ -42,81 +42,101 @@ func UsersData() []byte { } type UserJobMetrics struct { - pending float64 - running float64 - running_cpus float64 - suspended float64 + pending float64 + running float64 + running_cpus float64 + running_mem float64 + suspended float64 } func ParseUsersMetrics(input []byte) map[string]*UserJobMetrics { - users := make(map[string]*UserJobMetrics) - lines := strings.Split(string(input), "\n") - for _, line := range lines { - if strings.Contains(line,"|") { - user := strings.Split(line,"|")[1] - _,key := users[user] - if !key { - users[user] = &UserJobMetrics{0,0,0,0} - } - state := strings.Split(line,"|")[2] - state = strings.ToLower(state) - cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64) - pending := regexp.MustCompile(`^pending`) - running := regexp.MustCompile(`^running`) - suspended := regexp.MustCompile(`^suspended`) - switch { - case pending.MatchString(state) == true: - users[user].pending++ - case running.MatchString(state) == true: - users[user].running++ - users[user].running_cpus += cpus - case suspended.MatchString(state) == true: - users[user].suspended++ - } - } - } - return users + users := make(map[string]*UserJobMetrics) + lines := strings.Split(string(input), "\n") + for _, line := range lines { + if strings.Contains(line, "|") { + user := strings.Split(line, "|")[1] + _, key := users[user] + if !key { + users[user] = &UserJobMetrics{0, 0, 0, 0, 0} + } + state := strings.Split(line, "|")[2] + state = strings.ToLower(state) + cpus, _ := strconv.ParseFloat(strings.Split(line, "|")[3], 64) + + var mem float64 + memstr := strings.Split(line, "|")[4] + memfloat64, _ := strconv.ParseFloat(string(memstr[0:len(memstr)-1]), 64) + switch string(memstr[len(memstr)-1]) { + case "M": + mem = memfloat64 * 1024 + case "G": + mem = memfloat64 * 1024 * 1024 + case "T": + mem = memfloat64 * 1024 * 1024 * 1024 + } + + pending := regexp.MustCompile(`^pending`) + running := regexp.MustCompile(`^running`) + suspended := regexp.MustCompile(`^suspended`) + switch { + case pending.MatchString(state) == true: + users[user].pending++ + case running.MatchString(state) == true: + users[user].running++ + users[user].running_cpus += cpus + users[user].running_mem += mem + case suspended.MatchString(state) == true: + users[user].suspended++ + } + } + } + return users } type UsersCollector struct { - pending *prometheus.Desc - running *prometheus.Desc - running_cpus *prometheus.Desc - suspended *prometheus.Desc + pending *prometheus.Desc + running *prometheus.Desc + running_cpus *prometheus.Desc + running_mem *prometheus.Desc + suspended *prometheus.Desc } func NewUsersCollector() *UsersCollector { - labels := []string{"user"} - return &UsersCollector { - pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), - running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), - running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil), - suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), - } + labels := []string{"user"} + return &UsersCollector{ + pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), + running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), + running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil), + running_mem: prometheus.NewDesc("slurm_user_mem_running", "Running mem for user", labels, nil), + suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), + } } func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- uc.pending - ch <- uc.running - ch <- uc.running_cpus - ch <- uc.suspended + ch <- uc.pending + ch <- uc.running + ch <- uc.running_cpus + ch <- uc.running_mem + ch <- uc.suspended } func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) { - um := ParseUsersMetrics(UsersData()) - for u := range um { - if um[u].pending > 0 { - ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) - } - if um[u].running > 0 { - ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) - } - if um[u].running_cpus > 0 { - ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) - } - if um[u].suspended > 0 { - ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) - } - } + um := ParseUsersMetrics(UsersData()) + for u := range um { + if um[u].pending > 0 { + ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) + } + if um[u].running > 0 { + ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) + } + if um[u].running_cpus > 0 { + ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) + } + if um[u].running_mem > 0 { + ch <- prometheus.MustNewConstMetric(uc.running_mem, prometheus.GaugeValue, um[u].running_mem, u) + } + if um[u].suspended > 0 { + ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) + } + } } -