Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 23 additions & 12 deletions node.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package main
import (
"log"
"os/exec"
"regexp"
"sort"
"strconv"
"strings"
Expand All @@ -27,12 +28,13 @@ import (

// NodeMetrics stores the values parsed for a single node from one line of
// sinfo output. The field order is significant: ParseNodeMetrics initialises
// the struct with a positional literal.
type NodeMetrics struct {
	memAlloc   uint64 // second sinfo column (AllocMem)
	memTotal   uint64 // third sinfo column (Memory)
	cpuAlloc   uint64 // 1st "/"-separated part of the CPUsState column
	cpuIdle    uint64 // 2nd "/"-separated part of the CPUsState column
	cpuOther   uint64 // 3rd "/"-separated part of the CPUsState column
	cpuTotal   uint64 // 4th "/"-separated part of the CPUsState column
	gpuTotal   uint64 // integer following "gpu:" in the Gres column; 0 when absent or unparseable
	nodeStatus string // StateLong column (mixed, allocated, etc.)
}

Expand All @@ -53,26 +55,31 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
for _, line := range linesUniq {
node := strings.Fields(line)
nodeName := node[0]
nodeStatus := node[4] // mixed, allocated, etc.
nodeStatus := node[5] // mixed, allocated, etc.

nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, ""}
nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, 0, ""}

memAlloc, _ := strconv.ParseUint(node[1], 10, 64)
memTotal, _ := strconv.ParseUint(node[2], 10, 64)


cpuInfo := strings.Split(node[3], "/")
cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64)
cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64)
cpuOther, _ := strconv.ParseUint(cpuInfo[2], 10, 64)
cpuTotal, _ := strconv.ParseUint(cpuInfo[3], 10, 64)

var gpuTotal uint64
gpuRegex := regexp.MustCompile("gpu:(.*)")
if len(gpuRegex.FindStringSubmatch(node[4])) == 2 {
gpuTotal, _ = strconv.ParseUint(gpuRegex.FindStringSubmatch(node[4])[1], 10, 64)
}
nodes[nodeName].memAlloc = memAlloc
nodes[nodeName].memTotal = memTotal
nodes[nodeName].cpuAlloc = cpuAlloc
nodes[nodeName].cpuIdle = cpuIdle
nodes[nodeName].cpuOther = cpuOther
nodes[nodeName].cpuTotal = cpuTotal
nodes[nodeName].gpuTotal = gpuTotal
nodes[nodeName].nodeStatus = nodeStatus
}

Expand All @@ -82,7 +89,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
// NodeData executes the sinfo command to get data for each node
// It returns the output of the sinfo command
func NodeData() []byte {
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong")
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,Gres,StateLong")
out, err := cmd.Output()
if err != nil {
log.Fatal(err)
Expand All @@ -97,12 +104,13 @@ type NodeCollector struct {
cpuTotal *prometheus.Desc
memAlloc *prometheus.Desc
memTotal *prometheus.Desc
gpuTotal *prometheus.Desc
}

// NewNodeCollector creates a Prometheus collector to keep all our stats in
// It returns a set of collections for consumption
func NewNodeCollector() *NodeCollector {
labels := []string{"node","status"}
labels := []string{"node", "status"}

return &NodeCollector{
cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil),
Expand All @@ -111,6 +119,7 @@ func NewNodeCollector() *NodeCollector {
cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil),
memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil),
memTotal: prometheus.NewDesc("slurm_node_mem_total", "Total memory per node", labels, nil),
gpuTotal: prometheus.NewDesc("slurm_node_gpu_total", "Total GPUs per node", labels, nil),
}
}

Expand All @@ -122,16 +131,18 @@ func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- nc.cpuTotal
ch <- nc.memAlloc
ch <- nc.memTotal
ch <- nc.gpuTotal
}

// Collect fetches the current per-node metrics via NodeGetMetrics and sends
// one gauge per descriptor for every node. Each gauge carries two label
// values: the node name and the node status.
//
// Every descriptor is sent exactly once per node; sending the same
// descriptor with the same label values twice would produce a duplicate
// series within a single collection.
func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
	nodes := NodeGetMetrics()
	for name, m := range nodes {
		ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(m.cpuAlloc), name, m.nodeStatus)
		ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(m.cpuIdle), name, m.nodeStatus)
		ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(m.cpuOther), name, m.nodeStatus)
		ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(m.cpuTotal), name, m.nodeStatus)
		ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(m.memAlloc), name, m.nodeStatus)
		ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(m.memTotal), name, m.nodeStatus)
		ch <- prometheus.MustNewConstMetric(nc.gpuTotal, prometheus.GaugeValue, float64(m.gpuTotal), name, m.nodeStatus)
	}
}
162 changes: 91 additions & 71 deletions users.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"io/ioutil"
"os/exec"
"log"
"strings"
"strconv"
"regexp"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus"
"io/ioutil"
"log"
"os/exec"
"regexp"
"strconv"
"strings"
)

func UsersData() []byte {
cmd := exec.Command("squeue","-a","-r","-h","-o %A|%u|%T|%C")
stdout, err := cmd.StdoutPipe()
cmd := exec.Command("squeue", "-a", "-r", "-h", "-o %A|%u|%T|%C|%m")
stdout, err := cmd.StdoutPipe()
if err != nil {
log.Fatal(err)
}
Expand All @@ -42,81 +42,101 @@ func UsersData() []byte {
}

// UserJobMetrics accumulates, for one user, the job counts per state and the
// resources of that user's running jobs as parsed from squeue output.
type UserJobMetrics struct {
	pending      float64 // number of jobs whose state starts with "pending"
	running      float64 // number of jobs whose state starts with "running"
	running_cpus float64 // sum of the CPU column over running jobs
	running_mem  float64 // sum of the converted memory column over running jobs
	suspended    float64 // number of jobs whose state starts with "suspended"
}

// Job-state matchers, compiled once at package scope rather than once per
// squeue line. They are applied to the lower-cased state column.
var (
	userJobPendingRe   = regexp.MustCompile(`^pending`)
	userJobRunningRe   = regexp.MustCompile(`^running`)
	userJobSuspendedRe = regexp.MustCompile(`^suspended`)
)

// parseUserJobMem converts a memory field made of a number followed by a
// one-letter unit suffix ("M", "G" or "T") into a single scale in which
// 1M == 1024. It returns 0 for an empty field, an unparseable number or any
// other suffix, so a malformed field never aborts parsing.
func parseUserJobMem(field string) float64 {
	field = strings.TrimSpace(field)
	if len(field) < 2 {
		// Nothing before the suffix (or nothing at all): no usable value.
		return 0
	}
	value, err := strconv.ParseFloat(field[:len(field)-1], 64)
	if err != nil {
		return 0
	}
	switch field[len(field)-1] {
	case 'M':
		return value * 1024
	case 'G':
		return value * 1024 * 1024
	case 'T':
		return value * 1024 * 1024 * 1024
	}
	return 0
}

// ParseUsersMetrics aggregates squeue output into per-user job metrics.
//
// input is expected to hold one job per line with "|"-separated columns:
// job id, user, state, CPU count and, optionally, a memory field with a unit
// suffix. Lines with fewer than four columns are skipped. A missing or
// malformed memory column contributes 0 instead of causing an out-of-range
// panic.
//
// The returned map is keyed by user name. Jobs are counted by the prefix of
// their lower-cased state; CPU and memory totals are summed for running jobs
// only.
func ParseUsersMetrics(input []byte) map[string]*UserJobMetrics {
	users := make(map[string]*UserJobMetrics)
	for _, line := range strings.Split(string(input), "\n") {
		// Split once per line instead of once per column.
		fields := strings.Split(line, "|")
		if len(fields) < 4 {
			continue
		}

		user := fields[1]
		m, ok := users[user]
		if !ok {
			m = &UserJobMetrics{}
			users[user] = m
		}

		state := strings.ToLower(fields[2])
		switch {
		case userJobPendingRe.MatchString(state):
			m.pending++
		case userJobRunningRe.MatchString(state):
			m.running++
			// Best effort: an unparseable CPU count is treated as 0.
			cpus, _ := strconv.ParseFloat(fields[3], 64)
			m.running_cpus += cpus
			if len(fields) > 4 {
				m.running_mem += parseUserJobMem(fields[4])
			}
		case userJobSuspendedRe.MatchString(state):
			m.suspended++
		}
	}
	return users
}

// UsersCollector holds one Prometheus descriptor for each per-user metric
// that the collector exports.
type UsersCollector struct {
	pending      *prometheus.Desc
	running      *prometheus.Desc
	running_cpus *prometheus.Desc
	running_mem  *prometheus.Desc
	suspended    *prometheus.Desc
}

// NewUsersCollector builds a UsersCollector whose descriptors all carry a
// single "user" label.
func NewUsersCollector() *UsersCollector {
	labels := []string{"user"}
	return &UsersCollector{
		pending:      prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil),
		running:      prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil),
		running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil),
		running_mem:  prometheus.NewDesc("slurm_user_mem_running", "Running mem for user", labels, nil),
		suspended:    prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil),
	}
}

// Describe sends each of the collector's descriptors to ch exactly once.
func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- uc.pending
	ch <- uc.running
	ch <- uc.running_cpus
	ch <- uc.running_mem
	ch <- uc.suspended
}

// Collect parses the current UsersData output with ParseUsersMetrics and
// sends one gauge per user for every metric whose value is greater than
// zero. Metrics with a zero value are not emitted.
func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) {
	for user, m := range ParseUsersMetrics(UsersData()) {
		if m.pending > 0 {
			ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, m.pending, user)
		}
		if m.running > 0 {
			ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, m.running, user)
		}
		if m.running_cpus > 0 {
			ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, m.running_cpus, user)
		}
		if m.running_mem > 0 {
			ch <- prometheus.MustNewConstMetric(uc.running_mem, prometheus.GaugeValue, m.running_mem, user)
		}
		if m.suspended > 0 {
			ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, m.suspended, user)
		}
	}
}