From 27712e7ef0b79d6cbb849db0da4689b520447546 Mon Sep 17 00:00:00 2001 From: JesseStutler Date: Thu, 3 Apr 2025 10:00:19 +0800 Subject: [PATCH 1/4] Remove the execute permission for some files, chmod to 644 Signed-off-by: JesseStutler --- docs/design/jobflow/README.md | 0 pkg/controllers/jobflow/constant.go | 0 pkg/controllers/jobflow/jobflow_controller.go | 0 pkg/controllers/jobflow/jobflow_controller_action.go | 0 pkg/controllers/jobflow/jobflow_controller_action_test.go | 0 pkg/controllers/jobflow/jobflow_controller_handler.go | 0 pkg/controllers/jobflow/jobflow_controller_handler_test.go | 0 pkg/controllers/jobflow/jobflow_controller_util.go | 0 pkg/controllers/jobflow/jobflow_controller_util_test.go | 0 pkg/controllers/jobflow/state/factory.go | 0 pkg/controllers/jobflow/state/failed.go | 0 pkg/controllers/jobflow/state/pending.go | 0 pkg/controllers/jobflow/state/running.go | 0 pkg/controllers/jobflow/state/succeed.go | 0 pkg/controllers/jobflow/state/terminating.go | 0 pkg/controllers/jobtemplate/constant.go | 0 pkg/controllers/jobtemplate/jobTemplate_controller_util_test.go | 0 pkg/controllers/jobtemplate/jobtemplate_controller.go | 0 pkg/controllers/jobtemplate/jobtemplate_controller_action.go | 0 pkg/controllers/jobtemplate/jobtemplate_controller_action_test.go | 0 pkg/controllers/jobtemplate/jobtemplate_controller_handler.go | 0 .../jobtemplate/jobtemplate_controller_handler_test.go | 0 pkg/controllers/jobtemplate/jobtemplate_controller_util.go | 0 pkg/webhooks/admission/pods/mutate/annotation.go | 0 pkg/webhooks/admission/pods/mutate/factory.go | 0 pkg/webhooks/admission/pods/mutate/namespace.go | 0 26 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 docs/design/jobflow/README.md mode change 100755 => 100644 pkg/controllers/jobflow/constant.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller_action.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller_action_test.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller_handler.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller_handler_test.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller_util.go mode change 100755 => 100644 pkg/controllers/jobflow/jobflow_controller_util_test.go mode change 100755 => 100644 pkg/controllers/jobflow/state/factory.go mode change 100755 => 100644 pkg/controllers/jobflow/state/failed.go mode change 100755 => 100644 pkg/controllers/jobflow/state/pending.go mode change 100755 => 100644 pkg/controllers/jobflow/state/running.go mode change 100755 => 100644 pkg/controllers/jobflow/state/succeed.go mode change 100755 => 100644 pkg/controllers/jobflow/state/terminating.go mode change 100755 => 100644 pkg/controllers/jobtemplate/constant.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobTemplate_controller_util_test.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobtemplate_controller.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobtemplate_controller_action.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobtemplate_controller_action_test.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobtemplate_controller_handler.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobtemplate_controller_handler_test.go mode change 100755 => 100644 pkg/controllers/jobtemplate/jobtemplate_controller_util.go mode change 100755 => 100644 pkg/webhooks/admission/pods/mutate/annotation.go mode change 100755 => 100644 pkg/webhooks/admission/pods/mutate/factory.go mode change 100755 => 100644 pkg/webhooks/admission/pods/mutate/namespace.go diff --git a/docs/design/jobflow/README.md b/docs/design/jobflow/README.md old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/constant.go b/pkg/controllers/jobflow/constant.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller.go b/pkg/controllers/jobflow/jobflow_controller.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller_action.go b/pkg/controllers/jobflow/jobflow_controller_action.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller_action_test.go b/pkg/controllers/jobflow/jobflow_controller_action_test.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller_handler.go b/pkg/controllers/jobflow/jobflow_controller_handler.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller_handler_test.go b/pkg/controllers/jobflow/jobflow_controller_handler_test.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller_util.go b/pkg/controllers/jobflow/jobflow_controller_util.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/jobflow_controller_util_test.go b/pkg/controllers/jobflow/jobflow_controller_util_test.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/state/factory.go b/pkg/controllers/jobflow/state/factory.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/state/failed.go b/pkg/controllers/jobflow/state/failed.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/state/pending.go b/pkg/controllers/jobflow/state/pending.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/state/running.go b/pkg/controllers/jobflow/state/running.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/state/succeed.go b/pkg/controllers/jobflow/state/succeed.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobflow/state/terminating.go b/pkg/controllers/jobflow/state/terminating.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/constant.go b/pkg/controllers/jobtemplate/constant.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobTemplate_controller_util_test.go b/pkg/controllers/jobtemplate/jobTemplate_controller_util_test.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobtemplate_controller.go b/pkg/controllers/jobtemplate/jobtemplate_controller.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobtemplate_controller_action.go b/pkg/controllers/jobtemplate/jobtemplate_controller_action.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobtemplate_controller_action_test.go b/pkg/controllers/jobtemplate/jobtemplate_controller_action_test.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobtemplate_controller_handler.go b/pkg/controllers/jobtemplate/jobtemplate_controller_handler.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobtemplate_controller_handler_test.go b/pkg/controllers/jobtemplate/jobtemplate_controller_handler_test.go old mode 100755 new mode 100644 diff --git a/pkg/controllers/jobtemplate/jobtemplate_controller_util.go b/pkg/controllers/jobtemplate/jobtemplate_controller_util.go old mode 100755 new mode 100644 diff --git a/pkg/webhooks/admission/pods/mutate/annotation.go b/pkg/webhooks/admission/pods/mutate/annotation.go old mode 100755 new mode 100644 diff --git a/pkg/webhooks/admission/pods/mutate/factory.go b/pkg/webhooks/admission/pods/mutate/factory.go old mode 100755 new mode 100644 diff --git a/pkg/webhooks/admission/pods/mutate/namespace.go b/pkg/webhooks/admission/pods/mutate/namespace.go old mode 100755 new mode 100644 From 7c6478a51bab7e38a5a9164872f90a2681fadae9 Mon Sep 17 00:00:00 2001 From: JesseStutler Date: Thu, 3 Apr 2025 12:34:34 +0800 Subject: [PATCH 2/4] add a switch to control whether enable pprof in scheduler Signed-off-by: JesseStutler --- cmd/scheduler/app/options/options.go | 2 ++ cmd/scheduler/app/server.go | 33 ++++++++++++++++--- cmd/scheduler/main.go | 3 -- .../chart/volcano/templates/scheduler.yaml | 3 ++ installer/helm/chart/volcano/values.yaml | 1 + 5 files changed, 34 insertions(+), 8 deletions(-) diff --git a/cmd/scheduler/app/options/options.go b/cmd/scheduler/app/options/options.go index ec2bcedd9d..b40858aaa6 100644 --- a/cmd/scheduler/app/options/options.go +++ b/cmd/scheduler/app/options/options.go @@ -67,6 +67,7 @@ type ServerOption struct { DefaultQueue string PrintVersion bool EnableMetrics bool + EnablePprof bool ListenAddress string EnablePriorityClass bool EnableCSIStorage bool @@ -141,6 +142,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) { "Enable tracking of available storage capacity that CSI drivers provide; it is false by default") fs.BoolVar(&s.EnableHealthz, "enable-healthz", false, "Enable the health check; it is false by default") fs.BoolVar(&s.EnableMetrics, "enable-metrics", false, "Enable the metrics function; it is false by default") + fs.BoolVar(&s.EnablePprof, "enable-pprof", false, "Enable the pprof endpoint; it is false by default") fs.StringSliceVar(&s.NodeSelector, "node-selector", nil, "volcano only work with the labeled node, like: --node-selector=volcano.sh/role:train --node-selector=volcano.sh/role:serving") fs.BoolVar(&s.EnableCacheDumper, "cache-dumper", true, "Enable the cache dumper, it's true by default") fs.StringVar(&s.CacheDumpFileDir, "cache-dump-dir", "/tmp", "The target dir where the json file put at when dump cache info to json file") diff --git a/cmd/scheduler/app/server.go b/cmd/scheduler/app/server.go index cbe5909d9b..3cd8fc02b3 100644 --- a/cmd/scheduler/app/server.go +++ b/cmd/scheduler/app/server.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "net/http" + "net/http/pprof" "os" "volcano.sh/apis/pkg/apis/helpers" @@ -69,11 +70,8 @@ func Run(opt *options.ServerOption) error { panic(err) } - if opt.EnableMetrics { - go func() { - http.Handle("/metrics", commonutil.PromHandler()) - klog.Fatalf("Prometheus Http Server failed %s", http.ListenAndServe(opt.ListenAddress, nil)) - }() + if opt.EnableMetrics || opt.EnablePprof { + go startMetricsServer(opt) } if opt.EnableHealthz { @@ -142,3 +140,28 @@ func Run(opt *options.ServerOption) error { }) return fmt.Errorf("lost lease") } + +func startMetricsServer(opt *options.ServerOption) { + mux := http.NewServeMux() + + if opt.EnableMetrics { + mux.Handle("/metrics", commonutil.PromHandler()) + } + + if opt.EnablePprof { + mux.HandleFunc("/debug/pprof/", pprof.Index) + mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) + mux.HandleFunc("/debug/pprof/profile", pprof.Profile) + mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) + mux.HandleFunc("/debug/pprof/trace", pprof.Trace) + } + + server := &http.Server{ + Addr: opt.ListenAddress, + Handler: mux, + } + + if err := server.ListenAndServe(); err != nil { + klog.Errorf("start metrics/pprof http server failed: %v", err) + } +} diff --git a/cmd/scheduler/main.go b/cmd/scheduler/main.go index bd128d9885..3a9a7d1f88 100644 --- a/cmd/scheduler/main.go +++ b/cmd/scheduler/main.go @@ -29,9 +29,6 @@ import ( componentbaseoptions "k8s.io/component-base/config/options" "k8s.io/klog/v2" - // init pprof server - _ "net/http/pprof" - "volcano.sh/volcano/cmd/scheduler/app" "volcano.sh/volcano/cmd/scheduler/app/options" commonutil "volcano.sh/volcano/pkg/util" diff --git a/installer/helm/chart/volcano/templates/scheduler.yaml b/installer/helm/chart/volcano/templates/scheduler.yaml index 887c5c9733..cdba8bfe5e 100644 --- a/installer/helm/chart/volcano/templates/scheduler.yaml +++ b/installer/helm/chart/volcano/templates/scheduler.yaml @@ -185,6 +185,9 @@ spec: {{- if .Values.custom.scheduler_metrics_enable }} - --enable-metrics=true {{- end }} + {{- if .Values.custom.scheduler_pprof_enable }} + - --enable-pprof=true + {{- end }} - --leader-elect={{ .Values.custom.leader_elect_enable }} {{- if .Values.custom.leader_elect_enable }} - --leader-elect-resource-namespace={{ .Release.Namespace }} diff --git a/installer/helm/chart/volcano/values.yaml b/installer/helm/chart/volcano/values.yaml index 7584ed7a6c..25cc5c782e 100644 --- a/installer/helm/chart/volcano/values.yaml +++ b/installer/helm/chart/volcano/values.yaml @@ -21,6 +21,7 @@ custom: scheduler_enable: true scheduler_replicas: 1 scheduler_metrics_enable: true + scheduler_pprof_enable: false scheduler_name: ~ leader_elect_enable: false controller_kube_api_qps: 50 From 25e529ceaf426b54fe6caeef753649b64390b0e1 Mon Sep 17 00:00:00 2001 From: Monokaix Date: Tue, 22 Apr 2025 11:32:31 +0800 Subject: [PATCH 3/4] Add http server timeout Signed-off-by: Monokaix --- cmd/agent/app/app.go | 8 ++++++-- cmd/controller-manager/app/server.go | 13 +++++++++++-- cmd/scheduler/app/server.go | 8 +++++--- cmd/webhook-manager/app/server.go | 7 +++++-- go.mod | 2 +- go.sum | 4 ++-- pkg/util/socket.go | 7 ++++++- 7 files changed, 36 insertions(+), 13 deletions(-) diff --git a/cmd/agent/app/app.go b/cmd/agent/app/app.go index c2982a7780..0c1a5c42cd 100644 --- a/cmd/agent/app/app.go +++ b/cmd/agent/app/app.go @@ -32,6 +32,7 @@ import ( "k8s.io/controller-manager/pkg/clientbuilder" "k8s.io/klog/v2" + "volcano.sh/apis/pkg/apis/helpers" "volcano.sh/volcano/cmd/agent/app/options" "volcano.sh/volcano/pkg/agent/healthcheck" "volcano.sh/volcano/pkg/agent/utils" @@ -81,8 +82,11 @@ func RunServer(checker healthcheck.HealthChecker, address string, port int) { mux.HandleFunc("/healthz", checker.HealthCheck) mux.Handle("/metrics", promhttp.Handler()) s := &http.Server{ - Addr: net.JoinHostPort(address, strconv.Itoa(port)), - Handler: mux, + Addr: net.JoinHostPort(address, strconv.Itoa(port)), + Handler: mux, + ReadHeaderTimeout: helpers.DefaultReadHeaderTimeout, + ReadTimeout: helpers.DefaultReadTimeout, + WriteTimeout: helpers.DefaultWriteTimeout, } if err := s.ListenAndServe(); err != nil { klog.Fatalf("failed to start health check server: %v", err) diff --git a/cmd/controller-manager/app/server.go b/cmd/controller-manager/app/server.go index 84d223b427..8e10fe0e0d 100644 --- a/cmd/controller-manager/app/server.go +++ b/cmd/controller-manager/app/server.go @@ -58,8 +58,17 @@ func Run(opt *options.ServerOption) error { if opt.EnableMetrics { go func() { - http.Handle("/metrics", commonutil.PromHandler()) - klog.Fatalf("Prometheus Http Server failed %s", http.ListenAndServe(opt.ListenAddress, nil)) + mux := http.NewServeMux() + mux.Handle("/metrics", commonutil.PromHandler()) + + server := &http.Server{ + Addr: opt.ListenAddress, + Handler: mux, + ReadHeaderTimeout: helpers.DefaultReadHeaderTimeout, + ReadTimeout: helpers.DefaultReadTimeout, + WriteTimeout: helpers.DefaultWriteTimeout, + } + klog.Fatalf("Prometheus Http Server failed: %s", server.ListenAndServe()) }() } diff --git a/cmd/scheduler/app/server.go b/cmd/scheduler/app/server.go index 3cd8fc02b3..bc0cdce9f5 100644 --- a/cmd/scheduler/app/server.go +++ b/cmd/scheduler/app/server.go @@ -24,7 +24,6 @@ import ( "os" "volcano.sh/apis/pkg/apis/helpers" - "volcano.sh/volcano/cmd/scheduler/app/options" "volcano.sh/volcano/pkg/kube" "volcano.sh/volcano/pkg/scheduler" @@ -157,8 +156,11 @@ func startMetricsServer(opt *options.ServerOption) { } server := &http.Server{ - Addr: opt.ListenAddress, - Handler: mux, + Addr: opt.ListenAddress, + Handler: mux, + ReadHeaderTimeout: helpers.DefaultReadHeaderTimeout, + ReadTimeout: helpers.DefaultReadTimeout, + WriteTimeout: helpers.DefaultWriteTimeout, } if err := server.ListenAndServe(); err != nil { diff --git a/cmd/webhook-manager/app/server.go b/cmd/webhook-manager/app/server.go index 5f3c833610..cf51ef3050 100644 --- a/cmd/webhook-manager/app/server.go +++ b/cmd/webhook-manager/app/server.go @@ -108,8 +108,11 @@ func Run(config *options.Config) error { } server := &http.Server{ - Addr: config.ListenAddress + ":" + strconv.Itoa(config.Port), - TLSConfig: configTLS(config, restConfig), + Addr: config.ListenAddress + ":" + strconv.Itoa(config.Port), + TLSConfig: configTLS(config, restConfig), + ReadHeaderTimeout: helpers.DefaultReadHeaderTimeout, + ReadTimeout: helpers.DefaultReadTimeout, + WriteTimeout: helpers.DefaultWriteTimeout, } go func() { err = server.ListenAndServeTLS("", "") diff --git a/go.mod b/go.mod index d1d10a4dbd..f2c78cea7e 100644 --- a/go.mod +++ b/go.mod @@ -46,7 +46,7 @@ require ( sigs.k8s.io/controller-runtime v0.13.0 sigs.k8s.io/yaml v1.4.0 stathat.com/c/consistent v1.0.0 - volcano.sh/apis v1.11.1 + volcano.sh/apis v1.11.2 ) require ( diff --git a/go.sum b/go.sum index 9cb4407288..884ff1a725 100644 --- a/go.sum +++ b/go.sum @@ -510,5 +510,5 @@ sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= stathat.com/c/consistent v1.0.0 h1:ezyc51EGcRPJUxfHGSgJjWzJdj3NiMU9pNfLNGiXV0c= stathat.com/c/consistent v1.0.0/go.mod h1:QkzMWzcbB+yQBL2AttO6sgsQS/JSTapcDISJalmCDS0= -volcano.sh/apis v1.11.1 h1:BuewlHccLIJVJmVcBF32KewXJmtwpCjx4d7fxVxG900= -volcano.sh/apis v1.11.1/go.mod h1:FOdmG++9+8lgENJ9XXDh+O3Jcb9YVRnlMSpgIh3NSVI= +volcano.sh/apis v1.11.2 h1:Vz8NzP0af8vyxRccrEUt6/FikD5eeEOnCZRolVzZvK8= +volcano.sh/apis v1.11.2/go.mod h1:FOdmG++9+8lgENJ9XXDh+O3Jcb9YVRnlMSpgIh3NSVI= diff --git a/pkg/util/socket.go b/pkg/util/socket.go index 481fadc5c2..fa0a2893d4 100644 --- a/pkg/util/socket.go +++ b/pkg/util/socket.go @@ -31,6 +31,8 @@ import ( "golang.org/x/sys/unix" "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/klog/v2" + + "volcano.sh/apis/pkg/apis/helpers" ) const ( @@ -201,7 +203,10 @@ func listenUnix(componentName string, socketDir string) (net.Listener, error) { // serveOnListener starts the server using given listener, loops forever. func serveOnListener(l net.Listener, m *http.ServeMux) error { server := http.Server{ - Handler: m, + Handler: m, + ReadHeaderTimeout: helpers.DefaultReadHeaderTimeout, + ReadTimeout: helpers.DefaultReadTimeout, + WriteTimeout: helpers.DefaultWriteTimeout, } return server.Serve(l) } From fba23c8d45c680adbdabdda48b6eab8d1046d41c Mon Sep 17 00:00:00 2001 From: Monokaix Date: Tue, 22 Apr 2025 16:46:22 +0800 Subject: [PATCH 4/4] Add warning msg when TLS verification disabled Signed-off-by: Monokaix (cherry picked from commit b4f2da4d86324a809f18276e1bae0cc475d0813d) --- pkg/scheduler/metrics/source/metrics_client_elasticsearch.go | 5 +++++ pkg/scheduler/metrics/source/metrics_client_prometheus.go | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/pkg/scheduler/metrics/source/metrics_client_elasticsearch.go b/pkg/scheduler/metrics/source/metrics_client_elasticsearch.go index 6957e2e21d..62faa7f119 100644 --- a/pkg/scheduler/metrics/source/metrics_client_elasticsearch.go +++ b/pkg/scheduler/metrics/source/metrics_client_elasticsearch.go @@ -26,6 +26,7 @@ import ( "time" "github.com/elastic/go-elasticsearch/v7" + "k8s.io/klog/v2" ) const ( @@ -65,6 +66,10 @@ func NewElasticsearchMetricsClient(conf map[string]string) (*ElasticsearchMetric } var err error insecureSkipVerify := conf["tls.insecureSkipVerify"] == "true" + if insecureSkipVerify { + klog.Warningf("WARNING: TLS certificate verification is disabled which is insecure. This should not be used in production environments") + } + e.es, err = elasticsearch.NewClient(elasticsearch.Config{ Addresses: []string{address}, Username: conf["elasticsearch.username"], diff --git a/pkg/scheduler/metrics/source/metrics_client_prometheus.go b/pkg/scheduler/metrics/source/metrics_client_prometheus.go index 97bc16fa81..d042cc57b9 100644 --- a/pkg/scheduler/metrics/source/metrics_client_prometheus.go +++ b/pkg/scheduler/metrics/source/metrics_client_prometheus.go @@ -61,6 +61,10 @@ func (p *PrometheusMetricsClient) NodeMetricsAvg(ctx context.Context, nodeName s var client api.Client var err error insecureSkipVerify := p.conf["tls.insecureSkipVerify"] == "true" + if insecureSkipVerify { + klog.Warningf("WARNING: TLS certificate verification is disabled which is insecure. This should not be used in production environments") + } + tr := &http.Transport{ TLSClientConfig: &tls.Config{ InsecureSkipVerify: insecureSkipVerify,