From cc443f21e4b062a3b4466519a5023130cec1c0f1 Mon Sep 17 00:00:00 2001 From: Rob Kenis Date: Wed, 15 Jan 2025 09:18:57 +0100 Subject: [PATCH] Add profiler address parameter on node-agent This allows us to enable the profiler endpoints on both the server and the node agent. This helps me in troubleshooting the high memory usage when restoring lots of small files. Refs: #8582 --- pkg/cmd/cli/nodeagent/server.go | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/pkg/cmd/cli/nodeagent/server.go b/pkg/cmd/cli/nodeagent/server.go index d5e7193cc1..243a5551b9 100644 --- a/pkg/cmd/cli/nodeagent/server.go +++ b/pkg/cmd/cli/nodeagent/server.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "net/http" + "net/http/pprof" "os" "strings" "time" @@ -86,6 +87,7 @@ const ( type nodeAgentServerConfig struct { metricsAddress string + profilerAddress string resourceTimeout time.Duration dataMoverPrepareTimeout time.Duration nodeAgentConfig string @@ -124,7 +126,8 @@ func NewServerCommand(f client.Factory) *cobra.Command { command.Flags().Var(formatFlag, "log-format", fmt.Sprintf("The format for log output. Valid values are %s.", strings.Join(formatFlag.AllowedValues(), ", "))) command.Flags().DurationVar(&config.resourceTimeout, "resource-timeout", config.resourceTimeout, "How long to wait for resource processes which are not covered by other specific timeout parameters. Default is 10 minutes.") command.Flags().DurationVar(&config.dataMoverPrepareTimeout, "data-mover-prepare-timeout", config.dataMoverPrepareTimeout, "How long to wait for preparing a DataUpload/DataDownload. Default is 30 minutes.") - command.Flags().StringVar(&config.metricsAddress, "metrics-address", config.metricsAddress, "The address to expose prometheus metrics") + command.Flags().StringVar(&config.metricsAddress, "metrics-address", config.metricsAddress, "The address to expose prometheus metrics.") + command.Flags().StringVar(&config.profilerAddress, "profiler-address", config.profilerAddress, "The address to expose the pprof profiler.") command.Flags().StringVar(&config.nodeAgentConfig, "node-agent-configmap", config.nodeAgentConfig, "The name of ConfigMap containing node-agent configurations.") return command @@ -263,6 +266,10 @@ func newNodeAgentServer(logger logrus.FieldLogger, factory client.Factory, confi func (s *nodeAgentServer) run() { signals.CancelOnShutdown(s.cancelFunc, s.logger) + if s.config.profilerAddress != "" { + go s.runProfiler() + } + go func() { metricsMux := http.NewServeMux() metricsMux.Handle("/metrics", promhttp.Handler()) @@ -386,6 +393,24 @@ func (s *nodeAgentServer) run() { } } +func (s *nodeAgentServer) runProfiler() { + mux := http.NewServeMux() + mux.HandleFunc("/debug/pprof/", pprof.Index) + mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) + mux.HandleFunc("/debug/pprof/profile", pprof.Profile) + mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) + mux.HandleFunc("/debug/pprof/trace", pprof.Trace) + + server := &http.Server{ + Addr: s.config.profilerAddress, + Handler: mux, + ReadHeaderTimeout: 3 * time.Second, + } + if err := server.ListenAndServe(); err != nil { + s.logger.WithError(errors.WithStack(err)).Error("error running profiler http server") + } +} + func (s *nodeAgentServer) waitCacheForResume() error { podInformer, err := s.mgr.GetCache().GetInformer(s.ctx, &v1.Pod{}) if err != nil {