From c956c492843845def7516820cf115c5173f0ad54 Mon Sep 17 00:00:00 2001
From: Simon Worthington
Date: Thu, 4 Apr 2024 11:58:08 +1100
Subject: [PATCH 01/17] Improve ranking error output

- Show the number of nodes that were available even if unsuitable,
  because users often misinterpret "0 available" as meaning there is a
  connection error
- Summarise nodes when they all print the same output rather than
  listing all of them

---
 pkg/orchestrator/errors.go | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/pkg/orchestrator/errors.go b/pkg/orchestrator/errors.go
index cae16fe074..a747c30bfc 100644
--- a/pkg/orchestrator/errors.go
+++ b/pkg/orchestrator/errors.go
@@ -2,8 +2,10 @@ package orchestrator
 
 import (
 	"fmt"
+	"strings"
 
 	"github.com/bacalhau-project/bacalhau/pkg/util/idgen"
+	"github.com/samber/lo"
 )
 
 // ErrSchedulerNotFound is returned when the scheduler is not found for a given evaluation type
@@ -33,16 +35,22 @@ func NewErrNotEnoughNodes(requestedNodes int, availableNodes []NodeRank) ErrNotE
 }
 
 func (e ErrNotEnoughNodes) Error() string {
-	nodeErrors := ""
-	available := 0
-	for _, rank := range e.AvailableNodes {
-		if rank.MeetsRequirement() {
-			available += 1
+	suitable := lo.CountBy(e.AvailableNodes, func(rank NodeRank) bool { return rank.MeetsRequirement() })
+	reasons := lo.GroupBy(e.AvailableNodes, func(rank NodeRank) string { return rank.Reason })
+
+	var message strings.Builder
+	fmt.Fprint(&message, "not enough nodes to run job. ")
+	fmt.Fprintf(&message, "requested: %d, available: %d, suitable: %d.", e.RequestedNodes, len(e.AvailableNodes), suitable)
+	for reason, nodes := range reasons {
+		fmt.Fprint(&message, "\n• ")
+		if len(nodes) > 1 {
+			fmt.Fprintf(&message, "%d of %d nodes", len(nodes), len(e.AvailableNodes))
 		} else {
-			nodeErrors += fmt.Sprintf("\n\tNode %s: %s", idgen.ShortNodeID(rank.NodeInfo.ID()), rank.Reason)
+			fmt.Fprintf(&message, "Node %s", idgen.ShortNodeID(nodes[0].NodeInfo.ID()))
 		}
+		fmt.Fprintf(&message, ": %s", reason)
 	}
-	return fmt.Sprintf("not enough nodes to run job. requested: %d, available: %d. %s", e.RequestedNodes, available, nodeErrors)
+	return message.String()
 }
 
 // ErrNoMatchingNodes is returned when no matching nodes in the network to run a job

From 6572703ac274ea873931e39817262285670b5e3e Mon Sep 17 00:00:00 2001
From: Ross Jones
Date: Mon, 8 Apr 2024 16:05:26 +0100
Subject: [PATCH 02/17] Node heartbeats (#3709)

Implements heartbeats for compute nodes, sending heartbeat messages to
the requester node over NATS PubSub. The server, upon receiving a
heartbeat, updates its map of nodes with the current server-side
timestamp.

Compute nodes use the heartbeat client to continuously send heartbeat
messages every n seconds. The heartbeat server receiving these
heartbeats maintains a priority queue, which dequeues the oldest items
(lowest timestamp) first. Every 5 seconds, any item older than a
specific timestamp is dequeued and its state set to unhealthy (if it is
the first missed heartbeat) or unknown (if it is the second).

The default timeouts are:

* 30s since last heartbeat - unhealthy
* 60s since last heartbeat - unknown (node may be live but disconnected)

The next heartbeat sent by an unhealthy or unknown node will make it
healthy again and ready to receive work.

The current state of the node is added to the node info during a
Get/GetByPrefix/List call to the node info store. This means that the
liveness is dynamic and is not persisted to the kvstore for node info.
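As a minimal, self-contained illustration of the dequeue loop described
above: the `beat` and `beatQueue` names below are hypothetical, only the
oldest-first ordering and the 30s/60s thresholds come from this patch,
and the real implementation lives in pkg/node/heartbeat/server.go.

```go
package main

import (
	"container/heap"
	"fmt"
	"time"
)

// beat records when a node was last heard from.
type beat struct {
	nodeID   string
	lastSeen time.Time
}

// beatQueue is a min-heap ordered by lastSeen, so the oldest
// heartbeat (lowest timestamp) is always dequeued first.
type beatQueue []beat

func (q beatQueue) Len() int           { return len(q) }
func (q beatQueue) Less(i, j int) bool { return q[i].lastSeen.Before(q[j].lastSeen) }
func (q beatQueue) Swap(i, j int)      { q[i], q[j] = q[j], q[i] }
func (q *beatQueue) Push(x any)        { *q = append(*q, x.(beat)) }
func (q *beatQueue) Pop() any {
	old := *q
	item := old[len(old)-1]
	*q = old[:len(old)-1]
	return item
}

func main() {
	q := &beatQueue{
		{nodeID: "node-1", lastSeen: time.Now().Add(-45 * time.Second)},
		{nodeID: "node-2", lastSeen: time.Now().Add(-70 * time.Second)},
		{nodeID: "node-3", lastSeen: time.Now().Add(-5 * time.Second)},
	}
	heap.Init(q)

	// Run once per check interval: pop items whose last heartbeat is
	// older than the unhealthy (30s) or unknown (60s) thresholds.
	now := time.Now()
	for q.Len() > 0 {
		oldest := (*q)[0]
		age := now.Sub(oldest.lastSeen)
		if age < 30*time.Second {
			break // everything left is newer, so stop the sweep here
		}
		heap.Pop(q)
		state := "UNHEALTHY" // first missed heartbeat
		if age >= 60*time.Second {
			state = "UNKNOWN" // second missed heartbeat
		}
		fmt.Printf("%s -> %s (last heartbeat %s ago)\n",
			oldest.nodeID, state, age.Round(time.Second))
	}
}
```

Because the queue is ordered oldest-first, each sweep can stop at the
first entry younger than the threshold instead of scanning every node.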
--- .cspell/custom-dictionary.txt | 2 +- cmd/cli/node/columns.go | 12 +- cmd/cli/node/list.go | 31 ++- .../cli-reference/cli/node/approve/index.md | 67 ++++++ .../cli-reference/cli/node/delete/index.md | 67 ++++++ docs/docs/dev/cli-reference/cli/node/index.md | 77 ++++--- .../dev/cli-reference/cli/node/list/index.md | 25 +++ .../cli-reference/cli/node/reject/index.md | 67 ++++++ docs/docs/dev/node_management.md | 94 ++++++++ go.work.sum | 10 + pkg/compute/executor_buffer.go | 2 +- pkg/compute/management_client.go | 78 +++++-- pkg/compute/mocks.go | 69 ++++++ pkg/config/configenv/dev.go | 11 + pkg/config/configenv/local.go | 11 + pkg/config/configenv/production.go | 11 + pkg/config/configenv/staging.go | 11 + pkg/config/configenv/test.go | 11 + pkg/config/types/compute.go | 36 ++- pkg/config/types/generated_constants.go | 9 + pkg/config/types/generated_viper_defaults.go | 19 +- pkg/config/types/requester.go | 17 ++ pkg/lib/collections/hashed_priority_queue.go | 3 +- pkg/lib/collections/priority_queue.go | 10 +- pkg/lib/collections/priority_queue_test.go | 33 ++- pkg/lib/concurrency/striped_map.go | 2 - pkg/models/node_info.go | 1 + pkg/models/node_state.go | 95 ++++++++ pkg/nats/proxy/management_proxy.go | 2 +- pkg/nats/transport/nats.go | 6 +- pkg/node/compute.go | 8 +- pkg/node/config_compute.go | 19 ++ pkg/node/config_defaults.go | 18 ++ pkg/node/config_requester.go | 3 + pkg/node/heartbeat/client.go | 43 ++++ pkg/node/heartbeat/heartbeat_test.go | 154 +++++++++++++ pkg/node/heartbeat/server.go | 205 ++++++++++++++++++ pkg/node/heartbeat/types.go | 11 + pkg/node/manager/node_manager.go | 34 ++- pkg/node/node.go | 53 ++++- pkg/publicapi/apimodels/node.go | 9 +- pkg/publicapi/endpoint/orchestrator/node.go | 73 ++++--- pkg/test/compute/setup_test.go | 1 + 43 files changed, 1392 insertions(+), 128 deletions(-) create mode 100644 docs/docs/dev/cli-reference/cli/node/approve/index.md create mode 100644 docs/docs/dev/cli-reference/cli/node/delete/index.md create mode 100644 docs/docs/dev/cli-reference/cli/node/reject/index.md create mode 100644 docs/docs/dev/node_management.md create mode 100644 pkg/models/node_state.go create mode 100644 pkg/node/heartbeat/client.go create mode 100644 pkg/node/heartbeat/heartbeat_test.go create mode 100644 pkg/node/heartbeat/server.go create mode 100644 pkg/node/heartbeat/types.go diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt index 2abd486b7d..571e732cf4 100644 --- a/.cspell/custom-dictionary.txt +++ b/.cspell/custom-dictionary.txt @@ -221,7 +221,7 @@ multierror multiformats Muxed mypy -nats +NATS nbconvert nemt NOAA diff --git a/cmd/cli/node/columns.go b/cmd/cli/node/columns.go index b14d34cd88..5e14ea579d 100644 --- a/cmd/cli/node/columns.go +++ b/cmd/cli/node/columns.go @@ -25,9 +25,19 @@ var alwaysColumns = []output.TableColumn[*models.NodeInfo]{ Value: func(ni *models.NodeInfo) string { return ni.NodeType.String() }, }, { - ColumnConfig: table.ColumnConfig{Name: "status"}, + ColumnConfig: table.ColumnConfig{Name: "approval"}, Value: func(ni *models.NodeInfo) string { return ni.Approval.String() }, }, + { + ColumnConfig: table.ColumnConfig{Name: "status"}, + Value: func(ni *models.NodeInfo) string { + if ni.ComputeNodeInfo != nil { + return ni.State.String() + } + + return "" // nothing for requester nodes + }, + }, } var toggleColumns = map[string][]output.TableColumn[*models.NodeInfo]{ diff --git a/cmd/cli/node/list.go b/cmd/cli/node/list.go index a891930dcc..7a1fe44c04 100644 --- a/cmd/cli/node/list.go +++ 
b/cmd/cli/node/list.go @@ -16,15 +16,17 @@ import ( var defaultColumnGroups = []string{"labels", "capacity"} var orderByFields = []string{"id", "type", "available_cpu", "available_memory", "available_disk", "available_gpu", "status"} -var filterStatusValues = []string{"approved", "pending", "rejected"} +var filterApprovalValues = []string{"approved", "pending", "rejected"} +var filterStatusValues = []string{"connected", "disconnected"} // ListOptions is a struct to support node command type ListOptions struct { output.OutputOptions cliflags.ListOptions - ColumnGroups []string - Labels string - FilterByStatus string + ColumnGroups []string + Labels string + FilterByApproval string + FilterByStatus string } // NewListOptions returns initialized Options @@ -42,7 +44,7 @@ func NewListCmd() *cobra.Command { Use: "list", Short: "List info of network nodes. ", Args: cobra.NoArgs, - Run: o.run, + RunE: o.run, } nodeCmd.Flags().StringSliceVar(&o.ColumnGroups, "show", o.ColumnGroups, fmt.Sprintf("What column groups to show. Zero or more of: %q", maps.Keys(toggleColumns))) @@ -50,6 +52,8 @@ func NewListCmd() *cobra.Command { "Filter nodes by labels. See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ for more information.") nodeCmd.Flags().AddFlagSet(cliflags.ListFlags(&o.ListOptions)) nodeCmd.Flags().AddFlagSet(cliflags.OutputFormatFlags(&o.OutputOptions)) + nodeCmd.Flags().StringVar(&o.FilterByApproval, "filter-approval", o.FilterByApproval, + fmt.Sprintf("Filter nodes by approval. One of: %q", filterApprovalValues)) nodeCmd.Flags().StringVar(&o.FilterByStatus, "filter-status", o.FilterByStatus, fmt.Sprintf("Filter nodes by status. One of: %q", filterStatusValues)) @@ -57,7 +61,7 @@ func NewListCmd() *cobra.Command { } // Run executes node command -func (o *ListOptions) run(cmd *cobra.Command, _ []string) { +func (o *ListOptions) run(cmd *cobra.Command, _ []string) error { ctx := cmd.Context() var err error @@ -69,15 +73,22 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) { } } + if o.FilterByApproval != "" { + if !slices.Contains(filterApprovalValues, o.FilterByApproval) { + return fmt.Errorf("cannot use '%s' as filter-approval value, should be one of: %q", o.FilterByApproval, filterApprovalValues) + } + } + if o.FilterByStatus != "" { if !slices.Contains(filterStatusValues, o.FilterByStatus) { - util.Fatal(cmd, fmt.Errorf("cannot use '%s' as filter status value, should be one of: %q", o.FilterByStatus, filterStatusValues), 1) + return fmt.Errorf("cannot use '%s' as filter-status value, should be one of: %q", o.FilterByStatus, filterStatusValues) } } response, err := util.GetAPIClientV2(cmd).Nodes().List(ctx, &apimodels.ListNodesRequest{ - Labels: labelRequirements, - FilterByStatus: o.FilterByStatus, + Labels: labelRequirements, + FilterByApproval: o.FilterByApproval, + FilterByStatus: o.FilterByStatus, BaseListRequest: apimodels.BaseListRequest{ Limit: o.Limit, NextToken: o.NextToken, @@ -97,4 +108,6 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) { if err = output.Output(cmd, columns, o.OutputOptions, response.Nodes); err != nil { util.Fatal(cmd, fmt.Errorf("failed to output: %w", err), 1) } + + return nil } diff --git a/docs/docs/dev/cli-reference/cli/node/approve/index.md b/docs/docs/dev/cli-reference/cli/node/approve/index.md new file mode 100644 index 0000000000..6d818782ec --- /dev/null +++ b/docs/docs/dev/cli-reference/cli/node/approve/index.md @@ -0,0 +1,67 @@ +--- +sidebar_label: approve +--- + +# Command: `node approve` + +The 
`bacalhau node approve` command offers administrators the ability to approve a node's cluster membership using its ID.
+
+## Description:
+
+Using the `approve` sub-command under the `bacalhau node` umbrella, administrators can allow a node in the pending state to join the cluster and receive work. This feature is crucial for system administrators to manage the cluster.
+
+## Usage:
+
+```bash
+bacalhau node approve [id] [flags]
+```
+
+## Flags:
+
+- `[id]`:
+
+  - The unique identifier of the node you wish to approve.
+
+- `-h`, `--help`:
+
+  - Displays the help documentation for the `approve` command.
+
+- `-m message`:
+
+  - A message to be attached to the approval action.
+
+## Global Flags:
+
+- `--api-host string`:
+
+  - Specifies the host for client-server communication through REST. This flag is overridden if the `BACALHAU_API_HOST` environment variable is set.
+  - Default: `"bootstrap.production.bacalhau.org"`
+
+- `--api-port int`:
+
+  - Designates the port for REST-based communication between client and server. This flag is ignored if the `BACALHAU_API_PORT` environment variable is defined.
+  - Default: `1234`
+
+- `--log-mode logging-mode`:
+
+  - Determines the log format preference.
+  - Options: `'default','station','json','combined','event'`
+  - Default: `'default'`
+
+- `--repo string`:
+  - Points to the bacalhau repository's path.
+  - Default: `"$HOME/.bacalhau"`
+
+## Examples:
+
+1. Approve a Node with ID `nodeID123`:
+
+   ```bash
+   bacalhau node approve nodeID123
+   ```
+
+2. Approve a Node with an audit message:
+
+   ```bash
+   bacalhau node approve nodeID123 -m "okay"
+   ```
diff --git a/docs/docs/dev/cli-reference/cli/node/delete/index.md b/docs/docs/dev/cli-reference/cli/node/delete/index.md
new file mode 100644
index 0000000000..964f1bd86c
--- /dev/null
+++ b/docs/docs/dev/cli-reference/cli/node/delete/index.md
@@ -0,0 +1,67 @@
+---
+sidebar_label: delete
+---
+
+# Command: `node delete`
+
+The `bacalhau node delete` command offers administrators the ability to remove a node from the cluster using its ID.
+
+## Description:
+
+Using the `delete` sub-command, administrators can remove a node from the list of available compute nodes in the cluster. This feature is necessary for the management of the infrastructure.
+
+## Usage:
+
+```bash
+bacalhau node delete [id] [flags]
+```
+
+## Flags:
+
+- `[id]`:
+
+  - The unique identifier of the node you wish to delete.
+
+- `-h`, `--help`:
+
+  - Displays the help documentation for the `delete` command.
+
+- `-m message`:
+
+  - A message to be attached to the deletion action.
+
+## Global Flags:
+
+- `--api-host string`:
+
+  - Specifies the host for client-server communication through REST. This flag is overridden if the `BACALHAU_API_HOST` environment variable is set.
+  - Default: `"bootstrap.production.bacalhau.org"`
+
+- `--api-port int`:
+
+  - Designates the port for REST-based communication between client and server. This flag is ignored if the `BACALHAU_API_PORT` environment variable is defined.
+  - Default: `1234`
+
+- `--log-mode logging-mode`:
+
+  - Determines the log format preference.
+  - Options: `'default','station','json','combined','event'`
+  - Default: `'default'`
+
+- `--repo string`:
+  - Points to the bacalhau repository's path.
+  - Default: `"$HOME/.bacalhau"`
+
+## Examples:
+
+1. Delete the Node with ID `nodeID123`:
+
+   ```bash
+   bacalhau node delete nodeID123
+   ```
+
+2. 
Delete a Node with an audit message: + + ```bash + bacalhau node delete nodeID123 -m "bad actor" + ``` diff --git a/docs/docs/dev/cli-reference/cli/node/index.md b/docs/docs/dev/cli-reference/cli/node/index.md index cc8741e0c7..49eae88de8 100644 --- a/docs/docs/dev/cli-reference/cli/node/index.md +++ b/docs/docs/dev/cli-reference/cli/node/index.md @@ -12,21 +12,49 @@ bacalhau node [command] ## Available Commands +1. **[approve](./approve)**: + + - Description: Approves a single node to join the cluster. + - Usage: + + ```bash + bacalhau node approve + ``` + +1. **[delete](./delete)**: + + - Description: Deletes a node from the cluster using its ID. + - Usage: + ```bash + bacalhau node delete + ``` + 1. **[describe](./describe)**: - - Description: Retrieves detailed information of a node using its ID. - - Usage: - ```bash - bacalhau node describe - ``` - -2. **[list](./list)**: - - Description: Lists the details of all nodes present in the network. - - Usage: - ```bash - bacalhau node list - ``` + + - Description: Retrieves detailed information of a node using its ID. + - Usage: + ```bash + bacalhau node describe + ``` + +1. **[list](./list)**: + + - Description: Lists the details of all nodes present in the network. + - Usage: + ```bash + bacalhau node list + ``` + +1. **[reject](./reject)**: + +- Description: Reject a specific node's request to join the cluster. +- Usage: + ```bash + bacalhau node reject + ``` For comprehensive details on any of the sub-commands, run: + ```bash bacalhau node [command] --help ``` @@ -34,26 +62,25 @@ bacalhau node [command] --help ## Flags - `-h`, `--help`: - - Description: Shows the help information for the `node` command. + - Description: Shows the help information for the `node` command. ## Global Flags - `--api-host string`: - - Description: Specifies the host for RESTful communication between the client and server. The flag will be ignored if the `BACALHAU_API_HOST` environment variable is set. - - Default: `bootstrap.production.bacalhau.org` + + - Description: Specifies the host for RESTful communication between the client and server. The flag will be ignored if the `BACALHAU_API_HOST` environment variable is set. + - Default: `bootstrap.production.bacalhau.org` - `--api-port int`: - - Description: Designates the port for RESTful communication. The flag will be bypassed if the `BACALHAU_API_PORT` environment variable is active. - - Default: `1234` -- `--log-mode logging-mode`: - - Description: Chooses the preferred log format. Available choices are: `default`, `station`, `json`, `combined`, and `event`. - - Default: `default` + - Description: Designates the port for RESTful communication. The flag will be bypassed if the `BACALHAU_API_PORT` environment variable is active. + - Default: `1234` -- `--repo string`: - - Description: Specifies the path to the bacalhau repository. - - Default: `/Users/walid/.bacalhau` +- `--log-mode logging-mode`: ---- + - Description: Chooses the preferred log format. Available choices are: `default`, `station`, `json`, `combined`, and `event`. + - Default: `default` -This should provide an organized and structured overview of the `node` command and its functionalities! +- `--repo string`: + - Description: Specifies the path to the bacalhau repository. 
+ - Default: `/Users/walid/.bacalhau` diff --git a/docs/docs/dev/cli-reference/cli/node/list/index.md b/docs/docs/dev/cli-reference/cli/node/list/index.md index 0c31be90a7..fc07cf8f18 100644 --- a/docs/docs/dev/cli-reference/cli/node/list/index.md +++ b/docs/docs/dev/cli-reference/cli/node/list/index.md @@ -1,14 +1,17 @@ --- sidebar_label: list --- + # Command: `node list` The `bacalhau node list` command is designed to provide users with a comprehensive list of network nodes along with details based on specified flags. ## Description: + The `list` sub-command under the `bacalhau node` category enumerates information about nodes in the network. It supports various filtering, ordering, and output formatting options, allowing users to tailor the output to their needs. ## Usage: + ```bash bacalhau node list [flags] ``` @@ -16,37 +19,56 @@ bacalhau node list [flags] ## Flags: - `-h`, `--help`: + - Show the help message for the `list` command. - `--hide-header`: + - Do not display the column headers in the output. +- `--filter-approval`: + + - Only show nodes with the specified approval status. Valid values are: `approved`, `pending`, `rejected`. + +- `--filter-status`: + + - Only show nodes with the specified state. Valid values are: `healthy`, `unhealthy`, `unknown`. + - `--labels string`: + - Filter nodes based on labels. This follows the filtering format provided by Kubernetes, as shown in their documentation about labels. - `--limit uint32`: + - Restrict the number of results displayed. - `--next-token string`: + - Provide the next token for pagination. - `--no-style`: + - Output the table without any style. - `--order-by string`: + - Sort the results based on a specific field. Valid sorting fields are: `id`, `type`, `available_cpu`, `available_memory`, `available_disk`, `available_gpu`. - `--order-reversed`: + - Display the results in reverse order. - `--output format`: + - Choose the output format. Available options: `table`, `csv`, `json`, `yaml`. - Default: `table`. - `--pretty`: + - Enhance the visual appeal of the output. This is applicable only to `json` and `yaml` formats. - `--show strings`: + - Determine the column groups to be displayed. Acceptable values are: `labels`, `version`, `features`, `capacity`. - Default: `labels`, `capacity`. @@ -56,14 +78,17 @@ bacalhau node list [flags] ## Global Flags: - `--api-host string`: + - Specify the host for client-server communication via REST. This gets ignored if the `BACALHAU_API_HOST` environment variable is defined. - Default: `"bootstrap.production.bacalhau.org"`. - `--api-port int`: + - Specify the port for RESTful communication between client and server. Gets overlooked if the `BACALHAU_API_PORT` environment variable is set. - Default: `1234`. - `--log-mode logging-mode`: + - Choose the desired log format. - Options: `'default', 'station', 'json', 'combined', 'event'`. - Default: `'default'`. diff --git a/docs/docs/dev/cli-reference/cli/node/reject/index.md b/docs/docs/dev/cli-reference/cli/node/reject/index.md new file mode 100644 index 0000000000..277069d693 --- /dev/null +++ b/docs/docs/dev/cli-reference/cli/node/reject/index.md @@ -0,0 +1,67 @@ +--- +sidebar_label: reject +--- + +# Command: `node reject` + +The `bacalhau node reject` command offers administrators the ability to reject a compute node's request to join the cluster. + +## Description: + +Using the `reject` sub-command, administrators can reject a node in the pending state from joining the cluster and receiving work. 
This feature is crucial for system administrators to manage the cluster, and it stops the node from taking part in the cluster until it is approved.
+
+## Usage:
+
+```bash
+bacalhau node reject [id] [flags]
+```
+
+## Flags:
+
+- `[id]`:
+
+  - The unique identifier of the node you wish to reject.
+
+- `-h`, `--help`:
+
+  - Displays the help documentation for the `reject` command.
+
+- `-m message`:
+
+  - A message to be attached to the rejection action.
+
+## Global Flags:
+
+- `--api-host string`:
+
+  - Specifies the host for client-server communication through REST. This flag is overridden if the `BACALHAU_API_HOST` environment variable is set.
+  - Default: `"bootstrap.production.bacalhau.org"`
+
+- `--api-port int`:
+
+  - Designates the port for REST-based communication between client and server. This flag is ignored if the `BACALHAU_API_PORT` environment variable is defined.
+  - Default: `1234`
+
+- `--log-mode logging-mode`:
+
+  - Determines the log format preference.
+  - Options: `'default','station','json','combined','event'`
+  - Default: `'default'`
+
+- `--repo string`:
+  - Points to the bacalhau repository's path.
+  - Default: `"$HOME/.bacalhau"`
+
+## Examples:
+
+1. Reject a Node with ID `nodeID123`:
+
+   ```bash
+   bacalhau node reject nodeID123
+   ```
+
+2. Reject a Node with an audit message:
+
+   ```bash
+   bacalhau node reject nodeID123 -m "potentially bad"
+   ```
diff --git a/docs/docs/dev/node_management.md b/docs/docs/dev/node_management.md
new file mode 100644
index 0000000000..4e022b29bf
--- /dev/null
+++ b/docs/docs/dev/node_management.md
@@ -0,0 +1,94 @@
+# Node Management
+
+Bacalhau clusters are composed of requester nodes and compute nodes. The requester nodes are responsible for managing the compute nodes that make up the cluster. This functionality is currently only available when using NATS for the network transport.
+
+The two main areas of functionality for the requester nodes are managing the membership of compute nodes that require approval to take part in the cluster, and monitoring the health of the compute nodes. They are also responsible for collecting information provided by the compute nodes on a regular schedule.
+
+## Compute node membership
+
+As compute nodes start, they register their existence with the requester nodes. Once registered, they maintain a sentinel file to note that they are already registered, which avoids unnecessary registration attempts.
+
+Once registered, the requester node will need to approve the compute node before it can take part in the cluster. This ensures that the requester node is aware of all the compute nodes that are part of the cluster. In future, we may provide mechanisms for auto-approval of nodes joining the cluster, but currently all compute nodes registering default to the PENDING state.
+
+Listing the current nodes in the system will show requester nodes automatically APPROVED, and compute nodes in the PENDING state.
+
+```shell
+$ bacalhau node list # extra columns removed
+
+ID      TYPE       APPROVAL  STATUS
+node-0  Requester  APPROVED  UNKNOWN
+node-1  Compute    PENDING   HEALTHY
+node-2  Compute    PENDING   HEALTHY
+node-3  Compute    PENDING   HEALTHY
+```
+
+Nodes can be rejected using their node ID, optionally specifying a reason with the `-m` flag.
+
+```shell
+$ bacalhau node reject node-3 -m "malicious node?"
+Ok
+```
+
+Nodes can be approved using their node ID.
+
+```shell
+$ bacalhau node approve node-1
+Ok
+```
+
+There is currently no support for auto-eviction of nodes, but they can be
+manually removed from the cluster using the `node delete` command. Note that if
+they are manually removed, they are able to re-register, so this is
+most useful when you know the node will not be coming back.
+
+```shell
+$ bacalhau node delete node-2
+```
+
+After all of these actions, the node list looks like:
+
+```shell
+$ bacalhau node list # extra columns removed
+
+ID      TYPE       APPROVAL  STATUS
+node-0  Requester  APPROVED  UNKNOWN
+node-1  Compute    APPROVED  HEALTHY
+node-3  Compute    REJECTED  HEALTHY
+```
+
+## Compute node updates
+
+Compute nodes provide information about themselves to the requester nodes on a regular schedule. This information is used to help the requester nodes make decisions about where to schedule workloads.
+
+These updates are broken down into:
+
+- **Node Information**: This is the information about the node itself, such as the hostname, CPU architecture, and any labels associated with the node. This information is persisted to the Node Info Store.
+- **Resource Information**: This is the information about the resources available on the node, such as the amount of memory, storage and CPU available. This information is held in memory and used to make scheduling decisions. It is not persisted to disk as it is considered transient.
+- **Health Information**: This heartbeat is used to determine whether the node is still healthy; if it is not, the requester node will mark the node as unhealthy. Eventually, the node will be marked as Unknown if it does not recover. This information is held in memory and used to make scheduling decisions. Like the resource information, it is not persisted to disk as it is considered transient.
+
+Various configuration options are available to control the frequency of these updates, and the timeout for the health check. These can be set in the configuration file.
+
+For the compute node, these settings are:
+
+- **Node Information**: `InfoUpdateFrequency` - The interval between updates of the node information.
+
+- **Resource Information**: `ResourceUpdateFrequency` - The interval between updates of the resource information.
+
+- **Heartbeat**: `HeartbeatFrequency` - The interval between heartbeats sent by the compute node.
+
+- **Heartbeat**: `HeartbeatTopic` - The name of the pubsub topic via which heartbeat messages are sent.
+
+For the requester node, these settings are:
+
+- **Heartbeat** `HeartbeatFrequency` - How often the heartbeat server will check the priority queue of node heartbeats.
+
+- **Heartbeat** `HeartbeatTopic` - The name of the pubsub topic via which heartbeat messages are sent. This should be the same as the compute node value.
+
+- **Node health** `NodeDisconnectedAfter` - The interval after which the node will be considered disconnected if a heartbeat has not been received.
+
+## Cluster membership events
+
+As compute nodes are added and removed from the cluster, the requester nodes will emit events to the NATS PubSub system. These events can be consumed by other systems to react to changes in the cluster membership. 
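+A minimal sketch of such a consumer, using the NATS Go client
+(`github.com/nats-io/nats.go`), might look like the following. The subject
+name `node.events` is only an assumption for illustration; consult the
+requester configuration for the topic actually in use.
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/nats-io/nats.go"
+)
+
+func main() {
+	// Connect to the NATS server used by the cluster.
+	nc, err := nats.Connect(nats.DefaultURL)
+	if err != nil {
+		panic(err)
+	}
+	defer nc.Drain()
+
+	// React to each membership event as it is published.
+	_, err = nc.Subscribe("node.events", func(m *nats.Msg) {
+		fmt.Printf("membership event: %s\n", string(m.Data))
+	})
+	if err != nil {
+		panic(err)
+	}
+
+	select {} // block; a real consumer would wire up graceful shutdown
+}
+```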
+ +``` + +``` diff --git a/go.work.sum b/go.work.sum index 7e63379d9a..eee5b88d45 100644 --- a/go.work.sum +++ b/go.work.sum @@ -1478,6 +1478,7 @@ github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4er github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/lint v0.0.0-20180702182130-06c8688daad7 h1:2hRPrmiwPrp3fQX967rNJIhQPtiGXdlQWAxKbKw3VHA= github.com/golang/mock v1.5.0/go.mod h1:CWnOUgYIOo4TcNZ0wHX3YZCqsaM1I1Jvs6v3mP3KVu8= +github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/golang/protobuf v1.3.0/go.mod h1:Qd/q+1AKNOZr9uGQzbzCmRO6sUih6GTPZv6a1/R87v0= github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM= github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= @@ -1590,6 +1591,7 @@ github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= github.com/hannahhoward/cbor-gen-for v0.0.0-20200817222906-ea96cece81f1 h1:F9k+7wv5OIk1zcq23QpdiL0hfDuXPjuOmMNaC6fgQ0Q= github.com/hannahhoward/cbor-gen-for v0.0.0-20200817222906-ea96cece81f1/go.mod h1:jvfsLIxk0fY/2BKSQ1xf2406AKA5dwMmKKv0ADcOfN8= +github.com/hannahhoward/go-pubsub v0.0.0-20200423002714-8d62886cc36e/go.mod h1:I8h3MITA53gN9OnWGCgaMa0JWVRdXthWw4M3CPM54OY= github.com/hashicorp/consul/api v1.3.0/go.mod h1:MmDNSzIMUjNpY/mQ398R4bk2FnqQLoPndWW5VkKPlCE= github.com/hashicorp/consul/api v1.18.0 h1:R7PPNzTCeN6VuQNDwwhZWJvzCtGSrNpJqfb22h3yH9g= github.com/hashicorp/consul/api v1.20.0 h1:9IHTjNVSZ7MIwjlW3N3a7iGiykCMDpxZu8jsxFJh0yc= @@ -1683,6 +1685,7 @@ github.com/ipfs/go-datastore v0.4.1/go.mod h1:SX/xMIKoCszPqp+z9JhPYCmoOoXTvaa13X github.com/ipfs/go-datastore v0.4.4/go.mod h1:SX/xMIKoCszPqp+z9JhPYCmoOoXTvaa13XEbGtsFUhA= github.com/ipfs/go-datastore v0.4.5/go.mod h1:eXTcaaiN6uOlVCLS9GjJUJtlvJfM3xk23w3fyfrmmJs= github.com/ipfs/go-datastore v0.5.1/go.mod h1:9zhEApYMTl17C8YDp7JmU7sQZi2/wqiYh73hakZ90Bk= +github.com/ipfs/go-delegated-routing v0.8.0/go.mod h1:18Dds6ZoNTsff9S/7R49Nh2t2YNXIIKR/RLQmBZdjjY= github.com/ipfs/go-ds-badger v0.0.2/go.mod h1:Y3QpeSFWQf6MopLTiZD+VT6IC1yZqaGmjvRcKeSGij8= github.com/ipfs/go-ds-badger v0.0.5/go.mod h1:g5AuuCGmr7efyzQhLL8MzwqcauPojGPUaHzfGTzuE3s= github.com/ipfs/go-ds-badger v0.2.1/go.mod h1:Tx7l3aTph3FMFrRS838dcSJh+jjA7cX9DrGVwx/NOwE= @@ -1693,6 +1696,7 @@ github.com/ipfs/go-ds-leveldb v0.4.2/go.mod h1:jpbku/YqBSsBc1qgME8BkWS4AxzF2cEu1 github.com/ipfs/go-fetcher v1.5.0/go.mod h1:5pDZ0393oRF/fHiLmtFZtpMNBQfHOYNPtryWedVuSWE= github.com/ipfs/go-fetcher v1.6.1 h1:UFuRVYX5AIllTiRhi5uK/iZkfhSpBCGX7L70nSZEmK8= github.com/ipfs/go-fetcher v1.6.1/go.mod h1:27d/xMV8bodjVs9pugh/RCjjK2OZ68UgAMspMdingNo= +github.com/ipfs/go-graphsync v0.14.4/go.mod h1:yT0AfjFgicOoWdAlUJ96tQ5AkuGI4r1taIQX/aHbBQo= github.com/ipfs/go-hamt-ipld v0.1.1 h1:0IQdvwnAAUKmDE+PMJa5y1QiwOPHpI9+eAbQEEEYthk= github.com/ipfs/go-ipfs-blockstore v0.1.4/go.mod h1:Jxm3XMVjh6R17WvxFEiyKBLUGr86HgIYJW/D/MwqeYQ= github.com/ipfs/go-ipfs-blockstore v0.2.1/go.mod h1:jGesd8EtCM3/zPgx+qr0/feTXGUeRai6adgwC+Q+JvE= @@ -1764,6 +1768,7 @@ github.com/ipfs/iptb v1.4.0/go.mod h1:1rzHpCYtNp87/+hTxG5TfCVn/yMY3dKnLn8tBiMfdm github.com/ipfs/iptb-plugins v0.5.0/go.mod h1:/6crDf3s58T70BhZ+m9SyyKpK7VvSDS2Ny4kafxXDp4= github.com/ipfs/kubo v0.16.0/go.mod h1:mkYGiXL+oi2TCkbUwaDSiOuqvLORuUfnDadIL6GWQIQ= github.com/ipfs/tar-utils v0.0.2 
h1:UNgHB4x/PPzbMkmJi+7EqC9LNMPDztOVSnx1HAqSNg4= +github.com/ipld/edelweiss v0.2.0/go.mod h1:FJAzJRCep4iI8FOFlRriN9n0b7OuX3T/S9++NpBDmA4= github.com/ipld/go-car/v2 v2.1.1/go.mod h1:+2Yvf0Z3wzkv7NeI69i8tuZ+ft7jyjPYIWZzeVNeFcI= github.com/ipld/go-car/v2 v2.5.1/go.mod h1:jKjGOqoCj5zn6KjnabD6JbnCsMntqU2hLiU6baZVO3E= github.com/ipld/go-car/v2 v2.8.0/go.mod h1:a+BnAxUqgr7wcWxW/lI6ctyEQ2v9gjBChPytwFMp2f4= @@ -2066,6 +2071,7 @@ github.com/libp2p/go-mplex v0.1.1/go.mod h1:Xgz2RDCi3co0LeZfgjm4OgUF15+sVR8SRcu3 github.com/libp2p/go-mplex v0.1.2/go.mod h1:Xgz2RDCi3co0LeZfgjm4OgUF15+sVR8SRcu3SFXI1lk= github.com/libp2p/go-mplex v0.2.0/go.mod h1:0Oy/A9PQlwBytDRp4wSkFnzHYDKcpLot35JQ6msjvYQ= github.com/libp2p/go-mplex v0.3.0/go.mod h1:0Oy/A9PQlwBytDRp4wSkFnzHYDKcpLot35JQ6msjvYQ= +github.com/libp2p/go-mplex v0.7.0/go.mod h1:rW8ThnRcYWft/Jb2jeORBmPd6xuG3dGxWN/W168L9EU= github.com/libp2p/go-msgio v0.0.2/go.mod h1:63lBBgOTDKQL6EWazRMCwXsEeEeK9O2Cd+0+6OOuipQ= github.com/libp2p/go-msgio v0.0.6/go.mod h1:4ecVB6d9f4BDSL5fqvPiC4A3KivjWn+Venn/1ALLMWA= github.com/libp2p/go-msgio v0.2.0/go.mod h1:dBVM1gW3Jk9XqHkU4eKdGvVHdLa51hoGfll6jMJMSlY= @@ -2183,6 +2189,7 @@ github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzp github.com/mattn/go-shellwords v1.0.12 h1:M2zGm7EW6UQJvDeQxo4T51eKPurbeFbe8WtebGE2xrk= github.com/mattn/go-sqlite3 v1.14.16 h1:yOQRA0RpS5PFz/oikGwBEqvAWhWg5ufRz4ETLjwpU1Y= github.com/mattn/go-sqlite3 v1.14.16/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= github.com/maxbrunsfeld/counterfeiter/v6 v6.2.2 h1:g+4J5sZg6osfvEfkRZxJ1em0VT95/UOZgi/l7zi1/oE= github.com/microcosm-cc/bluemonday v1.0.1 h1:SIYunPjnlXcW+gVfvm0IlSeR5U3WZUOLfVmqg85Go44= @@ -2425,6 +2432,7 @@ github.com/quic-go/qtls-go1-18 v0.2.0/go.mod h1:moGulGHK7o6O8lSPSZNoOwcLvJKJ85vV github.com/quic-go/qtls-go1-19 v0.2.0/go.mod h1:ySOI96ew8lnoKPtSqx2BlI5wCpUVPT05RMAlajtnyOI= github.com/quic-go/qtls-go1-19 v0.2.1/go.mod h1:ySOI96ew8lnoKPtSqx2BlI5wCpUVPT05RMAlajtnyOI= github.com/quic-go/qtls-go1-19 v0.3.2/go.mod h1:ySOI96ew8lnoKPtSqx2BlI5wCpUVPT05RMAlajtnyOI= +github.com/quic-go/qtls-go1-19 v0.3.3/go.mod h1:ySOI96ew8lnoKPtSqx2BlI5wCpUVPT05RMAlajtnyOI= github.com/quic-go/qtls-go1-20 v0.1.0/go.mod h1:JKtK6mjbAVcUTN/9jZpvLbGxvdWIKS8uT7EiStoU1SM= github.com/quic-go/qtls-go1-20 v0.1.1/go.mod h1:JKtK6mjbAVcUTN/9jZpvLbGxvdWIKS8uT7EiStoU1SM= github.com/quic-go/qtls-go1-20 v0.2.2/go.mod h1:JKtK6mjbAVcUTN/9jZpvLbGxvdWIKS8uT7EiStoU1SM= @@ -2670,6 +2678,7 @@ go.opentelemetry.io/otel v1.18.0/go.mod h1:9lWqYO0Db579XzVuCKFNPDl4s73Voa+zEck3w go.opentelemetry.io/otel v1.19.0/go.mod h1:i0QyjOq3UPoTzff0PJB2N66fb4S0+rSbSB15/oyH9fY= go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= go.opentelemetry.io/otel v1.22.0/go.mod h1:eoV4iAi3Ea8LkAEI9+GFT44O6T/D0GWAVFyZVCC6pMI= +go.opentelemetry.io/otel/exporters/jaeger v1.14.0/go.mod h1:4Ay9kk5vELRrbg5z4cpP9EtmQRFap2Wb0woPG4lujZA= go.opentelemetry.io/otel/exporters/otlp v0.20.0 h1:PTNgq9MRmQqqJY0REVbZFvwkYOA85vbdQU/nVfxDyqg= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.7.0/go.mod h1:M1hVZHNxcbkAlcvrOMlpQ4YOO3Awf+4N2dxkZL3xm04= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.13.0/go.mod h1:rqbht/LlhVBgn5+k3M5QK96K5Xb0DvXpMJ5SFQpY6uw= @@ -3076,6 +3085,7 @@ modernc.org/token v1.0.0 
h1:a0jaWiNMDhDUtqOj09wvjWWAqd3q7WpBulmL9H2egsk= modernc.org/token v1.0.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= modernc.org/zappy v1.0.0 h1:dPVaP+3ueIUv4guk8PuZ2wiUGcJ1WUVvIheeSSTD0yk= modernc.org/zappy v1.0.0/go.mod h1:hHe+oGahLVII/aTTyWK/b53VDHMAGCBYYeZ9sn83HC4= +nhooyr.io/websocket v1.8.7/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= oras.land/oras-go/v2 v2.3.1 h1:lUC6q8RkeRReANEERLfH86iwGn55lbSWP20egdFHVec= oras.land/oras-go/v2 v2.3.1/go.mod h1:5AQXVEu1X/FKp1F9DMOb5ZItZBOa0y5dha0yCm4NR9c= rsc.io/binaryregexp v0.2.0 h1:HfqmD5MEmC0zvwBuF187nq9mdnXjXsSivRiXN7SmRkE= diff --git a/pkg/compute/executor_buffer.go b/pkg/compute/executor_buffer.go index c089df54bd..c129a1a401 100644 --- a/pkg/compute/executor_buffer.go +++ b/pkg/compute/executor_buffer.go @@ -120,7 +120,7 @@ func (s *ExecutorBuffer) Run(ctx context.Context, localExecutionState store.Loca execution.AllocateResources(execution.Job.Task().Name, *added) } - s.queuedTasks.Enqueue(newBufferTask(localExecutionState), execution.Job.Priority) + s.queuedTasks.Enqueue(newBufferTask(localExecutionState), int64(execution.Job.Priority)) s.deque() return err } diff --git a/pkg/compute/management_client.go b/pkg/compute/management_client.go index 31b0726ed9..c0c9946cd0 100644 --- a/pkg/compute/management_client.go +++ b/pkg/compute/management_client.go @@ -9,13 +9,10 @@ import ( "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/compute/capacity" + "github.com/bacalhau-project/bacalhau/pkg/config/types" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/models/requests" -) - -const ( - infoUpdateFrequencyMinutes = 5 - resourceUpdateFrequencySeconds = 30 + "github.com/bacalhau-project/bacalhau/pkg/node/heartbeat" ) type ManagementClientParams struct { @@ -23,8 +20,10 @@ type ManagementClientParams struct { LabelsProvider models.LabelsProvider ManagementProxy ManagementEndpoint NodeInfoDecorator models.NodeInfoDecorator - RegistrationFilePath string ResourceTracker capacity.Tracker + RegistrationFilePath string + HeartbeatClient *heartbeat.HeartbeatClient + ControlPlaneSettings types.ComputeControlPlaneConfig } // ManagementClient is used to call management functions with @@ -32,24 +31,28 @@ type ManagementClientParams struct { // it will periodically send an update to the requester node with // the latest node info for this node. 
type ManagementClient struct { - closeChannel chan struct{} + done chan struct{} labelsProvider models.LabelsProvider managementProxy ManagementEndpoint nodeID string nodeInfoDecorator models.NodeInfoDecorator - registrationFile *RegistrationFile resourceTracker capacity.Tracker + registrationFile *RegistrationFile + heartbeatClient *heartbeat.HeartbeatClient + settings types.ComputeControlPlaneConfig } -func NewManagementClient(params ManagementClientParams) *ManagementClient { +func NewManagementClient(params *ManagementClientParams) *ManagementClient { return &ManagementClient{ - closeChannel: make(chan struct{}, 1), + done: make(chan struct{}, 1), labelsProvider: params.LabelsProvider, managementProxy: params.ManagementProxy, nodeID: params.NodeID, nodeInfoDecorator: params.NodeInfoDecorator, registrationFile: NewRegistrationFile(params.RegistrationFilePath), resourceTracker: params.ResourceTracker, + heartbeatClient: params.HeartbeatClient, + settings: params.ControlPlaneSettings, } } @@ -106,6 +109,7 @@ func (m *ManagementClient) deliverInfo(ctx context.Context) { }) if err != nil { log.Ctx(ctx).Error().Err(err).Msg("failed to send update info to requester node") + return } if response.Accepted { @@ -124,36 +128,62 @@ func (m *ManagementClient) updateResources(ctx context.Context) { Resources: resources, }) if err != nil { - log.Ctx(ctx).Error().Err(err).Msg("failed to send resource update to requester node") + log.Ctx(ctx).Warn().Err(err).Msg("failed to send resource update to requester node") } } -func (m *ManagementClient) Start(ctx context.Context) { - infoTicker := time.NewTicker(infoUpdateFrequencyMinutes * time.Minute) - resourceTicker := time.NewTicker(resourceUpdateFrequencySeconds * time.Second) +func (m *ManagementClient) heartbeat(ctx context.Context, seq uint64) { + if err := m.heartbeatClient.SendHeartbeat(ctx, seq); err != nil { + log.Ctx(ctx).Error().Err(err).Msgf("heartbeat failed sending sequence %d", seq) + } +} - loop := true - for loop { +func (m *ManagementClient) Start(ctx context.Context) { + infoTicker := time.NewTicker(m.settings.InfoUpdateFrequency.AsTimeDuration()) + resourceTicker := time.NewTicker(m.settings.ResourceUpdateFrequency.AsTimeDuration()) + + // The heartbeat ticker will be used to send heartbeats to the requester node and + // should be configured to be about half of the heartbeat frequency of the requester node + // to ensure that the requester node does not consider this node as dead. If the server + // heartbeat frequency is 30 seconds, the client heartbeat frequency should be configured to + // fire more than once in that 30 seconds. 
+ heartbeatTicker := time.NewTicker(m.settings.HeartbeatFrequency.AsTimeDuration()) + + defer func() { + heartbeatTicker.Stop() + resourceTicker.Stop() + infoTicker.Stop() + + // Close the heartbeat client and it's resources + m.heartbeatClient.Close(ctx) + }() + + // Send an initial heartbeat when we start up + var heartbeatSequence uint64 = 1 + m.heartbeat(ctx, heartbeatSequence) + + for { select { case <-ctx.Done(): - loop = false - case <-m.closeChannel: - loop = false + return + case <-m.done: + return case <-infoTicker.C: // Send the latest node info to the requester node m.deliverInfo(ctx) case <-resourceTicker.C: // Send the latest resource info m.updateResources(ctx) + case <-heartbeatTicker.C: + // Send a heartbeat to the requester node + heartbeatSequence += 1 + m.heartbeat(ctx, heartbeatSequence) } } - - resourceTicker.Stop() - infoTicker.Stop() } func (m *ManagementClient) Stop() { - if m.closeChannel != nil { - m.closeChannel <- struct{}{} + if m.done != nil { + m.done <- struct{}{} } } diff --git a/pkg/compute/mocks.go b/pkg/compute/mocks.go index 8abfba525d..3f2c662ee2 100644 --- a/pkg/compute/mocks.go +++ b/pkg/compute/mocks.go @@ -16,6 +16,7 @@ import ( store "github.com/bacalhau-project/bacalhau/pkg/compute/store" concurrency "github.com/bacalhau-project/bacalhau/pkg/lib/concurrency" models "github.com/bacalhau-project/bacalhau/pkg/models" + requests "github.com/bacalhau-project/bacalhau/pkg/models/requests" gomock "go.uber.org/mock/gomock" ) @@ -238,3 +239,71 @@ func (mr *MockCallbackMockRecorder) OnRunComplete(ctx, result any) *gomock.Call mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnRunComplete", reflect.TypeOf((*MockCallback)(nil).OnRunComplete), ctx, result) } + +// MockManagementEndpoint is a mock of ManagementEndpoint interface. +type MockManagementEndpoint struct { + ctrl *gomock.Controller + recorder *MockManagementEndpointMockRecorder +} + +// MockManagementEndpointMockRecorder is the mock recorder for MockManagementEndpoint. +type MockManagementEndpointMockRecorder struct { + mock *MockManagementEndpoint +} + +// NewMockManagementEndpoint creates a new mock instance. +func NewMockManagementEndpoint(ctrl *gomock.Controller) *MockManagementEndpoint { + mock := &MockManagementEndpoint{ctrl: ctrl} + mock.recorder = &MockManagementEndpointMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockManagementEndpoint) EXPECT() *MockManagementEndpointMockRecorder { + return m.recorder +} + +// Register mocks base method. +func (m *MockManagementEndpoint) Register(arg0 context.Context, arg1 requests.RegisterRequest) (*requests.RegisterResponse, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Register", arg0, arg1) + ret0, _ := ret[0].(*requests.RegisterResponse) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Register indicates an expected call of Register. +func (mr *MockManagementEndpointMockRecorder) Register(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Register", reflect.TypeOf((*MockManagementEndpoint)(nil).Register), arg0, arg1) +} + +// UpdateInfo mocks base method. 
+func (m *MockManagementEndpoint) UpdateInfo(arg0 context.Context, arg1 requests.UpdateInfoRequest) (*requests.UpdateInfoResponse, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateInfo", arg0, arg1) + ret0, _ := ret[0].(*requests.UpdateInfoResponse) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// UpdateInfo indicates an expected call of UpdateInfo. +func (mr *MockManagementEndpointMockRecorder) UpdateInfo(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateInfo", reflect.TypeOf((*MockManagementEndpoint)(nil).UpdateInfo), arg0, arg1) +} + +// UpdateResources mocks base method. +func (m *MockManagementEndpoint) UpdateResources(arg0 context.Context, arg1 requests.UpdateResourcesRequest) (*requests.UpdateResourcesResponse, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateResources", arg0, arg1) + ret0, _ := ret[0].(*requests.UpdateResourcesResponse) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// UpdateResources indicates an expected call of UpdateResources. +func (mr *MockManagementEndpointMockRecorder) UpdateResources(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateResources", reflect.TypeOf((*MockManagementEndpoint)(nil).UpdateResources), arg0, arg1) +} diff --git a/pkg/config/configenv/dev.go b/pkg/config/configenv/dev.go index 83fa4eb506..723d6ded4e 100644 --- a/pkg/config/configenv/dev.go +++ b/pkg/config/configenv/dev.go @@ -151,6 +151,12 @@ var DevelopmentComputeConfig = types.ComputeConfig{ Address: "127.0.0.1", Port: 6001, }, + ControlPlaneSettings: types.ComputeControlPlaneConfig{ + InfoUpdateFrequency: types.Duration(60 * time.Second), + ResourceUpdateFrequency: types.Duration(30 * time.Second), + HeartbeatFrequency: types.Duration(15 * time.Second), + HeartbeatTopic: "heartbeat", + }, } var DevelopmentRequesterConfig = types.RequesterConfig{ @@ -192,4 +198,9 @@ var DevelopmentRequesterConfig = types.RequesterConfig{ PreSignedURLExpiration: types.Duration(30 * time.Minute), }, }, + ControlPlaneSettings: types.RequesterControlPlaneConfig{ + HeartbeatCheckFrequency: types.Duration(30 * time.Second), + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), + }, } diff --git a/pkg/config/configenv/local.go b/pkg/config/configenv/local.go index 47edee396d..3846b98814 100644 --- a/pkg/config/configenv/local.go +++ b/pkg/config/configenv/local.go @@ -142,6 +142,12 @@ var LocalComputeConfig = types.ComputeConfig{ Address: "127.0.0.1", Port: 6001, }, + ControlPlaneSettings: types.ComputeControlPlaneConfig{ + InfoUpdateFrequency: types.Duration(60 * time.Second), + ResourceUpdateFrequency: types.Duration(30 * time.Second), + HeartbeatFrequency: types.Duration(15 * time.Second), + HeartbeatTopic: "heartbeat", + }, } var LocalRequesterConfig = types.RequesterConfig{ @@ -183,4 +189,9 @@ var LocalRequesterConfig = types.RequesterConfig{ PreSignedURLExpiration: types.Duration(30 * time.Minute), }, }, + ControlPlaneSettings: types.RequesterControlPlaneConfig{ + HeartbeatCheckFrequency: types.Duration(30 * time.Second), + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), + }, } diff --git a/pkg/config/configenv/production.go b/pkg/config/configenv/production.go index e2f39f026c..c3dbc25efb 100644 --- a/pkg/config/configenv/production.go +++ b/pkg/config/configenv/production.go @@ -159,6 +159,12 @@ var ProductionComputeConfig = types.ComputeConfig{ 
Address: "public", Port: 6001, }, + ControlPlaneSettings: types.ComputeControlPlaneConfig{ + InfoUpdateFrequency: types.Duration(60 * time.Second), + ResourceUpdateFrequency: types.Duration(30 * time.Second), + HeartbeatFrequency: types.Duration(15 * time.Second), + HeartbeatTopic: "heartbeat", + }, } var ProductionRequesterConfig = types.RequesterConfig{ @@ -200,4 +206,9 @@ var ProductionRequesterConfig = types.RequesterConfig{ PreSignedURLExpiration: types.Duration(30 * time.Minute), }, }, + ControlPlaneSettings: types.RequesterControlPlaneConfig{ + HeartbeatCheckFrequency: types.Duration(30 * time.Second), + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), + }, } diff --git a/pkg/config/configenv/staging.go b/pkg/config/configenv/staging.go index 6fc7839a9d..c2d2e03282 100644 --- a/pkg/config/configenv/staging.go +++ b/pkg/config/configenv/staging.go @@ -157,6 +157,12 @@ var StagingComputeConfig = types.ComputeConfig{ Address: "public", Port: 6001, }, + ControlPlaneSettings: types.ComputeControlPlaneConfig{ + InfoUpdateFrequency: types.Duration(60 * time.Second), + ResourceUpdateFrequency: types.Duration(30 * time.Second), + HeartbeatFrequency: types.Duration(15 * time.Second), + HeartbeatTopic: "heartbeat", + }, } var StagingRequesterConfig = types.RequesterConfig{ @@ -198,4 +204,9 @@ var StagingRequesterConfig = types.RequesterConfig{ PreSignedURLExpiration: types.Duration(30 * time.Minute), }, }, + ControlPlaneSettings: types.RequesterControlPlaneConfig{ + HeartbeatCheckFrequency: types.Duration(30 * time.Second), + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), + }, } diff --git a/pkg/config/configenv/test.go b/pkg/config/configenv/test.go index dfbc05d2a6..67fde0a196 100644 --- a/pkg/config/configenv/test.go +++ b/pkg/config/configenv/test.go @@ -146,6 +146,12 @@ var TestingComputeConfig = types.ComputeConfig{ Address: "private", Port: 6001, }, + ControlPlaneSettings: types.ComputeControlPlaneConfig{ + InfoUpdateFrequency: types.Duration(60 * time.Second), + ResourceUpdateFrequency: types.Duration(30 * time.Second), + HeartbeatFrequency: types.Duration(15 * time.Second), + HeartbeatTopic: "heartbeat", + }, } var TestingRequesterConfig = types.RequesterConfig{ @@ -187,4 +193,9 @@ var TestingRequesterConfig = types.RequesterConfig{ PreSignedURLExpiration: types.Duration(30 * time.Minute), }, }, + ControlPlaneSettings: types.RequesterControlPlaneConfig{ + HeartbeatCheckFrequency: types.Duration(30 * time.Second), + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), + }, } diff --git a/pkg/config/types/compute.go b/pkg/config/types/compute.go index fbc2423218..5e6dde5603 100644 --- a/pkg/config/types/compute.go +++ b/pkg/config/types/compute.go @@ -6,15 +6,16 @@ import ( ) type ComputeConfig struct { - Capacity CapacityConfig `yaml:"Capacity"` - ExecutionStore JobStoreConfig `yaml:"ExecutionStore"` - JobTimeouts JobTimeoutConfig `yaml:"JobTimeouts"` - JobSelection model.JobSelectionPolicy `yaml:"JobSelection"` - Queue QueueConfig `yaml:"Queue"` - Logging LoggingConfig `yaml:"Logging"` - ManifestCache DockerCacheConfig `yaml:"ManifestCache"` - LogStreamConfig LogStreamConfig `yaml:"LogStream"` - LocalPublisher LocalPublisherConfig `yaml:"LocalPublisher"` + Capacity CapacityConfig `yaml:"Capacity"` + ExecutionStore JobStoreConfig `yaml:"ExecutionStore"` + JobTimeouts JobTimeoutConfig `yaml:"JobTimeouts"` + JobSelection model.JobSelectionPolicy `yaml:"JobSelection"` + Queue 
QueueConfig `yaml:"Queue"` + Logging LoggingConfig `yaml:"Logging"` + ManifestCache DockerCacheConfig `yaml:"ManifestCache"` + LogStreamConfig LogStreamConfig `yaml:"LogStream"` + LocalPublisher LocalPublisherConfig `yaml:"LocalPublisher"` + ControlPlaneSettings ComputeControlPlaneConfig `yaml:"ClusterTimeouts"` } type CapacityConfig struct { @@ -62,3 +63,20 @@ type LocalPublisherConfig struct { Port int `yaml:"Port"` Directory string `yaml:"Directory"` } + +type ComputeControlPlaneConfig struct { + // The frequency with which the compute node will send node info (inc current labels) + // to the controlling requester node. + InfoUpdateFrequency Duration `yaml:"InfoUpdateFrequency"` + + // How often the compute node will send current resource availability to the requester node. + ResourceUpdateFrequency Duration `yaml:"ResourceUpdateFrequency"` + + // How often the compute node will send a heartbeat to the requester node to let it know + // that the compute node is still alive. This should be less than the requester's configured + // heartbeat timeout to avoid flapping. + HeartbeatFrequency Duration `yaml:"HeartbeatFrequency"` + + // This is the pubsub topic that the compute node will use to send heartbeats to the requester node. + HeartbeatTopic string `yaml:"HeartbeatTopic"` +} diff --git a/pkg/config/types/generated_constants.go b/pkg/config/types/generated_constants.go index 29c31a9d99..b0f72d6fe6 100644 --- a/pkg/config/types/generated_constants.go +++ b/pkg/config/types/generated_constants.go @@ -95,6 +95,11 @@ const NodeComputeLocalPublisher = "Node.Compute.LocalPublisher" const NodeComputeLocalPublisherAddress = "Node.Compute.LocalPublisher.Address" const NodeComputeLocalPublisherPort = "Node.Compute.LocalPublisher.Port" const NodeComputeLocalPublisherDirectory = "Node.Compute.LocalPublisher.Directory" +const NodeComputeControlPlaneSettings = "Node.Compute.ControlPlaneSettings" +const NodeComputeControlPlaneSettingsInfoUpdateFrequency = "Node.Compute.ControlPlaneSettings.InfoUpdateFrequency" +const NodeComputeControlPlaneSettingsResourceUpdateFrequency = "Node.Compute.ControlPlaneSettings.ResourceUpdateFrequency" +const NodeComputeControlPlaneSettingsHeartbeatFrequency = "Node.Compute.ControlPlaneSettings.HeartbeatFrequency" +const NodeComputeControlPlaneSettingsHeartbeatTopic = "Node.Compute.ControlPlaneSettings.HeartbeatTopic" const NodeRequester = "Node.Requester" const NodeRequesterJobDefaults = "Node.Requester.JobDefaults" const NodeRequesterJobDefaultsExecutionTimeout = "Node.Requester.JobDefaults.ExecutionTimeout" @@ -133,6 +138,10 @@ const NodeRequesterTagCacheSize = "Node.Requester.TagCache.Size" const NodeRequesterTagCacheDuration = "Node.Requester.TagCache.Duration" const NodeRequesterTagCacheFrequency = "Node.Requester.TagCache.Frequency" const NodeRequesterDefaultPublisher = "Node.Requester.DefaultPublisher" +const NodeRequesterControlPlaneSettings = "Node.Requester.ControlPlaneSettings" +const NodeRequesterControlPlaneSettingsHeartbeatCheckFrequency = "Node.Requester.ControlPlaneSettings.HeartbeatCheckFrequency" +const NodeRequesterControlPlaneSettingsHeartbeatTopic = "Node.Requester.ControlPlaneSettings.HeartbeatTopic" +const NodeRequesterControlPlaneSettingsNodeDisconnectedAfter = "Node.Requester.ControlPlaneSettings.NodeDisconnectedAfter" const NodeBootstrapAddresses = "Node.BootstrapAddresses" const NodeDownloadURLRequestRetries = "Node.DownloadURLRequestRetries" const NodeDownloadURLRequestTimeout = "Node.DownloadURLRequestTimeout" diff --git 
a/pkg/config/types/generated_viper_defaults.go b/pkg/config/types/generated_viper_defaults.go index b2e362338a..503992dcc2 100644 --- a/pkg/config/types/generated_viper_defaults.go +++ b/pkg/config/types/generated_viper_defaults.go @@ -1,4 +1,3 @@ - // CODE GENERATED BY pkg/config/types/gen_viper DO NOT EDIT package types @@ -118,6 +117,11 @@ func SetDefaults(cfg BacalhauConfig, opts ...SetOption) { p.Viper.SetDefault(NodeComputeLocalPublisherAddress, cfg.Node.Compute.LocalPublisher.Address) p.Viper.SetDefault(NodeComputeLocalPublisherPort, cfg.Node.Compute.LocalPublisher.Port) p.Viper.SetDefault(NodeComputeLocalPublisherDirectory, cfg.Node.Compute.LocalPublisher.Directory) + p.Viper.SetDefault(NodeComputeControlPlaneSettings, cfg.Node.Compute.ControlPlaneSettings) + p.Viper.SetDefault(NodeComputeControlPlaneSettingsInfoUpdateFrequency, cfg.Node.Compute.ControlPlaneSettings.InfoUpdateFrequency.AsTimeDuration()) + p.Viper.SetDefault(NodeComputeControlPlaneSettingsResourceUpdateFrequency, cfg.Node.Compute.ControlPlaneSettings.ResourceUpdateFrequency.AsTimeDuration()) + p.Viper.SetDefault(NodeComputeControlPlaneSettingsHeartbeatFrequency, cfg.Node.Compute.ControlPlaneSettings.HeartbeatFrequency.AsTimeDuration()) + p.Viper.SetDefault(NodeComputeControlPlaneSettingsHeartbeatTopic, cfg.Node.Compute.ControlPlaneSettings.HeartbeatTopic) p.Viper.SetDefault(NodeRequester, cfg.Node.Requester) p.Viper.SetDefault(NodeRequesterJobDefaults, cfg.Node.Requester.JobDefaults) p.Viper.SetDefault(NodeRequesterJobDefaultsExecutionTimeout, cfg.Node.Requester.JobDefaults.ExecutionTimeout.AsTimeDuration()) @@ -156,6 +160,10 @@ func SetDefaults(cfg BacalhauConfig, opts ...SetOption) { p.Viper.SetDefault(NodeRequesterTagCacheDuration, cfg.Node.Requester.TagCache.Duration.AsTimeDuration()) p.Viper.SetDefault(NodeRequesterTagCacheFrequency, cfg.Node.Requester.TagCache.Frequency.AsTimeDuration()) p.Viper.SetDefault(NodeRequesterDefaultPublisher, cfg.Node.Requester.DefaultPublisher) + p.Viper.SetDefault(NodeRequesterControlPlaneSettings, cfg.Node.Requester.ControlPlaneSettings) + p.Viper.SetDefault(NodeRequesterControlPlaneSettingsHeartbeatCheckFrequency, cfg.Node.Requester.ControlPlaneSettings.HeartbeatCheckFrequency.AsTimeDuration()) + p.Viper.SetDefault(NodeRequesterControlPlaneSettingsHeartbeatTopic, cfg.Node.Requester.ControlPlaneSettings.HeartbeatTopic) + p.Viper.SetDefault(NodeRequesterControlPlaneSettingsNodeDisconnectedAfter, cfg.Node.Requester.ControlPlaneSettings.NodeDisconnectedAfter.AsTimeDuration()) p.Viper.SetDefault(NodeBootstrapAddresses, cfg.Node.BootstrapAddresses) p.Viper.SetDefault(NodeDownloadURLRequestRetries, cfg.Node.DownloadURLRequestRetries) p.Viper.SetDefault(NodeDownloadURLRequestTimeout, cfg.Node.DownloadURLRequestTimeout.AsTimeDuration()) @@ -306,6 +314,11 @@ func Set(cfg BacalhauConfig, opts ...SetOption) { p.Viper.Set(NodeComputeLocalPublisherAddress, cfg.Node.Compute.LocalPublisher.Address) p.Viper.Set(NodeComputeLocalPublisherPort, cfg.Node.Compute.LocalPublisher.Port) p.Viper.Set(NodeComputeLocalPublisherDirectory, cfg.Node.Compute.LocalPublisher.Directory) + p.Viper.Set(NodeComputeControlPlaneSettings, cfg.Node.Compute.ControlPlaneSettings) + p.Viper.Set(NodeComputeControlPlaneSettingsInfoUpdateFrequency, cfg.Node.Compute.ControlPlaneSettings.InfoUpdateFrequency.AsTimeDuration()) + p.Viper.Set(NodeComputeControlPlaneSettingsResourceUpdateFrequency, cfg.Node.Compute.ControlPlaneSettings.ResourceUpdateFrequency.AsTimeDuration()) + 
p.Viper.Set(NodeComputeControlPlaneSettingsHeartbeatFrequency, cfg.Node.Compute.ControlPlaneSettings.HeartbeatFrequency.AsTimeDuration()) + p.Viper.Set(NodeComputeControlPlaneSettingsHeartbeatTopic, cfg.Node.Compute.ControlPlaneSettings.HeartbeatTopic) p.Viper.Set(NodeRequester, cfg.Node.Requester) p.Viper.Set(NodeRequesterJobDefaults, cfg.Node.Requester.JobDefaults) p.Viper.Set(NodeRequesterJobDefaultsExecutionTimeout, cfg.Node.Requester.JobDefaults.ExecutionTimeout.AsTimeDuration()) @@ -344,6 +357,10 @@ func Set(cfg BacalhauConfig, opts ...SetOption) { p.Viper.Set(NodeRequesterTagCacheDuration, cfg.Node.Requester.TagCache.Duration.AsTimeDuration()) p.Viper.Set(NodeRequesterTagCacheFrequency, cfg.Node.Requester.TagCache.Frequency.AsTimeDuration()) p.Viper.Set(NodeRequesterDefaultPublisher, cfg.Node.Requester.DefaultPublisher) + p.Viper.Set(NodeRequesterControlPlaneSettings, cfg.Node.Requester.ControlPlaneSettings) + p.Viper.Set(NodeRequesterControlPlaneSettingsHeartbeatCheckFrequency, cfg.Node.Requester.ControlPlaneSettings.HeartbeatCheckFrequency.AsTimeDuration()) + p.Viper.Set(NodeRequesterControlPlaneSettingsHeartbeatTopic, cfg.Node.Requester.ControlPlaneSettings.HeartbeatTopic) + p.Viper.Set(NodeRequesterControlPlaneSettingsNodeDisconnectedAfter, cfg.Node.Requester.ControlPlaneSettings.NodeDisconnectedAfter.AsTimeDuration()) p.Viper.Set(NodeBootstrapAddresses, cfg.Node.BootstrapAddresses) p.Viper.Set(NodeDownloadURLRequestRetries, cfg.Node.DownloadURLRequestRetries) p.Viper.Set(NodeDownloadURLRequestTimeout, cfg.Node.DownloadURLRequestTimeout.AsTimeDuration()) diff --git a/pkg/config/types/requester.go b/pkg/config/types/requester.go index 8d1aefcb13..42644d7299 100644 --- a/pkg/config/types/requester.go +++ b/pkg/config/types/requester.go @@ -25,6 +25,8 @@ type RequesterConfig struct { TagCache DockerCacheConfig `yaml:"TagCache"` DefaultPublisher string `yaml:"DefaultPublisher"` + + ControlPlaneSettings RequesterControlPlaneConfig `yaml:"ControlPlaneSettings"` } type EvaluationBrokerConfig struct { @@ -53,3 +55,18 @@ type S3StorageProviderConfig struct { type JobDefaults struct { ExecutionTimeout Duration `yaml:"ExecutionTimeout"` } + +type RequesterControlPlaneConfig struct { + // This setting is the time period after which a compute node is considered to be unresponsive. + // If the compute node misses two of these frequencies, it will be marked as unknown. The compute + // node should have a frequency setting less than this one to ensure that it does not keep + // switching between unknown and active too frequently. + HeartbeatCheckFrequency Duration `yaml:"HeartbeatFrequency"` + + // This is the pubsub topic that the compute node will use to send heartbeats to the requester node. + HeartbeatTopic string `yaml:"HeartbeatTopic"` + + // This is the time period after which a compute node is considered to be disconnected. If the compute + // node does not deliver a heartbeat every `NodeDisconnectedAfter` then it is considered disconnected. + NodeDisconnectedAfter Duration `yaml:"NodeDisconnectedAfter"` +} diff --git a/pkg/lib/collections/hashed_priority_queue.go b/pkg/lib/collections/hashed_priority_queue.go index 3e68c3fa0a..ef9025b352 100644 --- a/pkg/lib/collections/hashed_priority_queue.go +++ b/pkg/lib/collections/hashed_priority_queue.go @@ -33,7 +33,7 @@ func (q *HashedPriorityQueue[K, T]) Contains(id K) bool { // Enqueue will add the item specified by `data` to the queue with the // the priority given by `priority`. 
diff --git a/pkg/lib/collections/hashed_priority_queue.go b/pkg/lib/collections/hashed_priority_queue.go
index 3e68c3fa0a..ef9025b352 100644
--- a/pkg/lib/collections/hashed_priority_queue.go
+++ b/pkg/lib/collections/hashed_priority_queue.go
@@ -33,7 +33,7 @@ func (q *HashedPriorityQueue[K, T]) Contains(id K) bool {
 
 // Enqueue will add the item specified by `data` to the queue with
 // the priority given by `priority`.
-func (q *HashedPriorityQueue[K, T]) Enqueue(data T, priority int) {
+func (q *HashedPriorityQueue[K, T]) Enqueue(data T, priority int64) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
 
@@ -56,6 +56,7 @@ func (q *HashedPriorityQueue[K, T]) Dequeue() *QueueItem[T] {
 		return nil
 	}
 
+	// Find the key for the item and delete it from the presence map
 	k := q.indexer(item.Value)
 	delete(q.identifiers, k)
 
diff --git a/pkg/lib/collections/priority_queue.go b/pkg/lib/collections/priority_queue.go
index 200c6227c7..72129a414e 100644
--- a/pkg/lib/collections/priority_queue.go
+++ b/pkg/lib/collections/priority_queue.go
@@ -20,7 +20,7 @@ var (
 type PriorityQueueInterface[T any] interface {
 	// Enqueue will add the item specified by `data` to the queue with
 	// the priority given by `priority`.
-	Enqueue(data T, priority int)
+	Enqueue(data T, priority int64)
 
 	// Dequeue returns the next highest priority item, returning both
 	// the data Enqueued previously, and the priority with which it was
@@ -55,7 +55,7 @@ type PriorityQueue[T any] struct {
 // the various dequeue methods
 type QueueItem[T any] struct {
 	Value    T
-	Priority int
+	Priority int64
 }
 
 // MatchingFunction can be used when 'iterating' the priority queue to find
@@ -73,7 +73,7 @@ func NewPriorityQueue[T any]() *PriorityQueue[T] {
 
 // Enqueue will add the item specified by `data` to the queue with
 // the priority given by `priority`.
-func (pq *PriorityQueue[T]) Enqueue(data T, priority int) {
+func (pq *PriorityQueue[T]) Enqueue(data T, priority int64) {
 	pq.mu.Lock()
 	defer pq.mu.Unlock()
 
@@ -82,7 +82,7 @@
 
 // enqueue is a lock-free version of Enqueue for internal use when a
 // method already has a lock.
-func (pq *PriorityQueue[T]) enqueue(data T, priority int) {
+func (pq *PriorityQueue[T]) enqueue(data T, priority int64) {
 	heap.Push(
 		&pq.internalQueue,
 		&heapItem{
@@ -180,7 +180,7 @@ type queueHeap []*heapItem
 
 type heapItem struct {
 	value    any
-	priority int
+	priority int64
 	index    int // The index for update
 }
 
diff --git a/pkg/lib/collections/priority_queue_test.go b/pkg/lib/collections/priority_queue_test.go
index 1bd9ac8f14..9a2d115ab8 100644
--- a/pkg/lib/collections/priority_queue_test.go
+++ b/pkg/lib/collections/priority_queue_test.go
@@ -20,7 +20,7 @@ func TestPriorityQueueSuite(t *testing.T) {
 func (s *PriorityQueueSuite) TestSimple() {
 	type testcase struct {
 		v string
-		p int
+		p int64
 	}
 	inputs := []testcase{
 		{"B", 2}, {"A", 3}, {"C", 1}, {"A", 3}, {"C", 1}, {"B", 2},
@@ -31,7 +31,34 @@ func (s *PriorityQueueSuite) TestSimple() {
 
 	pq := collections.NewPriorityQueue[string]()
 	for _, tc := range inputs {
-		pq.Enqueue(tc.v, tc.p)
+		pq.Enqueue(tc.v, int64(tc.p))
+	}
+
+	for _, tc := range expected {
+		qitem := pq.Dequeue()
+		s.Require().NotNil(qitem)
+		s.Require().Equal(tc.v, qitem.Value)
+		s.Require().Equal(tc.p, qitem.Priority)
+	}
+
+	s.Require().True(pq.IsEmpty())
+}
+
+func (s *PriorityQueueSuite) TestSimpleMin() {
+	type testcase struct {
+		v string
+		p int64
+	}
+	inputs := []testcase{
+		{"B", -2}, {"A", -3}, {"C", -1}, {"A", -3}, {"C", -1}, {"B", -2},
+	}
+	expected := []testcase{
+		{"C", -1}, {"C", -1}, {"B", -2}, {"B", -2}, {"A", -3}, {"A", -3},
+	}
+
+	pq := collections.NewPriorityQueue[string]()
+	for _, tc := range inputs {
+		pq.Enqueue(tc.v, int64(tc.p))
 	}
 
 	for _, tc := range expected {
 		qitem := pq.Dequeue()
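Widening priorities to int64 is what allows Unix timestamps to be used directly as priorities. As TestSimple asserts, the queue dequeues the highest priority first, so the negative priorities exercised by TestSimpleMin are the idiom for oldest-first draining. A small illustrative sketch along those lines, assuming the collections package from this patch:

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/bacalhau-project/bacalhau/pkg/lib/collections"
    )

    func main() {
    	pq := collections.NewPriorityQueue[string]()

    	// Negating the timestamp makes the oldest entry the highest priority,
    	// so it drains first from this max-first queue.
    	now := time.Now().Unix()
    	pq.Enqueue("older", -(now - 60))
    	pq.Enqueue("newer", -now)

    	fmt.Println(pq.Dequeue().Value) // prints "older"
    }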
@@ -69,7 +96,7 @@ func (s *PriorityQueueSuite) TestDequeueWhere() {
 
 	s.Require().NotNil(qitem)
 	s.Require().Equal("B", qitem.Value)
-	s.Require().Equal(3, qitem.Priority)
+	s.Require().Equal(int64(3), qitem.Priority)
 	s.Require().Equal(count-1, pq.Len())
 }
 
diff --git a/pkg/lib/concurrency/striped_map.go b/pkg/lib/concurrency/striped_map.go
index 6fb6775b18..1c3d1bbe04 100644
--- a/pkg/lib/concurrency/striped_map.go
+++ b/pkg/lib/concurrency/striped_map.go
@@ -55,7 +55,6 @@ func (s *StripedMap[T]) Put(key string, value T) {
 func (s *StripedMap[T]) Get(key string) (T, bool) {
 	idx := s.hash(key)
-
 	s.locks[idx].RLock()
 	defer s.locks[idx].RUnlock()
 
@@ -65,7 +64,6 @@ func (s *StripedMap[T]) Delete(key string) {
 	idx := s.hash(key)
-
 	_, found := s.Get(key)
 	if !found {
 		// Return early if the key does not exist.
diff --git a/pkg/models/node_info.go b/pkg/models/node_info.go
index e5b286dd36..1a0a5a5dc2 100644
--- a/pkg/models/node_info.go
+++ b/pkg/models/node_info.go
@@ -84,6 +84,7 @@ type NodeInfo struct {
 	ComputeNodeInfo *ComputeNodeInfo `json:"ComputeNodeInfo,omitempty" yaml:",omitempty"`
 	BacalhauVersion BuildVersionInfo `json:"BacalhauVersion"`
 	Approval        NodeApproval     `json:"Approval"`
+	State           NodeState        `json:"State"`
 }
 
 // ID returns the node ID
diff --git a/pkg/models/node_state.go b/pkg/models/node_state.go
new file mode 100644
index 0000000000..94aea6101b
--- /dev/null
+++ b/pkg/models/node_state.go
@@ -0,0 +1,95 @@
+package models
+
+import (
+	"fmt"
+)
+
+type NodeState struct {
+	liveness
+}
+
+type liveness int
+
+// To add a new state (for instance, a state beyond which the node is considered
+// lost):
+// * add it to the end of the list in the const below
+// * add it to strLivenessArray and typeLivenessMap
+// * add it to the livenessContainer and corresponding NodeStates var.
+// * add it to the All() method in the livenessContainer
+const (
+	connected liveness = iota
+	disconnected
+)
+
+var (
+	strLivenessArray = [...]string{
+		connected:    "CONNECTED",
+		disconnected: "DISCONNECTED",
+	}
+
+	typeLivenessMap = map[string]liveness{
+		"CONNECTED":    connected,
+		"DISCONNECTED": disconnected,
+	}
+)
+
+func (t liveness) String() string {
+	return strLivenessArray[t]
+}
+
+func ParseState(a any) NodeState {
+	switch v := a.(type) {
+	case NodeState:
+		return v
+	case string:
+		return NodeState{stringToLiveness(v)}
+	case fmt.Stringer:
+		return NodeState{stringToLiveness(v.String())}
+	case int:
+		return NodeState{liveness(v)}
+	case int64:
+		return NodeState{liveness(int(v))}
+	case int32:
+		return NodeState{liveness(int(v))}
+	}
+	return NodeState{disconnected}
+}
+
+func stringToLiveness(s string) liveness {
+	if v, ok := typeLivenessMap[s]; ok {
+		return v
+	}
+	return disconnected
+}
+
+func (t liveness) IsValid() bool {
+	return t >= connected && t < liveness(len(strLivenessArray))
+}
+
+type livenessContainer struct {
+	CONNECTED    NodeState
+	DISCONNECTED NodeState
+	HEALTHY      NodeState
+}
+
+var NodeStates = livenessContainer{
+	CONNECTED:    NodeState{connected},
+	DISCONNECTED: NodeState{disconnected},
+	// HEALTHY is an alias for CONNECTED: a node that has recently delivered
+	// a heartbeat is considered connected.
+	HEALTHY: NodeState{connected},
+}
+
+func (c livenessContainer) All() []NodeState {
+	return []NodeState{
+		c.CONNECTED,
+		c.DISCONNECTED,
+	}
+}
+
+func (s NodeState) MarshalJSON() ([]byte, error) {
+	return []byte(`"` + s.String() + `"`), nil
+}
+
+func (s *NodeState) UnmarshalJSON(b []byte) error {
+	val := string(trimQuotes(b))
+	*s = ParseState(val)
+	return nil
+}
diff --git a/pkg/nats/proxy/management_proxy.go b/pkg/nats/proxy/management_proxy.go
index 2a1de9da4e..f802e6d45f 100644
--- a/pkg/nats/proxy/management_proxy.go
+++ b/pkg/nats/proxy/management_proxy.go
@@ -115,7 +115,7 @@ func send[Q managementRequest, R managementResponse](
 	respMsg, err := 
conn.Request(subject, data, requestTimeout) if err != nil { - log.Ctx(ctx).Error().Err(err).Msgf("error sending request to subject %s", subject) + log.Ctx(ctx).Warn().Err(err).Msgf("error sending request to subject %s", subject) return nil, err } diff --git a/pkg/nats/transport/nats.go b/pkg/nats/transport/nats.go index 85338de923..f249c322b1 100644 --- a/pkg/nats/transport/nats.go +++ b/pkg/nats/transport/nats.go @@ -79,7 +79,7 @@ func (c *NATSTransportConfig) Validate() error { } type NATSTransport struct { - Config NATSTransportConfig + Config *NATSTransportConfig nodeID string natsServer *nats_helper.ServerManager natsClient *nats_helper.ClientManager @@ -92,7 +92,7 @@ type NATSTransport struct { //nolint:funlen func NewNATSTransport(ctx context.Context, - config NATSTransportConfig) (*NATSTransport, error) { + config *NATSTransportConfig) (*NATSTransport, error) { log.Debug().Msgf("Creating NATS transport with config: %+v", config) if err := config.Validate(); err != nil { return nil, fmt.Errorf("error validating nats transport config. %w", err) @@ -189,7 +189,7 @@ func NewNATSTransport(ctx context.Context, }, nil } -func CreateClient(ctx context.Context, config NATSTransportConfig) (*nats_helper.ClientManager, error) { +func CreateClient(ctx context.Context, config *NATSTransportConfig) (*nats_helper.ClientManager, error) { // create nats client log.Debug().Msgf("Creating NATS client with servers: %s", strings.Join(config.Orchestrators, ",")) clientOptions := []nats.Option{ diff --git a/pkg/node/compute.go b/pkg/node/compute.go index 0774bebe6e..b6168c21fd 100644 --- a/pkg/node/compute.go +++ b/pkg/node/compute.go @@ -20,6 +20,7 @@ import ( executor_util "github.com/bacalhau-project/bacalhau/pkg/executor/util" "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/node/heartbeat" "github.com/bacalhau-project/bacalhau/pkg/publicapi" compute_endpoint "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/compute" "github.com/bacalhau-project/bacalhau/pkg/publisher" @@ -58,6 +59,7 @@ func NewComputeNode( computeCallback compute.Callback, managementProxy compute.ManagementEndpoint, configuredLabels map[string]string, + heartbeatClient *heartbeat.HeartbeatClient, ) (*Compute, error) { executionStore := config.ExecutionStore @@ -191,7 +193,7 @@ func NewComputeNode( var managementClient *compute.ManagementClient // TODO: When we no longer use libP2P for management, we should remove this - // as the managementProxy will always be set. + // as the managementProxy will always be set for NATS if managementProxy != nil { // TODO: Make the registration lock folder a config option so that we have it // available and don't have to depend on getting the repo folder. @@ -202,13 +204,15 @@ func NewComputeNode( // Set up the management client which will attempt to register this node // with the requester node, and then if successful will send regular node // info updates. 
-	managementClient = compute.NewManagementClient(compute.ManagementClientParams{
+	managementClient = compute.NewManagementClient(&compute.ManagementClientParams{
 		NodeID:               nodeID,
 		LabelsProvider:       labelsProvider,
 		ManagementProxy:      managementProxy,
 		NodeInfoDecorator:    nodeInfoDecorator,
 		RegistrationFilePath: regFilename,
 		ResourceTracker:      runningCapacityTracker,
+		HeartbeatClient:      heartbeatClient,
+		ControlPlaneSettings: config.ControlPlaneSettings,
 	})
 	if err := managementClient.RegisterNode(ctx); err != nil {
 		return nil, fmt.Errorf("failed to register node with requester: %s", err)
diff --git a/pkg/node/config_compute.go b/pkg/node/config_compute.go
index e44c05a270..25e739180f 100644
--- a/pkg/node/config_compute.go
+++ b/pkg/node/config_compute.go
@@ -75,6 +75,8 @@ type ComputeConfigParams struct {
 	ExecutionStore store.ExecutionStore
 
 	LocalPublisher types.LocalPublisherConfig
+
+	ControlPlaneSettings types.ComputeControlPlaneConfig
 }
 
 type ComputeConfig struct {
@@ -119,6 +121,8 @@ type ComputeConfig struct {
 	ExecutionStore store.ExecutionStore
 
 	LocalPublisher types.LocalPublisherConfig
+
+	ControlPlaneSettings types.ComputeControlPlaneConfig
 }
 
 func NewComputeConfigWithDefaults() (ComputeConfig, error) {
@@ -152,6 +156,20 @@ func NewComputeConfigWith(params ComputeConfigParams) (ComputeConfig, error) {
 		}
 	}
 
+	// Control plane settings defaults
+	if params.ControlPlaneSettings.HeartbeatFrequency == 0 {
+		params.ControlPlaneSettings.HeartbeatFrequency = DefaultComputeConfig.ControlPlaneSettings.HeartbeatFrequency
+	}
+	if params.ControlPlaneSettings.InfoUpdateFrequency == 0 {
+		params.ControlPlaneSettings.InfoUpdateFrequency = DefaultComputeConfig.ControlPlaneSettings.InfoUpdateFrequency
+	}
+	if params.ControlPlaneSettings.ResourceUpdateFrequency == 0 {
+		params.ControlPlaneSettings.ResourceUpdateFrequency = DefaultComputeConfig.ControlPlaneSettings.ResourceUpdateFrequency
+	}
+	if params.ControlPlaneSettings.HeartbeatTopic == "" {
+		params.ControlPlaneSettings.HeartbeatTopic = DefaultComputeConfig.ControlPlaneSettings.HeartbeatTopic
+	}
+
 	// Get available physical resources in the host
 	physicalResourcesProvider := params.PhysicalResourcesProvider
 	if physicalResourcesProvider == nil {
@@ -204,6 +222,7 @@ func NewComputeConfigWith(params ComputeConfigParams) (ComputeConfig, error) {
 		BidResourceStrategy:  params.BidResourceStrategy,
 		ExecutionStore:       params.ExecutionStore,
 		LocalPublisher:       params.LocalPublisher,
+		ControlPlaneSettings: params.ControlPlaneSettings,
 	}
 
 	if err := validateConfig(config, physicalResources); err != nil {
diff --git a/pkg/node/config_defaults.go b/pkg/node/config_defaults.go
index 35f32b35ca..8cd6edbe76 100644
--- a/pkg/node/config_defaults.go
+++ b/pkg/node/config_defaults.go
@@ -33,6 +33,12 @@ var DefaultComputeConfig = ComputeConfigParams{
 	LocalPublisher: types.LocalPublisherConfig{
 		Directory: path.Join(config.GetStoragePath(), "bacalhau-local-publisher"),
 	},
+	ControlPlaneSettings: types.ComputeControlPlaneConfig{
+		InfoUpdateFrequency:     types.Duration(60 * time.Second), //nolint:gomnd
+		ResourceUpdateFrequency: types.Duration(30 * time.Second), //nolint:gomnd
+		HeartbeatFrequency:      types.Duration(15 * time.Second), //nolint:gomnd
+		HeartbeatTopic:          "heartbeat",
+	},
 }
 
 var DefaultRequesterConfig = RequesterConfigParams{
@@ -62,6 +68,12 @@ var DefaultRequesterConfig = RequesterConfigParams{
 	S3PreSignedURLExpiration: 30 * time.Minute,
 
 	TranslationEnabled: false,
+
+	ControlPlaneSettings: types.RequesterControlPlaneConfig{
+		HeartbeatCheckFrequency: types.Duration(30 * 
time.Second), //nolint:gomnd + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), //nolint:gomnd + }, } var TestRequesterConfig = RequesterConfigParams{ @@ -90,6 +102,12 @@ var TestRequesterConfig = RequesterConfigParams{ S3PreSignedURLDisabled: false, S3PreSignedURLExpiration: 30 * time.Minute, + + ControlPlaneSettings: types.RequesterControlPlaneConfig{ + HeartbeatCheckFrequency: types.Duration(30 * time.Second), //nolint:gomnd + HeartbeatTopic: "heartbeat", + NodeDisconnectedAfter: types.Duration(30 * time.Second), //nolint:gomnd + }, } func getRequesterConfigParams() RequesterConfigParams { diff --git a/pkg/node/config_requester.go b/pkg/node/config_requester.go index b150bf28d3..e074c78720 100644 --- a/pkg/node/config_requester.go +++ b/pkg/node/config_requester.go @@ -5,6 +5,7 @@ import ( "net/url" "time" + "github.com/bacalhau-project/bacalhau/pkg/config/types" "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/imdario/mergo" "github.com/rs/zerolog/log" @@ -51,6 +52,8 @@ type RequesterConfigParams struct { JobStore jobstore.Store DefaultPublisher string + + ControlPlaneSettings types.RequesterControlPlaneConfig } type RequesterConfig struct { diff --git a/pkg/node/heartbeat/client.go b/pkg/node/heartbeat/client.go new file mode 100644 index 0000000000..0c661e13ac --- /dev/null +++ b/pkg/node/heartbeat/client.go @@ -0,0 +1,43 @@ +package heartbeat + +import ( + "context" + + natsPubSub "github.com/bacalhau-project/bacalhau/pkg/nats/pubsub" + "github.com/bacalhau-project/bacalhau/pkg/pubsub" + + "github.com/nats-io/nats.go" +) + +type HeartbeatClient struct { + publisher *natsPubSub.PubSub[Heartbeat] + nodeID string +} + +func NewClient(conn *nats.Conn, nodeID string, topic string) (*HeartbeatClient, error) { + subParams := natsPubSub.PubSubParams{ + Subject: topic, + Conn: conn, + } + + publisher, err := natsPubSub.NewPubSub[Heartbeat](subParams) + if err != nil { + return nil, err + } + + return &HeartbeatClient{publisher: publisher, nodeID: nodeID}, nil +} + +func (h *HeartbeatClient) SendHeartbeat(ctx context.Context, sequence uint64) error { + return h.Publish(ctx, Heartbeat{NodeID: h.nodeID, Sequence: sequence}) +} + +func (h *HeartbeatClient) Publish(ctx context.Context, message Heartbeat) error { + return h.publisher.Publish(ctx, message) +} + +func (h *HeartbeatClient) Close(ctx context.Context) error { + return h.publisher.Close(ctx) +} + +var _ pubsub.Publisher[Heartbeat] = (*HeartbeatClient)(nil) diff --git a/pkg/node/heartbeat/heartbeat_test.go b/pkg/node/heartbeat/heartbeat_test.go new file mode 100644 index 0000000000..9ecb1e9bc2 --- /dev/null +++ b/pkg/node/heartbeat/heartbeat_test.go @@ -0,0 +1,154 @@ +//go:build unit || !integration + +package heartbeat + +import ( + "context" + "fmt" + "strconv" + "testing" + "time" + + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/benbjohnson/clock" + "github.com/nats-io/nats-server/v2/server" + natsserver "github.com/nats-io/nats-server/v2/test" + "github.com/nats-io/nats.go" + "github.com/stretchr/testify/suite" +) + +const ( + TestPort = 8369 + TestTopic = "test" +) + +type HeartbeatTestSuite struct { + suite.Suite + + clock *clock.Mock + + nats *server.Server + client *nats.Conn +} + +func TestHeartbeatTestSuite(t *testing.T) { + suite.Run(t, new(HeartbeatTestSuite)) +} + +func (s *HeartbeatTestSuite) SetupTest() { + opts := &natsserver.DefaultTestOptions + opts.Port = TestPort + opts.JetStream = true + opts.StoreDir = s.T().TempDir() + + s.nats = 
natsserver.RunServer(opts)
+	client, err := nats.Connect(s.nats.Addr().String())
+	s.Require().NoError(err)
+
+	s.client = client
+}
+
+func (s *HeartbeatTestSuite) TearDownTest() {
+	s.nats.Shutdown()
+}
+
+func (s *HeartbeatTestSuite) TestSendHeartbeat() {
+	ctx := context.Background()
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	s.clock = clock.NewMock()
+	server, err := NewServer(HeartbeatServerParams{
+		Clock:                 s.clock,
+		Client:                s.client,
+		Topic:                 TestTopic,
+		CheckFrequency:        1 * time.Second,
+		NodeDisconnectedAfter: 10 * time.Second,
+	})
+	s.Require().NoError(err)
+
+	err = server.Start(ctx)
+	s.Require().NoError(err)
+
+	type testcase struct {
+		name           string
+		includeInitial bool
+		heartbeats     []time.Duration
+		expectedState  models.NodeState
+		waitUntil      time.Duration
+	}
+
+	testcases := []testcase{
+		// No further heartbeats; the node should still be CONNECTED shortly
+		// after its initial heartbeat
+		{name: "simple", includeInitial: true, heartbeats: []time.Duration{}, expectedState: models.NodeStates.CONNECTED, waitUntil: time.Duration(5 * time.Second)},
+
+		// Node heartbeats again 30s after connecting, then goes quiet for
+		// another 30s, so it should end up DISCONNECTED
+		{
+			name:           "disconnected",
+			includeInitial: true,
+			heartbeats: []time.Duration{
+				time.Duration(30 * time.Second),
+			},
+			expectedState: models.NodeStates.DISCONNECTED,
+			waitUntil:     time.Duration(30 * time.Second),
+		},
+
+		// Node heartbeats shortly after connecting, then goes silent past the
+		// disconnect window, so it should be DISCONNECTED
+		{
+			name:           "disconnected after silence",
+			includeInitial: true,
+			heartbeats: []time.Duration{
+				time.Duration(1 * time.Second),
+			},
+			expectedState: models.NodeStates.DISCONNECTED,
+			waitUntil:     time.Duration(30 * time.Second),
+		},
+
+		// Nodes that have never been seen should be DISCONNECTED
+		{
+			name:           "never seen (default)",
+			includeInitial: false,
+			heartbeats:     []time.Duration{},
+			expectedState:  models.NodeStates.DISCONNECTED,
+			waitUntil:      time.Duration(10 * time.Second),
+		},
+	}
+
+	for i, tc := range testcases {
+		nodeInfo := models.NodeInfo{
+			NodeID: "node-" + strconv.Itoa(i),
+		}
+
+		s.T().Run(tc.name, func(t *testing.T) {
+			// Create a heartbeat client for this node
+			client, err := NewClient(s.client, nodeInfo.NodeID, TestTopic)
+			s.Require().NoError(err)
+			defer client.Close(ctx)
+
+			var seq uint64 = 1
+
+			// Optionally send initial connection heartbeat
+			if tc.includeInitial {
+				err = client.SendHeartbeat(ctx, seq)
+				s.Require().NoError(err)
+			}
+
+			// Wait for the first check frequency to pass before we check the state
+			s.clock.Add(1 * time.Second)
+
+			// Send heartbeats after each duration in the test case
+			for _, duration := range tc.heartbeats {
+				s.clock.Add(duration) // advance the mock clock before the next heartbeat
+				seq += 1
+				err = client.SendHeartbeat(ctx, seq)
+				s.Require().NoError(err)
+			}
+
+			s.clock.Add(tc.waitUntil)
+
+			server.UpdateNodeInfo(&nodeInfo)
+			s.Require().Equal(tc.expectedState, nodeInfo.State, fmt.Sprintf("incorrect state in %s", tc.name))
+		})
+	}
+}
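The test drives the server's ticker with github.com/benbjohnson/clock rather than real time: advancing the mock clock fires any due ticks deterministically, so the disconnect checks never race the wall clock. The same pattern in isolation, as a rough sketch:

    package main

    import (
    	"fmt"
    	"time"

    	"github.com/benbjohnson/clock"
    )

    func main() {
    	clk := clock.NewMock()
    	ticker := clk.Ticker(5 * time.Second)

    	// Advance the mock clock; this fires the ticker without real waiting.
    	go clk.Add(5 * time.Second)

    	<-ticker.C
    	fmt.Println("tick after five simulated seconds")
    }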
"github.com/bacalhau-project/bacalhau/pkg/pubsub" +) + +type HeartbeatServerParams struct { + Client *nats.Conn + Topic string + Clock clock.Clock + CheckFrequency time.Duration + NodeDisconnectedAfter time.Duration +} + +type HeartbeatServer struct { + clock clock.Clock + subscription *natsPubSub.PubSub[Heartbeat] + pqueue *collections.HashedPriorityQueue[string, TimestampedHeartbeat] + livenessMap *concurrency.StripedMap[models.NodeState] + checkFrequency time.Duration + disconnectedAfter time.Duration +} + +type TimestampedHeartbeat struct { + Heartbeat + Timestamp int64 +} + +func NewServer(params HeartbeatServerParams) (*HeartbeatServer, error) { + subParams := natsPubSub.PubSubParams{ + Subject: params.Topic, + Conn: params.Client, + } + + subscription, err := natsPubSub.NewPubSub[Heartbeat](subParams) + if err != nil { + return nil, err + } + + pqueue := collections.NewHashedPriorityQueue[string, TimestampedHeartbeat]( + func(h TimestampedHeartbeat) string { + return h.NodeID + }, + ) + + // If no clock was specified, use the real time clock + clk := params.Clock + if clk == nil { + clk = clock.New() + } + + return &HeartbeatServer{ + clock: clk, + subscription: subscription, + pqueue: pqueue, + livenessMap: concurrency.NewStripedMap[models.NodeState](0), // no particular stripe count for now + checkFrequency: params.CheckFrequency, + disconnectedAfter: params.NodeDisconnectedAfter, + }, nil +} + +func (h *HeartbeatServer) Start(ctx context.Context) error { + if err := h.subscription.Subscribe(ctx, h); err != nil { + return err + } + + log.Ctx(ctx).Info().Msg("Heartbeat server started") + + tickerStartCh := make(chan struct{}) + + go func(ctx context.Context) { + defer func() { + if err := h.subscription.Close(ctx); err != nil { + log.Ctx(ctx).Error().Err(err).Msg("Error during heartbeat server shutdown") + } else { + log.Ctx(ctx).Info().Msg("Heartbeat server shutdown") + } + }() + + ticker := h.clock.Ticker(h.checkFrequency) + tickerStartCh <- struct{}{} + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + h.CheckQueue(ctx) + } + } + }(ctx) + + // Wait for the ticker to be created before returning + <-tickerStartCh + + return nil +} + +// CheckQueue will check the queue for old heartbeats that might make a node's +// liveness either unhealthy or unknown, and will update the node's status accordingly. +func (h *HeartbeatServer) CheckQueue(ctx context.Context) { + // These are the timestamps, below which we'll consider the item in one of those two + // states + nowStamp := h.clock.Now().UTC().Unix() + disconnectedUnder := nowStamp - int64(h.disconnectedAfter.Seconds()) + + for { + // Dequeue anything older than the unknown timestamp + item := h.pqueue.DequeueWhere(func(item TimestampedHeartbeat) bool { + return item.Timestamp < disconnectedUnder + }) + + // We haven't found anything old enough yet. We can stop the loop and wait + // for the next cycle. + if item == nil { + break + } + + if item.Value.Timestamp < disconnectedUnder { + h.markNodeAs(item.Value.NodeID, models.NodeStates.DISCONNECTED) + } + } +} + +// markNode will mark a node as being in a certain state. This will be used to update the node's +// info to include the liveness state. 
+// markNodeAs marks a node as being in the given liveness state. The recorded
+// state is later merged into the node's info.
+func (h *HeartbeatServer) markNodeAs(nodeID string, state models.NodeState) {
+	h.livenessMap.Put(nodeID, state)
+}
+
+// UpdateNodeInfo adds the tracked liveness for a specific node to its NodeInfo
+func (h *HeartbeatServer) UpdateNodeInfo(nodeInfo *models.NodeInfo) {
+	if liveness, ok := h.livenessMap.Get(nodeInfo.NodeID); ok {
+		nodeInfo.State = liveness
+	} else {
+		// We've never seen a heartbeat from this node, so mark it as disconnected
+		nodeInfo.State = models.NodeStates.DISCONNECTED
+	}
+}
+
+// FilterNodeInfos will return only those NodeInfos that have the requested liveness
+func (h *HeartbeatServer) FilterNodeInfos(nodeInfos []*models.NodeInfo, state models.NodeState) []*models.NodeInfo {
+	result := make([]*models.NodeInfo, 0)
+	for _, nodeInfo := range nodeInfos {
+		if liveness, ok := h.livenessMap.Get(nodeInfo.NodeID); ok {
+			if liveness == state {
+				result = append(result, nodeInfo)
+			}
+		}
+	}
+	return result
+}
+
+// RemoveNode will handle removing the liveness for a specific node. This is useful when a node
+// is removed from the cluster.
+func (h *HeartbeatServer) RemoveNode(nodeID string) {
+	h.livenessMap.Delete(nodeID)
+}
+
+func (h *HeartbeatServer) Handle(ctx context.Context, message Heartbeat) error {
+	log.Ctx(ctx).Trace().Msgf("heartbeat received from %s", message.NodeID)
+
+	timestamp := h.clock.Now().UTC().Unix()
+
+	if h.pqueue.Contains(message.NodeID) {
+		// If we think we already have a heartbeat from this node, we'll update the
+		// timestamp of the entry so it is re-prioritized in the queue by dequeuing
+		// and re-enqueuing it (this will ensure it is heapified correctly).
+		result := h.pqueue.DequeueWhere(func(item TimestampedHeartbeat) bool {
+			return item.NodeID == message.NodeID
+		})
+
+		if result == nil {
+			log.Ctx(ctx).Warn().Msgf("consistency error in heartbeat heap, node %s not found", message.NodeID)
+			return nil
+		}
+
+		log.Ctx(ctx).Trace().Msgf("Re-enqueueing heartbeat from %s", message.NodeID)
+		result.Value.Timestamp = timestamp
+		h.pqueue.Enqueue(result.Value, timestamp)
+	} else {
+		log.Ctx(ctx).Trace().Msgf("Enqueueing heartbeat from %s", message.NodeID)
+
+		// Enqueue the heartbeat with the current timestamp as its priority;
+		// CheckQueue later dequeues entries whose timestamp has fallen behind
+		// the disconnect window.
+		h.pqueue.Enqueue(TimestampedHeartbeat{Heartbeat: message, Timestamp: timestamp}, timestamp)
+	}
+
+	h.markNodeAs(message.NodeID, models.NodeStates.HEALTHY)
+
+	return nil
+}
+
+var _ pubsub.Subscriber[Heartbeat] = (*HeartbeatServer)(nil)
diff --git a/pkg/node/heartbeat/types.go b/pkg/node/heartbeat/types.go
new file mode 100644
index 0000000000..0d7876ec59
--- /dev/null
+++ b/pkg/node/heartbeat/types.go
@@ -0,0 +1,11 @@
+// Heartbeat represents a heartbeat message from a specific node.
+// It contains the node ID and the sequence number of the heartbeat,
+// which is monotonically increasing (reboots aside). We do not use
+// timestamps on the client; we rely solely on the server-side time
+// to avoid clock drift issues.
+type Heartbeat struct { + NodeID string + Sequence uint64 +} diff --git a/pkg/node/manager/node_manager.go b/pkg/node/manager/node_manager.go index abb6acf809..bc8eb5fc7d 100644 --- a/pkg/node/manager/node_manager.go +++ b/pkg/node/manager/node_manager.go @@ -8,6 +8,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/lib/concurrency" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/models/requests" + "github.com/bacalhau-project/bacalhau/pkg/node/heartbeat" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/libp2p/go-libp2p/core/peer" "github.com/pkg/errors" @@ -25,10 +26,12 @@ const ( type NodeManager struct { nodeInfo routing.NodeInfoStore resourceMap *concurrency.StripedMap[models.Resources] + heartbeats *heartbeat.HeartbeatServer } type NodeManagerParams struct { - NodeInfo routing.NodeInfoStore + NodeInfo routing.NodeInfoStore + Heartbeats *heartbeat.HeartbeatServer } // NewNodeManager constructs a new node manager and returns a pointer @@ -37,9 +40,24 @@ func NewNodeManager(params NodeManagerParams) *NodeManager { return &NodeManager{ resourceMap: concurrency.NewStripedMap[models.Resources](resourceMapLockCount), nodeInfo: params.NodeInfo, + heartbeats: params.Heartbeats, } } +func (n *NodeManager) Start(ctx context.Context) error { + if n.heartbeats != nil { + err := n.heartbeats.Start(ctx) + if err != nil { + log.Ctx(ctx).Error().Err(err).Msg("failed to start heartbeat server") + return err + } + } + + log.Ctx(ctx).Info().Msg("Node manager started") + + return nil +} + // // ---- Implementation of compute.ManagementEndpoint ---- // @@ -101,7 +119,7 @@ func (n *NodeManager) UpdateInfo(ctx context.Context, request requests.UpdateInf }, nil } - // TODO(ross): Add a Put endpoint that takes the revision into account + // TODO: Add a Put endpoint that takes the revision into account? 
if err := n.nodeInfo.Add(ctx, request.Info); err != nil { return nil, errors.Wrap(err, "failed to save nodeinfo during node registration") } @@ -142,11 +160,15 @@ func (n *NodeManager) Add(ctx context.Context, nodeInfo models.NodeInfo) error { return n.nodeInfo.Add(ctx, nodeInfo) } -func (n *NodeManager) addResourcesToInfo(ctx context.Context, info *models.NodeInfo) { +func (n *NodeManager) addToInfo(ctx context.Context, info *models.NodeInfo) { resources, found := n.resourceMap.Get(info.NodeID) if found && info.ComputeNodeInfo != nil { info.ComputeNodeInfo.AvailableCapacity = resources } + + if n.heartbeats != nil { + n.heartbeats.UpdateNodeInfo(info) + } } func (n *NodeManager) Get(ctx context.Context, nodeID string) (models.NodeInfo, error) { @@ -154,7 +176,7 @@ func (n *NodeManager) Get(ctx context.Context, nodeID string) (models.NodeInfo, if err != nil { return models.NodeInfo{}, err } - n.addResourcesToInfo(ctx, &info) + n.addToInfo(ctx, &info) return info, nil } @@ -163,7 +185,7 @@ func (n *NodeManager) GetByPrefix(ctx context.Context, prefix string) (models.No if err != nil { return models.NodeInfo{}, err } - n.addResourcesToInfo(ctx, &info) + n.addToInfo(ctx, &info) return info, nil } @@ -174,7 +196,7 @@ func (n *NodeManager) List(ctx context.Context, filters ...routing.NodeInfoFilte } for i := range items { - n.addResourcesToInfo(ctx, &items[i]) + n.addToInfo(ctx, &items[i]) } return items, nil diff --git a/pkg/node/node.go b/pkg/node/node.go index 9cd738f386..d43d8dcd0d 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -22,6 +22,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" nats_transport "github.com/bacalhau-project/bacalhau/pkg/nats/transport" + "github.com/bacalhau-project/bacalhau/pkg/node/heartbeat" "github.com/bacalhau-project/bacalhau/pkg/node/manager" "github.com/bacalhau-project/bacalhau/pkg/node/metrics" "github.com/bacalhau-project/bacalhau/pkg/publicapi" @@ -195,10 +196,13 @@ func NewNode( } // node info store that is used for both discovering compute nodes, as to find addresses of other nodes for routing requests. + var natsConfig *nats_transport.NATSTransportConfig var transportLayer transport.TransportLayer var tracingInfoStore routing.NodeInfoStore + var heartbeatSvr *heartbeat.HeartbeatServer + if config.NetworkConfig.Type == models.NetworkTypeNATS { - natsConfig := nats_transport.NATSTransportConfig{ + natsConfig = &nats_transport.NATSTransportConfig{ NodeID: config.NodeID, Port: config.NetworkConfig.Port, AdvertisedAddress: config.NetworkConfig.AdvertisedAddress, @@ -234,6 +238,17 @@ func NewNode( } tracingInfoStore = tracing.NewNodeStore(nodeInfoStore) + heartbeatParams := heartbeat.HeartbeatServerParams{ + Client: natsClient.Client, + Topic: config.RequesterNodeConfig.ControlPlaneSettings.HeartbeatTopic, + CheckFrequency: config.RequesterNodeConfig.ControlPlaneSettings.HeartbeatCheckFrequency.AsTimeDuration(), + NodeDisconnectedAfter: config.RequesterNodeConfig.ControlPlaneSettings.NodeDisconnectedAfter.AsTimeDuration(), + } + heartbeatSvr, err = heartbeat.NewServer(heartbeatParams) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to create heartbeat server using NATS transport connection info") + } + // Once the KV store has been created, it can be offered to the transport layer to be used as a consumer // of node info. 
if err := transportLayer.RegisterNodeInfoConsumer(ctx, tracingInfoStore); err != nil { @@ -283,11 +298,20 @@ func NewNode( ) // Create a new node manager to keep track of compute nodes connecting - // to the network. + // to the network. Provide it with a mechanism to lookup (and enhance) + // node info, and a reference to the heartbeat server if running NATS. nodeManager := manager.NewNodeManager(manager.NodeManagerParams{ - NodeInfo: tracingInfoStore, + NodeInfo: tracingInfoStore, + Heartbeats: heartbeatSvr, }) + // Start the nodemanager, ensuring it doesn't block the main thread and + // that any errors are logged. If we are unable to start the manager + // then we should not start the node. + if err := nodeManager.Start(ctx); err != nil { + return nil, pkgerrors.Wrap(err, "failed to start node manager") + } + // NodeManager node wraps the node manager and implements the routing.NodeInfoStore // interface so that it can return nodes and add the most recent resource information // to the node info returned. When the libp2p transport is no longer necessary, we @@ -348,6 +372,28 @@ func NewNode( attribute.StringSlice("node_engines", executors.Keys(ctx)), ) + var hbClient *heartbeat.HeartbeatClient + + // We want to provide a heartbeat client to the compute node if we are using NATS. + // We can only create a heartbeat client if we have a NATS client, and we can + // only do that if the configuration is available. Whilst we support libp2p this + // is not always the case. + if natsConfig != nil { + natsClient, err := nats_transport.CreateClient(ctx, natsConfig) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to create NATS client for node info store") + } + + hbClient, err = heartbeat.NewClient( + natsClient.Client, + config.NodeID, + config.ComputeConfig.ControlPlaneSettings.HeartbeatTopic, + ) + if err != nil { + return nil, pkgerrors.Wrap(err, "failed to create heartbeat client") + } + } + // setup compute node computeNode, err = NewComputeNode( ctx, @@ -362,6 +408,7 @@ func NewNode( transportLayer.CallbackProxy(), transportLayer.ManagementProxy(), config.Labels, + hbClient, ) if err != nil { return nil, err diff --git a/pkg/publicapi/apimodels/node.go b/pkg/publicapi/apimodels/node.go index 05ebf60b65..39933fdd28 100644 --- a/pkg/publicapi/apimodels/node.go +++ b/pkg/publicapi/apimodels/node.go @@ -17,8 +17,9 @@ type GetNodeResponse struct { type ListNodesRequest struct { BaseListRequest - Labels []labels.Requirement `query:"-"` // don't auto bind as it requires special handling - FilterByStatus string `query:"filter-status"` + Labels []labels.Requirement `query:"-"` // don't auto bind as it requires special handling + FilterByApproval string `query:"filter-approval"` + FilterByStatus string `query:"filter-status"` } // ToHTTPRequest is used to convert the request to an HTTP request @@ -29,6 +30,10 @@ func (o *ListNodesRequest) ToHTTPRequest() *HTTPRequest { r.Params.Add("labels", v.String()) } + if o.FilterByApproval != "" { + r.Params.Add("filter-approval", o.FilterByApproval) + } + if o.FilterByStatus != "" { r.Params.Add("filter-status", o.FilterByStatus) } diff --git a/pkg/publicapi/endpoint/orchestrator/node.go b/pkg/publicapi/endpoint/orchestrator/node.go index c4817f5341..7533a35768 100644 --- a/pkg/publicapi/endpoint/orchestrator/node.go +++ b/pkg/publicapi/endpoint/orchestrator/node.go @@ -28,7 +28,6 @@ func (e *Endpoint) getNode(c echo.Context) error { }) } -//nolint:gocyclo // cyclomatic complexity is high here becomes of the complex sorting logic func (e 
*Endpoint) listNodes(c echo.Context) error { ctx := c.Request().Context() var args apimodels.ListNodesRequest @@ -53,35 +52,11 @@ func (e *Endpoint) listNodes(c echo.Context) error { } // parse order_by - var sortFnc func(a, b *models.NodeInfo) int - switch args.OrderBy { - case "id", "": - sortFnc = func(a, b *models.NodeInfo) int { return util.Compare[string]{}.Cmp(a.ID(), b.ID()) } - case "type": - sortFnc = func(a, b *models.NodeInfo) int { return util.Compare[models.NodeType]{}.Cmp(a.NodeType, b.NodeType) } - case "available_cpu": - sortFnc = func(a, b *models.NodeInfo) int { - return util.Compare[float64]{}.CmpRev(capacity(a).CPU, capacity(b).CPU) - } - case "available_memory": - sortFnc = func(a, b *models.NodeInfo) int { - return util.Compare[uint64]{}.CmpRev(capacity(a).Memory, capacity(b).Memory) - } - case "available_disk": - sortFnc = func(a, b *models.NodeInfo) int { - return util.Compare[uint64]{}.CmpRev(capacity(a).Disk, capacity(b).Disk) - } - case "available_gpu": - sortFnc = func(a, b *models.NodeInfo) int { - return util.Compare[uint64]{}.CmpRev(capacity(a).GPU, capacity(b).GPU) - } - case "approval", "status": - sortFnc = func(a, b *models.NodeInfo) int { - return util.Compare[string]{}.Cmp(a.Approval.String(), b.Approval.String()) - } - default: + sortFnc := e.getSortFunction(args.OrderBy, capacity) + if sortFnc == nil { return echo.NewHTTPError(http.StatusBadRequest, "invalid order_by") } + if args.Reverse { baseSortFnc := sortFnc sortFnc = func(a, b *models.NodeInfo) int { @@ -102,12 +77,17 @@ func (e *Endpoint) listNodes(c echo.Context) error { return err } + args.FilterByApproval = strings.ToUpper(args.FilterByApproval) args.FilterByStatus = strings.ToUpper(args.FilterByStatus) // filter nodes, first by status, then by label selectors res := make([]*models.NodeInfo, 0) for i, node := range allNodes { - if args.FilterByStatus != "" && args.FilterByStatus != node.Approval.String() { + if args.FilterByApproval != "" && args.FilterByApproval != node.Approval.String() { + continue + } + + if args.FilterByStatus != "" && args.FilterByStatus != node.State.String() { continue } @@ -130,6 +110,41 @@ func (e *Endpoint) listNodes(c echo.Context) error { }) } +type resourceFunc func(node *models.NodeInfo) *models.Resources +type sortFunc func(a, b *models.NodeInfo) int + +func (e *Endpoint) getSortFunction(orderBy string, capacity resourceFunc) sortFunc { + switch orderBy { + case "id", "": + return func(a, b *models.NodeInfo) int { return util.Compare[string]{}.Cmp(a.ID(), b.ID()) } + case "type": + return func(a, b *models.NodeInfo) int { return util.Compare[models.NodeType]{}.Cmp(a.NodeType, b.NodeType) } + case "available_cpu": + return func(a, b *models.NodeInfo) int { + return util.Compare[float64]{}.CmpRev(capacity(a).CPU, capacity(b).CPU) + } + case "available_memory": + return func(a, b *models.NodeInfo) int { + return util.Compare[uint64]{}.CmpRev(capacity(a).Memory, capacity(b).Memory) + } + case "available_disk": + return func(a, b *models.NodeInfo) int { + return util.Compare[uint64]{}.CmpRev(capacity(a).Disk, capacity(b).Disk) + } + case "available_gpu": + return func(a, b *models.NodeInfo) int { + return util.Compare[uint64]{}.CmpRev(capacity(a).GPU, capacity(b).GPU) + } + case "approval", "status": + return func(a, b *models.NodeInfo) int { + return util.Compare[string]{}.Cmp(a.Approval.String(), b.Approval.String()) + } + default: + } + + return nil +} + func (e *Endpoint) updateNode(c echo.Context) error { ctx := c.Request().Context() diff --git 
a/pkg/test/compute/setup_test.go b/pkg/test/compute/setup_test.go index 78c50d8868..45eb87fa47 100644 --- a/pkg/test/compute/setup_test.go +++ b/pkg/test/compute/setup_test.go @@ -119,6 +119,7 @@ func (s *ComputeSuite) setupNode() { callback, nil, // until we switch to testing with NATS map[string]string{}, // empty configured labels + nil, // no heartbeat client ) s.NoError(err) s.stateResolver = *resolver.NewStateResolver(resolver.StateResolverParams{ From c48db717472d3e89e3fca074431d01dc8ee00c54 Mon Sep 17 00:00:00 2001 From: Simon Worthington Date: Tue, 9 Apr 2024 15:13:06 +1000 Subject: [PATCH 03/17] Ensure compute node labels are published over LibP2P connections A previous commit accidentally removed the labels used by the compute node from the NodeInfoProvider used by LibP2P. This commit restores those labels to that provider and adds a test that labels are available via the CLI in both LibP2P and NATS networking. --- pkg/node/compute.go | 4 ++-- pkg/node/node.go | 10 +++++---- test/labels.sh | 49 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 6 deletions(-) create mode 100755 test/labels.sh diff --git a/pkg/node/compute.go b/pkg/node/compute.go index b6168c21fd..0a2fe69032 100644 --- a/pkg/node/compute.go +++ b/pkg/node/compute.go @@ -41,7 +41,7 @@ type Compute struct { ManagementClient *compute.ManagementClient cleanupFunc func(ctx context.Context) nodeInfoDecorator models.NodeInfoDecorator - autoLabelsProvider models.LabelsProvider + labelsProvider models.LabelsProvider debugInfoProviders []model.DebugInfoProvider } @@ -240,7 +240,7 @@ func NewComputeNode( Bidder: bidder, cleanupFunc: cleanupFunc, nodeInfoDecorator: nodeInfoDecorator, - autoLabelsProvider: labelsProvider, + labelsProvider: labelsProvider, debugInfoProviders: debugInfoProviders, ManagementClient: managementClient, }, nil diff --git a/pkg/node/node.go b/pkg/node/node.go index d43d8dcd0d..89447add8b 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -281,10 +281,7 @@ func NewNode( var requesterNode *Requester var computeNode *Compute - labelsProvider := models.MergeLabelsInOrder( - &ConfigLabelsProvider{staticLabels: config.Labels}, - &RuntimeLabelsProvider{}, - ) + var labelsProvider models.LabelsProvider // setup requester node if config.IsRequesterNode { @@ -351,6 +348,10 @@ func NewNode( } } + labelsProvider = models.MergeLabelsInOrder( + &ConfigLabelsProvider{staticLabels: config.Labels}, + &RuntimeLabelsProvider{}, + ) debugInfoProviders = append(debugInfoProviders, requesterNode.debugInfoProviders...) } @@ -419,6 +420,7 @@ func NewNode( return nil, err } + labelsProvider = computeNode.labelsProvider debugInfoProviders = append(debugInfoProviders, computeNode.debugInfoProviders...) } diff --git a/test/labels.sh b/test/labels.sh new file mode 100755 index 0000000000..8a9ccec96e --- /dev/null +++ b/test/labels.sh @@ -0,0 +1,49 @@ +#!bin/bashtub + +source bin/bacalhau.sh + +run_test() { + WORD=$RANDOM + subject bacalhau config set node.labels key=value "random=$WORD" + create_node $1 + + # Wait for node to have published information. + subject bacalhau node list --output=json + while ! 
jq -e 'length > 0' <<< $stdout 1>/dev/null; do
        sleep 0.05;
        subject bacalhau node list --output=json
    done

    assert_equal 1 $(jq -rcM length <<< $stdout)
    assert_not_equal 0 $(jq -rcM '.[0].Labels | length' <<< $stdout)
    assert_equal false $(jq -rcM '.[0].Labels["Operating-System"] == null' <<< $stdout)
    assert_equal false $(jq -rcM '.[0].Labels["Architecture"] == null' <<< $stdout)
    assert_equal value $(jq -rcM '.[0].Labels["key"]' <<< $stdout)
    assert_equal $WORD $(jq -rcM '.[0].Labels["random"]' <<< $stdout)
}

testcase_receive_labels_about_requester_node_for_nats() {
    subject bacalhau config set node.network.type nats
    assert_equal 0 $status
    run_test requester
}

testcase_receive_extra_labels_about_compute_node_for_nats() {
    subject bacalhau config set node.network.type nats
    assert_equal 0 $status
    run_test requester,compute
    assert_equal false $(jq -rcM '.[0].Labels["git-lfs"] == null' <<< $stdout)
}

testcase_receive_labels_about_requester_node_for_libp2p() {
    subject bacalhau config set node.network.type libp2p
    assert_equal 0 $status
    run_test requester
}

testcase_receive_extra_labels_about_compute_node_for_libp2p() {
    subject bacalhau config set node.network.type libp2p
    assert_equal 0 $status
    run_test requester,compute
    assert_equal false $(jq -rcM '.[0].Labels["git-lfs"] == null' <<< $stdout)
}

From 860d58c19467903d7616a74cc5f1988e100b8050 Mon Sep 17 00:00:00 2001
From: Ross Jones
Date: Tue, 9 Apr 2024 11:27:08 +0100
Subject: [PATCH 04/17] Use RunE instead of Run for cobra commands (#3767)

We were inconsistently using Run/RunE in various places, and this PR
fixes that to ensure that all commands use RunE and rely on the call to
util.Fatal in the root of the command.

Resolves #3764
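The shape of the refactor is the same in every file below: handlers return their error instead of calling a fatal helper themselves, and the error is reported once at the root. A minimal sketch of the resulting pattern (the command and doWork function are illustrative, not taken from the tree):

    package main

    import (
    	"errors"

    	"github.com/spf13/cobra"
    )

    func doWork() error { return errors.New("boom") }

    func newExampleCmd() *cobra.Command {
    	return &cobra.Command{
    		Use: "example",
    		// With RunE the handler returns its error, and cobra surfaces it
    		// to the caller of Execute, which reports it in one place instead
    		// of each handler terminating the process itself.
    		RunE: func(cmd *cobra.Command, args []string) error {
    			return doWork()
    		},
    	}
    }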
---
 cmd/cli/agent/alive.go            | 10 +++++----
 cmd/cli/agent/node.go             | 10 +++++----
 cmd/cli/agent/version.go          | 10 +++++----
 cmd/cli/create/create.go          |  6 ++----
 cmd/cli/create/create_test.go     | 13 +----------
 cmd/cli/describe/describe.go      |  6 ++----
 cmd/cli/describe/describe_test.go |  6 +-----
 cmd/cli/devstack/devstack.go      | 12 ++++-------
 cmd/cli/docker/docker_run.go      |  6 ++----
 cmd/cli/docker/docker_run_test.go | 14 +++---------
 cmd/cli/exec/exec.go              |  6 ++----
 cmd/cli/get/get.go                |  6 ++----
 cmd/cli/id/id.go                  | 12 ++++-------
 cmd/cli/job/describe.go           | 14 ++++++------
 cmd/cli/job/executions.go         | 10 +++++----
 cmd/cli/job/history.go            | 10 +++++----
 cmd/cli/job/list.go               | 12 ++++++-----
 cmd/cli/job/logs.go               |  6 ++----
 cmd/cli/job/run.go                | 36 ++++++++++++++-----------------
 cmd/cli/list/list.go              |  6 ++----
 cmd/cli/node/describe.go          | 10 +++++----
 cmd/cli/node/list.go              |  6 +++---
 cmd/cli/serve/serve.go            |  6 ++----
 cmd/cli/validate/validate.go      |  7 ++----
 cmd/cli/validate/validate_test.go | 12 ++++------
 cmd/cli/version/version.go        |  6 ++----
 cmd/cli/wasm/wasm_run.go          |  6 ++----
 cmd/cli/wasm/wasm_run_test.go     |  2 --
 cmd/testing/base.go               |  2 --
 cmd/testing/basetls.go            |  3 ---
 go.work.sum                       |  1 +
 31 files changed, 110 insertions(+), 162 deletions(-)

diff --git a/cmd/cli/agent/alive.go b/cmd/cli/agent/alive.go
index 93199bac50..53ea1f6cd1 100644
--- a/cmd/cli/agent/alive.go
+++ b/cmd/cli/agent/alive.go
@@ -28,22 +28,24 @@ func NewAliveCmd() *cobra.Command {
 		Use:   "alive",
 		Short: "Get the agent's liveness and health info.",
 		Args:  cobra.NoArgs,
-		Run:   o.runAlive,
+		RunE:  o.runAlive,
 	}
 	aliveCmd.Flags().AddFlagSet(cliflags.OutputNonTabularFormatFlags(&o.OutputOpts))
 	return aliveCmd
 }
 
 // Run executes alive command
-func (o *AliveOptions) runAlive(cmd *cobra.Command, _ []string) {
+func (o *AliveOptions) runAlive(cmd *cobra.Command, _ []string) error {
 	ctx := cmd.Context()
 	response, err := util.GetAPIClientV2(cmd).Agent().Alive(ctx)
 	if err != nil {
-		util.Fatal(cmd, fmt.Errorf("could not get server alive: %w", err), 1)
+		return fmt.Errorf("could not get server alive: %w", err)
 	}
 
 	writeErr := output.OutputOneNonTabular(cmd, o.OutputOpts, response)
 	if writeErr != nil {
-		util.Fatal(cmd, fmt.Errorf("failed to write alive: %w", writeErr), 1)
+		return fmt.Errorf("failed to write alive: %w", writeErr)
 	}
+
+	return nil
 }
diff --git a/cmd/cli/agent/node.go b/cmd/cli/agent/node.go
index 4a01f7bad1..4fe10f36c5 100644
--- a/cmd/cli/agent/node.go
+++ b/cmd/cli/agent/node.go
@@ -29,22 +29,24 @@ func NewNodeCmd() *cobra.Command {
 		Use:   "node",
 		Short: "Get the agent's node info.",
 		Args:  cobra.NoArgs,
-		Run:   o.runNode,
+		RunE:  o.runNode,
 	}
 	nodeCmd.Flags().AddFlagSet(cliflags.OutputNonTabularFormatFlags(&o.OutputOpts))
 	return nodeCmd
 }
 
 // Run executes node command
-func (o *NodeOptions) runNode(cmd *cobra.Command, _ []string) {
+func (o *NodeOptions) runNode(cmd *cobra.Command, _ []string) error {
 	ctx := cmd.Context()
 	response, err := util.GetAPIClientV2(cmd).Agent().Node(ctx, &apimodels.GetAgentNodeRequest{})
 	if err != nil {
-		util.Fatal(cmd, fmt.Errorf("could not get server node: %w", err), 1)
+		return fmt.Errorf("could not get server node: %w", err)
 	}
 
 	writeErr := output.OutputOneNonTabular(cmd, o.OutputOpts, response.NodeInfo)
 	if writeErr != nil {
-		util.Fatal(cmd, fmt.Errorf("failed to write node: %w", writeErr), 1)
+		return fmt.Errorf("failed to write node: %w", writeErr)
 	}
+
+	return nil
 }
diff --git a/cmd/cli/agent/version.go b/cmd/cli/agent/version.go
index b53ad854f8..c794afa7fd 100644
--- a/cmd/cli/agent/version.go
+++ b/cmd/cli/agent/version.go
@@ -29,18 +29,18 @@ func NewVersionCmd() *cobra.Command {
 		Use:   "version",
 		Short: "Get the agent version.",
 		Args:  cobra.NoArgs,
-		Run:   oV.runVersion,
+		RunE:  oV.runVersion,
 	}
 	versionCmd.Flags().AddFlagSet(cliflags.OutputNonTabularFormatFlags(&oV.OutputOpts))
 	return versionCmd
 }
 
 // Run executes version command
-func (oV *VersionOptions) runVersion(cmd *cobra.Command, _ []string) {
+func (oV *VersionOptions) runVersion(cmd *cobra.Command, _ []string) error {
 	ctx := cmd.Context()
 	serverVersionResponse, err := util.GetAPIClientV2(cmd).Agent().Version(ctx)
 	if err != nil {
-		util.Fatal(cmd, fmt.Errorf("could not get server version: %w", err), 1)
+		return fmt.Errorf("could not get server version: %w", err)
 	}
 
 	v := serverVersionResponse.BuildVersionInfo
@@ -58,6 +58,8 @@ func (oV *VersionOptions) runVersion(cmd *cobra.Command, _ []string) {
 	}
 
 	if writeErr != nil {
-		util.Fatal(cmd, fmt.Errorf("failed to write version: %w", writeErr), 1)
+		return fmt.Errorf("failed to write version: %w", writeErr)
 	}
+
+	return nil
 }
diff --git a/cmd/cli/create/create.go b/cmd/cli/create/create.go
index 45313ae93d..2a9c1bf5b4 100644
--- a/cmd/cli/create/create.go
+++ b/cmd/cli/create/create.go
@@ -67,10 +67,8 @@ func NewCmd() *cobra.Command {
 		Args:     cobra.MinimumNArgs(0),
 		PreRunE:  hook.RemoteCmdPreRunHooks,
 		PostRunE: hook.RemoteCmdPostRunHooks,
-		Run: func(cmd *cobra.Command, cmdArgs []string) {
-			if err := create(cmd, cmdArgs, OC); err != nil {
-				util.Fatal(cmd, err, 1)
-			}
+		RunE: func(cmd *cobra.Command, cmdArgs []string) error {
+			return create(cmd, cmdArgs, OC)
 		},
 	}
diff --git a/cmd/cli/create/create_test.go b/cmd/cli/create/create_test.go
index 8ce6684fd5..ca0087446b 100644
--- a/cmd/cli/create/create_test.go
+++ b/cmd/cli/create/create_test.go
@@ -5,11 +5,9 @@
package create_test import ( "context" "os" - "strings" "sync" "testing" - "github.com/bacalhau-project/bacalhau/pkg/lib/marshaller" "github.com/spf13/cobra" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" @@ -139,14 +137,5 @@ func (s *CreateSuite) TestCreateDontPanicOnEmptyFile() { commandReturnValue := <-commandChan - errorOutputMap := make(map[string]interface{}) - for _, o := range strings.Split(commandReturnValue.out, "\n") { - err := marshaller.YAMLUnmarshalWithMax([]byte(o), &errorOutputMap) - if err != nil { - continue - } - } - - require.Contains(s.T(), errorOutputMap["Message"], "the job provided is invalid", "Output message should error properly.") - require.Equal(s.T(), int(errorOutputMap["Code"].(float64)), 1, "Expected no error when no input is provided") + require.Contains(s.T(), commandReturnValue.out, "the job provided is invalid", "Output message should error properly.") } diff --git a/cmd/cli/describe/describe.go b/cmd/cli/describe/describe.go index b737bb080a..88fbdeee3b 100644 --- a/cmd/cli/describe/describe.go +++ b/cmd/cli/describe/describe.go @@ -59,10 +59,8 @@ func NewCmd() *cobra.Command { Args: cobra.ExactArgs(1), PreRunE: hook.RemoteCmdPreRunHooks, PostRunE: hook.RemoteCmdPostRunHooks, - Run: func(cmd *cobra.Command, cmdArgs []string) { // nolintunparam // incorrectly suggesting unused - if err := describe(cmd, cmdArgs, OD); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, cmdArgs []string) error { + return describe(cmd, cmdArgs, OD) }, } diff --git a/cmd/cli/describe/describe_test.go b/cmd/cli/describe/describe_test.go index dda4225585..2e133e88fa 100644 --- a/cmd/cli/describe/describe_test.go +++ b/cmd/cli/describe/describe_test.go @@ -13,7 +13,6 @@ import ( "github.com/stretchr/testify/suite" cmdtesting "github.com/bacalhau-project/bacalhau/cmd/testing" - "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/pkg/bacerrors" legacy_job "github.com/bacalhau-project/bacalhau/pkg/legacyjob" "github.com/bacalhau-project/bacalhau/pkg/model" @@ -196,7 +195,6 @@ func (s *DescribeSuite) TestDescribeJobEdgeCases() { {numOfJobs: 1}, // just enough that describe could get screwed up } - util.Fatal = util.FakeFatalErrorHandler for _, tc := range tests { for _, n := range numOfJobsTests { func() { @@ -247,10 +245,8 @@ func (s *DescribeSuite) TestDescribeJobEdgeCases() { returnedJobEngineSpec, fmt.Sprintf("Submitted job entrypoints not the same as the description. 
Edgecase: %s", tc.describeIDEdgecase)) } else { - c := &model.TestFatalErrorHandlerContents{} - s.Require().NoError(model.JSONUnmarshalWithMax([]byte(out), &c)) e := bacerrors.NewJobNotFound(tc.describeIDEdgecase) - s.Require().Contains(c.Message, e.GetMessage(), "Job not found error string not found.", err) + s.Require().Contains(string(out), e.GetMessage(), "Job not found error string not found.", err) } }() diff --git a/cmd/cli/devstack/devstack.go b/cmd/cli/devstack/devstack.go index 42310e1b02..ecd854c50c 100644 --- a/cmd/cli/devstack/devstack.go +++ b/cmd/cli/devstack/devstack.go @@ -81,15 +81,11 @@ func NewCmd() *cobra.Command { Short: "Start a cluster of bacalhau nodes for testing and development", Long: devStackLong, Example: devstackExample, - PreRun: func(cmd *cobra.Command, _ []string) { - if err := configflags.BindFlags(cmd, devstackFlags); err != nil { - util.Fatal(cmd, err, 1) - } + PreRunE: func(cmd *cobra.Command, _ []string) error { + return configflags.BindFlags(cmd, devstackFlags) }, - Run: func(cmd *cobra.Command, _ []string) { - if err := runDevstack(cmd, ODs, IsNoop); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, _ []string) error { + return runDevstack(cmd, ODs, IsNoop) }, } diff --git a/cmd/cli/docker/docker_run.go b/cmd/cli/docker/docker_run.go index f9c5cef7a2..ec60c78cdf 100644 --- a/cmd/cli/docker/docker_run.go +++ b/cmd/cli/docker/docker_run.go @@ -105,10 +105,8 @@ func newDockerRunCmd() *cobra.Command { //nolint:funlen Args: cobra.MinimumNArgs(1), PreRunE: hook.Chain(hook.RemoteCmdPreRunHooks, configflags.PreRun(dockerRunFlags)), PostRunE: hook.RemoteCmdPostRunHooks, - Run: func(cmd *cobra.Command, cmdArgs []string) { - if err := dockerRun(cmd, cmdArgs, opts); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, cmdArgs []string) error { + return dockerRun(cmd, cmdArgs, opts) }, } diff --git a/cmd/cli/docker/docker_run_test.go b/cmd/cli/docker/docker_run_test.go index 54f66fbca7..14f14a3d64 100644 --- a/cmd/cli/docker/docker_run_test.go +++ b/cmd/cli/docker/docker_run_test.go @@ -21,7 +21,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" cmdtesting "github.com/bacalhau-project/bacalhau/cmd/testing" - "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/pkg/devstack" "github.com/bacalhau-project/bacalhau/pkg/docker" "github.com/bacalhau-project/bacalhau/pkg/ipfs" @@ -45,7 +44,6 @@ type DockerRunSuite struct { // In order for 'go test' to run this suite, we need to create // a normal test function and pass our suite to suite.Run func TestDockerRunSuite(t *testing.T) { - util.Fatal = util.FakeFatalErrorHandler suite.Run(t, new(DockerRunSuite)) } @@ -359,11 +357,8 @@ func (s *DockerRunSuite) TestRun_SubmitOutputs() { _, out, err := s.ExecuteTestCobraCommand(flagsArray...) if tcids.err != "" { - firstFatalError, err := testutils.FirstFatalError(s.T(), out) - - s.Require().NoErrorf(err, "Error unmarshaling errors. Run - Number of Jobs: %s. Job number: %s", tc.numberOfJobs, i) - s.Require().Greaterf(firstFatalError.Code, 0, "Expected an error, but none provided. %+v", tcids) - s.Require().Contains(firstFatalError.Message, "invalid output volume", "Missed detection of invalid output volume.") + s.Require().Error(err) + s.Require().Contains(string(out), "invalid output volume", "Missed detection of invalid output volume.") return // Go to next in loop } s.Require().NoError(err, "Error submitting job. Run - Number of Jobs: %d. 
Job number: %d", tc.numberOfJobs, i) @@ -579,10 +574,7 @@ func (s *DockerRunSuite) TestRun_SubmitWorkdir() { _, out, err := s.ExecuteTestCobraCommand(flagsArray...) if tc.errorCode != 0 { - fatalError, err := testutils.FirstFatalError(s.T(), out) - s.Require().NoError(err, "Error getting first fatal error") - - s.Require().NotNil(fatalError, "Expected fatal error, but none found") + s.Require().NotNil(err, "Expected fatal error, but none found") } else { s.Require().NoError(err, "Error submitting job.") diff --git a/cmd/cli/exec/exec.go b/cmd/cli/exec/exec.go index 9e9e944127..d3039b3a39 100644 --- a/cmd/cli/exec/exec.go +++ b/cmd/cli/exec/exec.go @@ -80,15 +80,13 @@ func NewCmdWithOptions(options *ExecOptions) *cobra.Command { PreRunE: hook.RemoteCmdPreRunHooks, PostRunE: hook.RemoteCmdPostRunHooks, FParseErrWhitelist: cobra.FParseErrWhitelist{UnknownFlags: true}, - Run: func(cmd *cobra.Command, cmdArgs []string) { + RunE: func(cmd *cobra.Command, cmdArgs []string) error { // Find the unknown arguments from the original args. We only want to find the // flags that are unknown. We will only support the long form for custom // job types as we will want to use them as keys in template completions. unknownArgs := ExtractUnknownArgs(cmd.Flags(), os.Args[1:]) - if err := exec(cmd, cmdArgs, unknownArgs, options); err != nil { - util.Fatal(cmd, err, 1) - } + return exec(cmd, cmdArgs, unknownArgs, options) }, } diff --git a/cmd/cli/get/get.go b/cmd/cli/get/get.go index 0978ce187e..381fd2dd33 100644 --- a/cmd/cli/get/get.go +++ b/cmd/cli/get/get.go @@ -55,10 +55,8 @@ func NewCmd() *cobra.Command { Args: cobra.ExactArgs(1), PreRunE: hook.Chain(hook.RemoteCmdPreRunHooks, configflags.PreRun(getFlags)), PostRunE: hook.RemoteCmdPostRunHooks, - Run: func(cmd *cobra.Command, cmdArgs []string) { - if err := get(cmd, cmdArgs, OG); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, cmdArgs []string) error { + return get(cmd, cmdArgs, OG) }, } diff --git a/cmd/cli/id/id.go b/cmd/cli/id/id.go index b914d5a8d4..09b9e1b097 100644 --- a/cmd/cli/id/id.go +++ b/cmd/cli/id/id.go @@ -31,15 +31,11 @@ func NewCmd() *cobra.Command { idCmd := &cobra.Command{ Use: "id", Short: "Show bacalhau node id info", - PreRun: func(cmd *cobra.Command, _ []string) { - if err := configflags.BindFlags(cmd, idFlags); err != nil { - util.Fatal(cmd, err, 1) - } + PreRunE: func(cmd *cobra.Command, _ []string) error { + return configflags.BindFlags(cmd, idFlags) }, - Run: func(cmd *cobra.Command, _ []string) { - if err := id(cmd, outputOpts); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, _ []string) error { + return id(cmd, outputOpts) }, } diff --git a/cmd/cli/job/describe.go b/cmd/cli/job/describe.go index c76088caee..9eebe1e29f 100644 --- a/cmd/cli/job/describe.go +++ b/cmd/cli/job/describe.go @@ -54,13 +54,13 @@ func NewDescribeCmd() *cobra.Command { Long: describeLong, Example: describeExample, Args: cobra.ExactArgs(1), - Run: o.run, + RunE: o.run, } jobCmd.Flags().AddFlagSet(cliflags.OutputNonTabularFormatFlags(&o.OutputOpts)) return jobCmd } -func (o *DescribeOptions) run(cmd *cobra.Command, args []string) { +func (o *DescribeOptions) run(cmd *cobra.Command, args []string) error { ctx := cmd.Context() jobID := args[0] response, err := util.GetAPIClientV2(cmd).Jobs().Get(ctx, &apimodels.GetJobRequest{ @@ -69,14 +69,14 @@ func (o *DescribeOptions) run(cmd *cobra.Command, args []string) { }) if err != nil { - util.Fatal(cmd, fmt.Errorf("could not get job %s: %w", jobID, err), 1) + 
return fmt.Errorf("could not get job %s: %w", jobID, err) } if o.OutputOpts.Format != "" { if err = output.OutputOneNonTabular(cmd, o.OutputOpts, response); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to write job %s: %w", jobID, err), 1) + return fmt.Errorf("failed to write job %s: %w", jobID, err) } - return + return nil } job := response.Job @@ -89,9 +89,11 @@ func (o *DescribeOptions) run(cmd *cobra.Command, args []string) { o.printHeaderData(cmd, job) o.printExecutionsSummary(cmd, executions) if err = o.printExecutions(cmd, executions); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to write job executions %s: %w", jobID, err), 1) + return fmt.Errorf("failed to write job executions %s: %w", jobID, err) } o.printOutputs(cmd, executions) + + return nil } func (o *DescribeOptions) printHeaderData(cmd *cobra.Command, job *models.Job) { diff --git a/cmd/cli/job/executions.go b/cmd/cli/job/executions.go index 82804eb60e..699b333009 100644 --- a/cmd/cli/job/executions.go +++ b/cmd/cli/job/executions.go @@ -59,7 +59,7 @@ func NewExecutionCmd() *cobra.Command { Long: executionLong, Example: executionExample, Args: cobra.ExactArgs(1), - Run: o.run, + RunE: o.run, } nodeCmd.Flags().AddFlagSet(cliflags.ListFlags(&o.ListOptions)) @@ -120,7 +120,7 @@ var executionColumns = []output.TableColumn[*models.Execution]{ executionColumnDesired, } -func (o *ExecutionOptions) run(cmd *cobra.Command, args []string) { +func (o *ExecutionOptions) run(cmd *cobra.Command, args []string) error { ctx := cmd.Context() jobID := args[0] response, err := util.GetAPIClientV2(cmd).Jobs().Executions(ctx, &apimodels.ListJobExecutionsRequest{ @@ -133,10 +133,12 @@ func (o *ExecutionOptions) run(cmd *cobra.Command, args []string) { }, }) if err != nil { - util.Fatal(cmd, err, 1) + return err } if err = output.Output(cmd, executionColumns, o.OutputOptions, response.Executions); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to output: %w", err), 1) + return fmt.Errorf("failed to output: %w", err) } + + return nil } diff --git a/cmd/cli/job/history.go b/cmd/cli/job/history.go index 616d17a875..dbe5cad02a 100644 --- a/cmd/cli/job/history.go +++ b/cmd/cli/job/history.go @@ -63,7 +63,7 @@ func NewHistoryCmd() *cobra.Command { Long: historyLong, Example: historyExample, Args: cobra.ExactArgs(1), - Run: o.run, + RunE: o.run, } nodeCmd.Flags().StringVar(&o.EventType, "event-type", o.EventType, @@ -123,7 +123,7 @@ var historyColumns = []output.TableColumn[*models.JobHistory]{ }, } -func (o *HistoryOptions) run(cmd *cobra.Command, args []string) { +func (o *HistoryOptions) run(cmd *cobra.Command, args []string) error { ctx := cmd.Context() jobID := args[0] response, err := util.GetAPIClientV2(cmd).Jobs().History(ctx, &apimodels.ListJobHistoryRequest{ @@ -139,10 +139,12 @@ func (o *HistoryOptions) run(cmd *cobra.Command, args []string) { }, }) if err != nil { - util.Fatal(cmd, err, 1) + return err } if err = output.Output(cmd, historyColumns, o.OutputOptions, response.History); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to output: %w", err), 1) + return fmt.Errorf("failed to output: %w", err) } + + return nil } diff --git a/cmd/cli/job/list.go b/cmd/cli/job/list.go index 58e799b50a..9eef6a12bc 100644 --- a/cmd/cli/job/list.go +++ b/cmd/cli/job/list.go @@ -68,7 +68,7 @@ func NewListCmd() *cobra.Command { Long: listLong, Example: listExample, Args: cobra.NoArgs, - Run: o.run, + RunE: o.run, } listCmd.Flags().StringVar(&o.Labels, "labels", o.Labels, @@ -113,7 +113,7 @@ var listColumns = 
[]output.TableColumn[*models.Job]{ }, } -func (o *ListOptions) run(cmd *cobra.Command, _ []string) { +func (o *ListOptions) run(cmd *cobra.Command, _ []string) error { ctx := cmd.Context() var err error @@ -121,7 +121,7 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) { if o.Labels != "" { labelRequirements, err = labels.ParseToRequirements(o.Labels) if err != nil { - util.Fatal(cmd, fmt.Errorf("could not parse labels: %w", err), 1) + return fmt.Errorf("could not parse labels: %w", err) } } response, err := util.GetAPIClientV2(cmd).Jobs().List(ctx, &apimodels.ListJobsRequest{ @@ -134,15 +134,17 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) { }, }) if err != nil { - util.Fatal(cmd, fmt.Errorf("failed request: %w", err), 1) + return fmt.Errorf("failed request: %w", err) } if err = output.Output(cmd, listColumns, o.OutputOptions, response.Jobs); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to output: %w", err), 1) + return fmt.Errorf("failed to output: %w", err) } if response.NextToken != "" { msg := fmt.Sprintf("To fetch more records use `--next-token %s`", response.NextToken) cmd.Printf("\n%s\n", msg) } + + return nil } diff --git a/cmd/cli/job/logs.go b/cmd/cli/job/logs.go index 2ff0f0d308..bb964ef3cd 100644 --- a/cmd/cli/job/logs.go +++ b/cmd/cli/job/logs.go @@ -39,16 +39,14 @@ func NewLogCmd() *cobra.Command { Short: logsShortDesc, Example: logsExample, Args: cobra.ExactArgs(1), - Run: func(cmd *cobra.Command, cmdArgs []string) { + RunE: func(cmd *cobra.Command, cmdArgs []string) error { opts := util.LogOptions{ JobID: cmdArgs[0], ExecutionID: options.ExecutionID, Follow: options.Follow, Tail: options.Tail, } - if err := util.Logs(cmd, opts); err != nil { - util.Fatal(cmd, err, 1) - } + return util.Logs(cmd, opts) }, } diff --git a/cmd/cli/job/run.go b/cmd/cli/job/run.go index d629c23cc7..040b7b4549 100644 --- a/cmd/cli/job/run.go +++ b/cmd/cli/job/run.go @@ -61,7 +61,7 @@ func NewRunCmd() *cobra.Command { Long: runLong, Example: runExample, Args: cobra.MinimumNArgs(0), - Run: o.run, + RunE: o.run, } runCmd.Flags().AddFlagSet(cliflags.NewRunTimeSettingsFlags(o.RunTimeSettings)) @@ -77,7 +77,7 @@ func NewRunCmd() *cobra.Command { return runCmd } -func (o *RunOptions) run(cmd *cobra.Command, args []string) { +func (o *RunOptions) run(cmd *cobra.Command, args []string) error { ctx := cmd.Context() // read the job spec from stdin or file @@ -86,23 +86,23 @@ func (o *RunOptions) run(cmd *cobra.Command, args []string) { if len(args) == 0 { byteResult, err = util.ReadFromStdinIfAvailable(cmd) if err != nil { - util.Fatal(cmd, fmt.Errorf("unknown error reading from file or stdin: %w", err), 1) + return fmt.Errorf("unknown error reading from file or stdin: %w", err) } } else { var fileContent *os.File fileContent, err = os.Open(args[0]) if err != nil { - util.Fatal(cmd, fmt.Errorf("error opening file: %w", err), 1) + return fmt.Errorf("error opening file: %w", err) } defer fileContent.Close() byteResult, err = io.ReadAll(fileContent) if err != nil { - util.Fatal(cmd, fmt.Errorf("error reading file: %w", err), 1) + return fmt.Errorf("error reading file: %w", err) } } if len(byteResult) == 0 { - util.Fatal(cmd, errors.New(userstrings.JobSpecBad), 1) + return errors.New(userstrings.JobSpecBad) } if !o.NoTemplate { @@ -111,13 +111,11 @@ func (o *RunOptions) run(cmd *cobra.Command, args []string) { EnvPattern: o.TemplateEnvVarsPattern, }) if err != nil { - util.Fatal(cmd, fmt.Errorf("failed to create template parser: %w", err), 1) - return + return fmt.Errorf("failed to 
create template parser: %w", err) } byteResult, err = parser.ParseBytes(byteResult) if err != nil { - util.Fatal(cmd, fmt.Errorf("%s: %w", userstrings.JobSpecBad, err), 1) - return + return fmt.Errorf("%s: %w", userstrings.JobSpecBad, err) } } @@ -126,16 +124,14 @@ func (o *RunOptions) run(cmd *cobra.Command, args []string) { var j *models.Job err = marshaller.YAMLUnmarshalWithMax(byteResult, &j) if err != nil { - util.Fatal(cmd, fmt.Errorf("%s: %w", userstrings.JobSpecBad, err), 1) - return + return fmt.Errorf("%s: %w", userstrings.JobSpecBad, err) } // Normalize and validate the job spec j.Normalize() err = j.ValidateSubmission() if err != nil { - util.Fatal(cmd, fmt.Errorf("%s: %w", userstrings.JobSpecBad, err), 1) - return + return fmt.Errorf("%s: %w", userstrings.JobSpecBad, err) } if o.RunTimeSettings.DryRun { @@ -145,9 +141,9 @@ func (o *RunOptions) run(cmd *cobra.Command, args []string) { } outputOps := output.NonTabularOutputOptions{Format: output.YAMLFormat} if err = output.OutputOneNonTabular(cmd, outputOps, j); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to write job: %w", err), 1) + return fmt.Errorf("failed to write job: %w", err) } - return + return nil } // Submit the job @@ -156,8 +152,7 @@ func (o *RunOptions) run(cmd *cobra.Command, args []string) { Job: j, }) if err != nil { - util.Fatal(cmd, fmt.Errorf("failed request: %w", err), 1) - return + return fmt.Errorf("failed request: %w", err) } if o.ShowWarnings && len(resp.Warnings) > 0 { @@ -165,9 +160,10 @@ func (o *RunOptions) run(cmd *cobra.Command, args []string) { } if err := printer.PrintJobExecution(ctx, resp.JobID, cmd, o.RunTimeSettings, client); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to print job execution: %w", err), 1) - return + return fmt.Errorf("failed to print job execution: %w", err) } + + return nil } func (o *RunOptions) printWarnings(cmd *cobra.Command, warnings []string) { diff --git a/cmd/cli/list/list.go b/cmd/cli/list/list.go index faaecf0f8d..eed9dd1d2c 100644 --- a/cmd/cli/list/list.go +++ b/cmd/cli/list/list.go @@ -75,10 +75,8 @@ func NewCmd() *cobra.Command { Example: listExample, PreRunE: hook.RemoteCmdPreRunHooks, PostRunE: hook.RemoteCmdPostRunHooks, - Run: func(cmd *cobra.Command, _ []string) { - if err := list(cmd, OL); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, _ []string) error { + return list(cmd, OL) }, } diff --git a/cmd/cli/node/describe.go b/cmd/cli/node/describe.go index 6206e3a63a..9ff61903ee 100644 --- a/cmd/cli/node/describe.go +++ b/cmd/cli/node/describe.go @@ -29,24 +29,26 @@ func NewDescribeCmd() *cobra.Command { Use: "describe [id]", Short: "Get the info of a node by id.", Args: cobra.ExactArgs(1), - Run: o.runDescribe, + RunE: o.runDescribe, } nodeCmd.Flags().AddFlagSet(cliflags.OutputNonTabularFormatFlags(&o.OutputOpts)) return nodeCmd } // Run executes node command -func (o *DescribeOptions) runDescribe(cmd *cobra.Command, args []string) { +func (o *DescribeOptions) runDescribe(cmd *cobra.Command, args []string) error { ctx := cmd.Context() nodeID := args[0] response, err := util.GetAPIClientV2(cmd).Nodes().Get(ctx, &apimodels.GetNodeRequest{ NodeID: nodeID, }) if err != nil { - util.Fatal(cmd, fmt.Errorf("could not get node %s: %w", nodeID, err), 1) + return fmt.Errorf("could not get node %s: %w", nodeID, err) } if err = output.OutputOneNonTabular(cmd, o.OutputOpts, response.Node); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to write node %s: %w", nodeID, err), 1) + return fmt.Errorf("failed to write node %s: %w", 
nodeID, err) } + + return nil } diff --git a/cmd/cli/node/list.go b/cmd/cli/node/list.go index 7a1fe44c04..cd6a104c5e 100644 --- a/cmd/cli/node/list.go +++ b/cmd/cli/node/list.go @@ -69,7 +69,7 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) error { if o.Labels != "" { labelRequirements, err = labels.ParseToRequirements(o.Labels) if err != nil { - util.Fatal(cmd, fmt.Errorf("could not parse labels: %w", err), 1) + return fmt.Errorf("could not parse labels: %w", err) } } @@ -97,7 +97,7 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) error { }, }) if err != nil { - util.Fatal(cmd, fmt.Errorf("failed request: %w", err), 1) + return fmt.Errorf("failed request: %w", err) } columns := alwaysColumns @@ -106,7 +106,7 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) error { } if err = output.Output(cmd, columns, o.OutputOptions, response.Nodes); err != nil { - util.Fatal(cmd, fmt.Errorf("failed to output: %w", err), 1) + return fmt.Errorf("failed to output: %w", err) } return nil diff --git a/cmd/cli/serve/serve.go b/cmd/cli/serve/serve.go index 68222b4f1b..6168238cea 100644 --- a/cmd/cli/serve/serve.go +++ b/cmd/cli/serve/serve.go @@ -126,7 +126,7 @@ func NewCmd() *cobra.Command { Short: "Start the bacalhau compute node", Long: serveLong, Example: serveExample, - PreRun: func(cmd *cobra.Command, args []string) { + PreRunE: func(cmd *cobra.Command, args []string) error { /* NB(forrest): (I learned a lot more about viper and cobra than was intended...) @@ -150,9 +150,7 @@ func NewCmd() *cobra.Command { return the value of the last flag bound to it. This is why it's important to manage flag binding thoughtfully, ensuring each command's context is respected. */ - if err := configflags.BindFlags(cmd, serveFlags); err != nil { - util.Fatal(cmd, err, 1) - } + return configflags.BindFlags(cmd, serveFlags) }, RunE: func(cmd *cobra.Command, _ []string) error { return serve(cmd) diff --git a/cmd/cli/validate/validate.go b/cmd/cli/validate/validate.go index a52e4ddb1c..e6eb5c8047 100644 --- a/cmd/cli/validate/validate.go +++ b/cmd/cli/validate/validate.go @@ -10,7 +10,6 @@ import ( "github.com/invopop/jsonschema" "github.com/spf13/cobra" - "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/util/templates" @@ -67,10 +66,8 @@ func NewCmd() *cobra.Command { Long: validateLong, Example: validateExample, Args: cobra.MinimumNArgs(0), - Run: func(cmd *cobra.Command, cmdArgs []string) { //nolint:unparam // incorrect that cmd is unused. 
- if err := validate(cmd, cmdArgs, OV); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, cmdArgs []string) error { + return validate(cmd, cmdArgs, OV) }, } diff --git a/cmd/cli/validate/validate_test.go b/cmd/cli/validate/validate_test.go index a533204686..7e12d600c4 100644 --- a/cmd/cli/validate/validate_test.go +++ b/cmd/cli/validate/validate_test.go @@ -10,8 +10,6 @@ import ( "github.com/stretchr/testify/suite" cmdtesting "github.com/bacalhau-project/bacalhau/cmd/testing" - "github.com/bacalhau-project/bacalhau/cmd/util" - testutils "github.com/bacalhau-project/bacalhau/pkg/test/utils" "github.com/bacalhau-project/bacalhau/testdata" ) @@ -33,8 +31,6 @@ func (s *ValidateSuite) TestValidate() { } for name, test := range tests { s.Run(name, func() { - util.Fatal = util.FakeFatalErrorHandler - _, out, err := s.ExecuteTestCobraCommand("validate", test.testFile.AsTempFile(s.T(), fmt.Sprintf("%s.*.yaml", name)), ) @@ -44,10 +40,10 @@ func (s *ValidateSuite) TestValidate() { require.Contains(s.T(), out, "The Job is valid", fmt.Sprintf("%s: Jobspec Invalid", name)) } else { require.Error(s.T(), err) - fatalError, err := testutils.FirstFatalError(s.T(), out) - require.NoError(s.T(), err) - require.Contains(s.T(), fatalError.Message, "The Job is not valid.", fmt.Sprintf("%s: Jobspec Invalid returning valid", name)) - require.Contains(s.T(), fatalError.Message, "APIVersion is required", fmt.Sprintf("%s: Jobspec Invalid returning valid", name)) + + fatalError := string(out) + require.Contains(s.T(), fatalError, "The Job is not valid.", fmt.Sprintf("%s: Jobspec Invalid returning valid", name)) + require.Contains(s.T(), fatalError, "APIVersion is required", fmt.Sprintf("%s: Jobspec Invalid returning valid", name)) } }) diff --git a/cmd/cli/version/version.go b/cmd/cli/version/version.go index f3010d03db..d26f8b8d67 100644 --- a/cmd/cli/version/version.go +++ b/cmd/cli/version/version.go @@ -50,10 +50,8 @@ func NewCmd() *cobra.Command { Short: "Get the client and server version.", Args: cobra.NoArgs, PreRun: hook.ApplyPorcelainLogLevel, - Run: func(cmd *cobra.Command, _ []string) { - if err := runVersion(cmd, oV); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, _ []string) error { + return runVersion(cmd, oV) }, } versionCmd.Flags().BoolVar(&oV.ClientOnly, "client", oV.ClientOnly, "If true, shows client version only (no server required).") diff --git a/cmd/cli/wasm/wasm_run.go b/cmd/cli/wasm/wasm_run.go index d45049801f..cae8df28ae 100644 --- a/cmd/cli/wasm/wasm_run.go +++ b/cmd/cli/wasm/wasm_run.go @@ -102,10 +102,8 @@ func newRunCmd() *cobra.Command { Args: cobra.MinimumNArgs(1), PreRunE: hook.Chain(hook.ClientPreRunHooks, configflags.PreRun(wasmRunFlags)), PostRunE: hook.ClientPostRunHooks, - Run: func(cmd *cobra.Command, args []string) { - if err := runWasm(cmd, args, opts); err != nil { - util.Fatal(cmd, err, 1) - } + RunE: func(cmd *cobra.Command, args []string) error { + return runWasm(cmd, args, opts) }, } diff --git a/cmd/cli/wasm/wasm_run_test.go b/cmd/cli/wasm/wasm_run_test.go index df80802b10..484fbdefd5 100644 --- a/cmd/cli/wasm/wasm_run_test.go +++ b/cmd/cli/wasm/wasm_run_test.go @@ -10,7 +10,6 @@ import ( "github.com/stretchr/testify/suite" cmdtesting "github.com/bacalhau-project/bacalhau/cmd/testing" - "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/pkg/model" testutils "github.com/bacalhau-project/bacalhau/pkg/test/utils" ) @@ -20,7 +19,6 @@ type WasmRunSuite struct { } func TestWasmRunSuite(t 
*testing.T) {
-	util.Fatal = util.FakeFatalErrorHandler
 	suite.Run(t, new(WasmRunSuite))
 }
 
diff --git a/cmd/testing/base.go b/cmd/testing/base.go
index 20b655600d..823d366c08 100644
--- a/cmd/testing/base.go
+++ b/cmd/testing/base.go
@@ -37,7 +37,6 @@ type BaseSuite struct {
 // before each test
 func (s *BaseSuite) SetupTest() {
 	logger.ConfigureTestLogging(s.T())
-	util.Fatal = util.FakeFatalErrorHandler
 
 	computeConfig, err := node.NewComputeConfigWith(node.ComputeConfigParams{
 		JobSelectionPolicy: node.JobSelectionPolicy{
@@ -70,7 +69,6 @@ func (s *BaseSuite) SetupTest() {
 
 // After each test
 func (s *BaseSuite) TearDownTest() {
-	util.Fatal = util.FakeFatalErrorHandler
 	if s.Node != nil {
 		s.Node.CleanupManager.Cleanup(context.Background())
 	}
diff --git a/cmd/testing/basetls.go b/cmd/testing/basetls.go
index 04fda1d087..9d5867423b 100644
--- a/cmd/testing/basetls.go
+++ b/cmd/testing/basetls.go
@@ -6,7 +6,6 @@ import (
 	"path/filepath"
 	"time"
 
-	"github.com/bacalhau-project/bacalhau/cmd/util"
 	"github.com/bacalhau-project/bacalhau/pkg/bidstrategy/semantic"
 	"github.com/bacalhau-project/bacalhau/pkg/devstack"
 	noop_executor "github.com/bacalhau-project/bacalhau/pkg/executor/noop"
@@ -26,7 +25,6 @@ type BaseTLSSuite struct {
 // before each test
 func (s *BaseTLSSuite) SetupTest() {
 	logger.ConfigureTestLogging(s.T())
-	util.Fatal = util.FakeFatalErrorHandler
 
 	computeConfig, err := node.NewComputeConfigWith(node.ComputeConfigParams{
 		JobSelectionPolicy: node.JobSelectionPolicy{
@@ -71,7 +69,6 @@ func (s *BaseTLSSuite) SetupTest() {
 
 // After each test
 func (s *BaseTLSSuite) TearDownTest() {
-	util.Fatal = util.FakeFatalErrorHandler
 	if s.Node != nil {
 		s.Node.CleanupManager.Cleanup(context.Background())
 	}
diff --git a/go.work.sum b/go.work.sum
index eee5b88d45..32aae5d08e 100644
--- a/go.work.sum
+++ b/go.work.sum
@@ -1218,6 +1218,7 @@ github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b h1:ACGZRIr7HsgBKHsueQ1
 github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs=
 github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4 h1:/inchEIKaYC1Akx+H+gqO04wryn5h75LSazbRlnya1k=
 github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs=
+github.com/cncf/xds/go v0.0.0-20231109132714-523115ebc101/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs=
 github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I=
 github.com/cockroachdb/cockroach-go/v2 v2.1.1 h1:3XzfSMuUT0wBe1a3o5C0eOTcArhmmFAg2Jzh/7hhKqo=
 github.com/cockroachdb/cockroach-go/v2 v2.1.1/go.mod h1:7NtUnP6eK+l6k483WSYNrq3Kb23bWV10IRV1TyeSpwM=

From c304c422cbbf907193952b22c29126eecc8a14eb Mon Sep 17 00:00:00 2001
From: Ross Jones
Date: Wed, 10 Apr 2024 09:28:55 +0100
Subject: [PATCH 05/17] Optionally constrain to connected, approved compute
 nodes when selecting/ranking them (#3768)

When ranking nodes to send a job to, exclude those that are not
connected or not approved. By default, nodes MUST be both approved and
connected, but we have added options to the NodeSelector (in the form
of `orchestrator.NodeSelectionConstraints`) to allow for explicit
specification of whether we want them to be approved/connected.
```go
// Show top matching approved nodes whether they are connected
// or not
selectedNodes, err := b.nodeSelector.TopMatchingNodes(
	ctx,
	job,
	len(execs),
	&orchestrator.NodeSelectionConstraints{
		RequireApproval:  true,
		RequireConnected: false,
	},
)
```

To make this workable, we change the default node approval state to
APPROVED, requiring a config option to opt in to manual approval - it is
this way around to ensure there is no required change for users
upgrading to use this feature until they are ready to enforce it.

```yaml
# Require manual node approval (not default)
node:
  requester:
    manualnodeapproval: true

# Do not require manual node approval (default)
node:
  requester:
    manualnodeapproval: false
```
---
 cmd/cli/serve/util.go                         | 15 +++++++-
 pkg/config/types/generated_constants.go       |  1 +
 pkg/config/types/generated_viper_defaults.go  |  2 +
 pkg/config/types/requester.go                 |  5 +++
 pkg/config/types/storagetype_string.go        |  2 +-
 pkg/devstack/devstack.go                      |  4 ++
 pkg/devstack/option.go                        |  7 ++++
 pkg/node/config_defaults.go                   |  4 ++
 pkg/node/config_requester.go                  |  5 +++
 pkg/node/manager/node_manager.go              | 23 ++++++------
 pkg/node/node.go                              |  5 ++-
 pkg/node/requester.go                         |  4 ++
 pkg/orchestrator/interfaces.go                |  6 ++-
 pkg/orchestrator/mocks.go                     | 16 ++++----
 pkg/orchestrator/scheduler/batch_job_test.go  | 14 ++++++-
 .../scheduler/batch_service_job.go            | 11 +++++-
 pkg/orchestrator/scheduler/daemon_job.go      |  8 +++-
 pkg/orchestrator/scheduler/daemon_job_test.go | 10 ++---
 pkg/orchestrator/scheduler/ops_job.go         |  7 +++-
 pkg/orchestrator/scheduler/ops_job_test.go    |  2 +-
 .../scheduler/service_job_test.go             |  8 +++-
 pkg/orchestrator/scheduler/utils_test.go      |  4 +-
 .../selection/selector/node_selector.go       | 37 ++++++++++++++++---
 pkg/orchestrator/types.go                     |  5 +++
 pkg/publicapi/test/setup_test.go              |  1 +
 25 files changed, 161 insertions(+), 45 deletions(-)

diff --git a/cmd/cli/serve/util.go b/cmd/cli/serve/util.go
index 1c4216d8f1..e178f827a4 100644
--- a/cmd/cli/serve/util.go
+++ b/cmd/cli/serve/util.go
@@ -11,6 +11,7 @@ import (
 	"github.com/bacalhau-project/bacalhau/pkg/compute/store/boltdb"
 	"github.com/bacalhau-project/bacalhau/pkg/jobstore"
 	boltjobstore "github.com/bacalhau-project/bacalhau/pkg/jobstore/boltdb"
+	"github.com/bacalhau-project/bacalhau/pkg/models"
 	"github.com/bacalhau-project/bacalhau/pkg/util/idgen"
 	pkgerrors "github.com/pkg/errors"
 	"github.com/rs/zerolog/log"
@@ -91,7 +92,8 @@ func GetRequesterConfig(ctx context.Context, createJobStore bool) (node.Requeste
 			return node.RequesterConfig{}, pkgerrors.Wrapf(err, "failed to create job store")
 		}
 	}
-	return node.NewRequesterConfigWith(node.RequesterConfigParams{
+
+	requesterConfig, err := node.NewRequesterConfigWith(node.RequesterConfigParams{
 		JobDefaults: transformer.JobDefaults{
 			ExecutionTimeout: time.Duration(cfg.JobDefaults.ExecutionTimeout),
 		},
@@ -120,6 +122,17 @@ func GetRequesterConfig(ctx context.Context, createJobStore bool) (node.Requeste
 		JobStore:         jobStore,
 		DefaultPublisher: cfg.DefaultPublisher,
 	})
+	if err != nil {
+		return node.RequesterConfig{}, err
+	}
+
+	if cfg.ManualNodeApproval {
+		requesterConfig.DefaultApprovalState = models.NodeApprovals.PENDING
+	} else {
+		requesterConfig.DefaultApprovalState = models.NodeApprovals.APPROVED
+	}
+
+	return requesterConfig, nil
 }
 
 func getNodeType() (requester, compute bool, err error) {
diff --git a/pkg/config/types/generated_constants.go b/pkg/config/types/generated_constants.go
index b0f72d6fe6..ef397a06c3 100644
--- a/pkg/config/types/generated_constants.go
+++ b/pkg/config/types/generated_constants.go
@@ -142,6 +142,7 @@
const NodeRequesterControlPlaneSettings = "Node.Requester.ControlPlaneSettings" const NodeRequesterControlPlaneSettingsHeartbeatCheckFrequency = "Node.Requester.ControlPlaneSettings.HeartbeatCheckFrequency" const NodeRequesterControlPlaneSettingsHeartbeatTopic = "Node.Requester.ControlPlaneSettings.HeartbeatTopic" const NodeRequesterControlPlaneSettingsNodeDisconnectedAfter = "Node.Requester.ControlPlaneSettings.NodeDisconnectedAfter" +const NodeRequesterManualNodeApproval = "Node.Requester.ManualNodeApproval" const NodeBootstrapAddresses = "Node.BootstrapAddresses" const NodeDownloadURLRequestRetries = "Node.DownloadURLRequestRetries" const NodeDownloadURLRequestTimeout = "Node.DownloadURLRequestTimeout" diff --git a/pkg/config/types/generated_viper_defaults.go b/pkg/config/types/generated_viper_defaults.go index 503992dcc2..75217fd072 100644 --- a/pkg/config/types/generated_viper_defaults.go +++ b/pkg/config/types/generated_viper_defaults.go @@ -164,6 +164,7 @@ func SetDefaults(cfg BacalhauConfig, opts ...SetOption) { p.Viper.SetDefault(NodeRequesterControlPlaneSettingsHeartbeatCheckFrequency, cfg.Node.Requester.ControlPlaneSettings.HeartbeatCheckFrequency.AsTimeDuration()) p.Viper.SetDefault(NodeRequesterControlPlaneSettingsHeartbeatTopic, cfg.Node.Requester.ControlPlaneSettings.HeartbeatTopic) p.Viper.SetDefault(NodeRequesterControlPlaneSettingsNodeDisconnectedAfter, cfg.Node.Requester.ControlPlaneSettings.NodeDisconnectedAfter.AsTimeDuration()) + p.Viper.SetDefault(NodeRequesterManualNodeApproval, cfg.Node.Requester.ManualNodeApproval) p.Viper.SetDefault(NodeBootstrapAddresses, cfg.Node.BootstrapAddresses) p.Viper.SetDefault(NodeDownloadURLRequestRetries, cfg.Node.DownloadURLRequestRetries) p.Viper.SetDefault(NodeDownloadURLRequestTimeout, cfg.Node.DownloadURLRequestTimeout.AsTimeDuration()) @@ -361,6 +362,7 @@ func Set(cfg BacalhauConfig, opts ...SetOption) { p.Viper.Set(NodeRequesterControlPlaneSettingsHeartbeatCheckFrequency, cfg.Node.Requester.ControlPlaneSettings.HeartbeatCheckFrequency.AsTimeDuration()) p.Viper.Set(NodeRequesterControlPlaneSettingsHeartbeatTopic, cfg.Node.Requester.ControlPlaneSettings.HeartbeatTopic) p.Viper.Set(NodeRequesterControlPlaneSettingsNodeDisconnectedAfter, cfg.Node.Requester.ControlPlaneSettings.NodeDisconnectedAfter.AsTimeDuration()) + p.Viper.Set(NodeRequesterManualNodeApproval, cfg.Node.Requester.ManualNodeApproval) p.Viper.Set(NodeBootstrapAddresses, cfg.Node.BootstrapAddresses) p.Viper.Set(NodeDownloadURLRequestRetries, cfg.Node.DownloadURLRequestRetries) p.Viper.Set(NodeDownloadURLRequestTimeout, cfg.Node.DownloadURLRequestTimeout.AsTimeDuration()) diff --git a/pkg/config/types/requester.go b/pkg/config/types/requester.go index 42644d7299..b2217cf4f0 100644 --- a/pkg/config/types/requester.go +++ b/pkg/config/types/requester.go @@ -27,6 +27,11 @@ type RequesterConfig struct { DefaultPublisher string `yaml:"DefaultPublisher"` ControlPlaneSettings RequesterControlPlaneConfig `yaml:"ControlPlaneSettings"` + + // ManualNodeApproval is a flag that determines if nodes should be manually approved or not. + // By default, nodes are auto-approved to simplify upgrades, by setting this property to + // true, nodes will need to be manually approved before they are included in node selection. 
+ ManualNodeApproval bool `yaml:"ManualNodeApproval"` } type EvaluationBrokerConfig struct { diff --git a/pkg/config/types/storagetype_string.go b/pkg/config/types/storagetype_string.go index 50ad2a704e..268f6e281c 100644 --- a/pkg/config/types/storagetype_string.go +++ b/pkg/config/types/storagetype_string.go @@ -1,4 +1,4 @@ -// Code generated by "stringer -type=StorageType -linecomment"; DO NOT EDIT. +// Code generated by "stringer -type=StorageType"; DO NOT EDIT. package types diff --git a/pkg/devstack/devstack.go b/pkg/devstack/devstack.go index 6008f0ff42..cfdf3ba7b5 100644 --- a/pkg/devstack/devstack.go +++ b/pkg/devstack/devstack.go @@ -306,6 +306,10 @@ func Setup( } } + // Set the default approval state from the config provided, either PENDING if the user has + // chosen manual approval, or the default otherwise. + nodeConfig.RequesterNodeConfig.DefaultApprovalState = stackConfig.RequesterConfig.DefaultApprovalState + // Create dedicated store paths for each node err = setStorePaths(ctx, fsRepo, &nodeConfig) if err != nil { diff --git a/pkg/devstack/option.go b/pkg/devstack/option.go index 444bd0f9cd..9de52c60b3 100644 --- a/pkg/devstack/option.go +++ b/pkg/devstack/option.go @@ -7,6 +7,7 @@ import ( "github.com/rs/zerolog" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/node" "github.com/bacalhau-project/bacalhau/pkg/routing" ) @@ -121,6 +122,12 @@ func (o *DevStackConfig) Validate() error { return errs } +func WithAutoNodeApproval() ConfigOption { + return func(cfg *DevStackConfig) { + cfg.RequesterConfig.DefaultApprovalState = models.NodeApprovals.APPROVED + } +} + func WithNodeOverrides(overrides ...node.NodeConfig) ConfigOption { return func(cfg *DevStackConfig) { cfg.NodeOverrides = overrides diff --git a/pkg/node/config_defaults.go b/pkg/node/config_defaults.go index 8cd6edbe76..283c834f04 100644 --- a/pkg/node/config_defaults.go +++ b/pkg/node/config_defaults.go @@ -74,6 +74,8 @@ var DefaultRequesterConfig = RequesterConfigParams{ HeartbeatTopic: "heartbeat", NodeDisconnectedAfter: types.Duration(30 * time.Second), //nolint:gomnd }, + + DefaultApprovalState: models.NodeApprovals.APPROVED, } var TestRequesterConfig = RequesterConfigParams{ @@ -108,6 +110,8 @@ var TestRequesterConfig = RequesterConfigParams{ HeartbeatTopic: "heartbeat", NodeDisconnectedAfter: types.Duration(30 * time.Second), //nolint:gomnd }, + + DefaultApprovalState: models.NodeApprovals.APPROVED, } func getRequesterConfigParams() RequesterConfigParams { diff --git a/pkg/node/config_requester.go b/pkg/node/config_requester.go index e074c78720..e45df01f3d 100644 --- a/pkg/node/config_requester.go +++ b/pkg/node/config_requester.go @@ -53,6 +53,11 @@ type RequesterConfigParams struct { DefaultPublisher string + // When new nodes join the cluster, what state do they have? By default, APPROVED, and + // for tests, APPROVED. We will provide an option to set this to PENDING for production + // or for when operators are ready to control node approval. + DefaultApprovalState models.NodeApproval + ControlPlaneSettings types.RequesterControlPlaneConfig } diff --git a/pkg/node/manager/node_manager.go b/pkg/node/manager/node_manager.go index bc8eb5fc7d..a5233ce53a 100644 --- a/pkg/node/manager/node_manager.go +++ b/pkg/node/manager/node_manager.go @@ -24,23 +24,26 @@ const ( // also provides operations for querying and managing compute // node information. 
type NodeManager struct { - nodeInfo routing.NodeInfoStore - resourceMap *concurrency.StripedMap[models.Resources] - heartbeats *heartbeat.HeartbeatServer + nodeInfo routing.NodeInfoStore + resourceMap *concurrency.StripedMap[models.Resources] + heartbeats *heartbeat.HeartbeatServer + defaultApprovalState models.NodeApproval } type NodeManagerParams struct { - NodeInfo routing.NodeInfoStore - Heartbeats *heartbeat.HeartbeatServer + NodeInfo routing.NodeInfoStore + Heartbeats *heartbeat.HeartbeatServer + DefaultApprovalState models.NodeApproval } // NewNodeManager constructs a new node manager and returns a pointer // to the structure. func NewNodeManager(params NodeManagerParams) *NodeManager { return &NodeManager{ - resourceMap: concurrency.NewStripedMap[models.Resources](resourceMapLockCount), - nodeInfo: params.NodeInfo, - heartbeats: params.Heartbeats, + resourceMap: concurrency.NewStripedMap[models.Resources](resourceMapLockCount), + nodeInfo: params.NodeInfo, + heartbeats: params.Heartbeats, + defaultApprovalState: params.DefaultApprovalState, } } @@ -83,9 +86,7 @@ func (n *NodeManager) Register(ctx context.Context, request requests.RegisterReq }, nil } - // TODO: We will default to PENDING, but once we start filtering on NodeApprovals.APPROVED we will need to - // make a decision on how this is determined. - request.Info.Approval = models.NodeApprovals.PENDING + request.Info.Approval = n.defaultApprovalState if err := n.nodeInfo.Add(ctx, request.Info); err != nil { return nil, errors.Wrap(err, "failed to save nodeinfo during node registration") diff --git a/pkg/node/node.go b/pkg/node/node.go index 89447add8b..50944234e2 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -298,8 +298,9 @@ func NewNode( // to the network. Provide it with a mechanism to lookup (and enhance) // node info, and a reference to the heartbeat server if running NATS. nodeManager := manager.NewNodeManager(manager.NodeManagerParams{ - NodeInfo: tracingInfoStore, - Heartbeats: heartbeatSvr, + NodeInfo: tracingInfoStore, + Heartbeats: heartbeatSvr, + DefaultApprovalState: config.RequesterNodeConfig.DefaultApprovalState, }) // Start the nodemanager, ensuring it doesn't block the main thread and diff --git a/pkg/node/requester.go b/pkg/node/requester.go index 04cd4b0ddb..8c7319fe3d 100644 --- a/pkg/node/requester.go +++ b/pkg/node/requester.go @@ -80,6 +80,10 @@ func NewRequesterNode( }), ) + log.Ctx(ctx). + Info(). + Msgf("Nodes joining the cluster will be assigned approval state: %s", requesterConfig.DefaultApprovalState.String()) + // compute node ranker nodeRankerChain := ranking.NewChain() nodeRankerChain.Add( diff --git a/pkg/orchestrator/interfaces.go b/pkg/orchestrator/interfaces.go index b75032a0df..3e57367b83 100644 --- a/pkg/orchestrator/interfaces.go +++ b/pkg/orchestrator/interfaces.go @@ -102,11 +102,13 @@ type NodeRanker interface { type NodeSelector interface { // AllNodes returns all nodes in the network. AllNodes(ctx context.Context) ([]models.NodeInfo, error) + // AllMatchingNodes returns all nodes that match the job constrains and selection criteria. - AllMatchingNodes(ctx context.Context, job *models.Job) ([]models.NodeInfo, error) + AllMatchingNodes(ctx context.Context, job *models.Job, constraints *NodeSelectionConstraints) ([]models.NodeInfo, error) + // TopMatchingNodes return the top ranked desiredCount number of nodes that match job constraints // ordered in descending order based on their rank, or error if not enough nodes match. 
- TopMatchingNodes(ctx context.Context, job *models.Job, desiredCount int) ([]models.NodeInfo, error) + TopMatchingNodes(ctx context.Context, job *models.Job, desiredCount int, constraints *NodeSelectionConstraints) ([]models.NodeInfo, error) } type RetryStrategy interface { diff --git a/pkg/orchestrator/mocks.go b/pkg/orchestrator/mocks.go index 268f6704ea..04356a87a2 100644 --- a/pkg/orchestrator/mocks.go +++ b/pkg/orchestrator/mocks.go @@ -368,18 +368,18 @@ func (m *MockNodeSelector) EXPECT() *MockNodeSelectorMockRecorder { } // AllMatchingNodes mocks base method. -func (m *MockNodeSelector) AllMatchingNodes(ctx context.Context, job *models.Job) ([]models.NodeInfo, error) { +func (m *MockNodeSelector) AllMatchingNodes(ctx context.Context, job *models.Job, constraints *NodeSelectionConstraints) ([]models.NodeInfo, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "AllMatchingNodes", ctx, job) + ret := m.ctrl.Call(m, "AllMatchingNodes", ctx, job, constraints) ret0, _ := ret[0].([]models.NodeInfo) ret1, _ := ret[1].(error) return ret0, ret1 } // AllMatchingNodes indicates an expected call of AllMatchingNodes. -func (mr *MockNodeSelectorMockRecorder) AllMatchingNodes(ctx, job any) *gomock.Call { +func (mr *MockNodeSelectorMockRecorder) AllMatchingNodes(ctx, job, constraints any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AllMatchingNodes", reflect.TypeOf((*MockNodeSelector)(nil).AllMatchingNodes), ctx, job) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AllMatchingNodes", reflect.TypeOf((*MockNodeSelector)(nil).AllMatchingNodes), ctx, job, constraints) } // AllNodes mocks base method. @@ -398,18 +398,18 @@ func (mr *MockNodeSelectorMockRecorder) AllNodes(ctx any) *gomock.Call { } // TopMatchingNodes mocks base method. -func (m *MockNodeSelector) TopMatchingNodes(ctx context.Context, job *models.Job, desiredCount int) ([]models.NodeInfo, error) { +func (m *MockNodeSelector) TopMatchingNodes(ctx context.Context, job *models.Job, desiredCount int, constraints *NodeSelectionConstraints) ([]models.NodeInfo, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "TopMatchingNodes", ctx, job, desiredCount) + ret := m.ctrl.Call(m, "TopMatchingNodes", ctx, job, desiredCount, constraints) ret0, _ := ret[0].([]models.NodeInfo) ret1, _ := ret[1].(error) return ret0, ret1 } // TopMatchingNodes indicates an expected call of TopMatchingNodes. -func (mr *MockNodeSelectorMockRecorder) TopMatchingNodes(ctx, job, desiredCount any) *gomock.Call { +func (mr *MockNodeSelectorMockRecorder) TopMatchingNodes(ctx, job, desiredCount, constraints any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TopMatchingNodes", reflect.TypeOf((*MockNodeSelector)(nil).TopMatchingNodes), ctx, job, desiredCount) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TopMatchingNodes", reflect.TypeOf((*MockNodeSelector)(nil).TopMatchingNodes), ctx, job, desiredCount, constraints) } // MockRetryStrategy is a mock of RetryStrategy interface. 
diff --git a/pkg/orchestrator/scheduler/batch_job_test.go b/pkg/orchestrator/scheduler/batch_job_test.go index e9e715c1f2..598e98521e 100644 --- a/pkg/orchestrator/scheduler/batch_job_test.go +++ b/pkg/orchestrator/scheduler/batch_job_test.go @@ -303,10 +303,20 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_ShouldMarkJobAsFailed_NoRetry() } func (s *BatchJobSchedulerTestSuite) mockNodeSelection(job *models.Job, nodeInfos []models.NodeInfo, desiredCount int) { + constraints := &orchestrator.NodeSelectionConstraints{ + RequireApproval: false, + RequireConnected: false, + } + if len(nodeInfos) < desiredCount { - s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount).Return(nil, orchestrator.ErrNotEnoughNodes{}) + s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount, constraints).Return(nil, orchestrator.ErrNotEnoughNodes{}) } else { - s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount).Return(nodeInfos, nil) + s.nodeSelector.EXPECT().TopMatchingNodes( + gomock.Any(), + job, + desiredCount, + constraints, + ).Return(nodeInfos, nil) } } diff --git a/pkg/orchestrator/scheduler/batch_service_job.go b/pkg/orchestrator/scheduler/batch_service_job.go index 98be071685..13290f6cbf 100644 --- a/pkg/orchestrator/scheduler/batch_service_job.go +++ b/pkg/orchestrator/scheduler/batch_service_job.go @@ -153,7 +153,16 @@ func (b *BatchServiceJobScheduler) createMissingExecs( // placeExecs places the executions func (b *BatchServiceJobScheduler) placeExecs(ctx context.Context, execs execSet, job *models.Job) error { if len(execs) > 0 { - selectedNodes, err := b.nodeSelector.TopMatchingNodes(ctx, job, len(execs)) + // TODO: Remove the options once we are ready to enforce that only connected/approved nodes can be used + selectedNodes, err := b.nodeSelector.TopMatchingNodes( + ctx, + job, + len(execs), + &orchestrator.NodeSelectionConstraints{ + RequireApproval: false, + RequireConnected: false, + }, + ) if err != nil { return err } diff --git a/pkg/orchestrator/scheduler/daemon_job.go b/pkg/orchestrator/scheduler/daemon_job.go index eda155359a..79b3b21b7c 100644 --- a/pkg/orchestrator/scheduler/daemon_job.go +++ b/pkg/orchestrator/scheduler/daemon_job.go @@ -85,7 +85,13 @@ func (b *DaemonJobScheduler) Process(ctx context.Context, evaluation *models.Eva func (b *DaemonJobScheduler) createMissingExecs( ctx context.Context, job *models.Job, plan *models.Plan, existingExecs execSet) (execSet, error) { newExecs := execSet{} - nodes, err := b.nodeSelector.AllMatchingNodes(ctx, job) + + // Require approval when selecting nodes, but do not require them to be connected. 
+ nodes, err := b.nodeSelector.AllMatchingNodes( + ctx, + job, + &orchestrator.NodeSelectionConstraints{RequireApproval: true, RequireConnected: false}, + ) if err != nil { return newExecs, err } diff --git a/pkg/orchestrator/scheduler/daemon_job_test.go b/pkg/orchestrator/scheduler/daemon_job_test.go index 3a1f7e5252..69cb8033f0 100644 --- a/pkg/orchestrator/scheduler/daemon_job_test.go +++ b/pkg/orchestrator/scheduler/daemon_job_test.go @@ -52,7 +52,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { *mockNodeInfo(s.T(), nodeIDs[1]), *mockNodeInfo(s.T(), nodeIDs[2]), } - s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), gomock.Any()).Return(nodeInfos, nil) + s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), gomock.Any(), gomock.Any()).Return(nodeInfos, nil) matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, @@ -77,7 +77,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldNOTMarkJobAsCompleted() executions[1].ComputeState = models.NewExecutionState(models.ExecutionStateCompleted) // Simulate a completed execution s.jobStore.EXPECT().GetJob(gomock.Any(), job.ID).Return(*job, nil) s.jobStore.EXPECT().GetExecutions(gomock.Any(), jobstore.GetExecutionsOptions{JobID: job.ID}).Return(executions, nil) - s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), gomock.Any()).Return([]models.NodeInfo{}, nil) + s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), gomock.Any(), gomock.Any()).Return([]models.NodeInfo{}, nil) // Noop plan matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ @@ -100,7 +100,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhe *mockNodeInfo(s.T(), executions[1].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) - s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job).Return(nodeInfos, nil) + s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job, gomock.Any()).Return(nodeInfos, nil) matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, @@ -129,7 +129,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldNOTMarkJobAsFailed() { *mockNodeInfo(s.T(), executions[1].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) - s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job).Return(nodeInfos, nil) + s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job, gomock.Any()).Return(nodeInfos, nil) matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ Evaluation: evaluation, @@ -165,7 +165,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcessFail_NoMatchingNodes() { executions := []models.Execution{} // no executions yet s.jobStore.EXPECT().GetJob(gomock.Any(), job.ID).Return(*job, nil) s.jobStore.EXPECT().GetExecutions(gomock.Any(), jobstore.GetExecutionsOptions{JobID: job.ID}).Return(executions, nil) - s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job).Return([]models.NodeInfo{}, nil) + s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job, gomock.Any()).Return([]models.NodeInfo{}, nil) // Noop plan matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ diff --git a/pkg/orchestrator/scheduler/ops_job.go b/pkg/orchestrator/scheduler/ops_job.go index 8815f0cbd5..6783292d68 100644 --- a/pkg/orchestrator/scheduler/ops_job.go +++ b/pkg/orchestrator/scheduler/ops_job.go @@ -103,7 +103,12 @@ func (b *OpsJobScheduler) Process(ctx context.Context, evaluation *models.Evalua func (b *OpsJobScheduler) createMissingExecs( ctx context.Context, job *models.Job, plan *models.Plan) 
(execSet, error) { newExecs := execSet{} - nodes, err := b.nodeSelector.AllMatchingNodes(ctx, job) + nodes, err := b.nodeSelector.AllMatchingNodes( + ctx, + job, &orchestrator.NodeSelectionConstraints{ + RequireApproval: true, + RequireConnected: false, + }) if err != nil { return newExecs, err } diff --git a/pkg/orchestrator/scheduler/ops_job_test.go b/pkg/orchestrator/scheduler/ops_job_test.go index bcc9606d62..a32f7d6cdf 100644 --- a/pkg/orchestrator/scheduler/ops_job_test.go +++ b/pkg/orchestrator/scheduler/ops_job_test.go @@ -167,7 +167,7 @@ func (s *OpsJobSchedulerTestSuite) TestProcessFail_NoMatchingNodes() { } func (s *OpsJobSchedulerTestSuite) mockNodeSelection(job *models.Job, nodeInfos []models.NodeInfo) { - s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job).Return(nodeInfos, nil) + s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job, gomock.Any()).Return(nodeInfos, nil) } func mockOpsJob() (*models.Job, []models.Execution, *models.Evaluation) { diff --git a/pkg/orchestrator/scheduler/service_job_test.go b/pkg/orchestrator/scheduler/service_job_test.go index 755c23437c..48c9fd6fc0 100644 --- a/pkg/orchestrator/scheduler/service_job_test.go +++ b/pkg/orchestrator/scheduler/service_job_test.go @@ -321,10 +321,14 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_ShouldMarkJobAsFailed_NoRetry } func (s *ServiceJobSchedulerTestSuite) mockNodeSelection(job *models.Job, nodeInfos []models.NodeInfo, desiredCount int) { + constraints := &orchestrator.NodeSelectionConstraints{ + RequireApproval: false, + RequireConnected: false, + } if len(nodeInfos) < desiredCount { - s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount).Return(nil, orchestrator.ErrNotEnoughNodes{}) + s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount, constraints).Return(nil, orchestrator.ErrNotEnoughNodes{}) } else { - s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount).Return(nodeInfos, nil) + s.nodeSelector.EXPECT().TopMatchingNodes(gomock.Any(), job, desiredCount, constraints).Return(nodeInfos, nil) } } diff --git a/pkg/orchestrator/scheduler/utils_test.go b/pkg/orchestrator/scheduler/utils_test.go index cc8b243822..2845a20dbc 100644 --- a/pkg/orchestrator/scheduler/utils_test.go +++ b/pkg/orchestrator/scheduler/utils_test.go @@ -122,6 +122,8 @@ func (m PlanMatcher) String() string { func mockNodeInfo(t *testing.T, nodeID string) *models.NodeInfo { return &models.NodeInfo{ - NodeID: nodeID, + NodeID: nodeID, + Approval: models.NodeApprovals.APPROVED, + State: models.NodeStates.CONNECTED, } } diff --git a/pkg/orchestrator/selection/selector/node_selector.go b/pkg/orchestrator/selection/selector/node_selector.go index d209f75593..80cd1a2703 100644 --- a/pkg/orchestrator/selection/selector/node_selector.go +++ b/pkg/orchestrator/selection/selector/node_selector.go @@ -2,6 +2,7 @@ package selector import ( "context" + "errors" "sort" "github.com/rs/zerolog/log" @@ -34,8 +35,10 @@ func (n NodeSelector) AllNodes(ctx context.Context) ([]models.NodeInfo, error) { return n.nodeDiscoverer.ListNodes(ctx) } -func (n NodeSelector) AllMatchingNodes(ctx context.Context, job *models.Job) ([]models.NodeInfo, error) { - filteredNodes, _, err := n.rankAndFilterNodes(ctx, job) +func (n NodeSelector) AllMatchingNodes(ctx context.Context, + job *models.Job, + constraints *orchestrator.NodeSelectionConstraints) ([]models.NodeInfo, error) { + filteredNodes, _, err := n.rankAndFilterNodes(ctx, job, constraints) if err != nil { return nil, err } @@ -43,11 
+46,15 @@ func (n NodeSelector) AllMatchingNodes(ctx context.Context, job *models.Job) ([] nodeInfos := generic.Map(filteredNodes, func(nr orchestrator.NodeRank) models.NodeInfo { return nr.NodeInfo }) return nodeInfos, nil } -func (n NodeSelector) TopMatchingNodes(ctx context.Context, job *models.Job, desiredCount int) ([]models.NodeInfo, error) { - possibleNodes, rejectedNodes, err := n.rankAndFilterNodes(ctx, job) + +func (n NodeSelector) TopMatchingNodes(ctx context.Context, + job *models.Job, desiredCount int, + constraints *orchestrator.NodeSelectionConstraints) ([]models.NodeInfo, error) { + possibleNodes, rejectedNodes, err := n.rankAndFilterNodes(ctx, job, constraints) if err != nil { return nil, err } + if len(possibleNodes) < desiredCount { // TODO: evaluate if we should run the job if some nodes where found err = orchestrator.NewErrNotEnoughNodes(desiredCount, append(possibleNodes, rejectedNodes...)) @@ -63,16 +70,34 @@ func (n NodeSelector) TopMatchingNodes(ctx context.Context, job *models.Job, des return selectedInfos, nil } -func (n NodeSelector) rankAndFilterNodes(ctx context.Context, job *models.Job) (selected, rejected []orchestrator.NodeRank, err error) { +func (n NodeSelector) rankAndFilterNodes(ctx context.Context, + job *models.Job, + constraints *orchestrator.NodeSelectionConstraints) (selected, rejected []orchestrator.NodeRank, err error) { listed, err := n.nodeDiscoverer.ListNodes(ctx) if err != nil { return nil, nil, err } nodeIDs := lo.Filter(listed, func(nodeInfo models.NodeInfo, index int) bool { - return nodeInfo.NodeType == models.NodeTypeCompute + if nodeInfo.NodeType != models.NodeTypeCompute { + return false + } + + if constraints.RequireApproval && nodeInfo.Approval != models.NodeApprovals.APPROVED { + return false + } + + if constraints.RequireConnected && nodeInfo.State != models.NodeStates.CONNECTED { + return false + } + + return true }) + if len(nodeIDs) == 0 { + return nil, nil, errors.New("unable to find any connected and approved nodes") + } + rankedNodes, err := n.nodeRanker.RankNodes(ctx, *job, nodeIDs) if err != nil { return nil, nil, err diff --git a/pkg/orchestrator/types.go b/pkg/orchestrator/types.go index 1a5aad7cc8..3ee4436835 100644 --- a/pkg/orchestrator/types.go +++ b/pkg/orchestrator/types.go @@ -78,3 +78,8 @@ func (r NodeRank) MarshalZerologObject(e *zerolog.Event) { type RetryRequest struct { JobID string } + +type NodeSelectionConstraints struct { + RequireConnected bool + RequireApproval bool +} diff --git a/pkg/publicapi/test/setup_test.go b/pkg/publicapi/test/setup_test.go index 699e6e2a1f..89aa11fa66 100644 --- a/pkg/publicapi/test/setup_test.go +++ b/pkg/publicapi/test/setup_test.go @@ -37,6 +37,7 @@ func (s *ServerSuite) SetupSuite() { devstack.WithNumberOfRequesterOnlyNodes(1), devstack.WithNumberOfComputeOnlyNodes(1), devstack.WithDependencyInjector(devstack.NewNoopNodeDependencyInjector()), + devstack.WithAutoNodeApproval(), ) s.requesterNode = stack.Nodes[0] From 67eaa11beb2ece05f915f2b52f9bab2f8185f666 Mon Sep 17 00:00:00 2001 From: Simon Worthington Date: Wed, 10 Apr 2024 18:49:54 +1000 Subject: [PATCH 06/17] Print command line executable when displaying additional commands (#3770) This was removed in #2560 but it's really helpful for users who aren't using the executable through the name we expect (e.g. because of a shell alias) or for developers who are running a local build. 
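As a rough illustration of the behaviour this restores (a minimal sketch, not
code from this patch; the `jobID` value is hypothetical), `os.Args[0]` carries
whatever name the binary was invoked under, so hints built from it stay
copy-pasteable for renamed or locally built executables:

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// os.Args[0] is the name the program was invoked under, so if the
	// binary is run as ./bin/bacalhau-dev or via a symlink, the printed
	// hint matches what the user actually typed.
	jobID := "j-1234" // hypothetical job ID, for illustration only
	fmt.Println("To get more details about the run, execute:")
	fmt.Println("\t" + os.Args[0] + " job describe " + jobID)
}
```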
---
 cmd/util/printer/print.go        | 4 ++--
 cmd/util/printer/print_legacy.go | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmd/util/printer/print.go b/cmd/util/printer/print.go
index cefacbfcd8..6eea738af7 100644
--- a/cmd/util/printer/print.go
+++ b/cmd/util/printer/print.go
@@ -102,11 +102,11 @@ func PrintJobExecution(
 	if !quiet {
 		cmd.Println()
 		cmd.Println("To get more details about the run, execute:")
-		cmd.Println("\tbacalhau job describe " + jobID)
+		cmd.Println("\t" + os.Args[0] + " job describe " + jobID)
 
 		cmd.Println()
 		cmd.Println("To get more details about the run executions, execute:")
-		cmd.Println("\tbacalhau job executions " + jobID)
+		cmd.Println("\t" + os.Args[0] + " job executions " + jobID)
 	}
 
 	return nil
diff --git a/cmd/util/printer/print_legacy.go b/cmd/util/printer/print_legacy.go
index 7c33bc7617..d83b617162 100644
--- a/cmd/util/printer/print_legacy.go
+++ b/cmd/util/printer/print_legacy.go
@@ -129,11 +129,11 @@ func PrintJobExecutionLegacy(
 	hasResults := slices.ContainsFunc(js.Executions, func(e model.ExecutionState) bool { return e.RunOutput != nil })
 
 	if !quiet && hasResults {
-		cmd.Printf("\nTo download the results, execute:\n\tbacalhau get %s\n", j.ID())
+		cmd.Printf("\nTo download the results, execute:\n\t"+os.Args[0]+" get %s\n", j.ID())
 	}
 
 	if !quiet {
-		cmd.Printf("\nTo get more details about the run, execute:\n\tbacalhau describe %s\n", j.ID())
+		cmd.Printf("\nTo get more details about the run, execute:\n\t"+os.Args[0]+" describe %s\n", j.ID())
 	}
 
 	if hasResults && runtimeSettings.AutoDownloadResults {

From cae6305543f7c20a786c21fe65fc7431677b3229 Mon Sep 17 00:00:00 2001
From: Ross Jones
Date: Thu, 11 Apr 2024 16:09:46 +0100
Subject: [PATCH 07/17] Deprecates some IPFS flags (#3778)

In order to smooth the migration from an embedded IPFS node to a
self-hosted IPFS node, we will deprecate the flags used for the internal
node, rather than removing the functionality in one release. This will
allow the functionality to be removed over two or more minor version
releases.

The Definition type has two new fields, a boolean Deprecated field and
a string DeprecatedMessage field. If the former is set then we mark the
flag as deprecated and show the second field.
---
 cmd/util/flags/configflags/ipfs.go     | 49 +++++++++++++++++---------
 cmd/util/flags/configflags/register.go |  9 ++++-
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/cmd/util/flags/configflags/ipfs.go b/cmd/util/flags/configflags/ipfs.go
index 044490beb3..0521732358 100644
--- a/cmd/util/flags/configflags/ipfs.go
+++ b/cmd/util/flags/configflags/ipfs.go
@@ -2,6 +2,9 @@ package configflags
 
 import "github.com/bacalhau-project/bacalhau/pkg/config/types"
 
+const ipfsEmbeddedDeprecationMessage = "The embedded IPFS node will be removed in a future version" +
+	"in favour of using --ipfs-connect and a self-hosted IPFS node"
+
 var IPFSFlags = []Definition{
 	{
 		FlagName:             "ipfs-swarm-addrs",
@@ -9,6 +12,8 @@ var IPFSFlags = []Definition{
 		DefaultValue:         Default.Node.IPFS.SwarmAddresses,
 		Description:          "IPFS multiaddress to connect the in-process IPFS node to - cannot be used with --ipfs-connect.",
 		EnvironmentVariables: []string{"BACALHAU_IPFS_SWARM_ADDRESSES"},
+		Deprecated:           true,
+		DeprecatedMessage:    ipfsEmbeddedDeprecationMessage,
 	},
 	{
 		FlagName:             "ipfs-swarm-key",
@@ -31,6 +36,8 @@ var IPFSFlags = []Definition{
 			"cannot be used with --ipfs-connect. " +
 			"Use \"--private-internal-ipfs=false\" to disable. 
" + "To persist a local Ipfs node, set BACALHAU_SERVE_IPFS_PATH to a valid path.", + Deprecated: true, + DeprecatedMessage: ipfsEmbeddedDeprecationMessage, }, { FlagName: "ipfs-serve-path", @@ -38,29 +45,39 @@ var IPFSFlags = []Definition{ DefaultValue: Default.Node.IPFS.ServePath, Description: "path local Ipfs node will persist data to", EnvironmentVariables: []string{"BACALHAU_SERVE_IPFS_PATH"}, + Deprecated: true, + DeprecatedMessage: ipfsEmbeddedDeprecationMessage, }, { - FlagName: "ipfs-profile", - ConfigPath: types.NodeIPFSProfile, - DefaultValue: Default.Node.IPFS.Profile, - Description: "profile for internal IPFS node", + FlagName: "ipfs-profile", + ConfigPath: types.NodeIPFSProfile, + DefaultValue: Default.Node.IPFS.Profile, + Description: "profile for internal IPFS node", + Deprecated: true, + DeprecatedMessage: ipfsEmbeddedDeprecationMessage, }, { - FlagName: "ipfs-swarm-listen-addresses", - ConfigPath: types.NodeIPFSSwarmListenAddresses, - DefaultValue: Default.Node.IPFS.SwarmListenAddresses, - Description: "addresses the internal IPFS node will listen on for swarm connections", + FlagName: "ipfs-swarm-listen-addresses", + ConfigPath: types.NodeIPFSSwarmListenAddresses, + DefaultValue: Default.Node.IPFS.SwarmListenAddresses, + Description: "addresses the internal IPFS node will listen on for swarm connections", + Deprecated: true, + DeprecatedMessage: ipfsEmbeddedDeprecationMessage, }, { - FlagName: "ipfs-gateway-listen-addresses", - ConfigPath: types.NodeIPFSGatewayListenAddresses, - DefaultValue: Default.Node.IPFS.GatewayListenAddresses, - Description: "addresses the internal IPFS node will listen on for gateway connections", + FlagName: "ipfs-gateway-listen-addresses", + ConfigPath: types.NodeIPFSGatewayListenAddresses, + DefaultValue: Default.Node.IPFS.GatewayListenAddresses, + Description: "addresses the internal IPFS node will listen on for gateway connections", + Deprecated: true, + DeprecatedMessage: ipfsEmbeddedDeprecationMessage, }, { - FlagName: "ipfs-api-listen-addresses", - ConfigPath: types.NodeIPFSAPIListenAddresses, - DefaultValue: Default.Node.IPFS.APIListenAddresses, - Description: "addresses the internal IPFS node will listen on for API connections", + FlagName: "ipfs-api-listen-addresses", + ConfigPath: types.NodeIPFSAPIListenAddresses, + DefaultValue: Default.Node.IPFS.APIListenAddresses, + Description: "addresses the internal IPFS node will listen on for API connections", + Deprecated: true, + DeprecatedMessage: ipfsEmbeddedDeprecationMessage, }, } diff --git a/cmd/util/flags/configflags/register.go b/cmd/util/flags/configflags/register.go index cdbf09c8f1..de5c33e4c7 100644 --- a/cmd/util/flags/configflags/register.go +++ b/cmd/util/flags/configflags/register.go @@ -37,6 +37,8 @@ type Definition struct { DefaultValue interface{} Description string EnvironmentVariables []string + Deprecated bool + DeprecatedMessage string } // BindFlags binds flags from a command to Viper using the provided definitions. 
@@ -47,7 +49,7 @@ func BindFlags(cmd *cobra.Command, register map[string][]Definition) error {
 		// set the default value
 		viper.SetDefault(def.ConfigPath, def.DefaultValue)
 
-		// bind the flag to viper
+		// Bind the flag to viper
 		if err := viper.BindPFlag(def.ConfigPath, cmd.Flags().Lookup(def.FlagName)); err != nil {
 			return err
 		}
@@ -107,6 +109,11 @@ func RegisterFlags(cmd *cobra.Command, register map[string][]Definition) error {
 		default:
 			return fmt.Errorf("unhandled type: %T for flag %s", v, def.FlagName)
 		}
+
+		if def.Deprecated {
+			flag := fset.Lookup(def.FlagName)
+			flag.Deprecated = def.DeprecatedMessage
+		}
 	}
 	cmd.PersistentFlags().AddFlagSet(fset)
 }

From 97a4a6ae6b3882d504f568fa61a1970284b09acd Mon Sep 17 00:00:00 2001
From: Ross Jones
Date: Thu, 11 Apr 2024 17:12:22 +0100
Subject: [PATCH 08/17] Make it easier to get in progress jobs of a certain
 type (#3777)

Currently the in-progress index is just the ID of the job that is in
progress. This PR switches the in-progress index to a composite key
containing both the ID and the Type of the job. This allows for
filtering of active jobs of a certain type without having to
deserialize each entire job referenced in the index.

We may at a future date want to introduce a CompositeIndex type
alongside the current Index type.
---
 pkg/jobstore/boltdb/store.go      | 56 ++++++++++++++++++++++++++-----
 pkg/jobstore/boltdb/store_test.go | 19 ++++++++++-
 pkg/jobstore/mocks.go             | 41 ++++++++++++----------
 pkg/jobstore/types.go             |  5 +--
 pkg/requester/housekeeping.go     |  2 +-
 5 files changed, 93 insertions(+), 30 deletions(-)

diff --git a/pkg/jobstore/boltdb/store.go b/pkg/jobstore/boltdb/store.go
index 213bca6f04..0beacdc41d 100644
--- a/pkg/jobstore/boltdb/store.go
+++ b/pkg/jobstore/boltdb/store.go
@@ -535,17 +535,18 @@ func (b *BoltJobStore) GetExecutions(ctx context.Context, options jobstore.GetEx
 	return state, err
 }
 
-// GetInProgressJobs gets a list of the currently in-progress jobs
-func (b *BoltJobStore) GetInProgressJobs(ctx context.Context) ([]models.Job, error) {
+// GetInProgressJobs gets a list of the currently in-progress jobs, if a job type is supplied then
+// only jobs of that type will be retrieved
+func (b *BoltJobStore) GetInProgressJobs(ctx context.Context, jobType string) ([]models.Job, error) {
 	var infos []models.Job
 	err := b.database.View(func(tx *bolt.Tx) (err error) {
-		infos, err = b.getInProgressJobs(tx)
+		infos, err = b.getInProgressJobs(tx, jobType)
 		return
 	})
 	return infos, err
 }
 
-func (b *BoltJobStore) getInProgressJobs(tx *bolt.Tx) ([]models.Job, error) {
+func (b *BoltJobStore) getInProgressJobs(tx *bolt.Tx, jobType string) ([]models.Job, error) {
 	var infos []models.Job
 	var keys [][]byte
@@ -555,7 +556,14 @@ func (b *BoltJobStore) getInProgressJobs(tx *bolt.Tx) ([]models.Job, error) {
 	}
 
 	for _, jobIDKey := range keys {
-		job, err := b.getJob(tx, string(jobIDKey))
+		k, typ := splitInProgressIndexKey(string(jobIDKey))
+		if jobType != "" && jobType != typ {
+			// If the user supplied a job type to filter on, and it doesn't match the job type
+			// then skip this job
+			continue
+		}
+
+		job, err := b.getJob(tx, k)
 		if err != nil {
 			return nil, err
 		}
@@ -565,6 +573,25 @@ func (b *BoltJobStore) getInProgressJobs(tx *bolt.Tx) ([]models.Job, error) {
 	return infos, nil
 }
 
+// splitInProgressIndexKey returns the job type and the job index from
+// the in-progress index key. If no delimiter is found, then this index
+// was created before this feature was implemented, and we are unable
+// to filter on its type so will return "" as the type.
+func splitInProgressIndexKey(key string) (string, string) {
+	parts := strings.Split(key, ":")
+	if len(parts) == 1 {
+		return key, ""
+	}
+
+	k, typ := parts[1], parts[0]
+	return k, typ
+}
+
+// createInProgressIndexKey will create a composite key for the in-progress index
+func createInProgressIndexKey(job *models.Job) string {
+	return fmt.Sprintf("%s:%s", job.Type, job.ID)
+}
+
 // GetJobHistory returns the job (and execution) history for the provided options
 func (b *BoltJobStore) GetJobHistory(ctx context.Context,
 	jobID string,
@@ -713,7 +740,9 @@ func (b *BoltJobStore) createJob(tx *bolt.Tx, job models.Job) error {
 		}
 	}
 
-	if err = b.inProgressIndex.Add(tx, jobIDKey); err != nil {
+	// Create a composite key for the in-progress index
+	jobkey := createInProgressIndexKey(&job)
+	if err = b.inProgressIndex.Add(tx, []byte(jobkey)); err != nil {
 		return err
 	}
 
@@ -760,7 +789,12 @@ func (b *BoltJobStore) deleteJob(tx *bolt.Tx, jobID string) error {
 		}
 	}
 
-	if err = b.inProgressIndex.Remove(tx, jobIDKey); err != nil {
+	// We'll remove the job from the in-progress index using just its ID in case
+	// it predates when we switched to composite keys.
+	_ = b.inProgressIndex.Remove(tx, []byte(job.ID))
+
+	compositeKey := createInProgressIndexKey(&job)
+	if err = b.inProgressIndex.Remove(tx, []byte(compositeKey)); err != nil {
 		return err
 	}
 
@@ -831,7 +865,13 @@ func (b *BoltJobStore) updateJobState(tx *bolt.Tx, request jobstore.UpdateJobSta
 	}
 
 	if job.IsTerminal() {
-		err = b.inProgressIndex.Remove(tx, []byte(request.JobID))
+		// Remove the job from the in-progress index, first checking for legacy items
+		// and then removing the composite. Once we are confident no legacy items
+		// are left in the old index, we can stick to just the composite key.
+		_ = b.inProgressIndex.Remove(tx, []byte(job.ID))
+
+		composite := createInProgressIndexKey(&job)
+		err = b.inProgressIndex.Remove(tx, []byte(composite))
 		if err != nil {
 			return err
 		}
diff --git a/pkg/jobstore/boltdb/store_test.go b/pkg/jobstore/boltdb/store_test.go
index 1a210d0564..2a2fb6c48f 100644
--- a/pkg/jobstore/boltdb/store_test.go
+++ b/pkg/jobstore/boltdb/store_test.go
@@ -45,6 +45,7 @@ func (s *BoltJobstoreTestSuite) SetupTest() {
 	jobFixtures := []struct {
 		id              string
+		jobType         string
 		client          string
 		tags            map[string]string
 		jobStates       []models.JobStateType
 		executionStates []models.ExecutionStateType
 	}{
 		{
 			id:     "110",
 			client: "client1",
+			jobType: "batch",
 			tags:            map[string]string{"gpu": "true", "fast": "true"},
 			jobStates:       []models.JobStateType{models.JobStateTypePending, models.JobStateTypeRunning, models.JobStateTypeStopped},
 			executionStates: []models.ExecutionStateType{models.ExecutionStateAskForBid, models.ExecutionStateAskForBidAccepted, models.ExecutionStateCancelled},
 		},
 		{
 			id:     "120",
 			client: "client2",
+			jobType: "batch",
 			tags:            map[string]string{},
 			jobStates:       []models.JobStateType{models.JobStateTypePending, models.JobStateTypeRunning, models.JobStateTypeStopped},
 			executionStates: []models.ExecutionStateType{models.ExecutionStateAskForBid, models.ExecutionStateAskForBidAccepted, models.ExecutionStateCancelled},
 		},
 		{
 			id:     "130",
 			client: "client3",
+			jobType: "batch",
 			tags:            map[string]string{"slow": "true", "max": "10"},
 			jobStates:       []models.JobStateType{models.JobStateTypePending, models.JobStateTypeRunning},
 			executionStates: []models.ExecutionStateType{models.ExecutionStateAskForBid,
models.ExecutionStateAskForBidAccepted}, @@ -74,6 +78,7 @@ func (s *BoltJobstoreTestSuite) SetupTest() { { id: "140", client: "client4", + jobType: "batch", tags: map[string]string{"max": "10"}, jobStates: []models.JobStateType{models.JobStateTypePending, models.JobStateTypeRunning}, executionStates: []models.ExecutionStateType{models.ExecutionStateAskForBid, models.ExecutionStateAskForBidAccepted}, @@ -81,6 +86,7 @@ func (s *BoltJobstoreTestSuite) SetupTest() { { id: "150", client: "client5", + jobType: "daemon", tags: map[string]string{"max": "10"}, jobStates: []models.JobStateType{models.JobStateTypePending, models.JobStateTypeRunning}, executionStates: []models.ExecutionStateType{models.ExecutionStateAskForBid, models.ExecutionStateAskForBidAccepted}, @@ -93,6 +99,7 @@ func (s *BoltJobstoreTestSuite) SetupTest() { []string{"bash", "-c", "echo hello"}) job.ID = fixture.id + job.Type = fixture.jobType job.Labels = fixture.tags job.Namespace = fixture.client err := s.store.CreateJob(s.ctx, *job) @@ -519,10 +526,20 @@ func (s *BoltJobstoreTestSuite) TestGetExecutions() { } func (s *BoltJobstoreTestSuite) TestInProgressJobs() { - infos, err := s.store.GetInProgressJobs(s.ctx) + infos, err := s.store.GetInProgressJobs(s.ctx, "") s.Require().NoError(err) s.Require().Equal(3, len(infos)) s.Require().Equal("130", infos[0].ID) + + infos, err = s.store.GetInProgressJobs(s.ctx, "batch") + s.Require().NoError(err) + s.Require().Equal(2, len(infos)) + s.Require().Equal("130", infos[0].ID) + + infos, err = s.store.GetInProgressJobs(s.ctx, "daemon") + s.Require().NoError(err) + s.Require().Equal(1, len(infos)) + s.Require().Equal("150", infos[0].ID) } func (s *BoltJobstoreTestSuite) TestShortIDs() { diff --git a/pkg/jobstore/mocks.go b/pkg/jobstore/mocks.go index 39ab9e0c92..fa6fce108c 100644 --- a/pkg/jobstore/mocks.go +++ b/pkg/jobstore/mocks.go @@ -1,5 +1,10 @@ // Code generated by MockGen. DO NOT EDIT. // Source: types.go +// +// Generated by this command: +// +// mockgen --source types.go --destination mocks.go --package jobstore +// // Package jobstore is a generated GoMock package. package jobstore @@ -44,7 +49,7 @@ func (m *MockStore) Close(ctx context.Context) error { } // Close indicates an expected call of Close. -func (mr *MockStoreMockRecorder) Close(ctx interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) Close(ctx any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockStore)(nil).Close), ctx) } @@ -58,7 +63,7 @@ func (m *MockStore) CreateEvaluation(ctx context.Context, eval models.Evaluation } // CreateEvaluation indicates an expected call of CreateEvaluation. -func (mr *MockStoreMockRecorder) CreateEvaluation(ctx, eval interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) CreateEvaluation(ctx, eval any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateEvaluation", reflect.TypeOf((*MockStore)(nil).CreateEvaluation), ctx, eval) } @@ -72,7 +77,7 @@ func (m *MockStore) CreateExecution(ctx context.Context, execution models.Execut } // CreateExecution indicates an expected call of CreateExecution. 
-func (mr *MockStoreMockRecorder) CreateExecution(ctx, execution interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) CreateExecution(ctx, execution any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateExecution", reflect.TypeOf((*MockStore)(nil).CreateExecution), ctx, execution) } @@ -86,7 +91,7 @@ func (m *MockStore) CreateJob(ctx context.Context, j models.Job) error { } // CreateJob indicates an expected call of CreateJob. -func (mr *MockStoreMockRecorder) CreateJob(ctx, j interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) CreateJob(ctx, j any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateJob", reflect.TypeOf((*MockStore)(nil).CreateJob), ctx, j) } @@ -100,7 +105,7 @@ func (m *MockStore) DeleteEvaluation(ctx context.Context, id string) error { } // DeleteEvaluation indicates an expected call of DeleteEvaluation. -func (mr *MockStoreMockRecorder) DeleteEvaluation(ctx, id interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) DeleteEvaluation(ctx, id any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteEvaluation", reflect.TypeOf((*MockStore)(nil).DeleteEvaluation), ctx, id) } @@ -114,7 +119,7 @@ func (m *MockStore) DeleteJob(ctx context.Context, jobID string) error { } // DeleteJob indicates an expected call of DeleteJob. -func (mr *MockStoreMockRecorder) DeleteJob(ctx, jobID interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) DeleteJob(ctx, jobID any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteJob", reflect.TypeOf((*MockStore)(nil).DeleteJob), ctx, jobID) } @@ -129,7 +134,7 @@ func (m *MockStore) GetEvaluation(ctx context.Context, id string) (models.Evalua } // GetEvaluation indicates an expected call of GetEvaluation. -func (mr *MockStoreMockRecorder) GetEvaluation(ctx, id interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) GetEvaluation(ctx, id any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetEvaluation", reflect.TypeOf((*MockStore)(nil).GetEvaluation), ctx, id) } @@ -144,24 +149,24 @@ func (m *MockStore) GetExecutions(ctx context.Context, options GetExecutionsOpti } // GetExecutions indicates an expected call of GetExecutions. -func (mr *MockStoreMockRecorder) GetExecutions(ctx, options interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) GetExecutions(ctx, options any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetExecutions", reflect.TypeOf((*MockStore)(nil).GetExecutions), ctx, options) } // GetInProgressJobs mocks base method. -func (m *MockStore) GetInProgressJobs(ctx context.Context) ([]models.Job, error) { +func (m *MockStore) GetInProgressJobs(ctx context.Context, jobType string) ([]models.Job, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetInProgressJobs", ctx) + ret := m.ctrl.Call(m, "GetInProgressJobs", ctx, jobType) ret0, _ := ret[0].([]models.Job) ret1, _ := ret[1].(error) return ret0, ret1 } // GetInProgressJobs indicates an expected call of GetInProgressJobs. 
-func (mr *MockStoreMockRecorder) GetInProgressJobs(ctx interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) GetInProgressJobs(ctx, jobType any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetInProgressJobs", reflect.TypeOf((*MockStore)(nil).GetInProgressJobs), ctx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetInProgressJobs", reflect.TypeOf((*MockStore)(nil).GetInProgressJobs), ctx, jobType) } // GetJob mocks base method. @@ -174,7 +179,7 @@ func (m *MockStore) GetJob(ctx context.Context, id string) (models.Job, error) { } // GetJob indicates an expected call of GetJob. -func (mr *MockStoreMockRecorder) GetJob(ctx, id interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) GetJob(ctx, id any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetJob", reflect.TypeOf((*MockStore)(nil).GetJob), ctx, id) } @@ -189,7 +194,7 @@ func (m *MockStore) GetJobHistory(ctx context.Context, jobID string, options Job } // GetJobHistory indicates an expected call of GetJobHistory. -func (mr *MockStoreMockRecorder) GetJobHistory(ctx, jobID, options interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) GetJobHistory(ctx, jobID, options any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetJobHistory", reflect.TypeOf((*MockStore)(nil).GetJobHistory), ctx, jobID, options) } @@ -204,7 +209,7 @@ func (m *MockStore) GetJobs(ctx context.Context, query JobQuery) (*JobQueryRespo } // GetJobs indicates an expected call of GetJobs. -func (mr *MockStoreMockRecorder) GetJobs(ctx, query interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) GetJobs(ctx, query any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetJobs", reflect.TypeOf((*MockStore)(nil).GetJobs), ctx, query) } @@ -218,7 +223,7 @@ func (m *MockStore) UpdateExecution(ctx context.Context, request UpdateExecution } // UpdateExecution indicates an expected call of UpdateExecution. -func (mr *MockStoreMockRecorder) UpdateExecution(ctx, request interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) UpdateExecution(ctx, request any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateExecution", reflect.TypeOf((*MockStore)(nil).UpdateExecution), ctx, request) } @@ -232,7 +237,7 @@ func (m *MockStore) UpdateJobState(ctx context.Context, request UpdateJobStateRe } // UpdateJobState indicates an expected call of UpdateJobState. -func (mr *MockStoreMockRecorder) UpdateJobState(ctx, request interface{}) *gomock.Call { +func (mr *MockStoreMockRecorder) UpdateJobState(ctx, request any) *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateJobState", reflect.TypeOf((*MockStore)(nil).UpdateJobState), ctx, request) } @@ -246,7 +251,7 @@ func (m *MockStore) Watch(ctx context.Context, types StoreWatcherType, events St } // Watch indicates an expected call of Watch. 
-func (mr *MockStoreMockRecorder) Watch(ctx, types, events interface{}) *gomock.Call {
+func (mr *MockStoreMockRecorder) Watch(ctx, types, events any) *gomock.Call {
 	mr.mock.ctrl.T.Helper()
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Watch", reflect.TypeOf((*MockStore)(nil).Watch), ctx, types, events)
 }
diff --git a/pkg/jobstore/types.go b/pkg/jobstore/types.go
index 46e6db1968..ba0b246752 100644
--- a/pkg/jobstore/types.go
+++ b/pkg/jobstore/types.go
@@ -56,8 +56,9 @@ type Store interface {
 	GetJobs(ctx context.Context, query JobQuery) (*JobQueryResponse, error)
 
 	// GetInProgressJobs retrieves all jobs that have a state that can be
-	// considered, 'in progress'. Failure generates an error.
-	GetInProgressJobs(ctx context.Context) ([]models.Job, error)
+	// considered 'in progress'. Failure generates an error. If the jobType
+	// is provided, only active jobs of that type will be returned.
+	GetInProgressJobs(ctx context.Context, jobType string) ([]models.Job, error)
 
 	// GetJobHistory retrieves the history for the specified job. The
 	// history returned is filtered by the contents of the provided
diff --git a/pkg/requester/housekeeping.go b/pkg/requester/housekeeping.go
index 80b91a1b34..37bed6d137 100644
--- a/pkg/requester/housekeeping.go
+++ b/pkg/requester/housekeeping.go
@@ -46,7 +46,7 @@ func (h *Housekeeping) housekeepingBackgroundTask() {
 	for {
 		select {
 		case <-ticker.C:
-			jobs, err := h.jobStore.GetInProgressJobs(ctx)
+			jobs, err := h.jobStore.GetInProgressJobs(ctx, "")
 			if err != nil {
 				log.Ctx(ctx).Err(err).Msg("failed to get in progress jobs")
 				continue

From af445045bfb497e295e9aa197a5eed5aae474dc7 Mon Sep 17 00:00:00 2001
From: Ross Jones
Date: Fri, 12 Apr 2024 09:14:44 +0100
Subject: [PATCH 09/17] Fixes a panic in the wasm executor when entrypoint is not known (#3779)

Currently we reference the result of `ExportedFunction`, which is
documented as returning nil when the name is not found. Instead it
returns a non-nil object with an inner nil field, which causes a panic
when `Call` is invoked.

This commit instead looks up the function before referencing it to
ensure it exists, returning an error early if not. This allows us to
remove the call to recover() that was handling the panic for us.

Fixes #3753
---
 pkg/executor/wasm/handler.go | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/pkg/executor/wasm/handler.go b/pkg/executor/wasm/handler.go
index 30188bfc90..2acb054f09 100644
--- a/pkg/executor/wasm/handler.go
+++ b/pkg/executor/wasm/handler.go
@@ -68,9 +68,15 @@ func (h *executionHandler) run(ctx context.Context) {
 			h.logger.Error().
 				Str("recover", fmt.Sprintf("%v", r)).
 				Msg("execution recovered from panic")
-			// TODO don't do this.
-			h.result = &models.RunCommandResult{}
+
+			// The recover was originally here for a bug we think is now fixed, but given
+			// the propensity for panics in this area, we're being extra-cautious and
+			// ensuring we can handle any future panics that arise.
+			h.result = executor.NewFailedResult(
+				fmt.Sprintf("WASM executor failed with an internal error: %v", r),
+			)
 		}
+
 		ActiveExecutions.Dec(ctx)
 	}()
@@ -167,7 +173,23 @@ func (h *executionHandler) run(ctx context.Context) {
 		return
 	}

-	// invoke the function entry point
+	// Calling instance.ExportedFunction with an invalid name does not return
+	// nil as documented; it returns a non-nil object whose internal function
+	// reference is nil, so calling Call() on it panics.
+	//
+	// To avoid this, we check the keys of the definitions map first; if the
+	// entry point is not present, we return an error early instead of
+	// referencing a function that does not exist.
+	definitions := instance.ExportedFunctionDefinitions()
+	_, found := definitions[h.arguments.EntryPoint]
+
+	if !found {
+		h.result = executor.NewFailedResult(
+			fmt.Sprintf("unable to find the entrypoint '%s' in the WASM module", h.arguments.EntryPoint),
+		)
+		return
+	}
+
 	entryFunc := instance.ExportedFunction(h.arguments.EntryPoint)

 	h.logger.Info().Msg("running execution")

From f453e32b0c77d9a2345cd5892f7702c8afd7ee30 Mon Sep 17 00:00:00 2001
From: Simon Worthington
Date: Mon, 15 Apr 2024 08:40:09 +1000
Subject: [PATCH 10/17] Structured events in place of comment strings (#3771)

This PR implements the structure proposed in [Improve Error Reporting](https://www.notion.so/expanso/Improve-Error-Reporting-c19f5516822b47de980d76ff43ff4bbe) as a first step towards providing richer progress reporting during job execution. The "tl;dr" is that we will move to using an event stream for reporting progress on jobs. The event stream will help users understand the progress of their job and give them extra context about any failures that occur. This will allow us to show a richer view in the UI, e.g. the user will be able to see "downloading Docker image" instead of just "job running".

To achieve this vision, we need to build this infrastructure for generating events, recording them in the job history, and displaying them (done), replace the orchestrator/compute callbacks mechanism (later PR), and then give lower-level components the ability to push events (later PR).

This PR also includes some facility for structured error reporting. This allows low-level components to throw structured errors that provide a richer event than the ones generated automatically. This is used in e.g. the ErrNotEnoughNodes case and the docker ImageUnavailable case so far. This gives us the ability to output hints as part of our messages back to the user:

![carbon](https://github.com/bacalhau-project/bacalhau/assets/4951176/04e63f09-4f7e-44ed-883a-85fb4b9a7f2d)

The output of `describe` now shows a split history between the overall job and its executions:

```
% ./bin/darwin/arm64/bacalhau job describe j-66081fef-8dd2-48de-9997-bbe23a62f0be
ID            = j-66081fef-8dd2-48de-9997-bbe23a62f0be
Name          = Docker Job
Namespace     = default
Type          = batch
State         = Completed
Count         = 1
Created Time  = 2024-04-10 06:56:26
Modified Time = 2024-04-10 06:56:29
Version       = 0

Summary
Completed = 1

Job History
 TIME       REV.  STATE      TOPIC       EVENT          DETAILS
 0s         1     Pending    Submission  Job submitted
 2.618376s  2     Running
 2.840423s  3     Completed

Executions
 ID          NODE ID     STATE      DESIRED  REV.  CREATED  MODIFIED  COMMENT
 e-5886e01f  n-ffc3e455  Completed  Stopped  6     8s ago   5s ago    Accepted job

Execution e-5886e01f History
 TIME       REV.  STATE              TOPIC            EVENT         DETAILS
 0s         1     New
 8.165ms    2     AskForBid
 2.569966s  3     AskForBidAccepted  Requesting node  Accepted job  FailsExecution: false
                                                                    IsError: false
                                                                    Retryable: false
 2.590902s  4     AskForBidAccepted
 2.613668s  5     BidAccepted
 2.803923s  6     Completed

Standard Output
15
```

Resolves https://github.com/bacalhau-project/expanso-planning/issues/693.
Resolves https://github.com/bacalhau-project/expanso-planning/issues/694.
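As a rough, self-contained sketch of the error-to-event flow described above — `hintable`, `diskFullError`, and `eventFromError` below are illustrative stand-ins for the real `models.EventFromError` helper, the `models.DetailsKey*` constants, and hint-carrying errors like docker's `ImageUnavailableError` in this diff, not the actual implementation:

```
package main

import "fmt"

// hintable is an assumed shape for errors that carry a user-facing hint,
// modelled on the Hint() method that docker.ImageUnavailableError exposes.
type hintable interface {
	error
	Hint() string
}

// diskFullError is a hypothetical low-level failure from a compute node.
type diskFullError struct{ path string }

func (e diskFullError) Error() string { return "not enough disk space at " + e.path }
func (e diskFullError) Hint() string {
	return "free up disk space or reduce the job's disk requirements"
}

// eventFromError is a stand-in for models.EventFromError: it turns an error
// into event details, copying the hint when the error provides one.
func eventFromError(topic string, err error) map[string]string {
	details := map[string]string{
		"Topic":   topic,
		"Message": err.Error(),
		"IsError": "true",
	}
	if h, ok := err.(hintable); ok {
		details["Hint"] = h.Hint() // rendered as the green "* Hint:" line in describe
	}
	return details
}

func main() {
	fmt.Println(eventFromError("Running Execution", diskFullError{path: "/data"}))
}
```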
### TODO in this PR - [x] Add more documentation - [x] Sort execution histories by time DESC so that most relevant execution is first - [x] Do some more examples of using structured errors from compute node components --------- Co-authored-by: Ross Jones Co-authored-by: Walid Baruni --- cmd/cli/docker/docker_run_test.go | 2 - cmd/cli/job/describe.go | 59 +++++++- cmd/cli/job/executions.go | 2 +- cmd/cli/job/history.go | 95 ++++++++----- cmd/util/output/output.go | 17 +++ cmd/util/printer/print.go | 112 +++++++++++++-- go.mod | 2 +- pkg/compute/bidder.go | 69 +++++----- pkg/compute/events.go | 30 ++++ pkg/compute/executor.go | 13 +- pkg/compute/executor_buffer.go | 12 +- pkg/compute/types.go | 6 +- pkg/config/getters.go | 9 +- pkg/docker/docker.go | 50 ++++++- .../bidstrategy/semantic/image_platform.go | 34 ++--- pkg/executor/docker/executor.go | 3 +- pkg/jobstore/boltdb/store.go | 34 ++--- pkg/jobstore/boltdb/store_test.go | 27 ++-- pkg/jobstore/mocks.go | 16 +-- pkg/jobstore/types.go | 8 +- pkg/models/error.go | 115 ++++++++++++++++ pkg/models/error_test.go | 81 +++++++++++ pkg/models/event.go | 130 ++++++++++++++++++ pkg/models/event_test.go | 124 +++++++++++++++++ pkg/models/execution.go | 7 + pkg/models/job_history.go | 15 ++ pkg/models/plan.go | 12 +- pkg/models/plan_test.go | 4 +- pkg/orchestrator/endpoint.go | 20 ++- pkg/orchestrator/errors.go | 21 ++- pkg/orchestrator/events.go | 72 ++++++++++ pkg/orchestrator/planner/compute_forwarder.go | 2 +- .../planner/compute_forwarder_test.go | 1 - pkg/orchestrator/planner/event_emitter.go | 5 +- pkg/orchestrator/planner/logging_planner.go | 15 +- pkg/orchestrator/planner/state_updater.go | 7 +- .../planner/state_updater_test.go | 13 +- pkg/orchestrator/planner/utils_test.go | 16 +-- .../scheduler/batch_service_job.go | 21 ++- pkg/orchestrator/scheduler/constants.go | 15 -- pkg/orchestrator/scheduler/daemon_job.go | 4 +- pkg/orchestrator/scheduler/ops_job.go | 12 +- pkg/orchestrator/scheduler/types.go | 15 +- pkg/orchestrator/scheduler/types_test.go | 15 -- .../selection/ranking/features.go | 7 +- pkg/orchestrator/selection/ranking/fixed.go | 5 +- pkg/orchestrator/selection/ranking/labels.go | 7 +- .../selection/ranking/max_usage.go | 7 +- .../selection/ranking/min_version.go | 7 +- .../selection/ranking/previous_executions.go | 3 + pkg/orchestrator/types.go | 11 ++ pkg/requester/endpoint.go | 10 +- pkg/test/logstream/stream_address_test.go | 4 +- pkg/test/requester/retries_test.go | 3 +- 54 files changed, 1109 insertions(+), 297 deletions(-) create mode 100644 pkg/compute/events.go create mode 100644 pkg/models/error.go create mode 100644 pkg/models/error_test.go create mode 100644 pkg/models/event.go create mode 100644 pkg/models/event_test.go create mode 100644 pkg/orchestrator/events.go delete mode 100644 pkg/orchestrator/scheduler/constants.go diff --git a/cmd/cli/docker/docker_run_test.go b/cmd/cli/docker/docker_run_test.go index 14f14a3d64..886320fd83 100644 --- a/cmd/cli/docker/docker_run_test.go +++ b/cmd/cli/docker/docker_run_test.go @@ -791,8 +791,6 @@ func (s *DockerRunSuite) TestRun_InvalidImage() { s.Require().NoError(err) s.T().Log(info) - s.Len(info.State.Executions, 1) - s.Equal(model.ExecutionStateAskForBidRejected, info.State.Executions[0].State) s.Contains(info.State.Executions[0].Status, `Could not inspect image "@" - could be due to repo/image not existing`) } diff --git a/cmd/cli/job/describe.go b/cmd/cli/job/describe.go index 9eebe1e29f..736291370a 100644 --- a/cmd/cli/job/describe.go +++ b/cmd/cli/job/describe.go @@ 
-1,12 +1,17 @@ package job import ( + "cmp" "fmt" + "slices" "time" "github.com/bacalhau-project/bacalhau/pkg/lib/collections" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/util/idgen" + "github.com/jedib0t/go-pretty/v6/table" + "github.com/jedib0t/go-pretty/v6/text" + "github.com/samber/lo" "github.com/spf13/cobra" "k8s.io/kubectl/pkg/util/i18n" @@ -65,7 +70,7 @@ func (o *DescribeOptions) run(cmd *cobra.Command, args []string) error { jobID := args[0] response, err := util.GetAPIClientV2(cmd).Jobs().Get(ctx, &apimodels.GetJobRequest{ JobID: jobID, - Include: "executions", + Include: "executions,history", }) if err != nil { @@ -85,12 +90,39 @@ func (o *DescribeOptions) run(cmd *cobra.Command, args []string) error { // TODO: #520 rename Executions.Executions to Executions.Items executions = response.Executions.Executions } + // Show most relevant execution first: sort by time DESC + slices.SortFunc(executions, func(a, b *models.Execution) int { + return cmp.Compare(b.CreateTime, a.CreateTime) + }) + + var history []*models.JobHistory + if response.History != nil { + history = response.History.History + } o.printHeaderData(cmd, job) o.printExecutionsSummary(cmd, executions) + + jobHistory := lo.Filter(history, func(entry *models.JobHistory, _ int) bool { + return entry.Type == models.JobHistoryTypeJobLevel + }) + if err = o.printHistory(cmd, "Job", jobHistory); err != nil { + util.Fatal(cmd, fmt.Errorf("failed to write job history: %w", err), 1) + } + if err = o.printExecutions(cmd, executions); err != nil { return fmt.Errorf("failed to write job executions %s: %w", jobID, err) } + + for _, execution := range executions { + executionHistory := lo.Filter(history, func(item *models.JobHistory, _ int) bool { + return item.ExecutionID == execution.ID + }) + if err = o.printHistory(cmd, "Execution "+idgen.ShortUUID(execution.ID), executionHistory); err != nil { + util.Fatal(cmd, fmt.Errorf("failed to write execution history for %s: %w", execution.ID, err), 1) + } + } + o.printOutputs(cmd, executions) return nil @@ -156,6 +188,31 @@ func (o *DescribeOptions) printExecutions(cmd *cobra.Command, executions []*mode return output.Output(cmd, executionCols, tableOptions, executions) } +func (o *DescribeOptions) printHistory(cmd *cobra.Command, label string, history []*models.JobHistory) error { + if len(history) < 1 { + return nil + } + + timeCol := output.TableColumn[*models.JobHistory]{ + ColumnConfig: table.ColumnConfig{Name: historyTimeCol.ColumnConfig.Name, WidthMax: 20, WidthMaxEnforcer: text.WrapText}, + Value: func(h *models.JobHistory) string { return h.Occurred().Format(time.DateTime) }, + } + + tableOptions := output.OutputOptions{ + Format: output.TableFormat, + NoStyle: true, + } + jobHistoryCols := []output.TableColumn[*models.JobHistory]{ + timeCol, + historyRevisionCol, + historyStateCol, + historyTopicCol, + historyEventCol, + } + output.Bold(cmd, fmt.Sprintf("\n%s History\n", label)) + return output.Output(cmd, jobHistoryCols, tableOptions, history) +} + func (o *DescribeOptions) printOutputs(cmd *cobra.Command, executions []*models.Execution) { outputs := make(map[string]string) for _, e := range executions { diff --git a/cmd/cli/job/executions.go b/cmd/cli/job/executions.go index 699b333009..3e71219612 100644 --- a/cmd/cli/job/executions.go +++ b/cmd/cli/job/executions.go @@ -97,7 +97,7 @@ var ( Value: func(e *models.Execution) string { return strconv.FormatUint(e.Revision, 10) }, } executionColumnState = 
output.TableColumn[*models.Execution]{ - ColumnConfig: table.ColumnConfig{Name: "State", WidthMax: 10, WidthMaxEnforcer: text.WrapText}, + ColumnConfig: table.ColumnConfig{Name: "State", WidthMax: 17, WidthMaxEnforcer: text.WrapText}, Value: func(e *models.Execution) string { return e.ComputeState.StateType.String() }, } executionColumnDesired = output.TableColumn[*models.Execution]{ diff --git a/cmd/cli/job/history.go b/cmd/cli/job/history.go index dbe5cad02a..e88f530459 100644 --- a/cmd/cli/job/history.go +++ b/cmd/cli/job/history.go @@ -7,6 +7,7 @@ import ( "github.com/jedib0t/go-pretty/v6/table" "github.com/jedib0t/go-pretty/v6/text" + "github.com/rs/zerolog" "github.com/spf13/cobra" "k8s.io/kubectl/pkg/util/i18n" @@ -77,50 +78,82 @@ func NewHistoryCmd() *cobra.Command { return nodeCmd } -var historyColumns = []output.TableColumn[*models.JobHistory]{ - { - ColumnConfig: table.ColumnConfig{Name: "Time", WidthMax: 8, WidthMaxEnforcer: output.ShortenTime}, - Value: func(j *models.JobHistory) string { return j.Time.Format(time.DateTime) }, - }, - { +var ( + historyTimeCol = output.TableColumn[*models.JobHistory]{ + ColumnConfig: table.ColumnConfig{Name: "Time", WidthMax: len(time.StampMilli), WidthMaxEnforcer: output.ShortenTime}, + Value: func(j *models.JobHistory) string { return j.Occurred().Format(time.StampMilli) }, + } + historyLevelCol = output.TableColumn[*models.JobHistory]{ ColumnConfig: table.ColumnConfig{Name: "Level", WidthMax: 15, WidthMaxEnforcer: text.WrapText}, Value: func(jwi *models.JobHistory) string { return jwi.Type.String() }, - }, - { + } + historyRevisionCol = output.TableColumn[*models.JobHistory]{ + ColumnConfig: table.ColumnConfig{Name: "Rev.", WidthMax: 4, WidthMaxEnforcer: text.WrapText}, + Value: func(j *models.JobHistory) string { return strconv.FormatUint(j.NewRevision, 10) }, + } + historyExecIDCol = output.TableColumn[*models.JobHistory]{ ColumnConfig: table.ColumnConfig{Name: "Exec. 
ID", WidthMax: 10, WidthMaxEnforcer: text.WrapText},
		Value:        func(j *models.JobHistory) string { return idgen.ShortUUID(j.ExecutionID) },
-	},
-	{
+	}
+	historyNodeIDCol = output.TableColumn[*models.JobHistory]{
 		ColumnConfig: table.ColumnConfig{Name: "Node ID", WidthMax: 10, WidthMaxEnforcer: text.WrapText},
 		Value:        func(j *models.JobHistory) string { return idgen.ShortNodeID(j.NodeID) },
-	},
-	{
-		ColumnConfig: table.ColumnConfig{Name: "Rev.", WidthMax: 4, WidthMaxEnforcer: text.WrapText},
-		Value:        func(j *models.JobHistory) string { return strconv.FormatUint(j.NewRevision, 10) },
-	},
-	{
-		ColumnConfig: table.ColumnConfig{Name: "Previous State", WidthMax: 20, WidthMaxEnforcer: text.WrapText},
-		Value: func(j *models.JobHistory) string {
-			if j.Type == models.JobHistoryTypeJobLevel {
-				return j.JobState.Previous.String()
-			}
-			return j.ExecutionState.Previous.String()
-		},
-	},
-	{
-		ColumnConfig: table.ColumnConfig{Name: "New State", WidthMax: 20, WidthMaxEnforcer: text.WrapText},
+	}
+	historyStateCol = output.TableColumn[*models.JobHistory]{
+		ColumnConfig: table.ColumnConfig{Name: "State", WidthMax: 20, WidthMaxEnforcer: text.WrapText},
 		Value: func(j *models.JobHistory) string {
 			if j.Type == models.JobHistoryTypeJobLevel {
 				return j.JobState.New.String()
 			}
 			return j.ExecutionState.New.String()
 		},
-	},
+	}
+	historyTopicCol = output.TableColumn[*models.JobHistory]{
+		ColumnConfig: table.ColumnConfig{Name: "Topic", WidthMax: 15, WidthMaxEnforcer: text.WrapSoft},
+		Value:        func(jh *models.JobHistory) string { return string(jh.Event.Topic) },
+	}
+	historyEventCol = output.TableColumn[*models.JobHistory]{
+		ColumnConfig: table.ColumnConfig{Name: "Event", WidthMax: 60, WidthMaxEnforcer: text.WrapText},
+		Value: func(h *models.JobHistory) string {
+			res := h.Event.Message
+
+			if h.Event.Details != nil {
+				// if this is an error, print the event in red
+				if h.Event.Details[models.DetailsKeyIsError] == "true" {
+					res = output.RedStr(res)
+				}
+
+				// print the hint in green
+				if h.Event.Details[models.DetailsKeyHint] != "" {
+					res += "\n" + fmt.Sprintf(
+						"%s %s", output.BoldStr(output.GreenStr("* Hint:")), h.Event.Details[models.DetailsKeyHint])
+				}
+
+				// print all other details in debug mode
+				if zerolog.GlobalLevel() <= zerolog.DebugLevel {
+					for k, v := range h.Event.Details {
+						// don't print hint and error since they are already represented
+						if k == models.DetailsKeyHint || k == models.DetailsKeyIsError {
+							continue
+						}
+						res += "\n" + fmt.Sprintf("* %s %s", output.BoldStr(k+":"), v)
+					}
+				}
+			}
+			return res
+		},
+	}
+)

-	{
-		ColumnConfig: table.ColumnConfig{Name: "Comment", WidthMax: 40, WidthMaxEnforcer: text.WrapText},
-		Value:        func(j *models.JobHistory) string { return j.Comment },
-	},
+var historyColumns = []output.TableColumn[*models.JobHistory]{
+	historyTimeCol,
+	historyLevelCol,
+	historyRevisionCol,
+	historyExecIDCol,
+	historyNodeIDCol,
+	historyStateCol,
+	historyTopicCol,
+	historyEventCol,
 }

 func (o *HistoryOptions) run(cmd *cobra.Command, args []string) error {
diff --git a/cmd/util/output/output.go b/cmd/util/output/output.go
index 9c71b715d4..31fabc9312 100644
--- a/cmd/util/output/output.go
+++ b/cmd/util/output/output.go
@@ -22,6 +22,8 @@ const (
 	bold  = "\033[1m"
+	red   = "\033[31m"
+	green = "\033[32m"
 	reset = "\033[0m"
 )

@@ -147,6 +149,21 @@ func Bold(cmd *cobra.Command, s string) {
 	cmd.Print(bold + s + reset)
 }

+// BoldStr returns the given string in bold
+func BoldStr(s any) string {
+	return bold + fmt.Sprint(s) + reset
+}
+
+// RedStr returns the given string in
red +func RedStr(s any) string { + return red + fmt.Sprint(s) + reset +} + +// GreenStr returns the given string in green +func GreenStr(s any) string { + return green + fmt.Sprint(s) + reset +} + func OutputOneNonTabular[T any](cmd *cobra.Command, options NonTabularOutputOptions, item T) error { switch options.Format { case JSONFormat: diff --git a/cmd/util/printer/print.go b/cmd/util/printer/print.go index 6eea738af7..510ad3e1a3 100644 --- a/cmd/util/printer/print.go +++ b/cmd/util/printer/print.go @@ -4,19 +4,26 @@ import ( "context" "fmt" "io" + "math" "os" "os/signal" + "slices" "strings" "time" + "github.com/fatih/color" + "github.com/mitchellh/go-wordwrap" "github.com/pkg/errors" "github.com/rs/zerolog/log" + "github.com/samber/lo" "github.com/spf13/cobra" + "golang.org/x/exp/maps" + "golang.org/x/term" "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/cmd/util/flags/cliflags" "github.com/bacalhau-project/bacalhau/pkg/bacerrors" - "github.com/bacalhau-project/bacalhau/pkg/lib/math" + libmath "github.com/bacalhau-project/bacalhau/pkg/lib/math" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" clientv2 "github.com/bacalhau-project/bacalhau/pkg/publicapi/client/v2" @@ -72,12 +79,27 @@ func PrintJobExecution( if jobErr != nil { if jobErr.Error() == PrintoutCanceledButRunningNormally { return nil + } + + history, err := client.Jobs().History(ctx, &apimodels.ListJobHistoryRequest{ + JobID: jobID, + EventType: "execution", + }) + if err != nil { + return fmt.Errorf("failed getting job history: %w", err) + } + + historySummary := summariseHistoryEvents(history.History) + if len(historySummary) > 0 { + for _, event := range historySummary { + printEvent(cmd, event) + } } else { - cmd.PrintErrf("\nError submitting job: %s", jobErr) + printError(cmd, jobErr) } } - if runtimeSettings.PrintNodeDetails || jobErr != nil { + if runtimeSettings.PrintNodeDetails { executions, err := client.Jobs().Executions(ctx, &apimodels.ListJobExecutionsRequest{ JobID: jobID, }) @@ -87,13 +109,13 @@ func PrintJobExecution( summary := summariseExecutions(executions.Executions) if len(summary) > 0 { cmd.Println("\nJob Results By Node:") - for message, nodes := range summary { - cmd.Printf("• Node %s: ", strings.Join(nodes, ", ")) - if strings.ContainsRune(message, '\n') { - cmd.Printf("\n\t%s\n", strings.Join(strings.Split(message, "\n"), "\n\t")) - } else { - cmd.Println(message) + for message, runs := range summary { + nodes := len(lo.Uniq(runs)) + prefix := fmt.Sprintf("• Node %s: ", runs[0]) + if len(runs) > 1 { + prefix = fmt.Sprintf("• %d runs on %d nodes: ", len(runs), nodes) } + printIndentedString(cmd, prefix, strings.Trim(message, "\n"), none, 0) } } else { cmd.Println() @@ -179,7 +201,7 @@ To cancel the job, run: widestString := len(startMessage) for _, v := range eventsWorthPrinting { - widestString = math.Max(widestString, len(v.Message)) + widestString = libmath.Max(widestString, len(v.Message)) } spinner, err := NewSpinner(ctx, writer, widestString, false) @@ -297,6 +319,58 @@ To cancel the job, run: return returnError } +var ( + none = color.New(color.Reset) + red = color.New(color.FgRed) + green = color.New(color.FgGreen) +) + +const ( + errorPrefix = "Error: " + hintPrefix = "Hint: " +) + +var terminalWidth int + +func getTerminalWidth(cmd *cobra.Command) uint { + if terminalWidth == 0 { + var err error + terminalWidth, _, err = term.GetSize(int(os.Stderr.Fd())) + if err != nil || terminalWidth <= 0 { 
+ log.Ctx(cmd.Context()).Debug().Err(err).Msg("Failed to get terminal size") + terminalWidth = math.MaxInt8 + } + } + return uint(terminalWidth) +} + +func printEvent(cmd *cobra.Command, event models.Event) { + printIndentedString(cmd, errorPrefix, event.Message, red, 0) + if event.Details != nil && event.Details[models.DetailsKeyHint] != "" { + printIndentedString(cmd, hintPrefix, event.Details[models.DetailsKeyHint], green, uint(len(errorPrefix))) + } +} + +func printError(cmd *cobra.Command, err error) { + printIndentedString(cmd, errorPrefix, err.Error(), red, 0) +} + +func printIndentedString(cmd *cobra.Command, prefix, msg string, prefixColor *color.Color, startIndent uint) { + maxWidth := getTerminalWidth(cmd) + blockIndent := int(startIndent) + len(prefix) + blockTextWidth := maxWidth - startIndent - uint(len(prefix)) + + cmd.PrintErrln() + cmd.PrintErr(strings.Repeat(" ", int(startIndent))) + prefixColor.Fprintf(cmd.ErrOrStderr(), prefix) + for i, line := range strings.Split(wordwrap.WrapString(msg, blockTextWidth), "\n") { + if i > 0 { + cmd.PrintErr(strings.Repeat(" ", blockIndent)) + } + cmd.PrintErrln(line) + } +} + // Groups the executions in the job state, returning a map of printable messages // to node(s) that generated that message. func summariseExecutions(executions []*models.Execution) map[string][]string { @@ -321,3 +395,21 @@ func summariseExecutions(executions []*models.Execution) map[string][]string { } return results } + +func summariseHistoryEvents(history []*models.JobHistory) []models.Event { + slices.SortFunc(history, func(a, b *models.JobHistory) int { + return a.Occurred().Compare(b.Occurred()) + }) + + events := make(map[string]models.Event, len(history)) + for _, entry := range history { + hasDetails := entry.Event.Details != nil + failsExecution := hasDetails && entry.Event.Details[models.DetailsKeyFailsExecution] == "true" + terminalState := entry.ExecutionState.New.IsTermainl() + if (failsExecution || terminalState) && entry.Event.Message != "" { + events[entry.Event.Message] = entry.Event + } + } + + return maps.Values(events) +} diff --git a/go.mod b/go.mod index bd2d460dd9..24ecde71dd 100644 --- a/go.mod +++ b/go.mod @@ -230,7 +230,7 @@ require ( github.com/elastic/gosigar v0.14.2 // indirect github.com/elgris/jsondiff v0.0.0-20160530203242-765b5c24c302 // indirect github.com/facebookgo/atomicfile v0.0.0-20151019160806-2de1f203e7d5 // indirect - github.com/fatih/color v1.15.0 // indirect + github.com/fatih/color v1.15.0 github.com/flynn/noise v1.1.0 // indirect github.com/francoispqt/gojay v1.2.13 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect diff --git a/pkg/compute/bidder.go b/pkg/compute/bidder.go index f6cea4f230..5c9455f718 100644 --- a/pkg/compute/bidder.go +++ b/pkg/compute/bidder.go @@ -11,6 +11,7 @@ import ( "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/compute/capacity" + "github.com/bacalhau-project/bacalhau/pkg/logger" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/bidstrategy" @@ -68,7 +69,7 @@ func (b Bidder) ReturnBidResult( }, ExecutionMetadata: NewExecutionMetadata(localExecutionState.Execution), Accepted: response.ShouldBid, - Reason: response.Reason, + Event: RespondedToBidEvent(response), } b.callback.OnBidComplete(ctx, result) } @@ -107,7 +108,7 @@ func (b Bidder) RunBidding(ctx context.Context, bidRequest *BidderRequest) { b.callback.OnComputeFailure(ctx, ComputeError{ RoutingMetadata: routingMetadata, ExecutionMetadata: executionMetadata, - Err: 
err.Error(), + Event: models.EventFromError(EventTopicExecutionBidding, err), }) return } @@ -140,18 +141,14 @@ func (b Bidder) handleBidResult( JobID: execution.JobID, } handleComputeFailure = func(ctx context.Context, err error, reason string) { - var errMsg string - if err != nil { - log.Ctx(ctx).Error().Err(err).Msg(reason) - errMsg = fmt.Sprintf("%s: %s", reason, err) - } else { - log.Ctx(ctx).Info().Msg(reason) - errMsg = reason + log.Ctx(ctx).WithLevel(logger.ErrOrDebug(err)).Err(err).Msg(reason) + if err == nil { + err = errors.New(reason) } b.callback.OnComputeFailure(ctx, ComputeError{ RoutingMetadata: routingMetadata, ExecutionMetadata: executionMetadata, - Err: errMsg, + Event: models.EventFromError(EventTopicExecutionBidding, err), }) } handleBidComplete = func(ctx context.Context, result *bidStrategyResponse) { @@ -160,7 +157,11 @@ func (b Bidder) handleBidResult( ExecutionMetadata: executionMetadata, Accepted: result.bid, Wait: result.wait, - Reason: result.reason, + Event: RespondedToBidEvent(&bidstrategy.BidStrategyResponse{ + ShouldBid: result.bid, + ShouldWait: result.wait, + Reason: result.reason, + }), }) } ) @@ -230,7 +231,7 @@ func (b Bidder) doBidding( // semantically the job cannot run. semanticResponse, err := b.runSemanticBidding(ctx, job) if err != nil { - return nil, fmt.Errorf("semantic bidding: %w", err) + return nil, err } // we shouldn't bid, and we're not waiting, bail. @@ -241,7 +242,7 @@ func (b Bidder) doBidding( // else the request is semantically biddable or waiting, calculate resource usage and check resource-based bidding. resourceResponse, err := b.runResourceBidding(ctx, job, resourceUsage) if err != nil { - return nil, fmt.Errorf("resource bidding: %w", err) + return nil, err } return &bidStrategyResponse{ @@ -272,19 +273,20 @@ func (b Bidder) runSemanticBidding( // TODO(forrest): this can be parallelized with a wait group, although semantic checks ought to be quick. strategyType := reflect.TypeOf(s).String() resp, err := s.ShouldBid(ctx, request) + + log.Ctx(ctx).WithLevel(logger.ErrOrDebug(err)). + Err(err). + Str("Job", request.Job.ID). + Str("Strategy", strategyType). + Bool("Bid", resp.ShouldBid). + Bool("Wait", resp.ShouldWait). + Str("Reason", resp.Reason). + Send() + if err != nil { - errMsg := fmt.Sprintf("bid strategy: %s failed", strategyType) - log.Ctx(ctx).Error().Err(err).Msgf(errMsg) // NB: failure here results in a callback to OnComputeFailure - return nil, errors.Wrap(err, errMsg) + return nil, err } - log.Ctx(ctx).Info(). - Str("Job", request.Job.ID). - Str("strategy", strategyType). - Bool("bid", resp.ShouldBid). - Bool("wait", resp.ShouldWait). - Str("reason", resp.Reason). - Msgf("bit strategy response") if resp.ShouldWait { shouldWait = true @@ -332,19 +334,20 @@ func (b Bidder) runResourceBidding( for _, s := range b.resourceStrategy { strategyType := reflect.TypeOf(s).String() resp, err := s.ShouldBidBasedOnUsage(ctx, request, *resourceUsage) + + log.Ctx(ctx).WithLevel(logger.ErrOrDebug(err)). + Err(err). + Str("Job", request.Job.ID). + Str("Strategy", strategyType). + Bool("Bid", resp.ShouldBid). + Bool("Wait", resp.ShouldWait). + Str("Reason", resp.Reason). + Send() + if err != nil { - errMsg := fmt.Sprintf("bid strategy: %s failed", strategyType) - log.Ctx(ctx).Error().Err(err).Msgf(errMsg) // NB: failure here results in a callback to OnComputeFailure - return nil, errors.Wrap(err, errMsg) + return nil, err } - log.Ctx(ctx).Info(). - Str("Job", request.Job.ID). - Str("strategy", strategyType). 
- Bool("bid", resp.ShouldBid). - Bool("wait", resp.ShouldWait). - Str("reason", resp.Reason). - Msgf("bit strategy response") if resp.ShouldWait { shouldWait = true diff --git a/pkg/compute/events.go b/pkg/compute/events.go new file mode 100644 index 0000000000..7129b2d459 --- /dev/null +++ b/pkg/compute/events.go @@ -0,0 +1,30 @@ +package compute + +import ( + "time" + + "github.com/bacalhau-project/bacalhau/pkg/bidstrategy" + "github.com/bacalhau-project/bacalhau/pkg/models" +) + +const ( + EventTopicExecutionBidding models.EventTopic = "Requesting Node" + EventTopicExecutionDownloading models.EventTopic = "Downloading Inputs" + EventTopicExecutionPreparing models.EventTopic = "Preparing Environment" + EventTopicExecutionRunning models.EventTopic = "Running Execution" + EventTopicExecutionPublishing models.EventTopic = "Publishing Results" +) + +func RespondedToBidEvent(response *bidstrategy.BidStrategyResponse) models.Event { + message := response.Reason + if message == "" && response.ShouldBid { + message = "Accepted job" + } + + return models.Event{ + Message: message, + Topic: EventTopicExecutionBidding, + Timestamp: time.Now(), + Details: map[string]string{}, + } +} diff --git a/pkg/compute/executor.go b/pkg/compute/executor.go index b2f4e93267..faf2b6465c 100644 --- a/pkg/compute/executor.go +++ b/pkg/compute/executor.go @@ -299,10 +299,10 @@ func (e *BaseExecutor) Run(ctx context.Context, state store.LocalExecutionState) Logger().WithContext(ctx) stopwatch := telemetry.Timer(ctx, jobDurationMilliseconds, state.Execution.Job.MetricAttributes()...) - operation := "Running" + topic := EventTopicExecutionRunning defer func() { if err != nil { - e.handleFailure(ctx, state, err, operation) + e.handleFailure(ctx, state, err, topic) } dur := stopwatch() log.Ctx(ctx).Debug(). 
@@ -368,7 +368,7 @@ func (e *BaseExecutor) Run(ctx context.Context, state store.LocalExecutionState) // publish if the job has a publisher defined if !execution.Job.Task().Publisher.IsEmpty() { - operation = "Publishing" + topic = EventTopicExecutionPublishing if err := e.store.UpdateExecutionState(ctx, store.UpdateExecutionStateRequest{ ExecutionID: execution.ID, ExpectedStates: []store.LocalExecutionStateType{expectedState}, @@ -474,9 +474,10 @@ func (e *BaseExecutor) Cancel(ctx context.Context, state store.LocalExecutionSta return err } -func (e *BaseExecutor) handleFailure(ctx context.Context, state store.LocalExecutionState, err error, operation string) { +func (e *BaseExecutor) handleFailure(ctx context.Context, state store.LocalExecutionState, err error, topic models.EventTopic) { + log.Ctx(ctx).Warn().Err(err).Msgf("%s failed", topic) + execution := state.Execution - log.Ctx(ctx).Error().Err(err).Msgf("%s execution %s failed", operation, execution.ID) updateError := e.store.UpdateExecutionState(ctx, store.UpdateExecutionStateRequest{ ExecutionID: execution.ID, NewState: store.ExecutionStateFailed, @@ -492,7 +493,7 @@ func (e *BaseExecutor) handleFailure(ctx context.Context, state store.LocalExecu SourcePeerID: e.ID, TargetPeerID: state.RequesterNodeID, }, - Err: err.Error(), + Event: models.EventFromError(topic, err), }) } } diff --git a/pkg/compute/executor_buffer.go b/pkg/compute/executor_buffer.go index c129a1a401..0f6e99f411 100644 --- a/pkg/compute/executor_buffer.go +++ b/pkg/compute/executor_buffer.go @@ -2,7 +2,6 @@ package compute import ( "context" - "fmt" "sync" "time" @@ -10,6 +9,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/compute/store" "github.com/bacalhau-project/bacalhau/pkg/lib/collections" "github.com/bacalhau-project/bacalhau/pkg/logger" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/system" "github.com/rs/zerolog/log" ) @@ -88,7 +88,7 @@ func (s *ExecutorBuffer) Run(ctx context.Context, localExecutionState store.Loca SourcePeerID: s.ID, TargetPeerID: localExecutionState.RequesterNodeID, }, - Err: err.Error(), + Event: models.EventFromError(EventTopicExecutionPreparing, err), }) } }() @@ -96,20 +96,20 @@ func (s *ExecutorBuffer) Run(ctx context.Context, localExecutionState store.Loca // There is no point in enqueuing a job that requires more than the total capacity of the node. 
Such jobs should // have not reached this backend in the first place, and should have been rejected by the frontend when asked to bid if !s.runningCapacity.IsWithinLimits(ctx, *execution.TotalAllocatedResources()) { - err = fmt.Errorf("not enough capacity to run job") + err = models.NewBaseError("not enough capacity to run job").WithFailsExecution() return err } if s.queuedTasks.Contains(execution.ID) { - err = fmt.Errorf("execution %s already enqueued", execution.ID) + err = models.NewBaseError("execution %s already enqueued", execution.ID) return err } if _, ok := s.running[execution.ID]; ok { - err = fmt.Errorf("execution %s already running", execution.ID) + err = models.NewBaseError("execution %s already running", execution.ID) return err } if added := s.enqueuedCapacity.AddIfHasCapacity(ctx, *execution.TotalAllocatedResources()); added == nil { - err = fmt.Errorf("not enough capacity to enqueue job") + err = models.NewBaseError("not enough capacity to enqueue job").WithRetryable() return err } else { // Update the execution to include all the resources that have diff --git a/pkg/compute/types.go b/pkg/compute/types.go index 172496b7ff..d681b9a065 100644 --- a/pkg/compute/types.go +++ b/pkg/compute/types.go @@ -144,7 +144,7 @@ type BidResult struct { ExecutionMetadata Accepted bool Wait bool - Reason string + Event models.Event } // RunResult Result of a job execution that is returned to the caller through a Callback. @@ -164,9 +164,9 @@ type CancelResult struct { type ComputeError struct { RoutingMetadata ExecutionMetadata - Err string + Event models.Event } func (e ComputeError) Error() string { - return e.Err + return e.Event.Message } diff --git a/pkg/config/getters.go b/pkg/config/getters.go index 97be8d46b7..692fd53430 100644 --- a/pkg/config/getters.go +++ b/pkg/config/getters.go @@ -130,6 +130,11 @@ func ShouldKeepStack() bool { return os.Getenv("KEEP_STACK") != "" } +const ( + DockerUsernameEnvVar = "DOCKER_USERNAME" + DockerPasswordEnvVar = "DOCKER_PASSWORD" +) + type DockerCredentials struct { Username string Password string @@ -141,8 +146,8 @@ func (d *DockerCredentials) IsValid() bool { func GetDockerCredentials() DockerCredentials { return DockerCredentials{ - Username: os.Getenv("DOCKER_USERNAME"), - Password: os.Getenv("DOCKER_PASSWORD"), + Username: os.Getenv(DockerUsernameEnvVar), + Password: os.Getenv(DockerPasswordEnvVar), } } diff --git a/pkg/docker/docker.go b/pkg/docker/docker.go index 3cde905fcd..058a9d911e 100644 --- a/pkg/docker/docker.go +++ b/pkg/docker/docker.go @@ -34,11 +34,49 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/util/closer" ) -const ImagePullError = `Could not pull image %q - could be due to repo/image not existing, ` + - `or registry needing authorization` +type ImageUnavailableError struct { + Verb string + Image string + Creds config.DockerCredentials + Err error +} + +func (die ImageUnavailableError) Error() string { + return pkgerrors.Wrapf(die.Err, + "Could not %s image %q - could be due to repo/image not existing, "+ + "or registry needing authorization", + die.Verb, + die.Image, + ).Error() +} -const DistributionInspectError = `Could not inspect image %q - could be due to repo/image not existing, ` + - `or registry needing authorization` +func (die ImageUnavailableError) Hint() string { + if !die.Creds.IsValid() { + return "If the image is private, supply the node with valid Docker login credentials " + + "using the " + config.DockerUsernameEnvVar + " and " + config.DockerPasswordEnvVar + + " environment variables" + } + + 
return "" +} + +func NewImageInspectError(image string, creds config.DockerCredentials, err error) error { + return ImageUnavailableError{ + Verb: "inspect", + Image: image, + Creds: creds, + Err: err, + } +} + +func NewImagePullError(image string, creds config.DockerCredentials, err error) error { + return ImageUnavailableError{ + Verb: "pull", + Image: image, + Creds: creds, + Err: err, + } +} type Client struct { tracing.TracedClient @@ -228,7 +266,7 @@ func (c *Client) ImagePlatforms(ctx context.Context, image string, dockerCreds c distribution, err := c.DistributionInspect(ctx, image, authToken) if err != nil { - return nil, pkgerrors.Wrapf(err, DistributionInspectError, image) + return nil, NewImageInspectError(image, dockerCreds, err) } return distribution.Platforms, nil @@ -342,7 +380,7 @@ func (c *Client) ImageDistribution( authToken := getAuthToken(ctx, image, creds) dist, err := c.DistributionInspect(ctx, image, authToken) if err != nil { - return nil, pkgerrors.Wrapf(err, DistributionInspectError, image) + return nil, NewImageInspectError(image, creds, err) } obj := dist.Descriptor.Digest diff --git a/pkg/executor/docker/bidstrategy/semantic/image_platform.go b/pkg/executor/docker/bidstrategy/semantic/image_platform.go index 10034a1fe5..5fa376b1e6 100644 --- a/pkg/executor/docker/bidstrategy/semantic/image_platform.go +++ b/pkg/executor/docker/bidstrategy/semantic/image_platform.go @@ -2,7 +2,6 @@ package semantic import ( "context" - "errors" "sync" dockermodels "github.com/bacalhau-project/bacalhau/pkg/executor/docker/models" @@ -49,28 +48,25 @@ func (s *ImagePlatformBidStrategy) ShouldBid( return bidstrategy.NewBidResponse(true, "examine images for non-Docker jobs"), nil } - supported, serr := s.client.SupportedPlatforms(ctx) - - var ierr error = nil - var manifest docker.ImageManifest + supported, err := s.client.SupportedPlatforms(ctx) + if err != nil { + return bidstrategy.BidStrategyResponse{}, err + } dockerEngine, err := dockermodels.DecodeSpec(request.Job.Task().Engine) if err != nil { - return bidstrategy.BidStrategyResponse{ - ShouldBid: false, - ShouldWait: false, - Reason: err.Error(), - }, nil + return bidstrategy.BidStrategyResponse{}, err } manifest, found := ManifestCache.Get(dockerEngine.Image) if !found { log.Ctx(ctx).Debug().Str("Image", dockerEngine.Image).Msg("Image not found in manifest cache") - var m *docker.ImageManifest - m, ierr = s.client.ImageDistribution(ctx, dockerEngine.Image, config.GetDockerCredentials()) - if ierr != nil { - return bidstrategy.NewBidResponse(false, ierr.Error()), nil + creds := config.GetDockerCredentials() + + m, err := s.client.ImageDistribution(ctx, dockerEngine.Image, creds) + if err != nil { + return bidstrategy.BidStrategyResponse{}, err } if m != nil { @@ -90,7 +86,7 @@ func (s *ImagePlatformBidStrategy) ShouldBid( // processing log.Ctx(ctx).Warn(). Str("Image", dockerEngine.Image). - Str("Error", err.Error()). + Err(err). 
Msg("Failed to save to manifest cache") } }() @@ -98,14 +94,6 @@ func (s *ImagePlatformBidStrategy) ShouldBid( log.Ctx(ctx).Debug().Str("Image", dockerEngine.Image).Msg("Image found in manifest cache") } - errs := errors.Join(serr, ierr) - if errs != nil { - return bidstrategy.BidStrategyResponse{ - ShouldBid: false, - Reason: errs.Error(), - }, nil - } - imageHasPlatforms := make([]string, 0, len(manifest.Platforms)) for _, imageHas := range manifest.Platforms { imageHasPlatforms = append(imageHasPlatforms, imageHas.OS+"/"+imageHas.Architecture) diff --git a/pkg/executor/docker/executor.go b/pkg/executor/docker/executor.go index 1af6f4bbee..2269559b9c 100644 --- a/pkg/executor/docker/executor.go +++ b/pkg/executor/docker/executor.go @@ -360,8 +360,7 @@ func (e *Executor) newDockerJobContainer(ctx context.Context, params *dockerJobC if _, set := os.LookupEnv("SKIP_IMAGE_PULL"); !set { dockerCreds := config.GetDockerCredentials() if pullErr := e.client.PullImage(ctx, dockerArgs.Image, dockerCreds); pullErr != nil { - pullErr = errors.Wrapf(pullErr, docker.ImagePullError, dockerArgs.Image) - return container.CreateResponse{}, fmt.Errorf("failed to pull docker image: %w", pullErr) + return container.CreateResponse{}, docker.NewImagePullError(dockerArgs.Image, dockerCreds, pullErr) } } log.Ctx(ctx).Trace().Msgf("Container: %+v %+v", containerConfig, mounts) diff --git a/pkg/jobstore/boltdb/store.go b/pkg/jobstore/boltdb/store.go index 0beacdc41d..872e4b17c1 100644 --- a/pkg/jobstore/boltdb/store.go +++ b/pkg/jobstore/boltdb/store.go @@ -36,8 +36,6 @@ const ( BucketNamespacesIndex = "idx_namespaces" // namespace -> Job id BucketExecutionsIndex = "idx_executions" // execution-id -> Job id BucketEvaluationsIndex = "idx_evaluations" // evaluation-id -> Job id - - newJobComment = "Job created" ) var SpecKey = []byte("spec") @@ -682,7 +680,7 @@ func (b *BoltJobStore) getJobHistory(tx *bolt.Tx, jobID string, } // CreateJob creates a new record of a job in the data store -func (b *BoltJobStore) CreateJob(ctx context.Context, job models.Job) error { +func (b *BoltJobStore) CreateJob(ctx context.Context, job models.Job, event models.Event) error { job.State = models.NewJobState(models.JobStateTypePending) job.Revision = 1 job.CreateTime = b.clock.Now().UTC().UnixNano() @@ -693,11 +691,11 @@ func (b *BoltJobStore) CreateJob(ctx context.Context, job models.Job) error { return err } return b.database.Update(func(tx *bolt.Tx) (err error) { - return b.createJob(tx, job) + return b.createJob(tx, job, event) }) } -func (b *BoltJobStore) createJob(tx *bolt.Tx, job models.Job) error { +func (b *BoltJobStore) createJob(tx *bolt.Tx, job models.Job, event models.Event) error { if b.jobExists(tx, job.ID) { return jobstore.NewErrJobAlreadyExists(job.ID) } @@ -758,7 +756,7 @@ func (b *BoltJobStore) createJob(tx *bolt.Tx, job models.Job) error { } } - return b.appendJobHistory(tx, job, models.JobStateTypePending, newJobComment) + return b.appendJobHistory(tx, job, models.JobStateTypePending, event) } // DeleteJob removes the specified job from the system entirely @@ -849,7 +847,7 @@ func (b *BoltJobStore) updateJobState(tx *bolt.Tx, request jobstore.UpdateJobSta // update the job state previousState := job.State.StateType job.State.StateType = request.NewState - job.State.Message = request.Comment + job.State.Message = request.Event.Message job.Revision++ job.ModifyTime = b.clock.Now().UTC().UnixNano() @@ -877,10 +875,10 @@ func (b *BoltJobStore) updateJobState(tx *bolt.Tx, request jobstore.UpdateJobSta } } - return 
b.appendJobHistory(tx, job, previousState, request.Comment) + return b.appendJobHistory(tx, job, previousState, request.Event) } -func (b *BoltJobStore) appendJobHistory(tx *bolt.Tx, updateJob models.Job, previousState models.JobStateType, comment string) error { +func (b *BoltJobStore) appendJobHistory(tx *bolt.Tx, updateJob models.Job, previousState models.JobStateType, event models.Event) error { historyEntry := models.JobHistory{ Type: models.JobHistoryTypeJobLevel, JobID: updateJob.ID, @@ -889,7 +887,8 @@ func (b *BoltJobStore) appendJobHistory(tx *bolt.Tx, updateJob models.Job, previ New: updateJob.State.StateType, }, NewRevision: updateJob.Revision, - Comment: comment, + Comment: event.Message, + Event: event, Time: time.Unix(0, updateJob.ModifyTime), } data, err := b.marshaller.Marshal(historyEntry) @@ -910,7 +909,7 @@ func (b *BoltJobStore) appendJobHistory(tx *bolt.Tx, updateJob models.Job, previ } // CreateExecution creates a record of a new execution -func (b *BoltJobStore) CreateExecution(ctx context.Context, execution models.Execution) error { +func (b *BoltJobStore) CreateExecution(ctx context.Context, execution models.Execution, event models.Event) error { if execution.CreateTime == 0 { execution.CreateTime = b.clock.Now().UTC().UnixNano() } @@ -928,11 +927,11 @@ func (b *BoltJobStore) CreateExecution(ctx context.Context, execution models.Exe return err } return b.database.Update(func(tx *bolt.Tx) (err error) { - return b.createExecution(tx, execution) + return b.createExecution(tx, execution, event) }) } -func (b *BoltJobStore) createExecution(tx *bolt.Tx, execution models.Execution) error { +func (b *BoltJobStore) createExecution(tx *bolt.Tx, execution models.Execution, event models.Event) error { if !b.jobExists(tx, execution.JobID) { return jobstore.NewErrJobNotFound(execution.JobID) } @@ -970,7 +969,7 @@ func (b *BoltJobStore) createExecution(tx *bolt.Tx, execution models.Execution) } } - return b.appendExecutionHistory(tx, execution, models.ExecutionStateNew, "") + return b.appendExecutionHistory(tx, execution, models.ExecutionStateNew, event) } // UpdateExecution updates the state of a single execution by loading from storage, @@ -1031,11 +1030,11 @@ func (b *BoltJobStore) updateExecution(tx *bolt.Tx, request jobstore.UpdateExecu } } - return b.appendExecutionHistory(tx, newExecution, existingExecution.ComputeState.StateType, request.Comment) + return b.appendExecutionHistory(tx, newExecution, existingExecution.ComputeState.StateType, request.Event) } func (b *BoltJobStore) appendExecutionHistory(tx *bolt.Tx, updated models.Execution, - previous models.ExecutionStateType, cmt string) error { + previous models.ExecutionStateType, event models.Event) error { historyEntry := models.JobHistory{ Type: models.JobHistoryTypeExecutionLevel, JobID: updated.JobID, @@ -1046,7 +1045,8 @@ func (b *BoltJobStore) appendExecutionHistory(tx *bolt.Tx, updated models.Execut New: updated.ComputeState.StateType, }, NewRevision: updated.Revision, - Comment: cmt, + Comment: event.Message, + Event: event, Time: time.Unix(0, updated.ModifyTime), } diff --git a/pkg/jobstore/boltdb/store_test.go b/pkg/jobstore/boltdb/store_test.go index 2a2fb6c48f..767d1b3d38 100644 --- a/pkg/jobstore/boltdb/store_test.go +++ b/pkg/jobstore/boltdb/store_test.go @@ -5,7 +5,6 @@ package boltjobstore import ( "context" "encoding/json" - "fmt" "os" "path/filepath" "testing" @@ -102,13 +101,13 @@ func (s *BoltJobstoreTestSuite) SetupTest() { job.Type = fixture.jobType job.Labels = fixture.tags job.Namespace = 
fixture.client - err := s.store.CreateJob(s.ctx, *job) + err := s.store.CreateJob(s.ctx, *job, models.Event{}) s.Require().NoError(err) s.clock.Add(1 * time.Second) execution := mock.ExecutionForJob(job) execution.ComputeState.StateType = models.ExecutionStateNew - err = s.store.CreateExecution(s.ctx, *execution) + err = s.store.CreateExecution(s.ctx, *execution, models.Event{}) s.Require().NoError(err) for i, state := range fixture.jobStates { @@ -126,7 +125,7 @@ func (s *BoltJobstoreTestSuite) SetupTest() { ExpectedState: oldState, ExpectedRevision: uint64(i + 1), }, - Comment: fmt.Sprintf("moved to %+v", state), + Event: models.Event{}, } err = s.store.UpdateJobState(s.ctx, request) s.Require().NoError(err) @@ -151,7 +150,7 @@ func (s *BoltJobstoreTestSuite) SetupTest() { ExpectedRevision: uint64(i + 1), }, NewValues: *execution, - Comment: fmt.Sprintf("exec update to %+v", state), + Event: models.Event{}, } err = s.store.UpdateExecution(s.ctx, request) @@ -442,7 +441,7 @@ func (s *BoltJobstoreTestSuite) TestDeleteJob() { job.ID = "deleteme" job.Namespace = "client1" - err := s.store.CreateJob(s.ctx, *job) + err := s.store.CreateJob(s.ctx, *job, models.Event{}) s.Require().NoError(err) err = s.store.DeleteJob(s.ctx, job.ID) @@ -461,8 +460,8 @@ func (s *BoltJobstoreTestSuite) TestGetJob() { func (s *BoltJobstoreTestSuite) TestCreateExecution() { job := mock.Job() execution := mock.ExecutionForJob(job) - s.Require().NoError(s.store.CreateJob(s.ctx, *job)) - s.Require().NoError(s.store.CreateExecution(s.ctx, *execution)) + s.Require().NoError(s.store.CreateJob(s.ctx, *job, models.Event{})) + s.Require().NoError(s.store.CreateExecution(s.ctx, *execution, models.Event{})) // Ensure that the execution is created exec, err := s.store.GetExecutions(s.ctx, jobstore.GetExecutionsOptions{ @@ -558,7 +557,7 @@ func (s *BoltJobstoreTestSuite) TestShortIDs() { s.Require().IsType(err, &bacerrors.JobNotFound{}) // Create and fetch the single entry - err = s.store.CreateJob(s.ctx, *job) + err = s.store.CreateJob(s.ctx, *job, models.Event{}) s.Require().NoError(err) j, err := s.store.GetJob(s.ctx, shortString) @@ -567,7 +566,7 @@ func (s *BoltJobstoreTestSuite) TestShortIDs() { // Add a record that will also match and expect an appropriate error job.ID = uuidString2 - err = s.store.CreateJob(s.ctx, *job) + err = s.store.CreateJob(s.ctx, *job, models.Event{}) s.Require().NoError(err) _, err = s.store.GetJob(s.ctx, shortString) @@ -589,7 +588,7 @@ func (s *BoltJobstoreTestSuite) TestEvents() { var execution models.Execution s.Run("job create event", func() { - err := s.store.CreateJob(s.ctx, *job) + err := s.store.CreateJob(s.ctx, *job, models.Event{}) s.Require().NoError(err) // Read an event, it should be a jobcreate @@ -608,7 +607,7 @@ func (s *BoltJobstoreTestSuite) TestEvents() { execution = *mock.Execution() execution.JobID = "10" execution.ComputeState = models.State[models.ExecutionStateType]{StateType: models.ExecutionStateNew} - err := s.store.CreateExecution(s.ctx, execution) + err := s.store.CreateExecution(s.ctx, execution, models.Event{}) s.Require().NoError(err) // Read an event, it should be a ExecutionForJob Create @@ -624,7 +623,7 @@ func (s *BoltJobstoreTestSuite) TestEvents() { Condition: jobstore.UpdateJobCondition{ ExpectedState: models.JobStateTypePending, }, - Comment: "event test", + Event: models.Event{Message: "event test"}, } _ = s.store.UpdateJobState(s.ctx, request) ev := <-ch @@ -641,7 +640,7 @@ func (s *BoltJobstoreTestSuite) TestEvents() { ExpectedStates: 
[]models.ExecutionStateType{models.ExecutionStateNew}, }, NewValues: execution, - Comment: "event test", + Event: models.Event{Message: "event test"}, }) ev := <-ch s.Require().Equal(ev.Event, jobstore.UpdateEvent) diff --git a/pkg/jobstore/mocks.go b/pkg/jobstore/mocks.go index fa6fce108c..6fdbfd6c14 100644 --- a/pkg/jobstore/mocks.go +++ b/pkg/jobstore/mocks.go @@ -69,31 +69,31 @@ func (mr *MockStoreMockRecorder) CreateEvaluation(ctx, eval any) *gomock.Call { } // CreateExecution mocks base method. -func (m *MockStore) CreateExecution(ctx context.Context, execution models.Execution) error { +func (m *MockStore) CreateExecution(ctx context.Context, execution models.Execution, event models.Event) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "CreateExecution", ctx, execution) + ret := m.ctrl.Call(m, "CreateExecution", ctx, execution, event) ret0, _ := ret[0].(error) return ret0 } // CreateExecution indicates an expected call of CreateExecution. -func (mr *MockStoreMockRecorder) CreateExecution(ctx, execution any) *gomock.Call { +func (mr *MockStoreMockRecorder) CreateExecution(ctx, execution, event any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateExecution", reflect.TypeOf((*MockStore)(nil).CreateExecution), ctx, execution) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateExecution", reflect.TypeOf((*MockStore)(nil).CreateExecution), ctx, execution, event) } // CreateJob mocks base method. -func (m *MockStore) CreateJob(ctx context.Context, j models.Job) error { +func (m *MockStore) CreateJob(ctx context.Context, j models.Job, event models.Event) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "CreateJob", ctx, j) + ret := m.ctrl.Call(m, "CreateJob", ctx, j, event) ret0, _ := ret[0].(error) return ret0 } // CreateJob indicates an expected call of CreateJob. -func (mr *MockStoreMockRecorder) CreateJob(ctx, j any) *gomock.Call { +func (mr *MockStoreMockRecorder) CreateJob(ctx, j, event any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateJob", reflect.TypeOf((*MockStore)(nil).CreateJob), ctx, j) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateJob", reflect.TypeOf((*MockStore)(nil).CreateJob), ctx, j, event) } // DeleteEvaluation mocks base method. diff --git a/pkg/jobstore/types.go b/pkg/jobstore/types.go index ba0b246752..4c63e3f5fb 100644 --- a/pkg/jobstore/types.go +++ b/pkg/jobstore/types.go @@ -66,7 +66,7 @@ type Store interface { GetJobHistory(ctx context.Context, jobID string, options JobHistoryFilterOptions) ([]models.JobHistory, error) // CreateJob will create a new job and persist it in the store. - CreateJob(ctx context.Context, j models.Job) error + CreateJob(ctx context.Context, j models.Job, event models.Event) error // GetExecutions retrieves all executions for the specified job. GetExecutions(ctx context.Context, options GetExecutionsOptions) ([]models.Execution, error) @@ -76,7 +76,7 @@ type Store interface { UpdateJobState(ctx context.Context, request UpdateJobStateRequest) error // CreateExecution creates a new execution - CreateExecution(ctx context.Context, execution models.Execution) error + CreateExecution(ctx context.Context, execution models.Execution, event models.Event) error // UpdateExecution updates the execution state according to the values // within [UpdateExecutionRequest]. 
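
Seen end to end, the new signatures look like this. A minimal sketch, not part of the patch, assuming a concrete Store implementation; the helper name and topic string are invented:

package example

import (
	"context"
	"time"

	"github.com/bacalhau-project/bacalhau/pkg/jobstore"
	"github.com/bacalhau-project/bacalhau/pkg/models"
)

// createWithEvent is a hypothetical helper showing the new call shape: every
// write that lands in job history now carries a typed Event rather than a
// free-form comment string.
func createWithEvent(ctx context.Context, store jobstore.Store, job models.Job, exec models.Execution) error {
	submitted := models.Event{
		Message:   "Job submitted",
		Topic:     models.EventTopic("Submission"), // illustrative topic name
		Timestamp: time.Now(),
	}
	if err := store.CreateJob(ctx, job, submitted); err != nil {
		return err
	}
	// The first execution reuses the same event, so its initial history
	// entry records the same trigger as the job's creation.
	return store.CreateExecution(ctx, exec, submitted)
}
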
@@ -103,14 +103,14 @@ type UpdateJobStateRequest struct { JobID string Condition UpdateJobCondition NewState models.JobStateType - Comment string + Event models.Event } type UpdateExecutionRequest struct { ExecutionID string Condition UpdateExecutionCondition NewValues models.Execution - Comment string + Event models.Event } type UpdateJobCondition struct { diff --git a/pkg/models/error.go b/pkg/models/error.go new file mode 100644 index 0000000000..a4289c0b5e --- /dev/null +++ b/pkg/models/error.go @@ -0,0 +1,115 @@ +package models + +import "fmt" + +const ( + DetailsKeyIsError = "IsError" + DetailsKeyHint = "Hint" + DetailsKeyRetryable = "Retryable" + DetailsKeyFailsExecution = "FailsExecution" +) + +type HasHint interface { + // Hint A human-readable string that advises the user on how they might solve the error. + Hint() string +} + +type HasRetryable interface { + // Retryable Whether the error could be retried, assuming the same input and + // node configuration; i.e. the error is transient and due to network + // capacity or service outage. + // + // If a component raises an error with Retryable() as true, the system may + // retry the last action after some length of time. If it is false, it + // should not try the action again, and may choose an alternative action or + // fail the job. + Retryable() bool +} + +type HasFailsExecution interface { + // FailsExecution Whether this error means that the associated execution cannot + // continue. + // + // If a component raises an error with FailsExecution() as true, + // the hosting executor should report the execution as failed and stop any + // further steps. + FailsExecution() bool +} + +type HasDetails interface { + // Details An extra set of metadata provided by the error. + Details() map[string]string +} + +// BaseError is a custom error type in Go that provides additional fields +// and methods for more detailed error handling. It implements the error +// interface, as well as additional interfaces for providing a hint, +// indicating whether the error is retryable, whether it fails execution, +// and for providing additional details. +type BaseError struct { + message string + hint string + retryable bool + failsExecution bool + details map[string]string +} + +// NewBaseError is a constructor function that creates a new BaseError with +// only the message field set. +func NewBaseError(format string, a ...any) *BaseError { + return &BaseError{message: fmt.Sprintf(format, a...)} +} + +// WithHint is a method that sets the hint field of BaseError and returns +// the BaseError itself for chaining. +func (e *BaseError) WithHint(hint string) *BaseError { + e.hint = hint + return e +} + +// WithRetryable is a method that sets the retryable field of BaseError and +// returns the BaseError itself for chaining. +func (e *BaseError) WithRetryable() *BaseError { + e.retryable = true + return e +} + +// WithFailsExecution is a method that sets the failsExecution field of +// BaseError and returns the BaseError itself for chaining. +func (e *BaseError) WithFailsExecution() *BaseError { + e.failsExecution = true + return e +} + +// WithDetails is a method that sets the details field of BaseError and +// returns the BaseError itself for chaining. +func (e *BaseError) WithDetails(details map[string]string) *BaseError { + e.details = details + return e +} + +// Error is a method that returns the message field of BaseError. This +// method makes BaseError satisfy the error interface. 
+func (e *BaseError) Error() string { + return e.message +} + +// Hint is a method that returns the hint field of BaseError. +func (e *BaseError) Hint() string { + return e.hint +} + +// Retryable is a method that returns the retryable field of BaseError. +func (e *BaseError) Retryable() bool { + return e.retryable +} + +// FailsExecution is a method that returns the failsExecution field of BaseError. +func (e *BaseError) FailsExecution() bool { + return e.failsExecution +} + +// Details is a method that returns the details field of BaseError. +func (e *BaseError) Details() map[string]string { + return e.details +} diff --git a/pkg/models/error_test.go b/pkg/models/error_test.go new file mode 100644 index 0000000000..ac84cf348a --- /dev/null +++ b/pkg/models/error_test.go @@ -0,0 +1,81 @@ +//go:build unit || !integration + +package models + +import ( + "testing" + + "github.com/stretchr/testify/suite" +) + +type BaseErrorTestSuite struct { + suite.Suite +} + +func (suite *BaseErrorTestSuite) TestBaseErrorWithMessage() { + message := "TestMessage" + err := NewBaseError(message) + + suite.Equal(message, err.Error()) + suite.Empty(err.Hint()) + suite.False(err.Retryable()) + suite.False(err.FailsExecution()) + suite.Nil(err.Details()) +} + +func (suite *BaseErrorTestSuite) TestBaseErrorWithFormattedMessage() { + // test that NewBaseError can accept a message with format specifiers + message := "TestMessage %s" + err := NewBaseError(message, "withFormat") + suite.Equal("TestMessage withFormat", err.Error()) +} + +func (suite *BaseErrorTestSuite) TestBaseErrorWithHint() { + message := "TestMessage" + hint := "TestHint" + err := NewBaseError(message).WithHint(hint) + + suite.Equal(message, err.Error()) + suite.Equal(hint, err.Hint()) + suite.False(err.Retryable()) + suite.False(err.FailsExecution()) + suite.Nil(err.Details()) +} + +func (suite *BaseErrorTestSuite) TestBaseErrorWithRetryable() { + message := "TestMessage" + err := NewBaseError(message).WithRetryable() + + suite.Equal(message, err.Error()) + suite.Empty(err.Hint()) + suite.True(err.Retryable()) + suite.False(err.FailsExecution()) + suite.Nil(err.Details()) +} + +func (suite *BaseErrorTestSuite) TestBaseErrorWithFailsExecution() { + message := "TestMessage" + err := NewBaseError(message).WithFailsExecution() + + suite.Equal(message, err.Error()) + suite.Empty(err.Hint()) + suite.False(err.Retryable()) + suite.True(err.FailsExecution()) + suite.Nil(err.Details()) +} + +func (suite *BaseErrorTestSuite) TestBaseErrorWithDetails() { + message := "TestMessage" + details := map[string]string{"key1": "value1", "key2": "value2"} + err := NewBaseError(message).WithDetails(details) + + suite.Equal(message, err.Error()) + suite.Empty(err.Hint()) + suite.False(err.Retryable()) + suite.False(err.FailsExecution()) + suite.Equal(details, err.Details()) +} + +func TestBaseErrorTestSuite(t *testing.T) { + suite.Run(t, new(BaseErrorTestSuite)) +} diff --git a/pkg/models/event.go b/pkg/models/event.go new file mode 100644 index 0000000000..f72930bd65 --- /dev/null +++ b/pkg/models/event.go @@ -0,0 +1,130 @@ +package models + +import ( + "maps" + "time" +) + +// EventTopic is a high level categorisation that can be applied to an event. It +// should be a human-readable string with no dynamic content. They are used to +// disambiguate events from the same component occurring in different contexts. 
+// For example, an event emitted by S3 storage used as an input source and the
+// same event emitted by S3 storage used as a publisher would be tagged with
+// different topics.
+//
+// EventTopics do not need to conform to a centralized list – each module should
+// use event topics that make sense for their own logic. Event topics SHOULD be
+// unique.
+type EventTopic string
+
+// Event represents a progress report made by the system in its attempt to run a
+// job. Events are generated by the orchestrator and also passed back to the
+// orchestrator from the compute node.
+//
+// Events may be delivered in an async fashion – i.e., they may arrive much later
+// than the moment they occurred.
+type Event struct {
+	// A human-readable string giving the user all the information they need to
+	// understand and respond to an Event, if a response is required.
+	Message string `json:"Message"`
+
+	// The topic of the event. See the documentation on EventTopic.
+	Topic EventTopic `json:"Topic"`
+
+	// The moment the event occurred, which may be different to the moment it
+	// was recorded.
+	Timestamp time.Time `json:"Timestamp"`
+
+	// Any additional metadata that the system or user may need to know about
+	// the event in order to handle it properly.
+	Details map[string]string `json:"Details,omitempty"`
+}
+
+// NewEvent returns a new Event with the given topic.
+func NewEvent(topic EventTopic) *Event {
+	return &Event{
+		Topic:     topic,
+		Timestamp: time.Now(),
+		Details:   make(map[string]string),
+	}
+}
+
+// WithMessage returns a new Event with the given message.
+func (e *Event) WithMessage(message string) *Event {
+	e.Message = message
+	return e
+}
+
+// WithError returns a new Event with the given error.
+func (e *Event) WithError(err error) *Event {
+	e.Message = err.Error()
+	return e.WithDetail(DetailsKeyIsError, "true")
+}
+
+// WithHint returns a new Event with the given hint.
+func (e *Event) WithHint(hint string) *Event {
+	if hint != "" {
+		return e.WithDetail(DetailsKeyHint, hint)
+	}
+	return e
+}
+
+// WithRetryable returns a new Event with the given retryable flag.
+func (e *Event) WithRetryable(retryable bool) *Event {
+	if retryable {
+		return e.WithDetail(DetailsKeyRetryable, "true")
+	}
+	return e
+}
+
+// WithFailsExecution returns a new Event with the given fails execution flag.
+func (e *Event) WithFailsExecution(failsExecution bool) *Event {
+	if failsExecution {
+		return e.WithDetail(DetailsKeyFailsExecution, "true")
+	}
+	return e
+}
+
+// WithDetails returns a new Event with the given details.
+func (e *Event) WithDetails(details map[string]string) *Event {
+	maps.Copy(e.Details, details)
+	return e
+}
+
+// WithDetail returns a new Event with the given detail.
+func (e *Event) WithDetail(key, value string) *Event {
+	e.Details[key] = value
+	return e
+}
+
+// EventFromError converts an error into an Event tagged with the passed event
+// topic.
+//
+// This method allows errors to implement extra interfaces (above) to do
+// "attribute-based error reporting". The design principle is that errors can
+// report a set of extra flags that have well defined semantics which the system
+// can then respond to with specific behavior. This allows introducing or
+// refactoring error types without higher-level components needing to be
+// modified – they simply continue to respond to the presence of attributes.
+// +// This is instead of the system having a centralized set of known error types +// and programming in specific behavior in response to them, which is brittle +// and requires updating all of the error responses when the types change. +func EventFromError(topic EventTopic, err error) Event { + event := NewEvent(topic).WithError(err) + + if hasDetails, ok := err.(HasDetails); ok { + event = event.WithDetails(hasDetails.Details()) + } + if hasHint, ok := err.(HasHint); ok { + event = event.WithHint(hasHint.Hint()) + } + if hasRetryable, ok := err.(HasRetryable); ok { + event = event.WithRetryable(hasRetryable.Retryable()) + } + if hasFailsExecution, ok := err.(HasFailsExecution); ok { + event = event.WithFailsExecution(hasFailsExecution.FailsExecution()) + } + + return *event +} diff --git a/pkg/models/event_test.go b/pkg/models/event_test.go new file mode 100644 index 0000000000..041e8a004a --- /dev/null +++ b/pkg/models/event_test.go @@ -0,0 +1,124 @@ +//go:build unit || !integration + +package models + +import ( + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/suite" +) + +type EventTestSuite struct { + suite.Suite + topic EventTopic +} + +func (suite *EventTestSuite) SetupTest() { + suite.topic = EventTopic("TestTopic") +} + +func (suite *EventTestSuite) TestNewEvent() { + event := NewEvent(suite.topic) + + suite.Equal(suite.topic, event.Topic) + suite.WithinDuration(time.Now(), event.Timestamp, time.Second) + suite.Empty(event.Details) +} + +func (suite *EventTestSuite) TestEventWithMessage() { + message := "TestMessage" + event := NewEvent(suite.topic).WithMessage(message) + + suite.Equal(message, event.Message) +} + +func (suite *EventTestSuite) TestEventWithError() { + err := fmt.Errorf("TestError") + event := NewEvent(suite.topic).WithError(err) + + suite.Equal(err.Error(), event.Message) + suite.Equal("true", event.Details[DetailsKeyIsError]) +} + +func (suite *EventTestSuite) TestEventWithHint() { + hint := "TestHint" + event := NewEvent(suite.topic).WithHint(hint) + + suite.Equal(hint, event.Details[DetailsKeyHint]) +} + +func (suite *EventTestSuite) TestEventWithRetryable() { + event := NewEvent(suite.topic).WithRetryable(true) + + suite.Equal("true", event.Details[DetailsKeyRetryable]) +} + +func (suite *EventTestSuite) TestEventWithFailsExecution() { + event := NewEvent(suite.topic).WithFailsExecution(true) + + suite.Equal("true", event.Details[DetailsKeyFailsExecution]) +} + +func (suite *EventTestSuite) TestEventWithDetails() { + details := map[string]string{"key1": "value1", "key2": "value2"} + event := NewEvent(suite.topic).WithDetails(details) + + suite.Equal(details, event.Details) +} + +func (suite *EventTestSuite) TestEventWithDetail() { + key := "TestKey" + value := "TestValue" + event := NewEvent(suite.topic).WithDetail(key, value) + + suite.Equal(value, event.Details[key]) +} + +func (suite *EventTestSuite) TestEventFromError() { + errMessage := "TestError" + err := &BaseError{ + message: errMessage, + hint: "TestHint", + retryable: true, + failsExecution: true, + details: map[string]string{"key1": "value1", "key2": "value2"}, + } + event := EventFromError(suite.topic, err) + + suite.Equal(errMessage, event.Message) + suite.Equal(suite.topic, event.Topic) + suite.Equal("true", event.Details[DetailsKeyIsError]) + suite.Equal("true", event.Details[DetailsKeyRetryable]) + suite.Equal("true", event.Details[DetailsKeyFailsExecution]) + suite.Equal(err.hint, event.Details[DetailsKeyHint]) + suite.Equal("value1", event.Details["key1"]) + 
suite.Equal("value2", event.Details["key2"]) +} + +func (suite *EventTestSuite) TestEventFromErrorNoDetails() { + errMessage := "TestError" + err := NewBaseError(errMessage) + event := EventFromError(suite.topic, err) + + suite.Equal(errMessage, event.Message) + suite.Equal(suite.topic, event.Topic) + suite.Equal("true", event.Details[DetailsKeyIsError]) + suite.Len(event.Details, 1) +} + +func (suite *EventTestSuite) TestEventFromSimpleError() { + errMessage := "TestError" + err := fmt.Errorf(errMessage) + event := EventFromError(suite.topic, err) + + suite.Equal(errMessage, event.Message) + suite.Equal(suite.topic, event.Topic) + suite.Equal("true", event.Details[DetailsKeyIsError]) + suite.Len(event.Details, 1) +} + +func TestEventTestSuite(t *testing.T) { + suite.Run(t, new(EventTestSuite)) +} diff --git a/pkg/models/execution.go b/pkg/models/execution.go index 6338814c52..81199cb957 100644 --- a/pkg/models/execution.go +++ b/pkg/models/execution.go @@ -42,6 +42,13 @@ func (s ExecutionStateType) IsUndefined() bool { return s == ExecutionStateUndefined } +func (s ExecutionStateType) IsTermainl() bool { + return s == ExecutionStateBidRejected || + s == ExecutionStateCompleted || + s == ExecutionStateFailed || + s == ExecutionStateCancelled +} + type ExecutionDesiredStateType int const ( diff --git a/pkg/models/job_history.go b/pkg/models/job_history.go index 4dd92c598c..406c895d22 100644 --- a/pkg/models/job_history.go +++ b/pkg/models/job_history.go @@ -49,5 +49,20 @@ type JobHistory struct { ExecutionState *StateChange[ExecutionStateType] `json:"ExecutionState,omitempty"` NewRevision uint64 `json:"NewRevision"` Comment string `json:"Comment,omitempty"` + Event Event `json:"Event,omitempty"` Time time.Time `json:"Time"` } + +// Occurred returns when the action that triggered an update to job history +// actually occurred. +// +// The Time field represents the moment that the JobHistory item was recorded, +// i.e. it is almost always set to time.Now() when creating the object. This is +// different to the Event.Timestamp which represents when the source of the +// history update actually occurred. +func (jh JobHistory) Occurred() time.Time { + if !jh.Event.Timestamp.Equal(time.Time{}) { + return jh.Event.Timestamp + } + return jh.Time +} diff --git a/pkg/models/plan.go b/pkg/models/plan.go index c8f2cf9db8..33ab3adcd5 100644 --- a/pkg/models/plan.go +++ b/pkg/models/plan.go @@ -3,7 +3,7 @@ package models type PlanExecutionDesiredUpdate struct { Execution *Execution `json:"Execution"` DesiredState ExecutionDesiredStateType `json:"DesiredState"` - Comment string `json:"Comment,omitempty"` + Event Event `json:"Event"` } // Plan holds actions as a result of processing an evaluation by the scheduler. @@ -17,7 +17,7 @@ type Plan struct { Job *Job `json:"Job,omitempty"` DesiredJobState JobStateType `json:"DesiredJobState,omitempty"` - Comment string `json:"Comment,omitempty"` + Event Event `json:"Event,omitempty"` // NewExecutions holds the executions to be created. NewExecutions []*Execution `json:"NewExecutions,omitempty"` @@ -43,11 +43,11 @@ func (p *Plan) AppendExecution(execution *Execution) { } // AppendStoppedExecution marks an execution to be stopped. 
-func (p *Plan) AppendStoppedExecution(execution *Execution, comment string) { +func (p *Plan) AppendStoppedExecution(execution *Execution, event Event) { updateRequest := &PlanExecutionDesiredUpdate{ Execution: execution, DesiredState: ExecutionDesiredStateStopped, - Comment: comment, + Event: event, } p.UpdatedExecutions[execution.ID] = updateRequest } @@ -87,9 +87,9 @@ func (p *Plan) MarkJobRunningIfEligible() { p.DesiredJobState = JobStateTypeRunning } -func (p *Plan) MarkJobFailed(comment string) { +func (p *Plan) MarkJobFailed(event Event) { p.DesiredJobState = JobStateTypeFailed - p.Comment = comment + p.Event = event p.NewExecutions = []*Execution{} // drop any update that is not stopping an execution diff --git a/pkg/models/plan_test.go b/pkg/models/plan_test.go index 143ff3b1fb..a806529190 100644 --- a/pkg/models/plan_test.go +++ b/pkg/models/plan_test.go @@ -32,9 +32,9 @@ func (s *PlanTestSuite) TestNewPlan() { } func (s *PlanTestSuite) TestMarkJobFailed() { - s.plan.MarkJobFailed("Test failure") + s.plan.MarkJobFailed(models.Event{Message: "Test failure"}) s.Equal(models.JobStateTypeFailed, s.plan.DesiredJobState) - s.Equal("Test failure", s.plan.Comment) + s.Equal("Test failure", s.plan.Event.Message) } func (s *PlanTestSuite) TestMarkJobRunningIfApplicable() { diff --git a/pkg/orchestrator/endpoint.go b/pkg/orchestrator/endpoint.go index d088cf9f41..1aff9e5385 100644 --- a/pkg/orchestrator/endpoint.go +++ b/pkg/orchestrator/endpoint.go @@ -58,6 +58,10 @@ func NewBaseEndpoint(params *BaseEndpointParams) *BaseEndpoint { // SubmitJob submits a job to the evaluation broker. func (e *BaseEndpoint) SubmitJob(ctx context.Context, request *SubmitJobRequest) (*SubmitJobResponse, error) { job := request.Job + events := []models.Event{ + JobSubmittedEvent(), + } + job.Normalize() warnings := job.SanitizeSubmission() @@ -84,14 +88,24 @@ func (e *BaseEndpoint) SubmitJob(ctx context.Context, request *SubmitJobRequest) return nil, errors.Wrap(err, "failure converting job to JSON") } else { translatedJob.Meta[models.MetaDerivedFrom] = base64.StdEncoding.EncodeToString(b) + events = append(events, JobTranslatedEvent(job, translatedJob)) } job = translatedJob } } - if err := e.store.CreateJob(ctx, *job); err != nil { - return nil, err + for i, event := range events { + if i == 0 { + if err := e.store.CreateJob(ctx, *job, events[0]); err != nil { + return nil, err + } + } else { + req := jobstore.UpdateJobStateRequest{JobID: job.ID, Event: event, NewState: models.JobStateTypePending} + if err := e.store.UpdateJobState(ctx, req); err != nil { + return nil, err + } + } } eval := &models.Evaluation{ @@ -145,7 +159,7 @@ func (e *BaseEndpoint) StopJob(ctx context.Context, request *StopJobRequest) (St }, }, NewState: models.JobStateTypeStopped, - Comment: request.Reason, + Event: JobStoppedEvent(request.Reason), }) if err != nil { return StopJobResponse{}, err diff --git a/pkg/orchestrator/errors.go b/pkg/orchestrator/errors.go index a747c30bfc..f3edcbee1a 100644 --- a/pkg/orchestrator/errors.go +++ b/pkg/orchestrator/errors.go @@ -34,13 +34,16 @@ func NewErrNotEnoughNodes(requestedNodes int, availableNodes []NodeRank) ErrNotE } } +func (e ErrNotEnoughNodes) SuitableNodes() int { + return lo.CountBy(e.AvailableNodes, func(rank NodeRank) bool { return rank.MeetsRequirement() }) +} + func (e ErrNotEnoughNodes) Error() string { - suitable := lo.CountBy(e.AvailableNodes, func(rank NodeRank) bool { return rank.MeetsRequirement() }) reasons := lo.GroupBy(e.AvailableNodes, func(rank NodeRank) string { 
return rank.Reason }) var message strings.Builder fmt.Fprint(&message, "not enough nodes to run job. ") - fmt.Fprintf(&message, "requested: %d, available: %d, suitable: %d.", e.RequestedNodes, len(e.AvailableNodes), suitable) + fmt.Fprintf(&message, "requested: %d, available: %d, suitable: %d.", e.RequestedNodes, len(e.AvailableNodes), e.SuitableNodes()) for reason, nodes := range reasons { fmt.Fprint(&message, "\n• ") if len(nodes) > 1 { @@ -53,6 +56,20 @@ func (e ErrNotEnoughNodes) Error() string { return message.String() } +func (e ErrNotEnoughNodes) Retryable() bool { + return lo.ContainsBy(e.AvailableNodes, func(rank NodeRank) bool { + return !rank.MeetsRequirement() && rank.Retryable + }) +} + +func (e ErrNotEnoughNodes) Details() map[string]string { + return map[string]string{ + "NodesRequested": fmt.Sprint(e.RequestedNodes), + "NodesAvailable": fmt.Sprint(len(e.AvailableNodes)), + "NodesSuitable": fmt.Sprint(e.SuitableNodes()), + } +} + // ErrNoMatchingNodes is returned when no matching nodes in the network to run a job type ErrNoMatchingNodes struct { } diff --git a/pkg/orchestrator/events.go b/pkg/orchestrator/events.go new file mode 100644 index 0000000000..d8d5ea833b --- /dev/null +++ b/pkg/orchestrator/events.go @@ -0,0 +1,72 @@ +package orchestrator + +import ( + "time" + + "github.com/bacalhau-project/bacalhau/pkg/models" +) + +const ( + EventTopicJobSubmission models.EventTopic = "Submission" + EventTopicJobScheduling models.EventTopic = "Scheduling" +) + +const ( + jobSubmittedMessage = "Job submitted" + jobTranslatedMessage = "Job tasks translated to new type" + jobStopRequestedMessage = "Job requested to stop before completion" + jobExhaustedRetriesMessage = "Job failed because it has been retried too many times" + + execStoppedByJobStopMessage = "Execution stop requested because job has been stopped" + execStoppedByNodeUnhealthyMessage = "Execution stop requested because node has disappeared" + execStoppedByNodeRejectedMessage = "Execution stop requested because node has been rejected" + execStoppedByOversubscriptionMessage = "Execution stop requested because there are more executions than needed" + execRejectedByNodeMessage = "Node responded to execution run request" + execFailedMessage = "Execution did not complete successfully" +) + +func event(topic models.EventTopic, msg string, details map[string]string) models.Event { + return models.Event{ + Message: msg, + Topic: topic, + Timestamp: time.Now(), + Details: details, + } +} + +func JobSubmittedEvent() models.Event { + return event(EventTopicJobSubmission, jobSubmittedMessage, map[string]string{}) +} + +func JobTranslatedEvent(old, new *models.Job) models.Event { + return event(EventTopicJobSubmission, jobTranslatedMessage, map[string]string{ + "PreviousTaskType": old.Task().Engine.Type, + "NewTaskType": new.Task().Engine.Type, + }) +} + +func JobStoppedEvent(reason string) models.Event { + return event(EventTopicJobScheduling, jobStopRequestedMessage, map[string]string{ + "Reason": reason, + }) +} + +func JobExhaustedRetriesEvent() models.Event { + return event(EventTopicJobScheduling, jobExhaustedRetriesMessage, map[string]string{}) +} + +func ExecStoppedByJobStopEvent() models.Event { + return event(EventTopicJobScheduling, execStoppedByJobStopMessage, map[string]string{}) +} + +func ExecStoppedByNodeUnhealthyEvent() models.Event { + return event(EventTopicJobScheduling, execStoppedByNodeUnhealthyMessage, map[string]string{}) +} + +func ExecStoppedByNodeRejectedEvent() models.Event { + return 
event(EventTopicJobScheduling, execStoppedByNodeRejectedMessage, map[string]string{}) +} + +func ExecStoppedByOversubscriptionEvent() models.Event { + return event(EventTopicJobScheduling, execStoppedByOversubscriptionMessage, map[string]string{}) +} diff --git a/pkg/orchestrator/planner/compute_forwarder.go b/pkg/orchestrator/planner/compute_forwarder.go index f7e193fd64..714fdf8e40 100644 --- a/pkg/orchestrator/planner/compute_forwarder.go +++ b/pkg/orchestrator/planner/compute_forwarder.go @@ -67,7 +67,7 @@ func (s *ComputeForwarder) doProcess(ctx context.Context, plan *models.Plan) { if observedState == models.ExecutionStateAskForBidAccepted { s.doNotifyBidRejected(ctx, u.Execution) } else if !u.Execution.IsTerminalComputeState() { - s.notifyCancel(ctx, u.Comment, u.Execution) + s.notifyCancel(ctx, u.Event.Message, u.Execution) } } } diff --git a/pkg/orchestrator/planner/compute_forwarder_test.go b/pkg/orchestrator/planner/compute_forwarder_test.go index f7f385174b..4fa21212fc 100644 --- a/pkg/orchestrator/planner/compute_forwarder_test.go +++ b/pkg/orchestrator/planner/compute_forwarder_test.go @@ -141,7 +141,6 @@ func (suite *ComputeForwarderSuite) mockUpdateExecution(plan *models.Plan, id st update := &models.PlanExecutionDesiredUpdate{ Execution: execution, DesiredState: desiredState, - Comment: "update", } plan.UpdatedExecutions[execution.ID] = update return update diff --git a/pkg/orchestrator/planner/event_emitter.go b/pkg/orchestrator/planner/event_emitter.go index cf08410556..3d2e9a5c56 100644 --- a/pkg/orchestrator/planner/event_emitter.go +++ b/pkg/orchestrator/planner/event_emitter.go @@ -2,7 +2,6 @@ package planner import ( "context" - "time" "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" @@ -43,9 +42,9 @@ func (s *EventEmitter) Process(ctx context.Context, plan *models.Plan) error { s.eventEmitter.EmitEventSilently(ctx, model.JobEvent{ SourceNodeID: s.id, JobID: plan.Job.ID, - Status: plan.Comment, + Status: plan.Event.Message, EventName: eventName, - EventTime: time.Now(), + EventTime: plan.Event.Timestamp, }) } return nil diff --git a/pkg/orchestrator/planner/logging_planner.go b/pkg/orchestrator/planner/logging_planner.go index d23e0d5d3c..834bdf7994 100644 --- a/pkg/orchestrator/planner/logging_planner.go +++ b/pkg/orchestrator/planner/logging_planner.go @@ -5,6 +5,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" + "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -16,14 +17,24 @@ func NewLoggingPlanner() *LoggingPlanner { } func (s *LoggingPlanner) Process(ctx context.Context, plan *models.Plan) error { + dict := zerolog.Dict() + for key, value := range plan.Event.Details { + dict = dict.Str(key, value) + } + + level := zerolog.TraceLevel + message := "Job updated" switch plan.DesiredJobState { case models.JobStateTypeCompleted: - log.Info().Msgf("Job %s completed successfully", plan.Job.ID) + level = zerolog.InfoLevel + message = "Job completed successfully" case models.JobStateTypeFailed: - log.Error().Msgf("Job %s failed due to `%s`", plan.Job.ID, plan.Comment) + level = zerolog.WarnLevel + message = "Job failed" default: } + log.Ctx(ctx).WithLevel(level).Dict("Details", dict).Str("Event", plan.Event.Message).Str("JobID", plan.Job.ID).Msg(message) return nil } diff --git a/pkg/orchestrator/planner/state_updater.go b/pkg/orchestrator/planner/state_updater.go index b16145cb16..ee4b29514b 100644 --- 
a/pkg/orchestrator/planner/state_updater.go +++ b/pkg/orchestrator/planner/state_updater.go @@ -30,7 +30,7 @@ func (s *StateUpdater) Process(ctx context.Context, plan *models.Plan) error { // Create new executions for _, exec := range plan.NewExecutions { - err := s.store.CreateExecution(ctx, *exec) + err := s.store.CreateExecution(ctx, *exec, plan.Event) if err != nil { return err } @@ -43,12 +43,13 @@ func (s *StateUpdater) Process(ctx context.Context, plan *models.Plan) error { NewValues: models.Execution{ DesiredState: models.State[models.ExecutionDesiredStateType]{ StateType: u.DesiredState, - Message: u.Comment, + Message: u.Event.Message, }, }, Condition: jobstore.UpdateExecutionCondition{ ExpectedRevision: u.Execution.Revision, }, + Event: plan.Event, }) if err != nil { return err @@ -60,7 +61,7 @@ func (s *StateUpdater) Process(ctx context.Context, plan *models.Plan) error { err := s.store.UpdateJobState(ctx, jobstore.UpdateJobStateRequest{ JobID: plan.Job.ID, NewState: plan.DesiredJobState, - Comment: plan.Comment, + Event: plan.Event, Condition: jobstore.UpdateJobCondition{ ExpectedRevision: plan.Job.Revision, }, diff --git a/pkg/orchestrator/planner/state_updater_test.go b/pkg/orchestrator/planner/state_updater_test.go index 3e2b5930fe..9f3742df13 100644 --- a/pkg/orchestrator/planner/state_updater_test.go +++ b/pkg/orchestrator/planner/state_updater_test.go @@ -33,8 +33,8 @@ func (suite *StateUpdaterSuite) TestStateUpdater_Process_CreateExecutions_Succes plan := mock.Plan() execution1, execution2 := mockCreateExecutions(plan) - suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution1).Times(1) - suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution2).Times(1) + suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution1, models.Event{}).Times(1) + suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution2, models.Event{}).Times(1) suite.NoError(suite.stateUpdater.Process(suite.ctx, plan)) } @@ -44,7 +44,7 @@ func (suite *StateUpdaterSuite) TestStateUpdater_Process_CreateExecutions_Error( execution1, _ := mockCreateExecutions(plan) // no attempt to create execution2 - suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution1).Return(errors.New("create error")).Times(1) + suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution1, models.Event{}).Return(errors.New("create error")).Times(1) suite.Error(suite.stateUpdater.Process(suite.ctx, plan)) } @@ -61,7 +61,6 @@ func (suite *StateUpdaterSuite) TestStateUpdater_Process_UpdateExecutions_Succes func (suite *StateUpdaterSuite) TestStateUpdater_Process_UpdateJobState_Success() { plan := mock.Plan() plan.DesiredJobState = models.JobStateTypeCompleted - plan.Comment = "update job state" suite.mockStore.EXPECT().UpdateJobState(suite.ctx, NewUpdateJobMatcherFromPlanUpdate(suite.T(), plan)).Times(1) suite.NoError(suite.stateUpdater.Process(suite.ctx, plan)) @@ -70,7 +69,6 @@ func (suite *StateUpdaterSuite) TestStateUpdater_Process_UpdateJobState_Success( func (suite *StateUpdaterSuite) TestStateUpdater_Process_UpdateJobState_Error() { plan := mock.Plan() plan.DesiredJobState = models.JobStateTypeCompleted - plan.Comment = "update job state" suite.mockStore.EXPECT().UpdateJobState(suite.ctx, NewUpdateJobMatcherFromPlanUpdate(suite.T(), plan)).Return(errors.New("create error")).Times(1) suite.Error(suite.stateUpdater.Process(suite.ctx, plan)) @@ -87,10 +85,9 @@ func (suite *StateUpdaterSuite) TestStateUpdater_Process_MultiOp() { update1, update2 := mockUpdateExecutions(plan) plan.DesiredJobState = 
models.JobStateTypeCompleted - plan.Comment = "update job state" - suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution1).Times(1) - suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution2).Times(1) + suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution1, models.Event{}).Times(1) + suite.mockStore.EXPECT().CreateExecution(suite.ctx, *execution2, models.Event{}).Times(1) suite.mockStore.EXPECT().UpdateExecution(suite.ctx, NewUpdateExecutionMatcherFromPlanUpdate(suite.T(), update1)).Times(1) suite.mockStore.EXPECT().UpdateExecution(suite.ctx, NewUpdateExecutionMatcherFromPlanUpdate(suite.T(), update2)).Times(1) suite.mockStore.EXPECT().UpdateJobState(suite.ctx, NewUpdateJobMatcherFromPlanUpdate(suite.T(), plan)).Times(1) diff --git a/pkg/orchestrator/planner/utils_test.go b/pkg/orchestrator/planner/utils_test.go index a5ee58e1dd..29d206d1be 100644 --- a/pkg/orchestrator/planner/utils_test.go +++ b/pkg/orchestrator/planner/utils_test.go @@ -29,12 +29,12 @@ func mockUpdateExecutions(plan *models.Plan) (*models.PlanExecutionDesiredUpdate update1 := &models.PlanExecutionDesiredUpdate{ Execution: execution1, DesiredState: models.ExecutionDesiredStateRunning, - Comment: "update 1", + Event: models.Event{Message: "update 1"}, } update2 := &models.PlanExecutionDesiredUpdate{ Execution: execution2, DesiredState: models.ExecutionDesiredStateStopped, - Comment: "update 2", + Event: models.Event{Message: "update 2"}, } plan.UpdatedExecutions[execution1.ID] = update1 plan.UpdatedExecutions[execution2.ID] = update2 @@ -78,7 +78,7 @@ func NewUpdateExecutionMatcher(t *testing.T, execution *models.Execution, params func NewUpdateExecutionMatcherFromPlanUpdate(t *testing.T, update *models.PlanExecutionDesiredUpdate) *UpdateExecutionMatcher { return NewUpdateExecutionMatcher(t, update.Execution, UpdateExecutionMatcherParams{ NewDesiredState: update.DesiredState, - DesiredStateComment: update.Comment, + DesiredStateComment: update.Event.Message, ExpectedRevision: update.Execution.Revision, }) } @@ -119,13 +119,13 @@ type UpdateJobMatcher struct { t *testing.T job *models.Job newState models.JobStateType - comment string + event models.Event expectedRevision uint64 } type UpdateJobMatcherParams struct { NewState models.JobStateType - Comment string + Event models.Event ExpectedRevision uint64 } @@ -134,7 +134,7 @@ func NewUpdateJobMatcher(t *testing.T, job *models.Job, params UpdateJobMatcherP t: t, job: job, newState: params.NewState, - comment: params.Comment, + event: params.Event, expectedRevision: params.ExpectedRevision, } } @@ -142,7 +142,7 @@ func NewUpdateJobMatcher(t *testing.T, job *models.Job, params UpdateJobMatcherP func NewUpdateJobMatcherFromPlanUpdate(t *testing.T, plan *models.Plan) *UpdateJobMatcher { return NewUpdateJobMatcher(t, plan.Job, UpdateJobMatcherParams{ NewState: plan.DesiredJobState, - Comment: plan.Comment, + Event: plan.Event, ExpectedRevision: plan.Job.Revision, }) } @@ -157,7 +157,7 @@ func (m *UpdateJobMatcher) Matches(x interface{}) bool { expectedRequest := jobstore.UpdateJobStateRequest{ JobID: m.job.ID, NewState: m.newState, - Comment: m.comment, + Event: m.event, Condition: jobstore.UpdateJobCondition{ ExpectedRevision: m.expectedRevision, }, diff --git a/pkg/orchestrator/scheduler/batch_service_job.go b/pkg/orchestrator/scheduler/batch_service_job.go index 13290f6cbf..bc4010cc0d 100644 --- a/pkg/orchestrator/scheduler/batch_service_job.go +++ b/pkg/orchestrator/scheduler/batch_service_job.go @@ -64,7 +64,7 @@ func (b *BatchServiceJobScheduler) 
Process(ctx context.Context, evaluation *mode // early exit if the job is stopped if job.IsTerminal() { - nonTerminalExecs.markStopped(execNotNeeded, plan) + nonTerminalExecs.markStopped(orchestrator.ExecStoppedByJobStopEvent(), plan) return b.planner.Process(ctx, plan) } @@ -76,7 +76,7 @@ func (b *BatchServiceJobScheduler) Process(ctx context.Context, evaluation *mode // Mark executions that are running on nodes that are not healthy as failed nonTerminalExecs, lost := nonTerminalExecs.filterByNodeHealth(nodeInfos) - lost.markStopped(execLost, plan) + lost.markStopped(orchestrator.ExecStoppedByNodeUnhealthyEvent(), plan) // Calculate remaining job count // Service jobs run until the user stops the job, and would be a bug if an execution is marked completed. So the desired @@ -90,7 +90,7 @@ func (b *BatchServiceJobScheduler) Process(ctx context.Context, evaluation *mode // Approve/Reject nodes execsByApprovalStatus := nonTerminalExecs.filterByApprovalStatus(desiredRemainingCount) execsByApprovalStatus.toApprove.markApproved(plan) - execsByApprovalStatus.toReject.markStopped(execRejected, plan) + execsByApprovalStatus.toReject.markStopped(orchestrator.ExecStoppedByNodeRejectedEvent(), plan) // create new executions if needed remainingExecutionCount := desiredRemainingCount - execsByApprovalStatus.activeCount() @@ -99,6 +99,7 @@ func (b *BatchServiceJobScheduler) Process(ctx context.Context, evaluation *mode var placementErr error if len(allFailed) > 0 && !b.retryStrategy.ShouldRetry(ctx, orchestrator.RetryRequest{JobID: job.ID}) { placementErr = fmt.Errorf("exceeded max retries for job %s", job.ID) + plan.Event = orchestrator.JobExhaustedRetriesEvent() } else { _, placementErr = b.createMissingExecs(ctx, remainingExecutionCount, &job, plan) } @@ -110,7 +111,7 @@ func (b *BatchServiceJobScheduler) Process(ctx context.Context, evaluation *mode // stop executions if we over-subscribed and exceeded the desired number of executions _, overSubscriptions := execsByApprovalStatus.running.filterByOverSubscriptions(desiredRemainingCount) - overSubscriptions.markStopped(execNotNeeded, plan) + overSubscriptions.markStopped(orchestrator.ExecStoppedByOversubscriptionEvent(), plan) // Check the job's state and update it accordingly. 
if desiredRemainingCount <= 0 { @@ -141,6 +142,7 @@ func (b *BatchServiceJobScheduler) createMissingExecs( if len(newExecs) > 0 { err := b.placeExecs(ctx, newExecs, job) if err != nil { + plan.Event = models.EventFromError(orchestrator.EventTopicJobScheduling, err) return newExecs, err } } @@ -178,15 +180,10 @@ func (b *BatchServiceJobScheduler) placeExecs(ctx context.Context, execs execSet func (b *BatchServiceJobScheduler) handleFailure(nonTerminalExecs execSet, failed execSet, plan *models.Plan, err error) { // TODO: allow scheduling retries in a later time if don't find nodes instead of failing the job // mark all non-terminal executions as failed - nonTerminalExecs.markStopped(jobFailed, plan) + nonTerminalExecs.markStopped(plan.Event, plan) - // mark the job as failed, using the error message of the latest failed execution, if any, or use - // the error message passed by the scheduler - latestErr := err.Error() - if len(failed) > 0 { - latestErr = failed.latest().ComputeState.Message - } - plan.MarkJobFailed(latestErr) + // mark the job as failed, use the error message passed by the scheduler + plan.MarkJobFailed(plan.Event) } // compile-time assertion that BatchServiceJobScheduler satisfies the Scheduler interface diff --git a/pkg/orchestrator/scheduler/constants.go b/pkg/orchestrator/scheduler/constants.go deleted file mode 100644 index 57971114d3..0000000000 --- a/pkg/orchestrator/scheduler/constants.go +++ /dev/null @@ -1,15 +0,0 @@ -package scheduler - -const ( - // execNotNeeded is the status used when a job no longer requires an execution - execNotNeeded = "execution not needed due to job update" - - // execLost is the status used when an execution is lost - execLost = "execution is lost since its node is down" - - // execRejected is the status used when an execution is rejected - execRejected = "execution is rejected in favor of another execution" - - // jobFailed is the status used when a job has failed - jobFailed = "overall job has failed" -) diff --git a/pkg/orchestrator/scheduler/daemon_job.go b/pkg/orchestrator/scheduler/daemon_job.go index 79b3b21b7c..da0cc3ed75 100644 --- a/pkg/orchestrator/scheduler/daemon_job.go +++ b/pkg/orchestrator/scheduler/daemon_job.go @@ -58,7 +58,7 @@ func (b *DaemonJobScheduler) Process(ctx context.Context, evaluation *models.Eva // early exit if the job is stopped if job.IsTerminal() { - nonTerminalExecs.markStopped(execNotNeeded, plan) + nonTerminalExecs.markStopped(orchestrator.ExecStoppedByJobStopEvent(), plan) return b.planner.Process(ctx, plan) } @@ -70,7 +70,7 @@ func (b *DaemonJobScheduler) Process(ctx context.Context, evaluation *models.Eva // Mark executions that are running on nodes that are not healthy as failed _, lost := nonTerminalExecs.filterByNodeHealth(nodeInfos) - lost.markStopped(execLost, plan) + lost.markStopped(orchestrator.ExecStoppedByNodeUnhealthyEvent(), plan) // Look for new matching nodes and create new executions every time we evaluate the job _, err = b.createMissingExecs(ctx, &job, plan, existingExecs) diff --git a/pkg/orchestrator/scheduler/ops_job.go b/pkg/orchestrator/scheduler/ops_job.go index 6783292d68..57d3a27b9b 100644 --- a/pkg/orchestrator/scheduler/ops_job.go +++ b/pkg/orchestrator/scheduler/ops_job.go @@ -59,7 +59,7 @@ func (b *OpsJobScheduler) Process(ctx context.Context, evaluation *models.Evalua // early exit if the job is stopped if job.IsTerminal() { - nonTerminalExecs.markStopped(execNotNeeded, plan) + nonTerminalExecs.markStopped(orchestrator.ExecStoppedByJobStopEvent(), plan) return 
b.planner.Process(ctx, plan)
 }
 
@@ -71,7 +71,7 @@ func (b *OpsJobScheduler) Process(ctx context.Context, evaluation *models.Evalua
 
 	// Mark executions that are running on nodes that are not healthy as failed
 	nonTerminalExecs, lost := nonTerminalExecs.filterByNodeHealth(nodeInfos)
-	lost.markStopped(execLost, plan)
+	lost.markStopped(orchestrator.ExecStoppedByNodeUnhealthyEvent(), plan)
 
 	allFailed := existingExecs.filterFailed().union(lost)
 
@@ -137,15 +137,10 @@ func (b *OpsJobScheduler) createMissingExecs(
 
 func (b *OpsJobScheduler) handleFailure(nonTerminalExecs execSet, failed execSet, plan *models.Plan, err error) {
 	// mark all non-terminal executions as failed
-	nonTerminalExecs.markStopped(jobFailed, plan)
+	nonTerminalExecs.markStopped(plan.Event, plan)
 
-	// mark the job as failed, using the error message of the latest failed execution, if any, or use
-	// the error message passed by the scheduler
-	latestErr := err.Error()
-	if len(failed) > 0 {
-		latestErr = failed.latest().ComputeState.Message
-	}
-	plan.MarkJobFailed(latestErr)
+	// mark the job as failed, use the error message passed by the scheduler
+	plan.MarkJobFailed(plan.Event)
 }
 
 // compile-time assertion that OpsJobScheduler satisfies the Scheduler interface
diff --git a/pkg/orchestrator/scheduler/types.go b/pkg/orchestrator/scheduler/types.go
index ed0c5204ea..b7c00d7412 100644
--- a/pkg/orchestrator/scheduler/types.go
+++ b/pkg/orchestrator/scheduler/types.go
@@ -193,9 +193,9 @@ func (set execSet) filterByApprovalStatus(desiredCount int) executionsByApproval
 }
 
 // markStopped
-func (set execSet) markStopped(comment string, plan *models.Plan) {
+func (set execSet) markStopped(event models.Event, plan *models.Plan) {
 	for _, exec := range set {
-		plan.AppendStoppedExecution(exec, comment)
+		plan.AppendStoppedExecution(exec, event)
 	}
 }
 
@@ -218,17 +218,6 @@ func (set execSet) union(other execSet) execSet {
 	return union
 }
 
-// latest returns the latest execution in the set by the time it was last updated.
-func (set execSet) latest() *models.Execution {
-	var latest *models.Execution
-	for _, exec := range set {
-		if latest == nil || exec.ModifyTime > latest.ModifyTime {
-			latest = exec
-		}
-	}
-	return latest
-}
-
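With markStopped now taking a typed event, every stop reason lands in job history with a topic, timestamp and details attached. A small sketch of the call shape; illustrative only, it would sit inside package scheduler, and the function name, execSet and plan values are assumed:

// stopForJobTermination mirrors the schedulers above: the stop reason is a
// typed event constructor instead of a package-level string constant.
func stopForJobTermination(nonTerminal execSet, plan *models.Plan) {
	nonTerminal.markStopped(orchestrator.ExecStoppedByJobStopEvent(), plan)
}

 // countByState counts the number of executions in each state.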
func (set execSet) countByState() map[models.ExecutionStateType]int { counts := map[models.ExecutionStateType]int{} diff --git a/pkg/orchestrator/scheduler/types_test.go b/pkg/orchestrator/scheduler/types_test.go index 2e5b354f0b..62b48a149d 100644 --- a/pkg/orchestrator/scheduler/types_test.go +++ b/pkg/orchestrator/scheduler/types_test.go @@ -93,21 +93,6 @@ func TestExecSet_Union(t *testing.T) { assert.Equal(t, models.ExecutionStateCompleted, union["exec2"].ComputeState.StateType) } -func TestExecSet_Latest(t *testing.T) { - now := time.Now() - executions := []*models.Execution{ - {ID: "exec1", ModifyTime: now.UnixNano()}, - {ID: "exec2", ModifyTime: now.Add(+1 * time.Second).UnixNano()}, - {ID: "exec3", ModifyTime: now.Add(-1 * time.Second).UnixNano()}, - } - - set := execSetFromSlice(executions) - latest := set.latest() - - assert.NotNil(t, latest) - assert.Equal(t, "exec2", latest.ID) -} - func TestExecSet_CountByState(t *testing.T) { executions := []*models.Execution{ {ID: "exec1", ComputeState: models.NewExecutionState(models.ExecutionStateBidAccepted)}, diff --git a/pkg/orchestrator/selection/ranking/features.go b/pkg/orchestrator/selection/ranking/features.go index af5e4714cb..05a1dc91c0 100644 --- a/pkg/orchestrator/selection/ranking/features.go +++ b/pkg/orchestrator/selection/ranking/features.go @@ -94,9 +94,10 @@ func (s *featureNodeRanker) RankNodes( for i, node := range nodes { rank, reason := s.rankNode(ctx, node, requiredKeys) ranks[i] = orchestrator.NodeRank{ - NodeInfo: node, - Rank: rank, - Reason: reason, + NodeInfo: node, + Rank: rank, + Reason: reason, + Retryable: false, } log.Ctx(ctx).Trace().Object("Rank", ranks[i]).Msg("Ranked node") } diff --git a/pkg/orchestrator/selection/ranking/fixed.go b/pkg/orchestrator/selection/ranking/fixed.go index bf3f5aeda0..968e55bfb9 100644 --- a/pkg/orchestrator/selection/ranking/fixed.go +++ b/pkg/orchestrator/selection/ranking/fixed.go @@ -22,8 +22,9 @@ func (f *fixedRanker) RankNodes(_ context.Context, _ models.Job, nodes []models. 
ranks := make([]orchestrator.NodeRank, len(nodes)) for i, rank := range f.ranks { ranks[i] = orchestrator.NodeRank{ - NodeInfo: nodes[i], - Rank: rank, + NodeInfo: nodes[i], + Rank: rank, + Retryable: false, } } return ranks, nil diff --git a/pkg/orchestrator/selection/ranking/labels.go b/pkg/orchestrator/selection/ranking/labels.go index 5e61752202..bcaf936af9 100644 --- a/pkg/orchestrator/selection/ranking/labels.go +++ b/pkg/orchestrator/selection/ranking/labels.go @@ -64,9 +64,10 @@ func (s *LabelsNodeRanker) RankNodes(ctx context.Context, job models.Job, nodes } } ranks[i] = orchestrator.NodeRank{ - NodeInfo: node, - Rank: rank, - Reason: reason, + NodeInfo: node, + Rank: rank, + Reason: reason, + Retryable: false, } log.Ctx(ctx).Trace().Object("Rank", ranks[i]).Msg("Ranked node") } diff --git a/pkg/orchestrator/selection/ranking/max_usage.go b/pkg/orchestrator/selection/ranking/max_usage.go index c0a0e5ae91..4cf1082462 100644 --- a/pkg/orchestrator/selection/ranking/max_usage.go +++ b/pkg/orchestrator/selection/ranking/max_usage.go @@ -42,9 +42,10 @@ func (s *MaxUsageNodeRanker) RankNodes(ctx context.Context, job models.Job, node } } ranks[i] = orchestrator.NodeRank{ - NodeInfo: node, - Rank: rank, - Reason: reason, + NodeInfo: node, + Rank: rank, + Reason: reason, + Retryable: true, } log.Ctx(ctx).Trace().Object("Rank", ranks[i]).Msg("Ranked node") } diff --git a/pkg/orchestrator/selection/ranking/min_version.go b/pkg/orchestrator/selection/ranking/min_version.go index e3f6e584b3..57c77b2864 100644 --- a/pkg/orchestrator/selection/ranking/min_version.go +++ b/pkg/orchestrator/selection/ranking/min_version.go @@ -43,9 +43,10 @@ func (s *MinVersionNodeRanker) RankNodes(ctx context.Context, job models.Job, no reason = "Bacalhau version is incompatible" } ranks[i] = orchestrator.NodeRank{ - NodeInfo: node, - Rank: rank, - Reason: reason, + NodeInfo: node, + Rank: rank, + Reason: reason, + Retryable: false, } log.Ctx(ctx).Trace().Object("Rank", ranks[i]).Msg("Ranked node") } diff --git a/pkg/orchestrator/selection/ranking/previous_executions.go b/pkg/orchestrator/selection/ranking/previous_executions.go index 04383703ae..856c9868ee 100644 --- a/pkg/orchestrator/selection/ranking/previous_executions.go +++ b/pkg/orchestrator/selection/ranking/previous_executions.go @@ -68,6 +68,9 @@ func (s *PreviousExecutionsNodeRanker) RankNodes(ctx context.Context, NodeInfo: node, Rank: rank, Reason: reason, + // No logic in here (yet) that will ignore failed executions after e.g. a period of time. + // If we introduce that, this should become true. + Retryable: false, } log.Ctx(ctx).Trace().Object("Rank", ranks[i]).Msg("Ranked node") } diff --git a/pkg/orchestrator/types.go b/pkg/orchestrator/types.go index 3ee4436835..814e6d8e1a 100644 --- a/pkg/orchestrator/types.go +++ b/pkg/orchestrator/types.go @@ -51,6 +51,17 @@ type NodeRank struct { NodeInfo models.NodeInfo Rank int Reason string + + // Retryable should be true only if the system could defer this job until + // later and the rank could change without any human intervention on the + // assessed node. I.e. it should only reflect transient things like node + // usage, capacity or approval status. + // + // E.g. if this node is excluded because it does not support a required + // feature, this could be fixed if the feature was configured at the other + // node, but Retryable should be false because this is unlikely to happen + // over the lifetime of the job. 
+ Retryable bool } const ( diff --git a/pkg/requester/endpoint.go b/pkg/requester/endpoint.go index fadce21f4e..8d4697be57 100644 --- a/pkg/requester/endpoint.go +++ b/pkg/requester/endpoint.go @@ -130,7 +130,7 @@ func (e *BaseEndpoint) SubmitJob(ctx context.Context, data model.JobCreatePayloa JobsSubmitted.Inc(ctx, job.MetricAttributes()...) - err = e.store.CreateJob(ctx, *job) + err = e.store.CreateJob(ctx, *job, orchestrator.JobSubmittedEvent()) if err != nil { return nil, err } @@ -185,7 +185,7 @@ func (e *BaseEndpoint) CancelJob(ctx context.Context, request CancelJobRequest) }, }, NewState: models.JobStateTypeStopped, - Comment: "job canceled by user", + Event: orchestrator.JobStoppedEvent("job canceled by user"), }) if err != nil { return CancelJobResult{}, err @@ -243,9 +243,9 @@ func (e *BaseEndpoint) OnBidComplete(ctx context.Context, response compute.BidRe }, }, NewValues: models.Execution{ - ComputeState: models.NewExecutionState(models.ExecutionStateAskForBidAccepted).WithMessage(response.Reason), + ComputeState: models.NewExecutionState(models.ExecutionStateAskForBidAccepted).WithMessage(response.Event.Message), }, - Comment: response.Reason, + Event: response.Event, } if !response.Accepted { @@ -340,7 +340,7 @@ func (e *BaseEndpoint) OnComputeFailure(ctx context.Context, result compute.Comp ComputeState: models.NewExecutionState(models.ExecutionStateFailed).WithMessage(result.Error()), DesiredState: models.NewExecutionDesiredState(models.ExecutionDesiredStateStopped).WithMessage("execution failed"), }, - Comment: result.Err, + Event: result.Event, }) if err != nil { log.Ctx(ctx).Error().Err(err).Msgf("[OnComputeFailure] failed to update execution") diff --git a/pkg/test/logstream/stream_address_test.go b/pkg/test/logstream/stream_address_test.go index d729c1d5e7..0078d31fc5 100644 --- a/pkg/test/logstream/stream_address_test.go +++ b/pkg/test/logstream/stream_address_test.go @@ -31,7 +31,7 @@ func (s *LogStreamTestSuite) TestStreamAddress() { execution.NodeID = node.ID execution.AllocateResources(task.Name, models.Resources{}) - err := node.RequesterNode.JobStore.CreateJob(s.ctx, *job) + err := node.RequesterNode.JobStore.CreateJob(s.ctx, *job, models.Event{}) require.NoError(s.T(), err) exec, err := node.ComputeNode.Executors.Get(s.ctx, models.EngineDocker) @@ -71,7 +71,7 @@ func (s *LogStreamTestSuite) TestStreamAddress() { node.ComputeNode.ExecutionStore.CreateExecution(s.ctx, *localExecutionState) execution.ComputeState.StateType = models.ExecutionStateBidAccepted - err = node.RequesterNode.JobStore.CreateExecution(s.ctx, *execution) + err = node.RequesterNode.JobStore.CreateExecution(s.ctx, *execution, models.Event{}) require.NoError(s.T(), err) ch, err := node.RequesterNode.EndpointV2.ReadLogs(s.ctx, orchestrator.ReadLogsRequest{ diff --git a/pkg/test/requester/retries_test.go b/pkg/test/requester/retries_test.go index 65d89c3c7f..58a5970cb0 100644 --- a/pkg/test/requester/retries_test.go +++ b/pkg/test/requester/retries_test.go @@ -273,8 +273,7 @@ func (s *RetriesSuite) TestRetry() { model.ExecutionStateCancelled: NewIntMatch(1), }, expectedExecutionErrors: map[model.ExecutionStateType]string{ - model.ExecutionStateFailed: errExecution.Error(), - model.ExecutionStateCancelled: "overall job has failed", + model.ExecutionStateFailed: errExecution.Error(), }, }, { From 3b3d8a8a7e4ddd79f38a4cd4b731885237e45245 Mon Sep 17 00:00:00 2001 From: Forrest <6546409+frrist@users.noreply.github.com> Date: Wed, 17 Apr 2024 22:31:33 -0700 Subject: [PATCH 11/17] fix: remove Liveness 
and Acceptance from NodeInfo (#3785)

- fixes #3783
- Introduces a NodeState type used to track NodeInfo, Connection, and
Membership. Removes the idea of Connection and Membership from data
sent by compute nodes to the Requester(s) since compute nodes should not
influence their Connection state or membership. Those are values
related to the Requester's view of the network.

---------

Co-authored-by: frrist
---
 cmd/cli/agent/node.go | 2 +-
 cmd/cli/agent/node_test.go | 11 +-
 cmd/cli/job/describe.go | 7 +-
 cmd/cli/node/columns.go | 49 +++----
 cmd/cli/serve/util.go | 13 +-
 .../lambda/cmd/alarm_slack_handler/utils.go | 9 +-
 pkg/compute/management_client.go | 3 +-
 pkg/devstack/option.go | 2 +-
 pkg/libp2p/transport/libp2p.go | 23 ++--
 pkg/models/node_approval.go | 70 +++++-----
 pkg/models/node_connection.go | 97 ++++++++++++++
 pkg/models/node_info.go | 14 +-
 pkg/models/node_state.go | 96 +-------------
 pkg/nats/transport/nats.go | 17 +--
 pkg/node/config_defaults.go | 4 +-
 pkg/node/config_requester.go | 7 +-
 pkg/node/heartbeat/heartbeat_test.go | 15 ++-
 pkg/node/heartbeat/server.go | 16 +--
 pkg/node/manager/node_manager.go | 120 ++++++++++--------
 pkg/node/node.go | 27 ++--
 pkg/node/requester.go | 22 ++--
 pkg/orchestrator/interfaces.go | 4 +-
 pkg/orchestrator/scheduler/batch_job_test.go | 43 ++++---
 pkg/orchestrator/scheduler/daemon_job_test.go | 17 +--
 pkg/orchestrator/scheduler/ops_job_test.go | 17 +--
 .../scheduler/service_job_test.go | 57 +++++----
 pkg/orchestrator/scheduler/utils_test.go | 6 +-
 .../selection/discovery/chained.go | 63 ---------
 .../selection/discovery/chained_test.go | 83 ------------
 pkg/orchestrator/selection/discovery/fixed.go | 26 ----
 .../selection/discovery/info_provider.go | 2 +-
 pkg/orchestrator/selection/discovery/store.go | 31 -----
 .../selection/discovery/store_test.go | 70 ----------
 .../selection/selector/node_selector.go | 36 ++++--
 pkg/publicapi/apimodels/agent.go | 2 +-
 pkg/publicapi/apimodels/node.go | 7 +-
 pkg/publicapi/endpoint/agent/endpoint.go | 15 ++-
 pkg/publicapi/endpoint/orchestrator/node.go | 42 +++---
 .../endpoint/requester/endpoints_nodes.go | 2 +-
 pkg/publicapi/endpoint/shared/endpoint.go | 23 ++--
 pkg/publicapi/test/agent_test.go | 38 ++----
 pkg/routing/inmemory/inmemory.go | 68 +++++-----
 pkg/routing/inmemory/inmemory_test.go | 77 ++++++-----
 pkg/routing/kvstore/kvstore.go | 56 ++++----
 pkg/routing/kvstore/kvstore_test.go | 75 ++++++-----
 pkg/routing/node_info_provider.go | 45 ++++---
 pkg/routing/node_info_publisher.go | 31 ++---
 pkg/routing/tracing/tracing.go | 12 +-
 pkg/routing/types.go | 20 +--
 pkg/swagger/docs.go | 2 +-
 pkg/test/teststack/stack.go | 4 +-
 pkg/test/utils/node/utils.go | 10 +-
 pkg/transport/interfaces.go | 2 +-
 test/labels.sh | 14 +-
 54 files changed, 711 insertions(+), 913 deletions(-)
 create mode 100644 pkg/models/node_connection.go
 delete mode 100644 pkg/orchestrator/selection/discovery/chained.go
 delete mode 100644 pkg/orchestrator/selection/discovery/chained_test.go
 delete mode 100644 pkg/orchestrator/selection/discovery/fixed.go
 delete mode 100644 pkg/orchestrator/selection/discovery/store.go
 delete mode 100644 pkg/orchestrator/selection/discovery/store_test.go

diff --git a/cmd/cli/agent/node.go b/cmd/cli/agent/node.go
index 4fe10f36c5..8781a76c63 100644
--- a/cmd/cli/agent/node.go
+++ b/cmd/cli/agent/node.go
@@ -43,7 +43,7 @@ func (o *NodeOptions) runNode(cmd *cobra.Command, _ []string) error {
 		return fmt.Errorf("could not get server node: %w", err)
 	}
 
-	writeErr := output.OutputOneNonTabular(cmd, o.OutputOpts, 
response.NodeInfo) + writeErr := output.OutputOneNonTabular(cmd, o.OutputOpts, response.NodeState) if writeErr != nil { return fmt.Errorf("failed to write node: %w", writeErr) } diff --git a/cmd/cli/agent/node_test.go b/cmd/cli/agent/node_test.go index f23a63243b..ffe404c2d7 100644 --- a/cmd/cli/agent/node_test.go +++ b/cmd/cli/agent/node_test.go @@ -5,9 +5,10 @@ package agent_test import ( "testing" + "github.com/stretchr/testify/suite" + "github.com/bacalhau-project/bacalhau/pkg/lib/marshaller" "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/stretchr/testify/suite" cmdtesting "github.com/bacalhau-project/bacalhau/cmd/testing" "github.com/bacalhau-project/bacalhau/cmd/util/output" @@ -25,18 +26,18 @@ func (s *NodeSuite) TestNodeJSONOutput() { _, out, err := s.ExecuteTestCobraCommand("agent", "node", "--output", string(output.JSONFormat)) s.Require().NoError(err, "Could not request node with json output.") - nodeInfo := &models.NodeInfo{} + nodeInfo := &models.NodeState{} err = marshaller.JSONUnmarshalWithMax([]byte(out), &nodeInfo) s.Require().NoError(err, "Could not unmarshall the output into json - %+v", err) - s.Require().Equal(s.Node.ID, nodeInfo.ID(), "Node ID does not match in json.") + s.Require().Equal(s.Node.ID, nodeInfo.Info.ID(), "Node ID does not match in json.") } func (s *NodeSuite) TestNodeYAMLOutput() { _, out, err := s.ExecuteTestCobraCommand("agent", "node") s.Require().NoError(err, "Could not request node with yaml output.") - nodeInfo := &models.NodeInfo{} + nodeInfo := &models.NodeState{} err = marshaller.YAMLUnmarshalWithMax([]byte(out), &nodeInfo) s.Require().NoError(err, "Could not unmarshall the output into yaml - %+v", err) - s.Require().Equal(s.Node.ID, nodeInfo.ID(), "Node ID does not match in yaml.") + s.Require().Equal(s.Node.ID, nodeInfo.Info.ID(), "Node ID does not match in yaml.") } diff --git a/cmd/cli/job/describe.go b/cmd/cli/job/describe.go index 736291370a..e3244b6b45 100644 --- a/cmd/cli/job/describe.go +++ b/cmd/cli/job/describe.go @@ -6,15 +6,16 @@ import ( "slices" "time" - "github.com/bacalhau-project/bacalhau/pkg/lib/collections" - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/util/idgen" "github.com/jedib0t/go-pretty/v6/table" "github.com/jedib0t/go-pretty/v6/text" "github.com/samber/lo" "github.com/spf13/cobra" "k8s.io/kubectl/pkg/util/i18n" + "github.com/bacalhau-project/bacalhau/pkg/lib/collections" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/util/idgen" + "github.com/bacalhau-project/bacalhau/cmd/util" "github.com/bacalhau-project/bacalhau/cmd/util/flags/cliflags" "github.com/bacalhau-project/bacalhau/cmd/util/output" diff --git a/cmd/cli/node/columns.go b/cmd/cli/node/columns.go index 5e14ea579d..9663922797 100644 --- a/cmd/cli/node/columns.go +++ b/cmd/cli/node/columns.go @@ -4,35 +4,36 @@ import ( "fmt" "strings" - "github.com/bacalhau-project/bacalhau/cmd/util/output" - "github.com/bacalhau-project/bacalhau/pkg/model" - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/util/idgen" "github.com/c2h5oh/datasize" "github.com/jedib0t/go-pretty/v6/table" "github.com/jedib0t/go-pretty/v6/text" "github.com/samber/lo" "golang.org/x/exp/slices" + + "github.com/bacalhau-project/bacalhau/cmd/util/output" + "github.com/bacalhau-project/bacalhau/pkg/model" + "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/util/idgen" ) -var alwaysColumns = 
[]output.TableColumn[*models.NodeInfo]{ +var alwaysColumns = []output.TableColumn[*models.NodeState]{ { ColumnConfig: table.ColumnConfig{Name: "id"}, - Value: func(node *models.NodeInfo) string { return idgen.ShortNodeID(node.ID()) }, + Value: func(node *models.NodeState) string { return idgen.ShortNodeID(node.Info.ID()) }, }, { ColumnConfig: table.ColumnConfig{Name: "type"}, - Value: func(ni *models.NodeInfo) string { return ni.NodeType.String() }, + Value: func(ni *models.NodeState) string { return ni.Info.NodeType.String() }, }, { ColumnConfig: table.ColumnConfig{Name: "approval"}, - Value: func(ni *models.NodeInfo) string { return ni.Approval.String() }, + Value: func(ni *models.NodeState) string { return ni.Membership.String() }, }, { ColumnConfig: table.ColumnConfig{Name: "status"}, - Value: func(ni *models.NodeInfo) string { - if ni.ComputeNodeInfo != nil { - return ni.State.String() + Value: func(ni *models.NodeState) string { + if ni.Info.ComputeNodeInfo != nil { + return ni.Connection.String() } return "" // nothing for requester nodes @@ -40,12 +41,12 @@ var alwaysColumns = []output.TableColumn[*models.NodeInfo]{ }, } -var toggleColumns = map[string][]output.TableColumn[*models.NodeInfo]{ +var toggleColumns = map[string][]output.TableColumn[*models.NodeState]{ "labels": { { ColumnConfig: table.ColumnConfig{Name: "labels", WidthMax: 50, WidthMaxEnforcer: text.WrapSoft}, - Value: func(ni *models.NodeInfo) string { - labels := lo.MapToSlice(ni.Labels, func(key, val string) string { return fmt.Sprintf("%s=%s", key, val) }) + Value: func(ni *models.NodeState) string { + labels := lo.MapToSlice(ni.Info.Labels, func(key, val string) string { return fmt.Sprintf("%s=%s", key, val) }) slices.Sort(labels) return strings.Join(labels, " ") }, @@ -54,20 +55,20 @@ var toggleColumns = map[string][]output.TableColumn[*models.NodeInfo]{ "version": { { ColumnConfig: table.ColumnConfig{Name: "version"}, - Value: func(ni *models.NodeInfo) string { - return ni.BacalhauVersion.GitVersion + Value: func(ni *models.NodeState) string { + return ni.Info.BacalhauVersion.GitVersion }, }, { ColumnConfig: table.ColumnConfig{Name: "architecture"}, - Value: func(ni *models.NodeInfo) string { - return ni.BacalhauVersion.GOARCH + Value: func(ni *models.NodeState) string { + return ni.Info.BacalhauVersion.GOARCH }, }, { ColumnConfig: table.ColumnConfig{Name: "os"}, - Value: func(ni *models.NodeInfo) string { - return ni.BacalhauVersion.GOOS + Value: func(ni *models.NodeState) string { + return ni.Info.BacalhauVersion.GOOS }, }, }, @@ -123,11 +124,11 @@ func maxLen(val []string) int { return lo.Max(lo.Map[string, int](val, func(item string, index int) int { return len(item) })) + 1 } -func ifComputeNode(getFromCNInfo func(*models.ComputeNodeInfo) string) func(*models.NodeInfo) string { - return func(ni *models.NodeInfo) string { - if ni.ComputeNodeInfo == nil { +func ifComputeNode(getFromCNInfo func(*models.ComputeNodeInfo) string) func(state *models.NodeState) string { + return func(ni *models.NodeState) string { + if ni.Info.ComputeNodeInfo == nil { return "" } - return getFromCNInfo(ni.ComputeNodeInfo) + return getFromCNInfo(ni.Info.ComputeNodeInfo) } } diff --git a/cmd/cli/serve/util.go b/cmd/cli/serve/util.go index e178f827a4..d3650094cf 100644 --- a/cmd/cli/serve/util.go +++ b/cmd/cli/serve/util.go @@ -7,16 +7,17 @@ import ( "path/filepath" "time" + pkgerrors "github.com/pkg/errors" + "github.com/rs/zerolog/log" + "github.com/samber/lo" + "github.com/spf13/viper" + 
"github.com/bacalhau-project/bacalhau/pkg/compute/store" "github.com/bacalhau-project/bacalhau/pkg/compute/store/boltdb" "github.com/bacalhau-project/bacalhau/pkg/jobstore" boltjobstore "github.com/bacalhau-project/bacalhau/pkg/jobstore/boltdb" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/util/idgen" - pkgerrors "github.com/pkg/errors" - "github.com/rs/zerolog/log" - "github.com/samber/lo" - "github.com/spf13/viper" "github.com/bacalhau-project/bacalhau/cmd/util/flags/configflags" "github.com/bacalhau-project/bacalhau/pkg/orchestrator/transformer" @@ -127,9 +128,9 @@ func GetRequesterConfig(ctx context.Context, createJobStore bool) (node.Requeste } if cfg.ManualNodeApproval { - requesterConfig.DefaultApprovalState = models.NodeApprovals.PENDING + requesterConfig.DefaultApprovalState = models.NodeMembership.PENDING } else { - requesterConfig.DefaultApprovalState = models.NodeApprovals.APPROVED + requesterConfig.DefaultApprovalState = models.NodeMembership.APPROVED } return requesterConfig, nil diff --git a/ops/aws/canary/lambda/cmd/alarm_slack_handler/utils.go b/ops/aws/canary/lambda/cmd/alarm_slack_handler/utils.go index 1664b3546d..812b8536b1 100644 --- a/ops/aws/canary/lambda/cmd/alarm_slack_handler/utils.go +++ b/ops/aws/canary/lambda/cmd/alarm_slack_handler/utils.go @@ -2,20 +2,21 @@ package main import ( "encoding/json" + "os" + "strconv" + "time" + "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/secretsmanager" "github.com/slack-go/slack" - "os" - "strconv" - "time" ) func mustGetSlackSecret() slackSecretType { secretName := os.Getenv("SLACK_SECRET_NAME") - //Create a Secrets Manager client + // Create a Secrets Manager client sess, err := session.NewSession() if err != nil { panic(err) diff --git a/pkg/compute/management_client.go b/pkg/compute/management_client.go index c0c9946cd0..11bbb990e1 100644 --- a/pkg/compute/management_client.go +++ b/pkg/compute/management_client.go @@ -57,11 +57,12 @@ func NewManagementClient(params *ManagementClientParams) *ManagementClient { } func (m *ManagementClient) getNodeInfo(ctx context.Context) models.NodeInfo { - return m.nodeInfoDecorator.DecorateNodeInfo(ctx, models.NodeInfo{ + ni := m.nodeInfoDecorator.DecorateNodeInfo(ctx, models.NodeInfo{ NodeID: m.nodeID, NodeType: models.NodeTypeCompute, Labels: m.labelsProvider.GetLabels(ctx), }) + return ni } // RegisterNode sends a registration request to the requester node. 
If we successfully diff --git a/pkg/devstack/option.go b/pkg/devstack/option.go index 9de52c60b3..208a9e01d3 100644 --- a/pkg/devstack/option.go +++ b/pkg/devstack/option.go @@ -124,7 +124,7 @@ func (o *DevStackConfig) Validate() error { func WithAutoNodeApproval() ConfigOption { return func(cfg *DevStackConfig) { - cfg.RequesterConfig.DefaultApprovalState = models.NodeApprovals.APPROVED + cfg.RequesterConfig.DefaultApprovalState = models.NodeMembership.APPROVED } } diff --git a/pkg/libp2p/transport/libp2p.go b/pkg/libp2p/transport/libp2p.go index 3fe9fd623a..2e258e4c53 100644 --- a/pkg/libp2p/transport/libp2p.go +++ b/pkg/libp2p/transport/libp2p.go @@ -6,6 +6,13 @@ import ( "fmt" "time" + libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" + "github.com/libp2p/go-libp2p/core/host" + basichost "github.com/libp2p/go-libp2p/p2p/host/basic" + routedhost "github.com/libp2p/go-libp2p/p2p/host/routed" + "github.com/libp2p/go-libp2p/p2p/protocol/identify" + "github.com/multiformats/go-multiaddr" + "github.com/bacalhau-project/bacalhau/pkg/compute" pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/lib/validate" @@ -18,12 +25,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/system" core_transport "github.com/bacalhau-project/bacalhau/pkg/transport" "github.com/bacalhau-project/bacalhau/pkg/transport/bprotocol" - libp2p_pubsub "github.com/libp2p/go-libp2p-pubsub" - "github.com/libp2p/go-libp2p/core/host" - basichost "github.com/libp2p/go-libp2p/p2p/host/basic" - routedhost "github.com/libp2p/go-libp2p/p2p/host/routed" - "github.com/libp2p/go-libp2p/p2p/protocol/identify" - "github.com/multiformats/go-multiaddr" ) const NodeInfoTopic = "bacalhau-node-info" @@ -46,7 +47,7 @@ type Libp2pTransport struct { Host host.Host computeProxy *bprotocol.ComputeProxy callbackProxy *bprotocol.CallbackProxy - nodeInfoPubSub pubsub.PubSub[models.NodeInfo] + nodeInfoPubSub pubsub.PubSub[models.NodeState] nodeInfoDecorator models.NodeInfoDecorator } @@ -71,7 +72,7 @@ func NewLibp2pTransport(ctx context.Context, } // PubSub to publish node info to the network - nodeInfoPubSub, err := libp2p.NewPubSub[models.NodeInfo](libp2p.PubSubParams{ + nodeInfoPubSub, err := libp2p.NewPubSub[models.NodeState](libp2p.PubSubParams{ Host: libp2pHost, TopicName: NodeInfoTopic, PubSub: gossipSub, @@ -129,8 +130,8 @@ func NewLibp2pTransport(ctx context.Context, func (t *Libp2pTransport) RegisterNodeInfoConsumer(ctx context.Context, nodeInfoStore routing.NodeInfoStore) error { // register consumers of node info published over gossipSub - nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) - nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](nodeInfoStore.Add)) + nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeState](true) + nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeState](nodeInfoStore.Add)) return t.nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) } @@ -181,7 +182,7 @@ func (t *Libp2pTransport) ManagementProxy() compute.ManagementEndpoint { } // NodeInfoPubSub returns the node info pubsub. 
-func (t *Libp2pTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] { +func (t *Libp2pTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeState] { return t.nodeInfoPubSub } diff --git a/pkg/models/node_approval.go b/pkg/models/node_approval.go index 93f8e141d6..0f5dc5fcf2 100644 --- a/pkg/models/node_approval.go +++ b/pkg/models/node_approval.go @@ -4,93 +4,95 @@ import ( "fmt" ) -type NodeApproval struct { - approval +// TODO if we ever pass a pointer to this type and use `==` comparison on it we're gonna have a bad time +// implement an `Equal()` method for this type and default to it. +type NodeMembershipState struct { + membership } -type approval int +type membership int const ( - unknown approval = iota + unknown membership = iota pending approved rejected ) var ( - strApprovalArray = [...]string{ + strMembershipArray = [...]string{ pending: "PENDING", approved: "APPROVED", rejected: "REJECTED", } - typeApprovalMap = map[string]approval{ + typeMembershipMap = map[string]membership{ "PENDING": pending, "APPROVED": approved, "REJECTED": rejected, } ) -func (t approval) String() string { - return strApprovalArray[t] +func (t membership) String() string { + return strMembershipArray[t] } -func Parse(a any) NodeApproval { +func Parse(a any) NodeMembershipState { switch v := a.(type) { - case NodeApproval: + case NodeMembershipState: return v case string: - return NodeApproval{stringToApproval(v)} + return NodeMembershipState{stringToApproval(v)} case fmt.Stringer: - return NodeApproval{stringToApproval(v.String())} + return NodeMembershipState{stringToApproval(v.String())} case int: - return NodeApproval{approval(v)} + return NodeMembershipState{membership(v)} case int64: - return NodeApproval{approval(int(v))} + return NodeMembershipState{membership(int(v))} case int32: - return NodeApproval{approval(int(v))} + return NodeMembershipState{membership(int(v))} } - return NodeApproval{unknown} + return NodeMembershipState{unknown} } -func stringToApproval(s string) approval { - if v, ok := typeApprovalMap[s]; ok { +func stringToApproval(s string) membership { + if v, ok := typeMembershipMap[s]; ok { return v } return unknown } -func (t approval) IsValid() bool { - return t >= approval(1) && t <= approval(len(strApprovalArray)) +func (t membership) IsValid() bool { + return t >= membership(1) && t <= membership(len(strMembershipArray)) } -type approvalsContainer struct { - UNKNOWN NodeApproval - PENDING NodeApproval - APPROVED NodeApproval - REJECTED NodeApproval +type membershipContainer struct { + UNKNOWN NodeMembershipState + PENDING NodeMembershipState + APPROVED NodeMembershipState + REJECTED NodeMembershipState } -var NodeApprovals = approvalsContainer{ - UNKNOWN: NodeApproval{unknown}, - PENDING: NodeApproval{pending}, - APPROVED: NodeApproval{approved}, - REJECTED: NodeApproval{rejected}, +var NodeMembership = membershipContainer{ + UNKNOWN: NodeMembershipState{unknown}, + PENDING: NodeMembershipState{pending}, + APPROVED: NodeMembershipState{approved}, + REJECTED: NodeMembershipState{rejected}, } -func (c approvalsContainer) All() []NodeApproval { - return []NodeApproval{ +func (c membershipContainer) All() []NodeMembershipState { + return []NodeMembershipState{ c.PENDING, c.APPROVED, c.REJECTED, } } -func (t NodeApproval) MarshalJSON() ([]byte, error) { +func (t NodeMembershipState) MarshalJSON() ([]byte, error) { return []byte(`"` + t.String() + `"`), nil } -func (t *NodeApproval) UnmarshalJSON(b []byte) error { +func (t *NodeMembershipState) UnmarshalJSON(b []byte) error { 
 	val := string(trimQuotes(b))
 	*t = Parse(val)
 	return nil
diff --git a/pkg/models/node_connection.go b/pkg/models/node_connection.go
new file mode 100644
index 0000000000..a8fe879f35
--- /dev/null
+++ b/pkg/models/node_connection.go
@@ -0,0 +1,97 @@
+package models
+
+import (
+	"fmt"
+)
+
+// TODO if we ever pass a pointer to this type and use `==` comparison on it we're gonna have a bad time
+// implement an `Equal()` method for this type and default to it.
+type NodeConnectionState struct {
+	connection
+}
+
+type connection int
+
+// To add a new state (for instance, a state beyond which the node is considered
+// lost) then:
+// * add it to the end of the list in the const below
+// * add it to strConnectionArray and typeConnectionMap
+// * add it to the livenessContainer and corresponding NodeStates var.
+// * add it to the All() method in the livenessContainer
+const (
+	connected connection = iota
+	disconnected
+)
+
+var (
+	strConnectionArray = [...]string{
+		connected:    "CONNECTED",
+		disconnected: "DISCONNECTED",
+	}
+
+	typeConnectionMap = map[string]connection{
+		"CONNECTED":    connected,
+		"DISCONNECTED": disconnected,
+	}
+)
+
+func (t connection) String() string {
+	return strConnectionArray[t]
+}
+
+func ParseConnection(a any) NodeConnectionState {
+	switch v := a.(type) {
+	case NodeConnectionState:
+		return v
+	case string:
+		return NodeConnectionState{stringToConnection(v)}
+	case fmt.Stringer:
+		return NodeConnectionState{stringToConnection(v.String())}
+	case int:
+		return NodeConnectionState{connection(v)}
+	case int64:
+		return NodeConnectionState{connection(int(v))}
+	case int32:
+		return NodeConnectionState{connection(int(v))}
+	}
+	return NodeConnectionState{disconnected}
+}
+
+func stringToConnection(s string) connection {
+	if v, ok := typeConnectionMap[s]; ok {
+		return v
+	}
+	return disconnected
+}
+
+func (t connection) IsValid() bool {
+	return t >= connection(1) && t <= connection(len(strConnectionArray))
+}
+
+type livenessContainer struct {
+	CONNECTED    NodeConnectionState
+	DISCONNECTED NodeConnectionState
+	HEALTHY      NodeConnectionState
+}
+
+var NodeStates = livenessContainer{
+	CONNECTED:    NodeConnectionState{connected},
+	DISCONNECTED: NodeConnectionState{disconnected},
+}
+
+func (c livenessContainer) All() []NodeConnectionState {
+	return []NodeConnectionState{
+		c.CONNECTED,
+		c.DISCONNECTED,
+	}
+}
+
+func (s NodeConnectionState) MarshalJSON() ([]byte, error) {
+	return []byte(`"` + s.String() + `"`), nil
+}
+
+func (s *NodeConnectionState) UnmarshalJSON(b []byte) error {
+	val := string(trimQuotes(b))
+	*s = ParseConnection(val)
+	return nil
+}
diff --git a/pkg/models/node_info.go b/pkg/models/node_info.go
index 1a0a5a5dc2..793ef499ca 100644
--- a/pkg/models/node_info.go
+++ b/pkg/models/node_info.go
@@ -38,8 +38,8 @@ func (e *NodeType) UnmarshalText(text []byte) (err error) {
 	return
 }
 
-type NodeInfoProvider interface {
-	GetNodeInfo(ctx context.Context) NodeInfo
+type NodeStateProvider interface {
+	GetNodeState(ctx context.Context) NodeState
 }
 
 type LabelsProvider interface {
@@ -74,17 +74,17 @@ func (n NoopNodeInfoDecorator) DecorateNodeInfo(ctx context.Context, nodeInfo No
 	return nodeInfo
 }
 
-// NodeInfo
-// TODO: add Validate() method to NodeInfo and make sure it is called in all the places where it is initialized
+// NodeInfo contains metadata about a node on the network. Compute nodes share their NodeInfo with Requester nodes
+// to further their view of the network's conditions. ComputeNodeInfo is non-nil iff the NodeType is NodeTypeCompute.
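+//
+// Purely for illustration (values and rendering are hypothetical, trimmed):
+//
+//	{
+//	  "NodeID": "node-1",
+//	  "NodeType": "Compute",
+//	  "Labels": {"region": "eu-west"},
+//	  "ComputeNodeInfo": {"ExecutionEngines": ["docker"], "Publishers": ["s3"]}
+//	}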
+// TODO(walid): add Validate() method to NodeInfo and make sure it is called in all the places where it is initialized type NodeInfo struct { + // TODO replace all access on this field with the `ID()` method NodeID string `json:"NodeID"` PeerInfo *peer.AddrInfo `json:"PeerInfo,omitempty" yaml:",omitempty"` NodeType NodeType `json:"NodeType"` Labels map[string]string `json:"Labels"` ComputeNodeInfo *ComputeNodeInfo `json:"ComputeNodeInfo,omitempty" yaml:",omitempty"` BacalhauVersion BuildVersionInfo `json:"BacalhauVersion"` - Approval NodeApproval `json:"Approval"` - State NodeState `json:"State"` } // ID returns the node ID @@ -102,6 +102,8 @@ func (n NodeInfo) IsComputeNode() bool { return n.NodeType == NodeTypeCompute } +// ComputeNodeInfo contains metadata about the current state and abilities of a compute node. Compute Nodes share +// this state with Requester nodes by including it in the NodeInfo they share across the network. type ComputeNodeInfo struct { ExecutionEngines []string `json:"ExecutionEngines"` Publishers []string `json:"Publishers"` diff --git a/pkg/models/node_state.go b/pkg/models/node_state.go index 94aea6101b..c9130216ef 100644 --- a/pkg/models/node_state.go +++ b/pkg/models/node_state.go @@ -1,95 +1,9 @@ package models -import ( - "fmt" -) - +// NodeState contains metadata about the state of a node on the network. Requester nodes maintain a NodeState for +// each node they are aware of. The NodeState represents a Requester nodes view of another node on the network. type NodeState struct { - liveness -} - -type liveness int - -// To add a new state (for instance, a state beyond which the node is considered -// lost) then: -// * add it to the end of the list in the const below -// * add it to strLivenessArray and typeLivenessMap -// * add it to the livenessContainer and corresponding NodeStates var. 
-// * add it to the All() method in the livenessContainer -const ( - connected liveness = iota - disconnected -) - -var ( - strLivenessArray = [...]string{ - connected: "CONNECTED", - disconnected: "DISCONNECTED", - } - - typeLivenessMap = map[string]liveness{ - "CONNECTED": connected, - "DISCONNECTED": disconnected, - } -) - -func (t liveness) String() string { - return strLivenessArray[t] -} - -func ParseState(a any) NodeState { - switch v := a.(type) { - case NodeState: - return v - case string: - return NodeState{stringToLiveness(v)} - case fmt.Stringer: - return NodeState{stringToLiveness(v.String())} - case int: - return NodeState{liveness(v)} - case int64: - return NodeState{liveness(int(v))} - case int32: - return NodeState{liveness(int(v))} - } - return NodeState{disconnected} -} - -func stringToLiveness(s string) liveness { - if v, ok := typeLivenessMap[s]; ok { - return v - } - return disconnected -} - -func (t liveness) IsValid() bool { - return t >= liveness(1) && t <= liveness(len(strLivenessArray)) -} - -type livenessContainer struct { - CONNECTED NodeState - DISCONNECTED NodeState - HEALTHY NodeState -} - -var NodeStates = livenessContainer{ - CONNECTED: NodeState{connected}, - DISCONNECTED: NodeState{disconnected}, -} - -func (c livenessContainer) All() []NodeState { - return []NodeState{ - c.CONNECTED, - c.DISCONNECTED, - } -} - -func (s NodeState) MarshalJSON() ([]byte, error) { - return []byte(`"` + s.String() + `"`), nil -} - -func (s *NodeState) UnmarshalJSON(b []byte) error { - val := string(trimQuotes(b)) - *s = ParseState(val) - return nil + Info NodeInfo `json:"Info"` + Membership NodeMembershipState `json:"Membership"` + Connection NodeConnectionState `json:"Connection"` } diff --git a/pkg/nats/transport/nats.go b/pkg/nats/transport/nats.go index f249c322b1..48cb0d91ac 100644 --- a/pkg/nats/transport/nats.go +++ b/pkg/nats/transport/nats.go @@ -6,6 +6,10 @@ import ( "fmt" "strings" + "github.com/nats-io/nats-server/v2/server" + "github.com/nats-io/nats.go" + "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/compute" "github.com/bacalhau-project/bacalhau/pkg/lib/validate" "github.com/bacalhau-project/bacalhau/pkg/model" @@ -16,9 +20,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/pubsub" "github.com/bacalhau-project/bacalhau/pkg/routing" core_transport "github.com/bacalhau-project/bacalhau/pkg/transport" - "github.com/nats-io/nats-server/v2/server" - "github.com/nats-io/nats.go" - "github.com/rs/zerolog/log" ) const NodeInfoSubjectPrefix = "node.info." 
@@ -85,7 +86,7 @@ type NATSTransport struct { natsClient *nats_helper.ClientManager computeProxy compute.Endpoint callbackProxy compute.Callback - nodeInfoPubSub pubsub.PubSub[models.NodeInfo] + nodeInfoPubSub pubsub.PubSub[models.NodeState] nodeInfoDecorator models.NodeInfoDecorator managementProxy compute.ManagementEndpoint } @@ -149,7 +150,7 @@ func NewNATSTransport(ctx context.Context, } // PubSub to publish and consume node info messages - nodeInfoPubSub, err := nats_pubsub.NewPubSub[models.NodeInfo](nats_pubsub.PubSubParams{ + nodeInfoPubSub, err := nats_pubsub.NewPubSub[models.NodeState](nats_pubsub.PubSubParams{ Conn: nc.Client, Subject: NodeInfoSubjectPrefix + config.NodeID, SubscriptionSubject: NodeInfoSubjectPrefix + "*", @@ -206,8 +207,8 @@ func CreateClient(ctx context.Context, config *NATSTransportConfig) (*nats_helpe func (t *NATSTransport) RegisterNodeInfoConsumer(ctx context.Context, infostore routing.NodeInfoStore) error { // subscribe to nodeInfo subject and add nodeInfo to nodeInfoStore - nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeInfo](true) - nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeInfo](infostore.Add)) + nodeInfoSubscriber := pubsub.NewChainedSubscriber[models.NodeState](true) + nodeInfoSubscriber.Add(pubsub.SubscriberFunc[models.NodeState](infostore.Add)) return t.nodeInfoPubSub.Subscribe(ctx, nodeInfoSubscriber) } @@ -256,7 +257,7 @@ func (t *NATSTransport) ManagementProxy() compute.ManagementEndpoint { } // NodeInfoPubSub returns the node info pubsub. -func (t *NATSTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] { +func (t *NATSTransport) NodeInfoPubSub() pubsub.PubSub[models.NodeState] { return t.nodeInfoPubSub } diff --git a/pkg/node/config_defaults.go b/pkg/node/config_defaults.go index 283c834f04..2eb4e54252 100644 --- a/pkg/node/config_defaults.go +++ b/pkg/node/config_defaults.go @@ -75,7 +75,7 @@ var DefaultRequesterConfig = RequesterConfigParams{ NodeDisconnectedAfter: types.Duration(30 * time.Second), //nolint:gomnd }, - DefaultApprovalState: models.NodeApprovals.APPROVED, + DefaultApprovalState: models.NodeMembership.APPROVED, } var TestRequesterConfig = RequesterConfigParams{ @@ -111,7 +111,7 @@ var TestRequesterConfig = RequesterConfigParams{ NodeDisconnectedAfter: types.Duration(30 * time.Second), //nolint:gomnd }, - DefaultApprovalState: models.NodeApprovals.APPROVED, + DefaultApprovalState: models.NodeMembership.APPROVED, } func getRequesterConfigParams() RequesterConfigParams { diff --git a/pkg/node/config_requester.go b/pkg/node/config_requester.go index e45df01f3d..d6b683c11c 100644 --- a/pkg/node/config_requester.go +++ b/pkg/node/config_requester.go @@ -5,11 +5,12 @@ import ( "net/url" "time" - "github.com/bacalhau-project/bacalhau/pkg/config/types" - "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/imdario/mergo" "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/config/types" + "github.com/bacalhau-project/bacalhau/pkg/jobstore" + "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" @@ -56,7 +57,7 @@ type RequesterConfigParams struct { // When new nodes join the cluster, what state do they have? By default, APPROVED, and // for tests, APPROVED. We will provide an option to set this to PENDING for production // or for when operators are ready to control node approval. 
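 	//
 	// A sketch of how callers pick this value (mirroring GetRequesterConfig
 	// in cmd/cli/serve/util.go from this same patch):
 	//
 	//	if cfg.ManualNodeApproval {
 	//		requesterConfig.DefaultApprovalState = models.NodeMembership.PENDING
 	//	} else {
 	//		requesterConfig.DefaultApprovalState = models.NodeMembership.APPROVED
 	//	}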
- DefaultApprovalState models.NodeApproval + DefaultApprovalState models.NodeMembershipState ControlPlaneSettings types.RequesterControlPlaneConfig } diff --git a/pkg/node/heartbeat/heartbeat_test.go b/pkg/node/heartbeat/heartbeat_test.go index 9ecb1e9bc2..42fe31eb41 100644 --- a/pkg/node/heartbeat/heartbeat_test.go +++ b/pkg/node/heartbeat/heartbeat_test.go @@ -9,12 +9,13 @@ import ( "testing" "time" - "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/benbjohnson/clock" "github.com/nats-io/nats-server/v2/server" natsserver "github.com/nats-io/nats-server/v2/test" "github.com/nats-io/nats.go" "github.com/stretchr/testify/suite" + + "github.com/bacalhau-project/bacalhau/pkg/models" ) const ( @@ -74,7 +75,7 @@ func (s *HeartbeatTestSuite) TestSendHeartbeat() { name string includeInitial bool heartbeats []time.Duration - expectedState models.NodeState + expectedState models.NodeConnectionState waitUntil time.Duration } @@ -116,13 +117,13 @@ func (s *HeartbeatTestSuite) TestSendHeartbeat() { } for i, tc := range testcases { - nodeInfo := models.NodeInfo{ - NodeID: "node-" + strconv.Itoa(i), + nodeState := models.NodeState{ + Info: models.NodeInfo{NodeID: "node-" + strconv.Itoa(i)}, } s.T().Run(tc.name, func(t *testing.T) { // Wait for the first heartbeat to be sent - client, err := NewClient(s.client, nodeInfo.NodeID, TestTopic) + client, err := NewClient(s.client, nodeState.Info.NodeID, TestTopic) s.Require().NoError(err) defer client.Close(ctx) @@ -147,8 +148,8 @@ func (s *HeartbeatTestSuite) TestSendHeartbeat() { s.clock.Add(tc.waitUntil) - server.UpdateNodeInfo(&nodeInfo) - s.Require().Equal(nodeInfo.State, tc.expectedState, fmt.Sprintf("incorrect state in %s", tc.name)) + server.UpdateNodeInfo(&nodeState) + s.Require().Equal(nodeState.Connection, tc.expectedState, fmt.Sprintf("incorrect state in %s", tc.name)) }) } } diff --git a/pkg/node/heartbeat/server.go b/pkg/node/heartbeat/server.go index 296b38423d..cc9443da57 100644 --- a/pkg/node/heartbeat/server.go +++ b/pkg/node/heartbeat/server.go @@ -27,7 +27,7 @@ type HeartbeatServer struct { clock clock.Clock subscription *natsPubSub.PubSub[Heartbeat] pqueue *collections.HashedPriorityQueue[string, TimestampedHeartbeat] - livenessMap *concurrency.StripedMap[models.NodeState] + livenessMap *concurrency.StripedMap[models.NodeConnectionState] checkFrequency time.Duration disconnectedAfter time.Duration } @@ -64,7 +64,7 @@ func NewServer(params HeartbeatServerParams) (*HeartbeatServer, error) { clock: clk, subscription: subscription, pqueue: pqueue, - livenessMap: concurrency.NewStripedMap[models.NodeState](0), // no particular stripe count for now + livenessMap: concurrency.NewStripedMap[models.NodeConnectionState](0), // no particular stripe count for now checkFrequency: params.CheckFrequency, disconnectedAfter: params.NodeDisconnectedAfter, }, nil @@ -135,22 +135,22 @@ func (h *HeartbeatServer) CheckQueue(ctx context.Context) { // markNode will mark a node as being in a certain state. This will be used to update the node's // info to include the liveness state. 
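 // For example (timings illustrative): with disconnectedAfter set to 30s, a
 // node whose newest heartbeat is 31s old when CheckQueue next runs would be
 // handled via
 //
 //	h.markNodeAs(nodeID, models.NodeStates.DISCONNECTED)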
-func (h *HeartbeatServer) markNodeAs(nodeID string, state models.NodeState) { +func (h *HeartbeatServer) markNodeAs(nodeID string, state models.NodeConnectionState) { h.livenessMap.Put(nodeID, state) } // UpdateNode will add the liveness for specific nodes to their NodeInfo -func (h *HeartbeatServer) UpdateNodeInfo(nodeInfo *models.NodeInfo) { - if liveness, ok := h.livenessMap.Get(nodeInfo.NodeID); ok { - nodeInfo.State = liveness +func (h *HeartbeatServer) UpdateNodeInfo(state *models.NodeState) { + if liveness, ok := h.livenessMap.Get(state.Info.NodeID); ok { + state.Connection = liveness } else { // We've never seen this, so we'll mark it as unknown - nodeInfo.State = models.NodeStates.DISCONNECTED + state.Connection = models.NodeStates.DISCONNECTED } } // FilterNodeInfos will return only those NodeInfos that have the requested liveness -func (h *HeartbeatServer) FilterNodeInfos(nodeInfos []*models.NodeInfo, state models.NodeState) []*models.NodeInfo { +func (h *HeartbeatServer) FilterNodeInfos(nodeInfos []*models.NodeInfo, state models.NodeConnectionState) []*models.NodeInfo { result := make([]*models.NodeInfo, 0) for _, nodeInfo := range nodeInfos { if liveness, ok := h.livenessMap.Get(nodeInfo.NodeID); ok { diff --git a/pkg/node/manager/node_manager.go b/pkg/node/manager/node_manager.go index a5233ce53a..bb6079c310 100644 --- a/pkg/node/manager/node_manager.go +++ b/pkg/node/manager/node_manager.go @@ -4,15 +4,16 @@ import ( "context" "fmt" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/pkg/errors" + "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/compute" "github.com/bacalhau-project/bacalhau/pkg/lib/concurrency" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/models/requests" "github.com/bacalhau-project/bacalhau/pkg/node/heartbeat" "github.com/bacalhau-project/bacalhau/pkg/routing" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/pkg/errors" - "github.com/rs/zerolog/log" ) const ( @@ -24,16 +25,16 @@ const ( // also provides operations for querying and managing compute // node information. type NodeManager struct { - nodeInfo routing.NodeInfoStore + store routing.NodeInfoStore resourceMap *concurrency.StripedMap[models.Resources] heartbeats *heartbeat.HeartbeatServer - defaultApprovalState models.NodeApproval + defaultApprovalState models.NodeMembershipState } type NodeManagerParams struct { NodeInfo routing.NodeInfoStore Heartbeats *heartbeat.HeartbeatServer - DefaultApprovalState models.NodeApproval + DefaultApprovalState models.NodeMembershipState } // NewNodeManager constructs a new node manager and returns a pointer @@ -41,7 +42,7 @@ type NodeManagerParams struct { func NewNodeManager(params NodeManagerParams) *NodeManager { return &NodeManager{ resourceMap: concurrency.NewStripedMap[models.Resources](resourceMapLockCount), - nodeInfo: params.NodeInfo, + store: params.NodeInfo, heartbeats: params.Heartbeats, defaultApprovalState: params.DefaultApprovalState, } @@ -68,10 +69,10 @@ func (n *NodeManager) Start(ctx context.Context) error { // Register is part of the implementation of the ManagementEndpoint // interface. It is used to register a compute node with the cluster. 
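 //
 // A hedged sketch of the compute-side call (the real caller is the
 // management client's RegisterNode in pkg/compute/management_client.go):
 //
 //	resp, err := endpoint.Register(ctx, requests.RegisterRequest{Info: nodeInfo})
 //	if err == nil && !resp.Accepted {
 //		// surface resp.Reason, e.g. "node has been rejected", to the operator
 //	}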
func (n *NodeManager) Register(ctx context.Context, request requests.RegisterRequest) (*requests.RegisterResponse, error) { - existing, err := n.nodeInfo.Get(ctx, request.Info.NodeID) + existing, err := n.store.Get(ctx, request.Info.NodeID) if err == nil { // If we have already seen this node and rejected it, then let the node know - if existing.Approval == models.NodeApprovals.REJECTED { + if existing.Membership == models.NodeMembership.REJECTED { return &requests.RegisterResponse{ Accepted: false, Reason: "node has been rejected", @@ -86,10 +87,13 @@ func (n *NodeManager) Register(ctx context.Context, request requests.RegisterReq }, nil } - request.Info.Approval = n.defaultApprovalState - - if err := n.nodeInfo.Add(ctx, request.Info); err != nil { - return nil, errors.Wrap(err, "failed to save nodeinfo during node registration") + if err := n.store.Add(ctx, models.NodeState{ + Info: request.Info, + Membership: n.defaultApprovalState, + // NB(forrest): by virtue of a compute node calling this endpoint we can consider it connected + Connection: models.NodeStates.CONNECTED, + }); err != nil { + return nil, errors.Wrap(err, "failed to save nodestate during node registration") } return &requests.RegisterResponse{ @@ -98,9 +102,9 @@ func (n *NodeManager) Register(ctx context.Context, request requests.RegisterReq } // UpdateInfo is part of the implementation of the ManagementEndpoint -// interface. It is used to update the node info for a particular node +// interface. It is used to update the node state for a particular node func (n *NodeManager) UpdateInfo(ctx context.Context, request requests.UpdateInfoRequest) (*requests.UpdateInfoResponse, error) { - existing, err := n.nodeInfo.Get(ctx, request.Info.NodeID) + existing, err := n.store.Get(ctx, request.Info.NodeID) if errors.Is(err, routing.ErrNodeNotFound{}) { return &requests.UpdateInfoResponse{ @@ -110,10 +114,10 @@ func (n *NodeManager) UpdateInfo(ctx context.Context, request requests.UpdateInf } if err != nil { - return nil, errors.Wrap(err, "failed to get nodeinfo during node registration") + return nil, errors.Wrap(err, "failed to get nodestate during node registration") } - if existing.Approval == models.NodeApprovals.REJECTED { + if existing.Membership == models.NodeMembership.REJECTED { return &requests.UpdateInfoResponse{ Accepted: false, Reason: "node registration rejected", @@ -121,8 +125,14 @@ func (n *NodeManager) UpdateInfo(ctx context.Context, request requests.UpdateInf } // TODO: Add a Put endpoint that takes the revision into account? - if err := n.nodeInfo.Add(ctx, request.Info); err != nil { - return nil, errors.Wrap(err, "failed to save nodeinfo during node registration") + if err := n.store.Add(ctx, models.NodeState{ + Info: request.Info, + // the nodes approval state is assumed to be approved here, but re-use existing state + Membership: existing.Membership, + // TODO can we assume the node is connected here? + Connection: models.NodeStates.CONNECTED, + }); err != nil { + return nil, errors.Wrap(err, "failed to save nodestate during node registration") } return &requests.UpdateInfoResponse{ @@ -134,12 +144,12 @@ func (n *NodeManager) UpdateInfo(ctx context.Context, request requests.UpdateInf // is used to augment information about the available resources for each node. 
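 //
 // Illustrative flow: after a node reports its free capacity, e.g.
 //
 //	n.UpdateResources(ctx, requests.UpdateResourcesRequest{NodeID: id, Resources: free})
 //
 // (the Resources field name is assumed, not shown in this diff), the stored
 // value is folded back into Info.ComputeNodeInfo.AvailableCapacity by
 // addToInfo on later Get, GetByPrefix and List calls.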
func (n *NodeManager) UpdateResources(ctx context.Context, request requests.UpdateResourcesRequest) (*requests.UpdateResourcesResponse, error) { - existing, err := n.nodeInfo.Get(ctx, request.NodeID) + existing, err := n.store.Get(ctx, request.NodeID) if errors.Is(err, routing.ErrNodeNotFound{}) { return nil, fmt.Errorf("unable to update resources for missing node: %s", request.NodeID) } - if existing.Approval == models.NodeApprovals.REJECTED { + if existing.Membership == models.NodeMembership.REJECTED { log.Ctx(ctx).Debug().Msg("not updating resources for rejected node ") return &requests.UpdateResourcesResponse{}, nil } @@ -154,44 +164,44 @@ func (n *NodeManager) UpdateResources(ctx context.Context, // ---- Implementation of routing.NodeInfoStore ---- func (n *NodeManager) FindPeer(ctx context.Context, peerID peer.ID) (peer.AddrInfo, error) { - return n.nodeInfo.FindPeer(ctx, peerID) + return n.store.FindPeer(ctx, peerID) } -func (n *NodeManager) Add(ctx context.Context, nodeInfo models.NodeInfo) error { - return n.nodeInfo.Add(ctx, nodeInfo) +func (n *NodeManager) Add(ctx context.Context, nodeInfo models.NodeState) error { + return n.store.Add(ctx, nodeInfo) } -func (n *NodeManager) addToInfo(ctx context.Context, info *models.NodeInfo) { - resources, found := n.resourceMap.Get(info.NodeID) - if found && info.ComputeNodeInfo != nil { - info.ComputeNodeInfo.AvailableCapacity = resources +func (n *NodeManager) addToInfo(ctx context.Context, state *models.NodeState) { + resources, found := n.resourceMap.Get(state.Info.NodeID) + if found && state.Info.ComputeNodeInfo != nil { + state.Info.ComputeNodeInfo.AvailableCapacity = resources } if n.heartbeats != nil { - n.heartbeats.UpdateNodeInfo(info) + n.heartbeats.UpdateNodeInfo(state) } } -func (n *NodeManager) Get(ctx context.Context, nodeID string) (models.NodeInfo, error) { - info, err := n.nodeInfo.Get(ctx, nodeID) +func (n *NodeManager) Get(ctx context.Context, nodeID string) (models.NodeState, error) { + nodeState, err := n.store.Get(ctx, nodeID) if err != nil { - return models.NodeInfo{}, err + return models.NodeState{}, err } - n.addToInfo(ctx, &info) - return info, nil + n.addToInfo(ctx, &nodeState) + return nodeState, nil } -func (n *NodeManager) GetByPrefix(ctx context.Context, prefix string) (models.NodeInfo, error) { - info, err := n.nodeInfo.GetByPrefix(ctx, prefix) +func (n *NodeManager) GetByPrefix(ctx context.Context, prefix string) (models.NodeState, error) { + state, err := n.store.GetByPrefix(ctx, prefix) if err != nil { - return models.NodeInfo{}, err + return models.NodeState{}, err } - n.addToInfo(ctx, &info) - return info, nil + n.addToInfo(ctx, &state) + return state, nil } -func (n *NodeManager) List(ctx context.Context, filters ...routing.NodeInfoFilter) ([]models.NodeInfo, error) { - items, err := n.nodeInfo.List(ctx, filters...) +func (n *NodeManager) List(ctx context.Context, filters ...routing.NodeStateFilter) ([]models.NodeState, error) { + items, err := n.store.List(ctx, filters...) if err != nil { return nil, err } @@ -204,7 +214,7 @@ func (n *NodeManager) List(ctx context.Context, filters ...routing.NodeInfoFilte } func (n *NodeManager) Delete(ctx context.Context, nodeID string) error { - return n.nodeInfo.Delete(ctx, nodeID) + return n.store.Delete(ctx, nodeID) } // ---- Implementation of node actions ---- @@ -213,20 +223,20 @@ func (n *NodeManager) Delete(ctx context.Context, nodeID string) error { // reason for the approval (for audit). 
The return values denote success and any // failure of the operation as a human readable string. func (n *NodeManager) ApproveAction(ctx context.Context, nodeID string, reason string) (bool, string) { - info, err := n.nodeInfo.GetByPrefix(ctx, nodeID) + state, err := n.store.GetByPrefix(ctx, nodeID) if err != nil { return false, err.Error() } - if info.Approval == models.NodeApprovals.APPROVED { + if state.Membership == models.NodeMembership.APPROVED { return false, "node already approved" } - info.Approval = models.NodeApprovals.APPROVED + state.Membership = models.NodeMembership.APPROVED log.Ctx(ctx).Info().Str("reason", reason).Msgf("node %s approved", nodeID) - if err := n.nodeInfo.Add(ctx, info); err != nil { - return false, "failed to save nodeinfo during node approval" + if err := n.store.Add(ctx, state); err != nil { + return false, "failed to save nodestate during node approval" } return true, "" @@ -236,20 +246,20 @@ func (n *NodeManager) ApproveAction(ctx context.Context, nodeID string, reason s // reason for the rejection (for audit). The return values denote success and any // failure of the operation as a human readable string. func (n *NodeManager) RejectAction(ctx context.Context, nodeID string, reason string) (bool, string) { - info, err := n.nodeInfo.GetByPrefix(ctx, nodeID) + state, err := n.store.GetByPrefix(ctx, nodeID) if err != nil { return false, err.Error() } - if info.Approval == models.NodeApprovals.REJECTED { + if state.Membership == models.NodeMembership.REJECTED { return false, "node already rejected" } - info.Approval = models.NodeApprovals.REJECTED + state.Membership = models.NodeMembership.REJECTED log.Ctx(ctx).Info().Str("reason", reason).Msgf("node %s rejected", nodeID) - if err := n.nodeInfo.Add(ctx, info); err != nil { - return false, "failed to save nodeinfo during node rejection" + if err := n.store.Add(ctx, state); err != nil { + return false, "failed to save nodestate during node rejection" } return true, "" @@ -259,13 +269,13 @@ func (n *NodeManager) RejectAction(ctx context.Context, nodeID string, reason st // reason for the rejection (for audit). The return values denote success and any // failure of the operation as a human readable string. func (n *NodeManager) DeleteAction(ctx context.Context, nodeID string, reason string) (bool, string) { - info, err := n.nodeInfo.GetByPrefix(ctx, nodeID) + state, err := n.store.GetByPrefix(ctx, nodeID) if err != nil { return false, err.Error() } - if err := n.nodeInfo.Delete(ctx, info.NodeID); err != nil { - return false, fmt.Sprintf("failed to delete nodeinfo: %s", err) + if err := n.store.Delete(ctx, state.Info.NodeID); err != nil { + return false, fmt.Sprintf("failed to delete nodestate: %s", err) } return true, "" diff --git a/pkg/node/node.go b/pkg/node/node.go index 50944234e2..9ac0bcd2a6 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -427,11 +427,11 @@ func NewNode( // Create a node info provider for LibP2P, and specify the default node approval state // of Approved to avoid confusion as approval state is not used for this transport type. 
- nodeInfoProvider := routing.NewNodeInfoProvider(routing.NodeInfoProviderParams{ + nodeInfoProvider := routing.NewNodeStateProvider(routing.NodeStateProviderParams{ NodeID: config.NodeID, LabelsProvider: labelsProvider, BacalhauVersion: *version.Get(), - DefaultNodeApproval: models.NodeApprovals.APPROVED, + DefaultNodeApproval: models.NodeMembership.APPROVED, }) nodeInfoProvider.RegisterNodeInfoDecorator(transportLayer.NodeInfoDecorator()) if computeNode != nil { @@ -439,14 +439,14 @@ func NewNode( } shared.NewEndpoint(shared.EndpointParams{ - Router: apiServer.Router, - NodeID: config.NodeID, - NodeInfoProvider: nodeInfoProvider, + Router: apiServer.Router, + NodeID: config.NodeID, + NodeStateProvider: nodeInfoProvider, }) agent.NewEndpoint(agent.EndpointParams{ Router: apiServer.Router, - NodeInfoProvider: nodeInfoProvider, + NodeStateProvider: nodeInfoProvider, DebugInfoProviders: debugInfoProviders, }) @@ -463,18 +463,19 @@ func NewNode( // NB(forrest): this must be done last to avoid eager publishing before nodes are constructed // TODO(forrest) [fixme] we should fix this to make it less racy in testing nodeInfoPublisher = routing.NewNodeInfoPublisher(routing.NodeInfoPublisherParams{ - PubSub: transportLayer.NodeInfoPubSub(), - NodeInfoProvider: nodeInfoProvider, - IntervalConfig: nodeInfoPublisherInterval, + PubSub: transportLayer.NodeInfoPubSub(), + NodeStateProvider: nodeInfoProvider, + IntervalConfig: nodeInfoPublisherInterval, }) } else { // We want to register the current requester node to the node store if config.IsRequesterNode { - nodeInfo := nodeInfoProvider.GetNodeInfo(ctx) - nodeInfo.Approval = models.NodeApprovals.APPROVED - err := tracingInfoStore.Add(ctx, nodeInfo) - if err != nil { + nodeState := nodeInfoProvider.GetNodeState(ctx) + // TODO what is the liveness here? We are adding ourselves so I assume connected? + nodeState.Membership = models.NodeMembership.APPROVED + if err := tracingInfoStore.Add(ctx, nodeState); err != nil { log.Ctx(ctx).Error().Err(err).Msg("failed to add requester node to the node store") + return nil, fmt.Errorf("registering node to the node store: %w", err) } } } diff --git a/pkg/node/requester.go b/pkg/node/requester.go index 8c7319fe3d..cf1a88b655 100644 --- a/pkg/node/requester.go +++ b/pkg/node/requester.go @@ -3,6 +3,8 @@ package node import ( "context" + "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/authn" "github.com/bacalhau-project/bacalhau/pkg/job" "github.com/bacalhau-project/bacalhau/pkg/lib/backoff" @@ -23,7 +25,6 @@ import ( s3helper "github.com/bacalhau-project/bacalhau/pkg/s3" "github.com/bacalhau-project/bacalhau/pkg/translation" "github.com/bacalhau-project/bacalhau/pkg/util" - "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/compute" "github.com/bacalhau-project/bacalhau/pkg/eventhandler" @@ -72,14 +73,11 @@ func NewRequesterNode( jobStore := requesterConfig.JobStore + // TODO(forrest) [simplify]: given the current state of the code this interface obfuscates what is happening here, + // there isn't any "node discovery" happening here, we are simply listing a node store. + // The todo here is to simply pass a node store where it's needed instead of this chain wrapping a discoverer wrapping + // a store... // compute node discoverer - nodeDiscoveryChain := discovery.NewChain(true) - nodeDiscoveryChain.Add( - discovery.NewStoreNodeDiscoverer(discovery.StoreNodeDiscovererParams{ - Store: nodeInfoStore, - }), - ) - log.Ctx(ctx). Info(). 
Msgf("Nodes joining the cluster will be assigned approval state: %s", requesterConfig.DefaultApprovalState.String()) @@ -103,7 +101,7 @@ func NewRequesterNode( // node selector nodeSelector := selector.NewNodeSelector(selector.NodeSelectorParams{ - NodeDiscoverer: nodeDiscoveryChain, + NodeDiscoverer: nodeInfoStore, NodeRanker: nodeRankerChain, }) @@ -260,7 +258,7 @@ func NewRequesterNode( // register debug info providers for the /debug endpoint debugInfoProviders := []model.DebugInfoProvider{ - discovery.NewDebugInfoProvider(nodeDiscoveryChain), + discovery.NewDebugInfoProvider(nodeInfoStore), } // register requester public http apis @@ -269,7 +267,7 @@ func NewRequesterNode( Requester: endpoint, DebugInfoProviders: debugInfoProviders, JobStore: jobStore, - NodeDiscoverer: nodeDiscoveryChain, + NodeDiscoverer: nodeInfoStore, }) orchestrator_endpoint.NewEndpoint(orchestrator_endpoint.EndpointParams{ @@ -329,7 +327,7 @@ func NewRequesterNode( Endpoint: endpoint, localCallback: endpoint, EndpointV2: endpointV2, - NodeDiscoverer: nodeDiscoveryChain, + NodeDiscoverer: nodeInfoStore, NodeInfoStore: nodeInfoStore, JobStore: jobStore, nodeManager: nodeManager, diff --git a/pkg/orchestrator/interfaces.go b/pkg/orchestrator/interfaces.go index 3e57367b83..e137456b71 100644 --- a/pkg/orchestrator/interfaces.go +++ b/pkg/orchestrator/interfaces.go @@ -6,6 +6,7 @@ import ( "time" "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/routing" ) // EvaluationBroker is used to manage brokering of evaluations. When an evaluation is @@ -89,8 +90,9 @@ type Planner interface { } // NodeDiscoverer discovers nodes in the network that are suitable to execute a job. +// NodeDiscoverer is a subset of the routing.NodeInfoStore interface. type NodeDiscoverer interface { - ListNodes(ctx context.Context) ([]models.NodeInfo, error) + List(ctx context.Context, filter ...routing.NodeStateFilter) ([]models.NodeState, error) } // NodeRanker ranks nodes based on their suitability to execute a job. diff --git a/pkg/orchestrator/scheduler/batch_job_test.go b/pkg/orchestrator/scheduler/batch_job_test.go index 598e98521e..bc34c88148 100644 --- a/pkg/orchestrator/scheduler/batch_job_test.go +++ b/pkg/orchestrator/scheduler/batch_job_test.go @@ -6,14 +6,15 @@ import ( "context" "testing" + "github.com/google/uuid" + "github.com/stretchr/testify/suite" + "go.uber.org/mock/gomock" + "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/orchestrator/retry" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/google/uuid" - "github.com/stretchr/testify/suite" - "go.uber.org/mock/gomock" ) const ( @@ -69,11 +70,11 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_ShouldCreateEnoughExecutions() // we need 3 executions. 
discover enough nodes nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), - *mockNodeInfo(s.T(), nodeIDs[2]), - *mockNodeInfo(s.T(), nodeIDs[3]), - *mockNodeInfo(s.T(), nodeIDs[4]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[2]), + *fakeNodeInfo(s.T(), nodeIDs[3]), + *fakeNodeInfo(s.T(), nodeIDs[4]), } s.mockNodeSelection(job, nodeInfos, job.Count) @@ -97,8 +98,8 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_AlreadyEnoughExecutions() { // mock active executions' nodes to be healthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execAskForBid].NodeID), - *mockNodeInfo(s.T(), executions[execBidAccepted].NodeID), + *fakeNodeInfo(s.T(), executions[execAskForBid].NodeID), + *fakeNodeInfo(s.T(), executions[execBidAccepted].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) @@ -125,9 +126,9 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_RejectExtraExecutions() { // mock active executions' nodes to be healthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[0].NodeID), - *mockNodeInfo(s.T(), executions[1].NodeID), - *mockNodeInfo(s.T(), executions[2].NodeID), + *fakeNodeInfo(s.T(), executions[0].NodeID), + *fakeNodeInfo(s.T(), executions[1].NodeID), + *fakeNodeInfo(s.T(), executions[2].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) @@ -151,8 +152,8 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_TooManyExecutions() { // mock active executions' nodes to be healthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execAskForBid].NodeID), - *mockNodeInfo(s.T(), executions[execBidAccepted].NodeID), + *fakeNodeInfo(s.T(), executions[execAskForBid].NodeID), + *fakeNodeInfo(s.T(), executions[execBidAccepted].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ @@ -172,8 +173,8 @@ func (s *BatchJobSchedulerTestSuite) TestProcessFail_NotEnoughExecutions() { // we need 3 executions. 
discover fewer nodes nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), } s.mockNodeSelection(job, nodeInfos, job.Count) @@ -221,8 +222,8 @@ func (s *BatchJobSchedulerTestSuite) TestFailUnhealthyExecs_ShouldMarkExecutions // mock node discoverer to exclude the node in BidAccepted state nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execAskForBid].NodeID), - *mockNodeInfo(s.T(), executions[execCanceled].NodeID), + *fakeNodeInfo(s.T(), executions[execAskForBid].NodeID), + *fakeNodeInfo(s.T(), executions[execCanceled].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) s.mockNodeSelection(job, nodeInfos, 1) @@ -285,8 +286,8 @@ func (s *BatchJobSchedulerTestSuite) TestProcess_ShouldMarkJobAsFailed_NoRetry() // mark askForBid exec as lost so we attempt to retry nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execBidAccepted].NodeID), - *mockNodeInfo(s.T(), executions[execCompleted].NodeID), + *fakeNodeInfo(s.T(), executions[execBidAccepted].NodeID), + *fakeNodeInfo(s.T(), executions[execCompleted].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) diff --git a/pkg/orchestrator/scheduler/daemon_job_test.go b/pkg/orchestrator/scheduler/daemon_job_test.go index 69cb8033f0..5f74a87077 100644 --- a/pkg/orchestrator/scheduler/daemon_job_test.go +++ b/pkg/orchestrator/scheduler/daemon_job_test.go @@ -6,13 +6,14 @@ import ( "context" "testing" + "github.com/google/uuid" + "github.com/stretchr/testify/suite" + "go.uber.org/mock/gomock" + "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/google/uuid" - "github.com/stretchr/testify/suite" - "go.uber.org/mock/gomock" ) type DaemonJobSchedulerTestSuite struct { @@ -48,9 +49,9 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { s.jobStore.EXPECT().GetExecutions(gomock.Any(), jobstore.GetExecutionsOptions{JobID: job.ID}).Return(executions, nil) nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), - *mockNodeInfo(s.T(), nodeIDs[2]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[2]), } s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), gomock.Any(), gomock.Any()).Return(nodeInfos, nil) @@ -97,7 +98,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhe // mock node discoverer to exclude the first node nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[1].NodeID), + *fakeNodeInfo(s.T(), executions[1].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job, gomock.Any()).Return(nodeInfos, nil) @@ -126,7 +127,7 @@ func (s *DaemonJobSchedulerTestSuite) TestProcess_ShouldNOTMarkJobAsFailed() { // mock node discoverer to exclude the first node nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[1].NodeID), + *fakeNodeInfo(s.T(), executions[1].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) s.nodeSelector.EXPECT().AllMatchingNodes(gomock.Any(), job, gomock.Any()).Return(nodeInfos, nil) diff --git a/pkg/orchestrator/scheduler/ops_job_test.go 
b/pkg/orchestrator/scheduler/ops_job_test.go index a32f7d6cdf..c9813071c3 100644 --- a/pkg/orchestrator/scheduler/ops_job_test.go +++ b/pkg/orchestrator/scheduler/ops_job_test.go @@ -6,13 +6,14 @@ import ( "context" "testing" + "github.com/google/uuid" + "github.com/stretchr/testify/suite" + "go.uber.org/mock/gomock" + "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/google/uuid" - "github.com/stretchr/testify/suite" - "go.uber.org/mock/gomock" ) type OpsJobSchedulerTestSuite struct { @@ -48,9 +49,9 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldCreateNewExecutions() { s.jobStore.EXPECT().GetExecutions(gomock.Any(), jobstore.GetExecutionsOptions{JobID: job.ID}).Return(executions, nil) nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), - *mockNodeInfo(s.T(), nodeIDs[2]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[2]), } s.mockNodeSelection(job, nodeInfos) @@ -93,7 +94,7 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldMarkLostExecutionsOnUnhealt // mock node discoverer to exclude the first node nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[1].NodeID), + *fakeNodeInfo(s.T(), executions[1].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) @@ -118,7 +119,7 @@ func (s *OpsJobSchedulerTestSuite) TestProcess_ShouldMarkJobAsFailed() { // mock node discoverer to exclude the first node nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[1].NodeID), + *fakeNodeInfo(s.T(), executions[1].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) diff --git a/pkg/orchestrator/scheduler/service_job_test.go b/pkg/orchestrator/scheduler/service_job_test.go index 48c9fd6fc0..9fcac97b15 100644 --- a/pkg/orchestrator/scheduler/service_job_test.go +++ b/pkg/orchestrator/scheduler/service_job_test.go @@ -6,14 +6,15 @@ import ( "context" "testing" + "github.com/google/uuid" + "github.com/stretchr/testify/suite" + "go.uber.org/mock/gomock" + "github.com/bacalhau-project/bacalhau/pkg/jobstore" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/orchestrator" "github.com/bacalhau-project/bacalhau/pkg/orchestrator/retry" "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/google/uuid" - "github.com/stretchr/testify/suite" - "go.uber.org/mock/gomock" ) const ( @@ -61,11 +62,11 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_ShouldCreateEnoughExecutions( // we need 3 executions. 
discover enough nodes nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), - *mockNodeInfo(s.T(), nodeIDs[2]), - *mockNodeInfo(s.T(), nodeIDs[3]), - *mockNodeInfo(s.T(), nodeIDs[4]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[2]), + *fakeNodeInfo(s.T(), nodeIDs[3]), + *fakeNodeInfo(s.T(), nodeIDs[4]), } s.mockNodeSelection(job, nodeInfos, job.Count) @@ -89,9 +90,9 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_AlreadyEnoughExecutions() { // mock active executions' nodes to be healthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execServiceAskForBid].NodeID), - *mockNodeInfo(s.T(), executions[execServiceBidAccepted1].NodeID), - *mockNodeInfo(s.T(), executions[execServiceBidAccepted2].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceAskForBid].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceBidAccepted1].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceBidAccepted2].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) @@ -118,9 +119,9 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_RejectExtraExecutions() { // mock active executions' nodes to be healthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[0].NodeID), - *mockNodeInfo(s.T(), executions[1].NodeID), - *mockNodeInfo(s.T(), executions[2].NodeID), + *fakeNodeInfo(s.T(), executions[0].NodeID), + *fakeNodeInfo(s.T(), executions[1].NodeID), + *fakeNodeInfo(s.T(), executions[2].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) @@ -145,9 +146,9 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_TooManyExecutions() { // mock active executions' nodes to be healthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execServiceAskForBid].NodeID), - *mockNodeInfo(s.T(), executions[execServiceBidAccepted1].NodeID), - *mockNodeInfo(s.T(), executions[execServiceBidAccepted2].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceAskForBid].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceBidAccepted1].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceBidAccepted2].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) matcher := NewPlanMatcher(s.T(), PlanMatcherParams{ @@ -167,8 +168,8 @@ func (s *ServiceJobSchedulerTestSuite) TestProcessFail_NotEnoughExecutions() { // we need 3 executions. 
discover fewer nodes nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), } s.mockNodeSelection(job, nodeInfos, job.Count) @@ -217,8 +218,8 @@ func (s *ServiceJobSchedulerTestSuite) TestFailUnhealthyExecs_ShouldMarkExecutio // mock node discoverer to exclude the node in BidAccepted state nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execServiceAskForBid].NodeID), - *mockNodeInfo(s.T(), executions[execServiceCanceled].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceAskForBid].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceCanceled].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) s.mockNodeSelection(job, nodeInfos, 2) @@ -250,11 +251,11 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_TreatCompletedExecutionsAsFai // discover all nodes to avoid treating active executions as unhealthy nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), nodeIDs[0]), - *mockNodeInfo(s.T(), nodeIDs[1]), - *mockNodeInfo(s.T(), nodeIDs[2]), - *mockNodeInfo(s.T(), nodeIDs[3]), - *mockNodeInfo(s.T(), nodeIDs[4]), + *fakeNodeInfo(s.T(), nodeIDs[0]), + *fakeNodeInfo(s.T(), nodeIDs[1]), + *fakeNodeInfo(s.T(), nodeIDs[2]), + *fakeNodeInfo(s.T(), nodeIDs[3]), + *fakeNodeInfo(s.T(), nodeIDs[4]), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) s.mockNodeSelection(job, nodeInfos, 2) @@ -302,8 +303,8 @@ func (s *ServiceJobSchedulerTestSuite) TestProcess_ShouldMarkJobAsFailed_NoRetry // mark askForBid exec as lost so we attempt to retry nodeInfos := []models.NodeInfo{ - *mockNodeInfo(s.T(), executions[execServiceBidAccepted1].NodeID), - *mockNodeInfo(s.T(), executions[execServiceBidAccepted2].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceBidAccepted1].NodeID), + *fakeNodeInfo(s.T(), executions[execServiceBidAccepted2].NodeID), } s.nodeSelector.EXPECT().AllNodes(gomock.Any()).Return(nodeInfos, nil) diff --git a/pkg/orchestrator/scheduler/utils_test.go b/pkg/orchestrator/scheduler/utils_test.go index 2845a20dbc..598abdd308 100644 --- a/pkg/orchestrator/scheduler/utils_test.go +++ b/pkg/orchestrator/scheduler/utils_test.go @@ -120,10 +120,8 @@ func (m PlanMatcher) String() string { m.JobState, m.Evaluation, m.NewExecutionsNodes, m.StoppedExecutions, m.ApprovedExecutions) } -func mockNodeInfo(t *testing.T, nodeID string) *models.NodeInfo { +func fakeNodeInfo(t *testing.T, nodeID string) *models.NodeInfo { return &models.NodeInfo{ - NodeID: nodeID, - Approval: models.NodeApprovals.APPROVED, - State: models.NodeStates.CONNECTED, + NodeID: nodeID, } } diff --git a/pkg/orchestrator/selection/discovery/chained.go b/pkg/orchestrator/selection/discovery/chained.go deleted file mode 100644 index 51f96bd1d8..0000000000 --- a/pkg/orchestrator/selection/discovery/chained.go +++ /dev/null @@ -1,63 +0,0 @@ -package discovery - -import ( - "context" - "errors" - - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - pkgerrors "github.com/pkg/errors" - "github.com/rs/zerolog/log" - "golang.org/x/exp/maps" -) - -type Chain struct { - discoverers []orchestrator.NodeDiscoverer - ignoreErrors bool -} - -func NewChain(ignoreErrors bool) *Chain { - return &Chain{ - ignoreErrors: ignoreErrors, - } -} - -func (c *Chain) Add(discoverer ...orchestrator.NodeDiscoverer) { - c.discoverers = append(c.discoverers, discoverer...) 
-} - -func (c *Chain) ListNodes(ctx context.Context) ([]models.NodeInfo, error) { - return c.chainDiscovery(ctx, "ListNodes", func(r orchestrator.NodeDiscoverer) ([]models.NodeInfo, error) { - return r.ListNodes(ctx) - }) -} - -func (c *Chain) chainDiscovery( - ctx context.Context, - caller string, - getNodes func(orchestrator.NodeDiscoverer) ([]models.NodeInfo, error), -) ([]models.NodeInfo, error) { - var err error - uniqueNodes := make(map[string]models.NodeInfo, 0) - for _, discoverer := range c.discoverers { - nodeInfos, discoverErr := getNodes(discoverer) - err = errors.Join(err, pkgerrors.Wrapf(discoverErr, "error finding nodes from %T", discoverer)) - currentNodesCount := len(uniqueNodes) - for _, nodeInfo := range nodeInfos { - if _, ok := uniqueNodes[nodeInfo.ID()]; !ok { - uniqueNodes[nodeInfo.ID()] = nodeInfo - } - } - log.Ctx(ctx).Debug().Msgf("[%s] found %d more nodes by %T", caller, len(uniqueNodes)-currentNodesCount, discoverer) - } - - if err != nil && c.ignoreErrors { - log.Ctx(ctx).Warn().Err(err).Msg("ignoring error finding nodes") - err = nil - } - - return maps.Values(uniqueNodes), err -} - -// compile-time interface assertions -var _ orchestrator.NodeDiscoverer = (*Chain)(nil) diff --git a/pkg/orchestrator/selection/discovery/chained_test.go b/pkg/orchestrator/selection/discovery/chained_test.go deleted file mode 100644 index b542a5eb77..0000000000 --- a/pkg/orchestrator/selection/discovery/chained_test.go +++ /dev/null @@ -1,83 +0,0 @@ -//go:build unit || !integration - -package discovery - -import ( - "context" - "errors" - "testing" - - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/stretchr/testify/suite" -) - -type ChainedSuite struct { - suite.Suite - chain *Chain - peerID1 models.NodeInfo - peerID2 models.NodeInfo - peerID3 models.NodeInfo -} - -func (s *ChainedSuite) SetupSuite() { - s.peerID1 = models.NodeInfo{NodeID: "peerID1"} - s.peerID2 = models.NodeInfo{NodeID: "peerID2"} - s.peerID3 = models.NodeInfo{NodeID: "peerID3"} -} - -func (s *ChainedSuite) SetupTest() { - s.chain = NewChain(false) // don't ignore errors -} - -func TestChainedSuite(t *testing.T) { - suite.Run(t, new(ChainedSuite)) -} - -func (s *ChainedSuite) TestListNodes() { - s.chain.Add(NewFixedDiscoverer(s.peerID1)) - s.chain.Add(NewFixedDiscoverer(s.peerID2)) - s.chain.Add(NewFixedDiscoverer(s.peerID3)) - - peerIDs, err := s.chain.ListNodes(context.Background()) - s.NoError(err) - s.ElementsMatch([]models.NodeInfo{s.peerID1, s.peerID2, s.peerID3}, peerIDs) -} - -func (s *ChainedSuite) TestListNodes_Overlap() { - s.chain.Add(NewFixedDiscoverer(s.peerID1, s.peerID2)) - s.chain.Add(NewFixedDiscoverer(s.peerID2, s.peerID3)) - - peerIDs, err := s.chain.ListNodes(context.Background()) - s.NoError(err) - s.ElementsMatch([]models.NodeInfo{s.peerID1, s.peerID2, s.peerID3}, peerIDs) -} - -func (s *ChainedSuite) TestHandle_Error() { - s.chain.Add(NewFixedDiscoverer(s.peerID1, s.peerID2)) - s.chain.Add(newBadDiscoverer()) - s.chain.Add(NewFixedDiscoverer(s.peerID3)) - _, err := s.chain.ListNodes(context.Background()) - s.Error(err) -} - -func (s *ChainedSuite) TestHandle_IgnoreError() { - s.chain.ignoreErrors = true - s.chain.Add(NewFixedDiscoverer(s.peerID1, s.peerID2)) - s.chain.Add(newBadDiscoverer()) - s.chain.Add(NewFixedDiscoverer(s.peerID3)) - - peerIDs, err := s.chain.ListNodes(context.Background()) - s.NoError(err) - s.ElementsMatch([]models.NodeInfo{s.peerID1, s.peerID2, s.peerID3}, peerIDs) -} - -// node discoverer that always returns an error -type badDiscoverer struct{} - 
-func newBadDiscoverer() *badDiscoverer { - return &badDiscoverer{} -} - -func (b *badDiscoverer) ListNodes(context.Context) ([]models.NodeInfo, error) { - return nil, errors.New("bad discoverer") -} diff --git a/pkg/orchestrator/selection/discovery/fixed.go b/pkg/orchestrator/selection/discovery/fixed.go deleted file mode 100644 index 656e8dac58..0000000000 --- a/pkg/orchestrator/selection/discovery/fixed.go +++ /dev/null @@ -1,26 +0,0 @@ -package discovery - -import ( - "context" - - "github.com/bacalhau-project/bacalhau/pkg/models" -) - -// node discoverer that always returns the same set of nodes -type fixedDiscoverer struct { - peerIDs []models.NodeInfo -} - -func NewFixedDiscoverer(peerIDs ...models.NodeInfo) *fixedDiscoverer { - return &fixedDiscoverer{ - peerIDs: peerIDs, - } -} - -func (f *fixedDiscoverer) FindNodes(context.Context, models.Job) ([]models.NodeInfo, error) { - return f.peerIDs, nil -} - -func (f *fixedDiscoverer) ListNodes(context.Context) ([]models.NodeInfo, error) { - return f.peerIDs, nil -} diff --git a/pkg/orchestrator/selection/discovery/info_provider.go b/pkg/orchestrator/selection/discovery/info_provider.go index 605ec4d7fb..9e0151989a 100644 --- a/pkg/orchestrator/selection/discovery/info_provider.go +++ b/pkg/orchestrator/selection/discovery/info_provider.go @@ -17,7 +17,7 @@ func NewDebugInfoProvider(discoverer orchestrator.NodeDiscoverer) model.DebugInf // GetDebugInfo implements models.DebugInfoProvider func (p *discoveredNodesProvider) GetDebugInfo(ctx context.Context) (info model.DebugInfo, err error) { - nodes, err := p.discoverer.ListNodes(ctx) + nodes, err := p.discoverer.List(ctx) info.Component = "DiscoveredNodes" info.Info = nodes return info, err diff --git a/pkg/orchestrator/selection/discovery/store.go b/pkg/orchestrator/selection/discovery/store.go deleted file mode 100644 index 431a5f9868..0000000000 --- a/pkg/orchestrator/selection/discovery/store.go +++ /dev/null @@ -1,31 +0,0 @@ -package discovery - -import ( - "context" - - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/orchestrator" - "github.com/bacalhau-project/bacalhau/pkg/routing" -) - -type StoreNodeDiscovererParams struct { - Store routing.NodeInfoStore -} - -type StoreNodeDiscoverer struct { - store routing.NodeInfoStore -} - -func NewStoreNodeDiscoverer(params StoreNodeDiscovererParams) *StoreNodeDiscoverer { - return &StoreNodeDiscoverer{ - store: params.Store, - } -} - -// ListNodes implements orchestrator.NodeDiscoverer -func (d *StoreNodeDiscoverer) ListNodes(ctx context.Context) ([]models.NodeInfo, error) { - return d.store.List(ctx) -} - -// compile time check that StoreNodeDiscoverer implements NodeDiscoverer -var _ orchestrator.NodeDiscoverer = (*StoreNodeDiscoverer)(nil) diff --git a/pkg/orchestrator/selection/discovery/store_test.go b/pkg/orchestrator/selection/discovery/store_test.go deleted file mode 100644 index be35d0d05a..0000000000 --- a/pkg/orchestrator/selection/discovery/store_test.go +++ /dev/null @@ -1,70 +0,0 @@ -//go:build unit || !integration - -package discovery - -import ( - "context" - "math" - "testing" - - "github.com/bacalhau-project/bacalhau/pkg/models" - "github.com/bacalhau-project/bacalhau/pkg/routing/inmemory" - "github.com/bacalhau-project/bacalhau/pkg/test/mock" - "github.com/stretchr/testify/suite" -) - -type StoreNodeDiscovererSuite struct { - suite.Suite - discoverer *StoreNodeDiscoverer - store *inmemory.NodeStore -} - -func (s *StoreNodeDiscovererSuite) SetupTest() { - s.store = 
inmemory.NewNodeStore(inmemory.NodeStoreParams{ - TTL: math.MaxInt64, - }) - s.discoverer = NewStoreNodeDiscoverer(StoreNodeDiscovererParams{ - Store: s.store, - }) -} - -func TestStoreNodeDiscovererSuite(t *testing.T) { - suite.Run(t, new(StoreNodeDiscovererSuite)) -} - -func (s *StoreNodeDiscovererSuite) TestListNodes() { - ctx := context.Background() - nodeInfo1 := generateNodeInfo("node1", models.EngineDocker) - s.NoError(s.store.Add(ctx, nodeInfo1)) - - // both nodes are returned when asked for docker nodes - job := mock.Job() - job.Task().Engine.Type = models.EngineDocker - - peerIDs, err := s.discoverer.ListNodes(context.Background()) - s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo1}, peerIDs) - - nodeInfo2 := generateNodeInfo("node2", models.EngineDocker, models.EngineWasm) - s.NoError(s.store.Add(ctx, nodeInfo2)) - // only node2 is returned when asked for noop nodes - peerIDs, err = s.discoverer.ListNodes(context.Background()) - s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo1, nodeInfo2}, peerIDs) -} - -func (s *StoreNodeDiscovererSuite) TestListNodes_Empty() { - peerIDs, err := s.discoverer.ListNodes(context.Background()) - s.NoError(err) - s.Empty(peerIDs) -} - -func generateNodeInfo(id string, engines ...string) models.NodeInfo { - return models.NodeInfo{ - NodeID: id, - NodeType: models.NodeTypeCompute, - ComputeNodeInfo: &models.ComputeNodeInfo{ - ExecutionEngines: engines, - }, - } -} diff --git a/pkg/orchestrator/selection/selector/node_selector.go b/pkg/orchestrator/selection/selector/node_selector.go index 80cd1a2703..3dd3d4cf27 100644 --- a/pkg/orchestrator/selection/selector/node_selector.go +++ b/pkg/orchestrator/selection/selector/node_selector.go @@ -3,6 +3,7 @@ package selector import ( "context" "errors" + "fmt" "sort" "github.com/rs/zerolog/log" @@ -32,7 +33,16 @@ func NewNodeSelector(params NodeSelectorParams) *NodeSelector { } func (n NodeSelector) AllNodes(ctx context.Context) ([]models.NodeInfo, error) { - return n.nodeDiscoverer.ListNodes(ctx) + nodeStates, err := n.nodeDiscoverer.List(ctx) + if err != nil { + return nil, fmt.Errorf("failed to list discovered nodes: %w", err) + } + // extract slice of models.NodeInfo from slice of models.NodeState + nodeInfos := make([]models.NodeInfo, 0, len(nodeStates)) + for _, ns := range nodeStates { + nodeInfos = append(nodeInfos, ns.Info) + } + return nodeInfos, nil } func (n NodeSelector) AllMatchingNodes(ctx context.Context, @@ -73,32 +83,42 @@ func (n NodeSelector) TopMatchingNodes(ctx context.Context, func (n NodeSelector) rankAndFilterNodes(ctx context.Context, job *models.Job, constraints *orchestrator.NodeSelectionConstraints) (selected, rejected []orchestrator.NodeRank, err error) { - listed, err := n.nodeDiscoverer.ListNodes(ctx) + listed, err := n.nodeDiscoverer.List(ctx) if err != nil { return nil, nil, err } - nodeIDs := lo.Filter(listed, func(nodeInfo models.NodeInfo, index int) bool { - if nodeInfo.NodeType != models.NodeTypeCompute { + // filter node states to return a slice of nodes that are: + // - compute nodes + // - approved to execute jobs + // - connected (alive) + nodeStates := lo.Filter(listed, func(nodeState models.NodeState, index int) bool { + if nodeState.Info.NodeType != models.NodeTypeCompute { return false } - if constraints.RequireApproval && nodeInfo.Approval != models.NodeApprovals.APPROVED { + if constraints.RequireApproval && nodeState.Membership != models.NodeMembership.APPROVED { return false } - if constraints.RequireConnected && nodeInfo.State != models.NodeStates.CONNECTED { + if constraints.RequireConnected && nodeState.Connection != models.NodeStates.CONNECTED { return false } return true }) - if len(nodeIDs) == 0 { + if len(nodeStates) == 0 { return nil, nil, errors.New("unable to find any connected and approved nodes") } - rankedNodes, err := n.nodeRanker.RankNodes(ctx, *job, nodeIDs) + // extract the nodeInfo from the slice of node states for ranking + nodeInfos := make([]models.NodeInfo, 0, len(nodeStates)) + for _, ns := range nodeStates { + nodeInfos = append(nodeInfos, ns.Info) + } + + rankedNodes, err := n.nodeRanker.RankNodes(ctx, *job, nodeInfos) if err != nil { return nil, nil, err } diff --git a/pkg/publicapi/apimodels/agent.go b/pkg/publicapi/apimodels/agent.go index cf76b0a84c..4705a1b656 100644 --- a/pkg/publicapi/apimodels/agent.go +++ b/pkg/publicapi/apimodels/agent.go @@ -28,5 +28,5 @@ type GetAgentNodeRequest struct { type GetAgentNodeResponse struct { BaseGetResponse - *models.NodeInfo + *models.NodeState } diff --git a/pkg/publicapi/apimodels/node.go b/pkg/publicapi/apimodels/node.go index 39933fdd28..76f5b77aae 100644 --- a/pkg/publicapi/apimodels/node.go +++ b/pkg/publicapi/apimodels/node.go @@ -1,8 +1,9 @@ package apimodels import ( - "github.com/bacalhau-project/bacalhau/pkg/models" "k8s.io/apimachinery/pkg/labels" + + "github.com/bacalhau-project/bacalhau/pkg/models" ) type GetNodeRequest struct { @@ -12,7 +13,7 @@ type GetNodeRequest struct { type GetNodeResponse struct { BaseGetResponse - Node *models.NodeInfo + Node *models.NodeState } type ListNodesRequest struct { @@ -43,7 +44,7 @@ func (o *ListNodesRequest) ToHTTPRequest() *HTTPRequest { type ListNodesResponse struct { BaseListResponse - Nodes []*models.NodeInfo + Nodes []*models.NodeState } type PutNodeRequest struct { diff --git a/pkg/publicapi/endpoint/agent/endpoint.go b/pkg/publicapi/endpoint/agent/endpoint.go index ad0e5a82ea..4bfc56840c 100644 --- a/pkg/publicapi/endpoint/agent/endpoint.go +++ b/pkg/publicapi/endpoint/agent/endpoint.go @@ -3,31 +3,32 @@ package agent import ( "net/http" + "github.com/labstack/echo/v4" + "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/middleware" "github.com/bacalhau-project/bacalhau/pkg/version" - "github.com/labstack/echo/v4" - "github.com/rs/zerolog/log" ) type EndpointParams struct { Router *echo.Echo - NodeInfoProvider models.NodeInfoProvider + NodeStateProvider models.NodeStateProvider DebugInfoProviders []model.DebugInfoProvider } type Endpoint struct { router *echo.Echo - nodeInfoProvider models.NodeInfoProvider + nodeStateProvider models.NodeStateProvider debugInfoProviders []model.DebugInfoProvider } func NewEndpoint(params EndpointParams) *Endpoint { e := &Endpoint{ router: params.Router, - nodeInfoProvider: params.NodeInfoProvider, + nodeStateProvider: params.NodeStateProvider, debugInfoProviders: params.DebugInfoProviders, } @@ -80,9 +81,9 @@ func (e *Endpoint) version(c echo.Context) error { // @Failure 500 {object} string // @Router /api/v1/agent/node [get] func (e *Endpoint) node(c echo.Context) error { - nodeInfo := e.nodeInfoProvider.GetNodeInfo(c.Request().Context()) + nodeState := e.nodeStateProvider.GetNodeState(c.Request().Context()) return c.JSON(http.StatusOK, apimodels.GetAgentNodeResponse{ - NodeInfo: &nodeInfo, + NodeState: &nodeState, }) }
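For context on the filter-based listing used above: a routing.NodeStateFilter is just a predicate over models.NodeState, so the compute/approved/connected criteria applied in rankAndFilterNodes can be written once and handed directly to the store's List. A minimal sketch follows; the helper name liveComputeNodes and the use of routing.NodeInfoStore as the store type are assumptions for illustration, not part of this change:

	// liveComputeNodes keeps only compute nodes that are approved to execute
	// jobs and currently connected, by passing a NodeStateFilter to List.
	func liveComputeNodes(ctx context.Context, store routing.NodeInfoStore) ([]models.NodeState, error) {
		return store.List(ctx, func(state models.NodeState) bool {
			return state.Info.NodeType == models.NodeTypeCompute &&
				state.Membership == models.NodeMembership.APPROVED &&
				state.Connection == models.NodeStates.CONNECTED
		})
	}

diff --git 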
a/pkg/publicapi/endpoint/orchestrator/node.go b/pkg/publicapi/endpoint/orchestrator/node.go index 7533a35768..4704e15afd 100644 --- a/pkg/publicapi/endpoint/orchestrator/node.go +++ b/pkg/publicapi/endpoint/orchestrator/node.go @@ -19,12 +19,12 @@ func (e *Endpoint) getNode(c echo.Context) error { if c.Param("id") == "" { return echo.NewHTTPError(http.StatusBadRequest, "missing node id") } - job, err := e.nodeManager.GetByPrefix(ctx, c.Param("id")) + nodeState, err := e.nodeManager.GetByPrefix(ctx, c.Param("id")) if err != nil { return err } return c.JSON(http.StatusOK, apimodels.GetNodeResponse{ - Node: &job, + Node: &nodeState, }) } @@ -44,9 +44,9 @@ func (e *Endpoint) listNodes(c echo.Context) error { return err } - capacity := func(node *models.NodeInfo) *models.Resources { - if node.ComputeNodeInfo != nil { - return &node.ComputeNodeInfo.AvailableCapacity + capacity := func(node *models.NodeState) *models.Resources { + if node.Info.ComputeNodeInfo != nil { + return &node.Info.ComputeNodeInfo.AvailableCapacity } return &models.Resources{} } @@ -59,7 +59,7 @@ func (e *Endpoint) listNodes(c echo.Context) error { if args.Reverse { baseSortFnc := sortFnc - sortFnc = func(a, b *models.NodeInfo) int { + sortFnc = func(a, b *models.NodeState) int { x := baseSortFnc(a, b) if x == -1 { return 1 @@ -81,17 +81,17 @@ func (e *Endpoint) listNodes(c echo.Context) error { args.FilterByStatus = strings.ToUpper(args.FilterByStatus) // filter nodes, first by status, then by label selectors - res := make([]*models.NodeInfo, 0) + res := make([]*models.NodeState, 0) for i, node := range allNodes { - if args.FilterByApproval != "" && args.FilterByApproval != node.Approval.String() { + if args.FilterByApproval != "" && args.FilterByApproval != node.Membership.String() { continue } - if args.FilterByStatus != "" && args.FilterByStatus != node.State.String() { + if args.FilterByStatus != "" && args.FilterByStatus != node.Connection.String() { continue } - if selector.Matches(labels.Set(node.Labels)) { + if selector.Matches(labels.Set(node.Info.Labels)) { res = append(res, &allNodes[i]) } } @@ -110,34 +110,36 @@ func (e *Endpoint) listNodes(c echo.Context) error { }) } -type resourceFunc func(node *models.NodeInfo) *models.Resources -type sortFunc func(a, b *models.NodeInfo) int +type resourceFunc func(node *models.NodeState) *models.Resources +type sortFunc func(a, b *models.NodeState) int func (e *Endpoint) getSortFunction(orderBy string, capacity resourceFunc) sortFunc { switch orderBy { case "id", "": - return func(a, b *models.NodeInfo) int { return util.Compare[string]{}.Cmp(a.ID(), b.ID()) } + return func(a, b *models.NodeState) int { return util.Compare[string]{}.Cmp(a.Info.ID(), b.Info.ID()) } case "type": - return func(a, b *models.NodeInfo) int { return util.Compare[models.NodeType]{}.Cmp(a.NodeType, b.NodeType) } + return func(a, b *models.NodeState) int { + return util.Compare[models.NodeType]{}.Cmp(a.Info.NodeType, b.Info.NodeType) + } case "available_cpu": - return func(a, b *models.NodeInfo) int { + return func(a, b *models.NodeState) int { return util.Compare[float64]{}.CmpRev(capacity(a).CPU, capacity(b).CPU) } case "available_memory": - return func(a, b *models.NodeInfo) int { + return func(a, b *models.NodeState) int { return util.Compare[uint64]{}.CmpRev(capacity(a).Memory, capacity(b).Memory) } case "available_disk": - return func(a, b *models.NodeInfo) int { + return func(a, b *models.NodeState) int { return util.Compare[uint64]{}.CmpRev(capacity(a).Disk, capacity(b).Disk) } case 
"available_gpu": - return func(a, b *models.NodeInfo) int { + return func(a, b *models.NodeState) int { return util.Compare[uint64]{}.CmpRev(capacity(a).GPU, capacity(b).GPU) } case "approval", "status": - return func(a, b *models.NodeInfo) int { - return util.Compare[string]{}.Cmp(a.Approval.String(), b.Approval.String()) + return func(a, b *models.NodeState) int { + return util.Compare[string]{}.Cmp(a.Membership.String(), b.Membership.String()) } default: } diff --git a/pkg/publicapi/endpoint/requester/endpoints_nodes.go b/pkg/publicapi/endpoint/requester/endpoints_nodes.go index 5a98e80ad5..1b31ca034b 100644 --- a/pkg/publicapi/endpoint/requester/endpoints_nodes.go +++ b/pkg/publicapi/endpoint/requester/endpoints_nodes.go @@ -17,7 +17,7 @@ import ( // @Router /api/v1/requester/nodes [get] func (s *Endpoint) nodes(c echo.Context) error { ctx := c.Request().Context() - nodes, err := s.nodeDiscoverer.ListNodes(ctx) + nodes, err := s.nodeDiscoverer.List(ctx) if err != nil { return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) } diff --git a/pkg/publicapi/endpoint/shared/endpoint.go b/pkg/publicapi/endpoint/shared/endpoint.go index 0235e474a3..6b2fe1f2c6 100644 --- a/pkg/publicapi/endpoint/shared/endpoint.go +++ b/pkg/publicapi/endpoint/shared/endpoint.go @@ -3,30 +3,31 @@ package shared import ( "net/http" + "github.com/labstack/echo/v4" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels/legacymodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/middleware" "github.com/bacalhau-project/bacalhau/pkg/version" - "github.com/labstack/echo/v4" ) type EndpointParams struct { - Router *echo.Echo - NodeID string - NodeInfoProvider models.NodeInfoProvider + Router *echo.Echo + NodeID string + NodeStateProvider models.NodeStateProvider } type Endpoint struct { - router *echo.Echo - nodeID string - nodeInfoProvider models.NodeInfoProvider + router *echo.Echo + nodeID string + nodeStateProvider models.NodeStateProvider } func NewEndpoint(params EndpointParams) *Endpoint { e := &Endpoint{ - router: params.Router, - nodeID: params.NodeID, - nodeInfoProvider: params.NodeInfoProvider, + router: params.Router, + nodeID: params.NodeID, + nodeStateProvider: params.NodeStateProvider, } // JSON group @@ -74,7 +75,7 @@ func (e *Endpoint) id(c echo.Context) error { // @Failure 500 {object} string // @Router /api/v1/node_info [get] func (e *Endpoint) nodeInfo(c echo.Context) error { - return c.JSON(http.StatusOK, e.nodeInfoProvider.GetNodeInfo(c.Request().Context())) + return c.JSON(http.StatusOK, e.nodeStateProvider.GetNodeState(c.Request().Context())) } // version godoc diff --git a/pkg/publicapi/test/agent_test.go b/pkg/publicapi/test/agent_test.go index 2a1315cd91..c6a06eb9bc 100644 --- a/pkg/publicapi/test/agent_test.go +++ b/pkg/publicapi/test/agent_test.go @@ -4,11 +4,7 @@ package test import ( "context" - "testing" - "github.com/stretchr/testify/require" - - "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/version" ) @@ -36,12 +32,15 @@ func (s *ServerSuite) TestAgentNode() { resp, err := s.client.Agent().Node(ctx, &apimodels.GetAgentNodeRequest{}) s.Require().NoError(err) s.Require().NotEmpty(resp) - s.Require().NotNil(resp.NodeInfo) + s.Require().NotNil(resp.NodeState) - node := s.requesterNode - expectedNode, err := node.RequesterNode.NodeInfoStore.Get(context.Background(), s.requesterNode.ID) + requesterNode := 
s.requesterNode + // NB(forrest): we are only asserting NodeInfos are equal (which excludes approvals and liveness from NodeState) + // because we are asking the requester's NodeInfoStore for the NodeState it contains on itself (s.requesterNode.ID) + // and since the requester doesn't send heartbeat messages to itself it will consider itself disconnected + expectedNode, err := requesterNode.RequesterNode.NodeInfoStore.Get(context.Background(), s.requesterNode.ID) s.Require().NoError(err) - equalNodeInfo(s.T(), expectedNode, *resp.NodeInfo) + s.Require().Equal(expectedNode.Info, resp.Info) } func (s *ServerSuite) TestAgentNodeCompute() { @@ -49,26 +48,5 @@ func (s *ServerSuite) TestAgentNodeCompute() { resp, err := s.computeClient.Agent().Node(ctx, &apimodels.GetAgentNodeRequest{}) s.Require().NoError(err) s.Require().NotEmpty(resp) - s.Require().NotNil(resp.NodeInfo) -} - -func equalNodeInfo(t *testing.T, a, b models.NodeInfo) { - require.Equal(t, a.BacalhauVersion, b.BacalhauVersion) - require.Equal(t, a.ID(), b.ID()) - require.Equal(t, a.NodeType, b.NodeType) - require.Equal(t, a.Labels, b.Labels) - - if a.ComputeNodeInfo == nil { - require.Nil(t, b.ComputeNodeInfo) - return - } - require.ElementsMatch(t, a.ComputeNodeInfo.ExecutionEngines, b.ComputeNodeInfo.ExecutionEngines) - require.ElementsMatch(t, a.ComputeNodeInfo.Publishers, b.ComputeNodeInfo.Publishers) - require.ElementsMatch(t, a.ComputeNodeInfo.StorageSources, b.ComputeNodeInfo.StorageSources) - require.Equal(t, a.ComputeNodeInfo.MaxCapacity, b.ComputeNodeInfo.MaxCapacity) - require.Equal(t, a.ComputeNodeInfo.AvailableCapacity, b.ComputeNodeInfo.AvailableCapacity) - require.Equal(t, a.ComputeNodeInfo.MaxJobRequirements, b.ComputeNodeInfo.MaxJobRequirements) - require.Equal(t, a.ComputeNodeInfo.RunningExecutions, b.ComputeNodeInfo.RunningExecutions) - require.Equal(t, a.ComputeNodeInfo.RunningExecutions, b.ComputeNodeInfo.RunningExecutions) - + s.Require().NotNil(resp.NodeState) } diff --git a/pkg/routing/inmemory/inmemory.go b/pkg/routing/inmemory/inmemory.go index 666fe8b849..397d431ccd 100644 --- a/pkg/routing/inmemory/inmemory.go +++ b/pkg/routing/inmemory/inmemory.go @@ -6,16 +6,16 @@ import ( "sync" "time" - "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/libp2p/go-libp2p/core/peer" "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/routing" ) // TODO: replace the manual and lazy eviction with a more efficient caching library type nodeInfoWrapper struct { - models.NodeInfo + models.NodeState evictAt time.Time } @@ -36,48 +36,48 @@ func NewNodeStore(params NodeStoreParams) *NodeStore { } } -func (r *NodeStore) Add(ctx context.Context, nodeInfo models.NodeInfo) error { +func (r *NodeStore) Add(ctx context.Context, state models.NodeState) error { r.mu.Lock() defer r.mu.Unlock() // add or update the node info - nodeID := nodeInfo.ID() + nodeID := state.Info.ID() r.nodeInfoMap[nodeID] = nodeInfoWrapper{ - NodeInfo: nodeInfo, - evictAt: time.Now().Add(r.ttl), + NodeState: state, + evictAt: time.Now().Add(r.ttl), } - log.Ctx(ctx).Trace().Msgf("Added node info %+v", nodeInfo) + log.Ctx(ctx).Trace().Msgf("Added node state %+v", state) return nil } -func (r *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeInfo, error) { +func (r *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeState, error) { r.mu.RLock() defer r.mu.RUnlock() infoWrapper, ok := r.nodeInfoMap[nodeID] if !ok { - return models.NodeInfo{}, 
routing.NewErrNodeNotFound(nodeID) + return models.NodeState{}, routing.NewErrNodeNotFound(nodeID) } if time.Now().After(infoWrapper.evictAt) { go r.evict(ctx, infoWrapper) - return models.NodeInfo{}, routing.NewErrNodeNotFound(nodeID) + return models.NodeState{}, routing.NewErrNodeNotFound(nodeID) } - return infoWrapper.NodeInfo, nil + return infoWrapper.NodeState, nil } -func (r *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.NodeInfo, error) { +func (r *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.NodeState, error) { r.mu.RLock() defer r.mu.RUnlock() - nodeInfo, err := r.Get(ctx, prefix) + state, err := r.Get(ctx, prefix) // we found a node with the exact ID if err == nil { - return nodeInfo, nil + return state, nil } // return the error if it's not a node not found error var errNotFound routing.ErrNodeNotFound if !errors.As(err, &errNotFound) { - return models.NodeInfo{}, err + return models.NodeState{}, err } // look for a node with the prefix. if there are multiple nodes with the same prefix, return ErrMultipleNodesFound error @@ -96,14 +96,14 @@ func (r *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.Node } if len(nodeIDsWithPrefix) == 0 { - return models.NodeInfo{}, routing.NewErrNodeNotFound(prefix) + return models.NodeState{}, routing.NewErrNodeNotFound(prefix) } if len(nodeIDsWithPrefix) > 1 { - return models.NodeInfo{}, routing.NewErrMultipleNodesFound(prefix, nodeIDsWithPrefix) + return models.NodeState{}, routing.NewErrMultipleNodesFound(prefix, nodeIDsWithPrefix) } - return r.nodeInfoMap[nodeIDsWithPrefix[0]].NodeInfo, nil + return r.nodeInfoMap[nodeIDsWithPrefix[0]].NodeState, nil } func (r *NodeStore) FindPeer(ctx context.Context, peerID peer.ID) (peer.AddrInfo, error) { @@ -113,40 +113,40 @@ func (r *NodeStore) FindPeer(ctx context.Context, peerID peer.ID) (peer.AddrInfo if !ok { return peer.AddrInfo{}, nil } - if infoWrapper.PeerInfo != nil && len(infoWrapper.PeerInfo.Addrs) > 0 { - return *infoWrapper.PeerInfo, nil + if infoWrapper.Info.PeerInfo != nil && len(infoWrapper.Info.PeerInfo.Addrs) > 0 { + return *infoWrapper.Info.PeerInfo, nil } return peer.AddrInfo{}, nil } -func (r *NodeStore) List(ctx context.Context, filters ...routing.NodeInfoFilter) ([]models.NodeInfo, error) { +func (r *NodeStore) List(ctx context.Context, filters ...routing.NodeStateFilter) ([]models.NodeState, error) { r.mu.RLock() defer r.mu.RUnlock() - megaFilter := func(info models.NodeInfo) bool { + megaFilter := func(state models.NodeState) bool { for _, filter := range filters { - if !filter(info) { + if !filter(state) { return false } } return true } - var nodeInfos []models.NodeInfo + var nodeStates []models.NodeState var toEvict []nodeInfoWrapper - for _, nodeInfo := range r.nodeInfoMap { - if time.Now().After(nodeInfo.evictAt) { - toEvict = append(toEvict, nodeInfo) + for _, nodeState := range r.nodeInfoMap { + if time.Now().After(nodeState.evictAt) { + toEvict = append(toEvict, nodeState) } else { - if megaFilter(nodeInfo.NodeInfo) { - nodeInfos = append(nodeInfos, nodeInfo.NodeInfo) + if megaFilter(nodeState.NodeState) { + nodeStates = append(nodeStates, nodeState.NodeState) } } } if len(toEvict) > 0 { go r.evict(ctx, toEvict...) 
} - return nodeInfos, nil + return nodeStates, nil } func (r *NodeStore) Delete(ctx context.Context, nodeID string) error { @@ -155,13 +155,13 @@ func (r *NodeStore) Delete(ctx context.Context, nodeID string) error { return r.doDelete(ctx, nodeID) } -func (r *NodeStore) evict(ctx context.Context, infoWrappers ...nodeInfoWrapper) { +func (r *NodeStore) evict(ctx context.Context, stateWrappers ...nodeInfoWrapper) { r.mu.Lock() defer r.mu.Unlock() - for _, infoWrapper := range infoWrappers { - nodeID := infoWrapper.ID() + for _, stateWrapper := range stateWrappers { + nodeID := stateWrapper.Info.ID() nodeInfo, ok := r.nodeInfoMap[nodeID] - if !ok || nodeInfo.evictAt != infoWrapper.evictAt { + if !ok || nodeInfo.evictAt != stateWrapper.evictAt { return // node info already evicted or has been updated since it was scheduled for eviction } err := r.doDelete(ctx, nodeID) diff --git a/pkg/routing/inmemory/inmemory_test.go b/pkg/routing/inmemory/inmemory_test.go index 3d5c55aeb1..91dd46337c 100644 --- a/pkg/routing/inmemory/inmemory_test.go +++ b/pkg/routing/inmemory/inmemory_test.go @@ -8,10 +8,11 @@ import ( "testing" "time" + "github.com/stretchr/testify/suite" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/routing/inmemory" - "github.com/stretchr/testify/suite" ) var nodeIDs = []string{ @@ -37,17 +38,17 @@ func TestInMemoryNodeStoreSuite(t *testing.T) { func (s *InMemoryNodeStoreSuite) Test_Get() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // test Get - res1, err := s.store.Get(ctx, nodeInfo0.ID()) + res1, err := s.store.Get(ctx, nodeInfo0.Info.ID()) s.NoError(err) s.Equal(nodeInfo0, res1) - res2, err := s.store.Get(ctx, nodeInfo1.ID()) + res2, err := s.store.Get(ctx, nodeInfo1.Info.ID()) s.NoError(err) s.Equal(nodeInfo1, res2) } @@ -62,7 +63,7 @@ func (s *InMemoryNodeStoreSuite) Test_GetNotFound() { func (s *InMemoryNodeStoreSuite) Test_GetByPrefix_SingleMatch() { ctx := context.Background() - nodeInfo := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) + nodeInfo := generateNodeState(nodeIDs[0], models.EngineDocker) s.NoError(s.store.Add(ctx, nodeInfo)) res, err := s.store.GetByPrefix(ctx, "QmdZQ7") @@ -72,8 +73,8 @@ func (s *InMemoryNodeStoreSuite) Test_GetByPrefix_SingleMatch() { func (s *InMemoryNodeStoreSuite) Test_GetByPrefix_MultipleMatches() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) @@ -95,7 +96,7 @@ func (s *InMemoryNodeStoreSuite) Test_GetByPrefix_ExpiredNode() { TTL: 10 * time.Millisecond, }) - nodeInfo := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) + nodeInfo := generateNodeState(nodeIDs[0], models.EngineDocker) s.NoError(store.Add(ctx, nodeInfo)) // Wait for the item to expire @@ -108,44 +109,44 @@ func (s *InMemoryNodeStoreSuite) Test_GetByPrefix_ExpiredNode() { func (s *InMemoryNodeStoreSuite) 
Test_List() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // test List allNodeInfos, err := s.store.List(ctx) s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo0, nodeInfo1}, allNodeInfos) + s.ElementsMatch([]models.NodeState{nodeInfo0, nodeInfo1}, allNodeInfos) } func (s *InMemoryNodeStoreSuite) Test_ListWithFilters() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // Match one record - filterPartialID := func(node models.NodeInfo) bool { - return strings.HasPrefix(node.ID(), string(nodeIDs[0][0:8])) + filterPartialID := func(node models.NodeState) bool { + return strings.HasPrefix(node.Info.ID(), string(nodeIDs[0][0:8])) } nodes, err := s.store.List(ctx, filterPartialID) s.NoError(err) s.Equal(1, len(nodes)) - s.Equal(nodeIDs[0], nodes[0].ID()) + s.Equal(nodeIDs[0], nodes[0].Info.ID()) // Match all records - filterPartialID = func(node models.NodeInfo) bool { - return strings.HasPrefix(node.ID(), "Qm") + filterPartialID = func(node models.NodeState) bool { + return strings.HasPrefix(node.Info.ID(), "Qm") } nodes, err = s.store.List(ctx, filterPartialID) s.NoError(err) s.Equal(2, len(nodes)) // Match no records - filterPartialID = func(node models.NodeInfo) bool { - return strings.HasPrefix(node.ID(), "XYZ") + filterPartialID = func(node models.NodeState) bool { + return strings.HasPrefix(node.Info.ID(), "XYZ") } nodes, err = s.store.List(ctx, filterPartialID) s.NoError(err) @@ -154,19 +155,19 @@ func (s *InMemoryNodeStoreSuite) Test_ListWithFilters() { func (s *InMemoryNodeStoreSuite) Test_Delete() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineDocker, models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineDocker, models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // delete first node - s.NoError(s.store.Delete(ctx, nodeInfo0.ID())) + s.NoError(s.store.Delete(ctx, nodeInfo0.Info.ID())) nodes, err := s.store.List(ctx) s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo1}, nodes) + s.ElementsMatch([]models.NodeState{nodeInfo1}, nodes) // delete second node - s.NoError(s.store.Delete(ctx, nodeInfo1.ID())) + s.NoError(s.store.Delete(ctx, nodeInfo1.Info.ID())) nodes, err = s.store.List(ctx) s.NoError(err) s.Empty(nodes) @@ -174,21 +175,21 @@ func (s *InMemoryNodeStoreSuite) Test_Delete() { func (s *InMemoryNodeStoreSuite) Test_Replace() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) s.NoError(s.store.Add(ctx, nodeInfo0)) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineWasm) - nodeInfo1.NodeID = nodeInfo0.NodeID + nodeInfo1 := 
generateNodeState(nodeIDs[0], models.EngineWasm) + nodeInfo1.Info.NodeID = nodeInfo0.Info.NodeID s.NoError(s.store.Add(ctx, nodeInfo1)) - res, err := s.store.Get(ctx, nodeInfo0.ID()) + res, err := s.store.Get(ctx, nodeInfo0.Info.ID()) s.NoError(err) s.Equal(nodeInfo1, res) // test List allNodeInfos, err := s.store.List(ctx) s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo1}, allNodeInfos) + s.ElementsMatch([]models.NodeState{nodeInfo1}, allNodeInfos) } func (s *InMemoryNodeStoreSuite) Test_Eviction() { @@ -197,22 +198,28 @@ func (s *InMemoryNodeStoreSuite) Test_Eviction() { TTL: ttl, }) ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) s.NoError(s.store.Add(ctx, nodeInfo0)) // test Get - res, err := s.store.Get(ctx, nodeInfo0.ID()) + res, err := s.store.Get(ctx, nodeInfo0.Info.ID()) s.NoError(err) s.Equal(nodeInfo0, res) // wait for eviction time.Sleep(ttl + 100*time.Millisecond) - _, err = s.store.Get(ctx, nodeInfo0.ID()) + _, err = s.store.Get(ctx, nodeInfo0.Info.ID()) s.Error(err) s.IsType(routing.ErrNodeNotFound{}, err) } -func generateNodeInfo(t *testing.T, peerID string, engines ...string) models.NodeInfo { +func generateNodeState(peerID string, engines ...string) models.NodeState { + return models.NodeState{ + Info: generateNodeInfo(peerID, engines...), + } +} + +func generateNodeInfo(peerID string, engines ...string) models.NodeInfo { return models.NodeInfo{ NodeID: peerID, NodeType: models.NodeTypeCompute, diff --git a/pkg/routing/kvstore/kvstore.go b/pkg/routing/kvstore/kvstore.go index 5e8bbd3ef7..f288adbb3c 100644 --- a/pkg/routing/kvstore/kvstore.go +++ b/pkg/routing/kvstore/kvstore.go @@ -58,59 +58,59 @@ func (n *NodeStore) FindPeer(ctx context.Context, peerID peer.ID) (peer.AddrInfo // We are temporarily matching the code of the inmemory.NodeStore which never returns an // error for this method. nodeID := peerID.String() - info, err := n.Get(ctx, nodeID) + state, err := n.Get(ctx, nodeID) if err != nil { return peer.AddrInfo{}, nil } - return *info.PeerInfo, nil + return *state.Info.PeerInfo, nil } -// Add adds a node info to the repo. -func (n *NodeStore) Add(ctx context.Context, nodeInfo models.NodeInfo) error { - data, err := json.Marshal(nodeInfo) +// Add adds a node state to the repo. +func (n *NodeStore) Add(ctx context.Context, state models.NodeState) error { + data, err := json.Marshal(state) if err != nil { - return pkgerrors.Wrap(err, "failed to marshal node info adding to node store") + return pkgerrors.Wrap(err, "failed to marshal node state adding to node store") } - _, err = n.kv.Put(ctx, nodeInfo.ID(), data) + _, err = n.kv.Put(ctx, state.Info.ID(), data) if err != nil { - return pkgerrors.Wrap(err, "failed to write node info to node store") + return pkgerrors.Wrap(err, "failed to write node state to node store") } return nil } -// Get returns the node info for the given node ID. -func (n *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeInfo, error) { +// Get returns the node state for the given node ID. 
+func (n *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeState, error) { entry, err := n.kv.Get(ctx, nodeID) if err != nil { if pkgerrors.Is(err, jetstream.ErrKeyNotFound) { - return models.NodeInfo{}, routing.NewErrNodeNotFound(nodeID) + return models.NodeState{}, routing.NewErrNodeNotFound(nodeID) } - return models.NodeInfo{}, pkgerrors.Wrap(err, "failed to get node info from node store") + return models.NodeState{}, pkgerrors.Wrap(err, "failed to get node state from node store") } - var node models.NodeInfo + var node models.NodeState err = json.Unmarshal(entry.Value(), &node) if err != nil { - return models.NodeInfo{}, pkgerrors.Wrap(err, "failed to unmarshal node info from node store") + return models.NodeState{}, pkgerrors.Wrap(err, "failed to unmarshal node state from node store") } return node, nil } -// GetByPrefix returns the node info for the given node ID. +// GetByPrefix returns the node state for the given node ID. // Supports both full and short node IDs and currently iterates through all of the // keys to find matches, due to NATS KVStore not supporting prefix searches (yet). -func (n *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.NodeInfo, error) { +func (n *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.NodeState, error) { keys, err := n.kv.Keys(ctx) if err != nil { if pkgerrors.Is(err, jetstream.ErrNoKeysFound) { - return models.NodeInfo{}, routing.NewErrNodeNotFound(prefix) + return models.NodeState{}, routing.NewErrNodeNotFound(prefix) } - return models.NodeInfo{}, pkgerrors.Wrap(err, "failed to get by prefix when listing keys") + return models.NodeState{}, pkgerrors.Wrap(err, "failed to get by prefix when listing keys") } // Filter the list down to just the matching keys @@ -119,38 +119,38 @@ func (n *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.Node }) if len(keys) == 0 { - return models.NodeInfo{}, routing.NewErrNodeNotFound(prefix) + return models.NodeState{}, routing.NewErrNodeNotFound(prefix) } else if len(keys) > 1 { - return models.NodeInfo{}, routing.NewErrMultipleNodesFound(prefix, keys) + return models.NodeState{}, routing.NewErrMultipleNodesFound(prefix, keys) } return n.Get(ctx, keys[0]) } // List returns a list of nodes -func (n *NodeStore) List(ctx context.Context, filters ...routing.NodeInfoFilter) ([]models.NodeInfo, error) { +func (n *NodeStore) List(ctx context.Context, filters ...routing.NodeStateFilter) ([]models.NodeState, error) { keys, err := n.kv.Keys(ctx) if err != nil { // Return an empty list rather than an error if there are no keys in the bucket if pkgerrors.Is(err, jetstream.ErrNoKeysFound) { - return []models.NodeInfo{}, nil + return []models.NodeState{}, nil } - return nil, pkgerrors.Wrap(err, "failed to list node info from node store") + return nil, pkgerrors.Wrap(err, "failed to list node state from node store") } var mErr error // Create a mega filter that combines all the filters into one - megaFilter := func(info models.NodeInfo) bool { + megaFilter := func(state models.NodeState) bool { for _, filter := range filters { - if !filter(info) { + if !filter(state) { return false } } return true } - nodes := make([]models.NodeInfo, 0, len(keys)) + nodes := make([]models.NodeState, 0, len(keys)) for _, key := range keys { node, err := n.Get(ctx, key) if err != nil { @@ -165,10 +165,10 @@ func (n *NodeStore) List(ctx context.Context, filters ...routing.NodeInfoFilter) return nodes, mErr } -// Delete deletes a node info from the repo. 
+// Delete deletes a node state from the repo. func (n *NodeStore) Delete(ctx context.Context, nodeID string) error { if err := n.kv.Purge(ctx, nodeID); err != nil { - return pkgerrors.Wrap(err, "failed to purge node info from node store") + return pkgerrors.Wrap(err, "failed to purge node state from node store") } return nil diff --git a/pkg/routing/kvstore/kvstore_test.go b/pkg/routing/kvstore/kvstore_test.go index 78cc42a95b..1acffa1c36 100644 --- a/pkg/routing/kvstore/kvstore_test.go +++ b/pkg/routing/kvstore/kvstore_test.go @@ -11,10 +11,11 @@ import ( natsserver "github.com/nats-io/nats-server/v2/test" "github.com/nats-io/nats.go" + "github.com/stretchr/testify/suite" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/routing" "github.com/bacalhau-project/bacalhau/pkg/routing/kvstore" - "github.com/stretchr/testify/suite" ) const TEST_PORT = 8369 @@ -57,19 +58,19 @@ func TestKVNodeInfoStoreSuite(t *testing.T) { func (s *KVNodeInfoStoreSuite) Test_Get() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // test Get - res1, err := s.store.Get(ctx, nodeInfo0.ID()) + res1, err := s.store.Get(ctx, nodeInfo0.Info.ID()) s.NoError(err) - s.Equal(nodeInfo0.ID(), res1.ID()) + s.Equal(nodeInfo0.Info.ID(), res1.Info.ID()) - res2, err := s.store.Get(ctx, nodeInfo1.ID()) + res2, err := s.store.Get(ctx, nodeInfo1.Info.ID()) s.NoError(err) - s.Equal(nodeInfo1.ID(), res2.ID()) + s.Equal(nodeInfo1.Info.ID(), res2.Info.ID()) } func (s *KVNodeInfoStoreSuite) Test_GetNotFound() { @@ -81,7 +82,7 @@ func (s *KVNodeInfoStoreSuite) Test_GetNotFound() { func (s *KVNodeInfoStoreSuite) Test_GetByPrefix_SingleMatch() { ctx := context.Background() - nodeInfo := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) + nodeInfo := generateNodeState(nodeIDs[0], models.EngineDocker) s.NoError(s.store.Add(ctx, nodeInfo)) res, err := s.store.GetByPrefix(ctx, "QmdZQ7") @@ -91,8 +92,8 @@ func (s *KVNodeInfoStoreSuite) Test_GetByPrefix_SingleMatch() { func (s *KVNodeInfoStoreSuite) Test_GetByPrefix_MultipleMatches() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) @@ -111,7 +112,7 @@ func (s *KVNodeInfoStoreSuite) Test_GetByPrefix_NoMatch_Empty() { func (s *KVNodeInfoStoreSuite) Test_GetByPrefix_NoMatch_NotEmpty() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) _, err := s.store.GetByPrefix(ctx, "nonexistent") @@ -121,44 +122,44 @@ func (s *KVNodeInfoStoreSuite) Test_GetByPrefix_NoMatch_NotEmpty() { func (s *KVNodeInfoStoreSuite) Test_List() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + 
nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // test List allNodeInfos, err := s.store.List(ctx) s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo0, nodeInfo1}, allNodeInfos) + s.ElementsMatch([]models.NodeState{nodeInfo0, nodeInfo1}, allNodeInfos) } func (s *KVNodeInfoStoreSuite) Test_ListWithFilters() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // Match one record - filterPartialID := func(node models.NodeInfo) bool { - return strings.HasPrefix(node.ID(), string(nodeIDs[0][0:8])) + filterPartialID := func(node models.NodeState) bool { + return strings.HasPrefix(node.Info.ID(), string(nodeIDs[0][0:8])) } nodes, err := s.store.List(ctx, filterPartialID) s.NoError(err) s.Equal(1, len(nodes)) - s.Equal(nodeIDs[0], nodes[0].ID()) + s.Equal(nodeIDs[0], nodes[0].Info.ID()) // Match all records - filterPartialID = func(node models.NodeInfo) bool { - return strings.HasPrefix(node.ID(), "Qm") + filterPartialID = func(node models.NodeState) bool { + return strings.HasPrefix(node.Info.ID(), "Qm") } nodes, err = s.store.List(ctx, filterPartialID) s.NoError(err) s.Equal(2, len(nodes)) // Match no records - filterPartialID = func(node models.NodeInfo) bool { - return strings.HasPrefix(node.ID(), "XYZ") + filterPartialID = func(node models.NodeState) bool { + return strings.HasPrefix(node.Info.ID(), "XYZ") } nodes, err = s.store.List(ctx, filterPartialID) s.NoError(err) @@ -167,20 +168,20 @@ func (s *KVNodeInfoStoreSuite) Test_ListWithFilters() { func (s *KVNodeInfoStoreSuite) Test_Delete() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[1], models.EngineDocker, models.EngineWasm) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) + nodeInfo1 := generateNodeState(nodeIDs[1], models.EngineDocker, models.EngineWasm) s.NoError(s.store.Add(ctx, nodeInfo0)) s.NoError(s.store.Add(ctx, nodeInfo1)) // delete first node - s.NoError(s.store.Delete(ctx, nodeInfo0.ID())) + s.NoError(s.store.Delete(ctx, nodeInfo0.Info.ID())) nodes, err := s.store.List(ctx) s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo1}, nodes) + s.ElementsMatch([]models.NodeState{nodeInfo1}, nodes) // delete second node - s.NoError(s.store.Delete(ctx, nodeInfo1.ID())) + s.NoError(s.store.Delete(ctx, nodeInfo1.Info.ID())) nodes, err = s.store.List(ctx) s.NoError(err) @@ -189,24 +190,30 @@ func (s *KVNodeInfoStoreSuite) Test_Delete() { func (s *KVNodeInfoStoreSuite) Test_Replace() { ctx := context.Background() - nodeInfo0 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineDocker) + nodeInfo0 := generateNodeState(nodeIDs[0], models.EngineDocker) s.NoError(s.store.Add(ctx, nodeInfo0)) - nodeInfo1 := generateNodeInfo(s.T(), nodeIDs[0], models.EngineWasm) - nodeInfo1.NodeID = nodeInfo0.NodeID + nodeInfo1 := generateNodeState(nodeIDs[0], models.EngineWasm) + nodeInfo1.Info.NodeID = nodeInfo0.Info.NodeID s.NoError(s.store.Add(ctx, nodeInfo1)) - res, err := s.store.Get(ctx, nodeInfo0.ID()) + res, err := s.store.Get(ctx, nodeInfo0.Info.ID()) s.NoError(err) s.Equal(nodeInfo1, res) // test List 
allNodeInfos, err := s.store.List(ctx) s.NoError(err) - s.ElementsMatch([]models.NodeInfo{nodeInfo1}, allNodeInfos) + s.ElementsMatch([]models.NodeState{nodeInfo1}, allNodeInfos) +} + +func generateNodeState(peerID string, engines ...string) models.NodeState { + return models.NodeState{ + Info: generateNodeInfo(peerID, engines...), + } } -func generateNodeInfo(t *testing.T, peerID string, engines ...string) models.NodeInfo { +func generateNodeInfo(peerID string, engines ...string) models.NodeInfo { return models.NodeInfo{ NodeID: peerID, NodeType: models.NodeTypeCompute, diff --git a/pkg/routing/node_info_provider.go b/pkg/routing/node_info_provider.go index 507d7fedbc..3b9a65a3b7 100644 --- a/pkg/routing/node_info_provider.go +++ b/pkg/routing/node_info_provider.go @@ -6,23 +6,23 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/models" ) -type NodeInfoProviderParams struct { +type NodeStateProviderParams struct { NodeID string LabelsProvider models.LabelsProvider BacalhauVersion models.BuildVersionInfo - DefaultNodeApproval models.NodeApproval + DefaultNodeApproval models.NodeMembershipState } -type NodeInfoProvider struct { +type NodeStateProvider struct { nodeID string labelsProvider models.LabelsProvider bacalhauVersion models.BuildVersionInfo nodeInfoDecorators []models.NodeInfoDecorator - defaultNodeApproval models.NodeApproval + defaultNodeApproval models.NodeMembershipState } -func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { - provider := &NodeInfoProvider{ +func NewNodeStateProvider(params NodeStateProviderParams) *NodeStateProvider { + provider := &NodeStateProvider{ nodeID: params.NodeID, labelsProvider: params.LabelsProvider, bacalhauVersion: params.BacalhauVersion, @@ -32,35 +32,48 @@ func NewNodeInfoProvider(params NodeInfoProviderParams) *NodeInfoProvider { // If we were not given a default approval, we default to PENDING if !provider.defaultNodeApproval.IsValid() { - provider.defaultNodeApproval = models.NodeApprovals.PENDING + provider.defaultNodeApproval = models.NodeMembership.PENDING } return provider } // RegisterNodeInfoDecorator registers a node info decorator with the node info provider. -func (n *NodeInfoProvider) RegisterNodeInfoDecorator(decorator models.NodeInfoDecorator) { +func (n *NodeStateProvider) RegisterNodeInfoDecorator(decorator models.NodeInfoDecorator) { n.nodeInfoDecorators = append(n.nodeInfoDecorators, decorator) } -func (n *NodeInfoProvider) GetNodeInfo(ctx context.Context) models.NodeInfo { - res := models.NodeInfo{ +func (n *NodeStateProvider) GetNodeState(ctx context.Context) models.NodeState { + info := models.NodeInfo{ NodeID: n.nodeID, BacalhauVersion: n.bacalhauVersion, Labels: n.labelsProvider.GetLabels(ctx), NodeType: models.NodeTypeRequester, - Approval: n.defaultNodeApproval, } for _, decorator := range n.nodeInfoDecorators { - res = decorator.DecorateNodeInfo(ctx, res) + info = decorator.DecorateNodeInfo(ctx, info) } - if !res.Approval.IsValid() { - res.Approval = models.NodeApprovals.PENDING + state := models.NodeState{ + Info: info, + Membership: n.defaultNodeApproval, + // NB(forrest): we are returning NodeState about ourselves (the Requester) + // the concept of a disconnected requester node could only exist from the + // perspective of a ComputeNode or another RequesterNode. + // We don't support multiple requester nodes nor querying the state of one from a Compute node. (yet) + // So we always say we are connected here.
+ + // This is all pretty funky and my comment here will hopefully become outdated at some point and need adjusting, + but for now: "you can tell the requester node is connected because of the way it is". + Connection: models.NodeStates.CONNECTED, + } + + if !state.Membership.IsValid() { + state.Membership = models.NodeMembership.PENDING } - return res + return state } // compile-time interface check -var _ models.NodeInfoProvider = &NodeInfoProvider{} +var _ models.NodeStateProvider = &NodeStateProvider{} diff --git a/pkg/routing/node_info_publisher.go b/pkg/routing/node_info_publisher.go index 6f99ba0cdb..719f0d0ca5 100644 --- a/pkg/routing/node_info_publisher.go +++ b/pkg/routing/node_info_publisher.go @@ -5,10 +5,11 @@ import ( "sync" "time" + "github.com/rs/zerolog/log" + "github.com/bacalhau-project/bacalhau/pkg/models" "github.com/bacalhau-project/bacalhau/pkg/pubsub" "github.com/bacalhau-project/bacalhau/pkg/system" - "github.com/rs/zerolog/log" ) type NodeInfoPublisherIntervalConfig struct { @@ -34,26 +35,26 @@ func (n NodeInfoPublisherIntervalConfig) IsEagerPublishEnabled() bool { } type NodeInfoPublisherParams struct { - PubSub pubsub.Publisher[models.NodeInfo] - NodeInfoProvider models.NodeInfoProvider - IntervalConfig NodeInfoPublisherIntervalConfig + PubSub pubsub.Publisher[models.NodeState] + NodeStateProvider models.NodeStateProvider + IntervalConfig NodeInfoPublisherIntervalConfig } type NodeInfoPublisher struct { - pubSub pubsub.Publisher[models.NodeInfo] - nodeInfoProvider models.NodeInfoProvider - intervalConfig NodeInfoPublisherIntervalConfig - stopped bool - stopChannel chan struct{} - stopOnce sync.Once + pubSub pubsub.Publisher[models.NodeState] + nodeStateProvider models.NodeStateProvider + intervalConfig NodeInfoPublisherIntervalConfig + stopped bool + stopChannel chan struct{} + stopOnce sync.Once } func NewNodeInfoPublisher(params NodeInfoPublisherParams) *NodeInfoPublisher { p := &NodeInfoPublisher{ - pubSub: params.PubSub, - nodeInfoProvider: params.NodeInfoProvider, - intervalConfig: params.IntervalConfig, - stopChannel: make(chan struct{}), + pubSub: params.PubSub, + nodeStateProvider: params.NodeStateProvider, + intervalConfig: params.IntervalConfig, + stopChannel: make(chan struct{}), } go func() { @@ -71,7 +72,7 @@ func (n *NodeInfoPublisher) Publish(ctx context.Context) error { ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/routing.NodeInfoPublisher.publish") defer span.End() - return n.pubSub.Publish(ctx, n.nodeInfoProvider.GetNodeInfo(ctx)) + return n.pubSub.Publish(ctx, n.nodeStateProvider.GetNodeState(ctx)) } func (n *NodeInfoPublisher) eagerPublishBackgroundTask() { diff --git a/pkg/routing/tracing/tracing.go b/pkg/routing/tracing/tracing.go index 8e23bceb3a..c1fabb03c4 100644 --- a/pkg/routing/tracing/tracing.go +++ b/pkg/routing/tracing/tracing.go @@ -22,7 +22,7 @@ func NewNodeStore(delegate routing.NodeInfoStore) *NodeStore { } } -func (r *NodeStore) Add(ctx context.Context, nodeInfo models.NodeInfo) error { +func (r *NodeStore) Add(ctx context.Context, state models.NodeState) error { ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/routing.NodeInfoStore.Add") //nolint:govet defer span.End() @@ -31,14 +31,14 @@ func (r *NodeStore) Add(ctx context.Context, nodeInfo models.NodeInfo) error { dur := stopwatch() log.Ctx(ctx).Trace(). Dur("duration", dur). - Str("node", nodeInfo.ID()). + Str("node", state.Info.ID()).
Msg("node added") }() - return r.delegate.Add(ctx, nodeInfo) + return r.delegate.Add(ctx, state) } -func (r *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeInfo, error) { +func (r *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeState, error) { ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/routing.NodeInfoStore.Get") //nolint:govet defer span.End() @@ -54,7 +54,7 @@ func (r *NodeStore) Get(ctx context.Context, nodeID string) (models.NodeInfo, er return r.delegate.Get(ctx, nodeID) } -func (r *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.NodeInfo, error) { +func (r *NodeStore) GetByPrefix(ctx context.Context, prefix string) (models.NodeState, error) { ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/routing.NodeInfoStore.GetByPrefix") //nolint:govet defer span.End() @@ -74,7 +74,7 @@ func (r *NodeStore) FindPeer(ctx context.Context, peerID peer.ID) (peer.AddrInfo return r.delegate.FindPeer(ctx, peerID) } -func (r *NodeStore) List(ctx context.Context, filters ...routing.NodeInfoFilter) ([]models.NodeInfo, error) { +func (r *NodeStore) List(ctx context.Context, filters ...routing.NodeStateFilter) ([]models.NodeState, error) { ctx, span := system.NewSpan(ctx, system.GetTracer(), "pkg/routing.NodeInfoStore.List") //nolint:govet defer span.End() diff --git a/pkg/routing/types.go b/pkg/routing/types.go index 75cf0aadeb..1facd0c611 100644 --- a/pkg/routing/types.go +++ b/pkg/routing/types.go @@ -3,33 +3,35 @@ package routing import ( "context" - "github.com/bacalhau-project/bacalhau/pkg/models" libp2p_routing "github.com/libp2p/go-libp2p/core/routing" + + "github.com/bacalhau-project/bacalhau/pkg/models" ) +// TODO rename this interface to NodeStore, it tracks more than their info type NodeInfoStore interface { // TODO: Remove this interface once we switch to nats libp2p_routing.PeerRouting // Add adds a node info to the repo. - Add(ctx context.Context, nodeInfo models.NodeInfo) error + Add(ctx context.Context, nodeInfo models.NodeState) error // Get returns the node info for the given node ID. - Get(ctx context.Context, nodeID string) (models.NodeInfo, error) + Get(ctx context.Context, nodeID string) (models.NodeState, error) // GetByPrefix returns the node info for the given node ID. // Supports both full and short node IDs. - GetByPrefix(ctx context.Context, prefix string) (models.NodeInfo, error) + GetByPrefix(ctx context.Context, prefix string) (models.NodeState, error) // List returns a list of nodes - List(ctx context.Context, filters ...NodeInfoFilter) ([]models.NodeInfo, error) + List(ctx context.Context, filters ...NodeStateFilter) ([]models.NodeState, error) // Delete deletes a node info from the repo. Delete(ctx context.Context, nodeID string) error } -// NodeInfoFilter is a function that filters node info -// when listing nodes. It returns true if the node info -// should be returned, and false if the node info should +// NodeStateFilter is a function that filters node state +// when listing nodes. It returns true if the node state +// should be returned, and false if the node state should // be ignored. 
-type NodeInfoFilter func(models.NodeInfo) bool +type NodeStateFilter func(models.NodeState) bool diff --git a/pkg/swagger/docs.go b/pkg/swagger/docs.go index 07187d003a..8aac73dfaf 100644 --- a/pkg/swagger/docs.go +++ b/pkg/swagger/docs.go @@ -2831,7 +2831,7 @@ const docTemplate = `{ "models.NodeInfo": { "type": "object", "properties": { - "Approval": { + "Membership": { "$ref": "#/definitions/models.NodeApproval" }, "BacalhauVersion": { diff --git a/pkg/test/teststack/stack.go b/pkg/test/teststack/stack.go index 2376cce3ec..f319db6d2c 100644 --- a/pkg/test/teststack/stack.go +++ b/pkg/test/teststack/stack.go @@ -111,7 +111,7 @@ func allNodesDiscovered(t testing.TB, stack *devstack.DevStack) bool { } expectedNodes := stack.GetNodeIds() - discoveredNodes, err := node.RequesterNode.NodeDiscoverer.ListNodes(ctx) + discoveredNodes, err := node.RequesterNode.NodeDiscoverer.List(ctx) require.NoError(t, err) if len(discoveredNodes) < len(expectedNodes) { @@ -121,7 +121,7 @@ func allNodesDiscovered(t testing.TB, stack *devstack.DevStack) bool { discoveredNodeIDs := make([]string, len(discoveredNodes)) for i, discoveredNode := range discoveredNodes { - discoveredNodeIDs[i] = discoveredNode.ID() + discoveredNodeIDs[i] = discoveredNode.Info.ID() } require.ElementsMatch(t, expectedNodes, discoveredNodeIDs) } diff --git a/pkg/test/utils/node/utils.go b/pkg/test/utils/node/utils.go index ff47f5d6e0..de8fb24e0e 100644 --- a/pkg/test/utils/node/utils.go +++ b/pkg/test/utils/node/utils.go @@ -22,15 +22,15 @@ func WaitForNodeDiscovery(t *testing.T, requesterNode *node.Requester, expectedN loggingGap := 1 * time.Second waitLoggingUntil := time.Now().Add(loggingGap) - var nodeInfos []models.NodeInfo + var nodeInfos []models.NodeState for time.Now().Before(waitUntil) { var err error nodeInfos, err = requesterNode.NodeInfoStore.List(ctx) require.NoError(t, err) if time.Now().After(waitLoggingUntil) { - t.Logf("connected to %d peers: %v", len(nodeInfos), logger.ToSliceStringer(nodeInfos, func(t models.NodeInfo) string { - return t.ID() + t.Logf("connected to %d peers: %v", len(nodeInfos), logger.ToSliceStringer(nodeInfos, func(t models.NodeState) string { + return t.Info.ID() })) waitLoggingUntil = time.Now().Add(loggingGap) } @@ -40,7 +40,7 @@ func WaitForNodeDiscovery(t *testing.T, requesterNode *node.Requester, expectedN time.Sleep(waitGaps) } require.FailNowf(t, fmt.Sprintf("requester node didn't read all node infos even after waiting for %s", waitDuration), - "expected 4 node infos, got %d. %+v", len(nodeInfos), logger.ToSliceStringer(nodeInfos, func(t models.NodeInfo) string { - return t.ID() + "expected 4 node infos, got %d. %+v", len(nodeInfos), logger.ToSliceStringer(nodeInfos, func(t models.NodeState) string { + return t.Info.ID() })) } diff --git a/pkg/transport/interfaces.go b/pkg/transport/interfaces.go index 8079eeb935..7c18fde790 100644 --- a/pkg/transport/interfaces.go +++ b/pkg/transport/interfaces.go @@ -23,7 +23,7 @@ type TransportLayer interface { // NodeInfoPubSub enables compute nodes to publish their info and capabilities // to orchestrator nodes for job matching and discovery. 
- NodeInfoPubSub() pubsub.PubSub[models.NodeInfo] + NodeInfoPubSub() pubsub.PubSub[models.NodeState] // NodeInfoDecorator enables transport layer to enrich node info with data // required for request routing diff --git a/test/labels.sh b/test/labels.sh index 8a9ccec96e..f0ccebeac5 100755 --- a/test/labels.sh +++ b/test/labels.sh @@ -15,11 +15,11 @@ run_test() { done assert_equal 1 $(jq -rcM length <<< $stdout) - assert_not_equal 0 $(jq -rcM '.[0].Labels | length' <<< $stdout) - assert_equal false $(jq -rcM '.[0].Labels["Operating-System"] == null' <<< $stdout) - assert_equal false $(jq -rcM '.[0].Labels["Architecture"] == null' <<< $stdout) - assert_equal value $(jq -rcM '.[0].Labels["key"]' <<< $stdout) - assert_equal $WORD $(jq -rcM '.[0].Labels["random"]' <<< $stdout) + assert_not_equal 0 $(jq -rcM '.[0].Info.Labels | length' <<< $stdout) + assert_equal false $(jq -rcM '.[0].Info.Labels["Operating-System"] == null' <<< $stdout) + assert_equal false $(jq -rcM '.[0].Info.Labels["Architecture"] == null' <<< $stdout) + assert_equal value $(jq -rcM '.[0].Info.Labels["key"]' <<< $stdout) + assert_equal $WORD $(jq -rcM '.[0].Info.Labels["random"]' <<< $stdout) } testcase_receive_labels_about_requester_node_for_nats() { @@ -32,7 +32,7 @@ testcase_receive_extra_labels_about_compute_node_for_nats() { subject bacalhau config set node.network.type nats assert_equal 0 $status run_test requester,compute - assert_equal false $(jq -rcM '.[0].Labels["git-lfs"] == null' <<< $stdout) + assert_equal false $(jq -rcM '.[0].Info.Labels["git-lfs"] == null' <<< $stdout) } testcase_receive_labels_about_requester_node_for_libp2p() { @@ -45,5 +45,5 @@ testcase_receive_extra_labels_about_compute_node_for_libp2p() { subject bacalhau config set node.network.type libp2p assert_equal 0 $status run_test requester,compute - assert_equal false $(jq -rcM '.[0].Labels["git-lfs"] == null' <<< $stdout) + assert_equal false $(jq -rcM '.[0].Info.Labels["git-lfs"] == null' <<< $stdout) } From 3a4c578249c46e0796bb2305b1bd6d91b588e6e0 Mon Sep 17 00:00:00 2001 From: Walid Baruni Date: Mon, 22 Apr 2024 16:03:36 +0200 Subject: [PATCH 12/17] Add issue templates (#3951) Add a template for when users open a new issue --- .github/ISSUE_TEMPLATE/bug_report.md | 40 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 8 +++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++++++++ .github/dependabot.yml | 6 ++++ 4 files changed, 74 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..e60993b507 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,40 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: request/new, type/bug +assignees: '' + +--- + +### Bug Description +*Briefly describe the unexpected behavior or performance regression. What happened that wasn’t supposed to?* + +### Expected Behavior +*Detail what you expected to happen instead of the bug.* + +### Steps to Reproduce +1. Step one to reproduce +2. Step two +3. Step three +4. (Continue as necessary) + +### Bacalhau Versions +- **Agent Version**: Run `bacalhau agent version` to get this. +- **CLI Client Version**: Run `bacalhau version` for the client info. 
+ +### Host Environment +*Provide details about the environment where the bug occurred:* +- Operating System: +- CPU Architecture: +- Any other relevant environment details: + +### Job Specification +*(If applicable, provide the job spec used when the issue occurred.)* + +### Logs +#### Agent Logs: +*(Include here if applicable.)* + +#### Client Logs: +*(Include here if applicable.)* diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..4bd5ee8682 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Discussion + url: https://github.com/bacalhau-project/bacalhau/discussions + about: Ideal for ideas, feedback, or longer form questions. + - name: Chat + url: https://bacalhauproject.slack.com + about: Ideal for short questions, looking for advice, general conversation, and meeting other Bacalhau users! \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..8d43c9a249 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest a new feature or enhancement to Bacalhau +title: '' +labels: request/new, type/enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 95e0572d3c..1ab32ba32b 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -4,9 +4,15 @@ updates: directory: "/" schedule: interval: daily + labels: + - "lang/go" + - "th/dependencies" open-pull-requests-limit: 1 - package-ecosystem: "pip" directory: "/integration/" schedule: interval: daily + labels: + - "lang/python" + - "th/dependencies" open-pull-requests-limit: 1 From 0b1081ffea659a48c44614309d6156daea65211b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Apr 2024 14:46:55 -0700 Subject: [PATCH 13/17] Bump golang.org/x/net from 0.21.0 to 0.23.0 (#3809) Bumps [golang.org/x/net](https://github.com/golang/net) from 0.21.0 to 0.23.0.
Commits
  • c48da13 http2: fix TestServerContinuationFlood flakes
  • 762b58d http2: fix tipos in comment
  • ba87210 http2: close connections when receiving too many headers
  • ebc8168 all: fix some typos
  • 3678185 http2: make TestCanonicalHeaderCacheGrowth faster
  • 448c44f http2: remove clientTester
  • c7877ac http2: convert the remaining clientTester tests to testClientConn
  • d8870b0 http2: use synthetic time in TestIdleConnTimeout
  • d73acff http2: only set up deadline when Server.IdleTimeout is positive
  • 89f602b http2: validate client/outgoing trailers
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/net&package-manager=go_modules&previous-version=0.21.0&new-version=0.23.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/bacalhau-project/bacalhau/network/alerts).
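For reviewers who want to sanity-check a bump like this locally, standard Go module tooling is sufficient. The snippet below is a minimal sketch, not part of the Dependabot change itself; it assumes a checkout of this branch with a working Go toolchain, and govulncheck is a separate tool from golang.org/x/vuln rather than anything in this repository:

```sh
# Confirm the module graph now resolves the patched version.
go list -m golang.org/x/net
# expected output: golang.org/x/net v0.23.0

# Optionally scan the tree for known vulnerabilities addressed by the bump.
go run golang.org/x/vuln/cmd/govulncheck@latest ./...
```

The same check applies to the companion bump for ops/aws/canary/lambda below; run it from that directory to verify its own go.mod.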
--------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Aronchick --- .github/workflows/dependabot-auto-merge.yml | 37 +++++++++++++++++++++ .gitignore | 1 + go.mod | 2 +- go.sum | 3 +- ops/aws/canary/lambda/go.mod | 2 +- ops/aws/canary/lambda/go.sum | 2 +- 6 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/dependabot-auto-merge.yml diff --git a/.github/workflows/dependabot-auto-merge.yml b/.github/workflows/dependabot-auto-merge.yml new file mode 100644 index 0000000000..64ca803f5c --- /dev/null +++ b/.github/workflows/dependabot-auto-merge.yml @@ -0,0 +1,37 @@ +name: Dependabot Auto Merge + +on: + pull_request: + types: [opened, synchronize, reopened] + branches: + - main + paths-ignore: + - "**.md" + +jobs: + trigger-circleci: + runs-on: ubuntu-latest + if: github.actor == 'dependabot[bot]' + steps: + - name: Trigger CircleCI Pipeline + run: .circleci/trigger_pipeline.sh + env: + BRANCH: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.event.ref }} + CIRCLE_TOKEN: ${{ secrets.CIRCLE_CI_TOKEN }} + + auto-merge: + runs-on: ubuntu-latest + needs: trigger-circleci + steps: + - name: Auto merge PR + uses: actions/github-script@v3 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const mergePayload = { + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.payload.pull_request.number, + merge_method: 'squash' + }; + github.pulls.merge(mergePayload); diff --git a/.gitignore b/.gitignore index e5e36d5e8f..27540746b0 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,4 @@ dist/ .secret .arg .ruff_cache +.aider* diff --git a/go.mod b/go.mod index 24ecde71dd..745739cbfb 100644 --- a/go.mod +++ b/go.mod @@ -381,7 +381,7 @@ require ( go.uber.org/dig v1.17.1 // indirect go.uber.org/fx v1.20.1 // indirect go4.org v0.0.0-20230225012048-214862532bf5 // indirect - golang.org/x/net v0.21.0 // indirect + golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.16.0 // indirect golang.org/x/sync v0.6.0 golang.org/x/sys v0.18.0 // indirect diff --git a/go.sum b/go.sum index 972fcb5e63..5713950b71 100644 --- a/go.sum +++ b/go.sum @@ -1488,8 +1488,9 @@ golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= diff --git a/ops/aws/canary/lambda/go.mod b/ops/aws/canary/lambda/go.mod index 5cfae67d86..9b98eb01df 100644 --- a/ops/aws/canary/lambda/go.mod +++ b/ops/aws/canary/lambda/go.mod @@ -338,7 +338,7 @@ require ( golang.org/x/crypto v0.21.0 // indirect golang.org/x/exp v0.0.0-20240213143201-ec583247a57a // indirect 
golang.org/x/mod v0.15.0 // indirect - golang.org/x/net v0.21.0 // indirect + golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.16.0 // indirect golang.org/x/sync v0.6.0 // indirect golang.org/x/sys v0.18.0 // indirect diff --git a/ops/aws/canary/lambda/go.sum b/ops/aws/canary/lambda/go.sum index 5961ae71d2..974e664d20 100644 --- a/ops/aws/canary/lambda/go.sum +++ b/ops/aws/canary/lambda/go.sum @@ -1284,8 +1284,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= From 71119100f5e3ff7d64b4755deac565bc6555a01a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:28:47 -0700 Subject: [PATCH 14/17] Bump golang.org/x/net from 0.21.0 to 0.23.0 in /ops/aws/canary/lambda (#3808) Bumps [golang.org/x/net](https://github.com/golang/net) from 0.21.0 to 0.23.0.
--------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: David Aronchick --- ops/aws/canary/lambda/go.sum | 160 +++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/ops/aws/canary/lambda/go.sum b/ops/aws/canary/lambda/go.sum index 974e664d20..9d14ff45be 100644 --- a/ops/aws/canary/lambda/go.sum +++ b/ops/aws/canary/lambda/go.sum @@ -59,6 +59,7 @@ github.com/BTBurke/k8sresource v1.2.0/go.mod h1:3Sa2yHvNmOvwzP/WU8joqU4ZbBGUzToZ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/Jorropo/jsync v1.0.1 h1:6HgRolFZnsdfzRUj+ImB9og1JYOxQoReSywkHOGSaUU= +github.com/Jorropo/jsync v1.0.1/go.mod h1:jCOZj3vrBCri3bSU3ErUYvevKlnbssrXeCivybS5ABQ= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= @@ -79,14 +80,17 @@ github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRF github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9 h1:ez/4by2iGztzR4L0zgAOR8lTQK9VlyBVVd7G4omaOQs= +github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= github.com/alexbrainman/goissue34681 v0.0.0-20191006012335-3fc7a47baff5 h1:iW0a5ljuFxkLGPNem5Ui+KBjFJzKg4Fv2fnxe4dvzpM= github.com/alexbrainman/goissue34681 v0.0.0-20191006012335-3fc7a47baff5/go.mod h1:Y2QMoi1vgtOIfc+6DhrMOGkLoGzqSV2rKp4Sm+opsyA= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= +github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/aws/aws-lambda-go v1.34.1 h1:M3a/uFYBjii+tDcOJ0wL/WyFi2550FHoECdPf27zvOs= github.com/aws/aws-lambda-go v1.34.1/go.mod h1:jwFe2KmMsHmffA1X2R09hH6lFzJQxzI8qK17ewzbQMM= github.com/aws/aws-sdk-go v1.44.96 h1:S9paaqnJ0AJ95t5AB+iK8RM6YNZN0W0Lek1gOVJsEr8= @@ -154,9 +158,11 @@ github.com/btcsuite/snappy-go v0.0.0-20151229074030-0bdef8d06723/go.mod h1:8woku github.com/btcsuite/websocket v0.0.0-20150119174127-31079b680792/go.mod h1:ghJtEyQwv5/p4Mg4C0fgbePVuGr935/5ddU9Z3TmDRY= github.com/btcsuite/winsvc v1.0.0/go.mod h1:jsenWakMcC0zFBFurPLEAyrnc/teJEM1O46fmI40EZs= github.com/bufbuild/protocompile v0.4.0 
h1:LbFKd2XowZvQ/kajzguUp2DC9UEIQhIq77fZZlaQsNA= +github.com/bufbuild/protocompile v0.4.0/go.mod h1:3v93+mbWn/v3xzN+31nwkJfrEpAUwp+BagBSZWx+TP8= github.com/buger/jsonparser v0.0.0-20181115193947-bf1c66bbce23/go.mod h1:bbYlZJ7hK1yFx9hf58LP0zeX7UjIGs20ufpu3evjr+s= github.com/bwesterb/go-ristretto v1.2.3/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= github.com/bytecodealliance/wasmtime-go/v3 v3.0.2 h1:3uZCA/BLTIu+DqCfguByNMJa2HVHpXvjfy0Dy7g6fuA= +github.com/bytecodealliance/wasmtime-go/v3 v3.0.2/go.mod h1:RnUjnIXxEJcL6BgCvNyzCCRzZcxCgsZCi+RNlvYor5Q= github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b h1:6+ZFm0flnudZzdSE0JxlhR2hKnGPcNB35BjQf4RYQDY= github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M= github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= @@ -187,6 +193,7 @@ github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327/go.mod h1:ZJeTF github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= @@ -200,8 +207,10 @@ github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:ma github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/crackcomm/go-gitignore v0.0.0-20231225121904-e25f5bc08668 h1:ZFUue+PNxmHlu7pYv+IYMtqlaO/0VwaGEqKepZf9JpA= +github.com/crackcomm/go-gitignore v0.0.0-20231225121904-e25f5bc08668/go.mod h1:p1d6YEZWvFzEh4KLyvBcVSnrfNDDvK2zfK/4x2v/4pE= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= +github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cskr/pubsub v1.0.2 h1:vlOzMhl6PFn60gRlTQQsIfVwaPB/B/8MziK8FhEPt/0= github.com/cskr/pubsub v1.0.2/go.mod h1:/8MzYXk/NJAz782G8RPkFzXTZVu63VotefPnR9TIRis= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= @@ -220,16 +229,19 @@ github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6ps github.com/dgraph-io/badger v1.6.2 h1:mNw0qs90GVgGGWylh0umH5iag1j6n/PeJtNvL6KY/x8= github.com/dgraph-io/badger v1.6.2/go.mod h1:JW2yswe3V058sS0kZ2h/AXeDSqFjxnZcRrVH//y2UQE= github.com/dgraph-io/badger/v3 v3.2103.5 h1:ylPa6qzbjYRQMU6jokoj4wzcaweHylt//CH0AKt0akg= +github.com/dgraph-io/badger/v3 v3.2103.5/go.mod h1:4MPiseMeDQ3FNCYwRbbcBOGJLf5jsE0PPFzRiKjtcdw= github.com/dgraph-io/ristretto v0.0.2/go.mod h1:KPxhHT9ZxKefz+PCeOGsrHpl1qZ7i70dGTu2u+Ahh6E= github.com/dgraph-io/ristretto v0.1.1 h1:6CWw5tJNgpegArSHpNHJKldNeq03FQCwYvfMVWajOK8= github.com/dgraph-io/ristretto v0.1.1/go.mod h1:S1GPSBCYCIhmVNfcth17y2zZtQT6wzkzgwUve0VDWWA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dgryski/go-farm 
v0.0.0-20200201041132-a6ae2369ad13 h1:fAjc9m62+UWV/WAFKLNi6ZS0675eEUC9y3AlwSbQu1Y= +github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/distribution/reference v0.5.0 h1:/FUIFXtfc/x2gpa5/VGfiGLuOIdYa1t65IKK2OFGvA0= github.com/distribution/reference v0.5.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/docker v25.0.4+incompatible h1:XITZTrq+52tZyZxUOtFIahUf3aH367FLxJzt9vZeAF8= +github.com/docker/docker v25.0.4+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= @@ -244,6 +256,7 @@ github.com/elastic/gosigar v0.12.0/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0 github.com/elastic/gosigar v0.14.2 h1:Dg80n8cr90OZ7x+bAax/QjoW/XqTI11RmA79ZwIm9/4= github.com/elastic/gosigar v0.14.2/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= github.com/elazarl/goproxy v0.0.0-20230808193330-2592e75ae04a h1:mATvB/9r/3gvcejNsXKSkQ6lcIaNec2nyfOdlTBR2lU= +github.com/elazarl/goproxy v0.0.0-20230808193330-2592e75ae04a/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM= github.com/elgris/jsondiff v0.0.0-20160530203242-765b5c24c302 h1:QV0ZrfBLpFc2KDk+a4LJefDczXnonRwrYrQJY/9L4dA= github.com/elgris/jsondiff v0.0.0-20160530203242-765b5c24c302/go.mod h1:qBlWZqWeVx9BjvqBsnC/8RUlAYpIFmPvgROcw0n1scE= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= @@ -266,22 +279,28 @@ github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI435gkrCt3MPfRiAkVrwSbHsst4LCFVfpJc= github.com/flynn/noise v1.1.0 h1:KjPQoQCEFdZDiP03phOvGi11+SVVhBG2wOWAorLsstg= +github.com/flynn/noise v1.1.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag= github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= +github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/foxcpp/go-mockdns v1.0.0 h1:7jBqxd3WDWwi/6WhDvacvH1XsN3rOLXyHM1uhvIx6FI= +github.com/foxcpp/go-mockdns v1.0.0/go.mod h1:lgRN6+KxQBawyIghpnl5CezHFGS9VLzvtVlwxvzXTQ4= github.com/francoispqt/gojay v1.2.13 h1:d2m3sFjloqoIUQU3TsHBgj6qg/BVGlTBeHDUmyJnXKk= github.com/francoispqt/gojay v1.2.13/go.mod h1:ehT5mTG4ua4581f1++1WLG0vPdaA9HaiDsoyrBGkyDY= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= github.com/frankban/quicktest v1.14.0/go.mod h1:NeW+ay9A/U67EYXNFA1nPE8e/tnQv/09mUdL/ijj8og= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= 
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= +github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/gliderlabs/ssh v0.1.1/go.mod h1:U7qILu1NlMHj9FlMhZLlkCdDnU1DBEAqr0aevW3Awn0= github.com/gliderlabs/ssh v0.3.5 h1:OcaySEmAQJgyYcArR+gGGTHCyE7nvhEMTlYY+Dp8CpY= +github.com/gliderlabs/ssh v0.3.5/go.mod h1:8XB4KraRrX39qHhT6yxPsHedjA08I/uBVwj4xC+/+z4= github.com/go-check/check v0.0.0-20180628173108-788fd7840127/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98= github.com/go-errors/errors v1.0.1/go.mod h1:f4zRHt4oKfwPJE5k8C9vpYG+aDHdBFUsgrm6/TyX73Q= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66DAb0lQFJrpS6731Oaa12ikc+DiI= @@ -289,6 +308,7 @@ github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmS github.com/go-git/go-billy/v5 v5.5.0 h1:yEY4yhzCDuMGSv83oGxiBotRzhwhNr8VZyphhiu+mTU= github.com/go-git/go-billy/v5 v5.5.0/go.mod h1:hmexnoNsr2SJU1Ju67OaNz5ASJY3+sHgFRpCtpDCKow= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= +github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= github.com/go-git/go-git/v5 v5.11.0 h1:XIZc1p+8YzypNr34itUfSvYJcv+eYdTnTvOZ2vD3cA4= github.com/go-git/go-git/v5 v5.11.0/go.mod h1:6GFcX2P3NM7FPBfpePbpLd21XxsgdAt+lKqXmCUiUCY= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= @@ -314,6 +334,7 @@ github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ4 github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= @@ -321,6 +342,7 @@ github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91 github.com/go-playground/validator/v10 v10.16.0 h1:x+plE831WK4vaKHO/jpgUGsvLKIqRRkz6M78GuJAfGE= github.com/go-playground/validator/v10 v10.16.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/go-test/deep v1.0.4 h1:u2CU3YKy9I2pmu9pX0eq50wCgjfGIt539SqR7FbHiho= @@ -382,6 +404,7 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW github.com/google/btree 
v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/flatbuffers v2.0.8+incompatible h1:ivUb1cGomAB101ZM1T0nOiWz9pSrTMoa9+EiY7igmkM= +github.com/google/flatbuffers v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -398,6 +421,7 @@ github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8 github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -417,9 +441,11 @@ github.com/google/pprof v0.0.0-20201023163331-3e6fc7fc9c4c/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20201203190320-1bf35d6f28c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20201218002935-b9804c9f04c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20240207164012-fb44976bdcd5 h1:E/LAvt58di64hlYjx7AsNS6C/ysHWYo+2qPCZKTQhRo= +github.com/google/pprof v0.0.0-20240207164012-fb44976bdcd5/go.mod h1:czg5+yv1E0ZGTi6S6vVK1mke0fV+FaUhNGcd6VRS9Ik= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go v2.0.0+incompatible/go.mod h1:SFVmujtThgffbyetf+mdk2eWhX2bMyUtNHzFKcPA9HY= @@ -429,6 +455,7 @@ github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5m github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gopherjs/gopherjs v0.0.0-20190812055157-5d271430af9f h1:KMlcu9X58lhTA/KrfX8Bi1LQSO4pzoVjTiL3h4Jk+Zk= +github.com/gopherjs/gopherjs v0.0.0-20190812055157-5d271430af9f/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= @@ -457,13 +484,16 @@ github.com/hashicorp/go-retryablehttp v0.7.5/go.mod h1:Jy/gPYAdjqffZ/yFGCFV2doI5 github.com/hashicorp/golang-lru v0.5.0/go.mod 
h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= +github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/yamux v0.1.1 h1:yrQxtgseBDrq9Y652vSRDvsKCJKOUD+GzTS4Y0Y8pvE= github.com/hashicorp/yamux v0.1.1/go.mod h1:CtWFDAQgb7dxtzFs4tWbplKIe2jSi3+5vKbgIO0SLnQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/huin/goupnp v1.3.0 h1:UvLUlWDNpoUdYzb2TCn+MuTWtcjXKSza2n6CBdQ0xXc= +github.com/huin/goupnp v1.3.0/go.mod h1:gnGPsThkYa7bFi/KWmEysQRf48l2dvR5bxr2OFckNX8= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20230524184225-eabc099b10ab h1:BA4a7pe6ZTd9F8kXETBoijjFJ/ntaa//1wiH9BZu4zU= @@ -474,17 +504,22 @@ github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANyt github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/ipfs-shipyard/nopfs v0.0.12 h1:mvwaoefDF5VI9jyvgWCmaoTJIJFAfrbyQV5fJz35hlk= +github.com/ipfs-shipyard/nopfs v0.0.12/go.mod h1:mQyd0BElYI2gB/kq/Oue97obP4B3os4eBmgfPZ+hnrE= github.com/ipfs-shipyard/nopfs/ipfs v0.13.2-0.20231027223058-cde3b5ba964c h1:7UynTbtdlt+w08ggb1UGLGaGjp1mMaZhoTZSctpn5Ak= +github.com/ipfs-shipyard/nopfs/ipfs v0.13.2-0.20231027223058-cde3b5ba964c/go.mod h1:6EekK/jo+TynwSE/ZOiOJd4eEvRXoavEC3vquKtv4yI= github.com/ipfs/bbloom v0.0.4 h1:Gi+8EGJ2y5qiD5FbsbpX/TMNcJw8gSqr7eyjHa4Fhvs= github.com/ipfs/bbloom v0.0.4/go.mod h1:cS9YprKXpoZ9lT0n/Mw/a6/aFV6DTjTLYHeA+gyqMG0= github.com/ipfs/boxo v0.18.0 h1:MOL9/AgoV3e7jlVMInicaSdbgralfqSsbkc31dZ9tmw= +github.com/ipfs/boxo v0.18.0/go.mod h1:pIZgTWdm3k3pLF9Uq6MB8JEcW07UDwNJjlXW1HELW80= github.com/ipfs/go-bitfield v1.1.0 h1:fh7FIo8bSwaJEh6DdTWbCeZ1eqOaOkKFI74SCnsWbGA= github.com/ipfs/go-bitfield v1.1.0/go.mod h1:paqf1wjq/D2BBmzfTVFlJQ9IlFOZpg422HL0HqsGWHU= github.com/ipfs/go-bitswap v0.11.0 h1:j1WVvhDX1yhG32NTC9xfxnqycqYIlhzEzLXG/cU1HyQ= +github.com/ipfs/go-bitswap v0.11.0/go.mod h1:05aE8H3XOU+LXpTedeAS0OZpcO1WFsj5niYQH9a1Tmk= github.com/ipfs/go-block-format v0.0.3/go.mod h1:4LmD4ZUw0mhO+JSKdpWwrzATiEfM7WWgQ8H5l6P8MVk= github.com/ipfs/go-block-format v0.2.0 h1:ZqrkxBA2ICbDRbK8KJs/u0O3dlp6gmAuuXUJNiW1Ycs= github.com/ipfs/go-block-format v0.2.0/go.mod h1:+jpL11nFx5A/SPpsoBn6Bzkra/zaArfSmsknbPMYgzM= github.com/ipfs/go-blockservice v0.5.0 h1:B2mwhhhVQl2ntW2EIpaWPwSCxSuqr5fFA93Ms4bYLEY= +github.com/ipfs/go-blockservice v0.5.0/go.mod h1:W6brZ5k20AehbmERplmERn8o2Ni3ZZubvAxaIUeaT6w= github.com/ipfs/go-cid v0.0.3/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM= github.com/ipfs/go-cid v0.0.4/go.mod h1:4LLaPOQwmk5z9LBgQnpkivrx8BJjUyGwTXCd5Xfj6+M= github.com/ipfs/go-cid v0.0.5/go.mod 
h1:plgt+Y5MnOey4vO4UlUazGqdbEXuFYitED67FexhXog= @@ -516,20 +551,26 @@ github.com/ipfs/go-fs-lock v0.0.7/go.mod h1:Js8ka+FNYmgQRLrRXzU3CB/+Csr1BwrRilEc github.com/ipfs/go-ipfs-blockstore v1.3.0 h1:m2EXaWgwTzAfsmt5UdJ7Is6l4gJcaM/A12XwJyvYvMM= github.com/ipfs/go-ipfs-blockstore v1.3.0/go.mod h1:KgtZyc9fq+P2xJUiCAzbRdhhqJHvsw8u2Dlqy2MyRTE= github.com/ipfs/go-ipfs-blocksutil v0.0.1 h1:Eh/H4pc1hsvhzsQoMEP3Bke/aW5P5rVM1IWFJMcGIPQ= +github.com/ipfs/go-ipfs-blocksutil v0.0.1/go.mod h1:Yq4M86uIOmxmGPUHv/uI7uKqZNtLb449gwKqXjIsnRk= github.com/ipfs/go-ipfs-chunker v0.0.5 h1:ojCf7HV/m+uS2vhUGWcogIIxiO5ubl5O57Q7NapWLY8= +github.com/ipfs/go-ipfs-chunker v0.0.5/go.mod h1:jhgdF8vxRHycr00k13FM8Y0E+6BoalYeobXmUyTreP8= github.com/ipfs/go-ipfs-cmds v0.10.0 h1:ZB4+RgYaH4UARfJY0uLKl5UXgApqnRjKbuCiJVcErYk= +github.com/ipfs/go-ipfs-cmds v0.10.0/go.mod h1:sX5d7jkCft9XLPnkgEfXY0z2UBOB5g6fh/obBS0enJE= github.com/ipfs/go-ipfs-delay v0.0.0-20181109222059-70721b86a9a8/go.mod h1:8SP1YXK1M1kXuc4KJZINY3TQQ03J2rwBG9QfXmbRPrw= github.com/ipfs/go-ipfs-delay v0.0.1 h1:r/UXYyRcddO6thwOnhiznIAiSvxMECGgtv35Xs1IeRQ= github.com/ipfs/go-ipfs-delay v0.0.1/go.mod h1:8SP1YXK1M1kXuc4KJZINY3TQQ03J2rwBG9QfXmbRPrw= github.com/ipfs/go-ipfs-ds-help v1.1.0 h1:yLE2w9RAsl31LtfMt91tRZcrx+e61O5mDxFRR994w4Q= github.com/ipfs/go-ipfs-ds-help v1.1.0/go.mod h1:YR5+6EaebOhfcqVCyqemItCLthrpVNot+rsOU/5IatU= github.com/ipfs/go-ipfs-exchange-interface v0.2.0 h1:8lMSJmKogZYNo2jjhUs0izT+dck05pqUw4mWNW9Pw6Y= +github.com/ipfs/go-ipfs-exchange-interface v0.2.0/go.mod h1:z6+RhJuDQbqKguVyslSOuVDhqF9JtTrO3eptSAiW2/Y= github.com/ipfs/go-ipfs-exchange-offline v0.3.0 h1:c/Dg8GDPzixGd0MC8Jh6mjOwU57uYokgWRFidfvEkuA= +github.com/ipfs/go-ipfs-exchange-offline v0.3.0/go.mod h1:MOdJ9DChbb5u37M1IcbrRB02e++Z7521fMxqCNRrz9s= github.com/ipfs/go-ipfs-pq v0.0.3 h1:YpoHVJB+jzK15mr/xsWC574tyDLkezVrDNeaalQBsTE= github.com/ipfs/go-ipfs-pq v0.0.3/go.mod h1:btNw5hsHBpRcSSgZtiNm/SLj5gYIZ18AKtv3kERkRb4= github.com/ipfs/go-ipfs-redirects-file v0.1.1 h1:Io++k0Vf/wK+tfnhEh63Yte1oQK5VGT2hIEYpD0Rzx8= github.com/ipfs/go-ipfs-redirects-file v0.1.1/go.mod h1:tAwRjCV0RjLTjH8DR/AU7VYvfQECg+lpUy2Mdzv7gyk= github.com/ipfs/go-ipfs-routing v0.3.0 h1:9W/W3N+g+y4ZDeffSgqhgo7BsBSJwPMcyssET9OWevc= +github.com/ipfs/go-ipfs-routing v0.3.0/go.mod h1:dKqtTFIql7e1zYsEuWLyuOU+E0WJWW8JjbTPLParDWo= github.com/ipfs/go-ipfs-util v0.0.1/go.mod h1:spsl5z8KUnrve+73pOhSVZND1SIxPW5RyBCNzQxlJBc= github.com/ipfs/go-ipfs-util v0.0.2/go.mod h1:CbPtkWJzjLdEcezDns2XYaehFVNXG9zrdrtMecczcsQ= github.com/ipfs/go-ipfs-util v0.0.3 h1:2RFdGez6bu2ZlZdI+rWfIdbQb1KudQp3VGwPtdNCmE0= @@ -559,12 +600,17 @@ github.com/ipfs/go-metrics-interface v0.0.1/go.mod h1:6s6euYU4zowdslK0GKHmqaIZ3j github.com/ipfs/go-peertaskqueue v0.8.1 h1:YhxAs1+wxb5jk7RvS0LHdyiILpNmRIRnZVztekOF0pg= github.com/ipfs/go-peertaskqueue v0.8.1/go.mod h1:Oxxd3eaK279FxeydSPPVGHzbwVeHjatZ2GA8XD+KbPU= github.com/ipfs/go-unixfs v0.4.5 h1:wj8JhxvV1G6CD7swACwSKYa+NgtdWC1RUit+gFnymDU= +github.com/ipfs/go-unixfs v0.4.5/go.mod h1:BIznJNvt/gEx/ooRMI4Us9K8+qeGO7vx1ohnbk8gjFg= github.com/ipfs/go-unixfsnode v1.9.0 h1:ubEhQhr22sPAKO2DNsyVBW7YB/zA8Zkif25aBvz8rc8= github.com/ipfs/go-unixfsnode v1.9.0/go.mod h1:HxRu9HYHOjK6HUqFBAi++7DVoWAHn0o4v/nZ/VA+0g8= github.com/ipfs/go-verifcid v0.0.2 h1:XPnUv0XmdH+ZIhLGKg6U2vaPaRDXb9urMyNVCE7uvTs= +github.com/ipfs/go-verifcid v0.0.2/go.mod h1:40cD9x1y4OWnFXbLNJYRe7MpNvWlMn3LZAG5Wb4xnPU= github.com/ipfs/kubo v0.27.0 h1:rVWKI9VGYt8Eyr/4vflUbT6OrOgOWG0ddHeEAajKClA= +github.com/ipfs/kubo v0.27.0/go.mod h1:7HMQUnD+S1q9P3G7iV3VfwHzukJ/PeUm4geYYDC+hx0= 
github.com/ipld/go-car v0.5.0 h1:kcCEa3CvYMs0iE5BzD5sV7O2EwMiCIp3uF8tA6APQT8= +github.com/ipld/go-car v0.5.0/go.mod h1:ppiN5GWpjOZU9PgpAZ9HbZd9ZgSpwPMr48fGRJOWmvE= github.com/ipld/go-car/v2 v2.13.1 h1:KnlrKvEPEzr5IZHKTXLAEub+tPrzeAFQVRlSQvuxBO4= +github.com/ipld/go-car/v2 v2.13.1/go.mod h1:QkdjjFNGit2GIkpQ953KBwowuoukoM75nP/JI1iDJdo= github.com/ipld/go-codec-dagpb v1.6.0 h1:9nYazfyu9B1p3NAgfVdpRco3Fs2nFC72DqVsMj6rOcc= github.com/ipld/go-codec-dagpb v1.6.0/go.mod h1:ANzFhfP2uMJxRBr8CE+WQWs5UsNa0pYtmKZ+agnUw9s= github.com/ipld/go-ipld-prime v0.11.0/go.mod h1:+WIAkokurHmZ/KwzDOMUuoeJgaRQktHtEaLglS3ZeV8= @@ -572,6 +618,7 @@ github.com/ipld/go-ipld-prime v0.14.1/go.mod h1:QcE4Y9n/ZZr8Ijg5bGPT0GqYWgZ1704n github.com/ipld/go-ipld-prime v0.21.0 h1:n4JmcpOlPDIxBcY037SVfpd1G+Sj1nKZah0m6QH9C2E= github.com/ipld/go-ipld-prime v0.21.0/go.mod h1:3RLqy//ERg/y5oShXXdx5YIp50cFGOanyMctpPjsvxQ= github.com/ipld/go-ipld-prime/storage/bsadapter v0.0.0-20230102063945-1a409dc236dd h1:gMlw/MhNr2Wtp5RwGdsW23cs+yCuj9k2ON7i9MiJlRo= +github.com/ipld/go-ipld-prime/storage/bsadapter v0.0.0-20230102063945-1a409dc236dd/go.mod h1:wZ8hH8UxeryOs4kJEJaiui/s00hDSbE37OKsL47g+Sw= github.com/jackpal/go-nat-pmp v1.0.2 h1:KzKSgb7qkJvOUTqYl9/Hg/me3pWgBmERKrTGD7BdWus= github.com/jackpal/go-nat-pmp v1.0.2/go.mod h1:QPH045xvCAeXUZOxsnwmrtiCoxIr9eob+4orBN1SBKc= github.com/jbenet/go-cienv v0.1.0 h1:Vc/s0QbQtoxX8MwwSLWWh+xNNZvM3Lw7NsTcHrvvhMc= @@ -590,6 +637,7 @@ github.com/jellevandenhooff/dkim v0.0.0-20150330215556-f50fe3d243e1/go.mod h1:E0 github.com/jessevdk/go-flags v0.0.0-20141203071132-1679536dcc89/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c= +github.com/jhump/protoreflect v1.15.1/go.mod h1:jD/2GMKKE6OqX8qTjhADU1e6DShO+gavG9e0Q693nKo= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= @@ -619,6 +667,7 @@ github.com/klauspost/compress v1.17.6/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6K github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= +github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/koron/go-ssdp v0.0.4 h1:1IDwrghSKYM7yLf7XCzbByg2sJ/JcNOZRXS2jczTwz0= @@ -630,6 +679,7 @@ github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfn github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.3/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= @@ -650,6 +700,7 @@ github.com/lestrrat-go/httpcc v1.0.1/go.mod h1:qiltp3Mt56+55GPVCbTdM9MlqhvzyuL6W github.com/lestrrat-go/iter v1.0.2 h1:gMXo1q4c2pHmC3dn8LzRhJfP1ceCbgSiT9lUydIzltI= github.com/lestrrat-go/iter v1.0.2/go.mod h1:Momfcq3AnRlRjI5b5O8/G5/BvpzrhoFTZcn06fEOPt4= github.com/lestrrat-go/jwx v1.2.29 h1:QT0utmUJ4/12rmsVQrJ3u55bycPkKqGYuGT4tyRhxSQ= +github.com/lestrrat-go/jwx v1.2.29/go.mod h1:hU8k2l6WF0ncx20uQdOmik/Gjg6E3/wIRtXSNFeZuB8= github.com/lestrrat-go/option v1.0.0/go.mod h1:5ZHFbivi4xwXxhxY9XHDe2FHo6/Z7WWmtT7T5nBBp3I= github.com/lestrrat-go/option v1.0.1 h1:oAzP2fvZGQKWkvHa1/SAcFolBEca1oN+mQ7eooNBEYU= github.com/lestrrat-go/option v1.0.1/go.mod h1:5ZHFbivi4xwXxhxY9XHDe2FHo6/Z7WWmtT7T5nBBp3I= @@ -666,7 +717,9 @@ github.com/libp2p/go-flow-metrics v0.0.3/go.mod h1:HeoSNUrOJVK1jEpDqVEiUOIXqhbnS github.com/libp2p/go-flow-metrics v0.1.0 h1:0iPhMI8PskQwzh57jB9WxIuIOQ0r+15PChFGkx3Q3WM= github.com/libp2p/go-flow-metrics v0.1.0/go.mod h1:4Xi8MX8wj5aWNDAZttg6UPmc0ZrnFNsMtpsYUClFtro= github.com/libp2p/go-libp2p v0.33.0 h1:yTPSr8sJRbfeEYXyeN8VPVSlTlFjtMUwGDRniwaf/xQ= +github.com/libp2p/go-libp2p v0.33.0/go.mod h1:RIJFRQVUBKy82dnW7J5f1homqqv6NcsDJAl3e7CRGfE= github.com/libp2p/go-libp2p-asn-util v0.4.1 h1:xqL7++IKD9TBFMgnLPZR6/6iYhawHKHl950SO9L6n94= +github.com/libp2p/go-libp2p-asn-util v0.4.1/go.mod h1:d/NI6XZ9qxw67b4e+NgpQexCIiFYJjErASrYW4PFDN8= github.com/libp2p/go-libp2p-core v0.2.4/go.mod h1:STh4fdfa5vDYr0/SzYYeqnt+E6KfEV5VxfIrm0bcI0g= github.com/libp2p/go-libp2p-core v0.3.0/go.mod h1:ACp3DmS3/N64c2jDzcV429ukDpicbL6+TrrxANBjPGw= github.com/libp2p/go-libp2p-gostream v0.6.0 h1:QfAiWeQRce6pqnYfmIVWJFXNdDyfiR/qkCnjyaZUPYU= @@ -674,29 +727,36 @@ github.com/libp2p/go-libp2p-gostream v0.6.0/go.mod h1:Nywu0gYZwfj7Jc91PQvbGU8dIp github.com/libp2p/go-libp2p-http v0.5.0 h1:+x0AbLaUuLBArHubbbNRTsgWz0RjNTy6DJLOxQ3/QBc= github.com/libp2p/go-libp2p-http v0.5.0/go.mod h1:glh87nZ35XCQyFsdzZps6+F4HYI6DctVFY5u1fehwSg= github.com/libp2p/go-libp2p-kad-dht v0.24.4 h1:ktNiJe7ffsJ1wX3ULpMCwXts99mPqGFSE/Qn1i8pErQ= +github.com/libp2p/go-libp2p-kad-dht v0.24.4/go.mod h1:ybWBJ5Fbvz9sSLkNtXt+2+bK0JB8+tRPvhBbRGHegRU= github.com/libp2p/go-libp2p-kbucket v0.3.1/go.mod h1:oyjT5O7tS9CQurok++ERgc46YLwEpuGoFq9ubvoUOio= github.com/libp2p/go-libp2p-kbucket v0.6.3 h1:p507271wWzpy2f1XxPzCQG9NiN6R6lHL9GiSErbQQo0= github.com/libp2p/go-libp2p-kbucket v0.6.3/go.mod h1:RCseT7AH6eJWxxk2ol03xtP9pEHetYSPXOaJnOiD8i0= github.com/libp2p/go-libp2p-peerstore v0.1.4/go.mod h1:+4BDbDiiKf4PzpANZDAT+knVdLxvqh7hXOujessqdzs= github.com/libp2p/go-libp2p-pubsub v0.10.0 h1:wS0S5FlISavMaAbxyQn3dxMOe2eegMfswM471RuHJwA= +github.com/libp2p/go-libp2p-pubsub v0.10.0/go.mod h1:1OxbaT/pFRO5h+Dpze8hdHQ63R0ke55XTs6b6NwLLkw= github.com/libp2p/go-libp2p-pubsub-router v0.6.0 h1:D30iKdlqDt5ZmLEYhHELCMRj8b4sFAqrUcshIUvVP/s= github.com/libp2p/go-libp2p-pubsub-router v0.6.0/go.mod h1:FY/q0/RBTKsLA7l4vqC2cbRbOvyDotg8PJQ7j8FDudE= github.com/libp2p/go-libp2p-record v0.2.0 h1:oiNUOCWno2BFuxt3my4i1frNrt7PerzB3queqa1NkQ0= github.com/libp2p/go-libp2p-record v0.2.0/go.mod h1:I+3zMkvvg5m2OcSdoL0KPljyJyvNDFGKX7QdlpYUcwk= github.com/libp2p/go-libp2p-routing-helpers v0.7.3 h1:u1LGzAMVRK9Nqq5aYDVOiq/HaB93U9WWczBzGyAC5ZY= +github.com/libp2p/go-libp2p-routing-helpers v0.7.3/go.mod h1:cN4mJAD/7zfPKXBcs9ze31JGYAZgzdABEm+q/hkswb8= github.com/libp2p/go-libp2p-testing v0.12.0 h1:EPvBb4kKMWO29qP4mZGyhVzUyR25dvfUIK5WDu6iPUA= +github.com/libp2p/go-libp2p-testing v0.12.0/go.mod 
h1:KcGDRXyN7sQCllucn1cOOS+Dmm7ujhfEyXQL5lvkcPg= github.com/libp2p/go-libp2p-xor v0.1.0 h1:hhQwT4uGrBcuAkUGXADuPltalOdpf9aag9kaYNT2tLA= github.com/libp2p/go-libp2p-xor v0.1.0/go.mod h1:LSTM5yRnjGZbWNTA/hRwq2gGFrvRIbQJscoIL/u6InY= github.com/libp2p/go-msgio v0.0.4/go.mod h1:63lBBgOTDKQL6EWazRMCwXsEeEeK9O2Cd+0+6OOuipQ= github.com/libp2p/go-msgio v0.3.0 h1:mf3Z8B1xcFN314sWX+2vOTShIE0Mmn2TXn3YCUQGNj0= github.com/libp2p/go-msgio v0.3.0/go.mod h1:nyRM819GmVaF9LX3l03RMh10QdOroF++NBbxAb0mmDM= github.com/libp2p/go-nat v0.2.0 h1:Tyz+bUFAYqGyJ/ppPPymMGbIgNRH+WqC5QrT5fKrrGk= +github.com/libp2p/go-nat v0.2.0/go.mod h1:3MJr+GRpRkyT65EpVPBstXLvOlAPzUVlG6Pwg9ohLJk= github.com/libp2p/go-netroute v0.2.1 h1:V8kVrpD8GK0Riv15/7VN6RbUQ3URNZVosw7H2v9tksU= github.com/libp2p/go-netroute v0.2.1/go.mod h1:hraioZr0fhBjG0ZRXJJ6Zj2IVEVNx6tDTFQfSmcq7mQ= github.com/libp2p/go-openssl v0.0.3/go.mod h1:unDrJpgy3oFr+rqXsarWifmJuNnJR4chtO1HmaZjggc= github.com/libp2p/go-openssl v0.0.4/go.mod h1:unDrJpgy3oFr+rqXsarWifmJuNnJR4chtO1HmaZjggc= github.com/libp2p/go-reuseport v0.4.0 h1:nR5KU7hD0WxXCJbmw7r2rhRYruNRl2koHw8fQscQm2s= +github.com/libp2p/go-reuseport v0.4.0/go.mod h1:ZtI03j/wO5hZVDFo2jKywN6bYKWLOy8Se6DrI2E1cLU= github.com/libp2p/go-yamux/v4 v4.0.1 h1:FfDR4S1wj6Bw2Pqbc8Uz7pCxeRBPbwsBbEdfwiCypkQ= +github.com/libp2p/go-yamux/v4 v4.0.1/go.mod h1:NWjl8ZTLOGlozrXSOZ/HlfG++39iKNnM5wwmtQP1YB4= github.com/libp2p/zeroconf/v2 v2.2.0 h1:Cup06Jv6u81HLhIj1KasuNM/RHHrJ8T7wOTS4+Tv53Q= github.com/libp2p/zeroconf/v2 v2.2.0/go.mod h1:fuJqLnUwZTshS3U/bMRJ3+ow/v9oid1n0DmyYyNO1Xs= github.com/lunixbochs/vtclean v1.0.0/go.mod h1:pHhQNgMf3btfWnGBVipUOjRYhoOsdGqdm/+2c2E2WMI= @@ -735,6 +795,7 @@ github.com/microcosm-cc/bluemonday v1.0.1/go.mod h1:hsXNsILzKxV+sX77C5b8FSuKF00v github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI= github.com/miekg/dns v1.1.43/go.mod h1:+evo5L0630/F6ca/Z9+GAqzhjGyn8/c+TBaOyfEl0V4= github.com/miekg/dns v1.1.58 h1:ca2Hdkz+cDg/7eNF6V56jjzuZ4aCAE+DbVkILdQWG/4= +github.com/miekg/dns v1.1.58/go.mod h1:Ypv+3b/KadlvW9vJfXOTf300O4UqaHFzFCuHz+rPkBY= github.com/mikioh/tcp v0.0.0-20190314235350-803a9b46060c h1:bzE/A84HN25pxAuk9Eej1Kz9OUelF97nAc82bDquQI8= github.com/mikioh/tcp v0.0.0-20190314235350-803a9b46060c/go.mod h1:0SQS9kMwD2VsyFEB++InYyBJroV/FRmBgcydeSUcJms= github.com/mikioh/tcpinfo v0.0.0-20190314235526-30a79bb1804b h1:z78hV3sbSMAUoyUMM0I83AUIT6Hu17AWfgjzIbtrYFc= @@ -805,6 +866,7 @@ github.com/multiformats/go-multihash v0.1.0/go.mod h1:RJlXsxt6vHGaia+S8We0Erjhoj github.com/multiformats/go-multihash v0.2.3 h1:7Lyc8XfX/IY2jWb/gI7JP+o7JEq9hOa7BFvVU9RSh+U= github.com/multiformats/go-multihash v0.2.3/go.mod h1:dXgKXCXjBzdscBLk9JkjINiEsCKRVch90MdaGiKsvSM= github.com/multiformats/go-multistream v0.5.0 h1:5htLSLl7lvJk3xx3qT/8Zm9J4K8vEOf/QGkvOGQAyiE= +github.com/multiformats/go-multistream v0.5.0/go.mod h1:n6tMZiwiP2wUsR8DgfDWw1dydlEqV3l6N3/GBsX6ILA= github.com/multiformats/go-varint v0.0.1/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= github.com/multiformats/go-varint v0.0.5/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= github.com/multiformats/go-varint v0.0.6/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE= @@ -826,18 +888,24 @@ github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJE github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 
h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/oklog/run v1.0.0 h1:Ru7dDtJNOyC66gQ5dQmaCa0qIsAUFY3sFpK1Xk8igrw= github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= +github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY= +github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM= github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY= github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= +github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/open-policy-agent/opa v0.60.0 h1:ZPoPt4yeNs5UXCpd/P/btpSyR8CR0wfhVoh9BOwgJNs= github.com/open-policy-agent/opa v0.60.0/go.mod h1:aD5IK6AiLNYBjNXn7E02++yC8l4Z+bRDvgM6Ss0bBzA= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= @@ -846,12 +914,14 @@ github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/ github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= +github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8= github.com/openzipkin/zipkin-go v0.4.2 h1:zjqfqHjUpPmB3c1GlCvvgsM1G4LkvqQbBDueDOCg/jA= +github.com/openzipkin/zipkin-go v0.4.2/go.mod h1:ZeVkFjuuBiSy13y8vpSDCjMi9GoI3hPpCJSBx/EYFhY= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= @@ -860,23 +930,52 @@ github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNc github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9 h1:1/WtZae0yGtPq+TI6+Tv1WTxkukpXeMlviSxvL7SRgk= 
github.com/petar/GoLLRB v0.0.0-20210522233825-ae3b015fd3e9/go.mod h1:x3N5drFsm2uilKKuuYo6LdyD8vZAW55sH/9w+pbo1sw= github.com/pion/datachannel v1.5.5 h1:10ef4kwdjije+M9d7Xm9im2Y3O6A6ccQb0zcqZcJew8= +github.com/pion/datachannel v1.5.5/go.mod h1:iMz+lECmfdCMqFRhXhcA/219B0SQlbpoR2V118yimL0= +github.com/pion/dtls/v2 v2.2.7/go.mod h1:8WiMkebSHFD0T+dIU+UeBaoV7kDhOW5oDCzZ7WZ/F9s= github.com/pion/dtls/v2 v2.2.8 h1:BUroldfiIbV9jSnC6cKOMnyiORRWrWWpV11JUyEu5OA= +github.com/pion/dtls/v2 v2.2.8/go.mod h1:8WiMkebSHFD0T+dIU+UeBaoV7kDhOW5oDCzZ7WZ/F9s= github.com/pion/ice/v2 v2.3.11 h1:rZjVmUwyT55cmN8ySMpL7rsS8KYsJERsrxJLLxpKhdw= +github.com/pion/ice/v2 v2.3.11/go.mod h1:hPcLC3kxMa+JGRzMHqQzjoSj3xtE9F+eoncmXLlCL4E= github.com/pion/interceptor v0.1.25 h1:pwY9r7P6ToQ3+IF0bajN0xmk/fNw/suTgaTdlwTDmhc= +github.com/pion/interceptor v0.1.25/go.mod h1:wkbPYAak5zKsfpVDYMtEfWEy8D4zL+rpxCxPImLOg3Y= github.com/pion/logging v0.2.2 h1:M9+AIj/+pxNsDfAT64+MAVgJO0rsyLnoJKCqf//DoeY= +github.com/pion/logging v0.2.2/go.mod h1:k0/tDVsRCX2Mb2ZEmTqNa7CWsQPc+YYCB7Q+5pahoms= +github.com/pion/mdns v0.0.8/go.mod h1:hYE72WX8WDveIhg7fmXgMKivD3Puklk0Ymzog0lSyaI= github.com/pion/mdns v0.0.9 h1:7Ue5KZsqq8EuqStnpPWV33vYYEH0+skdDN5L7EiEsI4= +github.com/pion/mdns v0.0.9/go.mod h1:2JA5exfxwzXiCihmxpTKgFUpiQws2MnipoPK09vecIc= github.com/pion/randutil v0.1.0 h1:CFG1UdESneORglEsnimhUjf33Rwjubwj6xfiOXBa3mA= +github.com/pion/randutil v0.1.0/go.mod h1:XcJrSMMbbMRhASFVOlj/5hQial/Y8oH/HVo7TBZq+j8= +github.com/pion/rtcp v1.2.10/go.mod h1:ztfEwXZNLGyF1oQDttz/ZKIBaeeg/oWbRYqzBM9TL1I= +github.com/pion/rtcp v1.2.12/go.mod h1:sn6qjxvnwyAkkPzPULIbVqSKI5Dv54Rv7VG0kNxh9L4= github.com/pion/rtcp v1.2.13 h1:+EQijuisKwm/8VBs8nWllr0bIndR7Lf7cZG200mpbNo= +github.com/pion/rtcp v1.2.13/go.mod h1:sn6qjxvnwyAkkPzPULIbVqSKI5Dv54Rv7VG0kNxh9L4= +github.com/pion/rtp v1.8.2/go.mod h1:pBGHaFt/yW7bf1jjWAoUjpSNoDnw98KTMg+jWWvziqU= github.com/pion/rtp v1.8.3 h1:VEHxqzSVQxCkKDSHro5/4IUUG1ea+MFdqR2R3xSpNU8= +github.com/pion/rtp v1.8.3/go.mod h1:pBGHaFt/yW7bf1jjWAoUjpSNoDnw98KTMg+jWWvziqU= +github.com/pion/sctp v1.8.5/go.mod h1:SUFFfDpViyKejTAdwD1d/HQsCu+V/40cCs2nZIvC3s0= +github.com/pion/sctp v1.8.8/go.mod h1:igF9nZBrjh5AtmKc7U30jXltsFHicFCXSmWA2GWRaWs= github.com/pion/sctp v1.8.9 h1:TP5ZVxV5J7rz7uZmbyvnUvsn7EJ2x/5q9uhsTtXbI3g= +github.com/pion/sctp v1.8.9/go.mod h1:cMLT45jqw3+jiJCrtHVwfQLnfR0MGZ4rgOJwUOIqLkI= github.com/pion/sdp/v3 v3.0.6 h1:WuDLhtuFUUVpTfus9ILC4HRyHsW6TdugjEX/QY9OiUw= +github.com/pion/sdp/v3 v3.0.6/go.mod h1:iiFWFpQO8Fy3S5ldclBkpXqmWy02ns78NOKoLLL0YQw= github.com/pion/srtp/v2 v2.0.18 h1:vKpAXfawO9RtTRKZJbG4y0v1b11NZxQnxRl85kGuUlo= +github.com/pion/srtp/v2 v2.0.18/go.mod h1:0KJQjA99A6/a0DOVTu1PhDSw0CXF2jTkqOoMg3ODqdA= github.com/pion/stun v0.6.1 h1:8lp6YejULeHBF8NmV8e2787BogQhduZugh5PdhDyyN4= +github.com/pion/stun v0.6.1/go.mod h1:/hO7APkX4hZKu/D0f2lHzNyvdkTGtIy3NDmLR7kSz/8= github.com/pion/transport v0.14.1 h1:XSM6olwW+o8J4SCmOBb/BpwZypkHeyM0PGFCxNQBr40= +github.com/pion/transport v0.14.1/go.mod h1:4tGmbk00NeYA3rUa9+n+dzCCoKkcy3YlYb99Jn2fNnI= +github.com/pion/transport/v2 v2.2.1/go.mod h1:cXXWavvCnFF6McHTft3DWS9iic2Mftcz1Aq29pGcU5g= +github.com/pion/transport/v2 v2.2.2/go.mod h1:OJg3ojoBJopjEeECq2yJdXH9YVrUJ1uQ++NjXLOUorc= +github.com/pion/transport/v2 v2.2.3/go.mod h1:q2U/tf9FEfnSBGSW6w5Qp5PFWRLRj3NjLhCCgpRK4p0= github.com/pion/transport/v2 v2.2.4 h1:41JJK6DZQYSeVLxILA2+F4ZkKb4Xd/tFJZRFZQ9QAlo= +github.com/pion/transport/v2 v2.2.4/go.mod h1:q2U/tf9FEfnSBGSW6w5Qp5PFWRLRj3NjLhCCgpRK4p0= github.com/pion/transport/v3 v3.0.1 
h1:gDTlPJwROfSfz6QfSi0ZmeCSkFcnWWiiR9ES0ouANiM= +github.com/pion/transport/v3 v3.0.1/go.mod h1:UY7kiITrlMv7/IKgd5eTUcaahZx5oUN3l9SzK5f5xE0= +github.com/pion/turn/v2 v2.1.3/go.mod h1:huEpByKKHix2/b9kmTAM3YoX6MKP+/D//0ClgUYR2fY= github.com/pion/turn/v2 v2.1.4 h1:2xn8rduI5W6sCZQkEnIUDAkrBQNl2eYIBCHMZ3QMmP8= +github.com/pion/turn/v2 v2.1.4/go.mod h1:huEpByKKHix2/b9kmTAM3YoX6MKP+/D//0ClgUYR2fY= github.com/pion/webrtc/v3 v3.2.23 h1:GbqEuxBbVLFhXk0GwxKAoaIJYiEa9TyoZPEZC+2HZxM= +github.com/pion/webrtc/v3 v3.2.23/go.mod h1:1CaT2fcZzZ6VZA+O1i9yK2DU4EOcXVvSbWG9pr5jefs= github.com/pjbgf/sha1cd v0.3.0 h1:4D5XXmUUBUl/xQ6IjCkEAbqXskkq/4O7LmGn0AqMDs4= github.com/pjbgf/sha1cd v0.3.0/go.mod h1:nZ1rrWOcGJ5uZgEEVL1VUM9iRQiZvWdbZjkKyFzPPsI= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -898,11 +997,13 @@ github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrb github.com/prometheus/client_golang v1.12.2/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang v1.13.0/go.mod h1:vTeo+zgvILHsnnj/39Ou/1fPN5nJFOEMgftOUOmlvYQ= github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= +github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos= +github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8= github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= @@ -911,6 +1012,7 @@ github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+ github.com/prometheus/common v0.35.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.47.0 h1:p5Cz0FNHo7SnWOmWmoRozVcjEp0bIVU8cV7OShpjL1k= +github.com/prometheus/common v0.47.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= @@ -919,12 +1021,15 @@ github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1 github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= +github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= 
github.com/prometheus/statsd_exporter v0.22.7 h1:7Pji/i2GuhK6Lu7DHrtTkFmNBCudCPT1pX2CziuyQR0= github.com/prometheus/statsd_exporter v0.22.7/go.mod h1:N/TevpjkIh9ccs6nuzY3jQn9dFqnUakOjnEuMPJJJnI= github.com/quic-go/qpack v0.4.0 h1:Cr9BXA1sQS2SmDUWjSofMPNKmvF6IiIfDRmgU0w1ZCo= github.com/quic-go/qpack v0.4.0/go.mod h1:UZVnYIfi5GRk+zI9UMaCPsmZ2xKJP7XBUvVyT1Knj9A= github.com/quic-go/quic-go v0.41.0 h1:aD8MmHfgqTURWNJy48IYFg2OnxwHT3JL7ahGs73lb4k= +github.com/quic-go/quic-go v0.41.0/go.mod h1:qCkNjqczPEvgsOnxZ0eCD14lv+B2LHlFAB++CNOh9hA= github.com/quic-go/webtransport-go v0.6.0 h1:CvNsKqc4W2HljHJnoT+rMmbRJybShZ0YPFDD3NxaZLY= +github.com/quic-go/webtransport-go v0.6.0/go.mod h1:9KjU4AEBqEQidGHNDkZrb8CAa1abRaosM2yGOyiikEc= github.com/raulk/go-watchdog v1.3.0 h1:oUmdlHxdkXRJlwfG0O9omj8ukerm8MEQavSiDTEtBsk= github.com/raulk/go-watchdog v1.3.0/go.mod h1:fIvOnLbF0b0ZwkB9YU4mOW9Did//4vPZtDqv66NfsMU= github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= @@ -937,6 +1042,7 @@ github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUc github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= @@ -948,6 +1054,7 @@ github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk= github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA= github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA= +github.com/sclevine/agouti v3.0.0+incompatible/go.mod h1:b4WX9W9L1sfQKXeJf1mUTLZKJ48R1S7H23Ji7oFO5Bw= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= @@ -1021,6 +1128,7 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= @@ -1050,6 +1158,7 @@ github.com/tetratelabs/wabin v0.0.0-20230304001439-f6f874872834/go.mod h1:m9ymHT github.com/tetratelabs/wazero v1.6.0 h1:z0H1iikCdP8t+q341xqepY4EWvHEw8Es7tlqiVzlP3g= github.com/tetratelabs/wazero v1.6.0/go.mod h1:0U0G41+ochRKoPKCJlh0jMg1CHkyfK8kDqiirMmKY8A= github.com/tj/assert v0.0.3 h1:Df/BlaZ20mq6kuai7f5z2TvPFiwC3xaWJSDQNiIS3Rk= +github.com/tj/assert 
v0.0.3/go.mod h1:Ne6X72Q+TB1AteidzQncjw9PabbMp4PBMZ1k+vd1Pvk= github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c h1:u6SKchux2yDvFQnDHS3lPnIRmfVJ5Sxy3ao2SIdysLQ= github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c/go.mod h1:hzIxponao9Kjc7aWznkXaL4U4TWaDSs8zcsY4Ka08nM= github.com/ucarion/urlpath v0.0.0-20200424170820-7ccc79b76bbb h1:Ywfo8sUltxogBpFuMOFRrrSifO788kAFxmvVw31PtQQ= @@ -1069,6 +1178,7 @@ github.com/wangjia184/sortedset v0.0.0-20160527075905-f5d03557ba30/go.mod h1:Yko github.com/warpfork/go-testmark v0.3.0/go.mod h1:jhEf8FVxd+F17juRubpmut64NEG6I2rgkUhlcqqXwE0= github.com/warpfork/go-testmark v0.9.0/go.mod h1:jhEf8FVxd+F17juRubpmut64NEG6I2rgkUhlcqqXwE0= github.com/warpfork/go-testmark v0.12.1 h1:rMgCpJfwy1sJ50x0M0NgyphxYYPMOODIJHhsXyEHU0s= +github.com/warpfork/go-testmark v0.12.1/go.mod h1:kHwy7wfvGSPh1rQJYKayD4AbtNaeyZdcGi9tNJTaa5Y= github.com/warpfork/go-wish v0.0.0-20200122115046-b9ea61034e4a/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw= github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0 h1:GDDkbFiaK8jsSDJfjId/PEGEShv6ugrt4kYsC5UIDaQ= github.com/warpfork/go-wish v0.0.0-20220906213052-39a1cc7a02d0/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw= @@ -1077,6 +1187,7 @@ github.com/whyrusleeping/base32 v0.0.0-20170828182744-c30ac30633cc/go.mod h1:r45 github.com/whyrusleeping/cbor v0.0.0-20171005072247-63513f603b11 h1:5HZfQkwe0mIfyDmc1Em5GqlNRzcdtlv4HTNmdpt7XH0= github.com/whyrusleeping/cbor v0.0.0-20171005072247-63513f603b11/go.mod h1:Wlo/SzPmxVp6vXpGt/zaXhHH0fn4IxgqZc82aKg6bpQ= github.com/whyrusleeping/cbor-gen v0.0.0-20240109153615-66e95c3e8a87 h1:S4wCk+ZL4WGGaI+GsmqCRyt68ISbnZWsK9dD9jYL0fA= +github.com/whyrusleeping/cbor-gen v0.0.0-20240109153615-66e95c3e8a87/go.mod h1:fgkXqYy7bV2cFeIEOkVTZS/WjXARfBqSH6Q2qHL33hQ= github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f h1:jQa4QT2UP9WYv2nzyawpKMOCl+Z/jW7djv2/J50lj9E= github.com/whyrusleeping/chunker v0.0.0-20181014151217-fe64bd25879f/go.mod h1:p9UJB6dDgdPgMJZs7UjUOdulKyRr9fqkS+6JKAInPy8= github.com/whyrusleeping/go-keyspace v0.0.0-20160322163242-5b898ac5add1 h1:EKhdznlJHPMoKr0XTrX+IlJs1LH3lyx2nfr1dOlZ79k= @@ -1118,11 +1229,13 @@ go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 h1:aFJWCqJMNjENlcleuuOkGAPH82y0yULBScfXcIEdS24= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1/go.mod h1:sEGXWArGqc3tVa+ekntsN65DmVbVeW+7lTKTjZF3/Fo= go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= +go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.40.0 h1:MZbjiZeMmn5wFMORhozpouGKDxj9POHTuU5UA8msBQk= go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.40.0/go.mod h1:C7tOYVCJmrDTCwxNny0MuUtnDIR3032vFHYke0F2ZrU= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.40.0 h1:q3FNPi8FLQVjLlmV+WWHQfH9ZCCtQIS0O/+dn1+4cJ4= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.40.0/go.mod h1:rmx4n0uSIAkKBeQYkygcv9dENAlL2/tv3OSq68h1JAo= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0 h1:mM8nKi6/iFQ0iqst80wDHU2ge198Ye/TfN0WBS5U24Y= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.24.0/go.mod h1:0PrIIzDteLSmNyxqcGYRL4mDIo8OTuBAOI/Bn1URxac= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.23.1 h1:o8iWeVFa1BcLtVEV0LzrCxV2/55tB3xLxADr6Kyoey4= go.opentelemetry.io/otel/exporters/otlp/otlptrace 
v1.23.1/go.mod h1:SEVfdK4IoBnbT2FXNM/k8yC08MrfbhWk3U4ljM8B3HE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0 h1:tIqheXEFWAZ7O8A7m+J0aPTmpJN3YQ7qetUAdkkkKpk= @@ -1130,11 +1243,17 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.21.0/go.mod h go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0 h1:FyjCyI9jVEfqhUh2MoSkmolPjfh5fp2hnV0b0irxH4Q= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.22.0/go.mod h1:hYwym2nDEeZfG/motx0p7L7J1N1vyzIThemQsb4g2qY= go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.21.0 h1:VhlEQAPp9R1ktYfrPk5SOryw1e9LDDTZCbIPFrho0ec= +go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.21.0/go.mod h1:kB3ufRbfU+CQ4MlUcqtW8Z7YEOBeK2DJ6CmR5rYYF3E= go.opentelemetry.io/otel/exporters/zipkin v1.21.0 h1:D+Gv6lSfrFBWmQYyxKjDd0Zuld9SRXpIrEsKZvE4DO4= +go.opentelemetry.io/otel/exporters/zipkin v1.21.0/go.mod h1:83oMKR6DzmHisFOW3I+yIMGZUTjxiWaiBI8M8+TU5zE= go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= +go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= +go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= go.opentelemetry.io/otel/sdk/metric v1.24.0 h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= +go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= +go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= go.opentelemetry.io/proto/otlp v1.1.0 h1:2Di21piLrCqJ3U3eXGCTPHE9R8Nh+0uglSnOyxikMeI= go.opentelemetry.io/proto/otlp v1.1.0/go.mod h1:GpBHCBWiqvVLDqmHZsoMM3C5ySeKTC7ej/RNTae6MdY= go.ptx.dk/multierrgroup v0.0.3 h1:HNaevFVERSZ7/DyCAnOICWF86B8s+76QwdwOlgCPvQM= @@ -1145,9 +1264,12 @@ go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/dig v1.17.1 h1:Tga8Lz8PcYNsWsyHMZ1Vm0OQOUaJNDyvPImgbAu9YSc= +go.uber.org/dig v1.17.1/go.mod h1:Us0rSJiThwCv2GteUN0Q7OKvU7n5J4dxZ9JKUXozFdE= go.uber.org/fx v1.20.1 h1:zVwVQGS8zYvhh9Xxcu4w1M6ESyeMzebzj2NbSayZ4Mk= +go.uber.org/fx v1.20.1/go.mod h1:iSYNbHf2y55acNCwCXKx7LbWb5WG1Bnue5RDXz1OREg= go.uber.org/goleak v1.1.11-0.20210813005559-691160354723/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= @@ -1161,6 +1283,7 @@ go.uber.org/zap v1.14.1/go.mod h1:Mb2vm2krFEG5DV0W9qcHBYFtp/Wku1cvYaqPsS/WYfc= go.uber.org/zap v1.16.0/go.mod h1:MA8QOfq0BHJwdXa996Y4dYkAqRKB8/1K1QMMZVaNZjQ= go.uber.org/zap v1.19.1/go.mod h1:j3DNczoxDZroyBnOT1L/Q79cfUMGZxlv/9dzN7SM1rI= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go4.org v0.0.0-20180809161055-417644f6feb5/go.mod h1:MkTOUMDaeVYJUOUsaDXIhWPZYa1yOyC1qaOBpL57BhE= go4.org 
v0.0.0-20200411211856-f5505b9728dd/go.mod h1:CIiUVy99QCPfoE13bO4EZaz5GZMZXMSBGhxRdsvzbkg= go4.org v0.0.0-20230225012048-214862532bf5 h1:nifaUDeh+rPaBCMPMQHZmvJf+QdpLFnuQPwx+LxVmtc= @@ -1187,6 +1310,12 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= +golang.org/x/crypto v0.8.0/go.mod h1:mRqEX+O9/h5TFCrQhkgjo2yKi0yYA+9ecGkdQoHrywE= +golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= +golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1274,15 +1403,18 @@ golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= +golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= @@ -1300,6 +1432,7 @@ golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.16.0 h1:aDkGMBSYxElaoP81NpoUoz2oo2R2wHdZpGToUxfyQrQ= 
+golang.org/x/oauth2 v0.16.0/go.mod h1:hqZ+0LWXsiVoZpeld6jVt06P3adbS2Uu911W1SsJv2o= golang.org/x/perf v0.0.0-20180704124530-6e6d33e29852/go.mod h1:JLpeXjPJfIyPr5TlbXLkXWLhP8nz10XfvxElABhCtcw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -1372,6 +1505,7 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210104204734-6f8348627aad/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1396,22 +1530,36 @@ golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/term 
v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.9.0/go.mod h1:M6DEAAIenWoTxdKrOltXcmDY3rSplQUkrvaDU5FcQyo= +golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= +golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= +golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1425,6 +1573,10 @@ golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1486,6 +1638,7 @@ golang.org/x/tools v0.0.0-20200904185747-39188db58858/go.mod h1:Cj7w3i3Rnn0Xh82u golang.org/x/tools v0.0.0-20201110124207-079ba7bd75cd/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20201201161351-ac6f37ff4c2a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20201208233053-a543418bbed2/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210105154028-b0ab187a4818/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210108195828-e2f9c7f1fc8e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= @@ -1500,7 +1653,9 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= +golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0= +gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU= google.golang.org/api v0.0.0-20180910000450-7ca32eb868bf/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= 
google.golang.org/api v0.0.0-20181030000543-1d582fd0359e/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= google.golang.org/api v0.1.0/go.mod h1:UGEZY7KEX120AnNLIHFMKIo4obdJhkp2tPbaPlQx13Y= @@ -1575,8 +1730,11 @@ google.golang.org/genproto v0.0.0-20201214200347-8c77b98c765d/go.mod h1:FWY/as6D google.golang.org/genproto v0.0.0-20210108203827-ffc7fda8c3d7/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210226172003-ab064af71705/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 h1:nz5NESFLZbJGPFxDT/HCn+V1mZ8JGNoY4nUpmW/Y2eg= +google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917/go.mod h1:pZqR+glSb11aJ+JQcczCvgf47+duRuzNSKqE8YAQnV0= google.golang.org/genproto/googleapis/api v0.0.0-20240108191215-35c7eff3a6b1 h1:OPXtXn7fNMaXwO3JvOmF1QyTc00jsSFFz1vXXBOdCDo= +google.golang.org/genproto/googleapis/api v0.0.0-20240108191215-35c7eff3a6b1/go.mod h1:B5xPO//w8qmBDjGReYLpR6UJPnkldGkCSMoH/2vxJeg= google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1 h1:gphdwh0npgs8elJ4T6J+DQJHPVF7RsuJHCfwztUb4J4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240108191215-35c7eff3a6b1/go.mod h1:daQN87bsDqDoe316QbbvX60nMoJQa4r6Ds0ZuoAe5yA= google.golang.org/grpc v1.14.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.16.0/go.mod h1:0JHn/cJsOMiMfNA9+DeHDlAU7KAAB5GDlYFpa9MZMio= google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs= @@ -1597,6 +1755,7 @@ google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv google.golang.org/grpc v1.34.0/go.mod h1:WotjhfgOW/POjDeRt8vscBtXq+2VjORFy659qA51WJ8= google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.61.1 h1:kLAiWrZs7YeDM6MumDe7m3y4aM6wacLzM1Y/wiLP9XY= +google.golang.org/grpc v1.61.1/go.mod h1:VUbo7IFqmF1QtCAstipjG0GIoq49KvMe9+h1jFLBNJs= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -1645,6 +1804,7 @@ gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.3.0 h1:MfDY1b1/0xN1CyMlQDac0ziEy9zJQd9CXBRRDHw2jJo= +gotest.tools/v3 v3.3.0/go.mod h1:Mcr9QNxkg0uMvy/YElmo4SpXgJKWgQvYrT7Kw5RzJ1A= grpc.go4.org v0.0.0-20170609214715-11d0a25b4919/go.mod h1:77eQGdRu53HpSqPFJFmuJdjuHRquDANNeA4x7B8WQ9o= honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= From 18220986eaadf7246ff0ce09a2e1505377244e27 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:53:52 -0700 Subject: [PATCH 15/17] Bump idna from 3.3 to 3.7 in /ops (#3786) Bumps [idna](https://github.com/kjd/idna) from 3.3 to 3.7.
Release notes

Sourced from idna's releases.

v3.7

What's Changed

  • Fix issue where specially crafted inputs to encode() could take exceptionally long amount of time to process. [CVE-2024-3651]

Thanks to Guido Vranken for reporting the issue.

Full Changelog: https://github.com/kjd/idna/compare/v3.6...v3.7
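For context on the fix above, here is a minimal sketch of the `idna` package's `encode()`/`decode()` entry points that CVE-2024-3651 concerns. The sample domain is illustrative only and is not taken from this patch.

```python
# Minimal sketch of the idna encode()/decode() API whose pathological-input
# runtime CVE-2024-3651 bounds; assumes `pip install idna`.
import idna

# encode() maps an internationalized domain name to its ASCII (Punycode) form;
# v3.7 fixes inputs that previously made this call take exceptionally long.
ascii_name = idna.encode("ドメイン.テスト")
print(ascii_name)               # b'xn--eckwd4c7c.xn--zckzah'

# decode() reverses the mapping back to the Unicode form.
print(idna.decode(ascii_name))  # ドメイン.テスト
```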

Changelog

Sourced from idna's changelog.

3.7 (2024-04-11) ++++++++++++++++

  • Fix issue where specially crafted inputs to encode() could take exceptionally long amount of time to process. [CVE-2024-3651]

Thanks to Guido Vranken for reporting the issue.

3.6 (2023-11-25)
++++++++++++++++

  • Fix regression to include tests in the source distribution.

3.5 (2023-11-24)
++++++++++++++++

  • Update to Unicode 15.1.0
  • String codec name is now "idna2008", as overriding the system codec "idna" was not working.
  • Fix typing error for codec encoding
  • "setup.cfg" has been added for this release due to some downstream lack of adherence to PEP 517. It should be removed in a future release, so please prepare accordingly.
  • Removed reliance on a symlink for the "idna-data" tool, to comport with PEP 517 and the Python Packaging User Guide for sdist archives.
  • Added a security reporting protocol for the project

Thanks Jon Ribbens, Diogo Teles Sant'Anna, and Wu Tingfeng for contributions to this release.

3.4 (2022-09-14)
++++++++++++++++

  • Update to Unicode 15.0.0
  • Migrate to pyproject.toml for build information (PEP 621)
  • Correct another instance where a generic exception was raised instead of IDNAError for malformed input
  • Source distribution uses zeroized file ownership for improved reproducibility

Thanks to Seth Michael Larson for contributions to this release.
Commits
  • 1d365e1 Release v3.7
  • c1b3154 Merge pull request #172 from kjd/optimize-contextj
  • 0394ec7 Merge branch 'master' into optimize-contextj
  • cd58a23 Merge pull request #152 from elliotwutingfeng/dev
  • 5beb28b More efficient resolution of joiner contexts
  • 1b12148 Update ossf/scorecard-action to v2.3.1
  • d516b87 Update Github actions/checkout to v4
  • c095c75 Merge branch 'master' into dev
  • 60a0a4c Fix typo in GitHub Actions workflow key
  • 5918a0e Merge branch 'master' into dev
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=idna&package-manager=pip&previous-version=3.3&new-version=3.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/bacalhau-project/bacalhau/network/alerts).
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: David Aronchick
---
 ops/poetry.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ops/poetry.lock b/ops/poetry.lock
index 3633253519..e916ede147 100644
--- a/ops/poetry.lock
+++ b/ops/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "cachetools"
@@ -210,13 +210,13 @@ grpc = ["grpcio (>=1.0.0,<2.0.0dev)"]
 
 [[package]]
 name = "idna"
-version = "3.3"
+version = "3.7"
 description = "Internationalized Domain Names in Applications (IDNA)"
 optional = false
 python-versions = ">=3.5"
 files = [
-    {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
-    {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
+    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
+    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
 ]
 
 [[package]]

From 68dc7cb399ea6c7186df8e8f581c7f1ca984e685 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:24:34 -0700
Subject: [PATCH 16/17] Bump idna from 3.6 to 3.7 in /python (#3788)

Bumps [idna](https://github.com/kjd/idna) from 3.6 to 3.7.
Release notes

Sourced from idna's releases.

v3.7

What's Changed

  • Fix issue where specially crafted inputs to encode() could take exceptionally long amount of time to process. [CVE-2024-3651]

Thanks to Guido Vranken for reporting the issue.

Full Changelog: https://github.com/kjd/idna/compare/v3.6...v3.7

Changelog

Sourced from idna's changelog.

3.7 (2024-04-11) ++++++++++++++++

  • Fix issue where specially crafted inputs to encode() could take exceptionally long amount of time to process. [CVE-2024-3651]

Thanks to Guido Vranken for reporting the issue.

Commits
  • 1d365e1 Release v3.7
  • c1b3154 Merge pull request #172 from kjd/optimize-contextj
  • 0394ec7 Merge branch 'master' into optimize-contextj
  • cd58a23 Merge pull request #152 from elliotwutingfeng/dev
  • 5beb28b More efficient resolution of joiner contexts
  • 1b12148 Update ossf/scorecard-action to v2.3.1
  • d516b87 Update Github actions/checkout to v4
  • c095c75 Merge branch 'master' into dev
  • 60a0a4c Fix typo in GitHub Actions workflow key
  • 5918a0e Merge branch 'master' into dev
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=idna&package-manager=pip&previous-version=3.6&new-version=3.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/bacalhau-project/bacalhau/network/alerts).
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: David Aronchick
---
 python/poetry.lock | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/poetry.lock b/python/poetry.lock
index 8e4dc6f7d5..0311e5559a 100644
--- a/python/poetry.lock
+++ b/python/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "bacalhau-apiclient"
@@ -460,13 +460,13 @@ license = ["ukkonen"]
 
 [[package]]
 name = "idna"
-version = "3.6"
+version = "3.7"
 description = "Internationalized Domain Names in Applications (IDNA)"
 optional = false
 python-versions = ">=3.5"
 files = [
-    {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"},
-    {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
+    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
+    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
 ]
 
 [[package]]
@@ -1151,6 +1151,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},

From 05a0d1cec12d380793ebfb28252e6a336c0eec6e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:44:35 -0700
Subject: [PATCH 17/17] Bump idna from 3.6 to 3.7 in /integration/airflow (#3789)

Bumps [idna](https://github.com/kjd/idna) from 3.6 to 3.7.
Release notes

Sourced from idna's releases.

v3.7

What's Changed

  • Fix an issue where specially crafted inputs to `encode()` could take an exceptionally long amount of time to process. [CVE-2024-3651]

Thanks to Guido Vranken for reporting the issue.

Full Changelog: https://github.com/kjd/idna/compare/v3.6...v3.7

Changelog

Sourced from idna's changelog.

3.7 (2024-04-11)

  • Fix an issue where specially crafted inputs to `encode()` could take an exceptionally long amount of time to process. [CVE-2024-3651]

Thanks to Guido Vranken for reporting the issue.
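
To make the fix concrete, here is a minimal sketch of how the affected `encode()` API is typically used and why the fix matters for untrusted input. The hostname values and the `to_ascii_hostname` helper are illustrative assumptions, not code from idna or from this repository:

```python
import idna

# Ordinary use: convert an internationalized domain name to its
# ASCII-compatible (Punycode) form before DNS resolution.
print(idna.encode("bücher.example"))  # b'xn--bcher-kva.example'

# The CVE matters when encode() runs on attacker-controlled input,
# e.g. a hostname taken from an incoming request. Before idna 3.7,
# a specially crafted value could make this call take an
# exceptionally long amount of time.
def to_ascii_hostname(hostname: str) -> bytes:
    try:
        return idna.encode(hostname)
    except idna.IDNAError as err:
        raise ValueError(f"invalid hostname {hostname!r}") from err
```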

Commits
  • 1d365e1 Release v3.7
  • c1b3154 Merge pull request #172 from kjd/optimize-contextj
  • 0394ec7 Merge branch 'master' into optimize-contextj
  • cd58a23 Merge pull request #152 from elliotwutingfeng/dev
  • 5beb28b More efficient resolution of joiner contexts
  • 1b12148 Update ossf/scorecard-action to v2.3.1
  • d516b87 Update Github actions/checkout to v4
  • c095c75 Merge branch 'master' into dev
  • 60a0a4c Fix typo in GitHub Actions workflow key
  • 5918a0e Merge branch 'master' into dev
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=idna&package-manager=pip&previous-version=3.6&new-version=3.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: David Aronchick
---
 integration/airflow/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration/airflow/requirements.txt b/integration/airflow/requirements.txt
index ea0d98dfbf..338b1ce635 100644
--- a/integration/airflow/requirements.txt
+++ b/integration/airflow/requirements.txt
@@ -206,7 +206,7 @@ httpx==0.26.0
     # via apache-airflow
 identify==2.5.35
     # via pre-commit
-idna==3.6
+idna==3.7
     # via
     #   anyio
     #   email-validator
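
After an upgrade like this lands, a quick runtime check can confirm that an environment actually picked up the patched release. A minimal sketch, assuming only the standard-library `importlib.metadata` API (Python 3.8+); the assertion message is illustrative:

```python
from importlib.metadata import version

# Any idna release before 3.7 still carries CVE-2024-3651.
installed = version("idna")
major, minor = (int(part) for part in installed.split(".")[:2])
assert (major, minor) >= (3, 7), f"idna {installed} predates the fix"
print(f"idna {installed} includes the CVE-2024-3651 fix")
```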