Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add boot-ip-setter #2767

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ OP_WIN_ZIP = neco-operation-cli-windows_$(VERSION)_amd64.zip
OP_MAC_ZIP = neco-operation-cli-mac_$(VERSION)_amd64.zip
DEBBUILD_FLAGS = -Znone
BIN_PKGS = ./pkg/neco
SBIN_PKGS = ./pkg/neco-updater ./pkg/neco-worker
SBIN_PKGS = ./pkg/neco-updater ./pkg/neco-worker ./pkg/boot-ip-setter
OPDEB_BINNAMES = argocd hubble jsonnet jsonnetfmt jsonnet-lint kubectl kubeseal kustomize logcli stern tsh kubectl-moco kubectl-accurate amtool yq tempo-cli flamegraph.pl stackcollapse-perf.pl necoperf-cli necoip nsdump clusterdump cmctl vmalert-tool npv
OPDEB_DOCNAMES = argocd hubble jsonnet kubectl kubeseal kustomize logcli stern teleport moco accurate alertmanager yq tempo flamegraph necoperf cmctl vmalert-tool

Expand Down
18 changes: 18 additions & 0 deletions constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,24 @@ const (
NecoRebooterService = "neco-rebooter"
)

// Virtual IP addresses that are set on the boot servers.
const (
	// Virtual IP addresses for the DHCP servers.
	VirtualIPAddrDHCPServer1 = "10.71.255.1"
	VirtualIPAddrDHCPServer2 = "10.71.255.2"
	VirtualIPAddrDHCPServer3 = "10.71.255.3"
	VirtualIPAddrDHCPServer4 = "10.71.255.4"
	VirtualIPAddrDHCPServer5 = "10.71.255.5"
	// Virtual IP address for reaching one of the active boot servers.
	VirtualIPAddrActiveBootServer = "10.71.255.6"
)

// DHCPServerAddressList lists every DHCP server virtual IP address,
// from VirtualIPAddrDHCPServer1 through VirtualIPAddrDHCPServer5, in that order.
var DHCPServerAddressList = []string{
	VirtualIPAddrDHCPServer1,
	VirtualIPAddrDHCPServer2,
	VirtualIPAddrDHCPServer3,
	VirtualIPAddrDHCPServer4,
	VirtualIPAddrDHCPServer5,
}

// File locations
var (
RackFile = filepath.Join(NecoDir, "rack")
Expand Down
52 changes: 52 additions & 0 deletions dctest/boot-ip-setter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package dctest

import (
"github.com/cybozu-go/neco"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// testBootIPSetter checks, as part of bootstrapping, that boot-ip-setter has
// put the virtual IPs on the boot servers gcp0-boot-0 through gcp0-boot-2.
func testBootIPSetter() {
	It("should set Virtual IPs to boot servers", func() {
		// Any of these hosts may answer on the active boot server address.
		activeHosts := []string{"gcp0-boot-0", "gcp0-boot-1", "gcp0-boot-2"}

		// Each DHCP server address must be answered by the host listed here.
		dhcpHosts := map[string]string{
			"10.71.255.1": "gcp0-boot-0",
			"10.71.255.2": "gcp0-boot-1",
			"10.71.255.3": "gcp0-boot-2",
		}

		checkBootServerVirtualIPs(dhcpHosts, activeHosts)
	})
}

// checkBootServerVirtualIPs asserts, from every machine that sabakan reports
// without the boot role, which host answers on each boot server virtual IP.
//
// expectedDHCPServerHostname maps a DHCP server virtual IP to the hostname that
// must answer on it. Every address in neco.DHCPServerAddressList that is missing
// from the map (or mapped to "") must NOT answer.
//
// expectedActiveBootServerHostname lists the hostnames that are allowed to
// answer on neco.VirtualIPAddrActiveBootServer.
func checkBootServerVirtualIPs(expectedDHCPServerHostname map[string]string, expectedActiveBootServerHostname []string) {
	machines, err := getSabakanMachines("--without-role=boot")
	Expect(err).NotTo(HaveOccurred())

	// Resolve the node addresses once, up front, so that a machine without an
	// IPv4 address fails with a readable message instead of an index panic.
	nodeIPs := make([]string, 0, len(machines))
	for _, m := range machines {
		Expect(m.Spec.IPv4).NotTo(BeEmpty(), "machine has no IPv4 address: %+v", m.Spec)
		nodeIPs = append(nodeIPs, m.Spec.IPv4[0])
	}

	By("checking dhcp server addresses")
	for _, nodeIP := range nodeIPs {
		for _, vip := range neco.DHCPServerAddressList {
			// The request is the same whether or not the address is expected to
			// be in use; only the interpretation of the result differs.
			stdout, stderr, err := execAt(bootServers[0], "ckecli", "ssh", "cybozu@"+nodeIP, "--", "curl", "-m", "2", "-sS", "http://"+vip+":4192/hostname")

			host := expectedDHCPServerHostname[vip]
			if host == "" {
				// No boot server should hold this address, so the request must fail.
				Expect(err).To(HaveOccurred(), "from=%s, to=%s, stdout=%s, stderr=%s", nodeIP, vip, stdout, stderr)
				continue
			}
			Expect(err).NotTo(HaveOccurred(), "from=%s, to=%s, stdout=%s, stderr=%s", nodeIP, vip, stdout, stderr)
			Expect(string(stdout)).To(Equal(host))
		}
	}

	By("checking active boot server address")
	for _, nodeIP := range nodeIPs {
		stdout, stderr, err := execAt(bootServers[0], "ckecli", "ssh", "cybozu@"+nodeIP, "--", "curl", "-m", "2", "-sS", "http://"+neco.VirtualIPAddrActiveBootServer+":4192/hostname")
		Expect(err).NotTo(HaveOccurred(), "from=%s, stdout=%s, stderr=%s", nodeIP, stdout, stderr)
		Expect(string(stdout)).To(BeElementOf(expectedActiveBootServerHostname))
	}
}
30 changes: 30 additions & 0 deletions dctest/join_remove_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,22 @@ func testJoinRemove() {
}).Should(Succeed())
})

It("should set Virtual IPs to boot-3", func() {
expectedDHCPServerHostname := map[string]string{
"10.71.255.1": "gcp0-boot-0",
"10.71.255.2": "gcp0-boot-1",
"10.71.255.3": "gcp0-boot-2",
"10.71.255.4": "gcp0-boot-3",
}
expectedActiveBootServerHostname := []string{
"gcp0-boot-0",
"gcp0-boot-1",
"gcp0-boot-2",
"gcp0-boot-3",
}
checkBootServerVirtualIPs(expectedDHCPServerHostname, expectedActiveBootServerHostname)
})

It("should remove boot-3", func() {
By("Running neco leave 3")
token := getVaultToken()
Expand Down Expand Up @@ -203,6 +219,20 @@ func testJoinRemove() {
time.Sleep(3 * time.Minute)
})

It("should remove virtual IPs from boot-3", func() {
expectedDHCPServerHostname := map[string]string{
"10.71.255.1": "gcp0-boot-0",
"10.71.255.2": "gcp0-boot-1",
"10.71.255.3": "gcp0-boot-2",
}
expectedActiveBootServerHostname := []string{
"gcp0-boot-0",
"gcp0-boot-1",
"gcp0-boot-2",
}
checkBootServerVirtualIPs(expectedDHCPServerHostname, expectedActiveBootServerHostname)
})

It("should set state of boot-3 to unreachable", func() {
By("Stopping boot-3")
// In DCtest on CircleCI, ginkgo is executed in the operation pod, so you cannot use pmctl in this context.
Expand Down
1 change: 1 addition & 0 deletions dctest/suites_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ var bootstrapSuite = func() {
Context("init-data", testInitData)
Context("etcdpasswd", testEtcdpasswd)
Context("sabakan-state-setter", testSabakanStateSetter)
Context("boot-ip-setter", testBootIPSetter)
Context("ignitions", testIgnitions)
Context("cke", func() {
testCKESetup()
Expand Down
2 changes: 1 addition & 1 deletion debian/DEBIAN/postinst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/sh -e

SERVICES="node-exporter neco-updater neco-worker sabakan-state-setter neco-rebooter cke cke-localproxy teleport-node"
SERVICES="node-exporter neco-updater neco-worker sabakan-state-setter neco-rebooter cke cke-localproxy teleport-node boot-ip-setter"
TIMERS="docker-prune kill-old-login-sessions backup-cke-etcd export-unit-status trigger-reboot-all-nodes"

configure() {
Expand Down
2 changes: 1 addition & 1 deletion debian/DEBIAN/prerm
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/sh -e

SERVICES="neco-updater neco-worker node-exporter sabakan-state-setter neco-rebooter cke cke-localproxy teleport-node trigger-reboot-all-nodes"
SERVICES="neco-updater neco-worker node-exporter sabakan-state-setter neco-rebooter cke cke-localproxy teleport-node trigger-reboot-all-nodes boot-ip-setter"
TIMERS="docker-prune kill-old-login-sessions backup-cke-etcd export-unit-status trigger-reboot-all-nodes"

prerm() {
Expand Down
15 changes: 15 additions & 0 deletions debian/lib/systemd/system/boot-ip-setter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# systemd unit that runs boot-ip-setter as a long-running service.
[Unit]
Description=boot ip setter
# Order this unit after the network is online and after sabakan.
# Wants= pulls both units in without making them hard requirements.
After=network-online.target sabakan.service
Wants=network-online.target sabakan.service
# Start attempts are rate limited over a 10-minute window
# (StartLimitBurst is not set here, so the systemd default applies).
StartLimitIntervalSec=600s

[Service]
Type=simple
Restart=on-failure
# systemd treats termination by SIGPIPE as a clean exit, so Restart=on-failure
# alone would not restart the service in that case; force a restart for it.
RestartForceExitStatus=SIGPIPE
RestartSec=30s
ExecStart=/usr/sbin/boot-ip-setter

[Install]
WantedBy=multi-user.target
96 changes: 96 additions & 0 deletions docs/boot-ip-setter.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
boot-ip-setter
==============

`boot-ip-setter` is a daemon program for handling virtual IP addresses on the boot servers.
It runs on active boot servers, that is, servers where the neco package has been installed and components such as etcd and sabakan are running.

This program handles the following two types of virtual IP addresses for different uses:

1. DHCP Server Address

The IP address for Sabakan DHCP server. This address is used as the DHCP relay destination in the network switches.

This program selects one of the five addresses from `10.71.255.1` to `10.71.255.5` and sets that address on the server it is running on.
If multiple active boot servers exist, these addresses will be set without bias to each server.

2. Active Boot Server Address

The IP address for accessing one of the boot servers from inside the Kubernetes cluster.
The value is fixed at `10.71.255.6`. The same value is set for all active boot servers.

This program decides whether or not to set these IPs based on the member list of the etcd cluster on boot servers and sets the IPs to the network interface.


## Usage (Options)

```console
$ boot-ip-setter [OPTIONS]
```

| Option | Default value | Description |
| --------------- | -------------- | -------------------------------------------------------- |
| `-debug` | `false` | Show debug log or not. |
| `-interface` | `boot` | The target network interface that this program operates. |
| `-interval` | `1m` | The interval for periodic operation. |
| `-listen-addr`  | `0.0.0.0:4192` | The listen address. |


## HTTP endpoint

This program provides the following HTTP endpoints.

- `/hostname`

This endpoint returns the hostname of the server that this program runs on.
This is mainly intended for use in testing or operational checks.

- `/metrics`

This endpoint returns the metrics. For details on metrics, please refer to the next section.


## Metrics

This program provides the following metrics in the Prometheus format.
Besides this, it also outputs the metrics collected in the `GoCollector` and the `ProcessCollector` of the [Prometheus Go client library](https://github.com/prometheus/client_golang).

| Name | Description | Type | Labels |
| ------------------------------------------------- | --------------------------------------------------- | ------- | ------------------- |
| `boot_ip_setter_hostname` | The hostname this program runs on. | Gauge | `hostname` |
| `boot_ip_setter_interface_address` | The IP address set to the target interface. | Gauge | `interface`, `ipv4` |
| `boot_ip_setter_interface_operation_errors_total` | The number of times the interface operation failed. | Counter | |


## Internals

### Main process

This program repeats the following actions at the interval given by the `-interval` option (one minute by default).

- Gets member list of the etcd cluster on boot servers.
- Calculates, from the member list, the virtual IPs that should be set.
- Sets the IP address to the target network interface. If there are any unnecessary IPs on the interface, this program deletes them.

This program doesn't advertise the IPs; it only sets them on the network interface.

### Signal Handling

This program terminates normally when receiving `SIGTERM` or `SIGINT`.

### Error Handling

This program handles errors as follows.

- Connection failure to the etcd

This program will terminate abnormally and delete the IPs on the target interface on exit.
These errors may be resolved by retrying, so this program terminates early and retries from the beginning.

- Operation failure of the network interface

This program will count up the `boot_ip_setter_interface_operation_errors_total` metric.
These errors may not be recoverable by restarting, so this program keeps running and reports the errors through the metric.

- Other failure

If an error other than the above occurs, this program will terminate abnormally and delete the IPs on the target interface on exit.
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ require (
go.etcd.io/etcd/client/v3 v3.5.17
golang.org/x/crypto v0.31.0
golang.org/x/oauth2 v0.24.0
golang.org/x/sync v0.10.0
golang.org/x/term v0.27.0
k8s.io/api v0.31.0
k8s.io/apimachinery v0.31.0
Expand Down Expand Up @@ -98,6 +99,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/miekg/dns v1.1.41 // indirect
Expand Down Expand Up @@ -132,7 +134,6 @@ require (
go.uber.org/zap v1.27.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.5.0 // indirect
Expand Down
9 changes: 3 additions & 6 deletions menu/cluster_yaml_generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"io"

"github.com/cybozu-go/neco"
"github.com/cybozu-go/placemat/v2/pkg/types"
"k8s.io/apimachinery/pkg/runtime/serializer/json"
"sigs.k8s.io/yaml"
Expand Down Expand Up @@ -487,12 +488,8 @@ func (c *Cluster) createToRNetNs(rack *rack, tor *tor, torIdx int) *types.NetNSS
fmt.Sprintf("--pid-file=/var/run/dnsmasq_%s.pid", name),
"--log-facility=-",
}
for _, r := range c.racks {
if r.name == rack.name {
continue
}
dnsmasqCommand = append(dnsmasqCommand, "--dhcp-relay")
dnsmasqCommand = append(dnsmasqCommand, fmt.Sprintf("%s,%s", tor.nodeAddress.IP.String(), r.bootNode.node0Address.IP.String()))
for _, ip := range neco.DHCPServerAddressList {
dnsmasqCommand = append(dnsmasqCommand, "--dhcp-relay", fmt.Sprintf("%s,%s", tor.nodeAddress.IP.String(), ip))
}
torNs.Apps = append(torNs.Apps, &types.NetNSAppSpec{
Name: "dnsmasq",
Expand Down
40 changes: 36 additions & 4 deletions menu/testdata/cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,15 @@ apps:
- --pid-file=/var/run/dnsmasq_rack0-tor1.pid
- --log-facility=-
- --dhcp-relay
- 10.69.0.65,10.69.0.195
- 10.69.0.65,10.71.255.1
- --dhcp-relay
- 10.69.0.65,10.71.255.2
- --dhcp-relay
- 10.69.0.65,10.71.255.3
- --dhcp-relay
- 10.69.0.65,10.71.255.4
- --dhcp-relay
- 10.69.0.65,10.71.255.5
interfaces:
- addresses:
- 10.72.1.1/31
Expand Down Expand Up @@ -549,7 +557,15 @@ apps:
- --pid-file=/var/run/dnsmasq_rack0-tor2.pid
- --log-facility=-
- --dhcp-relay
- 10.69.0.129,10.69.0.195
- 10.69.0.129,10.71.255.1
- --dhcp-relay
- 10.69.0.129,10.71.255.2
- --dhcp-relay
- 10.69.0.129,10.71.255.3
- --dhcp-relay
- 10.69.0.129,10.71.255.4
- --dhcp-relay
- 10.69.0.129,10.71.255.5
interfaces:
- addresses:
- 10.72.1.3/31
Expand Down Expand Up @@ -579,7 +595,15 @@ apps:
- --pid-file=/var/run/dnsmasq_rack1-tor1.pid
- --log-facility=-
- --dhcp-relay
- 10.69.1.1,10.69.0.3
- 10.69.1.1,10.71.255.1
- --dhcp-relay
- 10.69.1.1,10.71.255.2
- --dhcp-relay
- 10.69.1.1,10.71.255.3
- --dhcp-relay
- 10.69.1.1,10.71.255.4
- --dhcp-relay
- 10.69.1.1,10.71.255.5
interfaces:
- addresses:
- 10.72.1.5/31
Expand Down Expand Up @@ -609,7 +633,15 @@ apps:
- --pid-file=/var/run/dnsmasq_rack1-tor2.pid
- --log-facility=-
- --dhcp-relay
- 10.69.1.65,10.69.0.3
- 10.69.1.65,10.71.255.1
- --dhcp-relay
- 10.69.1.65,10.71.255.2
- --dhcp-relay
- 10.69.1.65,10.71.255.3
- --dhcp-relay
- 10.69.1.65,10.71.255.4
- --dhcp-relay
- 10.69.1.65,10.71.255.5
interfaces:
- addresses:
- 10.72.1.7/31
Expand Down
Loading