From e3ccb40b5d23acb76be8901d1f5a8d19a10398e9 Mon Sep 17 00:00:00 2001 From: Plamen Petrov Date: Mon, 21 Sep 2020 10:42:24 +0300 Subject: [PATCH 1/4] kill shim functionality firecracker update Signed-off-by: Plamen Petrov --- Makefile | 4 +- _submodules/firecracker | 2 +- firecracker-control/local.go | 204 ++++++++++++++++++++++++++++++++++- runtime/service.go | 43 -------- 4 files changed, 205 insertions(+), 48 deletions(-) diff --git a/Makefile b/Makefile index 80f467cd8..812b61fda 100644 --- a/Makefile +++ b/Makefile @@ -278,8 +278,8 @@ demo-network: install-cni-bins $(FCNET_CONFIG) # Firecracker submodule ########################## .PHONY: firecracker -firecracker: $(FIRECRACKER_BIN) $(JAILER_BIN) - +firecracker: + _submodules/firecracker/tools/devtool build --release .PHONY: install-firecracker install-firecracker: firecracker install -D -o root -g root -m755 -t $(INSTALLROOT)/bin $(FIRECRACKER_BIN) diff --git a/_submodules/firecracker b/_submodules/firecracker index 57f4c7ca1..9511525c8 160000 --- a/_submodules/firecracker +++ b/_submodules/firecracker @@ -1 +1 @@ -Subproject commit 57f4c7ca14a31c5536f188cacb669d2cad32b9ca +Subproject commit 9511525c8d8abf836a45f922ca69ef18b04efa41 diff --git a/firecracker-control/local.go b/firecracker-control/local.go index 668e096fc..199eed466 100644 --- a/firecracker-control/local.go +++ b/firecracker-control/local.go @@ -71,6 +71,8 @@ type local struct { processesMu sync.Mutex processes map[string]int32 + + fcControlSocket *net.UnixListener } func newLocal(ic *plugin.InitContext) (*local, error) { @@ -243,7 +245,7 @@ func (s *local) StopVM(requestCtx context.Context, req *proto.StopVMRequest) (*e defer client.Close() resp, shimErr := client.StopVM(requestCtx, req) - waitErr := s.waitForShimToExit(requestCtx, req.VMID) + waitErr := s.waitForShimToExit(requestCtx, req.VMID, false) // Assuming the shim is returning containerd's error code, return the error as is if possible. if waitErr == nil { @@ -252,7 +254,7 @@ func (s *local) StopVM(requestCtx context.Context, req *proto.StopVMRequest) (*e return resp, multierror.Append(shimErr, waitErr).ErrorOrNil() } -func (s *local) waitForShimToExit(ctx context.Context, vmID string) error { +func (s *local) waitForShimToExit(ctx context.Context, vmID string, killShim bool) error { socketAddr, err := fcShim.SocketAddress(ctx, vmID) if err != nil { return err @@ -267,6 +269,17 @@ func (s *local) waitForShimToExit(ctx context.Context, vmID string) error { } defer delete(s.processes, socketAddr) + if killShim { + s.logger.Debug("Killing shim") + + if err := syscall.Kill(int(pid), 9); err != nil { + s.logger.WithError(err).Error("Failed to kill shim process") + return err + } + + return nil + } + return internal.WaitForPidToExit(ctx, stopVMInterval, pid) } @@ -464,6 +477,150 @@ func setShimOOMScore(shimPid int) error { return nil } +func (s *local) loadShim(ctx context.Context, ns, vmID, containerdAddress string) (*exec.Cmd, error) { + logger := s.logger.WithField("vmID", vmID) + logger.Debug("Loading shim") + + shimSocketAddress, err := fcShim.SocketAddress(ctx, vmID) + if err != nil { + err = errors.Wrap(err, "failed to obtain shim socket address") + s.logger.WithError(err).Error() + return nil, err + } + + shimSocket, err := shim.NewSocket(shimSocketAddress) + if isEADDRINUSE(err) { + return nil, status.Errorf(codes.AlreadyExists, "VM with ID %q already exists (socket: %q)", vmID, shimSocketAddress) + } else if err != nil { + err = errors.Wrapf(err, "failed to open shim socket at address %q", shimSocketAddress) + s.logger.WithError(err).Error() + return nil, err + } + + // If we're here, there is no pre-existing shim for this VMID, so we spawn a new one + defer shimSocket.Close() + if err := os.Mkdir(s.config.ShimBaseDir, 0700); err != nil && !os.IsExist(err) { + s.logger.WithError(err).Error() + return nil, errors.Wrapf(err, "failed to make shim base directory: %s", s.config.ShimBaseDir) + } + + shimDir, err := vm.ShimDir(s.config.ShimBaseDir, ns, vmID) + if err != nil { + err = errors.Wrapf(err, "failed to build shim path") + s.logger.WithError(err).Error() + return nil, err + } + + err = shimDir.Mkdir() + if err != nil { + err = errors.Wrapf(err, "failed to create VM dir %q", shimDir.RootPath()) + s.logger.WithError(err).Error() + return nil, err + } + + fcSocketAddress, err := fcShim.FCControlSocketAddress(ctx, vmID) + if err != nil { + err = errors.Wrap(err, "failed to obtain shim socket address") + s.logger.WithError(err).Error() + return nil, err + } + + fcSocket, err := shim.NewSocket(fcSocketAddress) + if err != nil { + err = errors.Wrapf(err, "failed to open fccontrol socket at address %q", fcSocketAddress) + s.logger.WithError(err).Error() + return nil, err + } + + s.fcControlSocket = fcSocket + + args := []string{ + "-namespace", ns, + "-address", containerdAddress, + } + + cmd := exec.Command(internal.ShimBinaryName, args...) + + // note: The working dir of the shim has an effect on the length of the path + // needed to specify various unix sockets that the shim uses to communicate + // with the firecracker VMM and guest agent within. The length of that path + // has a relatively low limit (usually 108 chars), so modifying the working + // dir should be done with caution. See internal/vm/dir.go for the path + // definitions. + cmd.Dir = shimDir.RootPath() + + shimSocketFile, err := shimSocket.File() + if err != nil { + err = errors.Wrap(err, "failed to get shim socket fd") + logger.WithError(err).Error() + return nil, err + } + + fcSocketFile, err := fcSocket.File() + if err != nil { + err = errors.Wrap(err, "failed to get shim fccontrol socket fd") + logger.WithError(err).Error() + return nil, err + } + + cmd.ExtraFiles = append(cmd.ExtraFiles, shimSocketFile, fcSocketFile) + fcSocketFDNum := 2 + len(cmd.ExtraFiles) // "2 +" because ExtraFiles come after stderr (fd #2) + + ttrpc := containerdAddress + ".ttrpc" + cmd.Env = append(os.Environ(), + fmt.Sprintf("%s=%s", ttrpcAddressEnv, ttrpc), + fmt.Sprintf("%s=%s", internal.VMIDEnvVarKey, vmID), + fmt.Sprintf("%s=%s", internal.FCSocketFDEnvKey, strconv.Itoa(fcSocketFDNum))) // TODO remove after containerd is updated to expose ttrpc server to shim + + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + + // shim stderr is just raw text, so pass it through our logrus formatter first + cmd.Stderr = logger.WithField("shim_stream", "stderr").WriterLevel(logrus.ErrorLevel) + // shim stdout on the other hand is already formatted by logrus, so pass that transparently through to containerd logs + cmd.Stdout = logger.Logger.Out + + logger.Debugf("starting %s", internal.ShimBinaryName) + + err = cmd.Start() + if err != nil { + err = errors.Wrap(err, "failed to start shim child process") + logger.WithError(err).Error() + return nil, err + } + + // make sure to wait after start + go func() { + if err := cmd.Wait(); err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + // shim is usually terminated by cancelling the context + logger.WithError(exitErr).Debug("shim has been terminated") + } else { + logger.WithError(err).Error("shim has been unexpectedly terminated") + } + } + + // Close all Unix abstract sockets. + if err := shimSocketFile.Close(); err != nil { + logger.WithError(err).Errorf("failed to close %q", shimSocketFile.Name()) + } + if err := fcSocketFile.Close(); err != nil { + logger.WithError(err).Errorf("failed to close %q", fcSocketFile.Name()) + } + }() + + err = setShimOOMScore(cmd.Process.Pid) + if err != nil { + logger.WithError(err).Error() + return nil, err + } + + s.addShim(shimSocketAddress, cmd) + + return cmd, nil +} + // PauseVM Pauses a VM func (s *local) PauseVM(ctx context.Context, req *proto.PauseVMRequest) (*empty.Empty, error) { client, err := s.shimFirecrackerClient(ctx, req.VMID) @@ -520,6 +677,18 @@ func (s *local) CreateSnapshot(ctx context.Context, req *proto.CreateSnapshotReq // LoadSnapshot Loads a snapshot of a VM func (s *local) LoadSnapshot(ctx context.Context, req *proto.LoadSnapshotRequest) (*empty.Empty, error) { + ns, err := namespaces.NamespaceRequired(ctx) + if err != nil { + err = errors.Wrap(err, "error retrieving namespace of request") + s.logger.WithError(err).Error() + return nil, err + } + + _, err = s.loadShim(ctx, ns, req.VMID, s.containerdAddress) + if err != nil { + return nil, err + } + client, err := s.shimFirecrackerClient(ctx, req.VMID) if err != nil { return nil, err @@ -553,5 +722,36 @@ func (s *local) Offload(ctx context.Context, req *proto.OffloadRequest) (*empty. return nil, err } + s.fcControlSocket.Close() + + shimSocketAddress, err := fcShim.SocketAddress(ctx, req.VMID) + if err != nil { + err = errors.Wrap(err, "failed to obtain shim socket address") + s.logger.WithError(err).Error() + return nil, err + } + removeErr := os.RemoveAll(shimSocketAddress) + if removeErr != nil { + s.logger.Errorf("failed to remove shim socket addr file: %v", removeErr) + return nil, err + } + + fcSocketAddress, err := fcShim.FCControlSocketAddress(ctx, req.VMID) + if err != nil { + s.logger.Error("failed to get FC socket address") + return nil, err + } + removeErr = os.RemoveAll(fcSocketAddress) + if removeErr != nil { + s.logger.Errorf("failed to remove fc socket addr file: %v", removeErr) + return nil, err + } + + waitErr := s.waitForShimToExit(ctx, req.VMID, true) + if waitErr != nil { + s.logger.Error("failed to wait for shim to exit on offload") + return nil, waitErr + } + return resp, nil } diff --git a/runtime/service.go b/runtime/service.go index ca0d86d5c..c7e03feb0 100644 --- a/runtime/service.go +++ b/runtime/service.go @@ -811,24 +811,6 @@ func (s *service) LoadSnapshot(ctx context.Context, req *proto.LoadSnapshotReque return nil, err } - // Workaround for https://github.com/firecracker-microvm/firecracker/issues/1974 - patchDriveReq, err := formPatchDriveReq("MN2HE43UOVRDA", s.taskDrivePathOnHost) - if err != nil { - s.logger.WithError(err).Error("Failed to create patch drive request") - return nil, err - } - - resp, err = s.httpControlClient.Do(patchDriveReq) - if err != nil { - s.logger.WithError(err).Error("Failed to send patch drive request") - return nil, err - } - if !strings.Contains(resp.Status, "204") { - s.logger.WithError(err).Error("Failed to patch drive") - s.logger.WithError(err).Errorf("Status of request: %s", resp.Status) - return nil, err - } - return &empty.Empty{}, nil } @@ -1643,31 +1625,6 @@ func formCreateSnapReq(snapshotPath, memPath string) (*http.Request, error) { return req, nil } -func formPatchDriveReq(driveID, pathOnHost string) (*http.Request, error) { - var req *http.Request - - data := map[string]string{ - "drive_id": driveID, - "path_on_host": pathOnHost, - } - json, err := json.Marshal(data) - if err != nil { - logrus.WithError(err).Error("Failed to marshal json data") - return nil, err - } - - req, err = http.NewRequest("PATCH", fmt.Sprintf("http+unix://firecracker/drives/%s", driveID), bytes.NewBuffer(json)) - if err != nil { - logrus.WithError(err).Error("Failed to create new HTTP request in formPauseReq") - return nil, err - } - - req.Header.Add("accept", "application/json") - req.Header.Add("Content-Type", "application/json") - - return req, nil -} - func (s *service) startFirecrackerProcess() error { firecPath, err := exec.LookPath("firecracker") if err != nil { From db02b0c7eeab8c5e5106c0d167392c188e77aed1 Mon Sep 17 00:00:00 2001 From: Plamen Petrov Date: Mon, 21 Sep 2020 17:07:54 +0300 Subject: [PATCH 2/4] Remove unnecessary mkdir * Check that shim dir exists when loading shim * No longer try to create shim dir when loading shim, as it must exist Signed-off-by: Plamen Petrov --- firecracker-control/local.go | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/firecracker-control/local.go b/firecracker-control/local.go index 199eed466..973c5e0ed 100644 --- a/firecracker-control/local.go +++ b/firecracker-control/local.go @@ -272,7 +272,7 @@ func (s *local) waitForShimToExit(ctx context.Context, vmID string, killShim boo if killShim { s.logger.Debug("Killing shim") - if err := syscall.Kill(int(pid), 9); err != nil { + if err := syscall.Kill(int(pid), syscall.SIGKILL); err != nil { s.logger.WithError(err).Error("Failed to kill shim process") return err } @@ -442,10 +442,6 @@ func (s *local) newShim(ns, vmID, containerdAddress string, shimSocket *net.Unix if err := fcSocketFile.Close(); err != nil { logger.WithError(err).Errorf("failed to close %q", fcSocketFile.Name()) } - - if err := os.RemoveAll(shimDir.RootPath()); err != nil { - logger.WithError(err).Errorf("failed to remove %q", shimDir.RootPath()) - } }() err = setShimOOMScore(cmd.Process.Pid) @@ -497,11 +493,11 @@ func (s *local) loadShim(ctx context.Context, ns, vmID, containerdAddress string return nil, err } - // If we're here, there is no pre-existing shim for this VMID, so we spawn a new one defer shimSocket.Close() - if err := os.Mkdir(s.config.ShimBaseDir, 0700); err != nil && !os.IsExist(err) { - s.logger.WithError(err).Error() - return nil, errors.Wrapf(err, "failed to make shim base directory: %s", s.config.ShimBaseDir) + + // If we're here, there is no pre-existing shim for this VMID, so we spawn a new one + if _, err := os.Stat(s.config.ShimBaseDir); os.IsNotExist(err) { + return nil, errors.Wrapf(err, "shim base dir does not exist: %s", s.config.ShimBaseDir) } shimDir, err := vm.ShimDir(s.config.ShimBaseDir, ns, vmID) @@ -511,13 +507,6 @@ func (s *local) loadShim(ctx context.Context, ns, vmID, containerdAddress string return nil, err } - err = shimDir.Mkdir() - if err != nil { - err = errors.Wrapf(err, "failed to create VM dir %q", shimDir.RootPath()) - s.logger.WithError(err).Error() - return nil, err - } - fcSocketAddress, err := fcShim.FCControlSocketAddress(ctx, vmID) if err != nil { err = errors.Wrap(err, "failed to obtain shim socket address") From 57e5f9a27c4bee4c9b7eab9bf04f1ba59710981d Mon Sep 17 00:00:00 2001 From: Plamen Petrov Date: Tue, 22 Sep 2020 12:42:38 +0300 Subject: [PATCH 3/4] Use SIGKILL, remove patchDrive artifacts Signed-off-by: Plamen Petrov --- runtime/service.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/runtime/service.go b/runtime/service.go index c7e03feb0..48e6e9dd6 100644 --- a/runtime/service.go +++ b/runtime/service.go @@ -152,9 +152,8 @@ type service struct { vsockPortMu sync.Mutex // httpControlClient is to send pause/resume/snapshot commands to the microVM - httpControlClient *http.Client - firecrackerPid int - taskDrivePathOnHost string + httpControlClient *http.Client + firecrackerPid int } func shimOpts(shimCtx context.Context) (*shim.Opts, error) { @@ -838,7 +837,7 @@ func (s *service) CreateSnapshot(ctx context.Context, req *proto.CreateSnapshotR // Offload Shuts down a VM and deletes the corresponding firecracker socket // and vsock. All of the other resources will persist func (s *service) Offload(ctx context.Context, req *proto.OffloadRequest) (*empty.Empty, error) { - if err := syscall.Kill(s.firecrackerPid, 9); err != nil { + if err := syscall.Kill(s.firecrackerPid, syscall.SIGKILL); err != nil { s.logger.WithError(err).Error("Failed to kill firecracker process") return nil, err } @@ -1061,8 +1060,6 @@ func (s *service) Create(requestCtx context.Context, request *taskAPI.CreateTask } rootfsMnt := request.Rootfs[0] - s.taskDrivePathOnHost = rootfsMnt.Source - err = s.containerStubHandler.Reserve(requestCtx, request.ID, rootfsMnt.Source, vmBundleDir.RootfsPath(), "ext4", nil, s.driveMountClient, s.machine) if err != nil { From 968803a5a4c992bb70c5f013a4d9f76146ecb457 Mon Sep 17 00:00:00 2001 From: Plamen Petrov Date: Tue, 22 Sep 2020 14:46:39 +0300 Subject: [PATCH 4/4] store firecracker logs in shimDir Signed-off-by: Plamen Petrov --- internal/common.go | 6 ++++++ internal/vm/dir.go | 13 +++++++++++++ runtime/service.go | 14 ++++---------- runtime/service_test.go | 2 +- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/internal/common.go b/internal/common.go index 9e7a78b48..cb8cfb7a9 100644 --- a/internal/common.go +++ b/internal/common.go @@ -50,6 +50,12 @@ const ( // ShimLogFifoName is the name of the FIFO created by containerd for a shim to write its logs to ShimLogFifoName = "log" + // LogPathNameStart is the name of the FIFO created by containerd for a shim to write its logs to + LogPathNameStart = "log_start" + + // LogPathNameLoad is the name of the FIFO created by containerd for a shim to write its logs to + LogPathNameLoad = "log_load" + // OCIConfigName is the name of the OCI bundle's config field OCIConfigName = "config.json" diff --git a/internal/vm/dir.go b/internal/vm/dir.go index 228350a35..6c3fdece7 100644 --- a/internal/vm/dir.go +++ b/internal/vm/dir.go @@ -83,6 +83,19 @@ func (d Dir) OpenLogFifo(requestCtx context.Context) (io.ReadWriteCloser, error) return fifo.OpenFifo(requestCtx, d.LogFifoPath(), unix.O_WRONLY|unix.O_NONBLOCK, 0200) } +// LogStartPath returns the path to the file for storing +// firecracker logs after the microVM is started and until +// it is Offloaded +func (d Dir) LogStartPath() string { + return filepath.Join(d.RootPath(), internal.LogPathNameStart) +} + +// LogLoadPath returns the path to the file for storing +// firecracker logs after the microVM is loaded from a snapshot +func (d Dir) LogLoadPath() string { + return filepath.Join(d.RootPath(), internal.LogPathNameLoad) +} + // FirecrackerSockPath returns the path to the unix socket at which the firecracker VMM // services its API func (d Dir) FirecrackerSockPath() string { diff --git a/runtime/service.go b/runtime/service.go index 48e6e9dd6..99eb0ed77 100644 --- a/runtime/service.go +++ b/runtime/service.go @@ -875,13 +875,8 @@ func (s *service) buildVMConfiguration(req *proto.CreateVMRequest) (*firecracker } // TODO: Remove hardcoding and make a parameter - logFilePath := fmt.Sprintf("/tmp/log_%s_start.logs", s.vmID) - if err := os.RemoveAll(logFilePath); err != nil { - s.logger.WithError(err).Errorf("Failed to delete %s", logFilePath) - return nil, err - } - if _, err := os.OpenFile(logFilePath, os.O_RDONLY|os.O_CREATE, 0600); err != nil { - s.logger.WithError(err).Errorf("Failed to create %s", logFilePath) + if _, err := os.OpenFile(s.shimDir.LogStartPath(), os.O_RDONLY|os.O_CREATE, 0600); err != nil { + s.logger.WithError(err).Errorf("Failed to create %s", s.shimDir.LogStartPath()) return nil, err } @@ -892,7 +887,7 @@ func (s *service) buildVMConfiguration(req *proto.CreateVMRequest) (*firecracker ID: "agent_api", }}, // Put LogPath insteadof LogFifo here to comply with the new Firecracker logging - LogPath: logFilePath, + LogPath: s.shimDir.LogStartPath(), MachineCfg: machineConfigurationFromProto(s.config, req.MachineCfg), LogLevel: s.config.DebugHelper.GetFirecrackerLogLevel(), VMID: s.vmID, @@ -1629,8 +1624,7 @@ func (s *service) startFirecrackerProcess() error { return err } - // TODO: Remove hardcoding and make a parameter - logFilePath := fmt.Sprintf("/tmp/log_%s_after.logs", s.vmID) + logFilePath := s.shimDir.LogLoadPath() if err := os.RemoveAll(logFilePath); err != nil { s.logger.WithError(err).Errorf("Failed to delete %s", logFilePath) return err diff --git a/runtime/service_test.go b/runtime/service_test.go index 80e690221..b10417dd3 100644 --- a/runtime/service_test.go +++ b/runtime/service_test.go @@ -260,7 +260,7 @@ func TestBuildVMConfiguration(t *testing.T) { // TODO: FIX TEST WHEN LogPath is no longer hardcoded //tc.expectedCfg.LogFifo = svc.shimDir.FirecrackerLogFifoPath() //tc.expectedCfg.MetricsFifo = svc.shimDir.FirecrackerMetricsFifoPath() - tc.expectedCfg.LogPath = "/tmp/log__start.logs" + tc.expectedCfg.LogPath = svc.shimDir.LogStartPath() drives := make([]models.Drive, tc.expectedStubDriveCount) for i := 0; i < tc.expectedStubDriveCount; i++ {