从API route开始看StopContainer接口的调用过程。
// NewRouter initializes a new container router
func NewRouter(b Backend, decoder httputils.ContainerDecoder) router.Router {
r := &containerRouter{
backend: b,
decoder: decoder,
}
r.initRoutes()
return r
}
...
// initRoutes initializes the routes in container router
func (r *containerRouter) initRoutes() {
r.routes = []router.Route{
...
router.NewPostRoute("/containers/{name:.*}/stop", r.postContainersStop),
...
}
}
func (s *containerRouter) postContainersStop(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
...
if err := s.backend.ContainerStop(vars["name"], seconds); err != nil {
return err
}
w.WriteHeader(http.StatusNoContent)
return nil
}
func (cli *DaemonCli) start(opts *daemonOptions) (err error) {
...
d, err := daemon.NewDaemon(ctx, cli.Config, pluginStore)
...
}
// ContainerStop looks for the given container and stops it.
// In case the container fails to stop gracefully within a time duration
// specified by the timeout argument, in seconds, it is forcefully
// terminated (killed).
//
// If the timeout is nil, the container's StopTimeout value is used, if set,
// otherwise the engine default. A negative timeout value can be specified,
// meaning no timeout, i.e. no forceful termination is performed.
func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
container, err := daemon.GetContainer(name)
if err != nil {
return err
}
if !container.IsRunning() {
return containerNotModifiedError{
running: false}
}
if timeout == nil {
stopTimeout := container.StopTimeout()
timeout = &stopTimeout
}
if err := daemon.containerStop(container, *timeout); err != nil {
return errdefs.System(errors.Wrapf(err, "cannot stop container: %s", name))
}
return nil
}
// containerStop sends a stop signal, waits, sends a kill signal.
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
if !container.IsRunning() {
return nil
}
stopSignal := container.StopSignal()
// 1. Send a stop signal
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
// While normally we might "return err" here we're not going to
// because if we can't stop the container by this point then
// it's probably because it's already stopped. Meaning, between
// the time of the IsRunning() call above and now it stopped.
// Also, since the err return will be environment specific we can't
// look for any particular (common) error that would indicate
// that the process is already dead vs something else going wrong.
// So, instead we'll give it up to 2 more seconds to complete and if
// by that time the container is still running, then the error
// we got is probably valid and so we force kill it.
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
return err
}
}
}
// 2. Wait for the process to exit on its own
ctx := context.Background()
if seconds >= 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
defer cancel()
}
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
// 3. If it doesn't, then send SIGKILL
if err := daemon.Kill(container); err != nil {
// Wait without a timeout, ignore result.
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
}
}
daemon.LogContainerEvent(container, "stop")
return nil
}
container.StopSignal()
优先用容器指定的信号,如果没有则默认是SIGTERM
, 如果2s
后容器仍未退出,再按上层(kubelet)指定的超时时间等待容器退出。
如果容器始终未退出,daemon.Kill(container)
给容器发送SIGKILL
信号,强制容器退出。
这里涉及容器的两种启动方式:
- shell格式
PID1进程为
/bin/sh -c
,
因为/bin/sh不会转发信号至任何子进程。所以我们的应用将永远不会收到SIGTERM信号。显然要解决这个问题,就需要将我们的进程作为PID1进程运行。
- exec格式
PID进程为应用程序执行文件(脚本或二进制), 我们的程序捕获了
docker stop
命令发送的SIGTERM信号
优先看下强制删除的过程
// Kill forcefully terminates a container.
func (daemon *Daemon) Kill(container *containerpkg.Container) error {
if !container.IsRunning() {
return errNotRunning(container.ID)
}
// 1. Send SIGKILL
if err := daemon.killPossiblyDeadProcess(container, int(syscall.SIGKILL)); err != nil {
// While normally we might "return err" here we're not going to
// because if we can't stop the container by this point then
// it's probably because it's already stopped. Meaning, between
// the time of the IsRunning() call above and now it stopped.
// Also, since the err return will be environment specific we can't
// look for any particular (common) error that would indicate
// that the process is already dead vs something else going wrong.
// So, instead we'll give it up to 2 more seconds to complete and if
// by that time the container is still running, then the error
// we got is probably valid and so we return it to the caller.
if isErrNoSuchProcess(err) {
return nil
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
return err
}
}
// 2. Wait for the process to die, in last resort, try to kill the process directly
if err := killProcessDirectly(container); err != nil {
if isErrNoSuchProcess(err) {
return nil
}
return err
}
// Wait for exit with no timeout.
// Ignore returned status.
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
return nil
}
killWithSignal()
先从容器层面尝试停止容器,如果容器是 Restarting
状态,就放弃这次的Kill操作。
如果容器时 Paused
状态,先执行Resume,在容器恢复后,立刻发送SIGKILL。
等待2s,容器状态没有转成 NotRunning, 就直接给容器中的进程发送SIGKILL。到这里再等上10s,如果容器还不退,就查询容器的1号进程,发送SIGKILL。
<-container.Wait
发送完SIGKILL后,开始阻塞等, 这次没有设置超时,就是死等, 这时当前goroutine 握着一把容器级别的锁(state.Lock()) 。
TODO: daemon.containerd.Resume()
// killWithSignal sends the container the given signal. This wrapper for the
// host specific kill command prepares the container before attempting
// to send the signal. An error is returned if the container is paused
// or not running, or if there is a problem returned from the
// underlying kill command.
func (daemon *Daemon) killWithSignal(container *containerpkg.Container, sig int) error {
logrus.Debugf("Sending kill signal %d to container %s", sig, container.ID)
container.Lock()
defer container.Unlock()