diff --git a/executors/docker/machine/consts.go b/executors/docker/machine/consts.go index 4de59d65a4acb847e6ff49c0e02dcdfccaf7f1da..168348ebf9a6ca5e311d3f04e6f552aaa2faee83 100644 --- a/executors/docker/machine/consts.go +++ b/executors/docker/machine/consts.go @@ -3,3 +3,6 @@ package machine import "time" var provisionRetryInterval = time.Second +var removalRetryInterval = time.Minute +var useMachineRetries = 10 +var useMachineRetryInterval = 10 * time.Second diff --git a/executors/docker/machine/details.go b/executors/docker/machine/details.go index 731921efafb8434859a7d851a94a6aa1088d1f60..4cf428e301ad12d5e2ae82d146e3cebe5bb422b0 100644 --- a/executors/docker/machine/details.go +++ b/executors/docker/machine/details.go @@ -8,15 +8,17 @@ import ( "github.com/Sirupsen/logrus" "gitlab.com/gitlab-org/gitlab-ci-multi-runner/helpers" + "gitlab.com/gitlab-org/gitlab-ci-multi-runner/helpers/docker" ) type machineDetails struct { - Name string - Created time.Time `yaml:"-"` - Used time.Time `yaml:"-"` - UsedCount int - State machineState - Reason string + Name string + Created time.Time `yaml:"-"` + Used time.Time `yaml:"-"` + UsedCount int + State machineState + Reason string + Credentials *docker_helpers.DockerCredentials } func (m *machineDetails) isUsed() bool { diff --git a/executors/docker/machine/executor.go b/executors/docker/machine/executor.go index 4f5c973bdeb40fd530aec686e310193701d167dc..c5b418347492f872b68e7635a5c913868d5ad7f8 100644 --- a/executors/docker/machine/executor.go +++ b/executors/docker/machine/executor.go @@ -12,6 +12,7 @@ import ( type machineExecutor struct { provider *machineProvider executor common.Executor + build *common.Build data common.ExecutorData config common.RunnerConfig } @@ -24,6 +25,8 @@ func (e *machineExecutor) Shell() *common.ShellScriptInfo { } func (e *machineExecutor) Prepare(globalConfig *common.Config, config *common.RunnerConfig, build *common.Build) (err error) { + e.build = build + // Use the machine e.config, e.data, err = e.provider.Use(config, build.ExecutorData) if err != nil { @@ -32,9 +35,7 @@ func (e *machineExecutor) Prepare(globalConfig *common.Config, config *common.Ru // TODO: Currently the docker-machine doesn't support multiple builds build.ProjectRunnerID = 0 - if details, _ := build.ExecutorData.(*machineDetails); details != nil { - build.Hostname = details.Name - } else if details, _ := e.data.(*machineDetails); details != nil { + if details, _ := e.data.(*machineDetails); details != nil { build.Hostname = details.Name } @@ -66,7 +67,7 @@ func (e *machineExecutor) Cleanup() { } // Release allocated machine - if e.data != "" { + if e.data != nil && e.data != e.build.ExecutorData { e.provider.Release(&e.config, e.data) e.data = nil } diff --git a/executors/docker/machine/provider.go b/executors/docker/machine/provider.go index cc87927deba141a1f8d287f115917c9a75b95b08..bfb1d2b65b668dbd893ed95e9a6d4b15f945b2c0 100644 --- a/executors/docker/machine/provider.go +++ b/executors/docker/machine/provider.go @@ -85,10 +85,8 @@ func (m *machineProvider) findFreeMachine(machines ...string) (details *machineD continue } - // Check if node is running - canConnect := m.machine.CanConnect(name) - if !canConnect { - m.remove(name, "machine is unavailable") + if !m.machine.Exist(name) { + m.remove(name, "machine does not exists") continue } return details @@ -97,28 +95,29 @@ func (m *machineProvider) findFreeMachine(machines ...string) (details *machineD return nil } -func (m *machineProvider) useMachine(config *common.RunnerConfig) (details *machineDetails, err error) { +func (m *machineProvider) findAndUseMachine(config *common.RunnerConfig) (dc docker_helpers.DockerCredentials, details *machineDetails, err error) { machines, err := m.loadMachines(config) if err != nil { return } details = m.findFreeMachine(machines...) - if details == nil { - var errCh chan error - details, errCh = m.create(config, machineStateAcquired) - err = <-errCh + if details != nil { + dc, err = m.useCredentials(config, details) + if err != nil { + details = nil + } } return } -func (m *machineProvider) retryUseMachine(config *common.RunnerConfig) (details *machineDetails, err error) { +func (m *machineProvider) retryFindAndUseMachine(config *common.RunnerConfig) (dc docker_helpers.DockerCredentials, details *machineDetails, err error) { // Try to find a machine - for i := 0; i < 3; i++ { - details, err = m.useMachine(config) + for i := 0; i < useMachineRetries; i++ { + dc, details, err = m.findAndUseMachine(config) if err == nil { break } - time.Sleep(provisionRetryInterval) + time.Sleep(useMachineRetryInterval) } return } @@ -138,7 +137,7 @@ func (m *machineProvider) finalizeRemoval(details *machineDetails) { if err == nil { break } - time.Sleep(30 * time.Second) + time.Sleep(removalRetryInterval) logrus.WithField("name", details.Name). WithField("created", time.Since(details.Created)). WithField("used", time.Since(details.Used)). @@ -174,6 +173,9 @@ func (m *machineProvider) remove(machineName string, reason ...interface{}) { } func (m *machineProvider) updateMachine(config *common.RunnerConfig, data *machinesData, details *machineDetails) error { + m.lock.RLock() + defer m.lock.RUnlock() + if details.State != machineStateIdle { return nil } @@ -233,6 +235,26 @@ func (m *machineProvider) loadMachines(config *common.RunnerConfig) ([]string, e return m.machine.List(machineFilter(config)) } +func (m *machineProvider) useCredentials(config *common.RunnerConfig, details *machineDetails) (dc docker_helpers.DockerCredentials, err error) { + if details.Credentials != nil { + if m.machine.CanConnect(details.Name) { + dc = *details.Credentials + } else { + err = fmt.Errorf("can't connect to: %s", details.Name) + m.Release(config, details) + } + return + } + + dc, err = m.machine.Credentials(details.Name) + if err == nil { + details.Credentials = &dc + } else { + m.Release(config, details) + } + return +} + func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.ExecutorData, err error) { if config.Machine == nil || config.Machine.MachineName == "" { err = fmt.Errorf("Missing Machine options") @@ -253,16 +275,17 @@ func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.Exec // Pre-create machines m.createMachines(config, &machinesData) - m.acquireLock.Unlock() - logrus.WithFields(machinesData.Fields()). WithField("runner", config.ShortDescription()). WithField("minIdleCount", config.Machine.IdleCount). WithField("maxMachines", config.Limit). WithField("time", time.Now()). Debugln("Docker Machine Details") + machinesData.writeDebugInformation() + m.acquireLock.Unlock() + // Try to find a free machine details := m.findFreeMachine(machines...) if details != nil { @@ -271,32 +294,55 @@ func (m *machineProvider) Acquire(config *common.RunnerConfig) (data common.Exec } // If we have a free machines we can process a build - if config.Machine.IdleCount != 0 && machinesData.Idle == 0 { + if config.Machine.IdleCount != 0 { err = errors.New("No free machines that can process builds") } return } +func (m *machineProvider) createAndUseMachine(config *common.RunnerConfig) (dc docker_helpers.DockerCredentials, details *machineDetails, err error) { + var errCh chan error + details, errCh = m.create(config, machineStateAcquired) + err = <-errCh + if err != nil { + return + } + + dc, err = m.useCredentials(config, details) + if err != nil { + details = nil + } + return +} + func (m *machineProvider) Use(config *common.RunnerConfig, data common.ExecutorData) (newConfig common.RunnerConfig, newData common.ExecutorData, err error) { + var dc docker_helpers.DockerCredentials + // Find a new machine details, _ := data.(*machineDetails) + + // Get machine credentials + if details != nil { + dc, err = m.useCredentials(config, details) + if err != nil { + details = nil + } + } + + // Try to find an existing machine if details == nil { - details, err = m.retryUseMachine(config) + dc, details, err = m.retryFindAndUseMachine(config) if err != nil { return } - - // Return details only if this is a new instance - newData = details } - // Get machine credentials - dc, err := m.machine.Credentials(details.Name) - if err != nil { - if newData != nil { - m.Release(config, newData) + // Try to create a new machine + if details == nil { + dc, details, err = m.createAndUseMachine(config) + if err != nil { + return } - return } // Create shallow copy of config and store in it docker credentials @@ -306,6 +352,7 @@ func (m *machineProvider) Use(config *common.RunnerConfig, data common.ExecutorD *newConfig.Docker = *config.Docker } newConfig.Docker.DockerCredentials = dc + newData = details // Mark machine as used details.State = machineStateUsed diff --git a/executors/docker/machine/provider_test.go b/executors/docker/machine/provider_test.go index 38e4e98a6a1e0ed60bcbcc9e6fdf568a808e0d2e..08feb10b032bb79ccc886c62e11e0ba8782f3953 100644 --- a/executors/docker/machine/provider_test.go +++ b/executors/docker/machine/provider_test.go @@ -106,6 +106,9 @@ func (m *testMachine) Remove(name string) error { } func (m *testMachine) Exist(name string) bool { + if strings.Contains(name, "no-can-connect") { + return false + } for _, machine := range m.machines { if machine == name { return true @@ -190,9 +193,9 @@ func assertTotalMachines(t *testing.T, p *machineProvider, expected int, msgAndA return false } -func testMachineProvider(machine ...string) (*machineProvider, *testMachine) { +func testMachineProvider(machines ...string) (*machineProvider, *testMachine) { t := &testMachine{ - machines: machine, + machines: machines, } p := &machineProvider{ details: make(machinesDetails), @@ -222,7 +225,7 @@ func TestMachineDetails(t *testing.T) { } func TestMachineFindFree(t *testing.T) { - p, tm := testMachineProvider("no-can-connect") + p, _ := testMachineProvider("no-can-connect", "machine1", "machine2") d1 := p.findFreeMachine() assert.Nil(t, d1, "no machines, return nil") @@ -236,7 +239,6 @@ func TestMachineFindFree(t *testing.T) { assert.NotNil(t, d4, "acquire a new machine") assert.NotEqual(t, d2, d4, "and it's a different machine") - assert.Len(t, tm.machines, 1, "has one machine") d5 := p.findFreeMachine("machine1", "no-can-connect") assert.Nil(t, d5, "fails to acquire machine to which he can't connect") } @@ -270,33 +272,62 @@ func TestMachineUse(t *testing.T) { p, _ := testMachineProvider("machine1") - d1, err := p.useMachine(machineDefaultConfig) + _, d1, err := p.findAndUseMachine(machineDefaultConfig) assert.NotNil(t, d1) assert.NoError(t, err) assert.Equal(t, machineStateAcquired, d1.State) assert.Equal(t, "machine1", d1.Name, "finds a free machine1") - d2, err := p.useMachine(machineDefaultConfig) - assert.NotNil(t, d2) - assert.NoError(t, err) - assert.Equal(t, machineStateAcquired, d2.State) - assert.NotEqual(t, "machine1", d2.Name, "creates a new machine") - - _, err = p.useMachine(machineProvisionFail) - assert.Error(t, err, "fails to create a new machine") + _, d2, err := p.findAndUseMachine(machineProvisionFail) + assert.Nil(t, d2, "fails to find a machine") + assert.NoError(t, err, "this is not an error") } func TestMachineTestRetry(t *testing.T) { provisionRetryInterval = 0 + useMachineRetryInterval = 0 + useMachineRetries = 3 p, _ := testMachineProvider() - _, err := p.useMachine(machineSecondFail) - assert.Error(t, err, "fails to create a new machine") + _, d, err := p.retryFindAndUseMachine(machineSecondFail) + assert.Nil(t, d, "fails to find a free machine") + assert.NoError(t, err, "this is not an error") +} - p, _ = testMachineProvider() - d1, err := p.retryUseMachine(machineSecondFail) - assert.NoError(t, err, "after replying the same test scenario and using retry it succeeds") - assert.Equal(t, machineStateAcquired, d1.State) +func TestUseCredentials(t *testing.T) { + p, _ := testMachineProvider() + + details := p.machineDetails("machine1", true) + _, err := p.useCredentials(nil, details) + assert.NoError(t, err, "successfully gets credentials") + + _, err = p.useCredentials(nil, details) + assert.NoError(t, err, "successfully gets credentials for second time") + + details = p.machineDetails("no-connect", true) + _, err = p.useCredentials(nil, details) + assert.Error(t, err, "fails to get credentials when can connect to machine") + assert.Equal(t, machineStateIdle, details.State, "machine should be released") + + details = p.machineDetails("no-can-connect", true) + _, err = p.useCredentials(nil, details) + assert.NoError(t, err, "successfully gets credentials for the first time") + assert.Equal(t, machineStateAcquired, details.State, "machine should be acquired") + + _, err = p.useCredentials(nil, details) + assert.Error(t, err, "fails to get credentials when cannnot connect") + assert.Equal(t, machineStateIdle, details.State, "machine should be released") +} + +func TestCreateAndUseMachine(t *testing.T) { + p, _ := testMachineProvider() + + _, d, err := p.createAndUseMachine(machineDefaultConfig) + assert.NoError(t, err, "succesfully creates machine") + assert.Equal(t, machineStateAcquired, d.State, "machine should be acquired") + + _, d, err = p.createAndUseMachine(machineNoConnect) + assert.Error(t, err, "fails to create a machine") } func TestMachineAcquireAndRelease(t *testing.T) { @@ -396,7 +427,7 @@ func TestMachineMaxBuilds(t *testing.T) { _, nd, err := p.Use(config, d) assert.NoError(t, err) - assert.Nil(t, nd, "we passed the data, we should not get the data now") + assert.Equal(t, d, nd, "we passed the data, we should get the same data now") p.Release(config, d) @@ -457,3 +488,44 @@ func TestMachineUseOnDemand(t *testing.T) { assert.Error(t, err, "fail to create a new machine on connect") assertTotalMachines(t, p, 3, "it fails on no-connect, but we leave the machine created") } + +func TestMachineCanConnectFailed(t *testing.T) { + provisionRetryInterval = 0 + removalRetryInterval = 0 + useMachineRetryInterval = 0 + + p, _ := testMachineProvider() + d := p.machineDetails("no-connect", true) + _, nd, err := p.Use(machineProvisionFail, d) + assert.Error(t, err, "it fails to assign a machine") + assert.Nil(t, nd) + assert.Equal(t, machineStateIdle, d.State, "releases invalid machine") +} + +func TestMachineUseCreatesMachine(t *testing.T) { + provisionRetryInterval = 0 + removalRetryInterval = 0 + useMachineRetryInterval = 0 + + p, _ := testMachineProvider() + d := p.machineDetails("no-connect", true) + _, nd, err := p.Use(machineDefaultConfig, d) + assert.NoError(t, err, "it creates a new machine") + assert.NotNil(t, nd) + assert.Equal(t, machineStateIdle, d.State, "releases invalid machine") +} + +func TestMachineUseFindsANewMachine(t *testing.T) { + provisionRetryInterval = 0 + removalRetryInterval = 0 + useMachineRetryInterval = 0 + + p, _ := testMachineProvider("machine2") + d := p.machineDetails("no-connect", true) + _, nd, err := p.Use(machineDefaultConfig, d) + assert.NoError(t, err, "it find an existing machine") + assert.NotNil(t, nd) + d2 := p.machineDetails("machine2", false) + assert.Equal(t, nd, d2, "it should be machine2") + assert.Equal(t, machineStateIdle, d.State, "releases invalid machine") +}