Skip to content

Commit e40435e

Browse files
committed
Do not recreate container when scale resource
1 parent 2f0b190 commit e40435e

File tree

9 files changed

+130
-27
lines changed

9 files changed

+130
-27
lines changed

cluster/cluster.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ type Cluster interface {
3333
InspectContainer(id string) (adoc.ContainerDetail, error)
3434
RemoveContainer(id string, force bool, volumes bool) error
3535
RenameContainer(id string, name string) error
36+
UpdateContainer(id string, config interface{}) error
3637

3738
MonitorEvents(filter string, callback adoc.EventCallback) int64
3839
StopMonitor(monitorId int64)

engine/config.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,16 @@ type Guard struct {
2424
Working bool `json:"Working"`
2525
}
2626

27+
type CUpdateConfig struct {
28+
CPUPeriod int64 `json:"CpuPeriod,omitempty"` // CPU CFS (Completely Fair Scheduler) period
29+
CPUQuota int64 `json:"CpuQuota,omitempty"` // CPU CFS (Completely Fair Scheduler) quota
30+
Memory int64 `json:"Memory,omitempty"` // Memory limit (in bytes)
31+
MemorySwap int64 `json:"MemorySwap,omitempty"` // Total memory usage (memory + swap); set `-1` to enable unlimited swap
32+
}
33+
2734
const (
28-
EtcdResourcesKey = "/lain/config/resources"
29-
EtcdGuardSwitchKey = "/lain/config/guardswitch"
35+
EtcdResourcesKey = "/lain/config/resources"
36+
EtcdGuardSwitchKey = "/lain/config/guardswitch"
3037
EtcdCloudVolumeRootKey = "/lain/config/cloud_volumes_root"
3138
EtcdVolumeRootKey = "/lain/config/volumes_root"
3239

engine/pod.go

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,42 @@ func (pc *podController) Drift(cluster cluster.Cluster, fromNode, toNode string,
148148
return true
149149
}
150150

151+
func (pc *podController) Update(cluster cluster.Cluster) error {
152+
log.Infof("%s updating", pc)
153+
start := time.Now()
154+
defer func() {
155+
pc.spec.Filters = []string{} // clear the filter
156+
pc.pod.UpdatedAt = time.Now()
157+
log.Infof("%s updated, state=%+v, duration=%s", pc, pc.pod.ImRuntime, time.Now().Sub(start))
158+
}()
159+
var err error
160+
for i, cSpec := range pc.spec.Containers {
161+
e := pc.updateContainer(cluster, i)
162+
if e != nil {
163+
log.Warnf("%s Cannot update container, error=%q, spec=%+v", pc, err, cSpec)
164+
if err == nil {
165+
err = e
166+
}
167+
}
168+
id := pc.pod.Containers[i].Id
169+
pc.startContainer(cluster, id)
170+
pc.refreshContainer(cluster, i)
171+
if i == 0 && pc.pod.Containers[0].NodeName != "" {
172+
pc.spec.PrevState.NodeName = pc.pod.Containers[i].NodeName
173+
}
174+
pc.spec.PrevState.IPs[i] = pc.pod.Containers[i].ContainerIp
175+
}
176+
if pc.pod.State == RunStatePending {
177+
if err == nil {
178+
pc.pod.State = RunStateSuccess
179+
} else {
180+
pc.pod.State = RunStateError
181+
}
182+
pc.pod.TargetState = ExpectStateRun
183+
}
184+
return err
185+
}
186+
151187
func (pc *podController) Remove(cluster cluster.Cluster) {
152188
log.Infof("%s removing", pc)
153189
start := time.Now()
@@ -338,6 +374,7 @@ func (pc *podController) refreshContainer(kluster cluster.Cluster, index int) {
338374
if network == "" {
339375
network = pc.spec.Namespace
340376
}
377+
log.Infof("pc.spec.PrevState.IPs:%v", pc.spec.PrevState.IPs)
341378
prevIP, nowIP := pc.spec.PrevState.IPs[index], info.NetworkSettings.Networks[network].IPAddress
342379
// NOTE: if the container's ip is not equal to prev ip, try to correct it; if failed, accept new ip
343380
if prevIP != "" && prevIP != nowIP {
@@ -406,6 +443,19 @@ func (pc *podController) createContainer(cluster cluster.Cluster, filters []stri
406443
return cluster.CreateContainer(cc, hc, nc, name)
407444
}
408445

446+
func (pc *podController) updateContainer(cluster cluster.Cluster, index int) error {
447+
podSpec := pc.spec
448+
spec := podSpec.Containers[index]
449+
id := pc.pod.Containers[index].Id
450+
config := &CUpdateConfig{
451+
Memory: spec.MemoryLimit,
452+
MemorySwap: spec.MemoryLimit, // Memory == MemorySwap means disable swap
453+
CPUPeriod: CPUQuota,
454+
CPUQuota: int64(spec.CpuLimit*resource.Cpu*CPUMaxPctg) * CPUQuota / int64(CPUMaxLevel*100),
455+
}
456+
return cluster.UpdateContainer(id, config)
457+
}
458+
409459
func (pc *podController) createContainerConfig(filters []string, index int) adoc.ContainerConfig {
410460
podSpec := pc.spec
411461
spec := podSpec.Containers[index]
@@ -527,7 +577,7 @@ func (pc *podController) createHostConfig(index int) adoc.HostConfig {
527577
Resources: adoc.Resources{
528578
Memory: spec.MemoryLimit,
529579
MemorySwap: spec.MemoryLimit, // Memory == MemorySwap means disable swap
530-
MemorySwappiness: &swappiness,
580+
MemorySwappiness: &swappiness,
531581
CPUPeriod: CPUQuota,
532582
CPUQuota: int64(spec.CpuLimit*resource.Cpu*CPUMaxPctg) * CPUQuota / int64(CPUMaxLevel*100),
533583
BlkioDeviceReadBps: BlkioDeviceReadBps,

engine/podgroup.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -204,21 +204,29 @@ func (pgCtrl *podGroupController) RescheduleSpec(podSpec PodSpec) {
204204
if ok := pgCtrl.updatePodPorts(podSpec); !ok {
205205
return
206206
}
207-
// store oldPodSpec for rollback(with ttl 10min)
208-
pgCtrl.opsChan <- pgOperCacheLastSpec{spec: spec}
209-
210207
oldPodSpec := spec.Pod.Clone()
211208
spec.Pod = spec.Pod.Merge(podSpec)
212-
spec.Version += 1
213209
spec.UpdatedAt = time.Now()
210+
reDeploy := shouldReDeploy(oldPodSpec, podSpec)
211+
if reDeploy {
212+
// store oldPodSpec for rollback(with ttl 10min)
213+
pgCtrl.opsChan <- pgOperCacheLastSpec{spec: spec}
214+
spec.Version += 1
215+
} else {
216+
spec.Pod.Version -= 1
217+
}
214218
pgCtrl.Lock()
215219
pgCtrl.spec = spec
216220
pgCtrl.Unlock()
217221
pgCtrl.opsChan <- pgOperLogOperation{"Start to reschedule spec"}
218222
pgCtrl.opsChan <- pgOperSaveStore{true}
219223
pgCtrl.opsChan <- pgOperSnapshotEagleView{spec.Name}
220224
for i := 0; i < spec.NumInstances; i += 1 {
221-
pgCtrl.opsChan <- pgOperUpgradeInstance{i + 1, spec.Version, oldPodSpec, spec.Pod}
225+
if reDeploy {
226+
pgCtrl.opsChan <- pgOperUpgradeInstance{i + 1, spec.Version, oldPodSpec, spec.Pod}
227+
} else {
228+
pgCtrl.opsChan <- pgOperUpdateInsConfig{i + 1, spec.Version, oldPodSpec, spec.Pod}
229+
}
222230
pgCtrl.opsChan <- pgOperSnapshotGroup{true}
223231
pgCtrl.opsChan <- pgOperSaveStore{true}
224232
}
@@ -336,7 +344,6 @@ func (pgCtrl *podGroupController) Activate(c cluster.Cluster, store storage.Stor
336344
}
337345

338346
func (pgCtrl *podGroupController) LastSpec() *PodGroupSpec {
339-
log.Infof("Fetch LastPodSpec !")
340347
var lastSpec PodGroupSpec
341348
if err := pgCtrl.engine.store.Get(pgCtrl.lastPodSpecKey, &lastSpec); err != nil {
342349
log.Infof("Fetch LastPodSpec with err:%v", err)
@@ -705,3 +712,17 @@ func newPodGroupController(spec PodGroupSpec, states []PodPrevState, pg PodGroup
705712
pgCtrl.Publisher = NewPublisher(true)
706713
return pgCtrl
707714
}
715+
716+
// Assume a reschedule-spec operation that changes MemoryLimit or CpuLimit will not change any other pod spec fields
717+
func shouldReDeploy(oldSpec, newSpec PodSpec) bool {
718+
if len(oldSpec.Containers) != len(newSpec.Containers) {
719+
return true
720+
}
721+
for i, _ := range newSpec.Containers {
722+
if oldSpec.Containers[i].MemoryLimit != newSpec.Containers[i].MemoryLimit ||
723+
oldSpec.Containers[i].CpuLimit != newSpec.Containers[i].CpuLimit {
724+
return false
725+
}
726+
}
727+
return true
728+
}

engine/podgroup_ops.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ func (op pgOperRefreshInstance) Do(pgCtrl *podGroupController, c cluster.Cluster
158158
pgCtrl.RUnlock()
159159
}()
160160

161-
if(op.instanceNo > len(pgCtrl.podCtrls)){
161+
if op.instanceNo > len(pgCtrl.podCtrls) {
162162
log.Warnf("Pod is not exists")
163163
return false
164164
}
@@ -314,6 +314,33 @@ func (op pgOperVerifyInstanceCount) Do(pgCtrl *podGroupController, c cluster.Clu
314314
return false
315315
}
316316

317+
type pgOperUpdateInsConfig struct {
318+
instanceNo int
319+
version int
320+
oldPodSpec PodSpec
321+
newPodSpec PodSpec
322+
}
323+
324+
func (op pgOperUpdateInsConfig) Do(pgCtrl *podGroupController, c cluster.Cluster, store storage.Store, ev *RuntimeEagleView) bool {
325+
var runtime ImRuntime
326+
start := time.Now()
327+
defer func() {
328+
pgCtrl.RLock()
329+
log.Infof("%s update instance, op=%+v, runtime=%+v, duration=%s", pgCtrl, op, runtime, time.Now().Sub(start))
330+
pgCtrl.RUnlock()
331+
}()
332+
podCtrl := pgCtrl.podCtrls[op.instanceNo-1]
333+
newPodSpec := op.newPodSpec.Clone()
334+
newPodSpec.PrevState = podCtrl.spec.PrevState.Clone() // update action, state should not change
335+
podCtrl.spec = newPodSpec
336+
podCtrl.pod.State = RunStatePending
337+
if err := podCtrl.Update(c); err != nil {
338+
lowOp := pgOperUpgradeInstance{op.instanceNo, op.version, op.oldPodSpec, op.newPodSpec}
339+
lowOp.Do(pgCtrl, c, store, ev)
340+
}
341+
return false
342+
}
343+
317344
type pgOperDeployInstance struct {
318345
instanceNo int
319346
version int

engine/runtimes.go

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package engine
22

33
import (
4-
"github.com/mijia/adoc"
5-
"github.com/mijia/sweb/log"
64
"time"
5+
6+
"github.com/mijia/adoc"
77
)
88

99
type RunState int
@@ -14,16 +14,16 @@ type PGOpState int32
1414
var RestartMaxCount int
1515

1616
const (
17-
RunStatePending = iota // waiting for operation
18-
RunStateDrift // drifting from one node to another
19-
RunStateSuccess // ok
20-
RunStateExit // exited
21-
RunStateFail // start failed with error
22-
RunStateInconsistent // container's state is different between deployd and swarm
23-
RunStateMissing // container is missing and need create it. happened when node down .etc
24-
RunStateRemoved // removed
25-
RunStatePaused // paused
26-
RunStateError // call docker interface with error
17+
RunStatePending = iota // waiting for operation
18+
RunStateDrift // drifting from one node to another
19+
RunStateSuccess // ok
20+
RunStateExit // exited
21+
RunStateFail // start failed with error
22+
RunStateInconsistent // container's state is different between deployd and swarm
23+
RunStateMissing // container is missing and need create it. happened when node down .etc
24+
RunStateRemoved // removed
25+
RunStatePaused // paused
26+
RunStateError // call docker interface with error
2727
)
2828

2929
const (
@@ -251,7 +251,6 @@ func (pod Pod) PodIp() string {
251251

252252
func (pod *Pod) ChangeTargetState(state ExpectState) {
253253
pod.TargetState = state
254-
log.Infof("target state:::%v", state)
255254
}
256255

257256
type PodGroup struct {

engine/specs.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ const (
3131

3232
MinPodKillTimeout = 10
3333
MaxPodKillTimeout = 120
34-
3534
)
3635

3736
var (
@@ -441,7 +440,6 @@ func (s PodSpec) Merge(o PodSpec) PodSpec {
441440
s.Stateful = o.Stateful
442441
s.Version += 1
443442
s.UpdatedAt = time.Now()
444-
s.PrevState = o.PrevState
445443
s.SetupTime = o.SetupTime
446444
s.KillTimeout = o.KillTimeout
447445
s.HealthConfig = o.HealthConfig

glide.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

glide.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import:
1010
- store
1111
- store/etcd
1212
- package: github.com/mijia/adoc
13-
version: 1ef227e439ebbac803b4b9ec6f9a111edd4d6831
13+
version: 61dbc8d45a4512b5e1e5c1ff25773cee578418b9
1414
- package: github.com/mijia/go-generics
1515
- package: github.com/mijia/sweb
1616
subpackages:

0 commit comments

Comments
 (0)