Skip to content

Commit

Permalink
Support pods with multiple containers (but boost only one container) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
norbjd authored Nov 2, 2023
1 parent d96c835 commit c5e4583
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 27 deletions.
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@ Simple PoC to give pods a CPU boost during startup (before pod is `Ready`).

:warning: **this is pre-alpha / work in progress, don't use in production** :warning:

- supports Kubernetes clusters starting from version 1.27 only (for older versions, see [`v0.1.0`](https://github.com/norbjd/k8s-pod-cpu-booster/tree/v0.1.0))
- requires pods to have a `readinessProbe` configured, and a value for `resources.limits.cpu`
- only works with pods with a single container

Between startup and `Ready` status, the pod benefits from a CPU boost (x10).
- supports Kubernetes clusters starting from version 1.27 only with `InPlacePodVerticalScaling` feature gate enabled (for older versions, see [`v0.1.0`](https://github.com/norbjd/k8s-pod-cpu-booster/tree/v0.1.0))
- requires the container to boost to have:
  - a `readinessProbe` configured
  - a value for `resources.limits.cpu`
  - a `resizePolicy` with `resourceName: cpu` and `restartPolicy: NotRequired`
- works with pods with multiple containers, but can **only** boost a **single** container inside the pod

Between startup and `Ready` status, the container benefits from a CPU boost (x10).

## How does it work?

Expand Down
39 changes: 39 additions & 0 deletions examples/knative-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Example Knative Service demonstrating a startup CPU boost of the user container.
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: example
spec:
  template:
    metadata:
      annotations:
        # Give the Knative queue-proxy sidecar enough CPU, otherwise it takes a while to be ready.
        queue.sidecar.serving.knative.dev/cpu-resource-request: "300m"
        queue.sidecar.serving.knative.dev/cpu-resource-limit: "300m"
        queue.sidecar.serving.knative.dev/memory-resource-request: "10M"
        queue.sidecar.serving.knative.dev/memory-resource-limit: "10M"
        # Enable the CPU booster and point it at the container to boost.
        norbjd.github.io/k8s-pod-cpu-booster-enabled: "true"
        norbjd.github.io/k8s-pod-cpu-booster-container: "user-container"
    spec:
      containers:
        - image: python:3.11-alpine
          command: ["python"]
          args:
            - -m
            - http.server
          ports:
            - containerPort: 8000
          # A readinessProbe is not strictly necessary because Knative already configures one
          # through the queue-proxy, which aggressively polls the user container at startup.
          # But, if needed, it can still be tuned here:
          readinessProbe:
            initialDelaySeconds: 0
            periodSeconds: 1
            successThreshold: 1
            timeoutSeconds: 1
          # resizePolicy is not required here, because by default it is set to update the
          # resources in place, without restarting the pod!
          resources:
            requests:
              cpu: 50m
              memory: 100M
            limits:
              cpu: 50m
              memory: 100M
      timeoutSeconds: 1 # short request timeout, also used to terminate the pod quickly
43 changes: 43 additions & 0 deletions examples/pod-with-multiple-containers-and-default-boost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Example: a two-container pod where only the "python" container gets a CPU boost,
# selected via the norbjd.github.io/k8s-pod-cpu-booster-container annotation.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    norbjd.github.io/k8s-pod-cpu-booster-enabled: "true"
    norbjd.github.io/k8s-pod-cpu-booster-container: "python"
  name: pod-with-multiple-containers-and-default-boost
spec:
  containers:
    - image: python:3.11-alpine
      name: python
      command: ["python"]
      args:
        - -m
        - http.server
      # The booster requires a readinessProbe: the boost is reset once the pod is Ready.
      readinessProbe:
        tcpSocket:
          port: 8000
        initialDelaySeconds: 0
        periodSeconds: 1
        successThreshold: 1
        timeoutSeconds: 1
      resizePolicy:
        - resourceName: cpu
          # NotRequired is very important: it allows updating the CPU resources
          # in place (no need to restart the pod).
          restartPolicy: NotRequired
      resources:
        requests:
          cpu: 50m
          memory: 100M
        limits:
          cpu: 50m
          memory: 100M
    - image: alpine:3.18 # a dummy container just sleeping, never boosted
      name: sleep
      command: ["sleep", "infinity"]
      resources:
        requests:
          cpu: 50m
          memory: 50M
        limits:
          cpu: 50m
          memory: 50M
  terminationGracePeriodSeconds: 0
82 changes: 60 additions & 22 deletions pkg/informer/informer.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ const (
cpuBoostMultiplierAnnotation = "norbjd.github.io/k8s-pod-cpu-booster-multiplier"
cpuBoostDefaultMultiplier = uint64(10)

cpuBoostContainerNameAnnotation = "norbjd.github.io/k8s-pod-cpu-booster-container"

cpuBoostProgressLabelName = "norbjd.github.io/k8s-pod-cpu-booster-progress"
cpuBoostInProgressLabelValue = "boosting"
cpuBoostDoneLabelValue = "has-been-boosted"
Expand Down Expand Up @@ -93,23 +95,44 @@ func onUpdate(clientset *kubernetes.Clientset, oldObj interface{}, newObj interf
return
}

if len(newPod.Spec.Containers) != 1 {
klog.Infof("pod %s/%s contains %d containers, skipping...", newPod.Namespace, newPod.Name, len(newPod.Spec.Containers))
return
containerNameToBoost := newPod.Annotations[cpuBoostContainerNameAnnotation]

containerIndex := -1

if containerNameToBoost == "" {
if len(newPod.Spec.Containers) > 1 {
klog.Warningf("pod %s/%s contains %d containers but annotation %s is unset, skipping...",
newPod.Namespace, newPod.Name, len(newPod.Spec.Containers), cpuBoostContainerNameAnnotation)
return
} else {
containerIndex = 0
}
} else {
for i, container := range newPod.Spec.Containers {
if container.Name == containerNameToBoost {
containerIndex = i
break
}
}

if containerIndex == -1 {
klog.Warningf("pod %s/%s contains no containers named %s (found in annotation %s), skipping...",
newPod.Namespace, newPod.Name, containerNameToBoost, cpuBoostContainerNameAnnotation)
return
}
}

currentCPULimit := newPod.Spec.Containers[0].Resources.Limits.Cpu()
boostMultiplier := getBoostMultiplierFromAnnotations(newPod)

if podJustStartedAndNotReadyYet(newPod) {
klog.Infof("will boost %s/%s CPU limit (currently: %s)", newPod.Namespace, newPod.Name, currentCPULimit)
err := boostCPU(clientset, newPod, currentCPULimit, boostMultiplier)
klog.Infof("will boost %s/%s (container %s) CPU limit", newPod.Namespace, newPod.Name, containerNameToBoost)
err := boostCPU(clientset, newPod, containerIndex, boostMultiplier)
if err != nil {
klog.Errorf("error while boosting CPU: %s", err.Error())
}
} else if podIsNowReadyAfterBoosting(newPod) {
klog.Infof("will reset %s/%s CPU limit to default (currently: %s)", newPod.Namespace, newPod.Name, currentCPULimit)
err := resetCPUBoost(clientset, newPod, currentCPULimit, boostMultiplier)
klog.Infof("will reset %s/%s (container %s) CPU limit to default", newPod.Namespace, newPod.Name, containerNameToBoost)
err := resetCPUBoost(clientset, newPod, containerIndex, boostMultiplier)
if err != nil {
klog.Errorf("error while resetting CPU boost: %s", err.Error())
}
Expand Down Expand Up @@ -143,23 +166,31 @@ func podJustStartedAndNotReadyYet(pod *corev1.Pod) bool {
return pod.Status.Phase == corev1.PodRunning && pod.Labels[cpuBoostProgressLabelName] == ""
}

func boostCPU(clientset *kubernetes.Clientset, pod *corev1.Pod, currentCPULimit *resource.Quantity, boostMultiplier uint64) error {
func boostCPU(clientset *kubernetes.Clientset, pod *corev1.Pod, containerIndex int, boostMultiplier uint64) error {
container := pod.Spec.Containers[containerIndex]
currentCPULimit := container.Resources.Limits.Cpu()
cpuLimitAfterBoost := resource.NewScaledQuantity(currentCPULimit.ScaledValue(resource.Nano)*int64(boostMultiplier), resource.Nano)
klog.Infof("Will set new CPU limit to: %s", cpuLimitAfterBoost)

err := writeCPULimit(clientset, pod, cpuLimitAfterBoost, boost)
klog.Infof("Current CPU limit for %s/%s (container %s) is %s, will set new CPU limit to %s",
pod.Namespace, pod.Name, container.Name, currentCPULimit, cpuLimitAfterBoost)

err := writeCPULimit(clientset, pod, containerIndex, cpuLimitAfterBoost, boost)
if err != nil {
return err
}

return nil
}

func resetCPUBoost(clientset *kubernetes.Clientset, pod *corev1.Pod, currentCPULimit *resource.Quantity, boostMultiplier uint64) error {
func resetCPUBoost(clientset *kubernetes.Clientset, pod *corev1.Pod, containerIndex int, boostMultiplier uint64) error {
container := pod.Spec.Containers[containerIndex]
currentCPULimit := container.Resources.Limits.Cpu()
cpuLimitAfterReset := resource.NewScaledQuantity(currentCPULimit.ScaledValue(resource.Nano)/int64(boostMultiplier), resource.Nano)
klog.Infof("Will reset CPU limit to: %s", cpuLimitAfterReset)

err := writeCPULimit(clientset, pod, cpuLimitAfterReset, reset)
klog.Infof("Current CPU limit for %s/%s (container %s) is %s, will reset CPU limit to %s",
pod.Namespace, pod.Name, container.Name, currentCPULimit, cpuLimitAfterReset)

err := writeCPULimit(clientset, pod, containerIndex, cpuLimitAfterReset, reset)
if err != nil {
return err
}
Expand All @@ -174,7 +205,7 @@ const (
reset
)

func writeCPULimit(clientset *kubernetes.Clientset, pod *corev1.Pod, cpuLimit *resource.Quantity, action action) error {
func writeCPULimit(clientset *kubernetes.Clientset, pod *corev1.Pod, containerIndex int, cpuLimit *resource.Quantity, action action) error {
ctx := context.Background()
podsClient := clientset.CoreV1().Pods(pod.Namespace)

Expand Down Expand Up @@ -204,33 +235,40 @@ func writeCPULimit(clientset *kubernetes.Clientset, pod *corev1.Pod, cpuLimit *r
return fmt.Errorf("unknown action: %d (expected %d or %d)", action, boost, reset)
}

container := result.Spec.Containers[containerIndex]

newResources := corev1.ResourceRequirements{
Requests: make(corev1.ResourceList),
Limits: make(corev1.ResourceList),
Claims: result.Spec.Containers[0].Resources.Claims,
Claims: container.Resources.Claims,
}

for resourceName, resourceQuantity := range result.Spec.Containers[0].Resources.Requests {
for resourceName, resourceQuantity := range container.Resources.Requests {
newResources.Requests[resourceName] = resourceQuantity
}

for resourceName, resourceQuantity := range result.Spec.Containers[0].Resources.Limits {
for resourceName, resourceQuantity := range container.Resources.Limits {
newResources.Limits[resourceName] = resourceQuantity
}

newResources.Requests[corev1.ResourceCPU] = *cpuLimit
newResources.Limits[corev1.ResourceCPU] = *cpuLimit

result.Spec.Containers[0].Resources = newResources
container.Resources = newResources

result.Spec.Containers[containerIndex] = container

updatedPod, updateErr := podsClient.Update(ctx, result, metav1.UpdateOptions{})
if updateErr != nil {
return updateErr
}

klog.Infof("CPU request/limit successfully updated to %s/%s",
updatedPod.Spec.Containers[0].Resources.Requests.Cpu(),
updatedPod.Spec.Containers[0].Resources.Limits.Cpu(),
klog.Infof("CPU request/limit for %s/%s (container %s) successfully updated to %s/%s",
updatedPod.Namespace,
updatedPod.Name,
container.Name,
updatedPod.Spec.Containers[containerIndex].Resources.Requests.Cpu(),
updatedPod.Spec.Containers[containerIndex].Resources.Limits.Cpu(),
)
return nil
})
Expand Down

0 comments on commit c5e4583

Please sign in to comment.