Skip to content

Commit

Permalink
add GPU monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
bxy4543 committed Jul 27, 2023
1 parent 0adc21d commit 7380ae5
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 35 deletions.
120 changes: 120 additions & 0 deletions controllers/pkg/common/gpu/nvidia.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package gpu

import (
"context"

corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// Label keys under the nvidia.com/ prefix that describe NVIDIA GPUs on a node.
// GetNodeGpuModel looks up keys from this block in node.Labels.
// NOTE(review): these look like the labels published by NVIDIA GPU Feature
// Discovery / the GPU Operator — confirm against the cluster setup.
const (
// NvidiaGpuKey is the bare "nvidia.com/gpu" key.
NvidiaGpuKey = "nvidia.com/gpu"
// CUDA driver and runtime version components.
NvidiaCudaDriverMajorKey = "nvidia.com/cuda.driver.major"
NvidiaCudaDriverMinorKey = "nvidia.com/cuda.driver.minor"
NvidiaCudaDriverRevKey = "nvidia.com/cuda.driver.rev"
NvidiaCudaRuntimeMajorKey = "nvidia.com/cuda.runtime.major"
NvidiaCudaRuntimeMinorKey = "nvidia.com/cuda.runtime.minor"
NvidiaGfdTimestampKey = "nvidia.com/gfd.timestamp"
NvidiaGpuComputeMajorKey = "nvidia.com/gpu.compute.major"
NvidiaGpuComputeMinorKey = "nvidia.com/gpu.compute.minor"
NvidiaGpuCountKey = "nvidia.com/gpu.count"
// nvidia.com/gpu.deploy.* keys, one per deployable component.
NvidiaGpuDeployContainerToolkitKey = "nvidia.com/gpu.deploy.container-toolkit"
NvidiaGpuDeployDcgmKey = "nvidia.com/gpu.deploy.dcgm"
NvidiaGpuDeployDcgmExporterKey = "nvidia.com/gpu.deploy.dcgm-exporter"
NvidiaGpuDeployDevicePluginKey = "nvidia.com/gpu.deploy.device-plugin"
NvidiaGpuDeployDriverKey = "nvidia.com/gpu.deploy.driver"
NvidiaGpuDeployGpuFeatureDiscoveryKey = "nvidia.com/gpu.deploy.gpu-feature-discovery"
NvidiaGpuDeployNodeStatusExporterKey = "nvidia.com/gpu.deploy.node-status-exporter"
NvidiaGpuDeployOperatorValidatorKey = "nvidia.com/gpu.deploy.operator-validator"
NvidiaGpuFamilyKey = "nvidia.com/gpu.family"
NvidiaGpuMachineKey = "nvidia.com/gpu.machine"
NvidiaGpuMemoryKey = "nvidia.com/gpu.memory"
NvidiaGpuPresentKey = "nvidia.com/gpu.present"
NvidiaGpuProductKey = "nvidia.com/gpu.product"
NvidiaGpuReplicasKey = "nvidia.com/gpu.replicas"
// MIG-related keys.
NvidiaMigCapableKey = "nvidia.com/mig.capable"
NvidiaMigStrategyKey = "nvidia.com/mig.strategy"
)

// NvidiaGPU groups the NVIDIA-related label values recorded for one node.
// GetNodeGpuModel returns one NvidiaGPU per node, keyed by node name.
type NvidiaGPU struct {
GpuInfo Information
CudaInfo CudaInformation
GpuDeploy Deployment
GpuDetails DetailInformation
MigInfo MigInformation
}

// Information holds the basic GPU label values of a node as raw label
// strings: nvidia.com/gpu, gpu.count, gpu.present, gpu.product and
// gpu.replicas. A label that is absent on the node yields an empty string.
type Information struct {
Gpu string
GpuCount string
GpuPresent string
GpuProduct string
GpuReplicas string
}

// CudaInformation holds the nvidia.com/cuda.driver.* and
// nvidia.com/cuda.runtime.* label values of a node as raw label strings.
type CudaInformation struct {
CudaDriverMajor string
CudaDriverMinor string
CudaDriverRev string
CudaRuntimeMajor string
CudaRuntimeMinor string
}

// Deployment has one string field per nvidia.com/gpu.deploy.* label key.
// NOTE(review): presumably each field carries the raw value of the matching
// NvidiaGpuDeploy*Key label — confirm against the code that fills it.
type Deployment struct {
GpuDeployContainerToolkit string
GpuDeployDcgm string
GpuDeployDcgmExporter string
GpuDeployDevicePlugin string
GpuDeployDriver string
GpuDeployGpuFeatureDiscovery string
GpuDeployNodeStatusExporter string
GpuDeployOperatorValidator string
}

// DetailInformation has string fields named after the gpu.compute.*,
// gpu.family, gpu.machine, gpu.memory and gfd.timestamp label keys.
// NOTE(review): presumably each field carries the raw value of the matching
// label; units (e.g. of GpuMemory) are not visible here — confirm.
type DetailInformation struct {
GpuComputeMajor string
GpuComputeMinor string
GpuFamily string
GpuMachine string
GpuMemory string
GfdTimestamp string
}

// MigInformation has string fields named after the nvidia.com/mig.capable
// and nvidia.com/mig.strategy label keys.
// NOTE(review): presumably the raw label values — confirm.
type MigInformation struct {
MigCapable string
MigStrategy string
}

//nvidia.com/gpu

// GetNodeGpuModel lists the nodes visible through c and returns, keyed by
// node name, the NVIDIA label values found on each node.
//
// Every listed node gets an entry. A label that is missing on a node (or a
// node without labels at all) yields an empty string for the corresponding
// field, because reading an absent key from the label map returns "".
//
// All five sections of NvidiaGPU are populated: GpuInfo, CudaInfo, GpuDeploy,
// GpuDetails and MigInfo.
//
// It returns a nil map together with the error from c.List if listing fails.
func GetNodeGpuModel(c client.Client) (map[string]NvidiaGPU, error) {
	nodeList := &corev1.NodeList{}
	if err := c.List(context.Background(), nodeList); err != nil {
		return nil, err
	}

	gpuModels := make(map[string]NvidiaGPU, len(nodeList.Items))
	// Index instead of ranging by value so each Node is not copied.
	for i := range nodeList.Items {
		node := &nodeList.Items[i]
		labels := node.Labels
		gpuModels[node.Name] = NvidiaGPU{
			GpuInfo: Information{
				Gpu:         labels[NvidiaGpuKey],
				GpuCount:    labels[NvidiaGpuCountKey],
				GpuPresent:  labels[NvidiaGpuPresentKey],
				GpuProduct:  labels[NvidiaGpuProductKey],
				GpuReplicas: labels[NvidiaGpuReplicasKey],
			},
			CudaInfo: CudaInformation{
				CudaDriverMajor:  labels[NvidiaCudaDriverMajorKey],
				CudaDriverMinor:  labels[NvidiaCudaDriverMinorKey],
				CudaDriverRev:    labels[NvidiaCudaDriverRevKey],
				CudaRuntimeMajor: labels[NvidiaCudaRuntimeMajorKey],
				CudaRuntimeMinor: labels[NvidiaCudaRuntimeMinorKey],
			},
			GpuDeploy: Deployment{
				GpuDeployContainerToolkit:    labels[NvidiaGpuDeployContainerToolkitKey],
				GpuDeployDcgm:                labels[NvidiaGpuDeployDcgmKey],
				GpuDeployDcgmExporter:        labels[NvidiaGpuDeployDcgmExporterKey],
				GpuDeployDevicePlugin:        labels[NvidiaGpuDeployDevicePluginKey],
				GpuDeployDriver:              labels[NvidiaGpuDeployDriverKey],
				GpuDeployGpuFeatureDiscovery: labels[NvidiaGpuDeployGpuFeatureDiscoveryKey],
				GpuDeployNodeStatusExporter:  labels[NvidiaGpuDeployNodeStatusExporterKey],
				GpuDeployOperatorValidator:   labels[NvidiaGpuDeployOperatorValidatorKey],
			},
			GpuDetails: DetailInformation{
				GpuComputeMajor: labels[NvidiaGpuComputeMajorKey],
				GpuComputeMinor: labels[NvidiaGpuComputeMinorKey],
				GpuFamily:       labels[NvidiaGpuFamilyKey],
				GpuMachine:      labels[NvidiaGpuMachineKey],
				GpuMemory:       labels[NvidiaGpuMemoryKey],
				GfdTimestamp:    labels[NvidiaGfdTimestampKey],
			},
			MigInfo: MigInformation{
				MigCapable:  labels[NvidiaMigCapableKey],
				MigStrategy: labels[NvidiaMigStrategyKey],
			},
		}
	}
	return gpuModels, nil
}
9 changes: 9 additions & 0 deletions controllers/pkg/common/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (
"math"
"time"

"github.com/labring/sealos/controllers/pkg/common/gpu"

"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"

Expand Down Expand Up @@ -112,13 +114,20 @@ const (
PropertyInfraDisk = "infra-disk"
)

// ResourceGPU is the "nvidia.com/gpu" key typed as a corev1.ResourceName so
// it can be used as a key alongside corev1.ResourceCPU and friends.
const ResourceGPU corev1.ResourceName = gpu.NvidiaGpuKey

// NewGpuResource returns the resource name for a specific GPU product,
// formed by prefixing product with "gpu-". The product string is used as
// given; no validation or normalization is applied.
func NewGpuResource(product string) corev1.ResourceName {
	const prefix = "gpu-"
	return corev1.ResourceName(prefix + product)
}

// Base quantities referenced by PricesUnit.
var (
// bin1Mi is 1<<20 in binary-SI format, i.e. 1 MiB.
bin1Mi = resource.NewQuantity(1<<20, resource.BinarySI)
// cpuUnit is the quantity parsed from "1m" (one milli-unit).
cpuUnit = resource.MustParse("1m")
)

// PricesUnit maps a resource name to the quantity that counts as one unit
// of that resource.
var PricesUnit = map[corev1.ResourceName]*resource.Quantity{
corev1.ResourceCPU: &cpuUnit, // 1 m CPU (1000 μ)
ResourceGPU: &cpuUnit, // 1 m GPU; reuses the "1m" quantity defined for CPU
corev1.ResourceMemory: bin1Mi, // 1 MiB
corev1.ResourceStorage: bin1Mi, // 1 MiB
}
Expand Down
94 changes: 59 additions & 35 deletions controllers/resources/controllers/monitor_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"sync"
"time"

"github.com/labring/sealos/controllers/pkg/common/gpu"

"golang.org/x/sync/semaphore"

"github.com/labring/sealos/pkg/utils/logger"
Expand Down Expand Up @@ -52,6 +54,7 @@ type MonitorReconciler struct {
stopCh chan struct{}
wg sync.WaitGroup
periodicReconcile time.Duration
NvidiaGpu map[string]gpu.NvidiaGPU
}

type quantity struct {
Expand Down Expand Up @@ -87,9 +90,14 @@ func NewMonitorReconciler(mgr ctrl.Manager) (*MonitorReconciler, error) {
return nil, fmt.Errorf("mongo uri is empty")
}
r.initNamespaceFuncs()
if err := r.preApply(); err != nil {
err := r.preApply()
if err != nil {
return nil, err
}
r.NvidiaGpu, err = gpu.GetNodeGpuModel(mgr.GetClient())
if err != nil {
return nil, fmt.Errorf("failed to get node gpu model: %v", err)
}
r.startPeriodicReconcile()
return r, nil
}
Expand Down Expand Up @@ -268,20 +276,17 @@ func (r *MonitorReconciler) podResourceUsage(ctx context.Context, dbClient datab
return err
}
rs := initResources()
hasStorageQuota := false
if err := r.Get(ctx, client.ObjectKey{Name: meteringv1.ResourceQuotaPrefix + namespace.Name, Namespace: namespace.Name}, &quota); err != nil {
if client.IgnoreNotFound(err) != nil {
return err
}
if _, ok := namespace.GetAnnotations()[v1.UserAnnotationCreatorKey]; ok {
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete
//if err = r.syncResourceQuota(ctx, namespace.Name); err != nil {
// r.Logger.Error(err, "sync resource quota failed", "namespace", namespace.Name)
//}
if _, ok := namespace.GetAnnotations()[v1.UserAnnotationOwnerKey]; ok {
r.Logger.Error(fmt.Errorf("resources quota is empty"), "", "namespace", namespace.Name)
}
rs[corev1.ResourceStorage].detail = "no resource quota"
} else {
hasStorageQuota = true
rs[corev1.ResourceStorage].Add(*quota.Status.Used.Name("requests.storage", resource.BinarySI))
}
for _, pod := range podList.Items {
Expand All @@ -300,50 +305,69 @@ func (r *MonitorReconciler) podResourceUsage(ctx context.Context, dbClient datab
} else {
rs[corev1.ResourceMemory].Add(container.Resources.Requests[corev1.ResourceMemory])
}
// GPU usage is taken from the container's limits only (requests are not read)
if gpuRequest, ok := container.Resources.Limits[gpu.NvidiaGpuKey]; ok {
gpuModel, ok := r.NvidiaGpu[pod.Spec.NodeName]
if !ok {
var err error
r.NvidiaGpu, err = gpu.GetNodeGpuModel(r.Client)
if err != nil {
logger.Error(err, "get node gpu model failed")
continue
}
gpuModel, ok = r.NvidiaGpu[pod.Spec.NodeName]
if !ok {
logger.Error(fmt.Errorf("node %s not found gpu model", pod.Spec.NodeName), "")
continue
}
}
rs[common.NewGpuResource(gpuModel.GpuInfo.GpuProduct)].Add(gpuRequest)
}
}
}
cpuValue, memoryValue, storageValue := getResourceValue(corev1.ResourceCPU, rs), getResourceValue(corev1.ResourceMemory, rs), getResourceValue(corev1.ResourceStorage, rs)
var monitors []*common.Monitor
if cpuValue > 0 {
monitors = append(monitors, &common.Monitor{
Category: namespace.Name,
Property: corev1.ResourceCPU.String(),
Value: cpuValue,
Time: timeStamp,
Detail: rs[corev1.ResourceCPU].String(),
})
}
if memoryValue > 0 {
monitors = append(monitors, &common.Monitor{
Category: namespace.Name,
Property: corev1.ResourceMemory.String(),
Value: memoryValue,
Time: timeStamp,
Detail: rs[corev1.ResourceMemory].String(),
})
if !hasStorageQuota {
pvcList := corev1.PersistentVolumeClaimList{}
if err := r.List(context.Background(), &pvcList, &client.ListOptions{Namespace: namespace.Name}); err != nil {
return err
}
for _, pvc := range pvcList.Items {
if pvc.Status.Phase != corev1.ClaimBound {
continue
}
rs[corev1.ResourceStorage].Add(pvc.Spec.Resources.Requests[corev1.ResourceStorage])
}
}
if storageValue > 0 {
monitors = append(monitors, &common.Monitor{
Category: namespace.Name,
Property: corev1.ResourceStorage.String(),
Value: storageValue,
Time: timeStamp,
Detail: rs[corev1.ResourceStorage].String(),
})
var monitors []*common.Monitor
for resour, value := range rs {
v := getResourceValue(resour, rs)
if v > 0 {
monitors = append(monitors, &common.Monitor{
Category: namespace.Name,
Property: resour.String(),
Value: v,
Time: timeStamp,
Detail: value.detail,
})
}
}
return dbClient.InsertMonitor(ctx, monitors...)
}

// getResourceValue converts the accumulated quantity for resourceName into a
// whole number of price units, rounding up.
//
// The price unit is looked up in common.PricesUnit by resourceName. Any
// resource whose name contains "gpu" (including per-product names such as
// "gpu-<product>", which have no entry of their own) uses the unit registered
// for common.ResourceGPU instead.
//
// It returns 0 when res has no entry for resourceName, when the accumulated
// quantity is zero, or when no non-zero price unit is available for the
// resource. The last case avoids a nil-pointer dereference or a division by
// zero for resources that are not priced.
func getResourceValue(resourceName corev1.ResourceName, res map[corev1.ResourceName]*quantity) int64 {
	used := res[resourceName]
	if used == nil || used.MilliValue() == 0 {
		return 0
	}

	priceUnit := common.PricesUnit[resourceName]
	if strings.Contains(resourceName.String(), "gpu") {
		priceUnit = common.PricesUnit[common.ResourceGPU]
	}
	if priceUnit == nil || priceUnit.MilliValue() == 0 {
		return 0
	}

	// Divide by the resolved priceUnit, not by PricesUnit[resourceName]:
	// the latter is nil for per-product GPU resource names.
	return int64(math.Ceil(float64(used.MilliValue()) / float64(priceUnit.MilliValue())))
}

func initResources() (rs map[corev1.ResourceName]*quantity) {
rs = make(map[corev1.ResourceName]*quantity)
rs[common.ResourceGPU] = &quantity{Quantity: resource.NewQuantity(0, resource.DecimalSI), detail: ""}
rs[corev1.ResourceCPU] = &quantity{Quantity: resource.NewQuantity(0, resource.DecimalSI), detail: ""}
rs[corev1.ResourceMemory] = &quantity{Quantity: resource.NewQuantity(0, resource.BinarySI), detail: ""}
rs[corev1.ResourceStorage] = &quantity{Quantity: resource.NewQuantity(0, resource.BinarySI), detail: ""}
Expand Down

0 comments on commit 7380ae5

Please sign in to comment.