Introduction

This article walks through deploying GPU Manager. It does not cover the underlying design; for that, see the paper notes on 《GaiaGPU: Sharing GPUs in Container Clouds》.
Source code

```
# GitHub
gpu-quota-admission  https://github.com/tkestack/gpu-admission
gpu-manager          https://github.com/tkestack/gpu-manager
vcuda-controller     https://github.com/tkestack/vcuda-controller
```
Environment

```
[root@k104 vGPU]# uname -a
Linux k104 3.10.0-1127.el7.x86_64 #1 SMP Tue Mar 31 23:36:51 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
[root@k104 vGPU]# cat /etc/centos-release
CentOS Linux release 7.8.2003 (Core)
[root@k104 vGPU]# kubectl version
Client Version: version.Info{Major:"1", Minor:"19", GitVersion:"v1.19.10", GitCommit:"98d5dc5d36d34a7ee13368a7893dcb400ec4e566", GitTreeState:"clean", BuildDate:"2021-04-15T03:28:42Z", GoVersion:"go1.15.10", Compiler:"gc", Platform:"linux/amd64"}
Server Version: version.Info{Major:"1", Minor:"21", GitVersion:"v1.21.4", GitCommit:"3cce4a82b44f032d0cd1a1790e6d2f5a55d20aae", GitTreeState:"clean", BuildDate:"2021-08-11T18:10:22Z", GoVersion:"go1.16.7", Compiler:"gc", Platform:"linux/amd64"}
```
Deployment

Install the NVIDIA driver

See the separate article 《NVIDIA使用:GPU驱动安装》 (NVIDIA GPU driver installation) for this step.
Install gpu-quota-admission

```bash
kubectl apply -f gpu-admission.yaml
```

The contents of gpu-admission.yaml are as follows:
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-quota-admission
  namespace: kube-system
data:
  gpu-quota-admission.config: |
    {
      "QuotaConfigMapName": "gpuquota",
      "QuotaConfigMapNamespace": "kube-system",
      "GPUModelLabel": "gaia.tencent.com/gpu-model",
      "GPUPoolLabel": "gaia.tencent.com/gpu-pool"
    }
---
apiVersion: v1
kind: Service
metadata:
  name: gpu-quota-admission
  namespace: kube-system
spec:
  ports:
  - port: 3456
    protocol: TCP
    targetPort: 3456
  selector:
    k8s-app: gpu-quota-admission
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: gpu-quota-admission
  name: gpu-quota-admission
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: gpu-quota-admission
  template:
    metadata:
      labels:
        k8s-app: gpu-quota-admission
      namespace: kube-system
    spec:
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: Exists
            weight: 1
      containers:
      - env:
        - name: LOG_LEVEL
          value: "4"
        - name: EXTRA_FLAGS
          value: --incluster-mode=true
        image: ccr.ccs.tencentyun.com/tkeimages/gpu-quota-admission:latest
        imagePullPolicy: IfNotPresent
        name: gpu-quota-admission
        ports:
        - containerPort: 3456
          protocol: TCP
        resources:
          limits:
            cpu: "2"
            memory: 2Gi
          requests:
            cpu: "1"
            memory: 1Gi
        volumeMounts:
        - mountPath: /root/gpu-quota-admission/
          name: config
      dnsPolicy: ClusterFirstWithHostNet
      initContainers:
      - command:
        - sh
        - -c
        - 'mkdir -p /etc/kubernetes/ && cp /root/gpu-quota-admission/gpu-quota-admission.config /etc/kubernetes/'
        image: busybox
        imagePullPolicy: Always
        name: init-kube-config
        securityContext:
          privileged: true
        volumeMounts:
        - mountPath: /root/gpu-quota-admission/
          name: config
      priority: 2000000000
      priorityClassName: system-cluster-critical
      restartPolicy: Always
      serviceAccount: gpu-manager
      serviceAccountName: gpu-manager
      terminationGracePeriodSeconds: 30
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      volumes:
      - configMap:
          defaultMode: 420
          name: gpu-quota-admission
        name: config
```
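Once applied, a quick sanity check (a sketch; the label, namespace, and port below are taken from the manifest above) is to confirm the Deployment is running and the Service has an endpoint on port 3456:

```bash
# The pod should be Running
kubectl -n kube-system get pods -l k8s-app=gpu-quota-admission

# The Service should expose port 3456 and have at least one endpoint
kubectl -n kube-system get svc gpu-quota-admission
kubectl -n kube-system get endpoints gpu-quota-admission
```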
Install gpu-manager-daemonset

```bash
kubectl apply -f gpu-manager.yaml
```

The contents of gpu-manager.yaml are as follows:
```yaml
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-manager
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: gpu-manager
  namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-manager
  namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  name: gpu-manager-metric
  namespace: kube-system
  annotations:
    prometheus.io/scrape: "true"
  labels:
    kubernetes.io/cluster-service: "true"
spec:
  clusterIP: None
  ports:
  - name: metrics
    port: 5678
    protocol: TCP
    targetPort: 5678
  selector:
    name: gpu-manager-ds
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-manager-daemonset
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: gpu-manager-ds
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: gpu-manager-ds
    spec:
      serviceAccount: gpu-manager
      tolerations:
      - key: CriticalAddonsOnly
        operator: Exists
      - key: tencent.com/vcuda-core
        operator: Exists
        effect: NoSchedule
      priorityClassName: "system-node-critical"
      nodeSelector:
        nvidia-device-enable: enable
      hostPID: true
      containers:
      - image: tkestack/gpu-manager:v1.1.5
        imagePullPolicy: IfNotPresent
        name: gpu-manager
        securityContext:
          privileged: true
        ports:
        - containerPort: 5678
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
        - name: vdriver
          mountPath: /etc/gpu-manager/vdriver
        - name: vmdata
          mountPath: /etc/gpu-manager/vm
        - name: log
          mountPath: /var/log/gpu-manager
        - name: checkpoint
          mountPath: /etc/gpu-manager/checkpoint
        - name: run-dir
          mountPath: /var/run
        - name: cgroup
          mountPath: /sys/fs/cgroup
          readOnly: true
        - name: usr-directory
          mountPath: /usr/local/host
          readOnly: true
        env:
        - name: LOG_LEVEL
          value: "4"
        - name: EXTRA_FLAGS
          value: "--logtostderr=false"
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
      volumes:
      - name: device-plugin
        hostPath:
          type: Directory
          path: /var/lib/kubelet/device-plugins
      - name: vmdata
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/vm
      - name: vdriver
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/vdriver
      - name: log
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/log
      - name: checkpoint
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/checkpoint
      - name: run-dir
        hostPath:
          type: Directory
          path: /var/run
      - name: cgroup
        hostPath:
          type: Directory
          path: /sys/fs/cgroup
      - name: usr-directory
        hostPath:
          type: Directory
          path: /usr
```
Label the GPU node

```bash
kubectl label node k104 nvidia-device-enable=enable
```
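To confirm the label was applied (a quick check; k104 is the node name from the environment section above):

```bash
# The GPU node should appear in the output
kubectl get nodes -l nvidia-device-enable=enable
```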
Verify that gpu-manager has been scheduled onto the GPU node:

```bash
kubectl get pods -n kube-system
```
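Since kube-system can be crowded, it may be easier to filter by the DaemonSet's pod label (name=gpu-manager-ds, as defined in the manifest above):

```bash
# One gpu-manager pod should be Running on each labeled GPU node
kubectl -n kube-system get pods -l name=gpu-manager-ds -o wide
```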
Custom scheduler

Create the scheduler policy file /etc/kubernetes/scheduler-policy-config.json with the following contents:
```json
{
  "kind": "Policy",
  "apiVersion": "v1",
  "predicates": [
    { "name": "PodFitsHostPorts" },
    { "name": "PodFitsResources" },
    { "name": "NoDiskConflict" },
    { "name": "MatchNodeSelector" },
    { "name": "HostName" }
  ],
  "priorities": [
    { "name": "BalancedResourceAllocation", "weight": 1 },
    { "name": "ServiceSpreadingPriority", "weight": 1 }
  ],
  "extenders": [
    {
      "urlPrefix": "http://gpu-quota-admission.kube-system:3456/scheduler",
      "apiVersion": "v1beta1",
      "filterVerb": "predicates",
      "enableHttps": false,
      "nodeCacheCapable": false
    }
  ],
  "hardPodAffinitySymmetricWeight": 10,
  "alwaysCheckAllPredicates": false
}
```
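The extenders entry points the scheduler at the gpu-quota-admission Service deployed earlier. Before touching the scheduler, a rough reachability check (a sketch using a throwaway busybox pod; the URL is the one from the policy file, and any HTTP response, even an error body, means DNS and the TCP connection work, while "bad address" or "connection refused" means the Service is not reachable yet):

```bash
kubectl run extender-check --rm -it --restart=Never --image=busybox -- \
  wget -O- -T 5 http://gpu-quota-admission.kube-system:3456/scheduler
```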
kubeadm deployment

If the cluster was deployed with kubeadm, the scheduler runs as a static pod. The kubelet watches its manifest file and automatically restarts the pod to load the new configuration whenever the file changes, so all we need to do is edit the scheduler's manifest.
Back up the original manifest first:

```bash
cp /etc/kubernetes/manifests/kube-scheduler.yaml /etc/kubernetes/manifests/kube-scheduler.yaml.bak
```

Then add the following two flags under the command section:

```
--policy-config-file=/etc/kubernetes/scheduler-policy-config.json
--use-legacy-policy-config=true
```

Because the scheduler runs in a container, the policy file also has to be mounted into it; the policyconfig volume and volumeMount entries in the full manifest below take care of that.
The modified file then looks like this:
```yaml
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: null
  labels:
    component: kube-scheduler
    tier: control-plane
  name: kube-scheduler
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-scheduler
    - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
    - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
    - --bind-address=127.0.0.1
    - --kubeconfig=/etc/kubernetes/scheduler.conf
    - --leader-elect=true
    - --port=0
    - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json
    - --use-legacy-policy-config=true
    image: 10.2.57.16:5000/kubernetes/kube-scheduler:v1.19.8
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10259
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: kube-scheduler
    resources:
      requests:
        cpu: 100m
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10259
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /etc/kubernetes/scheduler.conf
      name: kubeconfig
      readOnly: true
    - mountPath: /etc/kubernetes/scheduler-policy-config.json
      name: policyconfig
      readOnly: true
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  priorityClassName: system-node-critical
  volumes:
  - hostPath:
      path: /etc/kubernetes/scheduler.conf
      type: FileOrCreate
    name: kubeconfig
  - hostPath:
      path: /etc/kubernetes/scheduler-policy-config.json
      type: FileOrCreate
    name: policyconfig
status: {}
```
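After saving the manifest, the kubelet recreates the scheduler pod. A couple of quick checks to confirm it came back up with the new flags (a sketch; the component=kube-scheduler label comes from the manifest above):

```bash
# The kube-scheduler pod should be recreated and Running
kubectl -n kube-system get pods -l component=kube-scheduler

# Its command line should now contain the policy flag
kubectl -n kube-system get pod -l component=kube-scheduler -o yaml | grep policy-config-file
```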
Rancher (RKE) deployment

In cluster.yaml, add extra startup arguments for the scheduler:

```yaml
scheduler:
  extra_args:
    policy-config-file: /etc/kubernetes/scheduler-policy-config.json
    use-legacy-policy-config: true
```
After the change, the full file looks like this:
```yaml
nodes:
- address: k104
  port: "22"                # SSH port
  internal_address: ""      # internal IP; on public clouds with separate public/private IPs, set address to the public IP
  role:
  - controlplane            # control-plane role, equivalent to a k8s master
  - etcd
  - worker
  user: cloud               # SSH login account
  # labels: {}              # node labels
services:
  # etcd settings; backups can also be sent to S3, see the official docs
  etcd:
    extra_args:
      auto-compaction-retention: 240   # (hours)
      # raise the space quota to $((6*1024*1024*1024)); default 2G, maximum 8G
      quota-backend-bytes: "6442450944"
    backup_config:
      enabled: true
      interval_hours: 12
      retention: 6
  kube-api:
    service_cluster_ip_range: 10.96.0.0/12
    service_node_port_range: "20000-40000"
    pod_security_policy: false
    always_pull_images: false
  # controller settings, e.g. how long after a node goes missing before its pods are evicted
  kube-controller:
    extra_args:
      ## After node communication fails, wait this long before Kubernetes marks the node NotReady.
      ## Must be a multiple of the kubelet's nodeStatusUpdateFrequency (default 10s),
      ## where N is the number of retries the kubelet gets to sync node status; default 40s.
      node-monitor-grace-period: "20s"
      ## If communication keeps failing, Kubernetes marks the node unhealthy after this period; default 1m0s.
      node-startup-grace-period: "30s"
      ## If the node stays unreachable, Kubernetes starts evicting its pods after this period; default 5m0s.
      pod-eviction-timeout: "1m"
    cluster_cidr: 10.10.0.0/16
    service_cluster_ip_range: 10.96.0.0/12
  # cluster settings, including resource reservation, cluster name, DNS, etc.
  kubelet:
    extra_args:
      serialize-image-pulls: "false"
      registry-burst: "10"
      registry-qps: "0"
      # # node resource reservation
      # enforce-node-allocatable: 'pods'
      # system-reserved: 'cpu=0.5,memory=500Mi'
      # kube-reserved: 'cpu=0.5,memory=1500Mi'
      # # pod eviction; only memory and disk thresholds are supported.
      # ## hard eviction thresholds
      # ### Hard eviction triggers when available resources drop below the reserved values; pods are killed immediately, without waiting for a graceful exit.
      # eviction-hard: 'memory.available<300Mi,nodefs.available<10%,imagefs.available<15%,nodefs.inodesFree<5%'
      # ## soft eviction thresholds
      # ### The following four parameters work together. When available resources fall below the soft threshold but stay above the hard one, the kubelet waits for eviction-soft-grace-period;
      # ### it rechecks every 10s, and if the last check still crosses the soft threshold, eviction starts: the pod first gets a stop signal and eviction-max-pod-grace-period to exit;
      # ### after eviction-max-pod-grace-period, a pod that has not exited is force-killed.
      # eviction-soft: 'memory.available<500Mi,nodefs.available<50%,imagefs.available<50%,nodefs.inodesFree<10%'
      # eviction-soft-grace-period: 'memory.available=1m30s'
      # eviction-max-pod-grace-period: '30'
      # eviction-pressure-transition-period: '30s'
      volume-plugin-dir: /usr/libexec/kubernetes/kubelet-plugins/volume/exec
    extra_binds:
    - /usr/libexec/kubernetes/kubelet-plugins/volume/exec:/usr/libexec/kubernetes/kubelet-plugins/volume/exec
    cluster_domain: cluster.local
    infra_container_image: ""
    cluster_dns_server: 10.96.0.10
    fail_swap_on: false
  kubeproxy:
    extra_args:
      # iptables is used for forwarding by default; set to `ipvs` to enable IPVS
      proxy-mode: "ipvs"
  scheduler:
    # extra startup arguments for the scheduler
    extra_args:
      policy-config-file: /etc/kubernetes/scheduler-policy-config.json
      use-legacy-policy-config: true
# CNI network plugin for the cluster
network:
  plugin: calico
  # mtu: 1400
  # options:
  #   canal_iface: bond_mgmt
  #   flannel_backend_type: "vxlan"
ssh_agent_auth: false
authorization:
  mode: rbac
ignore_docker_version: false
# Kubernetes version; `rke config --system-images --all` lists all versions supported by RKE
kubernetes_version: v1.21.4-rancher1-1
# inside China, use Aliyun/local mirror images
private_registries:
# registry.local:9001/library
- url: registry.local:9001/library
  user:
  password:
  is_default: true
# ingress configuration; RKE currently supports nginx
monitoring:
  provider: metrics-server
ingress:
  provider: "nginx"
  # node selection, used together with the node labels configured above
  node_selector:
    app: ingress
  options:
    use-forwarded-headers: "true"
cluster_name: rancher
addon_job_timeout: 0
dns:
  provider: coredns
  upstreamnameservers:
  - 114.114.114.114
```
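One caveat before applying the change: the scheduler container reads /etc/kubernetes/scheduler-policy-config.json from the control-plane host, so the file has to exist there first. A sketch, assuming RKE's default bind-mount of the host's /etc/kubernetes into the control-plane containers and the node/user names from the cluster.yaml above:

```bash
# Copy the policy file to every controlplane node listed in cluster.yaml
# (adjust the user and use sudo if /etc/kubernetes is not writable directly)
scp /etc/kubernetes/scheduler-policy-config.json cloud@k104:/etc/kubernetes/
```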
Update the Kubernetes configuration:

```bash
cd /root/.kube && rke up
```
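In an RKE cluster the scheduler runs as a plain Docker container named kube-scheduler on the control-plane node, so one way to confirm the flags were picked up (a sketch run on the control-plane host):

```bash
# The container definition should now include both policy flags
docker inspect kube-scheduler | grep policy-config-file
```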
Verification

Check GPU resources

```
[root@k104 vGPU]# kubectl describe node k104
...
Capacity:
  cpu:                       20
  ephemeral-storage:         333267976Ki
  hugepages-1Gi:             92Gi
  hugepages-2Mi:             0
  memory:                    197620068Ki
  pods:                      110
  tencent.com/vcuda-core:    100
  tencent.com/vcuda-memory:  59
Allocatable:
  cpu:                       20
  ephemeral-storage:         307139766174
  hugepages-1Gi:             92Gi
  hugepages-2Mi:             0
  memory:                    101048676Ki
  pods:                      110
  tencent.com/vcuda-core:    100
  tencent.com/vcuda-memory:  59
...
```
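For reference, gpu-manager exposes each physical GPU as 100 units of tencent.com/vcuda-core, and tencent.com/vcuda-memory is counted in 256 MiB blocks, so the 59 units reported above correspond to roughly 59 × 256 MiB ≈ 14.75 GiB of device memory on this card.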
Testing with TensorFlow and the MNIST dataset

For validation we use TensorFlow with the MNIST dataset, based on the following TensorFlow test image:

```
ccr.ccs.tencentyun.com/menghe/tensorflow-gputest:0.2
```

Create a test workload tensorflow-gputest.yaml with the following contents:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: vcuda-test
    qcloud-app: vcuda-test
  name: vcuda-test
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: vcuda-test
  template:
    metadata:
      labels:
        k8s-app: vcuda-test
        qcloud-app: vcuda-test
    spec:
      containers:
      - command:
        - sleep
        - 360000s
        env:
        - name: PATH
          value: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
        image: ccr.ccs.tencentyun.com/menghe/tensorflow-gputest:0.2
        imagePullPolicy: IfNotPresent
        name: tensorflow-test
        resources:
          limits:
            cpu: "4"
            memory: 8Gi
            tencent.com/vcuda-core: "50"
            tencent.com/vcuda-memory: "32"
          requests:
            cpu: "4"
            memory: 8Gi
            tencent.com/vcuda-core: "50"
            tencent.com/vcuda-memory: "32"
```
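A sketch of applying the workload and opening a shell in the container (the label selector comes from the Deployment above):

```bash
kubectl apply -f tensorflow-gputest.yaml

# Exec into the test pod once it is Running
kubectl exec -it $(kubectl get pod -l k8s-app=vcuda-test -o name | head -n1) -- bash
```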
Inside the container, run the test commands; pick whichever framework/dataset combination you need:

a) MNIST

```bash
cd /data/tensorflow/mnist && time python convolutional.py
```

b) AlexNet

```bash
cd /data/tensorflow/alexnet && time python alexnet_benchmark.py
```

c) Cifar10

```bash
cd /data/tensorflow/cifar10 && time python cifar10_train.py
```
While the training job runs, watch GPU utilization on the physical host with:

```bash
nvidia-smi pmon -s u -d 1
```
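Optionally, check the view from inside the pod as well. Assuming the vcuda runtime is intercepting NVML as designed, nvidia-smi inside the container should report a memory cap of roughly 32 × 256 MiB = 8192 MiB rather than the card's full memory (a hedged sanity check; exact output varies with driver version):

```bash
kubectl exec -it $(kubectl get pod -l k8s-app=vcuda-test -o name | head -n1) -- nvidia-smi
```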
References