Introduction

This article walks through deploying GPU Manager. It does not cover the underlying principles; for those, see the paper notes on "GaiaGPU: Sharing GPUs in Container Clouds".

Source code

# Github
gpu-quota-admission https://github.com/tkestack/gpu-admission
gpu-manager https://github.com/tkestack/gpu-manager
vcuda-controller https://github.com/tkestack/vcuda-controller

Environment

[root@k104 vGPU]# uname -a
Linux k104 3.10.0-1127.el7.x86_64 #1 SMP Tue Mar 31 23:36:51 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux

[root@k104 vGPU]# cat /etc/centos-release
CentOS Linux release 7.8.2003 (Core)

[root@k104 vGPU]# kubectl version
Client Version: version.Info{Major:"1", Minor:"19", GitVersion:"v1.19.10", GitCommit:"98d5dc5d36d34a7ee13368a7893dcb400ec4e566", GitTreeState:"clean", BuildDate:"2021-04-15T03:28:42Z", GoVersion:"go1.15.10", Compiler:"gc", Platform:"linux/amd64"}
Server Version: version.Info{Major:"1", Minor:"21", GitVersion:"v1.21.4", GitCommit:"3cce4a82b44f032d0cd1a1790e6d2f5a55d20aae", GitTreeState:"clean", BuildDate:"2021-08-11T18:10:22Z", GoVersion:"go1.16.7", Compiler:"gc", Platform:"linux/amd64"}

Deployment procedure

Install the NVIDIA driver

Reference: "NVIDIA Usage: GPU Driver Installation".
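
Before moving on, it is worth confirming the driver works on the GPU node: if nvidia-smi lists the card and the driver version, the node is ready for gpu-manager (a quick sanity check, not part of the original steps):

nvidia-smi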

Install gpu-quota-admission

kubectl apply -f gpu-admission.yaml

The contents of gpu-admission.yaml:

apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-quota-admission
  namespace: kube-system
data:
  gpu-quota-admission.config: |
    {
        "QuotaConfigMapName": "gpuquota",
        "QuotaConfigMapNamespace": "kube-system",
        "GPUModelLabel": "gaia.tencent.com/gpu-model",
        "GPUPoolLabel": "gaia.tencent.com/gpu-pool"
    }

---
apiVersion: v1
kind: Service
metadata:
  name: gpu-quota-admission
  namespace: kube-system
spec:
  ports:
  - port: 3456
    protocol: TCP
    targetPort: 3456
  selector:
    k8s-app: gpu-quota-admission
  type: ClusterIP

---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: gpu-quota-admission
  name: gpu-quota-admission
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: gpu-quota-admission
  template:
    metadata:
      labels:
        k8s-app: gpu-quota-admission
      namespace: kube-system
    spec:
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: Exists
            weight: 1
      containers:
      - env:
        - name: LOG_LEVEL
          value: "4"
        - name: EXTRA_FLAGS
          value: --incluster-mode=true
        image: ccr.ccs.tencentyun.com/tkeimages/gpu-quota-admission:latest
        imagePullPolicy: IfNotPresent
        name: gpu-quota-admission
        ports:
        - containerPort: 3456
          protocol: TCP
        resources:
          limits:
            cpu: "2"
            memory: 2Gi
          requests:
            cpu: "1"
            memory: 1Gi
        volumeMounts:
        - mountPath: /root/gpu-quota-admission/
          name: config
      dnsPolicy: ClusterFirstWithHostNet
      initContainers:
      - command:
        - sh
        - -c
        - ' mkdir -p /etc/kubernetes/ && cp /root/gpu-quota-admission/gpu-quota-admission.config
          /etc/kubernetes/'
        image: busybox
        imagePullPolicy: Always
        name: init-kube-config
        securityContext:
          privileged: true
        volumeMounts:
        - mountPath: /root/gpu-quota-admission/
          name: config
      priority: 2000000000
      priorityClassName: system-cluster-critical
      restartPolicy: Always
      serviceAccount: gpu-manager
      serviceAccountName: gpu-manager
      terminationGracePeriodSeconds: 30
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      volumes:
      - configMap:
          defaultMode: 420
          name: gpu-quota-admission
        name: config
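
Note that this Deployment runs under the gpu-manager ServiceAccount, which is only created by gpu-manager.yaml in the next step, so its pod may not start until that manifest is applied as well. To check that the admission extender and its Service are in place (a convenience check, not from the upstream docs):

kubectl -n kube-system get deploy gpu-quota-admission
kubectl -n kube-system get svc gpu-quota-admission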

Install gpu-manager-daemonset

kubectl apply -f gpu-manager.yaml

The contents of gpu-manager.yaml:

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-manager
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: gpu-manager
  namespace: kube-system

---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-manager
  namespace: kube-system

---
apiVersion: v1
kind: Service
metadata:
  name: gpu-manager-metric
  namespace: kube-system
  annotations:
    prometheus.io/scrape: "true"
  labels:
    kubernetes.io/cluster-service: "true"
spec:
  clusterIP: None
  ports:
  - name: metrics
    port: 5678
    protocol: TCP
    targetPort: 5678
  selector:
    name: gpu-manager-ds

---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-manager-daemonset
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: gpu-manager-ds
  template:
    metadata:
      # This annotation is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: gpu-manager-ds
    spec:
      serviceAccount: gpu-manager
      tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: tencent.com/vcuda-core
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      # only run on nodes that have GPU devices
      nodeSelector:
        nvidia-device-enable: enable
      hostPID: true
      containers:
      - image: tkestack/gpu-manager:v1.1.5
        imagePullPolicy: IfNotPresent
        name: gpu-manager
        securityContext:
          privileged: true
        ports:
        - containerPort: 5678
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
        - name: vdriver
          mountPath: /etc/gpu-manager/vdriver
        - name: vmdata
          mountPath: /etc/gpu-manager/vm
        - name: log
          mountPath: /var/log/gpu-manager
        - name: checkpoint
          mountPath: /etc/gpu-manager/checkpoint
        - name: run-dir
          mountPath: /var/run
        - name: cgroup
          mountPath: /sys/fs/cgroup
          readOnly: true
        - name: usr-directory
          mountPath: /usr/local/host
          readOnly: true
        env:
        - name: LOG_LEVEL
          value: "4"
        - name: EXTRA_FLAGS
          value: "--logtostderr=false"
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
      volumes:
      - name: device-plugin
        hostPath:
          type: Directory
          path: /var/lib/kubelet/device-plugins
      - name: vmdata
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/vm
      - name: vdriver
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/vdriver
      - name: log
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/log
      - name: checkpoint
        hostPath:
          type: DirectoryOrCreate
          path: /etc/gpu-manager/checkpoint
      # We have to mount the whole /var/run directory into the container, because the bind-mounted
      # docker.sock inode changes after the host docker is restarted
      - name: run-dir
        hostPath:
          type: Directory
          path: /var/run
      - name: cgroup
        hostPath:
          type: Directory
          path: /sys/fs/cgroup
      # We mount the whole /usr directory instead of a specific library path, because the path
      # differs between distros
      - name: usr-directory
        hostPath:
          type: Directory
          path: /usr
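
Because the DaemonSet's nodeSelector only matches nodes labeled nvidia-device-enable: enable, its desired pod count stays at 0 until the next step labels the GPU node. To check the DaemonSet status (plain kubectl, added here for convenience):

kubectl -n kube-system get ds gpu-manager-daemonset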

Label the GPU node

kubectl label node k104 nvidia-device-enable=enable
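
To confirm the label was applied, list nodes by that label (a convenience check):

kubectl get nodes -l nvidia-device-enable=enable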

Verify that gpu-manager was scheduled onto the GPU node

kubectl get pods -n kube-system
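
The full pod list can be long; filtering for gpu-manager makes it easier to confirm the DaemonSet pod is Running on the labeled node (optional):

kubectl get pods -n kube-system -o wide | grep gpu-manager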

Custom scheduler policy

Create the custom scheduler policy file /etc/kubernetes/scheduler-policy-config.json with the following content:

{
  "kind": "Policy",
  "apiVersion": "v1",
  "predicates": [
    {
      "name": "PodFitsHostPorts"
    },
    {
      "name": "PodFitsResources"
    },
    {
      "name": "NoDiskConflict"
    },
    {
      "name": "MatchNodeSelector"
    },
    {
      "name": "HostName"
    }
  ],
  "priorities": [
    {
      "name": "BalancedResourceAllocation",
      "weight": 1
    },
    {
      "name": "ServiceSpreadingPriority",
      "weight": 1
    }
  ],
  "extenders": [
    {
      "urlPrefix": "http://gpu-quota-admission.kube-system:3456/scheduler",
      "apiVersion": "v1beta1",
      "filterVerb": "predicates",
      "enableHttps": false,
      "nodeCacheCapable": false
    }
  ],
  "hardPodAffinitySymmetricWeight": 10,
  "alwaysCheckAllPredicates": false
}
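
The urlPrefix above reaches the gpu-quota-admission Service through its cluster DNS name. Because kube-scheduler normally runs with hostNetwork, it can only resolve that name when its dnsPolicy is ClusterFirstWithHostNet, which is exactly the change made to the scheduler manifest below. An optional way to confirm the name resolves inside the cluster (my own check; it assumes the busybox image is pullable):

kubectl -n kube-system run dns-check --rm -it --restart=Never --image=busybox -- nslookup gpu-quota-admission.kube-system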

kubeadm-based clusters

If the cluster was set up with kubeadm, the scheduler runs as a static pod. The kubelet keeps watching the manifest files and automatically restarts the pod to load the new configuration whenever a file changes, so all we need to do here is edit the scheduler's manifest.

Back up the original manifest first:

cp /etc/kubernetes/manifests/kube-scheduler.yaml /etc/kubernetes/manifests/kube-scheduler.yaml.bak

Add the following two lines under the command key:

--policy-config-file=/etc/kubernetes/scheduler-policy-config.json
--use-legacy-policy-config=true

The modified file:

apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: null
  labels:
    component: kube-scheduler
    tier: control-plane
  name: kube-scheduler
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-scheduler
    - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
    - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
    - --bind-address=127.0.0.1
    - --kubeconfig=/etc/kubernetes/scheduler.conf
    - --leader-elect=true
    - --port=0
    - --policy-config-file=/etc/kubernetes/scheduler-policy-config.json   #### added
    - --use-legacy-policy-config=true                                     #### added
    image: 10.2.57.16:5000/kubernetes/kube-scheduler:v1.19.8
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10259
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: kube-scheduler
    resources:
      requests:
        cpu: 100m
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10259
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /etc/kubernetes/scheduler.conf
      name: kubeconfig
      readOnly: true
    - mountPath: /etc/kubernetes/scheduler-policy-config.json   #### mount the policy file into the pod
      name: policyconfig
      readOnly: true
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet   #### change the DNS policy
  priorityClassName: system-node-critical
  volumes:
  - hostPath:
      path: /etc/kubernetes/scheduler.conf
      type: FileOrCreate
    name: kubeconfig
  - hostPath:
      path: /etc/kubernetes/scheduler-policy-config.json
      type: FileOrCreate
    name: policyconfig
status: {}
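
After saving the file, the kubelet notices the change and recreates the static pod. To confirm the scheduler restarted with the new flags (the static pod is named kube-scheduler-<node-name>; substitute your master's node name, shown here only as an example pattern):

kubectl -n kube-system get pods -l component=kube-scheduler
kubectl -n kube-system get pod kube-scheduler-<node-name> -o yaml | grep policy-config-file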

Rancher (RKE)-based clusters

In the cluster.yaml file, add extra startup arguments for the scheduler:

scheduler:
  # extra startup arguments for the scheduler
  extra_args:
    policy-config-file: /etc/kubernetes/scheduler-policy-config.json
    use-legacy-policy-config: true

After the change, the full file looks like this:

nodes:
- address: k104
  port: "22"               # SSH port
  internal_address: ""     # internal IP; for public-cloud hosts with both public and private IPs, set address to the public IP
  role:
  - controlplane           # control-plane role, equivalent to a k8s master
  - etcd
  - worker
  user: cloud              # SSH login user
  # labels: {}             # node labels
services:
  # etcd settings; backups can also go to S3, see the official docs for that option
  etcd:
    extra_args:
      auto-compaction-retention: 240   # (hours)
      # raise the space quota to $((6*1024*1024*1024)); the default is 2G, the maximum is 8G
      quota-backend-bytes: "6442450944"
    backup_config:
      enabled: true
      interval_hours: 12
      retention: 6
  kube-api:
    service_cluster_ip_range: 10.96.0.0/12
    service_node_port_range: "20000-40000"
    pod_security_policy: false
    always_pull_images: false
  # controller settings, e.g. how long after a node is lost before its pods are migrated
  kube-controller:
    extra_args:
      ## After node communication fails, Kubernetes waits this long before marking the node NotReady.
      ## It must be an integer multiple N of the kubelet's nodeStatusUpdateFrequency (default 10s),
      ## where N is the number of retries the kubelet is allowed for syncing node status; the default is 40s.
      node-monitor-grace-period: "20s"
      ## If communication keeps failing for this long, Kubernetes marks the node unhealthy; the default is 1m0s.
      node-startup-grace-period: "30s"
      ## After the node stays unreachable for this long, Kubernetes starts evicting its pods; the default is 5m0s.
      pod-eviction-timeout: "1m"
    cluster_cidr: 10.10.0.0/16
    service_cluster_ip_range: 10.96.0.0/12
  # cluster-level settings: resource reservation, cluster name, DNS, etc.
  kubelet:
    extra_args:
      serialize-image-pulls: "false"
      registry-burst: "10"
      registry-qps: "0"
      # # node resource reservation
      # enforce-node-allocatable: 'pods'
      # system-reserved: 'cpu=0.5,memory=500Mi'
      # kube-reserved: 'cpu=0.5,memory=1500Mi'
      # # pod eviction; only memory and disk resources are supported.
      # ## hard eviction thresholds
      # ### When available resources on the node drop below the reserved values, hard eviction is triggered.
      # ### Hard eviction force-kills pods instead of waiting for them to exit on their own.
      # eviction-hard: 'memory.available<300Mi,nodefs.available<10%,imagefs.available<15%,nodefs.inodesFree<5%'
      # ## soft eviction thresholds
      # ### The following four parameters work together: when available resources drop below the soft threshold
      # ### but stay above the hard threshold, the kubelet waits for eviction-soft-grace-period, checking every 10s;
      # ### if the soft threshold is still exceeded at the last check, eviction starts. Soft eviction does not kill
      # ### the pod immediately: it sends a stop signal and waits for eviction-max-pod-grace-period;
      # ### if the pod has not exited by then, it is force-killed.
      # eviction-soft: 'memory.available<500Mi,nodefs.available<50%,imagefs.available<50%,nodefs.inodesFree<10%'
      # eviction-soft-grace-period: 'memory.available=1m30s'
      # eviction-max-pod-grace-period: '30'
      # eviction-pressure-transition-period: '30s'
      volume-plugin-dir: /usr/libexec/kubernetes/kubelet-plugins/volume/exec
    extra_binds:
    - /usr/libexec/kubernetes/kubelet-plugins/volume/exec:/usr/libexec/kubernetes/kubelet-plugins/volume/exec
    cluster_domain: cluster.local
    infra_container_image: ""
    cluster_dns_server: 10.96.0.10
    fail_swap_on: false
  kubeproxy:
    extra_args:
      # iptables is the default forwarding mode; set this to `ipvs` to enable IPVS
      proxy-mode: "ipvs"
  scheduler:
    # extra startup arguments for the scheduler
    extra_args:
      policy-config-file: /etc/kubernetes/scheduler-policy-config.json
      use-legacy-policy-config: true
# CNI network plugin for the cluster
network:
  plugin: calico
  # mtu: 1400
  # options:
  #   canal_iface: bond_mgmt
  #   flannel_backend_type: "vxlan"
ssh_agent_auth: false
authorization:
  mode: rbac
ignore_docker_version: false
# Kubernetes version; `rke config --system-images --all` lists all versions supported by RKE
kubernetes_version: v1.21.4-rancher1-1
# inside China, use a mirror registry (e.g. Aliyun)
private_registries:
# registry.local:9001/library
- url: registry.local:9001/library
  user:
  password:
  is_default: true
# ingress configuration; RKE currently ships nginx.
monitoring:
  provider: metrics-server
ingress:
  provider: "nginx"
  # node selection, works together with the node labels configured above
  node_selector:
    app: ingress
  options:
    use-forwarded-headers: "true"
cluster_name: rancher
addon_job_timeout: 0
dns:
  provider: coredns
  upstreamnameservers:
  - 114.114.114.114

Apply the updated Kubernetes configuration:

cd /root/.kube && rke up
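
Note that /etc/kubernetes/scheduler-policy-config.json must exist on the control-plane node for the new flags to take effect. RKE runs the scheduler as a Docker container named kube-scheduler on that node, so once rke up finishes the flags can be checked there (an optional check that assumes Docker is the container runtime):

docker inspect kube-scheduler | grep policy-config-file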

Verification

Check GPU resources

[root@k104 vGPU]# kubectl describe node k104
...
Capacity:
  cpu:                       20
  ephemeral-storage:         333267976Ki
  hugepages-1Gi:             92Gi
  hugepages-2Mi:             0
  memory:                    197620068Ki
  pods:                      110
  tencent.com/vcuda-core:    100
  tencent.com/vcuda-memory:  59
Allocatable:
  cpu:                       20
  ephemeral-storage:         307139766174
  hugepages-1Gi:             92Gi
  hugepages-2Mi:             0
  memory:                    101048676Ki
  pods:                      110
  tencent.com/vcuda-core:    100
  tencent.com/vcuda-memory:  59
...
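
According to the gpu-manager documentation, tencent.com/vcuda-core is reported in units of 1/100 of a physical GPU and tencent.com/vcuda-memory in units of 256 MiB, so the 59 units above correspond to roughly 14.75 GiB of GPU memory. To print just these resources:

kubectl describe node k104 | grep vcuda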

Testing with TensorFlow and the MNIST dataset

  1. Use TensorFlow with the MNIST dataset for the test. TensorFlow image:

     ccr.ccs.tencentyun.com/menghe/tensorflow-gputest:0.2

  2. Create a test workload tensorflow-gputest.yaml with the following content:
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: vcuda-test
    qcloud-app: vcuda-test
  name: vcuda-test
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: vcuda-test
  template:
    metadata:
      labels:
        k8s-app: vcuda-test
        qcloud-app: vcuda-test
    spec:
      containers:
      - command:
        - sleep
        - 360000s
        env:
        - name: PATH
          value: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
        image: ccr.ccs.tencentyun.com/menghe/tensorflow-gputest:0.2
        imagePullPolicy: IfNotPresent
        name: tensorflow-test
        resources:
          limits:
            cpu: "4"
            memory: 8Gi
            tencent.com/vcuda-core: "50"
            tencent.com/vcuda-memory: "32"
          requests:
            cpu: "4"
            memory: 8Gi
            tencent.com/vcuda-core: "50"
            tencent.com/vcuda-memory: "32"
  3. Exec into the container and run the test commands; pick whichever framework/dataset you need (one way to open a shell in the pod is shown after this list).

     a) MNIST

     cd /data/tensorflow/mnist && time python convolutional.py

     b) AlexNet

     cd /data/tensorflow/alexnet && time python alexnet_benchmark.py

     c) CIFAR-10

     cd /data/tensorflow/cifar10 && time python cifar10_train.py

  4. On the physical host, watch per-process GPU utilization with nvidia-smi pmon -s u -d 1.
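One way to open a shell in the test pod for step 3 (plain kubectl; the pod name is looked up by its k8s-app label):

kubectl exec -it -n default $(kubectl get pods -n default -l k8s-app=vcuda-test -o jsonpath='{.items[0].metadata.name}') -- /bin/bash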

References