本文只研究含有加速资源的虚机在生命周期管理中的流程,即深挖Nova与Cyborg组件之间的交互流程,对Nova与其余组件的交互并不做详细展开,有兴趣可以对代码中具体函数进行深入分析。

组件版本

  • Nova:22.4.0(Victoria)
  • Cyborg:5.0.1(Victoria)
  • Placement:4.0.0(Victoria)

社区支持情况

Nova supports only specific operations for instances with accelerators. The lists of supported and unsupported operations are as below:

  • Supported operations.
    • Creation and deletion.
    • Reboots (soft and hard).
    • Pause and unpause.
    • Stop and start.
    • Take a snapshot.
    • Backup.
    • Rescue and unrescue.
    • Rebuild.
    • Evacuate.
    • Shelve and unshelve.
  • Unsupported operations
    • Resize.
    • Suspend and resume.
    • Cold migration.
    • Live migration.

Changed in version 22.0.0(Victoria): Added support for rebuild and evacuate operations.

Changed in version 23.0.0(Wallaby): Added support for shelve and unshelve operations.

注:本文基于Victoria版本(Nova 22.4.0),上述列表中的Shelve and unshelve操作需Wallaby(23.0.0)及以上版本才支持。

加速资源虚机创建

流程图

加速资源虚机创建流程.drawio

各模块作用

  • Nova API:响应创建虚机请求,调用conductor进行创建虚机处理。
  • Nova Conductor:先调用Nova Scheduler选择计算节点,其次向Cyborg发起请求,绑定该节点instance uuid与resource provider,最后调用Nova Compute在指定计算节点创建虚机。
  • Cyborg API:响应Nova Conductor请求,调用Cyborg Conductor创建并绑定ARQ。
  • Cyborg Conductor:负责创建并绑定ARQ,数据库层面操作。(Cyborg agent仅处理FPGA烧录及资源发现操作。)
  • Nova Compute:寻找可用的加速资源设备,若没有现成的则创建一个,生成xml并孵化虚机。

代码调用关系

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
##############################
## nova api
##############################
# 1. nova.api.openstack.compute.servers
def create(self, req, body):
-> (instances, resv_id) = self.compute_api.create(context, *
# 2. nova.compute.api
def create(self, context, instance_type, *
-> self._create_instance( *
def _create_instance(self, context, instance_type, *
-> self.compute_task_api.schedule_and_build_instances( *

##############################
## nova api -> nova conductor
##############################
# 3. nova.conductor.manager
def schedule_and_build_instances(self, context, build_requests, *
-> host_lists = self._schedule_instances(context, request_specs[0], *
-> accel_uuids = self._create_and_bind_arq_for_instance( *
-> self.compute_rpcapi.build_and_run_instance( *

##############################
## nova conductor -> nova scheduler
##############################
# 4. nova.scheduler.manager
def select_destinations(self, ctxt, request_spec=None, *

##############################
## nova conductor -> cyborg api
##############################
# 5. nova.conductor.manager
def _create_and_bind_arq_for_instance(self, context, instance, host, *
-> return self._create_and_bind_arqs( *
def _create_and_bind_arqs(self, context, instance_uuid, extra_specs, *
-> arqs = cyclient.create_arqs_and_match_resource_providers( *
-> cyclient.bind_arqs(bindings=bindings)

# 6. nova.accelerator.cyborg
def create_arqs_and_match_resource_providers(self, dp_name, rg_rp_map):
-> arqs = self._create_arqs(dp_name)
def _create_arqs(self, dp_name):
-> resp, err_msg = self._call_cyborg(self._client.post, *
def bind_arqs(self, bindings):
-> resp, err_msg = self._call_cyborg(self._client.patch, *

## Part 1: _create_arqs()
# 7. cyborg.api.controllers.v2.arqs
@authorize_wsgi.authorize_wsgi("cyborg:arq", "create", False)
def post(self, req):
obj_extarq = objects.ExtARQ(context, **extarq_fields)
-> new_extarq = pecan.request.conductor_api.arq_create( *
# 8. cyborg.conductor.manager
def arq_create(self, context, obj_extarq, devprof_id):
-> obj_extarq.create(context, devprof_id)
# 9. cyborg.objects.ext_arq
def create(self, context, device_profile_id=None):
-> db_extarq = self.dbapi.extarq_create(context, values)
# 10. cyborg.db.sqlalchemy.api
def extarq_create(self, context, values):

## Part 2: bind_arqs()
# 11. cyborg.api.controllers.v2.arqs
@authorize_wsgi.authorize_wsgi("cyborg:arq", "update", False)
def patch(self, patch_list):
-> objects.ExtARQ.apply_patch(context, patch_list, valid_fields)
# 12. cyborg.objects.extarq.ext_arq_job
def apply_patch(cls, context, patch_list, valid_fields):
-> job = extarq.start_bind_job(context, valid_fields)
def start_bind_job(self, context, valid_fields):
-> return self._bind_job(context, dep)
def _bind_job(self, context, deployable):
-> self.bind(context, deployable)
# 13. cyborg.objects.ext_arq
def bind(self, context, deployable):
-> self._allocate_attach_handle(context, deployable)
-> self.update_check_state(context, constants.ARQ_BOUND)
def _allocate_attach_handle(self, context, deployable):
-> ah = AttachHandle.allocate(context, deployable.id)
# 14. cyborg.objects.attach_handle
def allocate(cls, context, deployable_id):
-> db_ah = cls.dbapi.attach_handle_allocate(context, deployable_id)
# 15. cyborg.db.sqlalchemy.api
def attach_handle_allocate(self, context, deployable_id):
-> ah = self._do_allocate_attach_handle( *
def _do_allocate_attach_handle(self, context, deployable_id):

##############################
## nova conductor -> nova compute
##############################
# 16. nova.compute.manager
def build_and_run_instance(self, context, instance, image, request_spec, *
-> result = self._do_build_and_run_instance(*args, **kwargs)
# 17. nova.compute.manager
def _do_build_and_run_instance(self, context, instance, image, *
-> self._build_and_run_instance(context, instance, image, *
# 18. nova.compute.manager
def _build_and_run_instance(self, context, instance, image, injected_files, *
-> with self._build_resources(context, instance, *
-> self.driver.spawn(context, instance, image_meta, *
# 19. nova.compute.manager
def _build_resources(self, context, instance, requested_networks, *
-> arqs = self._get_bound_arq_resources( *
def _get_bound_arq_resources(self, context, instance, arq_uuids):
-> return arqs
# 20. nova.virt.libvirt.driver
def spawn(self, context, instance, image_meta, injected_files,
-> mdevs = self._allocate_mdevs(allocations)
-> self._create_guest_with_network(
def _allocate_mdevs(self, allocations):
-> mdevs_available = self._get_existing_mdevs_not_assigned( *
-> chosen_mdev = self._create_new_mediated_device(parent_device)
-> return chosen_mdevs
def _create_guest_with_network(self, context, xml, instance, network_info, *
-> guest = self._create_guest( *
def _create_guest( *
-> guest = libvirt_guest.Guest.create(xml, self._host)

加速资源虚机删除

各模块作用

  • Nova API:响应删除虚机请求,根据虚机不同状态做出相应的处理:
    • 虚机正常:先释放各类资源,然后调用其宿主机节点的nova compute组件进行删除操作。
  • Cyborg API:响应Nova API请求,调用Cyborg Conductor解绑并删除ARQ。
  • Cyborg Conductor:负责解绑并删除ARQ,数据库层面操作。
  • Nova Compute:删除虚机,不删除已创建的vGPU资源。

代码调用关系

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
##############################
## nova api
##############################
# 1. nova.api.openstack.compute.servers
def delete(self, req, id):
-> self._delete(req.environ['nova.context'], req, id)
def _delete(self, context, req, instance_uuid):
-> self.compute_api.delete(context, instance)
# 2. nova.compute.api
def delete(self, context, instance):
-> self._delete_instance(context, instance)
def _delete_instance(self, context, instance):
-> self._delete(context, instance, 'delete', self._do_delete, *
def _delete(self, context, instance, delete_type, cb, **instance_attrs):
-> self._local_delete(cctxt, instance, bdms, delete_type, cb)
def _local_delete(self, context, instance, bdms, delete_type, cb):
-> compute_utils.delete_arqs_if_needed(context, instance)
-> cb(context, instance, bdms, local=True)

##############################
## nova api -> cyborg api
##############################
# 3. nova.compute.utils
def delete_arqs_if_needed(context, instance):
-> cyclient.delete_arqs_for_instance(instance.uuid)
# 4. nova.accelerator.cyborg
def delete_arqs_for_instance(self, instance_uuid):
-> resp, err_msg = self._call_cyborg(self._client.delete, *

# 5. cyborg.api.controllers.v2.arqs
@authorize_wsgi.authorize_wsgi("cyborg:arq", "delete", False)
def delete(self, arqs=None, instance=None):
-> pecan.request.conductor_api.arq_delete_by_instance_uuid( *
# 6. cyborg.conductor.manager
def arq_delete_by_instance_uuid(self, context, instance):
-> ExtARQ.delete_by_instance(context, instance)
# 7. cyborg.objects.ext_arq
def delete_by_instance(cls, context, instance_uuid):
-> obj_extarq.unbind(context)
-> obj_extarq.destroy(context)

## Part 1: obj_extarq.unbind(context)
# 8. cyborg.objects.ext_arq
def unbind(self, context):
-> attach_handle.deallocate(context)
# 9. cyborg.objects.attach_handle
def deallocate(self, context):
-> self.dbapi.attach_handle_update(context, self.uuid, values)
# 10. cyborg.db.sqlalchemy.api
def attach_handle_update(self, context, uuid, values):
-> return self._do_update_attach_handle(context, uuid, values)
def _do_update_attach_handle(self, context, uuid, values):

## Part 2: obj_extarq.destroy(context)
# 11. cyborg.objects.ext_arq
def destroy(self, context):
-> self.dbapi.extarq_delete(context, self.arq.uuid)
# 12. cyborg.db.sqlalchemy.api
def extarq_delete(self, context, uuid):

##############################
## nova api -> nova compute
##############################
# 13. nova.compute.api
def _do_delete(self, context, instance, bdms, local=False):
-> self.compute_rpcapi.terminate_instance(context, instance, bdms)
# 14. nova.compute.manager
def terminate_instance(self, context, instance, bdms):
-> do_terminate_instance(instance, bdms)
def do_terminate_instance(instance, bdms):
-> self._delete_instance(context, instance, bdms)
def _delete_instance(self, context, instance, bdms):
-> self._shutdown_instance(context, instance, bdms)
def _shutdown_instance(self, context, instance,
-> self.driver.destroy(context, instance, network_info, *
-> compute_utils.delete_arqs_if_needed(context, instance)
# 15. nova.virt.libvirt.driver
def destroy(self, context, instance, network_info, block_device_info=None, *
-> self._destroy(instance)
def _destroy(self, instance, attempt=1):
-> guest.poweroff()
# 16. nova.virt.libvirt.guest
def poweroff(self):
-> self._domain.destroy()

加速资源虚机开机

各模块作用

  • Nova API:响应虚机启动请求,调用宿主机节点的nova compute服务,启动虚机。
  • Nova Compute:查询虚机对应的加速资源信息,生成xml文件,在宿主机创建并启动虚机。
  • Cyborg API:响应Nova Compute的请求,返回虚机对应的ARQ信息。

代码调用关系

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
##############################
## nova api
##############################
# 1. nova.api.openstack.compute.servers
@wsgi.action('os-start')
def _start_server(self, req, id, body):
-> self.compute_api.start(context, instance)
# 2. nova.compute.api
@check_instance_state(vm_state=[vm_states.STOPPED])
def start(self, context, instance):
-> self.compute_rpcapi.start_instance(context, instance)

##############################
## nova api -> nova compute
##############################
# 3. nova.compute.manager
def start_instance(self, context, instance):
-> self._power_on(context, instance)
def _power_on(self, context, instance):
-> accel_info = self._get_accel_info(context, instance)
-> self.driver.power_on(context, instance, *

## Part 1: self._get_accel_info()
def _get_accel_info(self, context, instance):
-> accel_info = cyclient.get_arqs_for_instance(instance.uuid)

## Part 2: self.driver.power_on()
# 4. nova.virt.libvirt.driver
def power_on(self, context, instance, network_info, *
-> self._hard_reboot(context, instance, network_info, block_device_info, *
def _hard_reboot(self, context, instance, network_info, *
-> self._create_guest_with_network( *
def _create_guest_with_network(self, context, xml, instance, network_info, *
-> guest = self._create_guest( *
def _create_guest( *

加速资源虚机关机

各模块作用

  • Nova API:响应虚机关机请求,调用宿主机节点的nova compute服务,销毁虚机。
  • Nova Compute:在宿主机上销毁虚机。

代码调用关系

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
##############################
## nova api
##############################
# 1. nova.api.openstack.compute.servers
@wsgi.action('os-stop')
def _stop_server(self, req, id, body):
-> self.compute_api.stop(context, instance)
# 2. nova.compute.api
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.ERROR])
def stop(self, context, instance, do_cast=True, clean_shutdown=True):
-> self.force_stop(context, instance, do_cast, clean_shutdown)
def force_stop(self, context, instance, do_cast=True, clean_shutdown=True):
-> self.compute_rpcapi.stop_instance(context, instance, do_cast=do_cast, *

##############################
## nova api -> nova compute
##############################
# 3. nova.compute.manager
def stop_instance(self, context, instance, clean_shutdown):
-> self._power_off_instance(instance, clean_shutdown)
def _power_off_instance(self, instance, clean_shutdown=True):
-> self.driver.power_off(instance, timeout, retry_interval)
# 4. nova.virt.libvirt.driver
def power_off(self, instance, timeout=0, retry_interval=0):
-> self._destroy(instance)
def _destroy(self, instance, attempt=1):
-> guest.poweroff()

参考文档