本文将结合virtio spec、qemu与Linux kernel源码深入解析virtio-balloon的原理与实现。

本文参考的virtio spec是0.9.5,qemu版本为v2.6.0,Linux kernel版本为v4.19

为了简单起见,本文只介绍virtio-balloon的inflate操作,deflate、Memory Statistics就不赘述了。

本文考虑的场景:guest原先的内存为4096M,现在希望利用virtio-balloon,从guest内回收20M内存。

在hmp中设置内存

1
2
3
4
5
6
7
8
9
10
11
12
13
14
ETEXI

{
.name = "balloon",
.args_type = "value:M",
.params = "target",
.help = "request VM to change its memory allocation (in MB)",
.mhandler.cmd = hmp_balloon,
},

STEXI
@item balloon @var{value}
@findex balloon
Request VM to change its memory allocation to @var{value} (in MB).

在QEMU的hmp中执行balloon 4076,将guest的内存设置为4076M(4096-20)。此时会触发virtio-balloon设备的configuration change interrupt。

configuration change interrupt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// QEMU
/*
 * Record a new balloon target on the device and notify the guest.
 *
 * opaque: the balloon device; converted with VIRTIO_BALLOON().
 * target: requested guest memory size, in the same unit as
 *         get_current_ram_size() (presumably bytes -- confirm).
 *
 * This is an excerpt: the tail of the function is elided ("...").
 */
static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(opaque);
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
ram_addr_t vm_ram_size = get_current_ram_size();

/* A target larger than the current RAM size is clamped to it. */
if (target > vm_ram_size) {
target = vm_ram_size;
}
/* In the visible code a zero target updates nothing and sends no notification. */
if (target) {
/* Amount to give up, expressed in units of (1 << VIRTIO_BALLOON_PFN_SHIFT). */
dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
/* Tell the guest that the device configuration has changed. */
virtio_notify_config(vdev);
}
...
}

dev->num_pages记录了Number of pages host wants Guest to give up。virtio_notify_config会给guest发送configuration change interrupt。
guest configuration change interrupt的handler是virtballoon_changed

1
2
3
4
5
6
7
8
9
10
11
// guest driver
/*
 * Guest-side handler for a configuration change on the balloon device.
 * Queues update_balloon_size_work on system_freezable_wq unless
 * vb->stop_update is set.
 */
static void virtballoon_changed(struct virtio_device *vdev)
{
struct virtio_balloon *vb = vdev->priv;
unsigned long flags;

/* stop_update is checked and the work is queued under stop_update_lock. */
spin_lock_irqsave(&vb->stop_update_lock, flags);
if (!vb->stop_update)
queue_work(system_freezable_wq, &vb->update_balloon_size_work);
spin_unlock_irqrestore(&vb->stop_update_lock, flags);
}

vb->update_balloon_size_work就是update_balloon_size_func

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// guest driver
/*
 * Work function that moves the balloon toward the size requested by the host.
 *
 * work: the update_balloon_size_work member embedded in a struct
 *       virtio_balloon.
 */
static void update_balloon_size_func(struct work_struct *work)
{
struct virtio_balloon *vb;
s64 diff;

/* Recover the owning virtio_balloon from its embedded work item. */
vb = container_of(work, struct virtio_balloon,
update_balloon_size_work);
/* Signed distance from the current balloon size to the target. */
diff = towards_target(vb);

/* Positive: inflate; negative: deflate. The helper's return value is */
/* subtracted from the magnitude of diff, leaving what is still outstanding. */
if (diff > 0)
diff -= fill_balloon(vb, diff);
else if (diff < 0)
diff += leak_balloon(vb, -diff);
/* Report the balloon size back through the config space. */
update_balloon_size(vb);

/* Target not reached in this pass: queue this work item again. */
if (diff)
queue_work(system_freezable_wq, work);
}

guest inflate

首先思考下这个问题,guest怎么知道要inflate的呢?
update_balloon_size_func会调用towards_target

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// guest driver
/*
 * Return how far the balloon is from the target set by the host:
 * the num_pages value read from the device config minus vb->num_pages.
 * The caller inflates on a positive result and deflates on a negative one.
 */
static inline s64 towards_target(struct virtio_balloon *vb)
{
s64 target;
u32 num_pages;

/* Read the num_pages field of struct virtio_balloon_config from the device. */
virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages,
&num_pages);

/* Legacy balloon config space is LE, unlike all other devices. */
if (!virtio_has_feature(vb->vdev, VIRTIO_F_VERSION_1))
num_pages = le32_to_cpu((__force __le32)num_pages);

/* Widen to s64 before subtracting so the difference can be negative. */
target = num_pages;
return target - vb->num_pages;
}

towards_target会读取virtio_balloon_config中的num_pages寄存器。

1
2
3
4
5
6
/*
 * Configuration fields of the balloon device: num_pages is the value the
 * guest driver reads as its target, actual is the value it writes back.
 */
struct virtio_balloon_config {
/* Number of pages host wants Guest to give up. */
__u32 num_pages;
/* Number of pages we've actually got in balloon. */
__u32 actual;
};

因为num_pages位于PIO BAR0中,因此会发生VM Exit,QEMU的virtio_balloon_get_config最终会模拟num_pages寄存器的读。

1
2
3
4
5
6
7
8
9
10
11
12
// QEMU
/*
 * Fill config_data with the device's current balloon configuration.
 *
 * vdev:        the balloon device.
 * config_data: destination buffer; sizeof(struct virtio_balloon_config)
 *              bytes are written to it.
 */
static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
struct virtio_balloon_config config;

/* Both fields are stored little-endian in the config image. */
config.num_pages = cpu_to_le32(dev->num_pages);
config.actual = cpu_to_le32(dev->actual);

/* Note: the trace receives the values after the cpu_to_le32 conversion. */
trace_virtio_balloon_get_config(config.num_pages, config.actual);
memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
}

最终guest看到的num_pages寄存器的值就是dev->num_pages,即为Number of pages host wants Guest to give up。

towards_target中vb->num_pages的含义为Number of balloon pages guest has told the Host it’s not using。在我们考虑的场景中,vb->num_pages为0(初始值),而host设置的num_pages为20MB / 4KB = 20 * (1MB/4KB) = 5120,因此towards_target返回的值为5120。update_balloon_size_func中的diff变量大于0,此时会调用fill_balloon函数。

1
2
3
4
5
fill_balloon
├── balloon_page_enqueue
└── tell_host
├── virtqueue_add_outbuf
└── virtqueue_kick

balloon_page_enqueue函数是guest os实现的回收unused pages的功能,与本文关系不大,这里就不继续往下追了。virtqueue_add_outbuf会往descs中填充pages的GFN(Guest Frame Number),并更新avail ring,最后virtqueue_kick会写kick寄存器来通知QEMU回收内存。

QEMU回收内存

guest driver调用virtqueue_kick后,QEMU最终会调用virtio_balloon_handle_output来回收内存。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * Virtqueue handler: drains vq, reading each element's out buffers as a
 * sequence of 32-bit page frame numbers, and calls balloon_page() for every
 * page that resolves to RAM.
 *
 * vdev: the balloon device.
 * vq:   the queue that was kicked; whether it equals s->dvq is passed as
 *       the second argument of balloon_page().
 */
static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
VirtQueueElement *elem;
MemoryRegionSection section;

/* Process elements until the queue is empty. */
for (;;) {
size_t offset = 0;
uint32_t pfn;
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
return;
}

/* Consume the out buffers 4 bytes at a time; stop at the first short read. */
while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
ram_addr_t pa;
ram_addr_t addr;
/* NOTE(review): p is a signed int, so a PFN with bit 31 set would be */
/* sign-extended by the cast below -- confirm whether that can occur. */
int p = virtio_ldl_p(vdev, &pfn);

/* Convert the page frame number to a guest physical address. */
pa = (ram_addr_t) p << VIRTIO_BALLOON_PFN_SHIFT;
offset += 4;

/* Look up the memory region backing this address; skip non-RAM or unmapped. */
section = memory_region_find(get_system_memory(), pa, 1);
/* NOTE(review): this continue bypasses the memory_region_unref() below; */
/* if memory_region_find() returned a referenced non-RAM region, that */
/* reference is not dropped on this path -- confirm. */
if (!int128_nz(section.size) || !memory_region_is_ram(section.mr))
continue;

trace_virtio_balloon_handle_output(memory_region_name(section.mr),
pa);
/* Using memory_region_get_ram_ptr is bending the rules a bit, but
should be OK because we only want a single page. */
addr = section.offset_within_region;
balloon_page(memory_region_get_ram_ptr(section.mr) + addr,
!!(vq == s->dvq));
memory_region_unref(section.mr);
}

/* Return the element to the guest, notify it, and free the element. */
virtqueue_push(vq, elem, offset);
virtio_notify(vdev, vq);
g_free(elem);
}
}

virtqueue_pop会读取avail ring,从descs中获取要回收pages的GFN,然后进行内存的回收,回收完毕后,virtqueue_push会更新used ring,最后virtio_notify会发送inflateq的中断。

guest inflateq handler

guest inflateq handler是balloon_ack

1
2
3
4
5
6
/*
 * Virtqueue callback: wakes up whoever is waiting on vb->acked
 * (tell_host() waits on it with wait_event()).
 */
static void balloon_ack(struct virtqueue *vq)
{
struct virtio_balloon *vb = vq->vdev->priv;

wake_up(&vb->acked);
}

guest在tell_host中通过wait_event等待vb->acked,并以virtqueue_get_buf的返回值作为唤醒条件。balloon_ack调用wake_up之后,tell_host中的virtqueue_get_buf会读取inflateq的used ring,回收descs。

1
2
3
4
5
6
/*
 * Hand a buffer to the host on vq and wait until the host has consumed it.
 * This is an excerpt: the code that adds the buffer and kicks the queue is
 * elided ("...").
 */
static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
{
...
/* When host has read buffer, this completes via balloon_ack */
/* The wait condition itself calls virtqueue_get_buf(), so a successful */
/* wake-up has already retrieved the used buffer. */
wait_event(vb->acked, virtqueue_get_buf(vq, &len));
}

guest updates balloon size

1
2
3
update_balloon_size_func
├── fill_balloon
└── update_balloon_size

update_balloon_size_func在调用完fill_balloon来回收内存后,guest driver会调用update_balloon_size来通知QEMU Number of pages it’s actually got in balloon。

1
2
3
4
5
6
7
8
9
10
11
/*
 * Write the current balloon size (vb->num_pages) to the "actual" field of
 * the device's struct virtio_balloon_config.
 */
static void update_balloon_size(struct virtio_balloon *vb)
{
u32 actual = vb->num_pages;

/* Legacy balloon config space is LE, unlike all other devices. */
if (!virtio_has_feature(vb->vdev, VIRTIO_F_VERSION_1))
actual = (__force u32)cpu_to_le32(actual);

virtio_cwrite(vb->vdev, struct virtio_balloon_config, actual,
&actual);
}

guest最终会将vb->num_pages(在fill_balloon中会确定vb->num_pages的值)写到virtio_balloon_config中的actual寄存器来通知QEMU。

QEMU最终会调用virtio_balloon_set_config,将guest写入的vb->num_pages值记录到dev->actual中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

/*
 * Handle a guest write to the balloon config space: store the new "actual"
 * value in dev->actual and emit a balloon-change event if it changed.
 *
 * vdev:        the balloon device.
 * config_data: config image written by the guest; read as a
 *              struct virtio_balloon_config.
 */
static void virtio_balloon_set_config(VirtIODevice *vdev,
const uint8_t *config_data)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
struct virtio_balloon_config config;
uint32_t oldactual = dev->actual;
ram_addr_t vm_ram_size = get_current_ram_size();

memcpy(&config, config_data, sizeof(struct virtio_balloon_config));
/* The config image is little-endian; only the actual field is consumed. */
dev->actual = le32_to_cpu(config.actual);
if (dev->actual != oldactual) {
/* Event payload: current RAM size minus the ballooned amount */
/* (actual shifted left by VIRTIO_BALLOON_PFN_SHIFT). */
qapi_event_send_balloon_change(vm_ram_size -
((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
&error_abort);
}
trace_virtio_balloon_set_config(dev->actual, oldactual);
}

总结


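virtio spec中提到:“the descriptor describing the resulting 32-bit array is added to the inflateq”。其中32-bit array的含义如下所示:数组中的每个元素是一个32-bit的PFN(即guest物理地址右移VIRTIO_BALLOON_PFN_SHIFT位),对应一个被放入balloon的页面;QEMU的virtio_balloon_handle_output正是每次读取4个字节作为pfn,再左移VIRTIO_BALLOON_PFN_SHIFT位还原出guest物理地址。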


参考资料:

  1. Virtio-Balloon超详细分析
  2. Speed Up Boot-up Time for Guest in Alibaba Cloud
  3. 内存过载使用