Source version: git://git.qemu.org/qemu.git v2.5.0
The savevm command is handled by hmp_savevm and loadvm by hmp_loadvm; the corresponding monitor command table entries are:
{ .name = "savevm", .args_type = "name:s", .params = "[tag|id]", .help = "save a VM snapshot. If no tag or id are provided, a new snapshot is created", .mhandler.cmd = hmp_savevm, }, { .name = "loadvm", .args_type = "name:s", .params = "tag|id", .help = "restore a VM snapshot from its tag or id", .mhandler.cmd = hmp_loadvm, .command_completion = loadvm_completion, },
The code above is generated at build time; it ends up in x86_64-softmmu/hmp-commands.h under the build directory.
Start with hmp_savevm. When you run savevm oenhan at the QEMU monitor console, const char *name = qdict_get_try_str(qdict, "name") inside hmp_savevm retrieves exactly that name, "oenhan". The bdrv_all_can_snapshot and bdrv_all_delete_snapshot calls that follow handle image formats that do not support snapshots (such as raw) and the deletion of an existing snapshot with the same name.
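An abbreviated look at the top of hmp_savevm (migration/savevm.c in v2.5.0; locals, error messages and the rest of the function are trimmed, so treat the details as approximate):

void hmp_savevm(Monitor *mon, const QDict *qdict)
{
    BlockDriverState *bs, *bs1;
    Error *local_err = NULL;
    const char *name = qdict_get_try_str(qdict, "name");   /* "oenhan" */

    /* refuse if some writable device cannot take snapshots, e.g. raw */
    if (!bdrv_all_can_snapshot(&bs)) {
        monitor_printf(mon, "Device '%s' is writable but does not "
                       "support snapshots.\n", bdrv_get_device_name(bs));
        return;
    }

    /* delete an existing snapshot with the same tag before creating a new one */
    if (name && bdrv_all_delete_snapshot(name, &bs1, &local_err) < 0) {
        error_free(local_err);
        return;
    }

    /* ... snapshot creation, qemu_fopen_bdrv() and qemu_savevm_state() follow ... */
}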
qemu_fopen_bdrv allocates and fills in a QEMUFile structure, defined as follows:
struct QEMUFile {
    const QEMUFileOps *ops;
    void *opaque;

    int64_t bytes_xfer;         /* number of bytes written so far */
    int64_t xfer_limit;         /* write limit; bytes_xfer must not exceed it */

    int64_t pos;                /* current file pointer position, not actually used */

    int buf_index;              /* index of the end of valid data in buf */
    int buf_size;               /* 0 when writing */
    uint8_t buf[IO_BUF_SIZE];   /* reads and writes are buffered here */

    struct iovec iov[MAX_IOV_SIZE];
    unsigned int iovcnt;

    int last_error;             /* whether the last I/O failed; if so, further I/O is refused */
};
What qemu_fopen_bdrv actually does at this point amounts to setting QEMUFile.ops = bdrv_write_ops and QEMUFile.opaque = bs; it is just initialization.
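For reference, qemu_fopen_bdrv in migration/savevm.c is essentially a selector between the read and write op tables (bdrv_read_ops and bdrv_write_ops are static QEMUFileOps defined in the same file):

static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
{
    if (is_writable) {
        return qemu_fopen_ops(bs, &bdrv_write_ops);   /* savevm path */
    }
    return qemu_fopen_ops(bs, &bdrv_read_ops);        /* loadvm path */
}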
The next function, qemu_savevm_state, is the main one. It begins by initializing a number of migration structures; in essence, savevm reuses the migration machinery for collecting VM state. migrate_init fills in a MigrationState, but ms does not appear to be used afterwards. Then comes qemu_savevm_state_header. qemu_put_be32 writes into the QEMUFile by calling qemu_put_byte four times, one byte at a time (a short sketch follows); the write path is easy to trace against the comments in the QEMUFile structure above. Since savevm_state.skip_configuration is false, execution reaches vmstate_save_state, which brings in another structure, VMStateDescription, shown right after the sketch.
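For reference, qemu_put_be32 in migration/qemu-file.c boils down to four qemu_put_byte calls, most significant byte first:

void qemu_put_be32(QEMUFile *f, unsigned int v)
{
    qemu_put_byte(f, v >> 24);
    qemu_put_byte(f, v >> 16);
    qemu_put_byte(f, v >> 8);
    qemu_put_byte(f, v);
}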
typedef struct {
    const char *name;       /* usually the stringified field name */
    /* The offset computation is quite involved; for an initializer such as
     * VMSTATE_UINTTL(env.eip, X86CPU) it is the offset of eip within X86CPU. */
    size_t offset;
    size_t size;            /* size of a single saved element */
    size_t start;
    int num;                /* usually 1; can be >1 when an array is saved */
    size_t num_offset;
    size_t size_offset;
    const VMStateInfo *info;
    enum VMStateFlags flags;
    const VMStateDescription *vmsd;
    int version_id;
    bool (*field_exists)(void *opaque, int version_id);
} VMStateField;

struct VMStateDescription {
    const char *name;       /* identifying name */
    int unmigratable;       /* whether this state can be migrated */
    int version_id;         /* mainly used for compatibility checks */
    int minimum_version_id;
    int minimum_version_id_old;
    LoadStateHandler *load_state_old;
    int (*pre_load)(void *opaque);                   /* callbacks for the various phases */
    int (*post_load)(void *opaque, int version_id);
    void (*pre_save)(void *opaque);
    bool (*needed)(void *opaque);
    /* the data description itself lives here, as an array of VMStateField */
    VMStateField *fields;
    const VMStateDescription **subsections;          /* sub-sections */
};
Entering vmstate_save_state, the first statement fetches vmsd->fields, i.e. the VMStateField array defined above; the actual initializer looks like this:
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),     /* fetches the len field of SaveState */
        /* fetches the name field of SaveState */
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len),
        VMSTATE_END_OF_LIST()
    },
};

The VMSTATE_* macros behind VMStateField are rather convoluted; expand them yourself if you are interested. What gdb shows for the first field is:
(gdb) p *(vmstate_configuration.fields)
$9 = {name = 0x555555b27100 "len", offset = 24, size = 4, start = 0, num = 0, num_offset = 0, size_offset = 0, info = 0x555555f6e4b0 <vmstate_info_uint32>, flags = VMS_SINGLE, vmsd = 0x0, version_id = 0, field_exists = 0x0}
继续,vmsd->pre_save(opaque)的执行也就是的执行,本质是在savevm_state结构体里面
typedef struct SaveState {
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    int global_section_id;
    bool skip_configuration;
    uint32_t len;
    const char *name;
} SaveState;
It assigns name: the value is the machine type QEMU is currently running, MACHINE_GET_CLASS(current_machine)->name, here pc-i440fx-2.5. So configuration_pre_save simply records the machine model name (and its length); a sketch of its effect follows. vmdesc is 0, so the corresponding handling is skipped.
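A sketch of what configuration_pre_save effectively does (the real v2.5.0 code may obtain the machine name through a helper, so treat this as an approximation of its effect rather than the literal source):

static void configuration_pre_save(void *opaque)
{
    SaveState *state = opaque;

    /* record the current machine type, e.g. "pc-i440fx-2.5" */
    state->name = MACHINE_GET_CLASS(current_machine)->name;
    state->len = strlen(state->name);
}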
field->name有值,field->field_exists为空,下面是base_addr等变量的赋值,base_addr = opaque + field->offset,这是base_addr指向的就是我们在SaveState里面准备获取的信息len,n_elems = vmstate_n_elems(opaque, field)则对于数组才有意义,size = vmstate_size(opaque, field)则是单个参数的长度,在下面的循环中
/*循环一般只有一次*/ for (i = 0; i < n_elems; i++) { void *addr = base_addr + size * i; /*vmsd_desc_field_start因为vmdesc_loop为空直接return*/ vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems); //快速获取当前的offset old_offset = qemu_ftell_fast(f); if (field->flags & VMS_ARRAY_OF_POINTER) { addr = *(void **)addr; } if (field->flags & VMS_STRUCT) { vmstate_save_state(f, field->vmsd, addr, vmdesc_loop); } else { //指向put,具体函数则是put_uint32 field->info->put(f, addr, size); } written_bytes = qemu_ftell_fast(f) - old_offset; vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i); /* Compressed arrays only care about the first element */ if (vmdesc_loop && vmsd_can_compress(field)) { vmdesc_loop = NULL; } }
It is easy to see that the core call is put_uint32; the actual write path was described earlier, so I will not repeat it. Stepping back, vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0) copies the two fields of savevm_state into the QEMUFile according to the vmstate_configuration description. At this point qemu_savevm_state_header is done.
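The put callback for a uint32 field is registered through vmstate_info_uint32 in migration/vmstate.c and is roughly:

static void put_uint32(QEMUFile *f, void *pv, size_t size)
{
    uint32_t *v = pv;

    qemu_put_be32(f, *v);   /* ends up in QEMUFile.buf via qemu_put_byte */
}

const VMStateInfo vmstate_info_uint32 = {
    .name = "uint32",
    .get  = get_uint32,
    .put  = put_uint32,
};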
Now enter qemu_savevm_state_begin. The whole function is a walk over savevm_state.handlers, QTAILQ_FOREACH(se, &savevm_state.handlers, entry), executing each entry's handlers. savevm_state.handlers itself is just a list head pointing to a chain of SaveStateEntry structures, which is what se is:
typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    char idstr[256];
    int instance_id;
    int alias_id;
    int version_id;
    int section_id;
    SaveVMHandlers *ops;
    const VMStateDescription *vmsd;
    void *opaque;
    CompatEntry *compat;
    int is_ram;
} SaveStateEntry;
执行se->ops的函数,具体执行内容先不看,关注点在savevm_state.handlers的函数是是什么时候挂上去的,如何初始化的,记住ops的类型是SaveVMHandlers。
先从qemu启动main函数看起,有blk_mig_init和ram_mig_init,它们都调用了register_savevm_live函数,倒数的两个参数分别是SaveVMHandlers *ops和void *opaque,
int register_savevm_live(DeviceState *dev,
                         const char *idstr,
                         int instance_id,
                         int version_id,
                         SaveVMHandlers *ops,
                         void *opaque)
{
    SaveStateEntry *se;

    /* this is exactly the object that goes onto savevm_state.handlers */
    se = g_new0(SaveStateEntry, 1);
    se->version_id = version_id;
    se->section_id = savevm_state.global_section_id++;
    se->ops = ops;          /* ops is assigned here */
    se->opaque = opaque;    /* the object the handlers operate on */
    se->vmsd = NULL;
    /* ... some code omitted ... */

    /* add at the end of list */
    QTAILQ_INSERT_TAIL(&savevm_state.handlers, se, entry);  /* inserted here */
    return 0;
}

The ops that blk_mig_init passes in is savevm_block_handlers:
static SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};
register_savevm_live is the core of handler registration; looking at its callers reveals all the registration paths. One important wrapper is register_savevm, which is what most callers actually use.

int register_savevm(DeviceState *dev,
                    const char *idstr,
                    int instance_id,
                    int version_id,
                    SaveStateHandler *save_state,
                    LoadStateHandler *load_state,
                    void *opaque)
{
    /* fill a SaveVMHandlers with just the two callbacks, then hook it in */
    SaveVMHandlers *ops = g_new0(SaveVMHandlers, 1);
    ops->save_state = save_state;   /* caller-supplied save function */
    ops->load_state = load_state;   /* caller-supplied load function */
    return register_savevm_live(dev, idstr, instance_id, version_id,
                                ops, opaque);
}

Take cpu_exec_init as an example caller: it passes cpu_save and cpu_load, which save and restore the CPU state respectively.
register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION, cpu_save, cpu_load, cpu->env_ptr);
savevm_state.handlers is thus just a linked list: whenever a new emulated device needs its state saved, it simply hooks its own save/load functions onto handlers and they will be invoked. QEMU registers far more handlers than are listed here; trace the callers of register_savevm and register_savevm_live in the source to see them all. A minimal hypothetical registration looks like the sketch below.
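A purely hypothetical example of such a registration (OenhanDevState, oenhan_dev_save/oenhan_dev_load and the "oenhan-dev" idstr are invented for illustration and do not exist in QEMU):

typedef struct OenhanDevState {
    uint32_t some_reg;                    /* hypothetical device register */
} OenhanDevState;

static void oenhan_dev_save(QEMUFile *f, void *opaque)
{
    OenhanDevState *s = opaque;
    qemu_put_be32(f, s->some_reg);        /* serialized on savevm */
}

static int oenhan_dev_load(QEMUFile *f, void *opaque, int version_id)
{
    OenhanDevState *s = opaque;
    s->some_reg = qemu_get_be32(f);       /* restored on loadvm */
    return 0;
}

static void oenhan_dev_init(OenhanDevState *s)
{
    /* hook the callbacks in; the entry ends up on savevm_state.handlers */
    register_savevm(NULL, "oenhan-dev", 0 /* instance_id */, 1 /* version_id */,
                    oenhan_dev_save, oenhan_dev_load, s);
}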
Back in qemu_savevm_state_begin, the se->ops->set_params step in practice executes only one function, block_set_params; none of the other handlers fill in .set_params.

(gdb) p *params
$4 = {blk = false, shared = false}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}
Comparing the params argument with block_set_params makes it obvious that params is tailored specifically to block_set_params; if other handlers were to provide a .set_params as well, this interface could become awkward for them.

QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
    if (!se->ops || !se->ops->save_live_setup) {
        continue;
    }
    if (se->ops && se->ops->is_active) {
        if (!se->ops->is_active(se->opaque)) {
            continue;
        }
    }
    save_section_header(f, se, QEMU_VM_SECTION_START);

    ret = se->ops->save_live_setup(f, se->opaque);
    save_section_footer(f, se);

    if (ret < 0) {
        qemu_file_set_error(f, ret);
        break;
    }
}
In the loop above, the first handler encountered is savevm_block_handlers, but it is skipped because se->ops->is_active(se->opaque) returns false; that value was set earlier by set_params. Next comes savevm_ram_handlers, and from here on is_active is NULL for the remaining handlers, so execution reaches save_section_header to write the section header (sketched below) and then calls se->ops->save_live_setup, which is ram_save_setup. RAM handling is the key part; further down there are also ram_save_iterate and ram_save_complete, which I will come back to.
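save_section_header itself just serializes the section metadata into the QEMUFile; roughly (migration/savevm.c, v2.5.0, details approximate):

static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    if (section_type == QEMU_VM_SECTION_FULL ||
        section_type == QEMU_VM_SECTION_START) {
        /* ID string */
        size_t len = strlen(se->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)se->idstr, len);

        qemu_put_be32(f, se->instance_id);
        qemu_put_be32(f, se->version_id);
    }
}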
In ram_save_setup, migration_bitmap_sync_init and reset_ram_globals initialize the global state. migrate_use_xbzrle is false, so the XBZRLE path in between can be ignored. last_ram_offset returns the highest offset of guest RAM (see "KVM源代码分析4:内存虚拟化" for how guest RAM is organized), and ram_bitmap_pages is the number of host pages needed to cover the memory space QEMU presents to the guest. bitmap_new allocates a zero-filled bitmap for migration_bitmap_rcu->bmap. The migration_bitmap_rcu structure is:
static struct BitmapRcu {
    struct rcu_head rcu;
    /* Main migration bitmap */
    unsigned long *bmap;
    /* bitmap of pages that haven't been sent even once
     * only maintained and used in postcopy at the moment
     * where it's used to send the dirtymap at the start
     * of the postcopy phase
     */
    unsigned long *unsentmap;
} *migration_bitmap_rcu;
Since this is only a local savevm, migration_bitmap_rcu->unsentmap is clearly not used and can be ignored from here on. migration_dirty_pages is set to the number of pages guest RAM occupies; the relevant excerpt of ram_save_setup is sketched below.
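A condensed excerpt of this part of ram_save_setup (migration/ram.c, v2.5.0; the postcopy unsentmap allocation and the surrounding locking are omitted, so treat it as approximate):

ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
/* mark every page dirty so that the first pass transfers all of RAM */
bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
migration_dirty_pages = ram_bitmap_pages;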
Next comes memory_global_dirty_log_start, where MEMORY_LISTENER_CALL_GLOBAL invokes the callbacks of every registered MemoryListener:

#define MEMORY_LISTENER_CALL_GLOBAL(_callback, _direction, _args...)    \
    do {                                                                \
        MemoryListener *_listener;                                      \
                                                                        \
        switch (_direction) {                                           \
        case Forward:                                                   \
            /* the same walk-the-list idea as the handlers loop above */\
            QTAILQ_FOREACH(_listener, &memory_listeners, link) {        \
                if (_listener->_callback) {                             \
                    _listener->_callback(_listener, ##_args);           \
                }                                                       \
            }                                                           \
            break;                                                      \
        case Reverse:                                                   \
            QTAILQ_FOREACH_REVERSE(_listener, &memory_listeners,        \
                                   memory_listeners, link) {            \
                if (_listener->_callback) {                             \
                    _listener->_callback(_listener, ##_args);           \
                }                                                       \
            }                                                           \
            break;                                                      \
        default:                                                        \
            abort();                                                    \
        }                                                               \
    } while (0)
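For context, memory_global_dirty_log_start itself is short; a sketch of it (memory.c; the v2.5.0 code also refreshes the memory-region state in a transaction, omitted here):

void memory_global_dirty_log_start(void)
{
    global_dirty_log = true;

    /* notify every registered MemoryListener that dirty logging begins */
    MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward);
}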
MemoryListeners are registered onto memory_listeners via memory_listener_register. There are quite a few registration call sites, but the ones we care about here are:

kvm_memory_listener_register(s, &s->memory_listener,
                             &address_space_memory, 0);
memory_listener_register(&kvm_io_listener, &address_space_io);
The focus remains address_space_memory.
-- To be continued --
-- End --