前言
qemu 编译版本 7.0.0
在线源码阅读
# qemu 编译
./configure --target-list=x86_64-softmmu --enable-debug --enable-debug-info --enable-kvm \
--enable-trace-backends=simple --enable-virtfs
# 其中 --enable-debug 是调试qemu自身所需
将当前目录路径加入到 $HOME/.gdbinit
add-auto-load-safe-path 路径
# 例如
add-auto-load-safe-path /home/ostest/linux-test/
编译器默认没有把宏定义扩展信息编译进二进制文件(man gcc
)。编译时添加-gdwarf-2
和-g3
这两个参数就可以查看调试宏了 。-g3
参数,在编译时会将扩展的debug信息编译进二进制文件里面,并且包括宏定义信息
E1000和TAP接口,在QEMU中互为peer, 数据结构关系参考
参考 qemu: 设备后端模拟
启动命令
gdb --args ./build/qemu-system-x86_64 \
-m 2G -smp 2 \
--enable-kvm \
-name plr2 \
-boot order=c \
-drive file=/home/ostest/linux-test/run-test/ubuntu2.img,index=0,media=disk,format=qcow2 \
-nic tap,id=plr2,model=e1000,mac=52:54:76:12:54:32,ifname=tap0,script=no,downscript=no \
-vnc :2
qemu 启动后,使用start
后,添加的断点信息
e1000 初始化
e1000_register_types
调用过程
e1000_class_init
记录
qemu net 初始化
net_param_nic
函数
net_client_init
函数:
通过source insight 没有查看到visit_type_Netdev
内容,通过gdb list
阅读
(gdb) list 818, 850
818 bool visit_type_Netdev(Visitor *v, const char *name,
819 Netdev **obj, Error **errp)
820 {
821 bool ok = false;
822
823 if (!visit_start_struct(v, name, (void **)obj, sizeof(Netdev), errp)) {
824 return false;
825 }
826 if (!*obj) {
827 /* incomplete */
828 assert(visit_is_dealloc(v));
829 ok = true;
830 goto out_obj;
831 }
832 if (!visit_type_Netdev_members(v, *obj, errp)) {
833 goto out_obj;
834 }
835 ok = visit_check_struct(v, errp);
836 out_obj:
837 visit_end_struct(v, (void **)obj);
838 if (!ok && visit_is_input(v)) {
839 qapi_free_Netdev(*obj);
840 *obj = NULL;
841 }
842 return ok;
843 }
执行完 net_client_init
net_client_init1
执行,需要进入函数后查看
source insight 没有找到 Netdev
的定义
1058 nc = qemu_find_netdev(netdev->id);
(gdb) n
1059 if (nc) {
(gdb) p nc
$28 = (NetClientState *) 0x0
(gdb) n
1064 if (net_client_init_fun[netdev->type](netdev, netdev->id, peer, errp) < 0) {
(gdb) s
net_init_tap (netdev=0x0, name=0x7fffffffe2d0 "\020", peer=0x55555583f940 <_start>, errp=0x7fffffffddd0) at ../net/tap.c:796
796 {
NetdevTapOptions *tap
结构体内容
net_tap_init
函数
pc_nic_init
(gdb) p nb_nics
$40 = 1
(gdb) p *nd
// 这个netdev 就是前面的net_tap_info
$41 = {macaddr = {a = "RTv\022T2"}, model = 0x5555568a1540 "e1000", name = 0x0, devaddr = 0x0,
netdev = 0x555556af3730, used = 1, instantiated = 0, nvectors = 0}
p pcmc->default_nic_model
$43 = 0x5555560b8313 "e1000"
pci_e1000_realize
虚拟网卡
在虚拟机创建之初,Qemu使用malloc()
从其进程地址空间中申请了一块与虚拟机的物理内存大小相等的区域,该块区域就是作为了Guest OS的物理内存来使用。
在Qemu创建虚拟机的时候会向KVM通告Guest OS所使用的物理内存布局,即KVMSlot
// include/sysemu/kvm_int.h l:16
/* One memory slot: maps a block of guest-physical addresses onto QEMU
 * userspace memory (field meanings below are taken from the notes' own
 * annotations -- confirm against the QEMU tree). */
typedef struct KVMSlot
{
hwaddr start_addr; // start address of the guest-physical address block
ram_addr_t memory_size; // size of the block
void *ram; // QEMU userspace address backing the block
int slot; // slot number
int flags; // memory attributes
int old_flags;
/* Dirty bitmap cache for the slot */
unsigned long *dirty_bmap;
unsigned long dirty_bmap_size;
/* Cache of the address space ID */
int as_id;
/* Cache of the offset in ram address space */
ram_addr_t ram_start_offset;
} KVMSlot;
Guest OS访问任意一块物理地址GPA时,都可以通过KVMSlot记载的关系来得到Qemu的虚拟地址映射即HVA
当前的网卡主要涉及到MMIO,而且还是PCI接口,所以使用 PCI mem
(gdb) p *pci_dev
$48 = {qdev = {parent_obj = {class = 0x555556a33da0, free = 0x55555583cd60 <g_free@plt>,
Python Exception <class 'gdb.error'> There is no member named keys.:
properties = 0x555557549520, ref = 4, parent = 0x555556abbb80}, id = 0x0,
canonical_path = 0x0, realized = false, pending_deleted_event = false,
pending_deleted_expires_ms = 0, opts = 0x0, hotplugged = 0,
allow_unplug_during_migration = false, parent_bus = 0x555556c4ef90, gpios = {lh_first = 0x0},
clocks = {lh_first = 0x0}, child_bus = {lh_first = 0x0}, num_child_bus = 0,
instance_id_alias = -1, alias_required_for_version = 0, reset = {count = 0,
hold_phase_pending = false, exit_phase_in_progress = false}}, partially_hotplugged = false,
has_power = false, config = 0x55555754a190 "\206\200\016\020",
cmask = 0x55555754a2a0 "\377\377\377\377", wmask = 0x5555575498e0 "", w1cmask = 0x5555575499f0 "",
used = 0x555557549b00 "", devfn = 24, requester_id_cache = {dev = 0x5555575589c0,
type = PCI_REQ_ID_BDF}, name = "e1000", '\000' <repeats 58 times>, io_regions = {{addr = 0,
size = 0, type = 0 '\000', memory = 0x0, address_space = 0x0}, {addr = 0, size = 0,
type = 0 '\000', memory = 0x0, address_space = 0x0}, {addr = 0, size = 0, type = 0 '\000',
memory = 0x0, address_space = 0x0}, {addr = 0, size = 0, type = 0 '\000', memory = 0x0,
address_space = 0x0}, {addr = 0, size = 0, type = 0 '\000', memory = 0x0,
address_space = 0x0}, {addr = 0, size = 0, type = 0 '\000', memory = 0x0,
address_space = 0x0}, {addr = 0, size = 0, type = 0 '\000', memory = 0x0,
address_space = 0x0}}, bus_master_as = {rcu = {next = 0x0, func = 0x0},
name = 0x55555754cf90 "e1000", root = 0x555557558c50, current_map = 0x555556ac5200,
ioeventfd_nb = 0, ioeventfds = 0x0, listeners = {tqh_first = 0x0, tqh_circ = {tql_next = 0x0,
tql_prev = 0x555557558c28}}, address_spaces_link = {tqe_next = 0x0, tqe_circ = {
tql_next = 0x0, tql_prev = 0x55555708d108}}}, bus_master_container_region = {parent_obj = {
class = 0x555556934ed0, free = 0x0, Python Exception <class 'gdb.error'> There is no member named keys.:
properties = 0x555557549460, ref = 1,
parent = 0x5555575589c0}, romd_mode = true, ram = false, subpage = false, readonly = false,
nonvolatile = false, rom_device = false, flush_coalesced_mmio = false,
dirty_log_mask = 0 '\000', is_iommu = false, ram_block = 0x0, owner = 0x5555575589c0, ops =
0x555556687ee0 <unassigned_mem_ops>, opaque = 0x0, container = 0x0, mapped_via_alias = 0,
size = 18446744073709551616, addr = 0,
destructor = 0x555555bc0830 <memory_region_destructor_none>, align = 0, terminates = false,
ram_device = false, enabled = true, warning_printed = false, vga_logging_count = 0 '\000',
alias = 0x0, alias_offset = 0, priority = 0, subregions = {tqh_first = 0x0, tqh_circ = {
tql_next = 0x0, tql_prev = 0x555557558d08}}, subregions_link = {tqe_next = 0x0, tqe_circ = {
--Type <RET> for more, q to quit, c to continue without paging--
(gdb) p *d
value of type `E1000State' requires 208544 bytes, which is more than max-value-size
(gdb) p *(d->nic)
$51 = {ncs = 0x55555754bd40, conf = 0x5555575593c8, opaque = 0x5555575589c0, peer_deleted = false}
(gdb) p *(d->nic->ncs)
$52 = {info = 0x5555566cb520 <net_e1000_info>, link_down = 0, next = {tqe_next = 0x0, tqe_circ = {
tql_next = 0x0, tql_prev = 0x555556af3740}}, peer = 0x555556af3730,
incoming_queue = 0x55555754bec0, model = 0x55555754bbe0 "e1000", name = 0x55555754bc00 "e1000.0",
info_str = '\000' <repeats 255 times>, receive_disabled = 0, destructor = 0x0, queue_index = 0,
rxfilter_notify_enabled = 0, vring_enable = 0, vnet_hdr_len = 0, is_netdev = false,
do_not_pad = false, is_datapath = true, filters = {tqh_first = 0x0, tqh_circ = {tql_next = 0x0,
tql_prev = 0x55555754bea8}}}
(gdb) p *(d->nic->conf)
$53 = {macaddr = {a = "RTv\022T2"}, peers = {ncs = {0x555556af3730, 0x0 <repeats 1023 times>},
queues = 1}, bootindex = -1}
(gdb) p *(d->nic->ncs->peer)
$54 = {info = 0x5555566aab80 <net_tap_info>, link_down = 0, next = {tqe_next = 0x55555754bd40,
tqe_circ = {tql_next = 0x55555754bd40, tql_prev = 0x5555567c2290 <net_clients>}}, ....
(gdb) p *(d->nic->conf.peers.ncs[0])
$57 = {info = 0x5555566aab80 <net_tap_info>, link_down = 0, next = {tqe_next = 0x55555754bd40,
tqe_circ = {tql_next = 0x55555754bd40, tql_prev = 0x5555567c2290 <net_clients>}}, ....
(gdb) p (d->nic->ncs[0]->info_str)
$58 = "model=e1000,macaddr=52:54:76:12:54:32", '\000' <repeats 218 times>
网络包发送
当Guest OS中有数据包要发送时,在全虚拟化情况下,Guest会像通常那样走普通网卡驱动流程,将数据包的内容写入待发送的sk_buff
的地址空间中,同时将待发送的sk_buff
地址放入发送ring中,配置网卡的发送寄存器就可将数据包发送出去,而在Guest模式下,当Guest访问PIO或者MMIO时会触发VM Exit,进入到Host OS 中的kvm。而设备的模拟是在Qemu中进行的,KVM对该种异常退出无法处理,会将该退出原因返回给Qemu来处理
(gdb) b e1000_mmio_write
Breakpoint 19 at 0x5555559e4c99: file ../hw/net/e1000.c,
// start_xmit 开始发送
(gdb) b start_xmit
Breakpoint 20 at 0x5555559e39e1: file ../hw/net/e1000.c, line
start_xmit
函数调用
// 注意:我这代码修改过行号与源码不一致
Thread 5 "qemu-system-x86" hit Breakpoint 20, start_xmit (s=0x7fff67dff560) at ../hw/net/e1000.c:816
816 {
(gdb) n
817 PCIDevice *d = PCI_DEVICE(s);
(gdb) n
820 uint32_t tdh_start = s->mac_reg[TDH], cause = E1000_ICS_TXQE;
(gdb) p d
$60 = (PCIDevice *) 0x5555575589c0
(gdb) n
// 存在不少情况调用这个,连续几次才跳过
822 if (!(s->mac_reg[TCTL] & E1000_TCTL_EN)) {
(gdb) n
824 return;
(gdb) p s->mac_reg[TCTL]
$61 = 17035514
(gdb) n
832 while (s->mac_reg[TDH] != s->mac_reg[TDT]) {
(gdb) n
858 s->tx.busy = false;
(gdb) n
859 set_ics(s, 0, cause);
(gdb) n
860 }
// 将断点打在 pci_dma_read(d, base, &desc, sizeof(desc));
(gdb) b 835
(gdb) n
841 process_tx_desc(s, &desc);
(gdb) p desc
$62 = {buffer_addr = 70205442, lower = {data = 2332033353, flags = {length = 329, cso = 0 '\000',
cmd = 139 '\213'}}, upper = {data = 0, fields = {status = 0 '\000', css = 0 '\000',
special = 0}}}
// 执行新的函数
process_tx_desc (s=0x5555559e1d47 <pci_dma_rw+80>, dp=0x7fff67dfe4d0) at ../hw/net/e1000.c:702
702 {
(gdb) n
703 PCIDevice *d = PCI_DEVICE(s);
(gdb) n
704 uint32_t txd_lower = le32_to_cpu(dp->lower.data);
(gdb) p d
// 记录一下:与上面的输出是一样的
$63 = (PCIDevice *) 0x5555575589c0
(gdb) n
712 s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
(gdb) p *xp
$64 = {lower_setup = {ip_config = 70205442, ip_fields = {ipcss = 2 '\002', ipcso = 64 '@',
ipcse = 1071}}, upper_setup = {tcp_config = 0, tcp_fields = {tucss = 0 '\000',
tucso = 0 '\000', tucse = 0}}, cmd_and_length = 2332033353, tcp_seg_setup = {data = 0,
fields = {status = 0 '\000', hdr_len = 0 '\000', mss = 0}}}
(gdb) p tp->data
$65 = '\000' <repeats 65535 times>
(gdb) p tp->header
$66 = '\000' <repeats 255 times>
(gdb) p tp->size
$67 = 0
(gdb) n
731 tp->cptse = 0;
(gdb) n
734 if (e1000x_vlan_enabled(s->mac_reg) &&
(gdb) n
744 addr = le64_to_cpu(dp->buffer_addr);
(gdb) p *dp
$68 = {buffer_addr = 70205442, lower = {data = 2332033353, flags = {length = 329, cso = 0 '\000',
cmd = 139 '\213'}}, upper = {data = 0, fields = {status = 0 '\000', css = 0 '\000',
special = 0}}}
(gdb) n
773 pci_dma_read(d, addr, tp->data + tp->size, split_size);
(gdb) p split_size
$69 = 329
(gdb) p tp->size
$70 = 0
(gdb) p tp->data
$71 = '\000' <repeats 65535 times>
(gdb) n
774 tp->size += split_size;
// 此时已经从guest 中获取到数据了
(gdb) x /20xb tp->data
0x55555757b7d0: 0xff 0xff 0xff 0xff 0xff 0xff 0x52 0x54
0x55555757b7d8: 0x76 0x12 0x54 0x32 0x08 0x00 0x45 0xc0
0x55555757b7e0: 0x01 0x3b 0x00 0x00
(gdb) s
xmit_seg (s=0x5555575589c0) at ../hw/net/e1000.c:632
632 {
638 if (tp->cptse) {
(gdb) n
675 if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
(gdb) n
678 if (tp->sum_needed & E1000_TXD_POPTS_IXSM) {
(gdb) n
681 if (tp->vlan_needed) {
(gdb) n
690 e1000_send_packet(s, tp->data, tp->size);
(gdb)
(gdb) s
e1000_send_packet()
(gdb) s
620 NetClientState *nc = qemu_get_queue(s->nic);
(gdb) info args
s = 0x5555575589c0
buf = 0x55555757b7d0 "\377\377\377\377\377\377RTv\022T2\b"
size = 329
(gdb) n
624 qemu_send_packet(nc, buf, size);
(gdb) s
qemu_send_packet(); at ../net/net.c:703
(gdb) s
qemu_send_packet_async () at ../net/net.c:697
(gdb) i args
sender = 0x55555754bd40
buf = 0x55555757b7d0 "\377\377\377\377\377\377RTv\022T2\b"
size = 329
sent_cb = 0x0
(gdb) s
qemu_send_packet_async_with_flags () at ../net/net.c:663
(gdb) p flags
$76 = 0
(gdb) s
filter_receive () at ../net/net.c:616
(gdb) i args
nc = 0x55555754bd40
direction = NET_FILTER_DIRECTION_TX
sender = 0x55555754bd40
(gdb) s
filter_receive_iov
(nc=0x55555754bd40, direction=NET_FILTER_DIRECTION_TX, sender=0x55555754bd40, flags=0, iov=0x7fff67dfe320, iovcnt=1, sent_cb=0x0) at ../net/net.c:586
(gdb) p nc->filters
$77 = {tqh_first = 0x0, tqh_circ = {tql_next = 0x0, tql_prev = 0x55555754bea8}}
(gdb) n
683 ret = filter_receive(sender->peer, NET_FILTER_DIRECTION_RX,
(gdb) s
filter_receive_iov(nc=0x555556af3730, direction=NET_FILTER_DIRECTION_RX, sender=0x55555754bd40, flags=0, iov=0x7fff67dfe320, iovcnt=1, sent_cb=0x0) at ../net/net.c:586
(gdb) p *nc
$78 = {info = 0x5555566aab80 <net_tap_info>, link_down = 0, next = {tqe_next = 0x55555754bd40, ....
(gdb) p *sender
$79 = {info = 0x5555566cb520 <net_e1000_info>, link_down = 0, next = ...
(gdb) n
689 queue = sender->peer->incoming_queue;
(gdb) p *queue
$82 = {opaque = 0x555556af3730, nq_maxlen = 10000, nq_count = 0,
deliver = 0x5555558ed408 <qemu_deliver_packet_iov>, packets = {tqh_first = 0x0, tqh_circ = {
tql_next = 0x0, tql_prev = 0x5555568a1918}}, delivering = 0}
(gdb) s
qemu_net_queue_send () at ../net/queue.c:213
(gdb) s
216 if (queue->delivering || !qemu_can_send_packet(sender)) {
(gdb) n
221 ret = qemu_net_queue_deliver(queue, sender, flags, data, size);
(gdb) s
qemu_deliver_packet_iov () at ../net/net.c:768
(gdb) p *nc
$83 = {info = 0x5555566aab80 <net_tap_info>, link_down = 0 ....
(gdb) n
781 if (nc->info->receive_iov && !(flags & QEMU_NET_PACKET_FLAG_RAW)) {
(gdb) s
782 ret = nc->info->receive_iov(nc, iov, iovcnt);
(gdb) s
tap_receive_iov ( nc=0x555556af3730, iov=0x7fff67dfe2e0, iovcnt=1) at ../net/tap.c:120
(gdb) s
tap_write_packet (s=0x555556af3730, iov=0x7fff67dfe1c0, iovcnt=2) at ../net/tap.c:103
(gdb) s
__GI___writev (fd=13, iov=0x7fff67dfe1c0, iovcnt=2) at ../sysdeps/unix/sysv/linux/writev.c:25
25 ../sysdeps/unix/sysv/linux/writev.c: 没有那个文件或目录.
(gdb) finish
Run till exit from #0 __GI___writev (fd=13, iov=0x7fff67dfe1c0, iovcnt=2)
at ../sysdeps/unix/sysv/linux/writev.c:25
0x00005555558f890f in tap_write_packet (s=0x555556af3730, iov=0x7fff67dfe1c0, iovcnt=2) at ../net/tap.c:107
107 len = writev(s->fd, iov, iovcnt);
Value returned is $85 = 339
(gdb) p iov->iov_len
$86 = 10
(gdb) p iov[1]->iov_len
$87 = 329
(gdb) x /10xb iov->iov_base
0x7fff67dfe22a: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
0x7fff67dfe232: 0x00 0x00
(gdb) s
qemu_net_queue_flush (queue=0x149) at ../net/queue.c:274
274 {
(gdb) n
275 if (queue->delivering)
(gdb) n
278 while (!QTAILQ_EMPTY(&queue->packets)) {
(gdb) n
303 return true;
(gdb) n
304 }
// 返回到e1000_send_packet 函数
(gdb) s
e1000x_increase_size_stats (mac=0x5555575589c0, size_regs=0x55555757b7d0, size=21845)
(gdb) finish
Run till exit from #0 process_tx_desc (s=0x5555575589c0, dp=0x7fff67dfe540) at ../hw/net/e1000.c:784
start_xmit (s=0x5555575589c0) at ../hw/net/e1000.c:842
842 cause |= txdesc_writeback(s, base, &desc);
(gdb) n
844 if (++s->mac_reg[TDH] * sizeof(desc) >= s->mac_reg[TDLEN])
(gdb) p cause
$88 = 3
qemu mtree 报错
修改 scripts/qemugdb/mtree.py
内容
import gdb
import traceback
def isnull(ptr):
    '''Return whether the gdb value *ptr* compares equal to a NULL pointer.

    The zero value is cast to ptr's own type before comparing, so both
    sides of the comparison have the same gdb type.
    '''
    null_of_same_type = gdb.Value(0).cast(ptr.type)
    return ptr == null_of_same_type
def int128(p):
    '''Read an Int128 type to a python integer.

    QEMU can be built with native Int128 support so we need to detect
    if the value is a structure or the native type.

    A struct value is assembled from its 'lo' and 'hi' members.  A native
    value is parsed from gdb's textual rendering of it.  That text is not
    always hexadecimal: gdb may print the value in decimal (for example
    18446744073709551616 for 2**64), and reading such text as base 16
    yields a wildly wrong size.  Base 0 lets Python choose the radix from
    the text itself (plain digits -> decimal, 0x prefix -> hexadecimal).
    '''
    if p.type.code == gdb.TYPE_CODE_STRUCT:
        return int(p['lo']) + (int(p['hi']) << 64)

    text = str(p).strip()
    try:
        return int(text, 0)
    except ValueError:
        # Text that base-0 parsing rejects (e.g. bare hex digits without
        # a prefix) is still read as hexadecimal, as it always was.
        return int(text, 16)
class MtreeCommand(gdb.Command):
    '''Display the memory tree hierarchy'''

    def __init__(self):
        # Register this command with gdb under the name 'qemu mtree'.
        gdb.Command.__init__(self, 'qemu mtree', gdb.COMMAND_DATA,
                             gdb.COMPLETE_NONE)
        self.queue = []

    def invoke(self, arg, from_tty):
        '''Entry point gdb calls when the user types 'qemu mtree'.

        Walks the trees rooted at address_space_memory and
        address_space_io.  Any exception has its traceback printed (so the
        failing location is visible) and is then re-raised.
        '''
        # Start every run from a clean state: a previous run that raised
        # part-way through would otherwise leave stale regions queued.
        self.seen = set()
        self.queue = []
        try:
            self.queue_root('address_space_memory')
            self.queue_root('address_space_io')
            self.process_queue()
        except Exception:
            traceback.print_exc()
            raise

    def queue_root(self, varname):
        '''Queue the 'root' member of the expression named by varname.'''
        ptr = gdb.parse_and_eval(varname)['root']
        self.queue.append(ptr)

    def process_queue(self):
        '''Print every queued region once, skipping those already seen.'''
        while self.queue:
            ptr = self.queue.pop(0)
            if int(ptr) in self.seen:
                continue
            self.print_item(ptr)

    def print_item(self, ptr, offset = gdb.Value(0), level = 0):
        '''Print the region at ptr, its alias line, and its subregions.

        ptr    -- gdb value pointing at the region to print
        offset -- address of the enclosing region, added to ptr's 'addr'
        level  -- nesting depth, used to indent the output

        An aliased region is queued so it is printed later as its own
        tree; subregions are printed recursively one level deeper.
        '''
        self.seen.add(int(ptr))
        addr = ptr['addr']
        addr += offset
        size = int128(ptr['size'])
        alias = ptr['alias']
        klass = ''
        if not isnull(alias):
            klass = ' (alias)'
        elif not isnull(ptr['ops']):
            klass = ' (I/O)'
        elif bool(ptr['ram']):
            klass = ' (RAM)'
        try:
            gdb.write('%s%016x-%016x %s%s (@ %s)\n'
                      % (' ' * level,
                         int(addr),
                         # The end address is computed on Python ints rather
                         # than as a gdb.Value expression: size is a Python
                         # int that can be very large, and the notes record
                         # that adding it to the gdb value raised an error
                         # (presumably an overflow in the conversion).
                         int(int(addr) + (size - 1)),
                         ptr['name'].string(),
                         klass,
                         ptr,
                         ),
                      gdb.STDOUT)
        except Exception:
            # Show the operands that could not be formatted, then let the
            # caller report the failure.
            print(type(addr))
            print(type(size))
            print(addr,size)
            raise
        if not isnull(alias):
            gdb.write('%s alias: %s@%016x (@ %s)\n' %
                      (' ' * level,
                       alias['name'].string(),
                       int(ptr['alias_offset']),
                       alias,
                       ),
                      gdb.STDOUT)
            self.queue.append(alias)
        subregion = ptr['subregions']['tqh_first']
        level += 1
        while not isnull(subregion):
            self.print_item(subregion, addr, level)
            subregion = subregion['subregions_link']['tqe_next']