本节通过rtl8139网卡的例子,从虚拟驱动模块入手,来分析Qemu的网络虚拟化原理与架构。
10.1.1 rtl8139的虚拟网卡驱动
(1) 虚拟网卡设备的初始化
/* Property list for the rtl8139 device: NIC config plus the mandatory
 * end-of-list terminator. (from rtl8139.c) */
static Property rtl8139_properties[] = {
    DEFINE_NIC_PROPERTIES(RTL8139State, conf),
    DEFINE_PROP_END_OF_LIST(),
};
/*
 * QOM class-init hook for the rtl8139 device type: wires up the PCI
 * realize/unrealize callbacks and the generic device reset, migration
 * state and property list.
 */
static void rtl8139_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->init = pci_rtl8139_init;    /* called when the PCI device is created */
    k->exit = pci_rtl8139_uninit;
    /* ... fields elided in the excerpt ... */
    dc->reset = rtl8139_reset;
    dc->vmsd = &vmstate_rtl8139;   /* migration (save/load) state description */
    dc->props = rtl8139_properties;
}
Property rtl8139_properties 包含了网卡设备需要用到的属性
/* Common NIC properties: MAC address, legacy "vlan" peer, "netdev"
 * back-end peer, and boot order index (-1 = unspecified). */
#define DEFINE_NIC_PROPERTIES(_state, _conf) \
DEFINE_PROP_MACADDR("mac", _state, _conf.macaddr), \
DEFINE_PROP_VLAN("vlan", _state, _conf.peer), \
DEFINE_PROP_NETDEV("netdev", _state, _conf.peer), \
DEFINE_PROP_INT32("bootindex", _state, _conf.bootindex, -1)
mac为网卡的mac地址。 bootindex表明该网卡作为引导设备的顺序。 vlan 和netdev用于存储虚拟网卡的数据需要发送到的peer的信息;peer对应的结构体为NetClientState。
struct NetClientState {
NetClientInfo *info; //指向用于处理数据接收的函数指针集合
intlink_down; //设备的link状态
QTAILQ_ENTRY(NetClientState) next;
NetClientState *peer; //该设备的peer
NetQueue*send_queue; //设备的数据发送队列
char *model;
char *name;
charinfo_str[256];
unsignedreceive_disabled : 1;
};
下面是初始化函数
/*
 * PCIDeviceClass::init for rtl8139: creates the Net-layer NIC object
 * (s->conf carries the peer information, see 10.1.2) and formats the
 * NIC's human-readable info string from its MAC address.
 */
static int pci_rtl8139_init(PCIDevice *dev)
{
    /* ... excerpt elided ... */
    s->nic = qemu_new_nic(&net_rtl8139_info, &s->conf,
                          object_get_typename(OBJECT(dev)), dev->qdev.id, s);
    qemu_format_nic_info_str(&s->nic->nc, s->conf.macaddr.a);
    /* ... excerpt elided ... */
}
s->conf中存储了该网卡设备的peer端信息,10.1.2节将分析网卡的实例初始化过程。
qemu_new_nic是qemu Net层的接口函数(net/net.c),流程如下:
a) NetClientState *nc = qemu_new_net_client(info, conf->peer, model, name);
b) nic = DO_UPCAST(NICState, nc, nc);
   nic->conf = conf;
   nic->opaque = opaque; // opaque为RTL8139State
/*
 * Allocate and register a new NetClientState.
 * Links it symmetrically with @peer, adds it to the global net_clients
 * list and creates its send queue.
 */
NetClientState *qemu_new_net_client(NetClientInfo *info,
                                    NetClientState *peer,
                                    const char *model,
                                    const char *name)
{
    NetClientState *nc;

    /* info->size is the size of the full derived object,
     * e.g. sizeof(NICState) for a NIC front end */
    nc = g_malloc0(info->size);
    nc->info = info;
    nc->model = g_strdup(model);
    if (name) {
        nc->name = g_strdup(name);
    } else {
        nc->name = assign_name(nc, model);
    }
    if (peer) {
        /* wire both directions so either side can deliver to the other */
        nc->peer = peer;
        peer->peer = nc;
    }
    /* add this NetClientState to the global net_clients list */
    QTAILQ_INSERT_TAIL(&net_clients, nc, next);
    /* create the send queue; the structure is simple enough to read on its own */
    nc->send_queue = qemu_new_net_queue(nc);
    return nc;
}
(2)虚拟网卡的数据传输
net_rtl8139_info 实现了网卡设备NetClientState的NetClientInfo,用于实现网卡设备接收数据的处理:
/* Receive-side callback table for the rtl8139 NIC's NetClientState. */
static NetClientInfo net_rtl8139_info = {
    .type = NET_CLIENT_OPTIONS_KIND_NIC,
    .size = sizeof(NICState),
    .can_receive = rtl8139_can_receive,
    .receive = rtl8139_receive,
    .cleanup = rtl8139_cleanup,
};
.receive由该网卡对应的peer端在收到数据后调用;同理,当网卡发送数据后,会调用对应peer端的接收函数。
/* Called by the peer endpoint to push a packet into the emulated NIC;
 * @buf/@size are prepared by the peer's send path. */
static ssize_t rtl8139_receive(NetClientState *nc, const uint8_t *buf, size_t size)
{
    return rtl8139_do_receive(nc, buf, size, 1);
}
buf和size由peer端的发送函数准备好。rtl8139_do_receive的主要流程描述如下:
a. 检查网卡状态能否接收数据
b. 根据网卡配置处理多播包
c. 读取ring buffer的数据地址(pci_dma_read),并将数据buf写到这些地址(pci_dma_write)
d. 用rtl8139_update_irq ==> qemu_set_irq 向虚拟机注入中断
同理,网卡向外发送数据的过程,就是调用其peer端接收函数的过程,其流程如下:
rtl8139_io_writel ==> 对寄存器TxStatus0 到 TxStatus0+4*4-1 的写触发网卡的发送功能:
rtl8139_TxStatus_write ==> rtl8139_transmit ==> rtl8139_transmit_one
a. 下面的代码将数据从desc中指定的位置读出来放到txbuffer中
pci_dma_read(&s->dev, s->TxAddr[descriptor], txbuffer, txsize);
b. rtl8139_transfer_frame(s, txbuffer, txsize, 0, NULL);
c. rtl8139_update_irq(s) 传输完成,触发中断
rtl8139_transfer_frame ==> qemu_send_packet(&s->nic->nc, buf, size) ==> qemu_send_packet_async(nc, buf, size, NULL) ==>
qemu_send_packet_async_with_flags(sender, QEMU_NET_PACKET_FLAG_NONE,
                                  buf, size, sent_cb);
/*
 * Send a packet from @sender toward its peer: delivery goes through the
 * send queue owned by the sender's peer endpoint.
 */
static ssize_t qemu_send_packet_async_with_flags(NetClientState *sender,
                                                 unsigned flags,
                                                 const uint8_t *buf, int size,
                                                 NetPacketSent *sent_cb)
{
    /* ... excerpt elided ... */
    /* fetch the send queue of this NIC's peer */
    queue = sender->peer->send_queue;
    return qemu_net_queue_send(queue, sender, flags, buf, size, sent_cb);
}
/*
 * Try to deliver a packet immediately; when the receiver is not ready
 * (deliver returned 0) the packet is queued for a later flush instead.
 */
ssize_t qemu_net_queue_send(NetQueue *queue,
                            NetClientState *sender,
                            unsigned flags,
                            const uint8_t *data,
                            size_t size,
                            NetPacketSent *sent_cb)
{
    /* ... excerpt elided ... */
    ret = qemu_net_queue_deliver(queue, sender, flags, data, size);
    if (ret == 0) {
        /* receiver busy: keep the packet pending on the queue */
        qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);
        return 0;
    }
    qemu_net_queue_flush(queue);
    return ret;
}
qemu_net_queue_deliver ==> qemu_deliver_packet
/*
 * Final delivery step: invoke the receiving endpoint's receive callback.
 * @opaque is the destination NetClientState.  A return value of 0 means
 * the receiver is saturated, so its receive_disabled flag is set until
 * it signals readiness again.
 */
ssize_t qemu_deliver_packet(NetClientState *sender,
                            unsigned flags,
                            const uint8_t *data,
                            size_t size,
                            void *opaque)
{
    NetClientState *nc = opaque;
    /* ... excerpt elided ... */
    if (flags & QEMU_NET_PACKET_FLAG_RAW && nc->info->receive_raw) {
        ret = nc->info->receive_raw(nc, data, size);
    } else {
        ret = nc->info->receive(nc, data, size);
    }
    if (ret == 0) {
        nc->receive_disabled = 1;
    }
    return ret;
}
qemu_deliver_packet函数最后调用到了peer端的接收函数
10.1.2 rtl8139的虚拟网卡的实例化
pc_init1 ==> pci_nic_init_nofail(nd, "rtl8139", NULL);
nd为NICInfo nd_table[MAX_NICS]数组中的一项;该数组由main函数初始化,下一节分析
/*
 * Wrapper around pci_nic_init() that exits the process on failure.
 * (from pci.c)
 */
PCIDevice *pci_nic_init_nofail(NICInfo *nd, const char *default_model,
                               const char *default_devaddr)
{
    PCIDevice *res;

    /* "-net nic,model=?" prints the supported models and exits cleanly */
    if (qemu_show_nic_models(nd->model, pci_nic_models))
        exit(0);
    res = pci_nic_init(nd, default_model, default_devaddr);
    if (!res)
        exit(1);
    return res;
}
pci_nic_models定义了qemu支持的网卡类别
/* NIC models selectable with "-net nic,model=...". NULL-terminated. */
static const char * const pci_nic_models[] = {
"ne2k_pci",
"i82551",
"i82557b",
"i82559er",
"rtl8139",
"e1000",
"pcnet",
"virtio",
NULL
};
/*
 * Create and initialise a PCI NIC device described by @nd.
 * Returns the new PCIDevice, or NULL when the model is unknown or qdev
 * initialisation fails.
 */
PCIDevice *pci_nic_init(NICInfo *nd, const char *default_model,
                        const char *default_devaddr)
{
    const char *devaddr = nd->devaddr ? nd->devaddr : default_devaddr;
    /* ... excerpt elided ... */
    i = qemu_find_nic_model(nd, pci_nic_models, default_model);
    if (i < 0)
        return NULL;
    bus = pci_get_bus_devfn(&devfn, devaddr);
    /* ... excerpt elided ... */
    /* create the PCI NIC; pci_rtl8139_init ends up being called */
    pci_dev = pci_create(bus, devfn, pci_nic_names[i]);
    dev = &pci_dev->qdev;
    /* copy the NIC configuration into the device's qdev properties */
    qdev_set_nic_properties(dev, nd);
    if (qdev_init(dev) < 0)
        return NULL;
    return pci_dev;
}
/*
 * Transfer NIC configuration from a NICInfo into qdev properties:
 * MAC address, netdev back end and, optionally, the vector count.
 * (from qdev.c)
 */
void qdev_set_nic_properties(DeviceState *dev, NICInfo *nd)
{
    qdev_prop_set_macaddr(dev, "mac", nd->macaddr.a);
    if (nd->netdev) {
        qdev_prop_set_netdev(dev, "netdev", nd->netdev);
    }
    /* only set "vectors" when the device actually exposes that property */
    if (nd->nvectors != DEV_NVECTORS_UNSPECIFIED &&
        object_property_find(OBJECT(dev), "vectors", NULL)) {
        qdev_prop_set_uint32(dev, "vectors", nd->nvectors);
    }
    nd->instantiated = 1;
}
下一节,就分析nd结构的数据来源.
10.1.3 NICInfo初始化
main ==> net_init_clients
/*
 * Called from main(): instantiate every "-netdev" and "-net" command
 * line option.  With no explicit options a default NIC (plus SLIRP user
 * networking when available) is configured.  Returns 0 on success,
 * -1 on failure.
 */
int net_init_clients(void)
{
    QemuOptsList *net = qemu_find_opts("net");

    if (default_net) {
        /* if no clients, we use a default config */
        qemu_opts_set(net, NULL, "type", "nic");
#ifdef CONFIG_SLIRP
        qemu_opts_set(net, NULL, "type", "user");
#endif
    }

    QTAILQ_INIT(&net_clients);

    /* back ends ("-netdev") are created before the front ends ("-net") */
    if (qemu_opts_foreach(qemu_find_opts("netdev"), net_init_netdev, NULL, 1) == -1)
        return -1;
    if (qemu_opts_foreach(net, net_init_client, NULL, 1) == -1) {
        return -1;
    }
    return 0;
}
net_init_netdev 和 net_init_client 都会调用
net_client_init(opts, 1, &local_err); (net.c)
net_client_init==>net_client_init1 ==> net_client_init_fun[opts->kind](opts,name, peer)
下面是一个使用bridge网络方式的虚拟机启动命令行:
qemu-system-x86_64 /mnt/ubuntun.img -smp 2 -m 1024 -net nic -net tap,ifname=tap1,script=/etc/qemu-ifup,downscript=no
上面的参数中-net出现了2次, 第一个指定了网卡(采用默认的rtl8139),第二个指定了tap方式。
因此, qemu_opts_foreach会调用两次net_client_init; 而net_client_init1第一次会调用net_init_nic, 第二次调用net_init_tap
static int net_client_init1(const void *object, intis_netdev, Error **errp) {
...........
if (!is_netdev&&
(opts->kind != NET_CLIENT_OPTIONS_KIND_NIC ||
!opts->nic->has_netdev)) {
peer= net_hub_add_port(u.net->has_vlan ? u.net->vlan : 0, NULL);
}
if (net_client_init_fun[opts->kind](opts,name, peer) < 0) {
error_set(errp, QERR_DEVICE_INIT_FAILED,
NetClientOptionsKind_lookup[opts->kind]);
return -1;
}
}
第一次由于是网卡且has_netdev为0,所以peer 由net_hub_add_port建立,此时port为0.
static int (* constnet_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])(
constNetClientOptions *opts,
const char*name,
NetClientState *peer) = {
[NET_CLIENT_OPTIONS_KIND_NIC] = net_init_nic,
#ifdef CONFIG_SLIRP
[NET_CLIENT_OPTIONS_KIND_USER] = net_init_slirp,
#endif
[NET_CLIENT_OPTIONS_KIND_TAP] = net_init_tap,
[NET_CLIENT_OPTIONS_KIND_SOCKET] = net_init_socket,
........
};
对于使用虚拟网卡的case, net_init_nic会被调用。
/*
 * "-net nic" handler: fill in the next free slot of nd_table[] from the
 * parsed options.  The actual device is created later
 * (pc_init1 -> pci_nic_init_nofail).  Returns the table index on
 * success; error paths are elided in the excerpt.
 */
static int net_init_nic(const NetClientOptions *opts, const char *name,
                        NetClientState *peer)
{
    /* ... excerpt elided ... */
    nic = opts->nic;
    idx = nic_get_free_idx();
    /* ... excerpt elided ... */
    nd = &nd_table[idx];
    memset(nd, 0, sizeof(*nd));

    if (nic->has_netdev) {
        /* look up the netdev back end's NetClientState by its id */
        nd->netdev = qemu_find_netdev(nic->netdev);
    } else {
        /* otherwise the hub port created by net_client_init1 is the peer */
        nd->netdev = peer;
    }
    if (name) {
        nd->name = g_strdup(name);
    }
    if (nic->has_model) {
        nd->model = g_strdup(nic->model);
    }
    if (nic->has_addr) {
        nd->devaddr = g_strdup(nic->addr);
    }
    /* generate a default MAC when none was given on the command line */
    qemu_macaddr_default_if_unset(&nd->macaddr);
    if (nic->has_vectors) {
        nd->nvectors = nic->vectors;
    } else {
        nd->nvectors = DEV_NVECTORS_UNSPECIFIED;
    }
    nd->used = 1;
    nb_nics++;
    return idx;
}
net_client_init1第二次执行时也会调用net_hub_add_port建立peer,此时port为1。接下来net_init_tap会被调用,并且有如下的调用关系:
net_init_tap(tap.c) ==> net_tap_fd_init(peer, model, name, fd,vnet_hdr) ==>
qemu_new_net_client(&net_tap_info,peer, model, name)
问题在于网卡的peer是hub.port0,而net_tap的peer是hub.port1,这二者如何关联呢?
/* Callback table for a hub port's NetClientState. */
static NetClientInfo net_hub_port_info = {
    .type = NET_CLIENT_OPTIONS_KIND_HUBPORT,
    .size = sizeof(NetHubPort),
    .can_receive = net_hub_port_can_receive,
    .receive = net_hub_port_receive,
    .receive_iov = net_hub_port_receive_iov,
    .cleanup = net_hub_port_cleanup,
};
net_hub_port_receive ==> net_hub_receive
/*
 * Hub forwarding: broadcast a frame to every port of the hub except the
 * one it arrived on.  This is what connects the NIC's hub port (port 0)
 * with the tap's hub port (port 1).
 */
static ssize_t net_hub_receive(NetHub *hub, NetHubPort *source_port,
                               const uint8_t *buf, size_t len)
{
    NetHubPort *port;

    QLIST_FOREACH(port, &hub->ports, next) {
        if (port == source_port) {
            continue;   /* never echo a frame back to its origin */
        }
        qemu_send_packet(&port->nc, buf, len);
    }
    return len;
}
原来hub port会向除自己以外的port转发包
10.1.4 Net Tap模块
对于上节的参数,net_init_tap首先调用net_tap_init,流程如下:
a) 调用tap_open 建立tap设备
b) launch_script调用setup的脚本
接着调用net_tap_fd_init
a) qemu_new_net_client注册net_tap_info
staticNetClientInfo net_tap_info = {
.type =NET_CLIENT_OPTIONS_KIND_TAP,
.size =sizeof(TAPState),
.receive =tap_receive,
.receive_raw= tap_receive_raw,
.receive_iov= tap_receive_iov,
.poll =tap_poll,
.cleanup =tap_cleanup,
};
b) tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
c) tap_read_poll ==> tap_update_fd_handler
/*
 * (Re)register the tap file descriptor with the main loop, enabling the
 * read/write callbacks only while the corresponding polling flag is set.
 */
static void tap_update_fd_handler(TAPState *s)
{
    qemu_set_fd_handler2(s->fd,
                         s->read_poll ? tap_can_send : NULL,
                         s->read_poll ? tap_send : NULL,
                         s->write_poll ? tap_writable : NULL,
                         s);
}
注册tap设备的读写回调。
net_init_tap最后准备好downscript的字符串。
当实际网络数据传入tap时,tap_send被调用,tap_send将调用qemu_send_packet_async,最终将数据传给虚拟网卡的receive.
当虚拟网卡发送数据时,tap_receive会被调用:
tap_receive ==> tap_write_packet ==> writev 将数据发送出去。