vhost Source Code Analysis

1. Overview

The basic idea behind vhost: QEMU sets up shared buffers between the Guest and the Host. The Guest acts as the producer, filling the ring with available descriptor entries; the Host acts as the consumer, consuming buffers through those available descriptors. Once the Host has consumed a buffer, it notifies the Guest so the descriptors can be reclaimed.

This article analyzes the vhost packet transmit and receive paths, based on the 3.10 kernel.

2. vring

The Guest and Host exchange packets through a shared vring buffer. The relevant data structures are shown below; vring_virtqueue is the Guest-side structure and vhost_virtqueue is the Host-side structure.

struct vring_virtqueue
{
	struct virtqueue vq;

	/* Actual memory layout for this queue */
    /*
    Contains three rings: desc, avail and used. desc holds the descriptor
    entries, avail records the head ids of descriptors the Guest has made
    available, and used records the head ids of descriptors already consumed.
    */
	struct vring vring;

	/* Can we use weak barriers? */
	bool weak_barriers;

	/* Other side has made a mess, don't try any more. */
	bool broken;

	/* Host supports indirect buffers */
	bool indirect;

	/* Host publishes avail event idx */
	bool event;

	/* Head of free buffer list. */
	unsigned int free_head;
	/* Number we've added since last sync. */
	unsigned int num_added;

	/* Last used index we've seen. */
    /*
    The last used entry the Guest has reclaimed so far
    */
	u16 last_used_idx;

	/* How to notify other side. FIXME: commonalize hcalls! */
	void (*notify)(struct virtqueue *vq);

#ifdef DEBUG
	/* They're supposed to lock for us. */
	unsigned int in_use;

	/* Figure out if their kicks are too delayed. */
	bool last_add_time_valid;
	ktime_t last_add_time;
#endif

	/* Tokens for callbacks. */
	void *data[];
};
struct vhost_virtqueue {
	struct vhost_dev *dev;

	/* The actual ring of buffers. */
	struct mutex mutex;
	unsigned int num;
    /*
      QEMU passes the Guest addresses of the three rings to vhost via the
      VHOST_SET_VRING_ADDR ioctl; vhost stores them in these fields.
    */
	struct vring_desc __user *desc;
	struct vring_avail __user *avail;
	struct vring_used __user *used;
	struct file *kick;
	struct file *call;
	struct file *error;
	struct eventfd_ctx *call_ctx;
	struct eventfd_ctx *error_ctx;
	struct eventfd_ctx *log_ctx;

	struct vhost_poll poll;

	/* The routine to call when the Guest pings us, or timeout. */
	vhost_work_fn_t handle_kick;

	/* Last available index we saw. */
    /*
      Index of the next avail ring entry the Host will consume
    */
	u16 last_avail_idx;

	/* Caches available index value from user. */
	u16 avail_idx;

	/* Last index we used. */
	u16 last_used_idx;

	/* Used flags */
	u16 used_flags;

	/* Last used index value we have signalled on */
	u16 signalled_used;

	/* Last used index value we have signalled on */
	bool signalled_used_valid;

	/* Log writes to used structure. */
	bool log_used;
	u64 log_addr;

	struct iovec iov[UIO_MAXIOV];
	struct iovec *indirect;
	struct vring_used_elem *heads;
	/* We use a kind of RCU to access private pointer.
	 * All readers access it from worker, which makes it possible to
	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
	 * vhost_work execution acts instead of rcu_read_lock() and the end of
	 * vhost_work execution acts instead of rcu_read_unlock().
	 * Writers use virtqueue mutex. */
	void __rcu *private_data;
	/* Log write descriptors */
	void __user *log_base;
	struct vhost_log *log;
};

3. Transmit Path

3.1 Guest Side

static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index,
				  void (*callback)(struct virtqueue *vq),
				  const char *name,
				  u16 msix_vec)
{
	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
	struct virtio_pci_vq_info *info;
	struct virtqueue *vq;
	unsigned long flags, size;
	u16 num;
	int err;

	/* Select the queue we're interested in */
	iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);

	/* Check if queue is either not available or already active. */
    /* Read the number of ring entries configured for this queue */
	num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
	if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
		return ERR_PTR(-ENOENT);

	/* allocate and fill out our structure the represents an active
	 * queue */
	info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	info->num = num;
	info->msix_vector = msix_vec;

	size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
    /* Allocate the pages that back the vring */
	info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
	if (info->queue == NULL) {
		err = -ENOMEM;
		goto out_info;
	}

	/* activate the queue */
	iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);

	/* create the vring */
	vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
				 true, info->queue, vp_notify, callback, name);
	if (!vq) {
		err = -ENOMEM;
		goto out_activate_queue;
	}

	vq->priv = info;
	info->vq = vq;

	if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
		iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
		msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
		if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
			err = -EBUSY;
			goto out_assign;
		}
	}

	if (callback) {
		spin_lock_irqsave(&vp_dev->lock, flags);
		list_add(&info->node, &vp_dev->virtqueues);
		spin_unlock_irqrestore(&vp_dev->lock, flags);
	} else {
		INIT_LIST_HEAD(&info->node);
	}

	return vq;

out_assign:
	vring_del_virtqueue(vq);
out_activate_queue:
	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
	free_pages_exact(info->queue, size);
out_info:
	kfree(info);
	return ERR_PTR(err);
}

When the Guest initializes a virtio device, setup_vq first reads the configured number of ring entries, then allocates the memory backing the ring, and calls vring_new_virtqueue to create the vq and finish initializing the vring.
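
To make the memory layout concrete, here is a minimal userspace sketch (not kernel code, simplified types) mirroring what vring_size/vring_init compute: the descriptor table, avail ring and used ring live back-to-back in one allocation, with the used ring aligned up to VIRTIO_PCI_VRING_ALIGN:

/* Userspace sketch of the legacy vring layout that vring_size()/vring_init()
 * compute; struct sizes mirror include/uapi/linux/virtio_ring.h. */
#include <stdint.h>
#include <stdio.h>

struct vring_desc      { uint64_t addr; uint32_t len; uint16_t flags; uint16_t next; };
struct vring_used_elem { uint32_t id; uint32_t len; };

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned int num = 256;		/* ring entries, must be a power of two */
	unsigned long align = 4096;	/* VIRTIO_PCI_VRING_ALIGN */

	/* descriptor table, then avail: flags + idx + ring[num] + used_event */
	unsigned long desc_bytes  = num * sizeof(struct vring_desc);
	unsigned long avail_bytes = sizeof(uint16_t) * (3 + num);
	/* used ring starts at the next aligned boundary:
	 * flags + idx + ring[num] + avail_event */
	unsigned long used_off    = align_up(desc_bytes + avail_bytes, align);
	unsigned long used_bytes  = sizeof(uint16_t) * 3 +
				    num * sizeof(struct vring_used_elem);

	printf("desc  @ 0x%05lx, %5lu bytes\n", 0UL, desc_bytes);
	printf("avail @ 0x%05lx, %5lu bytes\n", desc_bytes, avail_bytes);
	printf("used  @ 0x%05lx, %5lu bytes\n", used_off, used_bytes);
	printf("vring_size = %lu bytes\n", used_off + used_bytes);
	return 0;
}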

struct virtqueue *vring_new_virtqueue(unsigned int index,
				      unsigned int num,
				      unsigned int vring_align,
				      struct virtio_device *vdev,
				      bool weak_barriers,
				      void *pages,
				      void (*notify)(struct virtqueue *),
				      void (*callback)(struct virtqueue *),
				      const char *name)
{
	struct vring_virtqueue *vq;
	unsigned int i;

	/* We assume num is a power of 2. */
	if (num & (num - 1)) {
		dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
		return NULL;
	}

	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
	if (!vq)
		return NULL;

	vring_init(&vq->vring, num, pages, vring_align);
	vq->vq.callback = callback;
	vq->vq.vdev = vdev;
	vq->vq.name = name;
	vq->vq.num_free = num;
	vq->vq.index = index;
	vq->notify = notify;
	vq->weak_barriers = weak_barriers;
	vq->broken = false;
	vq->last_used_idx = 0;
	vq->num_added = 0;
	list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
	vq->in_use = false;
	vq->last_add_time_valid = false;
#endif

	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

	/* No callback?  Tell other side not to bother us. */
	if (!callback)
		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;

	/* Put everything in free lists. */
	vq->free_head = 0;
    /* Chain the descriptors together to form the initial free list */
	for (i = 0; i < num-1; i++) {
		vq->vring.desc[i].next = i+1;
		vq->data[i] = NULL;
	}
    /* Initialize the data token array; buffers added to the ring are tracked
       in data and retrieved again on completion */
	vq->data[i] = NULL;

	return &vq->vq;
}

When the Guest wants to send a packet, start_xmit (virtio_net.c) runs; it eventually calls virtqueue_add, which records the skb in vq->data and fills in the descriptor chain;

static inline int virtqueue_add(struct virtqueue *_vq,
				struct scatterlist *sgs[],
				struct scatterlist *(*next)
				  (struct scatterlist *, unsigned int *),
				unsigned int total_out,
				unsigned int total_in,
				unsigned int out_sgs,
				unsigned int in_sgs,
				void *data,
				gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	struct scatterlist *sg;
	unsigned int i, n, avail, uninitialized_var(prev), total_sg;
	int head;

	START_USE(vq);

	BUG_ON(data == NULL);

#ifdef DEBUG
	{
		ktime_t now = ktime_get();

		/* No kick or get, with .1 second between?  Warn. */
		if (vq->last_add_time_valid)
			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
					    > 100);
		vq->last_add_time = now;
		vq->last_add_time_valid = true;
	}
#endif

	total_sg = total_in + total_out;

	/* If the host supports indirect descriptor tables, and we have multiple
	 * buffers, then go indirect. FIXME: tune this threshold */
	if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
		head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
					  total_in,
					  out_sgs, in_sgs, gfp);
		if (likely(head >= 0))
			goto add_head;
	}

	BUG_ON(total_sg > vq->vring.num);
	BUG_ON(total_sg == 0);

	if (vq->vq.num_free < total_sg) {
		pr_debug("Can't add buf len %i - avail = %i\n",
			 total_sg, vq->vq.num_free);
		/* FIXME: for historical reasons, we force a notify here if
		 * there are outgoing parts to the buffer.  Presumably the
		 * host should service the ring ASAP. */
		if (out_sgs)
			vq->notify(&vq->vq);
		END_USE(vq);
		return -ENOSPC;
	}

	/* We're about to use some buffers from the free list. */
	vq->vq.num_free -= total_sg;
    /* Grab the id of the first free descriptor */
	head = i = vq->free_head;
    /* Fill in the descriptor entries */
	for (n = 0; n < out_sgs; n++) {
		for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
			vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
			vq->vring.desc[i].addr = sg_phys(sg);
			vq->vring.desc[i].len = sg->length;
			prev = i;
			i = vq->vring.desc[i].next;
		}
	}
	for (; n < (out_sgs + in_sgs); n++) {
		for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
			vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
			vq->vring.desc[i].addr = sg_phys(sg);
			vq->vring.desc[i].len = sg->length;
			prev = i;
			i = vq->vring.desc[i].next;
		}
	}
	/* Last one doesn't continue. */
	vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;

	/* Update free pointer */
	vq->free_head = i;

add_head:
	/* Set token. */
	vq->data[head] = data;

	/* Put entry in available array (but don't update avail->idx until they
	 * do sync). */
	avail = (vq->vring.avail->idx & (vq->vring.num-1));
	vq->vring.avail->ring[avail] = head;

	/* Descriptors and available array need to be set before we expose the
	 * new available array entries. */
	virtio_wmb(vq->weak_barriers);
    /* Publish the new avail idx; the Host reads this when fetching available descriptors */
	vq->vring.avail->idx++;
	vq->num_added++;

	/* This is very unlikely, but theoretically possible.  Kick
	 * just in case. */
	if (unlikely(vq->num_added == (1 << 16) - 1))
		virtqueue_kick(_vq);

	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);

	return 0;
}
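
The free-list bookkeeping above is the subtle part: descriptors are chained through desc[i].next, free_head points at the first free entry, and each add carves a chain off the front of the list, then publishes the chain head in the avail ring. A minimal userspace sketch of just that bookkeeping (hypothetical simplified types, no DMA addresses or barriers):

/* Userspace sketch of virtqueue_add()'s descriptor bookkeeping only:
 * carve a chain off the free list, then publish its head in the avail ring. */
#include <stdio.h>

#define QSZ 8			/* ring size (power of two) */

struct desc { int next; };

static struct desc desc[QSZ];
static int free_head;		/* first free descriptor */
static int avail_ring[QSZ];	/* avail->ring */
static unsigned short avail_idx;/* avail->idx, free-running u16 */

static void init_ring(void)
{
	/* same loop as vring_new_virtqueue(): chain everything together */
	for (int i = 0; i < QSZ - 1; i++)
		desc[i].next = i + 1;
	free_head = 0;
}

/* add a buffer made of n descriptors, return the chain head */
static int add_buf(int n)
{
	int head = free_head, i = free_head;

	for (int k = 0; k < n - 1; k++)	/* walk to the chain's last entry */
		i = desc[i].next;
	free_head = desc[i].next;	/* free list now starts after the chain */

	/* publish the head, then bump avail->idx (wmb omitted here) */
	avail_ring[avail_idx % QSZ] = head;
	avail_idx++;
	return head;
}

int main(void)
{
	init_ring();
	for (int n = 3; n >= 2; n--) {
		int head = add_buf(n);
		printf("head=%d avail_idx=%d free_head=%d\n",
		       head, avail_idx, free_head);
	}
	return 0;
}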

After start_xmit has added the out buffers, it calls virtqueue_kick to notify the host;

void virtqueue_kick(struct virtqueue *vq)
{
	if (virtqueue_kick_prepare(vq))
		virtqueue_notify(vq);
}

Inside virtqueue_kick, virtqueue_kick_prepare uses the return value of vring_need_event to decide whether the host actually needs to be notified. vring_need_event deserves a closer look:

static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
	/* Note: Xen has similar logic for notification hold-off
	 * in include/xen/interface/io/ring.h with req_event and req_prod
	 * corresponding to event_idx + 1 and new_idx respectively.
	 * Note also that req_event and req_prod in Xen start at 1,
	 * event indexes in virtio start at 0. */
	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}

Here is how event_idx is defined:

#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])

So where does this value get updated? Back in the vhost code: after vhost consumes ring buffers it calls vhost_update_avail_event, which executes __put_user(vq->avail_idx, vhost_avail_event(vq)), writing avail_idx into vhost_avail_event(vq). Here is the definition of vhost_avail_event(vq):

#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])

Notice that vhost_avail_event is exactly the same slot as vring_avail_event. As for avail_idx: vhost reads it with __get_user(vq->avail_idx, &vq->avail->idx) when it fetches available descriptors (vhost_get_vq_desc, e.g. on the handle_rx path); it is the index of the latest available buffer the Guest has published. Every time the Guest adds a buffer (virtqueue_add), it bumps vq->avail->idx. To summarize:

The Guest updates vq->avail->idx each time it adds a buffer; when the host fetches available buffers it reads that value and writes it back into vhost_avail_event. After adding a new buffer, the Guest checks whether the vhost_avail_event value the host wrote back has caught up with the Guest's latest index: if it has, the Guest notifies the host via virtqueue_notify; if not, the host is still busy catching up, so the notification is deferred until next time.
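
A small runnable demo of this decision, with vring_need_event copied verbatim: event_idx is the index the other side last saw, old is our index before the batch and new_idx the index after it, and the wrap-safe u16 arithmetic means a notification fires only when event_idx falls inside the window (old, new_idx]:

/* Demo of the event-idx suppression check, vring_need_event() verbatim
 * from include/uapi/linux/virtio_ring.h. */
#include <stdint.h>
#include <stdio.h>

static inline int vring_need_event(uint16_t event_idx, uint16_t new_idx,
				   uint16_t old)
{
	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
}

int main(void)
{
	/* Other side has seen up to index 10; we move avail->idx from
	 * old = 10 to new = 11. 10 is in (10, 11], so it wants a kick. */
	printf("%d\n", vring_need_event(10, 11, 10));		/* 1: notify */

	/* Other side is lagging: it last saw index 5 while we go 10 -> 11.
	 * 5 is not in (10, 11], so the notification is skipped. */
	printf("%d\n", vring_need_event(5, 11, 10));		/* 0: skip */

	/* Wraparound still works: old = 0xfffe, new = 0x0001, and
	 * event_idx = 0xffff lies inside the wrapped window. */
	printf("%d\n", vring_need_event(0xffff, 0x0001, 0xfffe)); /* 1 */
	return 0;
}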

3.2 Host Side

After the Guest calls virtqueue_notify, the MMIO write traps out to the host, and the vhost worker thread is woken through the eventfd mechanism (the wakeup path will be analyzed separately). When the vhost thread is woken for transmit, it runs handle_tx (drivers/vhost/net.c), which calls vhost_get_vq_desc to fetch the buffers the Guest filled in;
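
Before diving into vhost_get_vq_desc, a quick sketch of the kick mechanism itself: KVM's ioeventfd signals an eventfd on the Guest's queue-notify MMIO write, and the vhost worker sleeps on the other end of the same fd. A minimal userspace sketch of that wakeup, with two threads standing in for KVM and the vhost worker:

/* Userspace sketch of the eventfd wakeup behind a virtqueue kick:
 * one thread stands in for KVM's ioeventfd write, the other for the
 * vhost worker blocking on the same fd. Build with -lpthread. */
#include <sys/eventfd.h>
#include <pthread.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

static int kick_fd;

static void *vhost_worker(void *arg)
{
	uint64_t cnt;

	/* blocks until the "guest" kicks, like vhost polling vq->kick */
	if (read(kick_fd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("worker woken, %llu kick(s) coalesced\n",
		       (unsigned long long)cnt);
	return NULL;
}

int main(void)
{
	pthread_t t;
	uint64_t one = 1;

	kick_fd = eventfd(0, 0);
	pthread_create(&t, NULL, vhost_worker, NULL);

	/* what ioeventfd does on the guest's MMIO write to QUEUE_NOTIFY */
	write(kick_fd, &one, sizeof(one));

	pthread_join(t, NULL);
	close(kick_fd);
	return 0;
}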

int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
		      struct iovec iov[], unsigned int iov_size,
		      unsigned int *out_num, unsigned int *in_num,
		      struct vhost_log *log, unsigned int *log_num)
{
	struct vring_desc desc;
	unsigned int i, head, found = 0;
	u16 last_avail_idx;
	int ret;

	/* Check it isn't doing very strange things with descriptor numbers. */
	last_avail_idx = vq->last_avail_idx;
	/*
		Fetch the latest available index published by the Guest
	*/
	if (unlikely(__get_user(vq->avail_idx, &vq->avail->idx))) {
		vq_err(vq, "Failed to access avail idx at %p\n",
		       &vq->avail->idx);
		return -EFAULT;
	}

	if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
		vq_err(vq, "Guest moved used index from %u to %u",
		       last_avail_idx, vq->avail_idx);
		return -EFAULT;
	}

	/* If there's nothing new since last we looked, return invalid. */
    /*
    last_avail_idx is the next avail entry vhost will consume; it is
    incremented each time vhost uses a descriptor. If vq->avail_idx equals
    last_avail_idx, the Guest has not added any new buffers, i.e. there is
    currently nothing to transmit.
    */
	if (vq->avail_idx == last_avail_idx)
		return vq->num;

	/* Only get avail ring entries after they have been exposed by guest. */
	smp_rmb();

	/* Grab the next descriptor number they're advertising, and increment
	 * the index we've seen. */
	if (unlikely(__get_user(head,
				&vq->avail->ring[last_avail_idx % vq->num]))) {
		vq_err(vq, "Failed to read head: idx %d address %p\n",
		       last_avail_idx,
		       &vq->avail->ring[last_avail_idx % vq->num]);
		return -EFAULT;
	}

	/* If their number is silly, that's an error. */
	if (unlikely(head >= vq->num)) {
		vq_err(vq, "Guest says index %u > %u is available",
		       head, vq->num);
		return -EINVAL;
	}

	/* When we start there are none of either input nor output. */
	*out_num = *in_num = 0;
	if (unlikely(log))
		*log_num = 0;

	i = head;
	do {
		unsigned iov_count = *in_num + *out_num;
		if (unlikely(i >= vq->num)) {
			vq_err(vq, "Desc index is %u > %u, head = %u",
			       i, vq->num, head);
			return -EINVAL;
		}
		if (unlikely(++found > vq->num)) {
			vq_err(vq, "Loop detected: last one at %u "
			       "vq size %u head %u\n",
			       i, vq->num, head);
			return -EINVAL;
		}
		ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
		if (unlikely(ret)) {
			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
			       i, vq->desc + i);
			return -EFAULT;
		}
		if (desc.flags & VRING_DESC_F_INDIRECT) {
			ret = get_indirect(dev, vq, iov, iov_size,
					   out_num, in_num,
					   log, log_num, &desc);
			if (unlikely(ret < 0)) {
				vq_err(vq, "Failure detected "
				       "in indirect descriptor at idx %d\n", i);
				return ret;
			}
			continue;
		}

		ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
				     iov_size - iov_count);
		if (unlikely(ret < 0)) {
			vq_err(vq, "Translation failure %d descriptor idx %d\n",
			       ret, i);
			return ret;
		}
		if (desc.flags & VRING_DESC_F_WRITE) {
			/* If this is an input descriptor,
			 * increment that count. */
			*in_num += ret;
			if (unlikely(log)) {
				log[*log_num].addr = desc.addr;
				log[*log_num].len = desc.len;
				++*log_num;
			}
		} else {
			/* If it's an output descriptor, they're all supposed
			 * to come before any input descriptors. */
			if (unlikely(*in_num)) {
				vq_err(vq, "Descriptor has out after in: "
				       "idx %d\n", i);
				return -EINVAL;
			}
			*out_num += ret;
		}
	} while ((i = next_desc(&desc)) != -1);

	/* On success, increment avail index. */
    /*
    After successfully fetching one buffer chain, advance last_avail_idx
    */
	vq->last_avail_idx++;

	/* Assume notifications from guest are disabled at this point,
	 * if they aren't we would need to update avail_event index. */
	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
	return head;
}

Once vhost has the available buffers, it sends the packet via sock->ops->sendmsg, then calls vhost_add_used to hand the consumed buffer information back to the Guest;

int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
	struct vring_used_elem __user *used;

	/* The virtqueue contains a ring of used buffers.  Get a pointer to the
	 * next entry in that used ring. */
	used = &vq->used->ring[vq->last_used_idx % vq->num];
	if (__put_user(head, &used->id)) {
		vq_err(vq, "Failed to write used id");
		return -EFAULT;
	}
	if (__put_user(len, &used->len)) {
		vq_err(vq, "Failed to write used len");
		return -EFAULT;
	}
	/* Make sure buffer is written before we update index. */
	smp_wmb();
    /*
    Write last_used_idx + 1 into vq->used->idx
    */
	if (__put_user(vq->last_used_idx + 1, &vq->used->idx)) {
		vq_err(vq, "Failed to increment used idx");
		return -EFAULT;
	}
	if (unlikely(vq->log_used)) {
		/* Make sure data is seen before log. */
		smp_wmb();
		/* Log used ring entry write. */
		log_write(vq->log_base,
			  vq->log_addr +
			   ((void __user *)used - (void __user *)vq->used),
			  sizeof *used);
		/* Log used index update. */
		log_write(vq->log_base,
			  vq->log_addr + offsetof(struct vring_used, idx),
			  sizeof vq->used->idx);
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx, 1);
	}
    /*
    Advance vhost's used index
    */
	vq->last_used_idx++;
	/* If the driver never bothers to signal in a very long while,
	 * used index might wrap around. If that happens, invalidate
	 * signalled_used index we stored. TODO: make sure driver
	 * signals at least once in 2^16 and remove this. */
	if (unlikely(vq->last_used_idx == vq->signalled_used))
		vq->signalled_used_valid = false;
	return 0;
}

In vhost_add_used, __put_user(vq->last_used_idx + 1, &vq->used->idx) publishes vhost's current used index to vq->used->idx. This tells the Guest how far vhost has consumed the ring, so that when the Guest reclaims buffers, or picks up packets vhost forwarded to it, it knows where to start reading.
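
Note that both sides keep free-running 16-bit indices and reduce them modulo the ring size only when actually touching ring memory, so comparisons like last_used_idx == used->idx stay correct across wraparound. A small userspace sketch of the used-ring handoff (hypothetical simplified types, memory barriers omitted):

/* Sketch of the used-ring handoff: "vhost" publishes used->idx, the guest
 * consumes entries until its own last_used_idx catches up. */
#include <stdint.h>
#include <stdio.h>

#define QSZ 8

struct used_elem { uint32_t id; uint32_t len; };

static struct used_elem used_ring[QSZ];
static uint16_t used_idx;	/* written by vhost  (vq->used->idx)      */
static uint16_t last_used_idx;	/* guest-side cursor (vq->last_used_idx)  */

/* vhost side: what vhost_add_used() does, minus the __put_user plumbing */
static void host_add_used(uint32_t head, uint32_t len)
{
	used_ring[used_idx % QSZ] = (struct used_elem){ head, len };
	used_idx++;		/* smp_wmb() would precede this publish */
}

/* guest side: the more_used() check plus one consume step */
static int guest_get_buf(void)
{
	if (last_used_idx == used_idx)	/* more_used() == false */
		return -1;
	struct used_elem *e = &used_ring[last_used_idx % QSZ];
	printf("reclaim head %u, len %u\n", e->id, e->len);
	last_used_idx++;
	return (int)e->id;
}

int main(void)
{
	/* start near the u16 limit to show the indices wrap harmlessly */
	used_idx = last_used_idx = 0xfffe;

	host_add_used(3, 1500);
	host_add_used(5, 60);
	while (guest_get_buf() >= 0)
		;
	printf("idx now %u / %u\n", (unsigned)used_idx, (unsigned)last_used_idx);
	return 0;
}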

4. Receive Path

4.1 Host Side

When vhost receives a packet coming up from the NIC, handle_rx runs. It first fetches the available descriptors via get_rx_bufs, then receives the packet through sock->ops->recvmsg; once the packet is in, it calls vhost_add_used_and_signal_n to record the consumed descriptor ids;

int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
		     unsigned count)
{
	int start, n, r;

	start = vq->last_used_idx % vq->num;
	n = vq->num - start;
	if (n < count) {
		r = __vhost_add_used_n(vq, heads, n);
		if (r < 0)
			return r;
		heads += n;
		count -= n;
	}
	r = __vhost_add_used_n(vq, heads, count);

	/* Make sure buffer is written before we update index. */
	smp_wmb();
    /*
    Publish the current used index to vq->used->idx, for the Guest to use
    when receiving packets
    */
	if (put_user(vq->last_used_idx, &vq->used->idx)) {
		vq_err(vq, "Failed to increment used idx");
		return -EFAULT;
	}
	if (unlikely(vq->log_used)) {
		/* Log used index update. */
		log_write(vq->log_base,
			  vq->log_addr + offsetof(struct vring_used, idx),
			  sizeof vq->used->idx);
		if (vq->log_ctx)
			eventfd_signal(vq->log_ctx, 1);
	}
	return r;
}

__vhost_add_used_n advances last_used_idx (new = (vq->last_used_idx += count)); the updated value is then published to the Guest with put_user(vq->last_used_idx, &vq->used->idx). After updating the index, vhost calls vhost_signal to tell the Guest to receive. vhost_signal first decides whether a notification is needed at all, using the same idea as the Guest-to-Host direction: it calls vring_need_event, where event_idx is the index of the last descriptor the Guest has reclaimed. If the Guest is reclaiming descriptors slowly, it still has plenty of received packets pending, so vhost skips the notification for now.

static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
	/* Note: Xen has similar logic for notification hold-off
	 * in include/xen/interface/io/ring.h with req_event and req_prod
	 * corresponding to event_idx + 1 and new_idx respectively.
	 * Note also that req_event and req_prod in Xen start at 1,
	 * event indexes in virtio start at 0. */
	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}

4.2 Guest Side

When vhost decides the Guest does need to be notified, it injects a virtual interrupt into the Guest via irqfd (the detailed flow will be analyzed separately). On receiving the interrupt, the Guest enters the interrupt handler skb_recv_done, which raises the NET_RX_SOFTIRQ softirq, and packet reception continues in virtnet_poll (see init_vqs in drivers/net/virtio_net.c);

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	void *buf;
	unsigned int r, len, received = 0;

again:
	while (received < budget &&
	       /* Fetch a buffer that the Host has filled in */
	       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
		receive_buf(rq, buf, len);
		--rq->num;
		received++;
	}

	if (rq->num < rq->max / 2) {
		/* Reclaim descriptors: refill the receive ring */
		if (!try_fill_recv(rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	/* Out of packets? */
	if (received < budget) {
		r = virtqueue_enable_cb_prepare(rq->vq);
		napi_complete(napi);
		if (unlikely(virtqueue_poll(rq->vq, r)) &&
		    napi_schedule_prep(napi)) {
			virtqueue_disable_cb(rq->vq);
			__napi_schedule(napi);
			goto again;
		}
	}

	return received;
}

virtqueue_get_buf retrieves the packet data the Host filled in; the packet is then handed to the protocol stack;

void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *ret;
	unsigned int i;
	u16 last_used;

	START_USE(vq);

	if (unlikely(vq->broken)) {
		END_USE(vq);
		return NULL;
	}

	/*
	more_used() compares vq->last_used_idx against vq->vring.used->idx.
	vq->last_used_idx is the last used entry the Guest has already
	processed; vq->vring.used->idx is written by vhost and marks how far
	vhost has consumed descriptors. If the two are equal, the Guest has
	no packets left to process.
	*/
	if (!more_used(vq)) {
		pr_debug("No more buffers in queue\n");
		END_USE(vq);
		return NULL;
	}

	/* Only get used array entries after they have been exposed by host. */
	virtio_rmb(vq->weak_barriers);

	last_used = (vq->last_used_idx & (vq->vring.num - 1));
	i = vq->vring.used->ring[last_used].id;
	*len = vq->vring.used->ring[last_used].len;

	if (unlikely(i >= vq->vring.num)) {
		BAD_RING(vq, "id %u out of range\n", i);
		return NULL;
	}
	if (unlikely(!vq->data[i])) {
		BAD_RING(vq, "id %u is not a head!\n", i);
		return NULL;
	}

	/* detach_buf clears data, so grab it now. */
	/* vq->data[i] holds the actual packet buffer */
	ret = vq->data[i];
	detach_buf(vq, i);
	/* Each processed packet advances last_used_idx by one */
	vq->last_used_idx++;
	/* If we expect an interrupt for the next entry, tell host
	 * by writing event index and flush out the write before
	 * the read in the next get_buf call. */
	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
		vring_used_event(&vq->vring) = vq->last_used_idx;
		virtio_mb(vq->weak_barriers);
	}

#ifdef DEBUG
	vq->last_add_time_valid = false;
#endif

	END_USE(vq);
	return ret;
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf);

 
