/**
 * Lock-free ring buffer (DPDK). Producer and consumer bookkeeping live in
 * two separate sub-structs; each can be cache-line aligned so that
 * producers and consumers do not false-share a cache line.
 */
struct rte_ring {
TAILQ_ENTRY(rte_ring) next; /**< Next in list. */
char name[RTE_RING_NAMESIZE]; /**< Name of the ring. */
int flags; /**< Flags supplied at creation. */
/** Ring producer status (own cache line to avoid false sharing). */
struct prod {
uint32_t watermark; /**< Maximum items before EDQUOT. */
uint32_t sp_enqueue; /**< True, if single producer. */
uint32_t size; /**< Size of ring. */
uint32_t mask; /**< Mask (size-1) of ring; size must be a power of two. */
volatile uint32_t head; /**< Producer head: next slot to reserve. */
volatile uint32_t tail; /**< Producer tail: last slot published to consumers. */
} prod __rte_cache_aligned;
/** Ring consumer status. */
struct cons {
uint32_t sc_dequeue; /**< True, if single consumer. */
uint32_t size; /**< Size of the ring. */
uint32_t mask; /**< Mask (size-1) of ring. */
volatile uint32_t head; /**< Consumer head: next slot to reserve. */
volatile uint32_t tail; /**< Consumer tail: last slot released to producers. */
#ifdef RTE_RING_SPLIT_PROD_CONS
/* Optionally place cons on its own cache line as well. */
} cons __rte_cache_aligned;
#else
} cons;
#endif
#ifdef RTE_LIBRTE_RING_DEBUG
/* Per-lcore enqueue/dequeue statistics, debug builds only. */
struct rte_ring_debug_stats stats[RTE_MAX_LCORE];
#endif
/* Flexible storage for the object pointers, aligned to a cache line. */
void * ring[0] __rte_cache_aligned; /**< Memory space of ring starts here.
* not volatile so need to be careful
* about compiler re-ordering */
};
DPDK 实现的 multi-enqueue 和 multi-dequeue 相比常规的 ring,性能具体优越在哪里呢?
常规操作:基本上需要对整个 queue 加锁,或者对 queue 的 head 和 tail 分别加锁。
DPDK 的操作:
/**
 * Multi-producer enqueue of @n object pointers onto ring @r (lock-free).
 *
 * Two-phase protocol: (1) reserve slots by atomically advancing prod.head
 * with a CAS retry loop; (2) copy the pointers, then publish by advancing
 * prod.tail — but only after every earlier producer has published, so
 * consumers always see a contiguous prefix of completed writes.
 *
 * behavior == RTE_RING_QUEUE_FIXED: enqueue all n or fail with -ENOBUFS.
 * Otherwise (variable behavior): enqueue as many as fit, return that count.
 * Returns 0 / n on success, or a value with RTE_RING_QUOT_EXCEED set
 * (-EDQUOT for fixed behavior) when the watermark is exceeded.
 */
static inline int __attribute__((always_inline))
__rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,
unsigned n, enum rte_ring_queue_behavior behavior)
{
uint32_t prod_head, prod_next;
uint32_t cons_tail, free_entries;
const unsigned max = n;
int success;
unsigned i;
uint32_t mask = r->prod.mask;
int ret;
/* Phase 1: reserve [prod_head, prod_next) by moving prod.head atomically.
 * On CAS failure another producer won the race; recompute and retry. */
do {
/* Reset n to the initial burst count */
n = max;
prod_head = r->prod.head;
cons_tail = r->cons.tail;
/* The subtraction is done between two unsigned 32bits value
 * (the result is always modulo 32 bits even if we have
 * prod_head > cons_tail). So 'free_entries' is always between 0
 * and size(ring)-1. */
free_entries = (mask + cons_tail - prod_head);
/* check that we have enough room in ring */
if (unlikely(n > free_entries)) {
if (behavior == RTE_RING_QUEUE_FIXED) {
__RING_STAT_ADD(r, enq_fail, n);
return -ENOBUFS;
}
else {
/* No free entry available */
if (unlikely(free_entries == 0)) {
__RING_STAT_ADD(r, enq_fail, n);
return 0;
}
/* Variable behavior: shrink the burst to what fits. */
n = free_entries;
}
}
prod_next = prod_head + n;
/* Claim the slots; success only if no other producer moved head. */
success = rte_atomic32_cmpset(&r->prod.head, prod_head,
prod_next);
} while (unlikely(success == 0));
/* Phase 2: write entries in ring. ENQUEUE_PTRS presumably copies the n
 * pointers from obj_table into r->ring[] — macro body not shown here. */
ENQUEUE_PTRS();
/* Keep the compiler from reordering the ring[] stores past the
 * prod.tail publication below (ring[] is not volatile). */
rte_compiler_barrier();
/* if we exceed the watermark */
if (unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
ret = (behavior == RTE_RING_QUEUE_FIXED) ? -EDQUOT :
(int)(n | RTE_RING_QUOT_EXCEED);
__RING_STAT_ADD(r, enq_quota, n);
}
else {
ret = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : n;
__RING_STAT_ADD(r, enq_success, n);
}
/*
 * If there are other enqueues in progress that preceded us,
 * we need to wait for them to complete, so tails advance in
 * reservation order and consumers never read unwritten slots.
 */
while (unlikely(r->prod.tail != prod_head))
rte_pause();
/* Publish: consumers may now dequeue up to prod_next. */
r->prod.tail = prod_next;
return ret;
}
ring 同时维护 prod.head/prod.tail 和 cons.head/cons.tail。enqueue 时比较 prod.head 和 cons.tail,来确认 ring 是否已满。
多个线程同时 enqueue 时,通过 compare-and-set(CAS)保证同步:第一个 while 循环结束后,各线程拿到的 prod_head 和 prod_next 区间互不重叠,接下来就可以真正把数据写入 ring 的数组中。
而第二个循环:
while (unlikely(r->prod.tail != prod_head))
rte_pause();
则保证数据已经确实写入数组:只有当之前所有 enqueue 都完成、r->prod.tail 追上本线程的 prod_head 时,才把 tail 推进到 prod_next。dequeue 时比较的是 cons.head 和 prod.tail,而不能比较 prod.head(因为 head 只表示抢到了数组中的位置,数据未必已真正写入)——这也是 DPDK ring 同时维护 prod.tail 和 prod.head 两个变量的原因。
对比 DPDK ring 和常规 multi ring 的区别:
锁的粒度更小了:只对 head、tail 这些索引值做同步,数据真正写入 ring 的过程并没有锁保护。
所以,可以照此改造常规的 ring 实现:同样维护 prod.head 和 prod.tail,同步时仅对 head 做原子更新,各线程先获得自己的 prod_head 和 prod_next,再真正入队,最后按序更新 prod.tail——当然这就和 DPDK ring 一样了。简单一点的做法是对 head 的更新用锁,这样锁的临界区也小很多。