在移植speculative page fault代码时发现开发者巧妙地使用seqcount的特性来快速判断和标识vma区的变化,由于vma的修改已由mmap_sem保证互斥,所以开发者只使用了seqcount而没有使用seqlock。
将vma作为保护区,在开始结束位置调用begin和end。通过seqcount的值来快速判断vma区是否改变,只要seqcount值改变了就说明vma区改变了。
对vma区进行保护的mmap_sem是rw_semaphore,该信号量适用于读多写少情况。
借熟悉移植代码的机会,熟悉了一下seqlock和rw_semaphore的代码。
spin_lock无法区分对临界区的读操作和写操作。对于读多写少的区域使用spin_lock显然不合算。
对于读多写少的临界区进行保护时,使用顺序锁(写优先,写阻塞读)是个很好的方法。
针对读多写少的情况,读写信号量和顺序锁都能有很好的性能。两个都是读并发;顺序锁实现了写优先,写阻塞读;而读写信号量则是读能阻塞写,写也能阻塞读,在读写同时发生时写优先。由此可知在特别强调写优先时使用顺序锁。
1.顺序锁
实现思路,写写互斥,读读互不影响。写进入临界区获取spin_lock,seqcount++;操作临界区;seqcount++,退出临界区。读进入临界区前先保存seqcount值,判断seqcount是不是偶数,偶数说明没有写在临界区,非偶数就循环等待seqcount变成偶数,读取临界区,判断此时seqcount值是否和开始值一致,一致说明此次读取值有效,如果不一致,就再读一次。
具体实现
1.1 定义
在include/linux/seqlock.h中定义了seqlock
405 typedef struct {
406 struct seqcount seqcount;
407 spinlock_t lock; //为了保证写写互斥,增加spin_lock
408 } seqlock_t;
seqcount定义如下:
48 typedef struct seqcount {
49 unsigned sequence; //整型计数
50 #ifdef CONFIG_DEBUG_LOCK_ALLOC
51 struct lockdep_map dep_map; //死锁检测
52 #endif
53 } seqcount_t;
1.2 初始化
420 #define seqlock_init(x) \
421 do { \
422 seqcount_init(&(x)->seqcount); \ //初始化计数
423 spin_lock_init(&(x)->lock); \ //初始化写写互斥的spin_lock
424 } while (0)
55 static inline void __seqcount_init(seqcount_t *s, const char *name,
56 struct lock_class_key *key)
57 {
58 /*
59 * Make sure we are not reinitializing a held lock:
60 */
61 lockdep_init_map(&s->dep_map, name, key, 0); //初始化死锁检测内容
62 s->sequence = 0; //初始化计数为0
63 }
69 # define seqcount_init(s) \
70 do { \
71 static struct lock_class_key __key; \
72 __seqcount_init((s), #s, &__key); \
73 } while (0)
如没定义CONFIG_DEBUG_LOCK_ALLOC,# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
1.3 写操作
write_seqlock / write_sequnlock //获取锁 / 释放锁
write_seqlock_irq / write_sequnlock_irq //获取锁的同时disable中断 / 释放锁,enable中断
write_seqlock_bh / write_sequnlock_bh //获取锁同时disable中断下半部 / 释放锁,enable中断下半部
write_seqlock_irqsave / write_sequnlock_irqrestore //保存中断标志位并disable中断,然后获取锁 / 释放锁,然后恢复中断标志(中断状态恢复为加锁前的状态)
以write_seqlock为例:
写获取锁
447 static inline void write_seqlock(seqlock_t *sl)
448 {
449 spin_lock(&sl->lock); //获取spin_lock
450 write_seqcount_begin(&sl->seqcount); //计数加1
451 }
381 static inline void write_seqcount_begin(seqcount_t *s)
382 {
383 write_seqcount_begin_nested(s, 0);
384 }
375 static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
376 {
377 raw_write_seqcount_begin(s);
378 seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); //死锁检测
379 }
226 static inline void raw_write_seqcount_begin(seqcount_t *s)
227 {
228 s->sequence++; //计数加1
229 smp_wmb(); //添加写屏障。保证临界区的操作和计数加1操作不会被smp乱序执行。保证是seqcount计数增加后才操作临界区。
230 }
写释放锁
453 static inline void write_sequnlock(seqlock_t *sl)
454 {
455 write_seqcount_end(&sl->seqcount); //先计数加1
456 spin_unlock(&sl->lock); //然后才释放spin_lock
457 }
386 static inline void write_seqcount_end(seqcount_t *s)
387 {
388 seqcount_release(&s->dep_map, 1, _RET_IP_);
389 raw_write_seqcount_end(s);
390 }
232 static inline void raw_write_seqcount_end(seqcount_t *s)
233 {
234 smp_wmb(); //添加写屏障,保证临界区的操作和计数加1操作不会被smp乱序执行。保证临界区操作完成后seqcount计数才增加。
235 s->sequence++; //计数加1
236 }
1.4 读操作
read_seqbegin / read_seqretry //获取seqcount,当没有写操作时返回seqcount值 / 判断seqcount值是否变化.如果变化则读操作失败,重读
432 static inline unsigned read_seqbegin(const seqlock_t *sl)
433 {
434 return read_seqcount_begin(&sl->seqcount); //读取seqcount值
435 }
162 static inline unsigned read_seqcount_begin(const seqcount_t *s)
163 {
164 seqcount_lockdep_reader_access(s);
165 return raw_read_seqcount_begin(s);
166 }
146 static inline unsigned raw_read_seqcount_begin(const seqcount_t *s)
147 {
148 unsigned ret = __read_seqcount_begin(s);
149 smp_rmb();
150 return ret;
151 }
108 static inline unsigned __read_seqcount_begin(const seqcount_t *s)
109 {
110 unsigned ret;
111
112 repeat:
113 ret = READ_ONCE(s->sequence); //最终的读操作
114 if (unlikely(ret & 1)) { //如果seqcount值为奇数说明有写操作正在临界区,就执行cpu_relax()忙等待(并不让出cpu),然后重读seqcount。达到写阻塞读的效果
115 cpu_relax();
116 goto repeat;
117 }
118 return ret;
119 }
seqcount值判断
437 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
438 {
439 return read_seqcount_retry(&sl->seqcount, start);
440 }
218 static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
219 {
220 smp_rmb();
221 return __read_seqcount_retry(s, start);
222 }
203 static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
204 {
205 return unlikely(s->sequence != start); //如果seqcount值和开始值不一致则返回1,即需要重做返回1,不需要重做返回0
206 }
一般read_seqretry和read_seqbegin配套使用。在读取临界区前调用read_seqbegin获取seqcount值,在结束位置用read_seqretry进行判断,如果read_seqretry返回值为1则说明在读期间临界区有变化需要重新读。
一般操作如下:
do {
seqcount = read_seqbegin(&seq_lock); //进入临界区前先获取seqcount值
do_something();
} while (read_seqretry(&seq_lock, seqcount)); //如果为1,则需要重新读
2 读写信号量
实现思路,写写互斥,读写互斥,读读并发。原子操作计数count表示是读还是写,-1表示1个写操作,0表示没有读写操作,大于0表示读操作的个数。通过对count计数和wait_list的判断来实现,读并发,写互斥。
2.1 定义
在文件include/linux/rwsem.h定义如下:
30 struct rw_semaphore {
31 atomic_long_t count; //原子操作计数
32 struct list_head wait_list; //等待列表
33 raw_spinlock_t wait_lock; //对列表操作时加锁
34 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
35 struct optimistic_spin_queue osq; /* spinner MCS lock */
36 /*
37 * Write owner. Used as a speculative check to see
38 * if the owner is running on the cpu.
39 */
40 struct task_struct *owner; //锁的持有者
41 #endif
42 #ifdef CONFIG_DEBUG_LOCK_ALLOC
43 struct lockdep_map dep_map; //死锁检测
44 #endif
45 };
2.2 初始化
99 #define init_rwsem(sem) \
100 do { \
101 static struct lock_class_key __key; \
102 \
103 __init_rwsem((sem), #sem, &__key); \
104 } while (0)
41 void __init_rwsem(struct rw_semaphore *sem, const char *name,
42 struct lock_class_key *key)
43 {
44 #ifdef CONFIG_DEBUG_LOCK_ALLOC
45 /*
46 * Make sure we are not reinitializing a held semaphore:
47 */
48 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
49 lockdep_init_map(&sem->dep_map, name, key, 0);
50 #endif
51 sem->count = 0; //计数初始值为0,表示没有读写操作
52 raw_spin_lock_init(&sem->wait_lock); //初始化spin_lock,保护count计数和wait_list
53 INIT_LIST_HEAD(&sem->wait_list); //初始化列表
54 }
55 EXPORT_SYMBOL(__init_rwsem);
2.3 读操作
读加锁
21 void __sched down_read(struct rw_semaphore *sem)
22 {
23 might_sleep();
24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
25
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); //进行读加锁,先进行try,try失败了才调用__down_read进行真实的读加锁操作
27 rwsem_set_reader_owned(sem);
28 }
609 #define LOCK_CONTENDED(_lock, try, lock) \
610 do { \
611 if (!try(_lock)) { \
612 lock_contended(&(_lock)->dep_map, _RET_IP_); \
613 lock(_lock); \ //真实读操作
614 } \
615 lock_acquired(&(_lock)->dep_map, _RET_IP_); \
616 } while (0)
178 void __sched __down_read(struct rw_semaphore *sem)
179 {
180 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
181 }
130 int __sched __down_read_common(struct rw_semaphore *sem, int state)
131 {
132 struct rwsem_waiter waiter;
133 unsigned long flags;
134
135 raw_spin_lock_irqsave(&sem->wait_lock, flags); //获取sem中的spin_lock,保证count和waitlist值使用时无其他人修改
136
137 if (sem->count >= 0 && list_empty(&sem->wait_list)) { //判断计数,如无写操作(count值不为-1),且wait_list为空,则读操作可以进行
138 /* granted */
139 sem->count++; //count加1,表示读操作多了1个
140 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); //spin_lock解锁
141 goto out;
142 }
143
144 /* set up my own style of waitqueue */
145 waiter.task = current;
146 waiter.type = RWSEM_WAITING_FOR_READ; //表示此操作是读,一个读操作加入到了等待队列
147 get_task_struct(current);
148
149 list_add_tail(&waiter.list, &sem->wait_list); //加入队列
150
151 /* wait to be given the lock */
152 for (;;) {
153 if (!waiter.task) //task为null时跳出循环,在写解锁时wake up读操作时会设置task为null
154 break;
155 if (signal_pending_state(state, current))
156 goto out_nolock;
157 set_current_state(state); //设置task状态为TASK_UNINTERRUPTIBLE
158 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); //spin_lock解锁
159 schedule(); //切换出去,等待唤醒
160 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 }
162
163 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); //spin_lock解锁
164 out:
165 return 0;
166
167 out_nolock:
168 /*
169 * We didn't take the lock, so that there is a writer, which
170 * is owner or the first waiter of the sem. If it's a waiter,
171 * it will be woken by current owner. Not need to wake anybody.
172 */
173 list_del(&waiter.list);
174 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
175 return -EINTR;
176 }
读解锁
101 void up_read(struct rw_semaphore *sem)
102 {
103 rwsem_release(&sem->dep_map, 1, _RET_IP_);
104
105 __up_read(sem);
106 }
107
108 EXPORT_SYMBOL(up_read);
295 void __up_read(struct rw_semaphore *sem)
296 {
297 unsigned long flags;
298
299 raw_spin_lock_irqsave(&sem->wait_lock, flags);
300
301 if (--sem->count == 0 && !list_empty(&sem->wait_list)) //count减1,判断是否为0,不为0说明还有读操作在,不唤醒写;没有读操作,且wait_list不为空,则唤醒wait_list中的第一个(此时第一个肯定是写,读可以并发,在读操作过程中能加入到wait_list第一个肯定是写)。
302 sem = __rwsem_wake_one_writer(sem); //唤醒写操作,唤醒写操作后写操作会将count值改为-1,标识有写操作
303
304 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
305 }
116 static inline struct rw_semaphore *
117 __rwsem_wake_one_writer(struct rw_semaphore *sem)
118 {
119 struct rwsem_waiter *waiter;
120
121 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); //获取第一个task
122 wake_up_process(waiter->task); //wake up task
123
124 return sem;
125 }
2.4 写操作
写加锁
51 void __sched down_write(struct rw_semaphore *sem)
52 {
53 might_sleep();
54 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
55
56 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
57 rwsem_set_owner(sem);
58 }
59
60 EXPORT_SYMBOL(down_write);
261 void __sched __down_write(struct rw_semaphore *sem)
262 {
263 __down_write_common(sem, TASK_UNINTERRUPTIBLE);
264 }
213 int __sched __down_write_common(struct rw_semaphore *sem, int state)
214 {
215 struct rwsem_waiter waiter;
216 unsigned long flags;
217 int ret = 0;
218
219 raw_spin_lock_irqsave(&sem->wait_lock, flags);
220
221 /* set up my own style of waitqueue */
222 waiter.task = current;
223 waiter.type = RWSEM_WAITING_FOR_WRITE; //标识自己是写操作的等待
224 list_add_tail(&waiter.list, &sem->wait_list); //写操作先将自己加入到wait_list中,阻塞后面的读操作,先加入list再判断可以更早的阻塞读
225
226 /* wait for someone to release the lock */
227 for (;;) {
228 /*
229 * That is the key to support write lock stealing: allows the
230 * task already on CPU to get the lock soon rather than put
231 * itself into sleep and waiting for system woke it or someone
232 * else in the head of the wait list up.
233 */
234 if (sem->count == 0) //等待所有读操作结束
235 break;
236 if (signal_pending_state(state, current))
237 goto out_nolock;
238
239 set_current_state(state);
240 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
241 schedule(); //切换出去,等待唤醒
242 raw_spin_lock_irqsave(&sem->wait_lock, flags);
243 }
244 /* got the lock */
245 sem->count = -1; //设置count值为-1,表示有写操作
246 list_del(&waiter.list); //将自己从wait_list中删除
247
248 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
249
250 return ret;
251
252 out_nolock:
253 list_del(&waiter.list);
254 if (!list_empty(&sem->wait_list) && sem->count >= 0)
255 __rwsem_do_wake(sem, 0);
256 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
257
258 return -EINTR;
259 }
写解锁
113 void up_write(struct rw_semaphore *sem)
114 {
115 rwsem_release(&sem->dep_map, 1, _RET_IP_);
116
117 rwsem_clear_owner(sem);
118 __up_write(sem);
119 }
310 void __up_write(struct rw_semaphore *sem)
311 {
312 unsigned long flags;
313
314 raw_spin_lock_irqsave(&sem->wait_lock, flags);
315
316 sem->count = 0; //设置count值为0
317 if (!list_empty(&sem->wait_list)) //在写操作时,读操作和写操作都可能加入到wait_list,唤醒wait_list中等待的task
318 sem = __rwsem_do_wake(sem, 1);
319
320 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); //在wake过程中需要修改count所以需要加spin_lock
321 }
66 static inline struct rw_semaphore *
67 __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
68 {
69 struct rwsem_waiter *waiter;
70 struct task_struct *tsk;
71 int woken;
72
73 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
74
75 if (waiter->type == RWSEM_WAITING_FOR_WRITE) { //唤醒写操作的等待任务,写互斥所以只能唤醒一个,然后就结束操作
76 if (wakewrite)
77 /* Wake up a writer. Note that we do not grant it the
78 * lock - it will have to acquire it when it runs. */
79 wake_up_process(waiter->task); //唤醒写操作,在唤醒后,写操作会修改count值为-1,标识写操作正在进行
80 goto out;
81 }
82
83 /* grant an infinite number of read locks to the front of the queue */
84 woken = 0; //设置读操作计数为0
85 do {
86 struct list_head *next = waiter->list.next;
87
88 list_del(&waiter->list);
89 tsk = waiter->task;
90 /*
91 * Make sure we do not wakeup the next reader before
92 * setting the nil condition to grant the next reader;
93 * otherwise we could miss the wakeup on the other
94 * side and end up sleeping again. See the pairing
95 * in rwsem_down_read_failed().
96 */
97 smp_mb();
98 waiter->task = NULL; //设置task为null,让读操作跳出for循环
99 wake_up_process(tsk); //唤醒读操作
100 put_task_struct(tsk);
101 woken++; //唤醒的读操作计数加1
102 if (next == &sem->wait_list) //wait_list为空跳出循环
103 break;
104 waiter = list_entry(next, struct rwsem_waiter, list);
105 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); //一次唤醒wait_list上所有连续的读操作,直到又遇到写等待
106
107 sem->count += woken; //修改count值,表示一共唤醒了多少个读操作,在上一层加上了spin_lock,这里修改是安全的。
108
109 out:
110 return sem;
111 }
2.5 rw_semaphore使用例子
1.一个A读操作,获取到rw_semaphore,进入临界区,count++,count=1.
2.一个B写操作,由于A读操作存在导致获取rw_semaphore失败,写操作task加入到wait_list,count=1值不变.
3.C、D两个读操作,由于wait_list不为空(B写操作在wait_list),C、D读操作task加入到wait_list,count=1值不变。
4.A读操作解锁,count--,发现wait_list不为空,唤醒写操作B,count=0。
5.B写操作被唤醒,发现count==0,跳出for循环,设置count=-1,标识有写操作在临界区,然后将自己从wait_list中删除,此时C读操作成为wait_list中第一个task,count=-1。
6.B写操作解锁,count=0,发现wait_list不为空,唤醒wait_list中task,count=0。
7.在wait_list中唤醒第一个task(即C读操作),发现下一个task D也是读操作,继续唤醒D读操作,wait_list为空,跳出唤醒task循环,设置count值为2,count=2(表示有2个读操作).
8.C读操作解锁,count--,count=2-1=1,count不为0,直接结束,count=1;
9.D读操作解锁,count--,count=1-1=0,count为0,wait_list为空,直接结束,count=0(恢复到最开始状态);
代码理解上如有错误,欢迎各个大牛指正。