linux内核分析--异步io(三)

 用户建立了异步io环境,并且提交了异步io请求,该做的都做了,剩下的就是结果了,人生漂泊,有因无果,结果真的重要吗?务实一点说,重要,真正不在乎结果的人又有几个呢?人犹如此,内核就更不用说了,我拿到钱大把大把地花,等到请客吃饭时,囊中羞涩,这也是一种务实--肥水不流外人田;我有了时间,大把大把浪费,等到考试或考核时,总在呐喊:再多一秒吧!linux内核是这样的吗?很抱歉,不是!我觉得它是世界上最吝啬的了,不花一分冤枉钱,啥时候windows也能这么吝啬就好了,我们就不用攒几个月的工资来买一块显卡仅仅为了玩一个几个月后就过时的游戏了,君不见Linux跑在奔二128m内存机器上,天马行空,windows在酷睿1g内存机器依然蠕动,钞票啊!好了,不扯犊子了,第三场,开始!
  为了得到结果,用户应该调用io_getevents库函数,该函数进行sys_io_getevents系统调用,实际上不用分析代码也能猜个八九不离十,但是分析该系统调用前,必须先看一眼aio_complete函数,毕竟,得有人把数据放到一个地方你才可以读,而aio_complete函数就是做这个的:
933 int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
934 {
935 struct kioctx *ctx = iocb->ki_ctx;
936 struct aio_ring_info *info;
937 struct aio_ring *ring;
938 struct io_event *event;
939 unsigned long flags;
940 unsigned long tail;
941 int ret;
942
943 /*
944 * Special case handling for sync iocbs:
945 * - events go directly into the iocb for fast handling
946 * - the sync task with the iocb in its stack holds the single iocb
947 * ref, no other paths have a way to get another ref
948 * - the sync task helpfully left a reference to itself in the iocb
949 */
950 if (is_sync_kiocb(iocb)) {
951 BUG_ON(iocb->ki_users != 1);
952 iocb->ki_user_data = res;
953 iocb->ki_users = 0;
954 wake_up_process(iocb->ki_obj.tsk);
955 return 1;
956 }
957
958 info = &ctx->ring_info;//还记得吗?在setup_ring中事情
959
960 /* add a completion event to the ring buffer.
961 * must be done holding ctx->ctx_lock to prevent
962 * other code from messing with the tail
963 * pointer since we might be called from irq
964 * context.
965 */
966 spin_lock_irqsave(&ctx->ctx_lock, flags);
967
968 if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
969 list_del_init(&iocb->ki_run_list);
970
971 /*
972 * cancelled requests don't get events, userland was given one
973 * when the event got cancelled.
974 */
975 if (kiocbIsCancelled(iocb))
976 goto put_rq;
977
978 ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
979
980 tail = info->tail;
981 event = aio_ring_event(info, tail, KM_IRQ0);//这是个宏,用来临时将从用户虚存区间得到的物理页面影射到高端,这么做为了往里面写数据,毕竟现代操作系统内核只认虚拟地址
982 if (++tail >= info->nr)
983 tail = 0;
984
985 event->obj = (u64)(unsigned long)iocb->ki_obj.user;
986 event->data = iocb->ki_user_data;
987 event->res = res;
988 event->res2 = res2;
989
990 dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx/n",
991 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
992 res, res2);
993
994 /* after flagging the request as done, we
995 * must never even look at it again
996 */
997 smp_wmb(); /* make event visible before updating tail */
998
999 info->tail = tail;
1000 ring->tail = tail;
1001
1002 put_aio_ring_event(event, KM_IRQ0);//写完了,去映射,高端映射区间资源很宝贵,不要长期占用就好,都自觉点就行,内核并没有强制!
1003 kunmap_atomic(ring, KM_IRQ1);
1004
1005 pr_debug("added to ring %p at [%lu]/n", iocb, tail);
1006
1007 pr_debug("%ld retries: %d of %d/n", iocb->ki_retried,
1008 iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
1009 put_rq:
1010 /* everything turned out well, dispose of the aiocb. */
1011 ret = __aio_put_req(ctx, iocb);
1012
1013 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
1014
1015 if (waitqueue_active(&ctx->wait))//在sys_io_getevents中可能当下没有数据的时候要睡眠,现在有了,唤醒吧!
1016 wake_up(&ctx->wait);
1017
1018 if (ret)
1019 put_ioctx(ctx);
1020
1021 return ret;
1022 }
现在内核已经把数据放到一个地方了,就是info->ring_pages[0],就等着有人拿了。如果通读了代码,很多人不禁要问:直接申请一块内存不就得了,为什么还要映射来映射去的,难道不麻烦吗?
其实,调用这个完成函数的进程很可能根本就不是我们的用户进程。前面说过,所有的请求是放到工作队列中的,而工作队列是有自己的进程上下文的,所以就应该在请求者的地址空间申请内存,然后得到物理页;物理页面并不是进程级别的,所以可以随意操作。下面该真正的sys_io_getevents了:
1699 asmlinkage long sys_io_getevents(aio_context_t ctx_id,
1700 long min_nr,
1701 long nr,
1702 struct io_event __user *events,
1703 struct timespec __user *timeout)
1704 {
1705 struct kioctx *ioctx = lookup_ioctx(ctx_id);//轻车熟路!!
1706 long ret = -EINVAL;
1707
1708 if (likely(ioctx)) {
1709 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
1710 ret = read_events(ioctx, min_nr, nr, events, timeout);
1711 put_ioctx(ioctx);
1712 }
1713
1714 return ret;
1715 }
/
1104 static int read_events(struct kioctx *ctx,
1105 long min_nr, long nr,
1106 struct io_event __user *event,
1107 struct timespec __user *timeout)
1108 {
1109 long start_jiffies = jiffies;
1110 struct task_struct *tsk = current;
1111 DECLARE_WAITQUEUE(wait, tsk);
1112 int ret;
1113 int i = 0;
1114 struct io_event ent;
1115 struct aio_timeout to;
1116 int retry = 0;
1117
1118 /* needed to zero any padding within an entry (there shouldn't be
1119 * any, but C is fun!
1120 */
1121 memset(&ent, 0, sizeof(ent));
1122 retry:
1123 ret = 0;
1124 while (likely(i < nr)) {
1125 ret = aio_read_evt(ctx, &ent);//真正的读取
1126 if (unlikely(ret <= 0))
1127 break;
1128
1129 dprintk("read event: %Lx %Lx %Lx %Lx/n",
1130 ent.data, ent.obj, ent.res, ent.res2);
1131
1132 /* Could we split the check in two? */
1133 ret = -EFAULT;
1134 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1135 dprintk("aio: lost an event due to EFAULT./n");
1136 break;
1137 }
1138 ret = 0;
1139
1140 /* Good, event copied to userland, update counts. */
1141 event ++;
1142 i ++;
1143 }
1144
1145 if (min_nr <= i)
1146 return i;
1147 if (ret)
1148 return ret;
1149
1150 /* End fast path */
1151
1152 /* racey check, but it gets redone */
1153 if (!retry && unlikely(!list_empty(&ctx->run_list))) {
1154 retry = 1;
1155 aio_run_all_iocbs(ctx);
1156 goto retry;
1157 }
1158
1159 init_timeout(&to);
1160 if (timeout) {
1161 struct timespec ts;
1162 ret = -EFAULT;
1163 if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
1164 goto out;
1165
1166 set_timeout(start_jiffies, &to, &ts);
1167 }
1168
1169 while (likely(i < nr)) {
1170 add_wait_queue_exclusive(&ctx->wait, &wait);
1171 do {
1172 set_task_state(tsk, TASK_INTERRUPTIBLE);//没有数据,睡眠!
1173 ret = aio_read_evt(ctx, &ent);
1174 if (ret)
1175 break;
1176 if (min_nr <= i)
1177 break;
1178 ret = 0;
1179 if (to.timed_out) /* Only check after read evt */
1180 break;
1181 schedule();
1182 if (signal_pending(tsk)) {
1183 ret = -EINTR;
1184 break;
1185 }
1186 /*ret = aio_read_evt(ctx, &ent);*/
1187 } while (1) ;
1188
1189 set_task_state(tsk, TASK_RUNNING);//被唤醒,肯定有了数据
1190 remove_wait_queue(&ctx->wait, &wait);
1191
1192 if (unlikely(ret <= 0))
1193 break;
1194
1195 ret = -EFAULT;
1196 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1197 dprintk("aio: lost an event due to EFAULT./n");
1198 break;
1199 }
1200
1201 /* Good, event copied to userland, update counts. */
1202 event ++;
1203 i ++;
1204 }
1205
1206 if (timeout)
1207 clear_timeout(&to);
1208 out:
1209 return i ? i : ret;
1210 }
最后看一下aio_read_evt函数,这个函数把所有虚伪的东西落到实处:
1030 static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
1031 {
1032 struct aio_ring_info *info = &ioctx->ring_info;
1033 struct aio_ring *ring;
1034 unsigned long head;
1035 int ret = 0;
1036
1037 ring = kmap_atomic(info->ring_pages[0], KM_USER0);
1038 dprintk("in aio_read_evt h%lu t%lu m%lu/n",
1039 (unsigned long)ring->head, (unsigned long)ring->tail,
1040 (unsigned long)ring->nr);
1041
1042 if (ring->head == ring->tail)
1043 goto out;
1044
1045 spin_lock(&info->ring_lock);
1046
1047 head = ring->head % info->nr;
1048 if (head != ring->tail) {
1049 struct io_event *evp = aio_ring_event(info, head, KM_USER1);
1050 *ent = *evp;
1051 head = (head + 1) % info->nr;
1052 smp_mb(); /* finish reading the event before updatng the head */
1053 ring->head = head;
1054 ret = 1;
1055 put_aio_ring_event(evp, KM_USER1);
1056 }
1057 spin_unlock(&info->ring_lock);
1058
1059 out:
1060 kunmap_atomic(ring, KM_USER0);
1061 dprintk("leaving aio_read_evt: %d h%lu t%lu/n", ret,
1062 (unsigned long)ring->head, (unsigned long)ring->tail);
1063 return ret;
1064 }
这下脉络就很清晰了,到这里我想到了一些东西,我读内核的过程中发现,内核函数基本可以分为两种,一种为管理函数,一种为操作函数,管理函数主要涉及到很多很复杂的数据结构,可谓变态级别,
而操作函数一般一目了然,比如写寄存器,就一个writeX函数,由此想到了我们的社会何尝不是如此,管理机关机构臃肿,人员闲杂,而劳动人民则是那么的淳朴......
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值