linux内核分析--异步io（三）-CSDN博客

本文链接：https://blog.csdn.net/abisineer/article/details/2767918

用户建立了异步io环境，并且提交了异步io请求，该做的都做了，剩下的就是结果了。人生漂泊，有因无果，结果真的重要吗？务实一点说，重要，真正不在乎结果的人又有几个呢？人犹如此，内核就更不用说了。我拿到钱大把大把地花，等到请客吃饭时，囊中羞涩，这也是一种务实--肥水不流外人田；我有了时间，大把大把地浪费，等到考试或考核时，总在呐喊：再多一秒吧！linux内核是这样的吗？很抱歉，不是！我觉得它是世界上最吝啬的了，不花一分冤枉钱。啥时候Windows也能这么吝啬就好了，我们就不用攒几个月的工资来买一块显卡，仅仅为了玩一个几个月后就过时的游戏了。君不见Linux跑在奔二128M内存机器上，天马行空，Windows在酷睿1G内存机器上依然蠕动，钞票啊！好了，不扯犊子了，第三场，开始！
为了得到结果，用户应该调用io_getevents库函数，该函数进行sys_io_getevents系统调用，实际上不用分析代码也能猜个八九不离十，但是分析该系统调用前，必须先看一眼aio_complete函数，毕竟，得有人把数据放到一个地方你才可以读，而aio_complete函数就是做这个的：

/*
 * aio_complete() - record the completion of one iocb.
 * (The numbers at the start of each line are the line numbers of the quoted
 * kernel source.)
 *
 * @iocb: the request that has finished
 * @res:  first result value, stored in the event's res field
 * @res2: second result value, stored in the event's res2 field
 *
 * Sync iocbs: res is stored in iocb->ki_user_data, the task recorded in
 * iocb->ki_obj.tsk is woken, and 1 is returned.
 *
 * All other iocbs: unless the request was cancelled, an io_event is written
 * into the completion ring at info->tail and the tail is advanced, wrapping
 * to 0 at info->nr.  The iocb is then released with __aio_put_req() and any
 * task waiting on ctx->wait is woken.
 *
 * Returns 1 for a sync iocb, otherwise the value of __aio_put_req().
 */
933 int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
934 {
935         struct kioctx   *ctx = iocb->ki_ctx;
936         struct aio_ring_info    *info;
937         struct aio_ring *ring;
938         struct io_event *event;
939         unsigned long   flags;
940         unsigned long   tail;
941         int             ret;
942 
943         /*
944          * Special case handling for sync iocbs:
945          *  - events go directly into the iocb for fast handling
946          *  - the sync task with the iocb in its stack holds the single iocb
947          *    ref, no other paths have a way to get another ref
948          *  - the sync task helpfully left a reference to itself in the iocb
949          */
950         if (is_sync_kiocb(iocb)) {
951                 BUG_ON(iocb->ki_users != 1);
952                 iocb->ki_user_data = res;
953                 iocb->ki_users = 0;
954                 wake_up_process(iocb->ki_obj.tsk);
955                 return 1;
956         }
957 
958         info = &ctx->ring_info; /* ring bookkeeping kept inside the context (the article attributes its setup to setup_ring, not shown here) */
959 
960         /* add a completion event to the ring buffer.
961          * must be done holding ctx->ctx_lock to prevent
962          * other code from messing with the tail
963          * pointer since we might be called from irq
964          * context.
965          */
966         spin_lock_irqsave(&ctx->ctx_lock, flags);
967 
968         if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
969                 list_del_init(&iocb->ki_run_list);
970 
971         /*
972          * cancelled requests don't get events, userland was given one
973          * when the event got cancelled.
974          */
975         if (kiocbIsCancelled(iocb))
976                 goto put_rq;
977 
978         ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
979 
980         tail = info->tail;
981         event = aio_ring_event(info, tail, KM_IRQ0); /* presumably maps the page holding slot `tail` so the event can be written; released by put_aio_ring_event() below -- helper not shown here */
982         if (++tail >= info->nr)
983                 tail = 0;
984 
985         event->obj = (u64)(unsigned long)iocb->ki_obj.user;
986         event->data = iocb->ki_user_data;
987         event->res = res;
988         event->res2 = res2;
989 
990         dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
991                 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
992                 res, res2);
993 
994         /* after flagging the request as done, we
995          * must never even look at it again
996          */
997         smp_wmb();      /* make event visible before updating tail */
998 
999         info->tail = tail;
1000         ring->tail = tail;
1001 
1002         put_aio_ring_event(event, KM_IRQ0); /* done writing: release the event mapping right away */
1003         kunmap_atomic(ring, KM_IRQ1);
1004 
1005         pr_debug("added to ring %p at [%lu]\n", iocb, tail);
1006 
1007         pr_debug("%ld retries: %d of %d\n", iocb->ki_retried,
1008                 iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
1009 put_rq:
1010         /* everything turned out well, dispose of the aiocb. */
1011         ret = __aio_put_req(ctx, iocb);
1012 
1013         spin_unlock_irqrestore(&ctx->ctx_lock, flags);
1014 
1015         if (waitqueue_active(&ctx->wait)) /* read_events() sleeps on ctx->wait when no event is available yet; wake it now */
1016                 wake_up(&ctx->wait);
1017 
1018         if (ret)
1019                 put_ioctx(ctx);
1020 
1021         return ret;
1022 }
现在内核已经把数据放到一个地方了，就是info->ring_pages对应的环形缓冲区（其中ring_pages[0]存放着head、tail），就等着有人来拿了。通读了代码，很多人不禁要问：直接申请一块内存不就得了，为什么还要映射来映射去，难道不麻烦吗？
其实，调用这个完成函数的进程很可能根本不是我们的用户进程。前面说过，所有的请求是放到工作队列中的，而工作队列是有自己的进程上下文的，所以就应该在请求者的地址空间申请内存，然后得到物理页；物理页面并不是进程级别的，所以可以在别的上下文中临时映射后操作。下面该真正的sys_io_getevents了：
/*
 * sys_io_getevents() - system call entry for fetching completion events.
 * (The numbers at the start of each line are the line numbers of the quoted
 * kernel source.)
 *
 * Looks up the context for ctx_id, validates the counts and hands the real
 * work to read_events().
 *
 * Returns -EINVAL when the context cannot be found or when the counts fail
 * the check (0 <= min_nr <= nr); otherwise returns whatever read_events()
 * returns.
 */
1699 asmlinkage long sys_io_getevents(aio_context_t ctx_id,
1700                                  long min_nr,
1701                                  long nr,
1702                                  struct io_event __user *events,
1703                                  struct timespec __user *timeout)
1704 {
1705         struct kioctx *ioctx = lookup_ioctx(ctx_id); /* resolve the id to its kioctx; a NULL result is handled below */
1706         long ret = -EINVAL;
1707 
1708         if (likely(ioctx)) {
1709                 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
1710                         ret = read_events(ioctx, min_nr, nr, events, timeout);
1711                 put_ioctx(ioctx); /* presumably drops the reference taken by lookup_ioctx() -- TODO confirm */
1712         }
1713 
1714         return ret;
1715 }
sys_io_getevents只做了参数检查，真正干活的是它调用的read_events：
/*
 * read_events() - copy up to nr completion events to user space.
 * (The numbers at the start of each line are the line numbers of the quoted
 * kernel source.)
 *
 * @ctx:     the aio context to read from
 * @min_nr:  number of events the caller wants before returning
 * @nr:      maximum number of events to copy
 * @event:   user-space array that receives the events
 * @timeout: optional user-space timespec; NULL means no timeout is armed
 *
 * Fast path: pull events with aio_read_evt() and copy each one out, without
 * sleeping.  If at least min_nr events were copied, return that count; if an
 * error occurred, return it.
 *
 * Otherwise, at most once, run the iocbs queued on ctx->run_list and retry
 * the fast path.  After that the task waits on ctx->wait (exclusive,
 * interruptible) and copies events out one at a time until nr is reached,
 * min_nr is satisfied, the timeout fires, a signal is pending (-EINTR), or a
 * copy to user space fails (-EFAULT).
 *
 * Returns the number of events copied if any were copied, otherwise the last
 * status (0 or a negative error).
 */
1104 static int read_events(struct kioctx *ctx,
1105                         long min_nr, long nr,
1106                         struct io_event __user *event,
1107                         struct timespec __user *timeout)
1108 {
1109         long                    start_jiffies = jiffies;
1110         struct task_struct      *tsk = current;
1111         DECLARE_WAITQUEUE(wait, tsk);
1112         int                     ret;
1113         int                     i = 0;
1114         struct io_event         ent;
1115         struct aio_timeout      to;
1116         int                     retry = 0;
1117 
1118         /* needed to zero any padding within an entry (there shouldn't be 
1119          * any, but C is fun!
1120          */
1121         memset(&ent, 0, sizeof(ent));
1122 retry:
1123         ret = 0;
1124         while (likely(i < nr)) {
1125                 ret = aio_read_evt(ctx, &ent); /* the actual read: take one event off the ring into ent */
1126                 if (unlikely(ret <= 0))
1127                         break;
1128 
1129                 dprintk("read event: %Lx %Lx %Lx %Lx\n",
1130                         ent.data, ent.obj, ent.res, ent.res2);
1131 
1132                 /* Could we split the check in two? */
1133                 ret = -EFAULT;
1134                 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1135                         dprintk("aio: lost an event due to EFAULT.\n");
1136                         break;
1137                 }
1138                 ret = 0;
1139 
1140                 /* Good, event copied to userland, update counts. */
1141                 event ++;
1142                 i ++;
1143         }
1144 
1145         if (min_nr <= i)
1146                 return i;
1147         if (ret)
1148                 return ret;
1149 
1150         /* End fast path */
1151 
1152         /* racey check, but it gets redone */
1153         if (!retry && unlikely(!list_empty(&ctx->run_list))) {
1154                 retry = 1;
1155                 aio_run_all_iocbs(ctx);
1156                 goto retry;
1157         }
1158 
1159         init_timeout(&to);
1160         if (timeout) {
1161                 struct timespec ts;
1162                 ret = -EFAULT;
1163                 if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
1164                         goto out;
1165 
1166                 set_timeout(start_jiffies, &to, &ts);
1167         }
1168 
1169         while (likely(i < nr)) {
1170                 add_wait_queue_exclusive(&ctx->wait, &wait);
1171                 do {
1172                         set_task_state(tsk, TASK_INTERRUPTIBLE); /* prepare to sleep; the ring is re-checked before schedule() is actually called */
1173                         ret = aio_read_evt(ctx, &ent);
1174                         if (ret)
1175                                 break;
1176                         if (min_nr <= i)
1177                                 break;
1178                         ret = 0;
1179                         if (to.timed_out)       /* Only check after read evt */
1180                                 break;
1181                         schedule();
1182                         if (signal_pending(tsk)) {
1183                                 ret = -EINTR;
1184                                 break;
1185                         }
1186                         /*ret = aio_read_evt(ctx, &ent);*/
1187                 } while (1) ;
1188 
1189                 set_task_state(tsk, TASK_RUNNING); /* loop ended: an event was read, min_nr is already met, the timeout fired, or a signal is pending */
1190                 remove_wait_queue(&ctx->wait, &wait);
1191 
1192                 if (unlikely(ret <= 0))
1193                         break;
1194 
1195                 ret = -EFAULT;
1196                 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1197                         dprintk("aio: lost an event due to EFAULT.\n");
1198                         break;
1199                 }
1200 
1201                 /* Good, event copied to userland, update counts. */
1202                 event ++;
1203                 i ++;
1204         }
1205 
1206         if (timeout)
1207                 clear_timeout(&to);
1208 out:
1209         return i ? i : ret;
1210 }
最后看一下aio_read_evt函数，这个函数把所有虚伪的东西落到实处：
/*
 * aio_read_evt() - take one completion event off the ring.
 * (The numbers at the start of each line are the line numbers of the quoted
 * kernel source.)
 *
 * @ioctx: the aio context whose ring is read
 * @ent:   kernel buffer that receives a copy of the event
 *
 * First compares ring->head with ring->tail without taking the lock and
 * returns early when they are equal.  Otherwise the comparison is repeated
 * under info->ring_lock; if an event is present it is copied to *ent and
 * ring->head is advanced modulo info->nr.
 *
 * Returns 1 when an event was copied into *ent, 0 when none was available.
 */
1030 static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
1031 {
1032         struct aio_ring_info *info = &ioctx->ring_info;
1033         struct aio_ring *ring;
1034         unsigned long head;
1035         int ret = 0;
1036 
1037         ring = kmap_atomic(info->ring_pages[0], KM_USER0);
1038         dprintk("in aio_read_evt h%lu t%lu m%lu\n",
1039                  (unsigned long)ring->head, (unsigned long)ring->tail,
1040                  (unsigned long)ring->nr);
1041 
1042         if (ring->head == ring->tail)
1043                 goto out;
1044 
1045         spin_lock(&info->ring_lock);
1046 
1047         head = ring->head % info->nr;
1048         if (head != ring->tail) {
1049                 struct io_event *evp = aio_ring_event(info, head, KM_USER1);
1050                 *ent = *evp;
1051                 head = (head + 1) % info->nr;
1052                 smp_mb(); /* finish reading the event before updating the head */
1053                 ring->head = head;
1054                 ret = 1;
1055                 put_aio_ring_event(evp, KM_USER1);
1056         }
1057         spin_unlock(&info->ring_lock);
1058 
1059 out:
1060         kunmap_atomic(ring, KM_USER0);
         /* NOTE(review): the dprintk below reads ring->head / ring->tail after
          * ring has been unmapped above; presumably harmless only while
          * dprintk compiles to nothing -- confirm before enabling it.
          */
1061         dprintk("leaving aio_read_evt: %d  h%lu t%lu\n", ret,
1062                  (unsigned long)ring->head, (unsigned long)ring->tail);
1063         return ret;
1064 }
这下脉络就很清晰了，到这里我想到了一些东西，我读内核的过程中发现，内核函数基本可以分为两种，一种为管理函数，一种为操作函数，管理函数主要涉及到很多很复杂的数据结构，可谓变态级别，
而操作函数一般一目了然，比如写寄存器，就一个writeX函数，由此想到了我们的社会何尝不是如此，管理机关机构臃肿，人员闲杂，而劳动人民则是那么的淳朴......