应用层的网络操作大多会以系统调用的形式进入内核,再调用相应的内核函数。本文就分析一下这些系统调用对应的内核函数以及其他一些辅助函数。
本文重点分析UDP与TCP的read/write函数,以达到承前启后的目的(其他函数读者可以大致浏览一下)。
create
884 static int
885 ip_proto_create (struct socket *sock, int protocol)
886 {
887 volatile struct sock *sk;
888 struct proto *prot;
889 int err;
891 sk = kmalloc (sizeof (*sk), GFP_KERNEL);
892 if (sk == NULL)
893 return (-ENOMEM);
894 sk->num = 0;
897 switch (sock->type)
898 {
899 case SOCK_STREAM:
900 case SOCK_SEQPACKET:
901 if (protocol && protocol != IPPROTO_TCP)
902 {
903 kfree_s ((void *)sk, sizeof (*sk));
904 return (-EPROTONOSUPPORT);
905 }
906 sk->no_check = TCP_NO_CHECK;
907 prot = &tcp_prot;
908 break;
910 case SOCK_DGRAM:
911 if (protocol && protocol != IPPROTO_UDP)
912 {
913 kfree_s ((void *)sk, sizeof (*sk));
914 return (-EPROTONOSUPPORT);
915 }
916 sk->no_check = UDP_NO_CHECK;
917 prot=&udp_prot;
918 break;
920 case SOCK_RAW:
921 if (!suser())
922 {
923 kfree_s ((void *)sk, sizeof (*sk));
924 return (-EPERM);
925 }
927 if (!protocol)
928 {
929 kfree_s ((void *)sk, sizeof (*sk));
930 return (-EPROTONOSUPPORT);
931 }
932 prot = &raw_prot;
933 sk->reuse = 1;
934 sk->no_check = 0; /* doesn't matter no checksum is preformed
935 anyway. */
936 sk->num = protocol;
937 break;
939 case SOCK_PACKET:
940 if (!suser())
941 {
942 kfree_s ((void *)sk, sizeof (*sk));
943 return (-EPERM);
944 }
946 if (!protocol)
947 {
948 kfree_s ((void *)sk, sizeof (*sk));
949 return (-EPROTONOSUPPORT);
950 }
951 prot = &packet_prot;
952 sk->reuse = 1;
953 sk->no_check = 0; /* doesn't matter no checksum is preformed
954 anyway. */
955 sk->num = protocol;
956 break;
959 default:
960 kfree_s ((void *)sk, sizeof (*sk));
961 return (-ESOCKTNOSUPPORT);
963 }
885 ip_proto_create (struct socket *sock, int protocol)
886 {
887 volatile struct sock *sk;
888 struct proto *prot;
889 int err;
891 sk = kmalloc (sizeof (*sk), GFP_KERNEL);
892 if (sk == NULL)
893 return (-ENOMEM);
894 sk->num = 0;
897 switch (sock->type)
898 {
899 case SOCK_STREAM:
900 case SOCK_SEQPACKET:
901 if (protocol && protocol != IPPROTO_TCP)
902 {
903 kfree_s ((void *)sk, sizeof (*sk));
904 return (-EPROTONOSUPPORT);
905 }
906 sk->no_check = TCP_NO_CHECK;
907 prot = &tcp_prot;
908 break;
910 case SOCK_DGRAM:
911 if (protocol && protocol != IPPROTO_UDP)
912 {
913 kfree_s ((void *)sk, sizeof (*sk));
914 return (-EPROTONOSUPPORT);
915 }
916 sk->no_check = UDP_NO_CHECK;
917 prot=&udp_prot;
918 break;
920 case SOCK_RAW:
921 if (!suser())
922 {
923 kfree_s ((void *)sk, sizeof (*sk));
924 return (-EPERM);
925 }
927 if (!protocol)
928 {
929 kfree_s ((void *)sk, sizeof (*sk));
930 return (-EPROTONOSUPPORT);
931 }
932 prot = &raw_prot;
933 sk->reuse = 1;
934 sk->no_check = 0; /* doesn't matter no checksum is preformed
935 anyway. */
936 sk->num = protocol;
937 break;
939 case SOCK_PACKET:
940 if (!suser())
941 {
942 kfree_s ((void *)sk, sizeof (*sk));
943 return (-EPERM);
944 }
946 if (!protocol)
947 {
948 kfree_s ((void *)sk, sizeof (*sk));
949 return (-EPROTONOSUPPORT);
950 }
951 prot = &packet_prot;
952 sk->reuse = 1;
953 sk->no_check = 0; /* doesn't matter no checksum is preformed
954 anyway. */
955 sk->num = protocol;
956 break;
959 default:
960 kfree_s ((void *)sk, sizeof (*sk));
961 return (-ESOCKTNOSUPPORT);
963 }
以上代码根据sock的不同类型(sock->type)选择相应的协议(prot),并设置no_check等与协议相关的字段;类型与协议不匹配或权限不足时,释放sk并返回错误。
964 sk->protocol = protocol;
965 sk->wmem_alloc = 0;
966 sk->rmem_alloc = 0;
967 sk->pair = NULL;
968 sk->opt = NULL;
969 sk->send_seq = 0;
970 sk->acked_seq = 0;
971 sk->copied_seq = 0;
972 sk->fin_seq = 0;
973 sk->proc = 0;
974 sk->rtt = TCP_WRITE_TIME;
975 sk->packets_out = 0;
976 sk->cong_window = 1; /* start with only sending one packet at a time. */
977 sk->exp_growth = 1; /* if set cong_window grow exponentially every time
978 we get an ack. */
979 sk->urginline = 0;
980 sk->intr = 0;
981 sk->linger = 0;
982 sk->destroy = 0;
983 sk->reuse = 0;
984 sk->priority = 1;
985 sk->shutdown = 0;
986 sk->urg = 0;
987 sk->keepopen = 0;
988 sk->done = 0;
989 sk->ack_backlog = 0;
990 sk->window = 0;
991 sk->bytes_rcv = 0;
992 sk->state = TCP_CLOSE;
993 sk->dead = 0;
994 sk->ack_timed = 0;
995 sk->send_tmp = NULL;
996 sk->mss = 0; /* we will try not to send any packets smaller
997 than this. */
999 /* this is how many unacked bytes we will accept for
1000 this socket. */
1002 sk->max_unacked = 2048; /* needs to be at most 2 full packets. */
1004 /* how many packets we should send before forcing an ack.
1005 if this is set to zero it is the same as sk->delay_acks = 0 */
1007 sk->max_ack_backlog = MAX_ACK_BACKLOG;
1008 sk->inuse = 0;
1009 sk->delay_acks = 1; /* default to waiting a while before sending
1010 acks. */
1011 sk->wback = NULL;
1012 sk->wfront = NULL;
1013 sk->rqueue = NULL;
1014 sk->mtu = 576;
1015 sk->prot = prot;
1016 sk->sleep = sock->wait;
1017 sk->daddr = 0;
1018 sk->saddr = MY_IP_ADDR;
1019 sk->err = 0;
1020 sk->next = NULL;
1021 sk->pair = NULL;
1022 sk->send_tail = NULL;
1023 sk->send_head = NULL;
1024 sk->time_wait.len = TCP_CONNECT_TIME;
1025 sk->time_wait.when = 0;
1026 sk->time_wait.sk = sk;
1027 sk->time_wait.next = NULL;
1028 sk->timeout = 0;
1029 sk->back_log = NULL;
1030 sk->blog = 0;
1031 sock->data =(void *) sk;
1032 sk->dummy_th.doff = sizeof (sk->dummy_th)/4;
1033 sk->dummy_th.res1=0;
1034 sk->dummy_th.res2=0;
1035 sk->dummy_th.urg_ptr = 0;
1036 sk->dummy_th.fin = 0;
1037 sk->dummy_th.syn = 0;
1038 sk->dummy_th.rst = 0;
1039 sk->dummy_th.psh = 0;
1040 sk->dummy_th.ack = 0;
1041 sk->dummy_th.urg = 0;
1042 sk->dummy_th.dest = 0;
1044 if (sk->num)
1045 {
1046 /* it assumes that any protocol which allows
1047 the user to assign a number at socket
1048 creation time automatically
1049 shares. */
1050 put_sock (sk->num, sk);
1051 sk->dummy_th.source = net16(sk->num);
1052 }
1054 if (sk->prot->init)
1055 {
1056 err = sk->prot->init(sk);
1057 if (err != 0)
1058 {
1059 destroy_sock (sk);
1060 return (err);
1061 }
1062 }
1063 return (0);
1064 }
代码虽长,逻辑却很简单:根据sock类型的不同选择相应的协议,创建struct sock类型的变量sk并保存到sock的data域中;之后如果sk->num不为0(SOCK_RAW和SOCK_PACKET在创建时就把num设置为protocol),就将其加入到sock_array数组中;最后调用协议的初始化函数(如果有的话)进行初始化。
964 sk->protocol = protocol;
965 sk->wmem_alloc = 0;
966 sk->rmem_alloc = 0;
967 sk->pair = NULL;
968 sk->opt = NULL;
969 sk->send_seq = 0;
970 sk->acked_seq = 0;
971 sk->copied_seq = 0;
972 sk->fin_seq = 0;
973 sk->proc = 0;
974 sk->rtt = TCP_WRITE_TIME;
975 sk->packets_out = 0;
976 sk->cong_window = 1; /* start with only sending one packet at a time. */
977 sk->exp_growth = 1; /* if set cong_window grow exponentially every time
978 we get an ack. */
979 sk->urginline = 0;
980 sk->intr = 0;
981 sk->linger = 0;
982 sk->destroy = 0;
983 sk->reuse = 0;
984 sk->priority = 1;
985 sk->shutdown = 0;
986 sk->urg = 0;
987 sk->keepopen = 0;
988 sk->done = 0;
989 sk->ack_backlog = 0;
990 sk->window = 0;
991 sk->bytes_rcv = 0;
992 sk->state = TCP_CLOSE;
993 sk->dead = 0;
994 sk->ack_timed = 0;
995 sk->send_tmp = NULL;
996 sk->mss = 0; /* we will try not to send any packets smaller
997 than this. */
999 /* this is how many unacked bytes we will accept for
1000 this socket. */
1002 sk->max_unacked = 2048; /* needs to be at most 2 full packets. */
1004 /* how many packets we should send before forcing an ack.
1005 if this is set to zero it is the same as sk->delay_acks = 0 */
1007 sk->max_ack_backlog = MAX_ACK_BACKLOG;
1008 sk->inuse = 0;
1009 sk->delay_acks = 1; /* default to waiting a while before sending
1010 acks. */
1011 sk->wback = NULL;
1012 sk->wfront = NULL;
1013 sk->rqueue = NULL;
1014 sk->mtu = 576;
1015 sk->prot = prot;
1016 sk->sleep = sock->wait;
1017 sk->daddr = 0;
1018 sk->saddr = MY_IP_ADDR;
1019 sk->err = 0;
1020 sk->next = NULL;
1021 sk->pair = NULL;
1022 sk->send_tail = NULL;
1023 sk->send_head = NULL;
1024 sk->time_wait.len = TCP_CONNECT_TIME;
1025 sk->time_wait.when = 0;
1026 sk->time_wait.sk = sk;
1027 sk->time_wait.next = NULL;
1028 sk->timeout = 0;
1029 sk->back_log = NULL;
1030 sk->blog = 0;
1031 sock->data =(void *) sk;
1032 sk->dummy_th.doff = sizeof (sk->dummy_th)/4;
1033 sk->dummy_th.res1=0;
1034 sk->dummy_th.res2=0;
1035 sk->dummy_th.urg_ptr = 0;
1036 sk->dummy_th.fin = 0;
1037 sk->dummy_th.syn = 0;
1038 sk->dummy_th.rst = 0;
1039 sk->dummy_th.psh = 0;
1040 sk->dummy_th.ack = 0;
1041 sk->dummy_th.urg = 0;
1042 sk->dummy_th.dest = 0;
1044 if (sk->num)
1045 {
1046 /* it assumes that any protocol which allows
1047 the user to assign a number at socket
1048 creation time automatically
1049 shares. */
1050 put_sock (sk->num, sk);
1051 sk->dummy_th.source = net16(sk->num);
1052 }
1054 if (sk->prot->init)
1055 {
1056 err = sk->prot->init(sk);
1057 if (err != 0)
1058 {
1059 destroy_sock (sk);
1060 return (err);
1061 }
1062 }
1063 return (0);
1064 }
代码虽长,逻辑却很简单:根据sock类型的不同选择相应的协议,创建struct sock类型的变量sk并保存到sock的data域中;之后如果sk->num不为0(SOCK_RAW和SOCK_PACKET在创建时就把num设置为protocol),就将其加入到sock_array数组中;最后调用协议的初始化函数(如果有的话)进行初始化。
listen
808 static int
809 ip_proto_listen(struct socket *sock, int backlog)
810 {
811 volatile struct sock *sk;
812 sk = sock->data;
809 ip_proto_listen(struct socket *sock, int backlog)
810 {
811 volatile struct sock *sk;
812 sk = sock->data;
(这里取出的sock->data,就是上面create函数第1031行保存进去的sk)
813 if (sk == NULL)
814 {
815 printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
816 return (0);
817 }
819 /* we may need to bind the socket. */
820 if (sk->num == 0)
821 {
822 sk->num = get_new_socknum (sk->prot, 0);
823 if (sk->num == 0) return (-EAGAIN);
824 put_sock (sk->num, sk);
825 sk->dummy_th.source = net16(sk->num);
826 }
上面的create函数中,如果sock类型是SOCK_STREAM(创建的是TCP连接),并不会设置num属性,num仍然为0。因此现在需要为其分配一个端口号,分配成功之后把该sock加入到sock_array数组中,并把端口号写入dummy_th的source字段;如果分配失败则返回-EAGAIN。
828 /* we might as well re use these. */
829 sk->max_ack_backlog = backlog; //最大积压确认队列
830 sk->ack_backlog = 0;
831 sk->state = TCP_LISTEN; //设置状态为listen
832 return (0);
833 }
813 if (sk == NULL)
814 {
815 printk ("Warning: sock->data = NULL: %d\n" ,__LINE__);
816 return (0);
817 }
819 /* we may need to bind the socket. */
820 if (sk->num == 0)
821 {
822 sk->num = get_new_socknum (sk->prot, 0);
823 if (sk->num == 0) return (-EAGAIN);
824 put_sock (sk->num, sk);
825 sk->dummy_th.source = net16(sk->num);
826 }
上面的create函数中,如果sock类型是SOCK_STREAM(创建的是TCP连接),并不会设置num属性,num仍然为0。因此现在需要为其分配一个端口号,分配成功之后把该sock加入到sock_array数组中,并把端口号写入dummy_th的source字段;如果分配失败则返回-EAGAIN。
828 /* we might as well re use these. */
829 sk->max_ack_backlog = backlog; //最大积压确认队列
830 sk->ack_backlog = 0;
831 sk->state = TCP_LISTEN; //设置状态为listen
832 return (0);
833 }
可以看到listen也比较简单:从参数指明的sock的data域中取出在创建过程中保存的sk;如果还没有分配端口,就为其分配一个端口,把该sk加入到sock_array中,同时把该端口赋值到dummy_th的source字段;然后设置最大积压确认队列的大小,并把状态设置为listen。对于TCP连接来说,只是创建socket并不会把它加入到sock_array中;在没有事先绑定端口的情况下,调用listen之后它才会被加入到sock_array中。
1455 /* This routine handles a connection request. This should make sure
1456 we haven't already responded. */
1457 /* Because of the way BSD works, we have to send a syn/ack now. This also
1458 means it will be harder to close a socket which is listening. */
1460 static void
1461 tcp_conn_request(volatile struct sock *sk, struct sk_buff *skb,
1462 unsigned long daddr,
1463 unsigned long saddr, struct options *opt, struct device *dev)
1464 {
1465 struct sk_buff *buff;
1466 struct tcp_header *t1;
1467 unsigned char *ptr;
1468 volatile struct sock *newsk;
1469 struct tcp_header *th;
1470 int tmp;
1471 th = skb->h.th;
1473 PRINTK ("tcp_conn_request (sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
1474 " opt = %X, dev = %X)\n",
1475 sk, skb, daddr, saddr, opt, dev);
1477 /* if the socket is dead, don't accept the connection. */
1478 if (!sk->dead)
1479 {
1480 wake_up(sk->sleep);
1481 }
1482 else
1483 {
1484 PRINTK ("tcp_conn_request on dead socket\n");
1485 tcp_reset (daddr, saddr, th, sk->prot, opt, dev);
1486 kfree_skb (skb, FREE_READ);
1487 return;
1488 }
1456 we haven't already responded. */
1457 /* Because of the way BSD works, we have to send a syn/ack now. This also
1458 means it will be harder to close a socket which is listening. */
1460 static void
1461 tcp_conn_request(volatile struct sock *sk, struct sk_buff *skb,
1462 unsigned long daddr,
1463 unsigned long saddr, struct options *opt, struct device *dev)
1464 {
1465 struct sk_buff *buff;
1466 struct tcp_header *t1;
1467 unsigned char *ptr;
1468 volatile struct sock *newsk;
1469 struct tcp_header *th;
1470 int tmp;
1471 th = skb->h.th;
1473 PRINTK ("tcp_conn_request (sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
1474 " opt = %X, dev = %X)\n",
1475 sk, skb, daddr, saddr, opt, dev);
1477 /* if the socket is dead, don't accept the connection. */
1478 if (!sk->dead)
1479 {
1480 wake_up(sk->sleep);
1481 }
1482 else
1483 {
1484 PRINTK ("tcp_conn_request on dead socket\n");
1485 tcp_reset (daddr, saddr, th, sk->prot, opt, dev);
1486 kfree_skb (skb, FREE_READ);
1487 return;
1488 }
1490 /* make sure we can accept more. This will prevent a flurry of
1491 syns from eating up all our memory. */
1492 if (sk->ack_backlog >= sk->max_ack_backlog)
1493 {
1494 kfree_skb (skb, FREE_READ);
1495 return;
1496 }
因为可能会有非常多的主机同时想要连接到服务器,在服务器接受连接之前,这些TCP连接请求会被暂时保存。为了防止内存耗尽,需要限制最多积压确认数:一旦ack_backlog达到max_ack_backlog,新到的SYN就会被直接丢弃。
1498 /* we need to build a new sock struct. */
1499 /* It is sort of bad to have a socket without an inode attached to
1500 it, but the wake_up's will just wake up the listening socket,
1501 and if the listening socket is destroyed before this is taken
1502 off of the queue, this will take care of it. */
1504 newsk = kmalloc(sizeof (struct sock), GFP_ATOMIC);
1505 if (newsk == NULL)
1506 {
1507 /* just ignore the syn. It will get retransmitted. */
1508 kfree_skb (skb, FREE_READ);
1509 return;
1510 }
1513 PRINTK ("newsk = %X\n", newsk);
1514 memcpy ((void *)newsk, (void *)sk, sizeof (*newsk));
1515 newsk->wback = NULL;
1516 newsk->wfront = NULL;
1517 newsk->rqueue = NULL;
1518 newsk->send_head = NULL;
1519 newsk->send_tail = NULL;
1520 newsk->back_log = NULL;
1521 newsk->blog = 0;
1522 newsk->intr = 0;
1523 newsk->proc = 0;
1524 newsk->done = 0;
1525 newsk->send_tmp = NULL;
1526 newsk->pair = NULL;
1527 newsk->wmem_alloc = 0;
1528 newsk->rmem_alloc = 0;
1514 memcpy ((void *)newsk, (void *)sk, sizeof (*newsk));
1515 newsk->wback = NULL;
1516 newsk->wfront = NULL;
1517 newsk->rqueue = NULL;
1518 newsk->send_head = NULL;
1519 newsk->send_tail = NULL;
1520 newsk->back_log = NULL;
1521 newsk->blog = 0;
1522 newsk->intr = 0;
1523 newsk->proc = 0;
1524 newsk->done = 0;
1525 newsk->send_tmp = NULL;
1526 newsk->pair = NULL;
1527 newsk->wmem_alloc = 0;
1528 newsk->rmem_alloc = 0;
1530 newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
1532 newsk->err = 0;
1533 newsk->shutdown = 0;
1534 newsk->ack_backlog = 0;
1535 newsk->acked_seq = skb->h.th->seq+1;
1536 newsk->fin_seq = skb->h.th->seq;
1537 newsk->copied_seq = skb->h.th->seq;
1538 newsk->state = TCP_SYN_RECV;
1539 newsk->timeout = 0;
1540 newsk->send_seq = timer_seq*SEQ_TICK-seq_offset;
1541 newsk->rcv_ack_seq = newsk->send_seq;
1542 newsk->urg =0;
1543 newsk->retransmits = 0;
1544 newsk->destroy = 0;
1545 newsk->time_wait.sk = newsk;
1546 newsk->time_wait.next = NULL;
1547 newsk->dummy_th.source = skb->h.th->dest;
1548 newsk->dummy_th.dest = skb->h.th->source;
1549 /* swap these two, they are from our point of view. */
1550 newsk->daddr=saddr;
1551 newsk->saddr=daddr;
1553 put_sock (newsk->num,newsk);
1554 newsk->dummy_th.res1=0;
1555 newsk->dummy_th.doff=6;
1556 newsk->dummy_th.fin=0;
1557 newsk->dummy_th.syn=0;
1558 newsk->dummy_th.rst=0;
1559 newsk->dummy_th.psh=0;
1560 newsk->dummy_th.ack=0;
1561 newsk->dummy_th.urg=0;
1562 newsk->dummy_th.res2=0;
1563 newsk->acked_seq = skb->h.th->seq+1;
1564 newsk->copied_seq
1532 newsk->err = 0;
1533 newsk->shutdown = 0;
1534 newsk->ack_backlog = 0;
1535 newsk->acked_seq = skb->h.th->seq+1;
1536 newsk->fin_seq = skb->h.th->seq;
1537 newsk->copied_seq = skb->h.th->seq;
1538 newsk->state = TCP_SYN_RECV;
1539 newsk->timeout = 0;
1540 newsk->send_seq = timer_seq*SEQ_TICK-seq_offset;
1541 newsk->rcv_ack_seq = newsk->send_seq;
1542 newsk->urg =0;
1543 newsk->retransmits = 0;
1544 newsk->destroy = 0;
1545 newsk->time_wait.sk = newsk;
1546 newsk->time_wait.next = NULL;
1547 newsk->dummy_th.source = skb->h.th->dest;
1548 newsk->dummy_th.dest = skb->h.th->source;
1549 /* swap these two, they are from our point of view. */
1550 newsk->daddr=saddr;
1551 newsk->saddr=daddr;
1553 put_sock (newsk->num,newsk);
1554 newsk->dummy_th.res1=0;
1555 newsk->dummy_th.doff=6;
1556 newsk->dummy_th.fin=0;
1557 newsk->dummy_th.syn=0;
1558 newsk->dummy_th.rst=0;
1559 newsk->dummy_th.psh=0;
1560 newsk->dummy_th.ack=0;
1561 newsk->dummy_th.urg=0;
1562 newsk->dummy_th.res2=0;
1563 newsk->acked_seq = skb->h.th->seq+1;
1564 newsk->copied_seq