ip_append_data函数的主要任务是创建套接字缓冲区(struct sk_buff结构体),为IP层数据分片做好准备。
725 int ip_append_data(struct sock *sk,
726 int getfrag(void *from, char *to, int offset, int len,727 int odd, struct sk_buff *skb),
728 void *from, int length, int transhdrlen,
729 struct ipcm_cookie *ipc, struct rtable *rt,
730 unsigned int flags)
731 {
732 struct inet_opt *inet = inet_sk(sk);733 struct sk_buff *skb;
735 struct ip_options *opt = NULL;
736 int hh_len;737 int exthdrlen;
738 int mtu;
739 int copy;
740 int err;741 int offset = 0;
742 unsigned int maxfraglen, fragheaderlen;
743 int csummode = CHECKSUM_NONE;
744
745 if (flags&MSG_PROBE)
746 return 0;
//判断套接字发送队列sk->sk_write_queue是否为空,如果队列为空,则对inet->cork初始化,为分片做准备
748 if (skb_queue_empty(&sk->sk_write_queue)) {
749 /*
750 * setup for corking.
751 */
//如果IP选项不为空,则在inet->cork中设置选项处理记录。
752 opt = ipc->opt;
//如果选项不为空,cork.opt为空,则为它分配一块内存区域,为什么+40?不明白
753 if (opt) {754 if (inet->cork.opt == NULL) {
755 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
756 if (unlikely(inet->cork.opt == NULL))
757 return -ENOBUFS;
758 }
否则,直接将IP选项中的内容拷贝到inet->cork.opt中。
759 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
760 inet->cork.flags |= IPCORK_OPT;
761 inet->cork.addr = ipc->addr;
762 }
763 dst_hold(&rt->u.dst);
//得到用来分片的MTU
764 inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);//inet->cork.rt是套接字携带的路由表项信息
765 inet->cork.rt = rt;
766 inet->cork.length = 0;
//初始化分片位置信息:
//sk_sndmsg_page指向分片首地址
//sk_sndmsg_off是下一分片的存放位置
767 sk->sk_sndmsg_page = NULL;768 sk->sk_sndmsg_off = 0;
769 if ((exthdrlen = rt->u.dst.header_len) != 0) {
770 length += exthdrlen;
771 transhdrlen += exthdrlen;
772 }
773 }
//如果IP选项为空
else {
774 rt = inet->cork.rt;775 if (inet->cork.flags & IPCORK_OPT)
776 opt = inet->cork.opt;
777
778 transhdrlen = 0;
779 exthdrlen = 0;
780 mtu = inet->cork.fragsize;
781 }
782 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
783
784 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
785 maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
786
787 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
788 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-
exthdrlen);
789 return -EMSGSIZE;
790 }
791
792 /*
793 * transhdrlen > 0 means that this is the first fragment and we wish
794 * it won't be fragmented in the future.
795 */
796 if (transhdrlen &&
797 length + fragheaderlen <= maxfraglen &&
798 rt->u.dst.dev->features&
(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
799 !exthdrlen)
800 csummode = CHECKSUM_HW;
801
802 inet->cork.length += length;
803
804 /* So, what's going on in the loop below?
805 *
806 * We use calculated fragment length to generate chained skb,
807 * each of segments is IP fragment ready for sending to network after
808 * adding appropriate IP header.
809 *
810 * Mistake is:
811 *
812 * If mtu-fragheaderlen is not 0 modulo 8, we generate additional
813 * small fragment of length (mtu-fragheaderlen)%8, even though
814 * it is not necessary. Not a big bug, but needs a fix.
815 */
816
817 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
818 goto alloc_new_skb;
819
820 while (length > 0) {
821 if ((copy = maxfraglen - skb->len) <= 0) {
822 char *data;
823 unsigned int datalen;
824 unsigned int fraglen;
825 unsigned int alloclen;
826 BUG_TRAP(copy == 0);
827
828 alloc_new_skb:
829 datalen = maxfraglen - fragheaderlen;
830 if (datalen > length)
831 datalen = length;
832
833 fraglen = datalen + fragheaderlen;
834 if ((flags & MSG_MORE) &&
835 !(rt->u.dst.dev->features&NETIF_F_SG))
836 alloclen = maxfraglen;
837 else
838 alloclen = datalen + fragheaderlen;
839
840 /* The last fragment gets additional space at tail.
841 * Note, with MSG_MORE we overallocate on fragments,
842 * because we have no idea what fragment will be
843 * the last.
844 */
845 if (datalen == length)
846 alloclen += rt->u.dst.trailer_len;
847
848 if (transhdrlen) {
849 skb = sock_alloc_send_skb(sk,
850 alloclen + hh_len + 15,
851 (flags & MSG_DONTWAIT), &err);
852 } else {
853 skb = NULL;
854 if (atomic_read(&sk->sk_wmem_alloc) <=
855 2 * sk->sk_sndbuf)
856 skb = sock_wmalloc(sk,
857 alloclen + hh_len + 15,
1,
858 sk->sk_allocation);
859 if (unlikely(skb == NULL))
860 err = -ENOBUFS;
861 }
862 if (skb == NULL)
863 goto error;
864
865 /*
866 * Fill in the control structures
867 */
868 skb->ip_summed = csummode;
869 skb->csum = 0;
870 skb_reserve(skb, hh_len);
871
872 /*
873 * Find where to start putting bytes.
874 */
875 data = skb_put(skb, fraglen);
876 skb->nh.raw = data + exthdrlen;
877 data += fragheaderlen;
878 skb->h.raw = data + exthdrlen;
879
880 copy = datalen - transhdrlen;
881 if (copy > 0 && getfrag(from, data + transhdrlen, offset,
copy, 0, skb) < 0) {
882 err = -EFAULT;
883 kfree_skb(skb);
884 goto error;
885 }
886
887 offset += copy;
888 length -= datalen;
889 transhdrlen = 0;
890 exthdrlen = 0;
891 csummode = CHECKSUM_NONE;
892
893 /*
894 * Put the packet on the pending queue.
895 */
896 __skb_queue_tail(&sk->sk_write_queue, skb);
897 continue;
898 }
899
900 if (copy > length)
901 copy = length;
902
903 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
904 unsigned int off;
905
906 off = skb->len;
907 if (getfrag(from, skb_put(skb, copy),
908 offset, copy, off, skb) < 0) {
909 __skb_trim(skb, off);
910 err = -EFAULT;
911 goto error;
912 }
913 } else {
914 int i = skb_shinfo(skb)->nr_frags;
915 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
916 struct page *page = sk->sk_sndmsg_page;
917 int off = sk->sk_sndmsg_off;
918 unsigned int left;
919
920 if (page && (left = PAGE_SIZE - off) > 0) {
921 if (copy >= left)
922 copy = left;
923 if (page != frag->page) {
924 if (i == MAX_SKB_FRAGS) {
925 err = -EMSGSIZE;
926 goto error;
927 }
928 get_page(page);
929 skb_fill_page_desc(skb, i, page, sk-
>sk_sndmsg_off, 0);
930 frag = &skb_shinfo(skb)->frags[i];
931 }
932 } else if (i < MAX_SKB_FRAGS) {
933 if (copy > PAGE_SIZE)
934 copy = PAGE_SIZE;
935 page = alloc_pages(sk->sk_allocation, 0);
936 if (page == NULL) {
937 err = -ENOMEM;
938 goto error;
939 }
940 sk->sk_sndmsg_page = page;
941 sk->sk_sndmsg_off = 0;
942
943 skb_fill_page_desc(skb, i, page, 0, 0);
944 frag = &skb_shinfo(skb)->frags[i];
945 skb->truesize += PAGE_SIZE;
946 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
947 } else {
948 err = -EMSGSIZE;
949 goto error;
950 }
951 if (getfrag(from, page_address(frag->page)+frag-
>page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
952 err = -EFAULT;
953 goto error;
954 }
955 sk->sk_sndmsg_off += copy;
956 frag->size += copy;
957 skb->len += copy;
958 skb->data_len += copy;
959 }
960 offset += copy;
961 length -= copy;
962 }
963
964 return 0;
965
966 error:
967 inet->cork.length -= length;
968 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
969 return err;
970 }
971