Linux connect系统调用

注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4

1、函数原型

int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen); 

参数说明:

sockfd:套接字的文件描述符,socket()系统调用返回的fd

addr:指向存放地址信息的结构体的首地址

addrlen:存放地址信息的结构体的大小,其实也就是sizeof(struct sockaddr)

可以看出,connect()系统调用的入参和bind()系统调用入参是一致的。

2、内核实现

/*
 * connect() system call entry point.
 *
 * Resolves @fd to its struct socket, copies the user-space address into
 * kernel memory, runs the LSM permission hook, and then dispatches to
 * the protocol-specific connect handler (inet_stream_connect for TCP).
 */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
        int, addrlen)
{
    struct sockaddr_storage kaddr;
    struct socket *sock;
    int err, fput_needed;

    /* fd -> socket lookup; bind() and listen() start the same way */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        goto done;

    /* Copy the address structure from user space into kaddr */
    err = move_addr_to_kernel(uservaddr, addrlen, &kaddr);
    if (err < 0)
        goto put_sock;

    /* Security-module (LSM) check for this connect attempt */
    err = security_socket_connect(sock, (struct sockaddr *)&kaddr, addrlen);
    if (err)
        goto put_sock;

    /* Main work: for TCP, sock->ops->connect == inet_stream_connect */
    err = sock->ops->connect(sock, (struct sockaddr *)&kaddr, addrlen,
                 sock->file->f_flags);
put_sock:
    fput_light(sock->file, fput_needed);
done:
    return err;
}

和bind(),listen()一样,第一步肯定是通过fd获取到socket结构。接下来sock->ops指向的是inet_stream_ops,因此sock->ops->connect调用的就是inet_stream_connect()。inet_stream_connect()对__inet_stream_connect()做了一个简单的封装。

/*
 * Stream-socket connect: a thin wrapper that runs
 * __inet_stream_connect() with the socket lock held, so connect-state
 * transitions are serialized against other socket calls.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
            int addr_len, int flags)
{
    struct sock *sk = sock->sk;
    int ret;

    lock_sock(sk);
    ret = __inet_stream_connect(sock, uaddr, addr_len, flags);
    release_sock(sk);
    return ret;
}

/*
 * Core of connect() for stream (TCP) sockets; called with the socket
 * lock held by inet_stream_connect().  Validates the address, drives
 * the SS_* socket-layer state machine, invokes the protocol connect
 * (tcp_v4_connect for TCP) and, for blocking sockets, waits for the
 * handshake to complete.  (Body truncated in this excerpt.)
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
              int addr_len, int flags)
{
    struct sock *sk = sock->sk;
    int err;
    long timeo;

    if (addr_len < sizeof(uaddr->sa_family))
        return -EINVAL;

    /* connect(AF_UNSPEC) is the documented way to dissolve the association */
    if (uaddr->sa_family == AF_UNSPEC) {
        err = sk->sk_prot->disconnect(sk, flags);
        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
        goto out;
    }
    /* Dispatch on the current socket-layer state */
    switch (sock->state) {
    default:
        err = -EINVAL;
        goto out;
    case SS_CONNECTED:
        err = -EISCONN;
        goto out;
    case SS_CONNECTING:
        err = -EALREADY;
        /* Fall out of switch with err, set for this state */
        break;
    case SS_UNCONNECTED:    /* not yet connected: the normal first-connect path */
        err = -EISCONN;
        if (sk->sk_state != TCP_CLOSE)
            goto out;
        /* Protocol-level connect; for TCP this ends up in tcp_v4_connect() */
        err = sk->sk_prot->connect(sk, uaddr, addr_len);
        if (err < 0)
            goto out;

        sock->state = SS_CONNECTING;

        /* Just entered SS_CONNECTING state; the only
         * difference is that return value in non-blocking
         * case is EINPROGRESS, rather than EALREADY.
         */
        /* A non-blocking connect() returns this code to user space */
        err = -EINPROGRESS;
        break;
    }

    /* Timeout for a blocking connect (tunable via SO_SNDTIMEO).
     * When O_NONBLOCK is set this is 0, so we will not wait below. */
    timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
                tcp_sk(sk)->fastopen_req &&
                tcp_sk(sk)->fastopen_req->data ? 1 : 0;

        /* Error code is set above */
        /* Non-blocking: timeo == 0, return immediately with the error
         * set above; otherwise sleep in inet_wait_for_connect() until
         * the handshake finishes, a signal arrives, or we time out. */
        if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
            goto out;

        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            goto out;
    }
...
}

sk->sk_prot指向tcp_prot,因此sk->sk_prot->connect最终调用的就是tcp_v4_connect()。

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    __be16 orig_sport, orig_dport;
    __be32 daddr, nexthop;
    struct flowi4 *fl4;
    struct rtable *rt;
    int err;
    struct ip_options_rcu *inet_opt;
...
    nexthop = daddr = usin->sin_addr.s_addr;//赋值下一跳地址和目的地址,
    inet_opt = rcu_dereference_protected(inet->inet_opt,
                         sock_owned_by_user(sk));
    if (inet_opt && inet_opt->opt.srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet_opt->opt.faddr;
    }

    orig_sport = inet->inet_sport;//源地址
    orig_dport = usin->sin_port;//源端口
    fl4 = &inet->cork.fl.u.ip4;
    //根据当前信息,查找路由,并新建路由缓存
    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                  RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                  IPPROTO_TCP,
                  orig_sport, orig_dport, sk);
...
    if (!inet->inet_saddr)
        //如果socket没有绑定ip地址,使用路由查询返回的结果
        inet->inet_saddr = fl4->saddr;
    //inet_rcv_saddr表示的是本地绑定的ip地址,也就是源地址
    inet->inet_rcv_saddr = inet->inet_saddr;

    if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent       = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        if (likely(!tp->repair))
            tp->write_seq      = 0;
    }

    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
        tcp_fetch_timewait_stamp(sk, &rt->dst);

    inet->inet_dport = usin->sin_port;//目的端口
    inet->inet_daddr = daddr;//目的地址

    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

    tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

    tcp_set_state(sk, TCP_SYN_SENT);//socket进入SYN-SENT状态
    //绑定ip和端口号,并将sock加入哈希表中
    err = inet_hash_connect(&tcp_death_row, sk);
    if (err)
        goto failure;

    sk_set_txhash(sk);
    //使用新的端口号再次做路由查询,
    //因为如果客户端没有用bind()绑定IP地址和端口号,上面inet_hash_connect()
    //就会自动选择一个端口号,因此源端口会不一样
    rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                   inet->inet_sport, inet->inet_dport, sk);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        rt = NULL;
        goto failure;
    }
    /* OK, now commit destination to socket.  */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->dst);

    if (!tp->write_seq && likely(!tp->repair))
        //生成序列号
        tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                               inet->inet_daddr,
                               inet->inet_sport,
                               usin->sin_port);

    inet->inet_id = tp->write_seq ^ jiffies;

    //由socket层转入TCP层,构造SYN报文并发送
    err = tcp_connect(sk);
...
}

我们知道,一般情况客户端发起连接不会调用bind()绑定ip和端口号,这个动作交由系统自动处理,其实就是在调用connect()时,由inet_hash_connect()完成这一操作。

/*
 * Bind a port for a connect operation and hash it.
 */
/*
 * Pick a local port for an outgoing connection (when none is bound yet)
 * and insert the socket into the connection hash tables.  All real work
 * happens in __inet_hash_connect(); the two callbacks select the
 * conflict check and the hashing routine used for TCP.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
              struct sock *sk)
{
    return __inet_hash_connect(death_row, sk,
                   inet_sk_port_offset(sk),
                   __inet_check_established,
                   __inet_hash_nolisten);
}

/*
 * Select a local port for @sk if it is not already bound, verify the
 * choice does not clash with the bind, established or time-wait
 * entries, then insert @sk into the bind hash and (via @hash) the
 * established hash.  Returns 0 on success, -EADDRNOTAVAIL when no
 * usable port exists.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct sock *sk, u32 port_offset,
        int (*check_established)(struct inet_timewait_death_row *,
            struct sock *, __u16, struct inet_timewait_sock **),
        int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    const unsigned short snum = inet_sk(sk)->inet_num;
    struct inet_bind_hashbucket *head;
    struct inet_bind_bucket *tb;
    int ret;
    struct net *net = sock_net(sk);
    int twrefcnt = 1;

    if (!snum) {    /* no port bound yet: auto-select, much like bind(0) */
        int i, remaining, low, high, port;
        static u32 hint;
        u32 offset = hint + port_offset;
        struct inet_timewait_sock *tw = NULL;

        inet_get_local_port_range(net, &low, &high);    /* ephemeral port range */
        remaining = (high - low) + 1;

        /* By starting with offset being an even number,
         * we tend to leave about 50% of ports for other uses,
         * like bind(0).
         */
        offset &= ~1;

        for (i = 0; i < remaining; i++) {
            port = low + (i + offset) % remaining;
            if (inet_is_reserved_local_port(port))    /* skip reserved ports */
                continue;
            head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
            spin_lock_bh(&head->lock);

             /* Does this port collide with an entry in the bind hash? */
            inet_bind_bucket_for_each(tb, &head->chain) {
                if (net_eq(ib_net(tb), net) && tb->port == port) {
                    /* fastreuse/fastreuseport are >= 0 in most cases:
                     * 0 after init, > 0 when enabled via socket options.
                     * Only buckets created by connect() itself (set to
                     * -1 below) may be shared by another connect(); this
                     * avoids reusing ports taken via bind().  bind() may
                     * still take over such a -1 bucket, but once it does
                     * the flags become non-negative and connect() can no
                     * longer pick that port. */
                    if (tb->fastreuse >= 0 || tb->fastreuseport >= 0)
                        goto next_port;
                    WARN_ON(hlist_empty(&tb->owners));
                    /* No bind-hash conflict: still need to check the
                     * established / time-wait entries for this 4-tuple */
                    if (!check_established(death_row, sk, port, &tw))
                        goto ok;
                    goto next_port;
                }
            }
            /* Port is free: create an inet_bind_bucket recording it */
            tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                    net, head, port);
            if (!tb) {
                spin_unlock_bh(&head->lock);
                break;
            }
            /* -1 marks the bucket as connect()-allocated (see above) */
            tb->fastreuse = -1;
            tb->fastreuseport = -1;
            goto ok;

        next_port:
            spin_unlock_bh(&head->lock);
            cond_resched();
        }

        return -EADDRNOTAVAIL;

ok:
        hint += (i + 2) & ~1;

        /* Head lock still held and bh's disabled */
        inet_bind_hash(sk, tb, port);    /* insert into the bind hash */
        if (sk_unhashed(sk)) {
            /* record the chosen source port on the socket */
            inet_sk(sk)->inet_sport = htons(port);
            /* not yet in the established hash: add it there via
             * hash(), i.e. __inet_hash_nolisten() */
            twrefcnt += hash(sk, tw);
        }
        if (tw)
            /* unhash the displaced TIME_WAIT sock from the bind hash */
            twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
        spin_unlock(&head->lock);

        if (tw) {
            /* As in __inet_check_established(): fully retire the
             * TIME_WAIT sock (remove from bind and established hashes)
             * and drop every reference accumulated above. */
            inet_twsk_deschedule(tw, death_row);
            while (twrefcnt) {
                twrefcnt--;
                inet_twsk_put(tw);
            }
        }

        ret = 0;
        goto out;
    }
    /* The socket already has a bound port */
    head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
    tb  = inet_csk(sk)->icsk_bind_hash;
    spin_lock_bh(&head->lock);
    if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
        /* This socket is the port's only owner, so no conflict is
         * possible: insert straight into the established hash via
         * hash(), i.e. __inet_hash_nolisten(). */
        hash(sk, NULL);
        spin_unlock_bh(&head->lock);
        return 0;
    } else {
        spin_unlock(&head->lock);
        /* No definite answer... Walk to established hash table */
        /* Check for conflicts with established / time-wait sockets.
         * NOTE(review): reuse flags are not consulted on this path;
         * __inet_check_established() checks TIME_WAIT reusability but
         * not whether other owners of the port would allow reuse. */
        ret = check_established(death_row, sk, snum, NULL);
out:
        local_bh_enable();
        return ret;
    }
}

绑定端口的时候就和bind()系统调用相似了,要考虑系统选择的端口是否会有冲突,这其中涉及几条链表的冲突判断。第一条自然是bind哈希表,然后是established哈希表,在后续的内核版本中将time_wait独立为一个哈希表,因此也要检查这个time_wait哈希表;当前CentOS 7.4内核中它们仍然共用一个链表。这里就是通过__inet_check_established来判断是否和established或者time_wait连接冲突的。

/*
 * Check whether the 4-tuple (local addr/@lport, remote addr/port) of
 * @sk collides with an existing established or TIME_WAIT socket.  If
 * the only match is a reusable TIME_WAIT socket, take over its
 * identity and (via @twp) optionally hand it back to the caller.
 * Returns 0 on success, -EADDRNOTAVAIL on a genuine conflict.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
                    struct sock *sk, __u16 lport,
                    struct inet_timewait_sock **twp)
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    struct inet_sock *inet = inet_sk(sk);
    /* NOTE(review): daddr/saddr are deliberately swapped relative to
     * this socket's own fields, presumably so the hash/compare match
     * the orientation used when demuxing incoming packets -- confirm
     * against the other inet_ehashfn()/INET_MATCH callers. */
    __be32 daddr = inet->inet_rcv_saddr;
    __be32 saddr = inet->inet_daddr;
    int dif = sk->sk_bound_dev_if;
    INET_ADDR_COOKIE(acookie, saddr, daddr)
    const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
    struct net *net = sock_net(sk);
    unsigned int hash = inet_ehashfn(net, daddr, lport,
                     saddr, inet->inet_dport);
    /* established-hash bucket for this 4-tuple */
    struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
    spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
    struct sock *sk2;
    const struct hlist_nulls_node *node;
    struct inet_timewait_sock *tw = NULL;
    int twrefcnt = 0;

    spin_lock(lock);

    sk_nulls_for_each(sk2, node, &head->chain) {
        if (sk2->sk_hash != hash)
            continue;
        /* full 4-tuple (+netns, +device) comparison */
        if (likely(INET_MATCH(sk2, net, acookie,
                     saddr, daddr, ports, dif))) {
            /* Identical tuple: a TIME_WAIT entry may still be reusable */
            if (sk2->sk_state == TCP_TIME_WAIT) {
                tw = inet_twsk(sk2);
                /* may this TIME_WAIT connection be taken over? */
                if (twsk_unique(sk, sk2, twp))
                    break;
            }
            goto not_unique;
        }
    }

    /* Must record num and sport now. Otherwise we will see
     * in hash table socket with a funny identity.
     */
    inet->inet_num = lport;    /* bound port number (host order) */
    inet->inet_sport = htons(lport);    /* source port (network order) */
    sk->sk_hash = hash;
    WARN_ON(!sk_unhashed(sk));
    /* NOTE(review): the socket is inserted into the established hash
     * here, before the handshake completes -- presumably so incoming
     * replies (SYN|ACK) can find this sock; verify on the receive path. */
    __sk_nulls_add_node_rcu(sk, &head->chain);
    if (tw) {
        /* remove the displaced TIME_WAIT sock from this chain
         * (established and time-wait share one table in this kernel;
         * later kernels split them into separate tables) */
        twrefcnt = inet_twsk_unhash(tw);
        NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
    }
    spin_unlock(lock);
    if (twrefcnt)
        inet_twsk_put(tw);
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

    /* caller wants the TIME_WAIT sock back: hand it over */
    if (twp) {
        *twp = tw;
    } else if (tw) {
        /* caller does not want it: fully retire it here, removing it
         * from the bind and established hashes */
        /* Silly. Should hash-dance instead... */
        inet_twsk_deschedule(tw, death_row);

        inet_twsk_put(tw);
    }
    return 0;

not_unique:
    spin_unlock(lock);
    return -EADDRNOTAVAIL;
}

这其中time_wait状态的socket是比较特殊的,因为即使TCP的四元组都相同,如果设置了tcp_tw_reuse参数的话,是可以复用的。这里就是通过twsk_unique()检查。

/*
 * Ask the protocol (for TCP: tcp_twsk_unique) whether the new socket
 * @sk may take over the identity of the TIME_WAIT socket @sktw.
 * Returns 0 ("not unique", i.e. a conflict) when no hook is installed.
 */
static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
    int (*unique)(struct sock *, struct sock *, void *);

    unique = sk->sk_prot->twsk_prot->twsk_unique;
    return unique != NULL ? unique(sk, sktw, twp) : 0;
}

因为sk->sk_prot指向tcp_prot,因此sk->sk_prot->twsk_prot指向的是tcp_timewait_sock_ops

/* TIME_WAIT-socket hooks for TCP, reached via sk->sk_prot->twsk_prot. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
    .twsk_obj_size  = sizeof(struct tcp_timewait_sock),    /* allocation size */
    .twsk_unique    = tcp_twsk_unique,    /* TIME_WAIT reuse check for connect() */
    .twsk_destructor= tcp_twsk_destructor,
};

因此最终通过tcp_twsk_unique()判断该time_wait状态socket能不能被connect复用。

/*
 * Decide whether @sk (a new outgoing connection) may take over the
 * identity of @sktw, a TIME_WAIT socket occupying the same 4-tuple.
 *
 * Reuse requires that the old connection recorded TCP timestamps, and
 * either the caller does not need the TIME_WAIT sock returned
 * (twp == NULL), or tcp_tw_reuse is enabled and the most recent
 * timestamp on @sktw is more than one second old.
 *
 * On success the new socket inherits the old timestamp state, starts
 * its sequence numbers safely past the old connection's, a reference
 * on @sktw is taken, and 1 is returned; otherwise returns 0.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
    const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
    struct tcp_sock *tp = tcp_sk(sk);
    int reusable;

    reusable = tcptw->tw_ts_recent_stamp &&
           (twp == NULL ||
            (sysctl_tcp_tw_reuse &&
             get_seconds() - tcptw->tw_ts_recent_stamp > 1));
    if (!reusable)
        return 0;

    /* Start beyond the old connection's last sequence number; 0 is
     * reserved to mean "unset", so bump it to 1. */
    tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
    if (tp->write_seq == 0)
        tp->write_seq = 1;

    /* Carry over the timestamp state so PAWS keeps working */
    tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
    tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
    sock_hold(sktw);
    return 1;
}

绑定好ip和端口后,我们回到tcp_v4_connect()函数中。接下来就可以发送SYN报文了。这就是tcp_connect()干的事了。

/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *buff;
    int err;

    //初始化各种连接参数,比如初始RTO值为1HZ,即1s
    tcp_connect_init(sk);

    if (unlikely(tp->repair)) {
        tcp_finish_connect(sk, NULL);
        return 0;
    }

    //分配skb结构体
    buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
    if (unlikely(buff == NULL))
        return -ENOBUFS;

    /* Reserve space for headers. */
    skb_reserve(buff, MAX_TCP_HEADER);

    //初始化skb
    tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
    tp->retrans_stamp = tcp_time_stamp;//记录发包时间
    tcp_connect_queue_skb(sk, buff);
    TCP_ECN_send_syn(sk, buff);

    /* Send off SYN; include data in Fast Open. */
    //如果开启快速重传,调用tcp_send_syn_data()发送syn报文,否则走tcp_transmit_skb()
    err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
          tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
    if (err == -ECONNREFUSED)
        return err;

    /* We change tp->snd_nxt after the tcp_transmit_skb() call
     * in order to make this packet get counted in tcpOutSegs.
     */
    tp->snd_nxt = tp->write_seq;
    tp->pushed_seq = tp->write_seq;
    TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

    /* Timer for repeating the SYN until an answer. */
    //启动重传定时器
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
    return 0;
}

初始化连接的各种参数之后,分配一个skb_buff,用于承载SYN报文。这里我们先不考虑快速开启的情况,因此会调用tcp_transmit_skb()来接力对SYN报文的发送。

/*
 * Build the TCP header, options and checksum onto @skb and hand the
 * segment to the IP layer via icsk->icsk_af_ops->queue_xmit
 * (ip_queue_xmit for IPv4).  Used for every outgoing TCP segment,
 * including the initial SYN sent by tcp_connect().
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                gfp_t gfp_mask)
{
    const struct inet_connection_sock *icsk = inet_csk(sk);
    struct inet_sock *inet;
    struct tcp_sock *tp;
    struct tcp_skb_cb *tcb;
    struct tcp_out_options opts;
    unsigned int tcp_options_size, tcp_header_size;
    struct tcp_md5sig_key *md5;
    struct tcphdr *th;
    int err;

    BUG_ON(!skb || !tcp_skb_pcount(skb));

    if (clone_it) {
        skb_mstamp_get(&skb->skb_mstamp);

        /* clone (or copy if already cloned) so the original skb can
         * stay queued for possible retransmission */
        if (unlikely(skb_cloned(skb)))
            skb = pskb_copy(skb, gfp_mask);
        else
            skb = skb_clone(skb, gfp_mask);
        if (unlikely(!skb))
            return -ENOBUFS;
    }

    inet = inet_sk(sk);
    tp = tcp_sk(sk);
    tcb = TCP_SKB_CB(skb);
    memset(&opts, 0, sizeof(opts));

    if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
        /* build TCP options for a SYN segment */
        tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
    else
        /* build TCP options for an established connection */
        tcp_options_size = tcp_established_options(sk, skb, &opts,
                               &md5);
    tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

    if (tcp_packets_in_flight(tp) == 0)
        /* nothing from this connection in flight: first transmission,
         * notify the congestion-control module */
        tcp_ca_event(sk, CA_EVENT_TX_START);

    /* if no packet is in qdisc/device queue, then allow XPS to select
     * another queue.
     */
    skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;

    skb_push(skb, tcp_header_size);
    skb_reset_transport_header(skb);

    skb_orphan(skb);
    skb->sk = sk;
    skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
    skb_set_hash_from_sk(skb, sk);
    atomic_add(skb->truesize, &sk->sk_wmem_alloc);

    skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

    /* Build TCP header and checksum it. */
    th = tcp_hdr(skb);
    th->source      = inet->inet_sport;    /* source port */
    th->dest        = inet->inet_dport;    /* destination port */
    th->seq         = htonl(tcb->seq);     /* sequence number */
    th->ack_seq     = htonl(tp->rcv_nxt);  /* acknowledgment number */
    /* data offset (header length in 32-bit words) plus the flag bits */
    *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
                    tcb->tcp_flags);

    if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
        /* RFC1323: The window in SYN & SYN/ACK segments
         * is never scaled.
         */
        th->window  = htons(min(tp->rcv_wnd, 65535U));
    } else {
        th->window  = htons(tcp_select_window(sk));
    }
    th->check       = 0;
    th->urg_ptr     = 0;

    /* The urg_mode check is necessary during a below snd_una win probe */
    if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
        if (before(tp->snd_up, tcb->seq + 0x10000)) {
            th->urg_ptr = htons(tp->snd_up - tcb->seq);
            th->urg = 1;
        } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
            th->urg_ptr = htons(0xFFFF);
            th->urg = 1;
        }
    }
    /* write the TCP options right after the fixed header */
    tcp_options_write((__be32 *)(th + 1), tp, &opts);
    if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
        TCP_ECN_send(sk, skb, tcp_header_size);

#ifdef CONFIG_TCP_MD5SIG
    /* Calculate the MD5 hash, as we have all we need now */
    if (md5) {
        sk_nocaps_add(sk, NETIF_F_GSO_MASK);
        tp->af_specific->calc_md5_hash(opts.hash_location,
                           md5, sk, NULL, skb);
    }
#endif

    /* compute the checksum (tcp_v4_send_check for IPv4) */
    icsk->icsk_af_ops->send_check(sk, skb);

    if (likely(tcb->tcp_flags & TCPHDR_ACK))
        tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

    if (skb->len != tcp_header_size)
        tcp_event_data_sent(tp, sk);

    if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
        TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
                  tcp_skb_pcount(skb));

    tp->segs_out += tcp_skb_pcount(skb);
    /* Our usage of tstamp should remain private */
    skb->tstamp.tv64 = 0;
    /* hand the segment to the IP layer: queue_xmit == ip_queue_xmit */
    err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);

    if (likely(err <= 0))
        return err;

    tcp_enter_cwr(sk);

    return net_xmit_eval(err);
}

这里所做的就是构建TCP的头部信息,以及TCP选项等,这样TCP层的处理就完成了,调用icsk->icsk_af_ops->queue_xmit,发往ip层继续处理。icsk->icsk_af_ops指向ipv4_specific

/* IPv4 address-family operations for TCP connection sockets, reached
 * via icsk->icsk_af_ops.  Only part of the initializer is shown here. */
const struct inet_connection_sock_af_ops ipv4_specific = {
    .queue_xmit    = ip_queue_xmit,        /* hand segments to the IP layer */
    .send_check    = tcp_v4_send_check,    /* compute the TCP checksum */
    .rebuild_header    = inet_sk_rebuild_header,
    .sk_rx_dst_set     = inet_sk_rx_dst_set,
    .conn_request      = tcp_v4_conn_request,
    .syn_recv_sock     = tcp_v4_syn_recv_sock,
...
};

因此icsk->icsk_af_ops->queue_xmit最终是调用ip_queue_xmit()继续在IP层处理。这里就不再往下分析了,后面抽个时间把整个收发包流程梳理一遍。

但是connect的职责并没有就此结束,因为要等到三次握手的第二次握手成功后,connect才能算任务完成。这就涉及到SYN报文的接收以及SYN|ACK报文的发送,以及第三次握手的ACK的发送了。我们慢慢来。

阅读更多
版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u010039418/article/details/79971453
个人分类: Linux
想对作者说点什么? 我来说一句

linux系统调用linux系统调用

2011年08月02日 101KB 下载

没有更多推荐了,返回首页

不良信息举报

Linux connect系统调用

最多只允许输入30个字

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭