TCP/IP

190 篇文章 1 订阅

TCP/IP impl

http://www.tech4cloud.com/tech/2012/11/09/TCPIP-note.html

协议策略和驱动进行分离

  • 为了搞清楚driver层和protocol层是怎么抽象分层和关联的
  • 如何在协议栈里进行流控和netfilter的如何工作

/*
 * Registration record binding EtherType ETH_P_IP (IPv4) to the IP
 * receive path: inbound IPv4 frames are handed to ip_rcv(), with the
 * GSO/GRO callbacks supplying segmentation/aggregation offload.
 */
static struct packet_type ip_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IP),
    .func = ip_rcv,                          /* entry point for inbound IPv4 frames */
    .gso_send_check = inet_gso_send_check,
    .gso_segment = inet_gso_segment,
    .gro_receive = inet_gro_receive,
    .gro_complete = inet_gro_complete,
};

gso: generic segmentation offload(通用分段卸载)

gro: generic receive offload

/* IP Hooks */
/*
 * Netfilter attachment points on the IPv4 path.  Each constant names
 * one hook point; NF_IP_NUMHOOKS is the count of hooks, not a hook.
 */
/* After promisc drops, checksum checks. */
#define NF_IP_PRE_ROUTING   0
/* If the packet is destined for this box. */
#define NF_IP_LOCAL_IN      1
/* If the packet is destined for another interface. */
#define NF_IP_FORWARD       2
/* Packets coming from a local process. */
#define NF_IP_LOCAL_OUT     3
/* Packets about to hit the wire. */
#define NF_IP_POST_ROUTING  4
/* Number of IPv4 hook points defined above. */
#define NF_IP_NUMHOOKS      5

Packet Send Flow


这里涉及到协议栈层最终是如何调用网卡硬件进行发包的

/*
 * dev_queue_xmit() — abridged excerpt: the core-network entry point
 * for transmitting an skb.  Picks a hardware TX queue, then either
 * hands the skb to that queue's qdisc (if one provides enqueue) or
 * drives the driver directly via dev_hard_start_xmit().
 */
dev_queue_xmit() {
    .....
    txq = dev_pick_tx(dev, skb);          /* choose the TX queue for this skb */
    q = rcu_dereference_bh(txq->qdisc);   /* qdisc attached to that queue */
    .....
    if (q->enqueue) {
        /* A real qdisc is attached: go through the traffic-control path. */
        rc = __dev_xmit_skb(skb, q, dev, txq);
        goto out;
    }
    .....
    if (!netif_xmit_stopped(txq)) {
        __this_cpu_inc(xmit_recursion);   /* track recursive transmit depth */
        rc = dev_hard_start_xmit(skb, dev, txq);
        __this_cpu_dec(xmit_recursion);
        if (dev_xmit_complete(rc)) {
            HARD_TX_UNLOCK(dev, txq);
            goto out;
        }
    }       
}

__dev_xmit_skb()发送的时候确认Qos的配置:

/*
 * Body excerpt of __dev_xmit_skb(): three outcomes for an skb meeting
 * a qdisc — drop (qdisc deactivated), lockless direct transmit
 * (TCQ_F_CAN_BYPASS fast path), or normal enqueue + qdisc run.
 */
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
    /* qdisc is being torn down: drop the packet. */
    kfree_skb(skb);
    rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
       qdisc_run_begin(q)) {
    /*
     * This is a work-conserving queue; there are no old skbs
     * waiting to be sent out; and the qdisc is not running -
     * xmit the skb directly.
     */
    if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
        skb_dst_force(skb);       /* keep the dst reference alive */

    qdisc_bstats_update(q, skb);  /* account bytes/packets on the qdisc */

    if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
        /* Direct xmit left work behind: drain the qdisc. */
        if (unlikely(contended)) {
            spin_unlock(&q->busylock);
            contended = false;
        }
        __qdisc_run(q);
    } else
        qdisc_run_end(q);

    rc = NET_XMIT_SUCCESS;
} else {
    /* Normal path: enqueue into the qdisc, then kick it if idle. */
    skb_dst_force(skb);
    rc = q->enqueue(skb, q) & NET_XMIT_MASK;
    if (qdisc_run_begin(q)) {
        if (unlikely(contended)) {
            spin_unlock(&q->busylock);
            contended = false;
        }
        __qdisc_run(q);
    }
}

实际上__qdisc_run()在配额耗尽或需要重新调度时,会通过__netif_schedule()触发NET_TX_SOFTIRQ软中断,把剩余的发包工作推迟到软中断上下文完成

/*
 * __qdisc_run() — abridged: dequeues/transmits until the quota is used
 * up or a reschedule is needed, then defers the remaining work via
 * __netif_schedule() (which raises the TX softirq, see below).
 */
void __qdisc_run(struct Qdisc *q){
    ......
    if (--quota <= 0 || need_resched()) {
        /* Out of budget: hand the rest off for later processing. */
        __netif_schedule(q);
        break;
    }
}

/*
 * __netif_reschedule(): append the qdisc to this CPU's softnet output
 * queue and raise NET_TX_SOFTIRQ so transmission continues in softirq
 * context; IRQs are disabled around the per-CPU list update.
 */
static inline void __netif_reschedule(struct Qdisc *q) {
    local_irq_save(flags);
    sd = &__get_cpu_var(softnet_data);    /* this CPU's softnet state */
    q->next_sched = NULL;
    *sd->output_queue_tailp = q;          /* append q at the tail */
    sd->output_queue_tailp = &q->next_sched;
    raise_softirq_irqoff(NET_TX_SOFTIRQ); /* schedule the TX softirq */
    local_irq_restore(flags);
}

或者在dev_hard_start_xmit(skb, dev, txq)

/*
 * dev_hard_start_xmit() — abridged: last software step before the
 * driver.  If GSO is needed and cannot be done by hardware, segment
 * the skb first; each skb is pushed into the driver through the
 * net_device_ops ndo_start_xmit() hook.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
        struct netdev_queue *txq) {
    if (netif_needs_gso(skb, features)) {
        if (unlikely(dev_gso_segment(skb, features)))
            goto out_kfree_skb;           /* software segmentation failed */
        if (skb->next)
            goto gso;                     /* skb became a segment list */
    }

    skb_len = skb->len;
    rc = ops->ndo_start_xmit(skb, dev);   /* driver transmit entry point */

gso:
    skb_len = nskb->len;
    rc = ops->ndo_start_xmit(nskb, dev);  /* transmit one GSO segment */
}

使用的是ndo_start_xmit()这个在dev初始化的时候被赋值,比如在ixgbe中:

/*
 * Driver-side binding for ixgbe: fills in net_device_ops so the core's
 * ops->ndo_start_xmit() call lands in ixgbe_xmit_frame().
 */
static const struct net_device_ops ixgbe_netdev_ops = {
    ......
    .ndo_start_xmit     = ixgbe_xmit_frame,  /* hardware transmit entry */
#ifdef CONFIG_NET_POLL_CONTROLLER
    .ndo_poll_controller    = ixgbe_netpoll, /* polled I/O (netpoll) path */
#endif
    ......
}

netfilter后怎么发包

/*
 * ip_finish_output() — abridged: fragment the packet if it exceeds the
 * dst MTU and is not a GSO skb; otherwise transmit directly via
 * ip_finish_output2().
 */
static int ip_finish_output(struct sk_buff *skb){
    ......
    if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
        return ip_fragment(skb, ip_finish_output2);  /* IP fragmentation needed */
    else
        return ip_finish_output2(skb);
}

/*
 * ip_finish_output2() — abridged: with a neighbour (L2) entry resolved
 * under RCU, hand the skb to neigh_output() for link-layer emission.
 */
static inline int ip_finish_output2(struct sk_buff *skb){
    if (neigh) {
        int res = neigh_output(neigh, skb);  /* emit via neighbour entry */

        rcu_read_unlock();
        return res;
    }
}

/*
 * neigh_output(): fast path uses the cached hardware header when the
 * neighbour is fully resolved (NUD_CONNECTED and a cached header is
 * present); otherwise fall back to the neighbour's own output method.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) {
    if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
        return neigh_hh_output(hh, skb);  /* cached L2 header fast path */
    else
        return n->output(n, skb);         /* slow path via neighbour op */
}

/*
 * neigh_hh_output() — abridged: prepend the cached link-layer header
 * and re-enter the device layer through dev_queue_xmit().
 */
static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) {
    ......
    skb_push(skb, hh_len);        /* prepend the cached L2 header */
    return dev_queue_xmit(skb);   /* hand off to the device TX path */
}

/*
 * ip_output(): run the NF_INET_POST_ROUTING netfilter hook, continuing
 * in ip_finish_output() on acceptance.  The hook is bypassed for
 * packets flagged IPSKB_REROUTED.
 */
int ip_output(struct sk_buff *skb) {
    return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
        ip_finish_output,
        !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * ip_mc_output(): multicast counterpart of ip_output() — same
 * POST_ROUTING hook and ip_finish_output() continuation, with the
 * device taken from skb->dev.
 */
int ip_mc_output(struct sk_buff *skb) {
    return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
        skb->dev, ip_finish_output,
        !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * __mkroute_input() — abridged: for a route created on the input path,
 * wire the dst callbacks: inbound packets go through ip_forward() and
 * transmission goes through ip_output().
 */
static int __mkroute_input(struct sk_buff *skb,
           const struct fib_result *res,
           struct in_device *in_dev,
           __be32 daddr, __be32 saddr, u32 tos,
           struct rtable **result) {
    rth->dst.input = ip_forward;   /* received packets: forwarding path */
    rth->dst.output = ip_output;   /* transmit: POST_ROUTING then xmit */
}

/*
 * __mkroute_output() — abridged: routes created on the output path
 * always transmit through ip_output(); when the destination is local
 * (RTCF_LOCAL) inbound delivery is wired to ip_local_deliver().
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
                   const struct flowi4 *fl4,
                   __be32 orig_daddr, __be32 orig_saddr,
                   int orig_oif, __u8 orig_rtos,
                   struct net_device *dev_out,
                   unsigned int flags) {
    rth->dst.output = ip_output;
    ......
    if (flags & RTCF_LOCAL) {
        rth->dst.input = ip_local_deliver;  /* deliver up the local stack */
        rth->rt_spec_dst = fl4->daddr;
    }
}

根据路由发送包

/*
 * ip_queue_xmit() — abridged: transport-layer entry into IP.  Reuses a
 * cached route when present, otherwise looks one up from the socket's
 * addresses/ports, then emits the packet through ip_local_out().
 */
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) {
    if (rt != NULL)
        goto packet_routed;       /* cached route: skip the lookup */

    /* If this fails, retransmit mechanism of transport layer will
     * keep trying until route appears or the connection times
     * itself out.
     */
    rt = ip_route_output_ports(sock_net(sk), fl4, sk,
                   daddr, inet->inet_saddr,
                   inet->inet_dport,
                   inet->inet_sport,
                   sk->sk_protocol,
                   RT_CONN_FLAGS(sk),
                   sk->sk_bound_dev_if);

packet_routed:
    res = ip_local_out(skb);      /* continue toward dst_output() */
}

/*
 * ip_local_out() — abridged: on success of the elided earlier step
 * (err == 1 — NOTE(review): presumably the LOCAL_OUT netfilter verdict,
 * confirm against the full source), send the packet via the route's
 * output callback through dst_output().
 */
int ip_local_out(struct sk_buff *skb) {
    ......
    if (likely(err == 1))
        err = dst_output(skb);    /* skb_dst(skb)->output(), e.g. ip_output */

    return err;
}

/*
 * raw_send_hdrinc() — abridged: raw-socket send path; runs the
 * NF_INET_LOCAL_OUT hook and continues into dst_output() on
 * acceptance.
 */
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
        void *from, size_t length,
        struct rtable **rtp,
        unsigned int flags) {
    ......
    err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
          rt->dst.dev, dst_output);     
}

/*
 * ip_push_pending_frames(): finalize the socket's pending data into a
 * single skb — so netfilter sees the whole, unfragmented packet — and
 * send it; returns 0 when nothing was pending.
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) {
    skb = ip_finish_skb(sk, fl4);
    if (!skb)
        return 0;                 /* no pending data to send */

    /* Netfilter gets whole the not fragmented skb. */
    return ip_send_skb(skb);
}


/*
 * dst_output(): indirect dispatch through the routing entry's output
 * callback (ip_output()/ip_mc_output(), wired up in __mkroute_*()).
 */
static inline int dst_output(struct sk_buff *skb) {
    return skb_dst(skb)->output(skb);
}

/*
 * TCP's IPv4 address-family operations: queue_xmit binds TCP's
 * transmit path to ip_queue_xmit().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
    .queue_xmit    = ip_queue_xmit,
    .send_check    = tcp_v4_send_check,
};

/*
 * tcp_transmit_skb() — abridged: hands the segment to the address-
 * family queue_xmit hook (ip_queue_xmit() for IPv4 via ipv4_specific).
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
            gfp_t gfp_mask) {
    err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
}

initialization flow

/*
 * inet_init() — abridged: IPv4 stack initialization; registers the TCP
 * protocol (tcp_prot) with the socket layer.
 */
static int __init inet_init(void) {
    ......
    rc = proto_register(&tcp_prot, 1);
    ......
}

/*
 * TCP's struct proto (excerpt): maps socket-layer operations —
 * connect, accept, close, ioctl, init — onto their TCP
 * implementations.
 */
struct proto tcp_prot = {
    .name           = "TCP",
    .owner          = THIS_MODULE,
    .close          = tcp_close,
    .connect        = tcp_v4_connect,
    .disconnect     = tcp_disconnect,
    .accept         = inet_csk_accept,
    .ioctl          = tcp_ioctl,
    .init           = tcp_v4_init_sock,
......
};

/*
 * tcp_v4_connect() — abridged: active-open entry; setup elided, then
 * starts the handshake via tcp_connect().
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) {
    ......
    err = tcp_connect(sk);
}

/*
 * tcp_connect() — abridged: transmit the initial SYN and arm the
 * retransmission timer so the SYN is repeated until answered or the
 * connection times out.
 */
int tcp_connect(struct sock *sk) {
    err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
    ......
    /* Timer for repeating the SYN until an answer. */
    inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

Packet Receive Flow


/*
 * ip_rcv(): packet_type handler for inbound IPv4 frames (registered in
 * ip_packet_type).  Runs the NF_INET_PRE_ROUTING hook, continuing in
 * ip_rcv_finish() on acceptance.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) {
    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
       ip_rcv_finish);
}

/*
 * ip_rcv_finish() — abridged: dispatch the packet through the dst
 * input callback (ip_local_deliver() or ip_forward()).
 */
static int ip_rcv_finish(struct sk_buff *skb) {
    return dst_input(skb);
}

/*Input packet from network to transport.*/
static inline int dst_input(struct sk_buff *skb) {
    return skb_dst(skb)->input(skb);
}

这里的input()在__mkroute_input()中被初始化成ip_forward(),在__mkroute_output()中(RTCF_LOCAL时)被初始化成ip_local_deliver()

ip_forward()做快转

/*
 * ip_forward() — abridged: forwarding path; run the NF_INET_FORWARD
 * hook, continuing in ip_forward_finish() on acceptance.
 */
int ip_forward(struct sk_buff *skb) {
    ......
    return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
        rt->dst.dev, ip_forward_finish);
}

/*
 * ip_forward_finish(): forwarded packets re-enter the output path via
 * the route's output callback (dst_output()).
 */
static int ip_forward_finish(struct sk_buff *skb) { 
    return dst_output(skb);
}

ip_local_deliver()传到上层

/*
 * ip_local_deliver(): packets addressed to this host; run the
 * NF_INET_LOCAL_IN hook, continuing in ip_local_deliver_finish().
 */
int ip_local_deliver(struct sk_buff *skb) {
    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
        ip_local_deliver_finish);
}

/*
 * ip_local_deliver_finish() — abridged: hand the packet up toward L4;
 * raw sockets are offered it via raw_local_deliver().  NOTE(review):
 * the resubmit label presumably re-runs protocol dispatch — the
 * surrounding code is elided in this excerpt.
 */
static int ip_local_deliver_finish(struct sk_buff *skb) {
    resubmit:
        raw = raw_local_deliver(skb, protocol);
}

/*
 * raw_local_deliver() — abridged: returns nonzero iff a raw socket for
 * this protocol matched (raw_sk is cleared when raw_v4_input() reports
 * no delivery).
 */
int raw_local_deliver(struct sk_buff *skb, int protocol) {
    if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
        raw_sk = NULL;

    return raw_sk != NULL;
}

/*
 * raw_v4_input(): body elided in the original note.  Per its use in
 * raw_local_deliver() above, it reports whether the packet was
 * delivered to matching raw sockets — consult the full kernel source
 * for details.
 */
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) {


}

  • ip数据包的校验
  • 防火墙的处理(也就是netfilter子系统)
  • 处理options(这里的options包含了一些可选的信息。比如时间戳或者源路由option).
  • 切包和组包(由于mtu的存在,因此我们需要切包和组包).
  • 接收,输出和转发操作。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值