TCP/IP implementation notes (Linux kernel network stack)
http://www.tech4cloud.com/tech/2012/11/09/TCPIP-note.html
协议策略和驱动进行分离
- 为了搞清楚driver层和protocol层是怎么抽象分层和关联的
- 如何在协议栈里进行流控和netfilter的如何工作
/* Protocol handler registration for IPv4 (ETH_P_IP): this is the glue
 * between the driver layer and the protocol layer.  ip_rcv() becomes the
 * entry point for every inbound IPv4 frame; the gso_*/gro_* callbacks
 * implement segmentation/receive offload for this protocol. */
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv, /* per-packet receive handler */
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};
gso: generic segmentation offload
gro: generic receive offload
/* IP Hooks: the five netfilter hook points an IPv4 packet can traverse. */
/* After promisc drops, checksum checks. */
#define NF_IP_PRE_ROUTING 0
/* If the packet is destined for this box. */
#define NF_IP_LOCAL_IN 1
/* If the packet is destined for another interface. */
#define NF_IP_FORWARD 2
/* Packets coming from a local process. */
#define NF_IP_LOCAL_OUT 3
/* Packets about to hit the wire. */
#define NF_IP_POST_ROUTING 4
/* Count of the hook points above, not a hook itself. */
#define NF_IP_NUMHOOKS 5
Packet Send Flow
这里涉及到协议栈层最终是如何调用网卡硬件进行发包的
/* Entry point from the protocol layer into the device layer for transmit.
 * Picks a tx queue, then either hands the skb to the qdisc (traffic
 * control) when one is attached (q->enqueue != NULL), or drives the
 * driver directly for queueless devices. */
dev_queue_xmit() {
.....
txq = dev_pick_tx(dev, skb); /* choose the hardware tx queue */
q = rcu_dereference_bh(txq->qdisc); /* its attached queueing discipline */
.....
if (q->enqueue) {
/* Qdisc present: go through traffic control in __dev_xmit_skb(). */
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
.....
/* No qdisc: transmit directly if the hw queue is not stopped. */
if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
__this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
}
__dev_xmit_skb()
发送的时候确认Qos的配置:
/* Body of __dev_xmit_skb(): three cases — qdisc deactivated (drop),
 * empty bypass-capable qdisc (transmit directly), or the normal
 * enqueue-then-run path. */
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
/* Qdisc has been shut down: drop the packet. */
kfree_skb(skb);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q); /* more packets appeared meanwhile: drain them */
} else
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
/* Normal path: enqueue into the qdisc, then run it. */
skb_dst_force(skb);
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
}
}
实际上__qdisc_run()在发送配额用完或需要调度时,
会转交给NET_TX软中断继续发包
/* Drains the qdisc.  When the transmit quota is exhausted or a
 * reschedule is pending, defer further transmission to the NET_TX
 * softirq via __netif_schedule() instead of hogging the CPU. */
void __qdisc_run(struct Qdisc *q){
......
if (--quota <= 0 || need_resched()) {
__netif_schedule(q); /* continue later in softirq context */
break;
}
}
/* Appends this qdisc to the per-CPU softnet output list and raises the
 * NET_TX softirq, which resumes transmission later. */
static inline void __netif_reschedule(struct Qdisc *q) {
local_irq_save(flags);
sd = &__get_cpu_var(softnet_data);
q->next_sched = NULL;
*sd->output_queue_tailp = q; /* append to per-CPU output queue */
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ); /* actual send happens in softirq */
local_irq_restore(flags);
}
或者直接调用dev_hard_start_xmit(skb, dev, txq):
/* Last software step before the driver: performs GSO segmentation in
 * software when needed, then calls the driver's ndo_start_xmit() for
 * the skb (or for each resulting segment on the gso path). */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq) {
if (netif_needs_gso(skb, features)) {
if (unlikely(dev_gso_segment(skb, features)))
goto out_kfree_skb;
if (skb->next) /* segmentation produced a chain */
goto gso;
}
skb_len = skb->len;
rc = ops->ndo_start_xmit(skb, dev); /* driver transmit hook */
gso:
skb_len = nskb->len;
rc = ops->ndo_start_xmit(nskb, dev); /* one call per segment */
}
使用的是ndo_start_xmit()
这个在dev初始化的时候被赋值,比如在ixgbe中:
/* Example: the ixgbe driver wires its transmit routine into
 * ndo_start_xmit when registering its device ops. */
static const struct net_device_ops ixgbe_netdev_ops = {
......
.ndo_start_xmit = ixgbe_xmit_frame,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = ixgbe_netpoll,
#endif
......
}
netfilter后怎么发包
/* After the POST_ROUTING netfilter hook: fragment if the packet
 * exceeds the route MTU and is not GSO, otherwise send as-is. */
static int ip_finish_output(struct sk_buff *skb){
......
if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
return ip_fragment(skb, ip_finish_output2);
else
return ip_finish_output2(skb);
}
/* Hands the packet to the neighbour (L2 resolution) layer via
 * neigh_output() under the RCU read lock. */
static inline int ip_finish_output2(struct sk_buff *skb){
if (neigh) {
int res = neigh_output(neigh, skb);
rcu_read_unlock();
return res;
}
}
/* If the neighbour is resolved (NUD_CONNECTED) and a cached hardware
 * header exists, take the fast path; otherwise fall back to the
 * neighbour's output callback. */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) {
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
return neigh_hh_output(hh, skb);
else
return n->output(n, skb);
}
/* Fast path: prepend the cached L2 header and re-enter the device
 * layer via dev_queue_xmit(). */
static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) {
......
skb_push(skb, hh_len);
return dev_queue_xmit(skb); /* back to the device-layer entry point */
}
/* Unicast output: run the NF_INET_POST_ROUTING netfilter hook, then
 * continue in ip_finish_output(), unless the skb was rerouted. */
int ip_output(struct sk_buff *skb) {
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
/* Multicast output: same hook and continuation as ip_output(). */
int ip_mc_output(struct sk_buff *skb) {
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
skb->dev, ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
/* Route setup for traffic arriving from the network: packets matching
 * this route are forwarded (dst.input = ip_forward) and leave via
 * ip_output(). */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result) {
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
}
/* Route setup for locally generated traffic: output always goes through
 * ip_output(); if the route targets this host (RTCF_LOCAL), inbound
 * packets are delivered locally via ip_local_deliver(). */
static struct rtable *__mkroute_output(const struct fib_result *res,
const struct flowi4 *fl4,
__be32 orig_daddr, __be32 orig_saddr,
int orig_oif, __u8 orig_rtos,
struct net_device *dev_out,
unsigned int flags) {
rth->dst.output = ip_output;
......
if (flags & RTCF_LOCAL) {
rth->dst.input = ip_local_deliver;
rth->rt_spec_dst = fl4->daddr;
}
}
根据路由发送包
/* Transport-layer (TCP) transmit entry: reuse the socket's cached
 * route if valid, otherwise look one up, then push the packet down
 * via ip_local_out(). */
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) {
if (rt != NULL)
goto packet_routed; /* cached route still valid */
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
rt = ip_route_output_ports(sock_net(sk), fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
RT_CONN_FLAGS(sk),
sk->sk_bound_dev_if);
packet_routed:
res = ip_local_out(skb);
}
/* On success (err == 1) continues into dst_output(), i.e. the route's
 * output callback.  NOTE(review): the elided part presumably runs the
 * NF_INET_LOCAL_OUT hook, which yields err — confirm against the
 * kernel source. */
int ip_local_out(struct sk_buff *skb) {
......
if (likely(err == 1))
err = dst_output(skb);
return err;
}
/* Raw-socket send with a user-supplied IP header: passes the skb
 * through the NF_INET_LOCAL_OUT netfilter hook with dst_output() as
 * the continuation. */
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
void *from, size_t length,
struct rtable **rtp,
unsigned int flags) {
......
err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
rt->dst.dev, dst_output);
}
/* Combines the socket's pending data into one skb and sends it, so
 * netfilter sees the whole (not yet fragmented) datagram. */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) {
skb = ip_finish_skb(sk, fl4);
if (!skb)
return 0; /* nothing pending */
/* Netfilter gets whole the not fragmented skb. */
return ip_send_skb(skb);
}
/* Indirection into the route: calls dst->output, which the __mkroute_*
 * functions above set to ip_output(). */
static inline int dst_output(struct sk_buff *skb) {
return skb_dst(skb)->output(skb);
}
/* TCP's IPv4 address-family operations: queue_xmit is how TCP hands
 * segments down to the IP layer. */
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
};
/* TCP segment transmit: dispatches through the af_ops table, i.e.
 * ip_queue_xmit() for IPv4. */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask) {
err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
}
initialization flow
/* Stack bring-up: registers the TCP protocol with the socket layer. */
static int __init inet_init(void) {
......
rc = proto_register(&tcp_prot, 1);
......
}
/* The TCP protocol operations table registered above; socket system
 * calls (connect, close, accept, ioctl, ...) dispatch through it. */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock, /* per-socket initialization */
......
};
/* connect(2) for TCP/IPv4: after the elided setup it starts the
 * three-way handshake via tcp_connect(). */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) {
......
err = tcp_connect(sk);
}
/* Sends the initial SYN and arms the retransmit timer so the SYN is
 * repeated until an answer arrives or the attempt times out. */
int tcp_connect(struct sock *sk) {
err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
......
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
Packet Receive Flow
/* IPv4 receive entry (registered in ip_packet_type above): runs the
 * NF_INET_PRE_ROUTING hook, then continues in ip_rcv_finish(). */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) {
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
}
/* After PRE_ROUTING: dispatch through the route's input callback via
 * dst_input() (route lookup itself is elided in this excerpt). */
static int ip_rcv_finish(struct sk_buff *skb) {
return dst_input(skb);
}
/* Input packet from network to transport: dispatch through the route's
 * input callback (ip_forward or ip_local_deliver, set by the
 * __mkroute_* functions above). */
static inline int dst_input(struct sk_buff *skb) {
return skb_dst(skb)->input(skb);
}
这里的input()在__mkroute_input()和__mkroute_output()中分别初始化成ip_forward()和ip_local_deliver()
ip_forward()做快转
/* Forwarding path: runs the NF_INET_FORWARD hook, then continues in
 * ip_forward_finish(). */
int ip_forward(struct sk_buff *skb) {
......
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
rt->dst.dev, ip_forward_finish);
}
/* Forwarded packets re-enter the output path via dst_output(). */
static int ip_forward_finish(struct sk_buff *skb) {
return dst_output(skb);
}
ip_local_deliver()传到上层
/* Local delivery: runs the NF_INET_LOCAL_IN hook, then hands the
 * packet to the transport layer in ip_local_deliver_finish(). */
int ip_local_deliver(struct sk_buff *skb) {
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
/* Demultiplexes to the upper protocol; raw sockets are offered the
 * packet first via raw_local_deliver(). */
static int ip_local_deliver_finish(struct sk_buff *skb) {
resubmit:
raw = raw_local_deliver(skb, protocol);
}
/* Delivers the packet to raw sockets bound to this protocol; returns
 * non-zero if at least one raw socket consumed it. */
int raw_local_deliver(struct sk_buff *skb, int protocol) {
if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
raw_sk = NULL;
return raw_sk != NULL;
}
/* Body elided in this excerpt. */
static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) {
}
- ip数据包的校验
- 防火墙的处理(也就是netfilter子系统)
- 处理options(这里的options包含了一些可选的信息。比如时间戳或者源路由option).
- 切包和组包(由于mtu的存在,因此我们需要切包和组包).
- 接收,输出和转发操作。