linux内核网络模块原理,linux网络模块的发展历程

2010/6/18:还在分析linux/net代码中,觉得用2.6的内核来分析代码流程的确很麻烦。原理的东西一般都很好理解,有的技术PPT可能几页就能讲得很清楚了。比如GRE,理论上就是圆环套圆环(记得好像是哪部电影里的说法?),可是分析2.6的代码就麻烦得很,还要理解tunnel。

所以突然想到,跟踪linux/net功能的发展历程来分析,应该还不错。

于是,先下载了0.96c的代码,发现这个版本的代码还只支持本地socket(AF_UNIX),还不算真正意义上的net。

找来找去,搞了一个0.99.15。还算完善些。

这个版本支持INET,也就是IPV4。所以有值得分析的部分。

先看看吧,有了心得再写下来。希望借这个版本能够把路由策略(route.c)和TCP协议(tcp.c)很好地掌握一下。

1. 路由策略

在ip报文的发送和转发过程中,都涉及到路由的问题。在ip_build_header和ip_forward中,都调用了rt_route()。rt_route()这个函数根据传入的目的ip,返回一个rtable结构。这个结构就是我们经常看到的路由表,route这个命令能够查看到。

/*
 * One entry of the routing table. Entries are chained through rt_next
 * into a singly linked list that rt_route() walks linearly.
 */
struct rtable {
	struct rtable *rt_next;// next entry; the table is a plain singly linked list with no hashing, the simplest possible implementation
	unsigned long rt_dst; // destination address
	unsigned long rt_mask; // destination netmask
	unsigned long rt_gateway;// gateway address
	unsigned char rt_flags; // route flags; presumably describe what kind of next hop this is -- confirm against the flag definitions
	unsigned char rt_metric;// metric; can be read as the cost of the route
	short rt_refcnt;// reference count
	unsigned long rt_use;// use counter; rt_route() increments it each time it returns this entry
	unsigned short rt_mss, rt_mtu;// NOTE(review): presumably per-route MSS and MTU; not used by the code in this excerpt
	struct device *rt_dev;// device (outgoing interface) this route sends through
};

再看一下rt_route的实现,简直是爽死了。就这么几行就搞定了2.6内核那一坨东西,不过功能肯定是不及:)

/*
 * Look up the route for destination address daddr.
 *
 * Walks the list that starts at rt_base and stops at the first entry
 * whose masked destination matches daddr, or whose device has
 * IFF_BROADCAST set and daddr as its broadcast address. If daddr equals
 * the address of the matched entry's device, rt_loopback is used instead.
 *
 * Returns the chosen entry with rt_use incremented, or NULL when
 * rt_loopback is needed but is NULL. opt is not used in this function.
 *
 * NOTE(review): early_out is not defined in this excerpt. Unless it
 * transfers control away when rt == NULL (e.g. to no_route), the loop
 * body and the check after the loop dereference a NULL rt once the list
 * is exhausted -- confirm against the full route.c.
 */
struct rtable * rt_route(unsigned long daddr, struct options *opt)
{
	struct rtable *rt;

	// Walk the routing table.
	for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next) {
		// Pick this route on an exact match or when daddr lies in the
		// entry's subnet: this is where the netmask is actually used.
		if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
			break;
		/* broadcast addresses can be special cases.. */
		// daddr is the broadcast address of this entry's device and the
		// device supports broadcast: the entry matches. Because of the
		// break, with several interfaces only the first matching entry
		// (and its interface) is ever selected.
		if ((rt->rt_dev->flags & IFF_BROADCAST) &&
		    rt->rt_dev->pa_brdaddr == daddr)
			break;
	}

	// daddr is the matched device's own address: route through
	// rt_loopback instead, and fail if no loopback route is set.
	if (daddr == rt->rt_dev->pa_addr) {
		if ((rt = rt_loopback) == NULL)
			goto no_route;
	}
	rt->rt_use++;// count this lookup against the entry
	return rt;
no_route:
	return NULL;
}

由此看来,早期内核对网络的支持非常简单,用来分析网络的实现也很容易。再看看路由表项是怎么添加进去的。rt_add()负责向路由表中添加路由表项,而调用rt_add的地方有两处:rt_ioctl,以及icmp.c中对ICMP路由重定向的处理。

也就是说,添加路由表项有两种方式:

a) inet_ioctl/rt_ioctl用来添加删除。

b) icmp协议的重定向报文。

在rt_add中,指定类型(flags),目的地址,掩码,目的网关地址,对应的物理接口等参数。

如此,只要花费半小时,就分析完了route.c,效率高啊:)

2. TCP协议

linux内核再简单,在TCP协议的实现上也简单不了,所以看一下tcp.c。linux的tcp实现参考了BSD Socket,这在文件头中有说明。

先看一下tcp_prot结构

/*
 * Protocol operations table for TCP (struct proto is declared elsewhere).
 * The initializer is positional, so the order of the entries must match
 * the member order of struct proto exactly.
 *
 * NOTE(review): the mapping of slots to member names is not visible in
 * this excerpt. Other code here calls prot->wmalloc, prot->build_header,
 * prot->queue_xmit and prot->accept and indexes prot->sock_array, which
 * presumably correspond to sock_wmalloc, ip_build_header, ip_queue_xmit,
 * tcp_accept and the {NULL,} entry -- confirm against the declaration.
 */
struct proto tcp_prot = {
	sock_wmalloc,
	sock_rmalloc,
	sock_wfree,
	sock_rfree,
	sock_rspace,
	sock_wspace,
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
	tcp_ioctl,
	NULL,
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,
	0,
	{NULL,},
	"TCP"
};

2010/6/21:稍微考虑了一下,一个完整的C/S通信过程,最好先看Server端的处理流程,再分析Client端的处理。

Server端流程一般都是这样的情况

a)socket

b)bind

c)listen

d)accept //这个过程中一般会fork一个进程处理新建立的链接。

e)send/recv

f)close

好,先看一下socket()都做了哪些事情,sock_register函数注册了inet相关操作,对应函数inet_create()。代码就不贴了,这个函数实现比较简单,就是为sock结构分配内存并初始化。

大概过程是socket(user space)->sys_socketcall(kernel space)->sock_socket()->inet_create(前提是指定inet通信)。

这里在sock_socket中为socket结构分配内存和属于进程的fd,在inet_create中为sock分配内存。

接下来看一下inet_bind

/*
 * Bind an INET socket to the local address and port supplied by the caller.
 *
 * sock      socket whose sock->data holds the struct sock to bind
 * uaddr     address buffer; checked with verify_area() and copied in
 *           with memcpy_fromfs()
 * addr_len  length of that buffer
 *
 * Returns 0 on success (and also, after a warning, when sock->data is
 * NULL), -EIO if the sock is not in TCP_CLOSE, -EINVAL if it already has
 * a port number, the verify_area() error, -EACCES for a port below
 * PROT_SOCK without suser(), -EADDRNOTAVAIL for a non-zero address that
 * chk_addr() does not report as IS_MYADDR, and -EADDRINUSE when the port
 * is already taken.
 */
static int
inet_bind(struct socket *sock, struct sockaddr *uaddr,
	  int addr_len)
{
	struct sockaddr_in addr;
	struct sock *sk, *sk2;
	unsigned short snum;
	int err;

	sk = (struct sock *) sock->data;
	if (sk == NULL) {
		printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
		return(0);
	}

	/* check this error. */
	if (sk->state != TCP_CLOSE) return(-EIO);
	if (sk->num != 0) return(-EINVAL);

	err=verify_area(VERIFY_READ, uaddr, addr_len);
	if(err)
		return err;

	// NOTE(review): only min(sizeof(addr), addr_len) bytes are copied, so a
	// short addr_len leaves part of addr uninitialized before sin_port and
	// sin_addr are read below.
	memcpy_fromfs(&addr, uaddr, min(sizeof(addr), addr_len));

	snum = ntohs(addr.sin_port);
	DPRINTF((DBG_INET, "bind sk =%X to port = %d\n", sk, snum));
	// NOTE(review): sk was already loaded from sock->data above; this reload
	// looks redundant as far as this excerpt shows.
	sk = (struct sock *) sock->data;

	/*
	 * We can't just leave the socket bound wherever it is, it might
	 * be bound to a privileged port. However, since there seems to
	 * be a bug here, we will leave it if the port is not privileged.
	 */
	if (snum == 0) { // no port requested: let get_new_socknum() pick one
		// NOTE(review): unlike inet_listen(), the result is not checked for 0 here.
		snum = get_new_socknum(sk->prot, 0);
	}
	if (snum < PROT_SOCK && !suser()) return(-EACCES);

	if (addr.sin_addr.s_addr!=0 && chk_addr(addr.sin_addr.s_addr)!=IS_MYADDR)
		return(-EADDRNOTAVAIL);    /* Source address MUST be ours! */

	if (chk_addr(addr.sin_addr.s_addr) || addr.sin_addr.s_addr == 0)
		sk->saddr = addr.sin_addr.s_addr;

	DPRINTF((DBG_INET, "sock_array[%d] = %X:\n", snum &(SOCK_ARRAY_SIZE -1),
		sk->prot->sock_array[snum &(SOCK_ARRAY_SIZE -1)]));

	/* Make sure we are allowed to bind here. */
	// The chain is scanned between cli() and sti(); every return inside
	// the loop calls sti() first.
	cli();
outside_loop:
	// sock_array is indexed by the local port masked with
	// SOCK_ARRAY_SIZE-1; walk that chain looking for socks that conflict
	// with this bind.
	for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
	    sk2 != NULL; sk2 = sk2->next) {
#if     1    /* should be below! */
		if (sk2->num != snum) continue;
/*    if (sk2->saddr != sk->saddr) continue; */
#endif
		if (sk2->dead) {
			// A dead sock still holds the port: destroy it and rescan
			// from the head (presumably because the chain was modified
			// by destroy_sock -- confirm).
			destroy_sock(sk2);
			goto outside_loop;
		}
		// Without reuse on the binding sock, any live sock on the same
		// port is a conflict; the address is not compared on this path.
		if (!sk->reuse) {
			sti();
			return(-EADDRINUSE);
		}
		// With the "#if 1" check above active, this port test can never
		// be true here.
		if (sk2->num != snum) continue;        /* more than one */
		if (sk2->saddr != sk->saddr) continue;    /* socket per slot ! -FB */
		if (!sk2->reuse) {
			sti();
			return(-EADDRINUSE);
		}
	}
	sti();

	// Re-register the sock under its new port number.
	remove_sock(sk);
	put_sock(snum, sk);
	// NOTE(review): ntohs() is applied to what looks like a host-order
	// port being stored into a header field; htons() would express the
	// intent (same byte swap) -- confirm.
	sk->dummy_th.source = ntohs(sk->num);
	sk->daddr = 0;
	sk->dummy_th.dest = 0;
	return(0);
}

再看listen

/*
 * Put an INET socket into the listening state.
 *
 * sock     socket whose sock->data holds the struct sock
 * backlog  stored in sk->max_ack_backlog
 *
 * If the sock has no port yet (sk->num == 0) one is obtained from
 * get_new_socknum() and the sock is registered with put_sock().
 *
 * Returns 0 on success (and also, after a warning, when sock->data is
 * NULL), or -EAGAIN when no port number could be obtained.
 */
static int
inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk;

	sk = (struct sock *) sock->data;
	if (sk == NULL) {
		printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
		return(0);
	}

	/* We may need to bind the socket. */
	if (sk->num == 0) {
		sk->num = get_new_socknum(sk->prot, 0);
		if (sk->num == 0) return(-EAGAIN);
		put_sock(sk->num, sk);
		sk->dummy_th.source = ntohs(sk->num);
	}

	/* We might as well re use these. */
	// max_ack_backlog caps how many connection requests (SYNs) may be
	// pending at once: tcp_conn_request() drops an incoming SYN when
	// ack_backlog >= max_ack_backlog, and tcp_accept() decrements
	// ack_backlog when it hands a connection to the user. This also acts
	// as a rudimentary limit against a flood of SYNs.
	sk->max_ack_backlog = backlog;
	if (sk->state != TCP_LISTEN) {
		sk->ack_backlog = 0;
		sk->state = TCP_LISTEN;// from now on the TCP_LISTEN case of tcp_rcv handles SYNs arriving for this sock
	}
	return(0);
}

现在可以看一下tcp_rcv(即tcp_prot中注册的接收函数)中对TCP_LISTEN状态的处理,以及它调用的tcp_conn_request

// Excerpt from the state switch inside tcp_rcv; the enclosing function and
// the remainder of the default case are not part of this excerpt.
case TCP_LISTEN: // a listening sock: only a SYN leads to further processing
	if (th->rst) {// a RST in this state is dropped without any reply
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}
	if (th->ack) {// an ACK in this state is answered with a reset (presumably left over from a connection that existed before a restart)
		tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	if (th->syn) {// the segment a listening sock is waiting for
#if 0
		if (opt->security != 0 || opt->compartment != 0) {
			tcp_reset(daddr, saddr, th, prot, opt,dev);
			release_sock(sk);
			return(0);
		}
#endif

		/*
		 * Now we just put the whole thing including
		 * the header and saddr, and protocol pointer
		 * into the buffer. We can't respond until the
		 * user tells us to accept the connection.
		 */
		tcp_conn_request(sk, skb, daddr, saddr, opt, dev);// handles the SYN; skb is not freed on this path here
		release_sock(sk);
		return(0);
	}

	// Neither RST, ACK nor SYN: drop the segment.
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);

default:// other states: the segment is dropped when tcp_sequence() rejects it
	if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

/*
 * Handle a SYN received on listening sock sk.
 *
 * sk     the listening sock
 * skb    the received segment (skb->h.th is its TCP header)
 * daddr  destination address of the segment; becomes newsk->saddr
 * saddr  source address of the segment; becomes newsk->daddr
 * opt    passed through to tcp_reset()
 * dev    device used for the MTU limit, routing and transmission
 *
 * On the normal path a new sock is allocated as a copy of sk, put into
 * TCP_SYN_RECV, registered with put_sock(), a SYN/ACK carrying one
 * 4-byte option is sent for it, and skb (with skb->sk pointing at the
 * new sock) is queued on sk->rqueue, where tcp_accept() later picks it
 * up. On every failure path skb is freed and nothing is queued.
 */
static void
tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	int tmp;

	DPRINTF((DBG_TCP, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
		" opt = %X, dev = %X)\n",
		sk, skb, daddr, saddr, opt, dev));

	th = skb->h.th;

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) {
		sk->data_ready(sk,0);
	} else {
		DPRINTF((DBG_TCP, "tcp_conn_request on dead socket\n"));
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more. This will prevent a
	 * flurry of syns from eating up all our memory.
	 */
	if (sk->ack_backlog >= sk->max_ack_backlog) {// backlog limit reached: drop the SYN
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */
	// Allocate the new sock; it starts out as a byte copy of the listener.
	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) {
		/* just ignore the syn. It will get retransmitted. */
		kfree_skb(skb, FREE_READ);
		return;
	}

	DPRINTF((DBG_TCP, "newsk = %X\n", newsk));
	memcpy((void *)newsk,(void *)sk, sizeof(*newsk));

	// Reset every per-connection field that must not be inherited from
	// the listener.
	newsk->wback = NULL;
	newsk->wfront = NULL;
	newsk->rqueue = NULL;
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	newsk->back_log = NULL;
	newsk->rtt = TCP_CONNECT_TIME << 3;
	newsk->rto = TCP_CONNECT_TIME;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->copied_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;// only newsk changes state; sk itself stays in LISTEN, so later segments for newsk are handled under its own state in tcp_rcv
	newsk->timeout = 0;
	newsk->send_seq = jiffies * SEQ_TICK - seq_offset;// initial send sequence number, derived from the jiffies clock (not random)
	newsk->window_seq = newsk->send_seq;
	newsk->rcv_ack_seq = newsk->send_seq;
	newsk->urg =0;
	newsk->retransmits = 0;
	newsk->destroy = 0;
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;// timer callback used for this sock
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;

	/* Swap these two, they are from our point of view. */
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	// newsk->num was copied from the listener by the memcpy above.
	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	// These two repeat assignments already made above with the same values.
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq;

	/* Grab the ttl and tos values and use them */
	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/* use 512 or whatever user asked for */
	/* note use of sk->user_mss, since user has no direct access to newsk */
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else {
		// Peer outside our (sub)net gets 576 - HEADER_SIZE, otherwise
		// MAX_WINDOW.
#ifdef SUBNETSARELOCAL
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}
	/* but not bigger than device MTU */
	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/* this will min with what arrived in the packet */
	tcp_options(newsk,skb->h.th);

	// Build the SYN/ACK reply.
	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) {
		sk->err = -ENOMEM;
		newsk->dead = 1;
		// NOTE(review): presumably release_sock() disposes of the dead
		// newsk -- confirm.
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		return;
	}

	buff->mem_addr = buff;
	buff->mem_len = MAX_SYN_SIZE;
	buff->len = sizeof(struct tcphdr)+4;
	buff->sk = newsk;

	t1 =(struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &dev,
				     IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/* Something went wrong. */
	if (tmp < 0) {
		sk->err = tmp;
		buff->free=1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		return;
	}

	// tmp is the number of bytes build_header() added in front of the
	// TCP header.
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	// Start from a copy of the received header, then overwrite the fields
	// that differ in the reply.
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->send_seq;

	/* Swap the send and the receive. */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->send_seq++);
	t1->ack = 1;// set the ACK flag
	newsk->window = tcp_select_window(newsk);/*newsk->prot->rspace(newsk);*/
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;// set the SYN flag
	t1->ack_seq = ntohl(skb->h.th->seq+1);
	t1->doff = sizeof(*t1)/4+1;// header length in 32-bit words, plus one word for the option below

	// One 4-byte option directly after the header: kind 2, length 4,
	// then newsk->mtu as a 16-bit big-endian value.
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	newsk->prot->queue_xmit(newsk, dev, buff, 0);// the SYN/ACK is handed to the transmit path here

	reset_timer(newsk, TIME_WRITE /* -1 ? FIXME ??? */, TCP_CONNECT_TIME);
	skb->sk = newsk;

	/* Charge the sock_buff to newsk. */
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;

	// Queue the received SYN on the listener's receive queue; tcp_accept()
	// later takes it off and returns skb->sk, i.e. newsk.
	skb_queue_tail(&sk->rqueue,skb);
	sk->ack_backlog++;
	release_sock(newsk);
}

这样,三次握手的前两步(收到SYN、回复SYN/ACK)就完成了,连接还要等收到对方的ACK后才真正建立。下面开始用户调用accept了

accept调用关系sock_accept->inet_accept->tcp_accept,先看一下tcp_accept

/*
 * This will accept the next outstanding connection.
 *
 * sk     the listening sock
 * flags  only O_NONBLOCK is examined
 *
 * Takes the first skb from sk via get_firstr(), sleeping on sk->sleep
 * until one is available unless O_NONBLOCK is set, and returns the sock
 * stored in skb->sk. The skb is freed and sk->ack_backlog is decremented.
 *
 * Returns NULL on failure with sk->err set to a positive errno value:
 * EINVAL if sk is not in TCP_LISTEN, EAGAIN if O_NONBLOCK is set and
 * nothing is pending, ERESTARTSYS if an unblocked signal is pending
 * after the sleep.
 */
static struct sock *
tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;

	DPRINTF((DBG_TCP, "tcp_accept(sk=%X, flags=%X, addr=%s)\n",
		sk, flags, in_ntoa(sk->saddr)));

	/*
	 * We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	if (sk->state != TCP_LISTEN) {
		sk->err = EINVAL;
		return(NULL);
	}

	/* avoid the race. */
	cli();
	sk->inuse = 1;

	// Wait for a queued segment. What arrives here is the SYN skb that
	// tcp_conn_request() queued on sk->rqueue; the sock it points to
	// (skb->sk) was put into TCP_SYN_RECV there. The result is handed
	// back to inet_accept().
	while((skb = get_firstr(sk)) == NULL) {
		if (flags & O_NONBLOCK) {
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) {
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/* Now all we need to do is return skb->sk. */
	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}

/*
 * INET-level accept: obtain a new sock from the protocol's accept
 * operation and attach it to newsock.
 *
 * sock     the listening socket
 * newsock  socket to receive the accepted connection; any sock already
 *          attached to it is freed first
 * flags    passed to prot->accept; O_NONBLOCK also skips the wait below
 *
 * Returns 0 on success (and also, after a warning, when sock->data is
 * NULL), -EOPNOTSUPP if the protocol has no accept operation, the
 * negated sk1->err when prot->accept returns NULL, -ERESTARTSYS when the
 * wait is interrupted by a signal (the new sock is parked in sk1->pair
 * and picked up by the next call), or the negated sk2->err when the new
 * sock ended up neither established nor error-free.
 */
static int
inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1, *sk2;
	int err;

	sk1 = (struct sock *) sock->data;
	if (sk1 == NULL) {
		printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
		return(0);
	}

	/*
	 * We've been passed an extra socket.
	 * We need to free it up because the tcp module creates
	 * it's own when it accepts one.
	 */
	if (newsock->data) kfree_s(newsock->data, sizeof(struct sock));
	newsock->data = NULL;

	if (sk1->prot->accept == NULL) return(-EOPNOTSUPP);

	/* Restore the state if we have been interrupted, and then returned. */
	if (sk1->pair != NULL ) {
		sk2 = sk1->pair;
		sk1->pair = NULL;
	} else {
		sk2 = sk1->prot->accept(sk1,flags);
		if (sk2 == NULL) {
			// prot->accept reports failure through a positive sk1->err;
			// anything else only triggers the warning and is still
			// negated and returned.
			if (sk1->err <= 0)
				printk("Warning sock.c:sk1->err <= 0. Returning non-error.\n");
			err=sk1->err;
			sk1->err=0;
			return(-err);
		}
	}
	newsock->data = (void *)sk2;
	sk2->sleep = newsock->wait;
	newsock->conn = NULL;
	// NOTE(review): the non-blocking path returns before newsock->state
	// is set to SS_CONNECTED below.
	if (flags & O_NONBLOCK) return(0);

	cli(); /* avoid the race. */
	while(sk2->state == TCP_SYN_RECV) {// the new sock was created in TCP_SYN_RECV, so unless its state has already changed we sleep here
		interruptible_sleep_on(sk2->sleep);
		if (current->signal & ~current->blocked) {
			sti();
			// Park the half-accepted sock so the next call can resume
			// with it.
			sk1->pair = sk2;
			sk2->sleep = NULL;
			newsock->data = NULL;
			return(-ERESTARTSYS);
		}
	}
	sti();

	// Per the article's reading of tcp_rcv (that part is not shown in
	// this excerpt): a sock in TCP_SYN_RECV moves to TCP_ESTABLISHED once
	// the peer's ACK arrives.
	if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) {
		err = -sk2->err;
		sk2->err=0;
		destroy_sock(sk2);
		newsock->data = NULL;
		return(err);
	}
	newsock->state = SS_CONNECTED;// the new socket is marked SS_CONNECTED: the TCP connection is set up and send/recv can proceed
	return(0);
}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值