2010/6/18:还在分析linux/net代码中,觉得在2.6的内核用来分析代码流程的确很麻烦。原理的东西一般都很好理解,有的技术PPT可能几页就能讲的很清楚了。比如GRE,理论上就是圆环套圆环(记得好像是那部电影??),可是分析2.6的代码就麻烦的很。还要理解tunnel。
所以突然想到,跟踪linux/net功能的发展历程来分析,应该还不错。
于是,先下载了0.96c的代码,发现这个版本的代码还只支持本地socket(AF_UNIX),还不算真正意义上的net。
找来找去,搞了一个0.99.15。还算完善些。
这个版本支持INET,也就是IPV4。所以有值得分析的部分。
先看看吧,有了心得再写下来。希望借这个版本能够把路由策略(route.c)和TCP协议(tcp.c)很好地掌握一下。
1. 路由策略
在ip报文的发送和转发过程中,都涉及到路由的问题。在ip_build_header和ip_forward中,都调用了rt_route()。rt_route()这个函数根据传入的目的ip,返回一个rtable结构。这个结构就是我们经常看到的路由表,route这个命令能够查看到。
struct rtable {
struct rtable *rt_next;//next entry: the table is a plain singly linked list -- no hash, the simplest possible implementation
unsigned long rt_dst; //destination address
unsigned long rt_mask; //destination netmask
unsigned long rt_gateway;//gateway toward the destination
unsigned char rt_flags; //what kind of next hop this is
unsigned char rt_metric;//can be thought of as the route cost
short rt_refcnt;//reference count
unsigned long rt_use;//hit counter -- incremented on each successful lookup (see rt_route)
unsigned short rt_mss, rt_mtu;//per-route MSS and MTU -- TODO confirm exact semantics
struct device *rt_dev;//output device: which interface packets leave through
};
再看一下rt_route的实现,简直是爽死了。就这么几行就搞定了2.6内核那一坨东西,不过功能肯定是不及:)
/*
 * Look up the routing table entry for destination daddr.
 * Returns the matching rtable entry (with its hit counter bumped),
 * or NULL when no route exists.  opt is unused here.
 */
struct rtable * rt_route(unsigned long daddr, struct options *opt)
{
	struct rtable *rt;

	/* Walk the (singly linked) routing table. */
	for (rt = rt_base; rt != NULL; rt = rt->rt_next) {
		/* Exact match, or same subnet under the entry's mask:
		 * take this route.  This is what the netmask is for. */
		if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
			break;
		/* broadcast addresses can be special cases.. */
		/* Broadcast address of this entry's device also matches,
		 * provided the interface supports broadcast.  Note: with
		 * several broadcast-capable interfaces only the first
		 * matching entry is ever chosen, because we break here. */
		if ((rt->rt_dev->flags & IFF_BROADCAST) &&
		    rt->rt_dev->pa_brdaddr == daddr)
			break;
	}
	/* Fell off the end of the table: no route at all.
	 * (Without this check we would dereference a NULL rt below.) */
	if (rt == NULL)
		goto no_route;
	/* Destination is one of our own addresses: switch to the
	 * loopback route; fail if none is configured. */
	if (daddr == rt->rt_dev->pa_addr) {
		if ((rt = rt_loopback) == NULL)
			goto no_route;
	}
	rt->rt_use++;	/* hit counter */
	return rt;
no_route:
	return NULL;
}
由此看来,早期内核对网络的支持非常简单,用来分析网络的实现也很容易。再看看路由表项是怎么添加进去的。rt_add()负责向路由表添加路由表项,而调用rt_add的地方有rt_ioctl和icmp.c中对icmp路由重定向的支持。
也就是说,添加路由表项有两种方式:
a) inet_ioctl/rt_ioctl用来添加删除。
b) icmp协议的重定向报文。
在rt_add中,指定类型(flags),目的地址,掩码,目的网关地址,对应的物理接口等参数。
如此,只要花费半小时,就分析完了route.c,效率高啊:)
2. TCP协议
linux内核再简单,在TCP协议的实现上也简单不了,所以看一下tcp.c。linux的tcp实现参考了BSD Socket,这在文件头中有说明。
先看一下tcp_prot结构
/*
 * TCP's operations table: a positional initializer for struct proto.
 * Entry order follows the struct proto declaration in this kernel;
 * most entries are self-describing from the function names.
 * NOTE(review): field identities for the non-function entries are
 * inferred -- verify against the struct proto declaration.
 */
struct proto tcp_prot = {
sock_wmalloc,
sock_rmalloc,
sock_wfree,
sock_rfree,
sock_rspace,
sock_wspace,
tcp_close,
tcp_read,
tcp_write,
tcp_sendto,
tcp_recvfrom,
ip_build_header,
tcp_connect,
tcp_accept,
ip_queue_xmit,
tcp_retransmit,
tcp_write_wakeup,
tcp_read_wakeup,
tcp_rcv,
tcp_select,
tcp_ioctl,
NULL, /* presumably the init hook -- confirm */
tcp_shutdown,
tcp_setsockopt,
tcp_getsockopt,
128, /* presumably max_header -- confirm */
0, /* presumably the retransmit counter, starting at 0 */
{NULL,}, /* sock_array: per-port table of socks -- confirm */
"TCP" /* protocol name */
};
2010/6/21:稍微考虑了一下,一个完整的C/S通信过程,最好先看Server端的处理流程,再分析Client端的处理。
Server端流程一般都是这样的情况
a)socket
b)bind
c)listen
d)accept //这个过程中一般会fork一个进程处理新建立的链接。
e)send/recv
f)close
好,先看一下socket()都做了哪些事情,sock_register函数注册了inet相关操作,对应函数inet_create()。代码就不贴了,这个函数实现比较简单,就是为sock结构分配内存并初始化。
大概过程是socket(user space)->sys_socketcall(kernel space)->sock_socket()->inet_create(前提是指定inet通信)。
这里在sock_socket中为socket结构分配内存和属于进程的fd,在inet_create中为sock分配内存。
接下来看一下inet_bind
/*
 * Bind an INET socket to a local address and port.
 * Copies the user's sockaddr into kernel space, validates the port
 * (privileged ports need superuser) and the source address (must be
 * ours), scans the per-port chain in sock_array for conflicting
 * bindings, then installs the sock under the chosen port.
 * Returns 0 on success or a negative errno.
 */
static int
inet_bind(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in addr;
struct sock *sk, *sk2;
unsigned short snum;
int err;
sk = (struct sock *) sock->data;
if (sk == NULL) {
printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
return(0);
}
/* check this error. */
if (sk->state != TCP_CLOSE) return(-EIO);
if (sk->num != 0) return(-EINVAL);
err=verify_area(VERIFY_READ, uaddr, addr_len);
if(err)
return err;
memcpy_fromfs(&addr, uaddr, min(sizeof(addr), addr_len));
snum = ntohs(addr.sin_port);
DPRINTF((DBG_INET, "bind sk =%X to port = %d\n", sk, snum));
sk = (struct sock *) sock->data;
/*
 * We can't just leave the socket bound wherever it is, it might
 * be bound to a privileged port. However, since there seems to
 * be a bug here, we will leave it if the port is not privileged.
 */
if (snum == 0) { /* no port requested: have the system pick one */
snum = get_new_socknum(sk->prot, 0);
}
if (snum < PROT_SOCK && !suser()) return(-EACCES);
if (addr.sin_addr.s_addr!=0 && chk_addr(addr.sin_addr.s_addr)!=IS_MYADDR)
return(-EADDRNOTAVAIL); /* Source address MUST be ours! */
if (chk_addr(addr.sin_addr.s_addr) || addr.sin_addr.s_addr == 0)
sk->saddr = addr.sin_addr.s_addr;
DPRINTF((DBG_INET, "sock_array[%d] = %X:\n", snum &(SOCK_ARRAY_SIZE -1),
sk->prot->sock_array[snum &(SOCK_ARRAY_SIZE -1)]));
/* Make sure we are allowed to bind here. */
cli();
outside_loop:
/* sock_array hashes socks by local port; walk the chain for this
 * port looking for an existing sock that conflicts with this bind. */
for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
sk2 != NULL; sk2 = sk2->next) {
#if 1 /* should be below! */
if (sk2->num != snum) continue;
/* if (sk2->saddr != sk->saddr) continue; */
#endif
if (sk2->dead) {
destroy_sock(sk2);
goto outside_loop;
}
if (!sk->reuse) {
sti();
return(-EADDRINUSE);
}
if (sk2->num != snum) continue; /* more than one */
if (sk2->saddr != sk->saddr) continue; /* socket per slot ! -FB */
if (!sk2->reuse) {
sti();
return(-EADDRINUSE);
}
}
sti();
remove_sock(sk);
put_sock(snum, sk);
sk->dummy_th.source = ntohs(sk->num);
sk->daddr = 0;
sk->dummy_th.dest = 0;
return(0);
}
再看listen
/*
 * Put an INET socket into the listening state.
 * Binds the socket to an ephemeral port if it is not yet bound,
 * records the caller's backlog limit, and switches the sock to
 * TCP_LISTEN so incoming segments for its port are processed.
 * Returns 0 on success, -EAGAIN when no port is available.
 */
static int
inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = (struct sock *) sock->data;

	if (sk == NULL) {
		printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
		return(0);
	}

	/* We may need to bind the socket: pick a port if none chosen. */
	if (sk->num == 0) {
		if ((sk->num = get_new_socknum(sk->prot, 0)) == 0)
			return(-EAGAIN);
		put_sock(sk->num, sk);
		sk->dummy_th.source = ntohs(sk->num);
	}

	/* backlog caps how many SYNs may be queued awaiting accept();
	 * segments beyond that are simply dropped (see tcp_conn_request).
	 * A first line of defense against SYN floods, in effect. */
	sk->max_ack_backlog = backlog;

	if (sk->state != TCP_LISTEN) {
		sk->ack_backlog = 0;
		/* Once in TCP_LISTEN, the kernel will process TCP
		 * segments arriving for this port. */
		sk->state = TCP_LISTEN;
	}
	return(0);
}
现在可以看一下tcp_recv
/* Excerpt from the state switch inside tcp_rcv(). */
case TCP_LISTEN: /* SYN segments are only processed in LISTEN state */
if (th->rst) { /* a RST against a listener is simply dropped */
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
}
if (th->ack) { /* a bare ACK here (e.g. the server went down and back up): tell the peer its connection has been reset */
tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
}
if (th->syn) { /* this is the segment we actually expect */
#if 0
if (opt->security != 0 || opt->compartment != 0) {
tcp_reset(daddr, saddr, th, prot, opt,dev);
release_sock(sk);
return(0);
}
#endif
/*
 * Now we just put the whole thing including
 * the header and saddr, and protocol pointer
 * into the buffer. We can't respond until the
 * user tells us to accept the connection.
 */
tcp_conn_request(sk, skb, daddr, saddr, opt, dev); /* handles the SYN -- analyzed below */
release_sock(sk);
return(0);
}
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
default: /* any other state: drop out-of-sequence segments */
if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
kfree_skb(skb, FREE_READ);
release_sock(sk);
return(0);
}
/*
 * Handle an incoming SYN on a listening socket.
 * Clones the listening sock into a new sock in TCP_SYN_RECV state,
 * builds and transmits the SYN/ACK reply, and queues the received
 * SYN skb (now pointing at the new sock) on the listener's receive
 * queue so that accept() can later find the nascent connection.
 * Drops the SYN when the listener is dead, the backlog is full, or
 * memory runs out.
 */
static void
tcp_conn_request(struct sock *sk, struct sk_buff *skb,
unsigned long daddr, unsigned long saddr,
struct options *opt, struct device *dev)
{
struct sk_buff *buff;
struct tcphdr *t1;
unsigned char *ptr;
struct sock *newsk;
struct tcphdr *th;
int tmp;
DPRINTF((DBG_TCP, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
" opt = %X, dev = %X)\n",
sk, skb, daddr, saddr, opt, dev));
th = skb->h.th;
/* If the socket is dead, don't accept the connection. */
if (!sk->dead) {
sk->data_ready(sk,0);
} else {
DPRINTF((DBG_TCP, "tcp_conn_request on dead socket\n"));
tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
kfree_skb(skb, FREE_READ);
return;
}
/*
 * Make sure we can accept more. This will prevent a
 * flurry of syns from eating up all our memory.
 */
if (sk->ack_backlog >= sk->max_ack_backlog) { /* backlog full: drop the SYN */
kfree_skb(skb, FREE_READ);
return;
}
/*
 * We need to build a new sock struct.
 * It is sort of bad to have a socket without an inode attached
 * to it, but the wake_up's will just wake up the listening socket,
 * and if the listening socket is destroyed before this is taken
 * off of the queue, this will take care of it.
 */
/* Clone a new sock for the nascent connection. */
newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
if (newsk == NULL) {
/* just ignore the syn. It will get retransmitted. */
kfree_skb(skb, FREE_READ);
return;
}
DPRINTF((DBG_TCP, "newsk = %X\n", newsk));
memcpy((void *)newsk,(void *)sk, sizeof(*newsk));
newsk->wback = NULL;
newsk->wfront = NULL;
newsk->rqueue = NULL;
newsk->send_head = NULL;
newsk->send_tail = NULL;
newsk->back_log = NULL;
newsk->rtt = TCP_CONNECT_TIME << 3;
newsk->rto = TCP_CONNECT_TIME;
newsk->mdev = 0;
newsk->max_window = 0;
newsk->cong_window = 1;
newsk->cong_count = 0;
newsk->ssthresh = 0;
newsk->backoff = 0;
newsk->blog = 0;
newsk->intr = 0;
newsk->proc = 0;
newsk->done = 0;
newsk->partial = NULL;
newsk->pair = NULL;
newsk->wmem_alloc = 0;
newsk->rmem_alloc = 0;
newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
newsk->err = 0;
newsk->shutdown = 0;
newsk->ack_backlog = 0;
newsk->acked_seq = skb->h.th->seq+1;
newsk->fin_seq = skb->h.th->seq;
newsk->copied_seq = skb->h.th->seq;
newsk->state = TCP_SYN_RECV; /* note: this is newsk -- the original listening sk stays in LISTEN, so the next tcp_rcv pass can tell the two apart and advance the handshake */
newsk->timeout = 0;
newsk->send_seq = jiffies * SEQ_TICK - seq_offset; /* "random" initial send sequence number, derived from jiffies */
newsk->window_seq = newsk->send_seq;
newsk->rcv_ack_seq = newsk->send_seq;
newsk->urg =0;
newsk->retransmits = 0;
newsk->destroy = 0;
newsk->timer.data = (unsigned long)newsk;
newsk->timer.function = &net_timer; /* timer used by the protocol machinery */
newsk->dummy_th.source = skb->h.th->dest;
newsk->dummy_th.dest = skb->h.th->source;
/* Swap these two, they are from our point of view. */
newsk->daddr = saddr;
newsk->saddr = daddr;
put_sock(newsk->num,newsk);
newsk->dummy_th.res1 = 0;
newsk->dummy_th.doff = 6;
newsk->dummy_th.fin = 0;
newsk->dummy_th.syn = 0;
newsk->dummy_th.rst = 0;
newsk->dummy_th.psh = 0;
newsk->dummy_th.ack = 0;
newsk->dummy_th.urg = 0;
newsk->dummy_th.res2 = 0;
newsk->acked_seq = skb->h.th->seq + 1;
newsk->copied_seq = skb->h.th->seq;
/* Grab the ttl and tos values and use them */
newsk->ip_ttl=sk->ip_ttl;
newsk->ip_tos=skb->ip_hdr->tos;
/* use 512 or whatever user asked for */
/* note use of sk->user_mss, since user has no direct access to newsk */
if (sk->user_mss)
newsk->mtu = sk->user_mss;
else {
#ifdef SUBNETSARELOCAL
if ((saddr ^ daddr) & default_mask(saddr))
#else
if ((saddr ^ daddr) & dev->pa_mask)
#endif
newsk->mtu = 576 - HEADER_SIZE;
else
newsk->mtu = MAX_WINDOW;
}
/* but not bigger than device MTU */
newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);
/* this will min with what arrived in the packet */
tcp_options(newsk,skb->h.th);
/* Build the SYN/ACK reply. */
buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
if (buff == NULL) {
sk->err = -ENOMEM;
newsk->dead = 1;
release_sock(newsk);
kfree_skb(skb, FREE_READ);
return;
}
buff->mem_addr = buff;
buff->mem_len = MAX_SYN_SIZE;
buff->len = sizeof(struct tcphdr)+4;
buff->sk = newsk;
t1 =(struct tcphdr *) buff->data;
/* Put in the IP header and routing stuff. */
tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &dev,
IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
/* Something went wrong. */
if (tmp < 0) {
sk->err = tmp;
buff->free=1;
kfree_skb(buff,FREE_WRITE);
newsk->dead = 1;
release_sock(newsk);
skb->sk = sk;
kfree_skb(skb, FREE_READ);
return;
}
buff->len += tmp;
t1 =(struct tcphdr *)((char *)t1 +tmp);
memcpy(t1, skb->h.th, sizeof(*t1));
buff->h.seq = newsk->send_seq;
/* Swap the send and the receive. */
t1->dest = skb->h.th->source;
t1->source = newsk->dummy_th.source;
t1->seq = ntohl(newsk->send_seq++);
t1->ack = 1; /* set the ACK flag */
newsk->window = tcp_select_window(newsk);/*newsk->prot->rspace(newsk);*/
t1->window = ntohs(newsk->window);
t1->res1 = 0;
t1->res2 = 0;
t1->rst = 0;
t1->urg = 0;
t1->psh = 0;
t1->syn = 1; /* set the SYN flag */
t1->ack_seq = ntohl(skb->h.th->seq+1);
t1->doff = sizeof(*t1)/4+1;
ptr =(unsigned char *)(t1+1);
ptr[0] = 2;
ptr[1] = 4;
ptr[2] = ((newsk->mtu) >> 8) & 0xff;
ptr[3] =(newsk->mtu) & 0xff;
tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
newsk->prot->queue_xmit(newsk, dev, buff, 0); /* the SYN/ACK goes out here */
reset_timer(newsk, TIME_WRITE /* -1 ? FIXME ??? */, TCP_CONNECT_TIME);
skb->sk = newsk;
/* Charge the sock_buff to newsk. */
sk->rmem_alloc -= skb->mem_len;
newsk->rmem_alloc += skb->mem_len;
skb_queue_tail(&sk->rqueue,skb); /* also queue the received SYN on the listener's receive queue, so accept() can find newsk through it */
sk->ack_backlog++;
release_sock(newsk);
}
这样,服务端就完成了对SYN报文的处理并发出了SYN/ACK(三次握手的前两步),下面开始用户调用accept了
accept调用关系sock_accept->inet_accept->tcp_accept,先看一下tcp_accept
/* This will accept the next outstanding connection. */
/*
 * Pull the next pending connection off a listening sock.
 * Blocks (unless O_NONBLOCK) until a queued skb is available; that
 * skb is the SYN already handled by tcp_conn_request, and its
 * skb->sk points to the new sock for the connection.
 * Returns the new sock, or NULL with sk->err set.
 */
static struct sock *
tcp_accept(struct sock *sk, int flags)
{
struct sock *newsk;
struct sk_buff *skb;
DPRINTF((DBG_TCP, "tcp_accept(sk=%X, flags=%X, addr=%s)\n",
sk, flags, in_ntoa(sk->saddr)));
/*
 * We need to make sure that this socket is listening,
 * and that it has something pending.
 */
if (sk->state != TCP_LISTEN) {
sk->err = EINVAL;
return(NULL);
}
/* avoid the race. */
cli();
sk->inuse = 1;
/* Wait for a queued segment.  When one arrives it is the SYN we saw
 * in tcp_rcv/tcp_conn_request; by now its sock's state is already
 * TCP_SYN_RECV.  What happens next is in inet_accept. */
while((skb = get_firstr(sk)) == NULL) {
if (flags & O_NONBLOCK) {
sti();
release_sock(sk);
sk->err = EAGAIN;
return(NULL);
}
release_sock(sk);
interruptible_sleep_on(sk->sleep);
if (current->signal & ~current->blocked) {
sti();
sk->err = ERESTARTSYS;
return(NULL);
}
sk->inuse = 1;
}
sti();
/* Now all we need to do is return skb->sk. */
newsk = skb->sk;
kfree_skb(skb, FREE_READ);
sk->ack_backlog--;
release_sock(sk);
return(newsk);
}
/*
 * Socket-layer accept for INET sockets.
 * Delegates to the protocol's accept (tcp_accept for TCP) to obtain
 * the new sock, attaches it to newsock, then waits for the three-way
 * handshake to finish (state leaving TCP_SYN_RECV) before marking the
 * socket SS_CONNECTED.  Returns 0 or a negative errno.
 */
static int
inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
struct sock *sk1, *sk2;
int err;
sk1 = (struct sock *) sock->data;
if (sk1 == NULL) {
printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
return(0);
}
/*
 * We've been passed an extra socket.
 * We need to free it up because the tcp module creates
 * it's own when it accepts one.
 */
if (newsock->data) kfree_s(newsock->data, sizeof(struct sock));
newsock->data = NULL;
if (sk1->prot->accept == NULL) return(-EOPNOTSUPP);
/* Restore the state if we have been interrupted, and then returned. */
if (sk1->pair != NULL ) {
sk2 = sk1->pair;
sk1->pair = NULL;
} else {
sk2 = sk1->prot->accept(sk1,flags);
if (sk2 == NULL) {
if (sk1->err <= 0)
printk("Warning sock.c:sk1->err <= 0. Returning non-error.\n");
err=sk1->err;
sk1->err=0;
return(-err);
}
}
newsock->data = (void *)sk2;
sk2->sleep = newsock->wait;
newsock->conn = NULL;
if (flags & O_NONBLOCK) return(0);
cli(); /* avoid the race. */
while(sk2->state == TCP_SYN_RECV) { /* normally still true at this point, so sleep until the handshake completes */
interruptible_sleep_on(sk2->sleep);
if (current->signal & ~current->blocked) {
sti();
sk1->pair = sk2;
sk2->sleep = NULL;
newsock->data = NULL;
return(-ERESTARTSYS);
}
}
sti();
/* In tcp_rcv, a TCP_SYN_RECV sock moves to TCP_ESTABLISHED once the
 * final ACK arrives. */
if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) {
err = -sk2->err;
sk2->err=0;
destroy_sock(sk2);
newsock->data = NULL;
return(err);
}
newsock->state = SS_CONNECTED; /* connection fully established: the TCP link is now usable for send/recv */
return(0);
}