socket系统调用,用来创建一个套接字(后文我都称socket),若成功返回它的描述符。
/*
 * socket() creates an endpoint for communication and returns a descriptor.
 * The three arguments name the protocol domain, the socket type and the
 * protocol.
 */
int socket(int domain, int type, int protocol);
对于使用TCP协议的应用,我们常用 socket(AF_INET, SOCK_STREAM, 0) 创建一个TCP套接字。那么socket在内核中是如何表示的?下面的定义给出了它的真面目:
/*
 * Kernel structure per socket.
 * Contains send and receive buffer queues,
 * handle on protocol and pointer to protocol
 * private data and error information.
 */
struct socket {
	short	so_type;		/* generic type, see socket.h */
	short	so_options;		/* from socket call, see socket.h */
	short	so_linger;		/* time to linger while closing */
	short	so_state;		/* internal state flags SS_*, below */
	caddr_t	so_pcb;			/* protocol control block */
	struct	protosw *so_proto;	/* protocol handle */
/*
 * Variables for connection queueing.
 * Socket where accepts occur is so_head in all subsidiary sockets.
 * If so_head is 0, socket is not related to an accept.
 * For head socket so_q0 queues partially completed connections,
 * while so_q is a queue of connections ready to be accepted.
 * If a connection is aborted and it has so_head set, then
 * it has to be pulled out of either so_q0 or so_q.
 * We allow connections to queue up based on current queue lengths
 * and limit on number of queued connections for this socket.
 */
	struct	socket *so_head;	/* back pointer to accept socket */
	struct	socket *so_q0;		/* queue of partial connections */
	struct	socket *so_q;		/* queue of incoming connections */
	short	so_q0len;		/* partials on so_q0 */
	short	so_qlen;		/* number of connections on so_q */
	short	so_qlimit;		/* max number queued connections */
	short	so_timeo;		/* connection timeout */
	u_short	so_error;		/* error affecting connection */
	pid_t	so_pgid;		/* pgid for signals */
	u_long	so_oobmark;		/* chars to oob mark */
/*
 * Variables for socket buffering.
 */
	struct	sockbuf {
		u_long	sb_cc;		/* actual chars in buffer */
		u_long	sb_hiwat;	/* max actual char count */
		u_long	sb_mbcnt;	/* chars of mbufs used */
		u_long	sb_mbmax;	/* max chars of mbufs to use */
		long	sb_lowat;	/* low water mark */
		struct	mbuf *sb_mb;	/* the mbuf chain */
		struct	selinfo sb_sel;	/* process selecting read/write */
		short	sb_flags;	/* flags, see below */
		short	sb_timeo;	/* timeout for read/write */
	} so_rcv, so_snd;
#define	SB_MAX		(256*1024)	/* default for max chars in sockbuf */
#define	SB_LOCK		0x01		/* lock on data queue */
#define	SB_WANT		0x02		/* someone is waiting to lock */
#define	SB_WAIT		0x04		/* someone is waiting for data/space */
#define	SB_SEL		0x08		/* someone is selecting */
#define	SB_ASYNC	0x10		/* ASYNC I/O, need signals */
#define	SB_NOTIFY	(SB_WAIT|SB_SEL|SB_ASYNC)
#define	SB_NOINTR	0x40		/* operations not interruptible */
/*
 * Optional upcall used by sorwakeup(): when so_upcall is non-zero it is
 * invoked as (*so_upcall)(so, so_upcallarg, M_DONTWAIT) after the
 * receive-side wakeup.  These members must exist for that macro to
 * compile; a zeroed socket has no upcall.
 */
	void	(*so_upcall)();		/* receive upcall, 0 if none */
	caddr_t	so_upcallarg;		/* argument handed to so_upcall */
};
/*
 * Socket state bits.
 * Each SS_* value is a separate bit kept in the so_state member of
 * struct socket, so several of them can be set at the same time.
 */
#define SS_NOFDREF 0x001 /* no file table ref any more */
#define SS_ISCONNECTED 0x002 /* socket connected to a peer */
#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */
#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */
#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */
#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */
#define SS_RCVATMARK 0x040 /* at mark on input */
#define SS_PRIV 0x080 /* privileged for broadcast, raw... */
#define SS_NBIO 0x100 /* non-blocking ops */
#define SS_ASYNC 0x200 /* async i/o notify */
#define SS_ISCONFIRMING 0x400 /* deciding to accept connection req */
我们常说的socket在内核中就是使用socket结构体来表示。
so_pcb成员指向协议控制块,对于AF_INET(Internet)域的协议,它指向我们下文要介绍的inpcb结构。
so_proto成员指向协议处理结构,下文会介绍该结构。
对于服务器应用程序,在创建完一个socket后,需要先调用listen系统调用使其进入监听状态,再调用accept系统调用接收TCP连接请求,我们称服务器的这个socket为监听socket。每当TCP连接请求到来时都会创建一个新的socket,这个新的socket结构中的so_head成员指向监听的socket。监听socket的so_q0成员和so_q成员各是一个队列:so_q0保存着TCP三次握手尚未完成的所有socket,so_q保存着已完成三次握手、等待被accept取走的所有socket。
下图显示了有三个连接将被接受、一个连接已被建立的情况下的队列内容:
so_rcv和so_snd成员表示socket的接收缓冲区和发送缓冲区,它们用sockbuf结构体表示。其中sb_mb指向存储数据的mbuf,sb_cc表示实际的数据量。sb_lowat和sb_hiwat分别表示缓冲区的低水位标记和高水位标记。
以下是与socket相关的宏:
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax). Should detect
 * overflow and return 0. Should use "lmin" but it doesn't exist now.
 *
 * The value is the smaller of two headrooms: data bytes still allowed
 * (sb_hiwat - sb_cc) and mbuf storage still allowed (sb_mbmax - sb_mbcnt).
 * Each difference is cast to int before being handed to imin(), and the
 * result is widened to long.
 */
#define sbspace(sb) \
((long) imin((int)((sb)->sb_hiwat - (sb)->sb_cc), \
(int)((sb)->sb_mbmax - (sb)->sb_mbcnt)))
/*
 * Is there something to read from so?  True when the receive buffer holds
 * at least sb_lowat bytes, when SS_CANTRCVMORE is set, when connections
 * are queued on so_q (so_qlen non-zero), or when an error is pending.
 */
#define	soreadable(so) \
	(((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat) || \
	 (((so)->so_state & SS_CANTRCVMORE) != 0) || \
	 ((so)->so_qlen != 0) || \
	 ((so)->so_error != 0))
/*
 * Can we write something to so?  True when the send buffer has at least
 * sb_lowat bytes of space AND the socket is either connected or uses a
 * protocol without PR_CONNREQUIRED; independently of that, also true
 * when SS_CANTSENDMORE is set or an error is pending.
 */
#define	sowriteable(so) \
	(((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat) && \
	  ((((so)->so_state & SS_ISCONNECTED) != 0) || \
	   (((so)->so_proto->pr_flags & PR_CONNREQUIRED) == 0))) || \
	 (((so)->so_state & SS_CANTSENDMORE) != 0) || \
	 ((so)->so_error != 0))
/*
 * Charge mbuf m to sockbuf sb: MSIZE, plus the external buffer size when
 * M_EXT is set, goes onto sb_mbcnt, and the data length m_len goes onto
 * sb_cc.
 */
#define	sballoc(sb, m) { \
	(sb)->sb_mbcnt += MSIZE; \
	if (((m)->m_flags & M_EXT) != 0) \
		(sb)->sb_mbcnt += (m)->m_ext.ext_size; \
	(sb)->sb_cc += (m)->m_len; \
}
/*
 * Credit mbuf m back to sockbuf sb: the inverse of sballoc().  MSIZE,
 * plus the external buffer size when M_EXT is set, comes off sb_mbcnt,
 * and the data length m_len comes off sb_cc.
 */
#define	sbfree(sb, m) { \
	(sb)->sb_mbcnt -= MSIZE; \
	if (((m)->m_flags & M_EXT) != 0) \
		(sb)->sb_mbcnt -= (m)->m_ext.ext_size; \
	(sb)->sb_cc -= (m)->m_len; \
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 *
 * Evaluates to 0 when the lock was free and has now been taken.  When the
 * lock is already held, the value is whatever sb_lock(sb) returns if wf
 * is M_WAITOK, and EWOULDBLOCK otherwise.
 *
 * The ", 0" has to stay inside the parentheses of the last arm.  The
 * comma operator binds more loosely than ?:, so with the ", 0" outside
 * it would discard the result of the whole conditional and the macro
 * would evaluate to 0 on every path, hiding both error returns.
 */
#define	sblock(sb, wf) ((sb)->sb_flags & SB_LOCK ? \
		(((wf) == M_WAITOK) ? sb_lock(sb) : EWOULDBLOCK) : \
		((sb)->sb_flags |= SB_LOCK, 0))
/*
 * Release lock on sockbuf sb.
 * Clears SB_LOCK; if SB_WANT is set (someone is waiting to lock), clears
 * that flag too and calls wakeup() on the address of sb_flags.
 *
 * NOTE(review): the expansion is a bare { } block, not do { } while (0),
 * so "if (x) sbunlock(sb); else ..." would not parse -- confirm no caller
 * needs that form.
 */
#define sbunlock(sb) { \
(sb)->sb_flags &= ~SB_LOCK; \
if ((sb)->sb_flags & SB_WANT) { \
(sb)->sb_flags &= ~SB_WANT; \
wakeup((caddr_t)&(sb)->sb_flags); \
} \
}
/*
 * Wake up whoever is waiting on the socket's receive buffer: call
 * sowakeup() on so_rcv and then, if so_upcall is set, call it with the
 * socket, so_upcallarg and M_DONTWAIT.
 */
#define sorwakeup(so) { sowakeup((so), &(so)->so_rcv); \
if ((so)->so_upcall) \
(*((so)->so_upcall))((so), (so)->so_upcallarg, M_DONTWAIT); \
}
/*
 * Wake up whoever is waiting on the socket's send buffer by calling
 * sowakeup() on so_snd.
 */
#define	sowwakeup(so)	sowakeup((so), &(so)->so_snd)
sbspace:返回缓冲区可用空间大小。
soreadable:该socket是否可读?
sowriteable:该socket是否可写?
sblock:给缓冲区加锁。
sbunlock:给缓冲区解锁。
对每个系统调用,BSD内核都定义了一个包含所有参数的结构体。socket系统调用的代码如下:
/* Arguments of the socket system call, gathered into one structure. */
struct socket_args {
int domain; /* protocol domain */
int type; /* socket type */
int protocol; /* protocol */
};
socket(p, uap, retval)
struct proc *p;
register struct socket_args *uap;
int *retval;
{
struct filedesc *fdp = p->p_fd;
struct socket *so;
struct file *fp;
int fd, error;
if (error = falloc(p, &fp, &fd)) /*分配一个file结构体和描述符*/
return (error);
fp->f_flag = FREAD|FWRITE;
fp->f_type = DTYPE_SOCKET;
fp->f_ops = &socketops; /*设置文件操作结构体*/
if (error = socreate(uap->domain, &so, uap->type, uap->protocol)) { /*调用socreate完成剩下的工作*/
fdp->fd_ofiles[fd] = 0;
ffree(fp);
} else {
fp->f_data = (caddr_t)so;
*retval = fd; /*返回描述符*/
}
return (error);
}
套接字对应的文件操作结构体是socketops变量,它的定义如下:
/*
 * File operations used for sockets; socket() stores the address of this
 * structure in the f_ops member of the new file structure.
 */
struct fileops socketops =
{ soo_read, soo_write, soo_ioctl, soo_select, soo_close };
当调用read系统调用在socket上读取数据时,内核调用soo_read函数。
当调用write系统调用往socket上写入数据时,内核调用soo_write函数。
当调用close系统调用关闭socket时,内核调用soo_close函数。
这三个函数的实现我在后续文章中讲述。
完成创建socket的工作的函数是socreate,它的代码如下:
/*
 * Create a socket of the requested type in protocol domain dom.
 *
 *	dom	protocol domain
 *	aso	receives the new socket on success
 *	type	socket type
 *	proto	protocol; when 0 the protocol is chosen by type alone
 *
 * Returns 0 on success; EPROTONOSUPPORT when no protocol switch entry
 * with a user-request routine is found; EPROTOTYPE when the entry found
 * has a different type; otherwise the error from the protocol's
 * PRU_ATTACH request.
 */
int
socreate(dom, aso, type, proto)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
{
	struct proc *p = curproc;		/* XXX */
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/* Look up the protosw entry that matches the caller's arguments. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);

	/* Allocate the socket and start from an all-zero structure. */
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	so->so_type = type;
	if (p->p_ucred->cr_uid == 0)
		so->so_state = SS_PRIV;
	so->so_proto = prp;			/* protocol handle */

	/*
	 * Ask the protocol to attach itself to the socket.  proto travels
	 * in a pointer-typed argument; it is widened through long first so
	 * the int is not cast directly to a pointer of a different size.
	 */
	error =
	    (*prp->pr_usrreq)(so, PRU_ATTACH,
		(struct mbuf *)0, (struct mbuf *)(long)proto, (struct mbuf *)0);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;				/* hand back the new socket */
	return (0);
}
它首先根据socket系统调用的参数查找一个protosw结构,这是个表示协议处理的结构。对于AF_INET域协议,inetsw全局数组表示了这个协议域中的所有协议。
protosw结构的定义和inetsw的定义如下:
/*
 * Protocol switch table.
 *
 * Each protocol has a handle initializing one of these structures,
 * which is used for protocol-protocol and system-protocol communication.
 *
 * A protocol is called through the pr_init entry before any other.
 * Thereafter it is called every 200ms through the pr_fasttimo entry and
 * every 500ms through the pr_slowtimo for timer based actions.
 * The system will call the pr_drain entry if it is low on space and
 * this should throw away any non-critical data.
 *
 * Protocols pass data between themselves as chains of mbufs using
 * the pr_input and pr_output hooks. Pr_input passes data up (towards
 * UNIX) and pr_output passes it down (towards the imps); control
 * information passes up and down on pr_ctlinput and pr_ctloutput.
 * The protocol is responsible for the space occupied by any of the
 * arguments to these entries and must dispose of it.
 *
 * The userreq routine interfaces protocols to the system and is
 * described below.
 */
struct protosw {
short pr_type; /* socket type used for */
struct domain *pr_domain; /* domain protocol a member of */
short pr_protocol; /* protocol number */
short pr_flags; /* see below */
/* protocol-protocol hooks */
void (*pr_input)(); /* input to protocol (from below) */
int (*pr_output)(); /* output to protocol (from above) */
void (*pr_ctlinput)(); /* control input (from below) */
int (*pr_ctloutput)(); /* control output (from above) */
/* user-protocol hook */
int (*pr_usrreq)(); /* user request: see list below */
/* utility hooks */
void (*pr_init)(); /* initialization hook */
void (*pr_fasttimo)(); /* fast timeout (200ms) */
void (*pr_slowtimo)(); /* slow timeout (500ms) */
void (*pr_drain)(); /* flush any excess space possible */
int (*pr_sysctl)(); /* sysctl for protocol */
};
/*
 * Protocol switch entries of the Internet domain; every entry points back
 * at inetdomain.  The initializers are positional and follow the member
 * order of struct protosw: type, domain, protocol, flags, then the
 * input/output/ctlinput/ctloutput hooks, the user-request routine, and
 * the init/fasttimo/slowtimo/drain/sysctl hooks.  Members left out at the
 * end of an entry are zero.
 */
struct protosw inetsw[] = {
{ 0, &inetdomain, 0, 0,
0, ip_output, 0, 0,
0,
ip_init, 0, ip_slowtimo, ip_drain, ip_sysctl
},
{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, /* UDP */
udp_input, 0, udp_ctlinput, ip_ctloutput,
udp_usrreq,
udp_init, 0, 0, 0, udp_sysctl
},
{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD /* TCP */,
tcp_input, 0, tcp_ctlinput, tcp_ctloutput,
tcp_usrreq,
tcp_init, tcp_fasttimo, tcp_slowtimo, tcp_drain,
},
{ SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, /* raw IP */
rip_input, rip_output, 0, rip_ctloutput,
rip_usrreq,
0, 0, 0, 0,
},
{ SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR, /* ICMP */
icmp_input, rip_output, 0, rip_ctloutput,
rip_usrreq,
0, 0, 0, 0, icmp_sysctl
},
{ SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR, /* IGMP */
igmp_input, rip_output, 0, rip_ctloutput,
rip_usrreq,
igmp_init, igmp_fasttimo, 0, 0,
},
/* raw wildcard */
{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR,
rip_input, rip_output, 0, rip_ctloutput,
rip_usrreq,
rip_init, 0, 0, 0,
},
};
pr_domain成员指向表示AF_INET协议域的domain结构(inetdomain变量)。
pr_input成员是协议处理底层协议输入请求的函数。对于TCP协议,它指向tcp_input函数。
pr_output成员是协议处理上层协议输出请求的函数。对于TCP协议它为空。实际上TCP协议的输出函数是tcp_output函数,但这个函数都是直接调用,因此这个函数指针为空。
pr_usrreq成员是协议的用户请求函数。对于TCP协议它是tcp_usrreq函数。
pr_fasttimo成员指向的函数tcp_fasttimo每200ms被调用一次。pr_slowtimo成员指向的函数tcp_slowtimo每500ms被调用一次。它们用来实现TCP的定时器。
domain结构体和全局变量inetdomain的定义如下:
/*
 * One protocol domain (address family): its identity, optional hooks, the
 * protosw entries that belong to it, and routing-table parameters.
 */
struct domain {
int dom_family; /* AF_xxx */
char *dom_name; /* printable name, e.g. "internet" */
void (*dom_init) /* initialize domain data structures */
__P((void));
int (*dom_externalize) /* externalize access rights */
__P((struct mbuf *));
int (*dom_dispose) /* dispose of internalized rights */
__P((struct mbuf *));
/* first protosw entry of the domain, and one past its last entry */
struct protosw *dom_protosw, *dom_protoswNPROTOSW;
struct domain *dom_next; /* presumably the next domain in a list -- TODO confirm */
int (*dom_rtattach) /* initialize routing table */
__P((void **, int));
int dom_rtoffset; /* an arg to rtattach, in bits */
int dom_maxrtkey; /* for routing layer */
};
/*
 * The Internet domain.  Positional initializers in struct domain order:
 * family AF_INET, name "internet", no init/externalize/dispose hooks, a
 * protosw range that spans all of inetsw[], no next domain, and finally
 * rn_inithead as dom_rtattach with dom_rtoffset 32 and dom_maxrtkey
 * sizeof(struct sockaddr_in).
 */
struct domain inetdomain =
{ AF_INET, "internet", 0, 0, 0,
inetsw, &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0,
rn_inithead, 32, sizeof(struct sockaddr_in) };
socreate函数分配一个socket结构体后,会以PRU_ATTACH命令调用TCP协议的用户请求函数tcp_usrreq来初始化socket。tcp_usrreq函数会调用如下的tcp_attach函数:
/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state if to accept connections.
 *
 * As written here the new control block is always left in TCPS_CLOSED.
 * Returns 0 on success, the error from soreserve() or in_pcballoc(), or
 * ENOBUFS when no tcpcb could be allocated.
 */
int
tcp_attach(so)
struct socket *so;
{
register struct tcpcb *tp;
struct inpcb *inp;
int error;
/* Reserve buffer space only while either high-water mark is still 0. */
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, tcp_sendspace, tcp_recvspace); /* size the send and receive buffers */
if (error)
return (error);
}
error = in_pcballoc(so, &tcb); /* allocate an inpcb for the socket, linked on tcb */
if (error)
return (error);
inp = sotoinpcb(so);
tp = tcp_newtcpcb(inp); /* allocate the tcpcb */
if (tp == 0) {
/*
 * Hide SS_NOFDREF while the inpcb is detached so the socket
 * itself is not freed, then put the bit back.
 */
int nofd = so->so_state & SS_NOFDREF; /* XXX */
so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
in_pcbdetach(inp);
so->so_state |= nofd;
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED; /* the connection starts out CLOSED */
return (0);
}
soreserve函数设置缓冲区大小。默认的发送缓冲区和接收缓冲区大小都是8192字节。
in_pcballoc函数会创建一个inpcb结构体。这个结构体表示Internet域协议的协议控制块,定义如下:
/*
 * Common structure pcb for internet protocol implementation.
 * Here are stored pointers to local and foreign host table
 * entries, local and foreign socket numbers, and pointers
 * up (to a socket structure) and down (to a protocol-specific)
 * control block.
 *
 * in_pcballoc() creates one per socket and stores it in the socket's
 * so_pcb member; for TCP, tcp_newtcpcb() stores the tcpcb in inp_ppcb.
 */
struct inpcb {
struct inpcb *inp_next,*inp_prev;
/* pointers to other pcb's */
struct inpcb *inp_head; /* pointer back to chain of inpcb's
for this protocol */
struct in_addr inp_faddr; /* foreign host table entry */
u_short inp_fport; /* foreign port */
struct in_addr inp_laddr; /* local host table entry */
u_short inp_lport; /* local port */
struct socket *inp_socket; /* back pointer to socket */
caddr_t inp_ppcb; /* pointer to per-protocol pcb */
struct route inp_route; /* placeholder for routing entry */
int inp_flags; /* generic IP/datagram flags */
struct ip inp_ip; /* header prototype; should have more */
struct mbuf *inp_options; /* IP options */
struct ip_moptions *inp_moptions; /* IP multicast options */
};
它包含了这个socket的本地地址(inp_laddr),本地端口(inp_lport),外部地址(inp_faddr)和外部端口(inp_fport)。
对于TCP协议而言,inp_ppcb成员指向tcpcb结构,这个结构表示TCP协议的协议控制块。
inp_route成员保存这个连接的路由。
in_pcballoc函数的代码如下:
int
in_pcballoc(so, head)
struct socket *so;
struct inpcb *head;
{
register struct inpcb *inp;
MALLOC(inp, struct inpcb *, sizeof(*inp), M_PCB, M_WAITOK); /*分配一个inpcb结构体*/
if (inp == NULL)
return (ENOBUFS);
bzero((caddr_t)inp, sizeof(*inp));
inp->inp_head = head;
inp->inp_socket = so;
insque(inp, head); /*将这个inpcb结构体加入全局的tcb链表中*/
so->so_pcb = (caddr_t)inp;
return (0);
}
tcp_newtcpcb函数会创建一个表示TCP协议的协议控制块结构体tcpcb,它的定义如下:
/*
 * Tcp control block, one per tcp; fields:
 *
 * tcp_newtcpcb() allocates and zeroes one of these and stores it in the
 * inp_ppcb member of the owning inpcb; t_inpcb points back at that inpcb.
 */
struct tcpcb {
struct tcpiphdr *seg_next; /* sequencing queue */
struct tcpiphdr *seg_prev;
short t_state; /* state of this connection */
short t_timer[TCPT_NTIMERS]; /* tcp timers */
short t_rxtshift; /* log(2) of rexmt exp. backoff */
short t_rxtcur; /* current retransmit value */
short t_dupacks; /* consecutive dup acks recd */
u_short t_maxseg; /* maximum segment size */
char t_force; /* 1 if forcing out a byte */
u_short t_flags;
#define TF_ACKNOW 0x0001 /* ack peer immediately */
#define TF_DELACK 0x0002 /* ack, but try to delay it */
#define TF_NODELAY 0x0004 /* don't delay packets to coalesce */
#define TF_NOOPT 0x0008 /* don't use tcp options */
#define TF_SENTFIN 0x0010 /* have sent FIN */
#define TF_REQ_SCALE 0x0020 /* have/will request window scaling */
#define TF_RCVD_SCALE 0x0040 /* other side has requested scaling */
#define TF_REQ_TSTMP 0x0080 /* have/will request timestamps */
#define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */
#define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */
struct tcpiphdr *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
/*
* The following fields are used as in the protocol specification.
* See RFC793, Dec. 1981, page 21.
*/
/* send sequence variables */
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_nxt; /* send next */
tcp_seq snd_up; /* send urgent pointer */
tcp_seq snd_wl1; /* window update seg seq number */
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq iss; /* initial send sequence number */
u_long snd_wnd; /* send window */
/* receive sequence variables */
u_long rcv_wnd; /* receive window */
tcp_seq rcv_nxt; /* receive next */
tcp_seq rcv_up; /* receive urgent pointer */
tcp_seq irs; /* initial receive sequence number */
/*
* Additional variables for this implementation.
*/
/* receive variables */
tcp_seq rcv_adv; /* advertised window */
/* retransmit variables */
tcp_seq snd_max; /* highest sequence number sent;
* used to recognize retransmits
*/
/* congestion control (for slow start, source quench, retransmit after loss) */
u_long snd_cwnd; /* congestion-controlled window */
u_long snd_ssthresh; /* snd_cwnd size threshold
* for slow start exponential to
* linear switch
*/
/*
* transmit timing stuff. See below for scale of srtt and rttvar.
* "Variance" is actually smoothed difference.
*/
short t_idle; /* inactivity time */
short t_rtt; /* round trip time */
tcp_seq t_rtseq; /* sequence number being timed */
short t_srtt; /* smoothed round-trip time */
short t_rttvar; /* variance in round-trip time */
u_short t_rttmin; /* minimum rtt allowed */
u_long max_sndwnd; /* largest window peer has offered */
short t_softerror; /* possible error not yet reported */
/* RFC 1323 variables */
u_char snd_scale; /* window scaling for send window */
u_char rcv_scale; /* window scaling for recv window */
u_char request_r_scale; /* pending window scaling */
u_char requested_s_scale;
u_long ts_recent; /* timestamp echo data */
u_long ts_recent_age; /* when last updated */
tcp_seq last_ack_sent;
};
t_state成员表示TCP连接的状态。TCP连接有如下11个状态:
/*
 * TCP connection states.  The numbering is significant: the comments
 * between the definitions describe the ordering relations that code
 * relies on.  tcpstates[] maps each state number to a printable name.
 */
#define	TCP_NSTATES		11

#define	TCPS_CLOSED		0	/* closed */
#define	TCPS_LISTEN		1	/* listening for connection */
#define	TCPS_SYN_SENT		2	/* active, have sent syn */
#define	TCPS_SYN_RECEIVED	3	/* have sent and received syn */
/* states < TCPS_ESTABLISHED are those where connections not established */
#define	TCPS_ESTABLISHED	4	/* established */
#define	TCPS_CLOSE_WAIT		5	/* rcvd fin, waiting for close */
/* states > TCPS_CLOSE_WAIT are those where user has closed */
#define	TCPS_FIN_WAIT_1		6	/* have closed, sent fin */
#define	TCPS_CLOSING		7	/* closed xchd FIN; await FIN ACK */
#define	TCPS_LAST_ACK		8	/* had fin and close; await FIN ACK */
/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */
#define	TCPS_FIN_WAIT_2		9	/* have closed, fin is acked */
#define	TCPS_TIME_WAIT		10	/* in 2*msl quiet wait after close */

char *tcpstates[] = {
	"CLOSED",		/* 0 */
	"LISTEN",		/* 1 */
	"SYN_SENT",		/* 2 */
	"SYN_RCVD",		/* 3 */
	"ESTABLISHED",		/* 4 */
	"CLOSE_WAIT",		/* 5 */
	"FIN_WAIT_1",		/* 6 */
	"CLOSING",		/* 7 */
	"LAST_ACK",		/* 8 */
	"FIN_WAIT_2",		/* 9 */
	"TIME_WAIT",		/* 10 */
};
t_timer成员实现多个TCP定时器。TCP连接使用如下几个定时器:
/*
 * Definitions of the TCP timers. These timers are counted
 * down PR_SLOWHZ times a second.
 *
 * TCPT_NTIMERS is the size of the t_timer[] array in struct tcpcb; the
 * other values index into that array.
 */
#define TCPT_NTIMERS 4
#define TCPT_REXMT 0 /* retransmit */
#define TCPT_PERSIST 1 /* retransmit persistence */
#define TCPT_KEEP 2 /* keep alive */
#define TCPT_2MSL 3 /* 2*msl quiet time timer */
t_rxtcur成员表示当前的超时重传时间(RTO)。
t_template成员表示TCP首部和IP首部模板。
snd_una成员表示第一个未确认的序列号。
snd_nxt成员表示下一个发送的序列号。
snd_wl1成员记录最新接收的报文段的序列号,用于发送窗口更新。
snd_wl2成员记录最新接收的报文段的确认序列号,用于发送窗口更新。
iss成员表示起始的发送序列号。
snd_wnd成员表示发送窗口大小,即对端通告的窗口大小;实际可发送的数据量还受拥塞窗口snd_cwnd的限制,取两者中的较小值。
rcv_wnd成员表示接收窗口大小,由接收缓冲区的大小决定。
rcv_nxt成员表示下一个要接收的序列号。
irs成员表示起始的接收序列号。
rcv_adv成员表示接收窗口的右边界。
snd_max成员表示发送过的最大序列号。
snd_cwnd成员表示拥塞窗口。
snd_ssthresh成员表示慢启动门限。
t_idle成员表示连接空闲时间。
t_rtt成员表示被计时报文的往返时间。
t_rtseq成员表示被计时报文的序列号。
t_srtt成员表示平滑的RTT(估计器)。
t_rttvar成员表示平滑的RTT平均偏差(估计器)。
tcp_newtcpcb函数的代码如下:
/*
* Create a new TCP control block, making an
* empty reassembly queue and hooking it to the argument
* protocol control block.
*/
struct tcpcb *
tcp_newtcpcb(inp)
struct inpcb *inp;
{
register struct tcpcb *tp;
tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT); /*分配一个tcpcb结构体*/
if (tp == NULL)
return ((struct tcpcb *)0);
bzero((char *) tp, sizeof(struct tcpcb));
tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp;
tp->t_maxseg = tcp_mssdflt; /*默认的MSS 512*/
tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; /*是否支持窗口缩放因子选项和时间戳选项*/
tp->t_inpcb = inp;
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
* reasonable initial retransmit time.
*/
tp->t_srtt = TCPTV_SRTTBASE; /*0*/
tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << 2; /*初始值为24个时钟滴答 (4 * rttvat)*/
tp->t_rttmin = TCPTV_MIN;
TCPT_RANGESET(tp->t_rxtcur, /*初始RTO,12个时钟滴答,6秒钟*/
((TCPTV_SRTTBASE >> 2) + (TCPTV_SRTTDFLT << 2)) >> 1,
TCPTV_MIN, TCPTV_REXMTMAX);
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; /*拥塞窗口和慢启动门限设为最大值*/
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
inp->inp_ip.ip_ttl = ip_defttl; /*IP首部中的TTL值,默认64*/
inp->inp_ppcb = (caddr_t)tp;
return (tp);
}
下图显示了socket系统调用创建的几个结构体之间的关系: