源码基于linux 3.0.35, imx6,refer and tks
linux 内核源代码剖析-tcpip实现,深入浅出linux tcpip,tcpip详解1
1.协议栈架构,一图概括
2.协议栈初始化流程
core_initcall(sock_init)-----------net/socket.c---------socket文件系统的初始化
subsys_initcall(proto_init)---net/core/sock.c---传输层TCP的初始化
module_init(dm9000_init)-----------drivers/net/dm9000.c---driver
上面这几个层次的初始化,哪个先执行?看3
3.内核怎么初始化initcalls段的?
init/main.c
start_kernel -> rest_init -> kernel_init -> do_basic_setup -> do_initcalls
此函数会依次调用下面各个段的函数
include/asm-generic/vmlinux.lds.h
所以,经 core_initcall修饰的sock_init最先执行
4.协议栈send,recv流程大致走一遍
TCP消息下行(应用-->驱动):
TCP消息上行(应用<--驱动):
先看原型
int socket(int domain, int type, int protocol);
domain 目前可以取值AF_UNIX, AF_LOCAL, AF_INET, AF_INET6, AF_IPX, AF_NETLINK, AF_BLUETOOTH, AF_CAN 等
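type 目前可以取值SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET, SOCK_RAW, SOCK_PACKET 等
protocol:tcp/ip协议栈中通常取值0(表示使用该type对应的默认协议,例如SOCK_STREAM对应IPPROTO_TCP),也可以显式指定IPPROTO_TCP, IPPROTO_UDP等;
bluez协议栈用它区分具体协议,可取值 BTPROTO_L2CAP, BTPROTO_HCI, BTPROTO_SCO, BTPROTO_RFCOMM, BTPROTO_BNEP, BTPROTO_CMTP, BTPROTO_HIDP, BTPROTO_AVDTP等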
内核流程:
1.sys_socket(net/socket.c)
2.sock_create(net/socket.c)
3.__sock_create(net/socket.c )
此函数中根据domain = AF_INET,通过函数pf = rcu_dereference(net_families[family]);
找到结构体inet_family_ops (net/ipv4/af_inet.c)
/*
 * Protocol-family descriptor for PF_INET. __sock_create() finds it through
 * net_families[family] and then calls its .create hook, which is inet_create.
 */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
4.pf->create(net, sock, protocol, kern);(net/socket.c)
5.即调用inet_family_ops 的inet_create (net/ipv4/af_inet.c)
6.创建socket sock
此函数中根据type = SOCK_STREAM,从静态数组struct inet_protosw inetsw_array(net/ipv4/af_inet.c)中找到对应SOCK_STREAM的结构体,
将结构体的struct proto_ops ops和struct proto prot赋给sock,最终会形成
sock->ops = inet_stream_ops(inet_stream_ops位于net/ipv4/af_inet.c)
sock->sk->sk_prot = tcp_prot (tcp_prot位于net/ipv4/tcp_ipv4.c)
可见,重要的几个结构体是下面这三个:
struct net_proto_family
struct proto_ops
struct proto
而struct inet_protosw主要是为了方便绑定proto_ops 和 proto
6.协议无关接口的作用是:
不管是什么协议,应用层都可以通过send,recv系统调用操作数据。比如
net/ipv4/af_inet.c来处理tcpip协议的协议无关
net/bluetooth/af_bluetooth.c来处理bluez协议的协议无关
而实现协议无关,只需要定义一个socket调用接口,如下
1.实现一个结构体struct net_proto_family,标记上是family 协议比如PF_INET,PF_BLUETOOTH 并注册到内核上。在应用层执行socket系统调用的时候会指定哪种family ,然后内核会根据family 来找具体的那个net_proto_family,然后调用这个结构体的create
2.实现struct proto_ops的各个函数指针,在应用层执行系统调用send,recv时,会调用proto_ops结构体的sendmsg和recvmsg
linux 内核源代码剖析-tcpip实现,深入浅出linux tcpip,tcpip详解1
1.协议栈架构,一图概括
2.协议栈初始化流程
core_initcall(sock_init)-----------net/socket.c---------socket文件系统的初始化
subsys_initcall(proto_init)---net/core/sock.c---传输层TCP的初始化
subsys_initcall(net_dev_init)---net/core/dev.c---设备无关接口的初始化
fs_initcall(inet_init)----------net/ipv4/af_inet.c---协议无关接口的初始化(部分协议栈的初始化),上图的INET
module_init(dm9000_init)-----------drivers/net/dm9000.c---driver
上面的这5个层次的初始化,哪个先执行?看3
3.内核怎么初始化initcalls段的?
init/main.c
start_kernel -> rest_init -> kernel_init -> do_basic_setup -> do_initcalls
此函数会依次调用下面各个段的函数
include/asm-generic/vmlinux.lds.h
/*
 * Linker-script fragment that lists the initcall input sections in order:
 * the early section first, then levels 0 through 7 (each followed by its
 * "s" variant), with the rootfs section placed between level 5 and level 6.
 */
#define INITCALLS \
*(.initcallearly.init) \
VMLINUX_SYMBOL(__early_initcall_end) = .; \
*(.initcall0.init) \
*(.initcall0s.init) \
*(.initcall1.init) \
*(.initcall1s.init) \
*(.initcall2.init) \
*(.initcall2s.init) \
*(.initcall3.init) \
*(.initcall3s.init) \
*(.initcall4.init) \
*(.initcall4s.init) \
*(.initcall5.init) \
*(.initcall5s.init) \
*(.initcallrootfs.init) \
*(.initcall6.init) \
*(.initcall6s.init) \
*(.initcall7.init) \
*(.initcall7s.init)
include/linux/init.h
/*
 * Initcall registration macros. Each one forwards fn to __define_initcall()
 * with a level string ("0" .. "7", "rootfs", plus the "s" variants); that
 * string selects the ".initcall<level>.init" section the entry is stored in.
 */
#define pure_initcall(fn) __define_initcall("0",fn,0)
/* Level 1: the notes register sock_init (net/socket.c) at this level. */
#define core_initcall(fn) __define_initcall("1",fn,1)
#define core_initcall_sync(fn) __define_initcall("1s",fn,1s)
#define postcore_initcall(fn) __define_initcall("2",fn,2)
#define postcore_initcall_sync(fn) __define_initcall("2s",fn,2s)
#define arch_initcall(fn) __define_initcall("3",fn,3)
#define arch_initcall_sync(fn) __define_initcall("3s",fn,3s)
/* Level 4: the notes register proto_init and net_dev_init at this level. */
#define subsys_initcall(fn) __define_initcall("4",fn,4)
#define subsys_initcall_sync(fn) __define_initcall("4s",fn,4s)
/* Level 5: the notes register inet_init (net/ipv4/af_inet.c) at this level. */
#define fs_initcall(fn) __define_initcall("5",fn,5)
#define fs_initcall_sync(fn) __define_initcall("5s",fn,5s)
#define rootfs_initcall(fn) __define_initcall("rootfs",fn,rootfs)
#define device_initcall(fn) __define_initcall("6",fn,6)
#define device_initcall_sync(fn) __define_initcall("6s",fn,6s)
#define late_initcall(fn) __define_initcall("7",fn,7)
#define late_initcall_sync(fn) __define_initcall("7s",fn,7s)
/*
 * Defines a static initcall_t variable named __initcall_<fn><id>, initialised
 * to fn, marked __used, and placed in the section ".initcall<level>.init".
 */
#define __define_initcall(level,fn,id) \
static initcall_t __initcall_##fn##id __used \
__attribute__((__section__(".initcall" level ".init"))) = fn
所以,经 core_initcall修饰的sock_init最先执行
4.协议栈send,recv流程大致走一遍
TCP消息下行(应用-->驱动):
1 | sendto | 应用层 | |
2 | sys_sendto | net/socket.c | 系统调用接口(socket文件系统) |
3 | sock_sendmsg | net/socket.c | |
4 | __sock_sendmsg_nosec | net/socket.c | |
5 | sock->ops->sendmsg | net/socket.c | |
6 | inet_sendmsg (inet_stream_ops) | net/ipv4/af_inet.c | 协议无关接口 |
7 | sk->sk_prot->sendmsg | net/ipv4/af_inet.c | |
8 | tcp_sendmsg (tcp_prot) | net/ipv4/tcp.c | TCP层 |
9 | tcp_push | net/ipv4/tcp.c | |
10 | __tcp_push_pending_frames | net/ipv4/tcp_output.c | |
11 | tcp_write_xmit | net/ipv4/tcp_output.c | |
12 | tcp_transmit_skb | net/ipv4/tcp_output.c | |
13 | icsk->icsk_af_ops->queue_xmit | net/ipv4/tcp_output.c | |
14 | ip_queue_xmit (ipv4_specific) | net/ipv4/ip_output.c | IP层 |
15 | ip_local_out | net/ipv4/ip_output.c | |
16 | dst_output | include/net/dst.h | |
17 | skb_dst(skb)->output | include/net/dst.h | |
18 | ip_output | net/ipv4/ip_output.c | |
19 | ip_finish_output | net/ipv4/ip_output.c | |
20 | ip_finish_output2 | net/ipv4/ip_output.c | |
21 | neigh->output | net/ipv4/ip_output.c | |
22 | dev_queue_xmit | net/core/dev.c | 设备无关接口 |
23 | dev_hard_start_xmit | net/core/dev.c | |
24 | ops->ndo_start_xmit | net/core/dev.c | |
25 | dm9000_start_xmit | drivers/net/dm9000.c | 链路层,driver |
TCP消息上行(应用<--驱动):
28 | recvfrom | 应用层 | |
27 | sys_recvfrom | net/socket.c | 系统调用接口(socket文件系统) |
26 | sock_recvmsg | net/socket.c | |
25 | __sock_recvmsg_nosec | net/socket.c | |
24 | sock->ops->recvmsg | net/socket.c | |
23 | inet_recvmsg (inet_stream_ops) | net/ipv4/af_inet.c | 协议无关接口 |
22 | sk->sk_prot->recvmsg | net/ipv4/af_inet.c | |
21 | tcp_recvmsg (tcp_prot) | net/ipv4/tcp.c | TCP层 |
20 | skb_queue_walk | net/ipv4/tcp.c | |
19 | __skb_queue_tail(加入队列) | net/ipv4/tcp_input.c | |
18 | tcp_rcv_established | net/ipv4/tcp_input.c | |
17 | tcp_v4_do_rcv | net/ipv4/tcp_ipv4.c | |
16 | tcp_v4_rcv | net/ipv4/tcp_ipv4.c | |
15 | ipprot->handler | net/ipv4/ip_input.c | IP层 |
14 | ip_local_deliver_finish | net/ipv4/ip_input.c | |
13 | ip_local_deliver | net/ipv4/ip_input.c | |
12 | skb_dst(skb)->input | include/net/dst.h | |
11 | dst_input | include/net/dst.h | |
10 | ip_rcv_finish | net/ipv4/ip_input.c | |
9 | ip_rcv | net/ipv4/ip_input.c | |
8 | deliver_skb | net/core/dev.c | 设备无关接口 |
7 | __netif_receive_skb | net/core/dev.c | |
6 | process_backlog | net/core/dev.c | |
5 | net_rx_action(软中断) | net/core/dev.c | |
4 | enqueue_to_backlog | net/core/dev.c | |
3 | netif_rx (Non NAPI) | net/core/dev.c | |
2 | dm9000_rx | drivers/net/dm9000.c | driver |
1 | dm9000_interrupt | drivers/net/dm9000.c | |
5.协议栈socket系统调用走一遍
应用层执行int fd = socket(AF_INET, SOCK_STREAM, 0)时,内核做了什么?先看原型
int socket(int domain, int type, int protocol);
domain 目前可以取值AF_UNIX, AF_LOCAL, AF_INET, AF_INET6, AF_IPX, AF_NETLINK, AF_BLUETOOTH, AF_CAN 等
type 目前可以取值SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET, SOCK_RAW, SOCK_PACKET 等
protocol:tcp/ip协议栈中通常取值0(表示使用该type对应的默认协议,例如SOCK_STREAM对应IPPROTO_TCP),也可以显式指定IPPROTO_TCP, IPPROTO_UDP等;
bluez协议栈用它区分具体协议,可取值 BTPROTO_L2CAP, BTPROTO_HCI, BTPROTO_SCO, BTPROTO_RFCOMM, BTPROTO_BNEP, BTPROTO_CMTP, BTPROTO_HIDP, BTPROTO_AVDTP等
内核流程:
1.sys_socket(net/socket.c)
2.sock_create(net/socket.c)
3.__sock_create(net/socket.c )
此函数中根据domain = AF_INET,通过函数pf = rcu_dereference(net_families[family]);
找到结构体inet_family_ops (net/ipv4/af_inet.c)
/*
 * Protocol-family descriptor for PF_INET. __sock_create() finds it through
 * net_families[family] and then calls its .create hook, which is inet_create.
 */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
4.pf->create(net, sock, protocol, kern);(net/socket.c)
5.即调用inet_family_ops 的inet_create (net/ipv4/af_inet.c)
6.创建socket sock
此函数中根据type = SOCK_STREAM,从静态数组struct inet_protosw inetsw_array(net/ipv4/af_inet.c)中找到对应SOCK_STREAM的结构体,
将结构体的struct proto_ops ops和struct proto prot赋给sock,最终会形成
sock->ops = inet_stream_ops(inet_stream_ops位于net/ipv4/af_inet.c)
sock->sk->sk_prot = tcp_prot (tcp_prot位于net/ipv4/tcp_ipv4.c)
可见,重要的几个结构体是下面这三个:
struct net_proto_family
struct proto_ops
struct proto
而struct inet_protosw主要是为了方便绑定proto_ops 和 proto
6.协议无关接口的作用是:
不管是什么协议,应用层都可以通过send,recv系统调用操作数据。比如net/ipv4/af_inet.c来处理tcpip协议的协议无关
net/bluetooth/af_bluetooth.c来处理bluez协议的协议无关
而实现协议无关,只需要定义一个socket调用接口,如下
1.实现一个结构体struct net_proto_family,标记上是family 协议比如PF_INET,PF_BLUETOOTH 并注册到内核上。在应用层执行socket系统调用的时候会指定哪种family ,然后内核会根据family 来找具体的那个net_proto_family,然后调用这个结构体的create
2.实现struct proto_ops的各个函数指针,在应用层执行系统调用send,recv时,会调用proto_ops结构体的sendmsg和recvmsg
7.tcp中几个重要的结构体实例
//net/ipv4/af_inet.c
/*
 * Table of struct inet_protosw entries. Each entry ties a socket type and
 * protocol number to a transport-level struct proto (.prot) and a
 * socket-level struct proto_ops (.ops). The notes describe the socket
 * creation path picking the entry that matches the requested type.
 */
static struct inet_protosw inetsw_array[] =
{
/* SOCK_STREAM / IPPROTO_TCP: tcp_prot with inet_stream_ops. */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* SOCK_DGRAM / IPPROTO_UDP: udp_prot with inet_dgram_ops. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
/* SOCK_DGRAM / IPPROTO_ICMP: ping_prot with inet_dgram_ops. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
/* SOCK_RAW / IPPROTO_IP (wild card): raw_prot with inet_sockraw_ops. */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
//net/ipv4/af_inet.c
/*
 * struct proto_ops instance for PF_INET stream sockets; it is the .ops of the
 * SOCK_STREAM entry in inetsw_array. Its sendmsg/recvmsg hooks are
 * inet_sendmsg/inet_recvmsg, the functions reached through
 * sock->ops->sendmsg and sock->ops->recvmsg in the call chains above.
 */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
/* The compat_* hooks below are only present when CONFIG_COMPAT is defined. */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
//net/ipv4/af_inet.c
/*
 * struct net_protocol instance for TCP. Its .handler is tcp_v4_rcv, the
 * function the receive path above reaches through ipprot->handler.
 */
static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
//net/ipv4/tcp_ipv4.c
/*
 * struct proto instance named "TCP"; it is the .prot of the SOCK_STREAM entry
 * in inetsw_array and ends up in sock->sk->sk_prot. Its sendmsg/recvmsg hooks
 * are tcp_sendmsg/tcp_recvmsg, the functions reached through
 * sk->sk_prot->sendmsg and sk->sk_prot->recvmsg in the call chains above.
 */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
/* Object size is that of struct tcp_sock. */
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
/* The compat_* hooks below are only present when CONFIG_COMPAT is defined. */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
linux 用socket结构体描述套接字接口,关键成员是sk(sk->sk_prot,此ops指向传输层)和ops(此ops代表用户接口层)
/*
 * Socket object as discussed in these notes. The two members the notes
 * single out are sk (whose sk_prot points at the transport-layer struct
 * proto) and ops (the user-interface-level struct proto_ops).
 */
struct socket {
socket_state state;
kmemcheck_bitfield_begin(type);
short type;
kmemcheck_bitfield_end(type);
unsigned long flags;
struct socket_wq __rcu *wq;
struct file *file;
/* sk->sk_prot is the struct proto, e.g. tcp_prot for a TCP socket. */
struct sock *sk;
/* e.g. inet_stream_ops for an AF_INET SOCK_STREAM socket. */
const struct proto_ops *ops;
};