这个函数初始化socket的主要数据结构:
socket --> inode
tcp_sock --> inet_connection_sock --> inet_sock --> sock --> sock_common
linux版本:4.9.51
在APP中, listenfd = socket(AF_INET, SOCK_STREAM, 0);
int socket(int domain, int type, int protocol);
因此 type=SOCK_STREAM, protocol=0,是使用默认值。
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) -- sock_create -- __sock_create
实现函数在 __sock_create 中
1 sock_alloc
分配结构体,关联inode,并设置fops
struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
return NULL;
sock = SOCKET_I(inode);
inode->i_op = &sockfs_inode_ops; // 设置ops
return sock;
}
static const struct inode_operations sockfs_inode_ops = {
.listxattr = sockfs_listxattr,
};
// new_inode_pseudo --> 会调用到这里:
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
wq = kmalloc(sizeof(*wq), GFP_KERNEL);
if (!wq) {
kmem_cache_free(sock_inode_cachep, ei);
return NULL;
}
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
wq->flags = 0;
RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;
return &ei->vfs_inode;
}
设置socket的类型
// socket是面向VFS的结构体
/**
* struct socket - general BSD socket
* @state: socket state (%SS_CONNECTED, etc)
* @type: socket type (%SOCK_STREAM, etc)
* @flags: socket flags (%SOCK_NOSPACE, etc)
* @ops: protocol specific socket operations
* @file: File back pointer for gc
* @sk: internal networking protocol agnostic socket representation
* @wq: wait queue for several uses
*/
struct socket {
socket_state state;
kmemcheck_bitfield_begin(type);
short type;
kmemcheck_bitfield_end(type);
unsigned long flags;
struct socket_wq __rcu *wq;
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
};
sock->type = type;
2 由入参family 找到inet_family_ops ,进而调用create
struct net_proto_family {
int family;
int (*create)(struct net *net, struct socket *sock,
int protocol, int kern);
struct module *owner;
};
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
const struct net_proto_family *pf;
pf = rcu_dereference(net_families[family]); // pf = inet_family_ops
err = pf->create(net, sock, protocol, kern);// inet_create
}
/*
net_families 的初始化在 sock_register 完成
*/
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
static int __init inet_init(void)
{
(void)sock_register(&inet_family_ops);
}
3 第二步通过net_families 完成分流后,进入主体 inet_create
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
}, // 这个结构体就是下面的for循环要找的
... ... 为了看着清晰,删掉了其他没用到的。
};
enum sock_type {
SOCK_STREAM = 1,
... ...
SOCK_PACKET = 10,
};
#define SOCK_MAX (SOCK_PACKET + 1)
static struct list_head inetsw[SOCK_MAX];
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
/* Socket demultiplex comparisons on incoming packets. */
#define inet_daddr sk.__sk_common.skc_daddr
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
#define inet_dport sk.__sk_common.skc_dport
#define inet_num sk.__sk_common.skc_num
__be32 inet_saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
__be16 inet_sport;
__u16 inet_id;
struct ip_options_rcu __rcu *inet_opt;
int rx_dst_ifindex;
__u8 tos;
__u8 min_ttl;
__u8 mc_ttl;
__u8 pmtudisc;
__u8 recverr:1,
is_icsk:1,
freebind:1,
hdrincl:1,
mc_loop:1,
transparent:1,
mc_all:1,
nodefrag:1;
__u8 bind_address_no_port:1;
__u8 rcv_tos;
__u8 convert_csum;
int uc_index;
int mc_index;
__be32 mc_addr;
struct ip_mc_socklist __rcu *mc_list;
struct inet_cork_full cork;
};
// 这个函数创建了sock, inet_sock, 并对inet_sock做了基本的初始化
/*
* Create an inet socket.
*/
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
int try_loading_module = 0;
int err;
sock->state = SS_UNCONNECTED; // 先初始化socket的状态
/* 找出合适的 inet_protosw 结构体 */
/* Look for the requested type/protocol pair. */
lookup_protocol:
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
// 此处的sock->type 是在上一步设定的,其实是入参type. 是SOCK_STREAM
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases (野生情况, 什么意思?). */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}
// 用找到的 inet_protosw 赋值
sock->ops = answer->ops; //inet_stream_ops
answer_prot = answer->prot;
answer_flags = answer->flags;
// 开始分配 sock. 这是一个核心的数据结构,并且要与socket区分开
// tcp_sock --> inet_connection_sock --> inet_sock --> sock --> sock_common
// 参考 https://blog.csdn.net/yinming4u/article/details/84312871
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); //answer_prot是 tcp_prot
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; // tcp_v4_do_rcv
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
// inet->inet_num = 0
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
err = sk->sk_prot->hash(sk);
if (err) {
sk_common_release(sk);
goto out;
}
}
// tcp_v4_init_sock
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
初始化 prot->slab
static int __init inet_init(void)
{
rc = proto_register(&tcp_prot, 1);
}
int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
SLAB_HWCACHE_ALIGN | prot->slab_flags,
NULL);
}
list_add(&prot->node, &proto_list);
return 0;
}
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
.diag_destroy = tcp_abort,
};
分配sock
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
*/
struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
struct sock * (*accept)(struct sock *sk, int flags, int *err);
......
}
/**
* sk_alloc - All socket objects are allocated here
* @net: the applicable net namespace
* @family: protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
* @prot: struct proto associated with this new sock instance
* @kern: is this to be a kernel socket?
*/
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot, int kern)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
// 设置prot
sk->sk_prot = sk->sk_prot_creator = prot;
sock_net_set(sk, net);
}
return sk;
}
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;
slab = prot->slab;
if (slab != NULL) {
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
if (priority & __GFP_ZERO)
sk_prot_clear_nulls(sk, prot->obj_size);
} else
sk = kmalloc(prot->obj_size, priority);
return sk;
}
sock 是 inet_sock的父类,所以可以这样转换
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk; // sock是第一个元素,所以inet_sk可以这样转换这两个指针
... ... // 其他元素
};
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
return (struct inet_sock *)sk;
}
初始化sock
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_init_sock(sk);
// 初始化 inet_connection_sock
icsk->icsk_af_ops = &ipv4_specific;
return 0;
}
// 初始化 tcp_sock, sock
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
* sk_alloc() so need not be done here.
*/
void tcp_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
tp->snd_cwnd = TCP_INIT_CWND;
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
u64_stats_init(&tp->syncp);
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_sync_mss = tcp_sync_mss;
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
local_bh_disable();
sk_sockets_allocated_inc(sk);
local_bh_enable();
}