前言:
本系列关于Linux内核Socket的文章均基于Linux-3.14.5版本的内核进行分析,文中的注释和问题说明也参考了网络上的经典分析文章,对他们的奉献表示感谢!
转载请标明:http://blog.chinaunix.net/uid-20788636-id-4408261.html
1. Socket内核调用函数SYSCALL_DEFINE3
Socket的创建是在用户空间调用socket系统函数完成的,创建一个Socket会返回一个文件描述符fd。内核的系统调用接口为SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol),定义在net/socket.c文件中,下面我们看一下内核中的源码实现。
/*
 * socket() system call entry point.
 *
 * Splits the SOCK_CLOEXEC/SOCK_NONBLOCK flag bits out of @type, creates the
 * socket through sock_create() and then maps it with sock_map_fd().
 *
 * Returns the non-negative result of sock_map_fd() on success (per the
 * article, the new file descriptor), -EINVAL when @type carries unknown flag
 * bits, or the negative error from sock_create()/sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int retval;

	/* Check the SOCK_* constants for consistency (build-time checks). */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Everything outside SOCK_TYPE_MASK is treated as a flag bit. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Create the socket; see the sock_create analysis in section 1.1. */
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;

	/* On mapping failure the freshly created socket is released here. */
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;
}
1.1 sock_create函数
sock_create(family, type, protocol, &sock)是一个封装函数,它内部调用的是:
__sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
__sock_create函数的定义如下:
int __sock_create(struct net *net, int
family, int type, int protocol,
struct socket **res, int kern)
{
int
err;
struct
socket *sock;
const
struct net_proto_family *pf;
/*
*
Check protocol is in range检查协议的范围,现在内核定义的最大范围为41,这里的family指的是AF_INET6,AF_INET协议簇
#define NPROTO AF_MAX
#define AF_MAX 41 /* For now.. */
*/
if
(family < 0 || family >= NPROTO)
return
-EAFNOSUPPORT;
if
(type < 0 || type >= SOCK_MAX)//这里的type是socket的类型例如SOCK_STREAM
return
-EINVAL;
/*
Compatibility.
This uglymoron is moved from INET layer to
here to avoid
deadlock in module load.
*/
if
(family == PF_INET && type == SOCK_PACKET) {//如果是该类型的socket,对family进行重新的赋值
static
int warned;//这里自动初始化为0,
if
(!warned) {
warned
= 1;
printk(KERN_INFO
"%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family
= PF_PACKET;//赋值为PF_PACKET
}
err
= security_socket_create(family, type, protocol, kern);
if
(err)
return
err;
/*
* Allocate
the socket and allow the family to set things up. if
* the
protocol is 0, the family is instructed to select an appropriate
* default.这里调用sock_alloc分配sock,见下面的分析
*/
sock = sock_alloc();
if
(!sock) {
net_warn_ratelimited("socket:
no more sockets\n");
return
-ENFILE; /* Not exactly a match,
but its the
closest posix thing */
}
sock->type
= type;
#ifdef CONFIG_MODULES
/*
Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY
only sense, if the user
* requested real, full-featured networking
support upon configuration.
* Otherwise module support will break!
*/
if
(rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d",
family);
#endif
rcu_read_lock();
pf
= rcu_dereference(net_families[family]);
err
= -EAFNOSUPPORT;
if
(!pf)
goto
out_release;
/*
* We will call the ->create function, that
possibly is in a loadable
* module, so we have to bump that loadable
module refcnt first.
*/
if
(!try_module_get(pf->owner))
goto
out_release;
/*
Now protected by module ref count */
rcu_read_unlock();
/*static const struct
net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner =
THIS_MODULE,
};这里根据注册的family类型,调用不同的create函数,这里就是调用inet_ctreate*/
err = pf->create(net, sock, protocol, kern);
if
(err < 0)
goto
out_module_put;
/*
* Now to bump the refcnt of the [loadable]
module that owns this
* socket at sock_release time we decrement its
refcnt.
*/
if
(!try_module_get(sock->ops->owner))
goto
out_module_busy;
/*
* Now that we're done with the ->create
function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err
= security_socket_post_create(sock, family, type, protocol, kern);
if
(err)
goto
out_sock_release;
*res
= sock;
return
0;
out_module_busy:
err
= -EAFNOSUPPORT;
out_module_put:
sock->ops
= NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return
err;
out_release:
rcu_read_unlock();
goto
out_sock_release;
}
1.1.1 sock_alloc函数
sock_alloc函数用于分配一个socket结构体,这里涉及了inode结构以及在分配完成后返回的地址指针。
/*
 * sock_alloc - allocate a socket together with its sockfs inode.
 *
 * Returns the new struct socket, or NULL when no inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	/*
	 * new_inode_pseudo() hands back a new inode. For sockfs the object
	 * actually allocated is a struct socket_alloc:
	 *
	 *   struct socket_alloc {
	 *	struct socket socket;
	 *	struct inode  vfs_inode;
	 *   };
	 *
	 * and the address returned is that of the embedded vfs_inode.
	 * Where sock_mnt is initialised is covered in note (1) of the article.
	 */
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
	if (inode == NULL)
		return NULL;

	/* Per the article, SOCKET_I() maps the inode back to its socket. */
	sock = SOCKET_I(inode);

	kmemcheck_annotate_bitfield(sock, type);

	/* Initialise the inode fields. */
	inode->i_ino = get_next_ino();
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	/* Owner uid; per the original note, compared later during bind(). */
	inode->i_uid = current_fsuid();
	/* Owner gid. */
	inode->i_gid = current_fsgid();
	inode->i_op = &sockfs_inode_ops;

	/* Account for one more socket on this CPU's counter. */
	this_cpu_add(sockets_in_use, 1);

	return sock;
}
(1)对于sock_mnt->mnt_sb的赋值和分配过程如下:
在sock_init函数中对socket类型的文件系统进行注册
/*
 * File system type for sockets, named "sockfs". It is registered and
 * mounted from sock_init().
 */
static struct file_system_type sock_fs_type = {
	.name		= "sockfs",
	.mount		= sockfs_mount,
	.kill_sb	= kill_anon_super,
};
/*
 * sock_init - early socket-layer initialisation.
 *
 * Sets up the network sysctl infrastructure, the skbuff cache and the socket
 * inode cache, then registers and mounts sockfs (which assigns sock_mnt).
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int __init sock_init(void)
{
	int err;

	/* Initialize the network sysctl infrastructure. */
	err = net_sysctl_init();
	if (err)
		return err;

	/* Initialize skbuff SLAB cache. */
	skb_init();

	/* Initialize the protocols module. */
	init_inodecache();

	/* Register the socket file system type. */
	err = register_filesystem(&sock_fs_type);
	if (err)
		return err;

	/* Mount it; a failed mount undoes the registration. */
	sock_mnt = kern_mount(&sock_fs_type);
	if (IS_ERR(sock_mnt)) {
		err = PTR_ERR(sock_mnt);
		unregister_filesystem(&sock_fs_type);
		return err;
	}

	/* The real protocol initialization is performed in later initcalls. */

#ifdef CONFIG_NETFILTER
	err = netfilter_init();
	if (err)
		return err;
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
	skb_timestamping_init();
#endif

	return err;
}
(2)new_inode_pseudo函数创建一个inode,并初始化inode的i_state变量和inode->i_sb_list链表,实际的分配函数为alloc_inode函数
struct inode *new_inode_pseudo(struct
super_block *sb)
{
struct inode *inode = alloc_inode(sb);
if
(inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
INIT_LIST_HEAD(&inode->i_sb_list);
}
return
inode;
}
alloc_inode分配一个inode节点,
/*
 * alloc_inode - allocate and initialise one inode for superblock @sb.
 *
 * Returns the inode, or NULL when allocation or inode_init_always() fails.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
	struct inode *inode;

	/*
	 * A superblock that supplies its own ->alloc_inode gets to allocate
	 * the inode itself; otherwise it comes from the shared inode cache.
	 * For sockets the article shows (net/socket.c):
	 *
	 *   static const struct super_operations sockfs_ops = {
	 *	.alloc_inode	= sock_alloc_inode,
	 *	.destroy_inode	= sock_destroy_inode,
	 *	.statfs		= simple_statfs,
	 *   };
	 *
	 * so sockfs inodes are allocated by sock_alloc_inode().
	 */
	if (sb->s_op->alloc_inode) {
		inode = sb->s_op->alloc_inode(sb);
	} else {
		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
	}

	if (!inode)
		return NULL;

	/* Initialise the inode; on failure free it the way it was made. */
	if (unlikely(inode_init_always(sb, inode))) {
		if (inode->i_sb->s_op->destroy_inode) {
			inode->i_sb->s_op->destroy_inode(inode);
		} else {
			kmem_cache_free(inode_cachep, inode);
		}
		return NULL;
	}

	return inode;
}
(3) 下面是sock_alloc_inode函数,在socket.c文件中
/*
 * sock_alloc_inode - sockfs ->alloc_inode implementation (net/socket.c).
 *
 * Takes one struct socket_alloc from sock_inode_cachep, attaches a freshly
 * allocated wait-queue structure to the embedded socket and resets the
 * socket's fields.
 *
 * Returns a pointer to the embedded vfs_inode, or NULL if either allocation
 * fails (the socket_alloc is freed again when only the second one fails).
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
	struct socket_wq *wq;

	/*
	 * The cache hands out struct socket_alloc objects; how its object
	 * size is configured is explained with init_inodecache().
	 */
	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
	if (ei == NULL)
		return NULL;

	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
	if (wq == NULL) {
		kmem_cache_free(sock_inode_cachep, ei);
		return NULL;
	}

	/* Prepare the wait queue and publish it in the socket. */
	init_waitqueue_head(&wq->wait);
	wq->fasync_list = NULL;
	RCU_INIT_POINTER(ei->socket.wq, wq);

	/* Start from an unconnected socket with nothing attached. */
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	/* Callers receive the struct inode member, not the container. */
	return &ei->vfs_inode;
}
备注说明:在分配函数sock_alloc_inode中调用了ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);这里分配的大小为socket_alloc的大小,下面分析一下是如何分配该大小的。
init_inodecache函数(定义在net/socket.c文件中)创建了该高速缓存,其对象大小为sizeof(struct socket_alloc);但sock_alloc_inode返回的是socket_alloc结构体中的struct inode vfs_inode成员。该函数在sock_init函数中被调用。
/*
 * init_inodecache - create the slab cache backing socket inodes.
 *
 * Every object in "sock_inode_cache" is sizeof(struct socket_alloc) bytes;
 * init_once is passed as the final argument of kmem_cache_create().
 *
 * Returns 0 on success or -ENOMEM when the cache cannot be created.
 */
static int init_inodecache(void)
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
					      sizeof(struct socket_alloc),
					      0,
					      (SLAB_HWCACHE_ALIGN |
					       SLAB_RECLAIM_ACCOUNT |
					       SLAB_MEM_SPREAD),
					      init_once);

	return (sock_inode_cachep == NULL) ? -ENOMEM : 0;
}
1.1.2 inet_create函数
在__sock_create函数中调用了pf->create,对于PF_INET协议簇,这里的函数指针为inet_create,定义在文件net/ipv4/af_inet.c中。
/*
 * inet_create - ->create() handler of the PF_INET family.
 * @net:      network namespace the socket is created in
 * @sock:     socket to set up; sock->type selects the inetsw[] list searched
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard
 * @kern:     when non-zero, the CAP_NET_RAW check for SOCK_RAW is skipped
 *
 * Looks up the matching inet_protosw entry (attempting up to two module
 * loads when nothing matches), allocates the struct sock and initialises
 * the generic and inet-specific fields.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	/*
	 * Search inetsw[] by socket type and protocol for the matching
	 * socket interface, i.e. the inet_protosw entry.
	 */
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	/* Nothing matched: try to load a module and search again. */
	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	/* Copy what is needed from the entry before leaving the RCU section. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(answer_prot->slab == NULL);

	/*
	 * sk_alloc() nominally produces a struct sock. Per the article, for
	 * TCP the object is really tcp_sock-sized, which is why casts such as
	 * inet_sk(sk) are usable; how that size is chosen is analysed later
	 * in the series.
	 */
	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	/* Initialise the generic fields of sk -- see note (1), sock_init_data. */
	sock_init_data(sock, sk);

	sk->sk_destruct = inet_sock_destruct;
	sk->sk_protocol = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl = -1;
	inet->mc_loop = 1;
	inet->mc_ttl = 1;
	inet->mc_all = 1;
	inet->mc_index = 0;
	inet->mc_list = NULL;
	inet->rcv_tos = 0;

	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/*
		 * It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {
		/* For TCP this is tcp_v4_init_sock -- see note (2). */
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
(1)sock_init_data函数分析
/*
 * sock_init_data - initialise the generic fields of a struct sock.
 * @sock: owning socket, may be NULL; when present it is linked both ways
 *        with @sk and its type and wait queue are copied into @sk
 * @sk:   sock to initialise
 *
 * Sets up the queues, timer, buffer sizes, default callbacks and timeouts,
 * then stores the initial reference count after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head = NULL;

	/* Initialise the sk timer. */
	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	/* Start in TCP_CLOSE; per the article, later system calls test this. */
	sk->sk_state = TCP_CLOSE;
	/* Per the original note: sk->sk_socket = sock. */
	sk_set_socket(sk, sock);

	/*
	 * Set the SOCK_ZAPPED flag on sk.
	 * NOTE(review): the original author was unsure what this flag means;
	 * confirm its semantics against the kernel headers.
	 */
	sock_set_flag(sk, SOCK_ZAPPED);

	if (!sock) {
		sk->sk_wq = NULL;
	} else {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		/* Back-pointer: the struct socket's sk refers to this sock. */
		sock->sk = sk;
	}

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks; inet_create() later replaces sk_destruct. */
	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;

	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	/* The reference count is set to 1 here, not incremented. */
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
(2)static int
tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk =
inet_csk(sk);//基于上面的原因分析,其实这里可以进行强制的类型转换
tcp_init_sock(sk);//进行tcp相关变量的初始化工作
icsk->icsk_af_ops =
&ipv4_specific;
#ifdef
CONFIG_TCP_MD5SIG
tcp_sk(sk)->af_specific =
&tcp_sock_ipv4_specific;
#endif
return 0;
}