本篇博客主要记录socket创建时的流程,其它socket接口API详见:点击打开链接
1. 应用层创建socket套接字
int socket(int domain, int type, int protocol);
参数说明:
domain:协议域,又称协议族(family)。常用的协议族有AF_INET、AF_INET6、AF_LOCAL(或称AF_UNIX,Unix域Socket)、AF_ROUTE等。协议族决定了socket的地址类型,在通信中必须采用对应的地址,如AF_INET决定了要用ipv4地址(32位的)与端口号(16位的)的组合、AF_UNIX决定了要用一个绝对路径名作为地址;
type:指定Socket类型。常用的socket类型有SOCK_STREAM、SOCK_DGRAM、SOCK_RAW、SOCK_PACKET、SOCK_SEQPACKET等。流式Socket(SOCK_STREAM)是一种面向连接的Socket,针对于面向连接的TCP服务应用。数据报式Socket(SOCK_DGRAM)是一种无连接的Socket,对应于无连接的UDP服务应用;
protocol:指定协议。常用协议有IPPROTO_TCP、IPPROTO_UDP、IPPROTO_SCTP、IPPROTO_TIPC等,分别对应TCP传输协议、UDP传输协议、SCTP传输协议、TIPC传输协议。
2. socket创建时的套接字堆栈信息
CPU: 0 PID: 472 Comm: init Not tainted 3.10.32 #216
Backtrace:
[<c0012df8>] (dump_backtrace+0x0/0x10c) from [<c0012f1c>] (show_stack+0x18/0x1c)
r7:00000000 r6:00000001 r5:00000002 r4:00000000
[<c0012f04>] (show_stack+0x0/0x1c) from [<c03244bc>] (dump_stack+0x20/0x2c)
[<c032449c>] (dump_stack+0x0/0x2c) from [<c029e3f4>] (SyS_socket+0x28/0xcc)
[<c029e3cc>] (SyS_socket+0x0/0xcc) from [<c000f7c0>] (ret_fast_syscall+0x0/0x2c)
r8:c000f948 r7:00000119 r6:b6fdc33c r5:00000002 r4:ffffffff
3. socket内核源码分析
根据第2.点的堆栈信息分析其对应的源码如下:
/*
 * socket(2) system call entry point.
 *
 * Splits the user-supplied 'type' into the socket type proper and the
 * SOCK_CLOEXEC / SOCK_NONBLOCK flag bits, creates the socket with
 * sock_create(), then binds it to a file descriptor with sock_map_fd().
 *
 * Returns the value produced by sock_map_fd() on success, -EINVAL when
 * unknown flag bits are set in 'type', or the negative error returned by
 * sock_create() / sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
int retval;
struct socket *sock; // socket object filled in by sock_create()
int flags;
/* Check the SOCK_* constants for consistency. */
BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
// Everything outside SOCK_TYPE_MASK is treated as flag bits.
flags = type & ~SOCK_TYPE_MASK;
// SOCK_CLOEXEC (close-on-exec): the descriptor is closed when the process calls exec
// SOCK_NONBLOCK: put the socket into non-blocking mode
// Usage example: http://blog.chinaunix.net/uid-24907956-id-3969651.html
// Only these two flags are accepted; any other bit is rejected.
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
type &= SOCK_TYPE_MASK;
// When the SOCK_NONBLOCK and O_NONBLOCK values differ, translate to the O_* value.
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
retval = sock_create(family, type, protocol, &sock); // create the socket; source shown further below
if (retval < 0)
goto out;
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); // bind the socket to a file descriptor; source shown further below
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
// sock_map_fd() failed: drop the socket created above.
sock_release(sock);
return retval;
}
在SYSCALL_DEFINE3这个函数内部主要完成两个工作:
第一,socket创建 sock_create(...);
第二,socket创建之后通过sock_map_fd映射对应的fd并返回给应用程序。
接下来,逐个分析!
4. 内核socket创建
retval = sock_create(family, type, protocol, &sock); // create the socket
/*
 * Create a socket in the network namespace of the calling task.
 *
 * Thin wrapper: forwards to __sock_create() with
 * current->nsproxy->net_ns as the namespace and kern == 0, and returns
 * whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *caller_ns = current->nsproxy->net_ns;

	return __sock_create(caller_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
其中current->nsproxy->net_ns是当前进程所属的网络命名空间,详见:点击打开链接
/*
 * Allocate a struct socket and let the protocol family selected by
 * 'family' initialise it through its ->create() hook.
 *
 * net:      network namespace, passed through to pf->create()
 * family:   index into net_families[]; must be in [0, NPROTO)
 * type:     socket type; must be in [0, SOCK_MAX)
 * protocol: passed through to the security hooks and to pf->create()
 * res:      receives the new socket on success
 * kern:     passed through to the security hooks and to pf->create()
 *
 * Returns 0 on success or an error code. Every failure after sock_alloc()
 * ends in sock_release() on the half-built socket.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO) // range-check the protocol family
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX) // range-check the socket type
return -EINVAL;
/* Compatibility.
This ugly moron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern); // security hook; a non-zero result aborts the creation
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc(); // allocate a new socket
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type; // record the socket type
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
// Use family as an index into the global net_families[] array; when the
// slot is empty, request the module named "net-pf-<family>".
// For how protocol families get registered, see:
// https://blog.csdn.net/chenliang0224/article/details/80330756
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); // fetch the protocol family under RCU
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
// Let the protocol family set up the socket.
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
// Error unwinding: each label falls through into the next one.
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
// Reached with the RCU read lock still held.
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
在该函数__sock_create内部,主要完成以下几个工作,
第一,动态分配一个socket
/*
 * Allocate a new socket together with its inode on sock_mnt's superblock.
 *
 * Returns the struct socket obtained from the new inode via SOCKET_I(),
 * or NULL when no inode could be allocated. On success the per-CPU
 * sockets_in_use counter is incremented.
 */
static struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
// Create a new pseudo inode.
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
return NULL;
// Get the struct socket that belongs to this inode.
sock = SOCKET_I(inode);
// NOTE(review): kmemcheck_annotate_bitfield is a macro; presumably it marks
// the bitfield named 'type' inside *sock as initialized for kmemcheck —
// confirm against the kmemcheck headers.
kmemcheck_annotate_bitfield(sock, type);
inode->i_ino = get_next_ino(); // inode number handed out by get_next_ino()
inode->i_mode = S_IFSOCK | S_IRWXUGO; // file type socket | rwx for user, group and others
inode->i_uid = current_fsuid(); // owner: filesystem UID of the current task
inode->i_gid = current_fsgid(); // group: filesystem GID of the current task
inode->i_op = &sockfs_inode_ops; // install the socket inode operations
this_cpu_add(sockets_in_use, 1);
return sock;
}
其中socket节点操作句柄为
/* Inode operations installed on socket inodes by sock_alloc(); only the xattr hooks are set. */
static const struct inode_operations sockfs_inode_ops = {
.getxattr = sockfs_getxattr,
.listxattr = sockfs_listxattr,
};
第二,根据协议族family,在全局协议族数组net_families[*]中查找匹配的
// Use family as an index into the global net_families[] array; when the
// slot is empty, request the module named "net-pf-<family>".
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
// (The matching #ifdef CONFIG_MODULES is not part of this excerpt.)
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); // fetch the protocol family under RCU
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
关于net_families[*]的创建,详见另外一篇博客:点击打开链接
第三,通过匹配成功的协议族,调用协议族的create函数创建socket
// Let the protocol family set up the socket through its create hook.
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
假设现在的协议族类型为PF_INET,那么pf->create的函数指针指向inet_create
/* Protocol-family descriptor for PF_INET; its create hook is inet_create(). */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
/*
 * Create hook of the PF_INET family.
 *
 * Looks up the inetsw[] entry that matches sock->type and 'protocol'
 * (requesting protocol modules up to twice when nothing matches),
 * allocates a struct sock with sk_alloc(), initialises its generic and
 * inet-specific fields and finally calls the protocol's ->init() hook when
 * one is set.
 *
 * Returns 0 on success; otherwise -ESOCKTNOSUPPORT, -EPROTONOSUPPORT,
 * -EPERM, -ENOBUFS or the non-zero value returned by ->init().
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
if (unlikely(!inet_ehash_secret))
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
build_ehash_secret();
sock->state = SS_UNCONNECTED; // the new socket starts out unconnected
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
// Walk the inetsw[] list for this socket type, looking for an entry that matches 'protocol'.
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) { // protocol numbers are equal
if (protocol != IPPROTO_IP) // exact match on a concrete (non-wildcard) protocol
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) { // caller passed the wildcard protocol
protocol = answer->protocol; // adopt the protocol of this inetsw entry
break;
}
if (IPPROTO_IP == answer->protocol) // the entry itself is a wildcard: accept it and stop searching
break;
}
err = -EPROTONOSUPPORT;
}
if (unlikely(err)) { // no usable inetsw[] entry was found
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
// Raw sockets requested from user context need CAP_NET_RAW in the namespace's user_ns.
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
// Copy what is needed from the matched entry before leaving the RCU read section.
sock->ops = answer->ops; // protocol-specific socket operations (the article gives inet_stream_ops for TCP — confirm)
answer_prot = answer->prot; // protocol descriptor (the article gives tcp_prot for TCP — confirm)
answer_no_check = answer->no_check; // copied into sk->sk_no_check below
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(answer_prot->slab == NULL);
err = -ENOBUFS;
// Allocate the struct sock for this socket.
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
// View the sock as an inet_sock.
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
// Initialise the generic sock fields (queues, buffer sizes, callbacks, ...).
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk); // protocol's hash hook (the article gives inet_hash for TCP — confirm)
}
// Run the protocol's own initialisation, if it provides one.
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk); // the article gives tcp_v4_init_sock for TCP — confirm
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
在inet_create函数内部主要完成以下:
第一,设置socket的状态为未连接
sock->state = SS_UNCONNECTED; // the new socket starts out unconnected
第二,协议类型的判定
// Walk the inetsw[] list for this socket type, looking for an entry that matches 'protocol'.
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) { // protocol numbers are equal
if (protocol != IPPROTO_IP) // exact match on a concrete (non-wildcard) protocol
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) { // caller passed the wildcard protocol
protocol = answer->protocol; // adopt the protocol of this inetsw entry
break;
}
if (IPPROTO_IP == answer->protocol) // the entry itself is a wildcard: accept it and stop searching
break;
}
err = -EPROTONOSUPPORT;
}
第三,动态申请一个sock
// Allocate the struct sock for this socket.
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;
/*
 * Allocate a struct sock for protocol 'prot' and attach it to 'net'.
 *
 * net:      network namespace stored in the sock via sock_net_set()
 * family:   stored in sk->sk_family
 * priority: allocation flags; __GFP_ZERO is always added
 * prot:     stored in both sk->sk_prot and sk->sk_prot_creator
 *
 * Returns the new sock, or whatever sk_prot_alloc() returned when that is
 * a false (NULL) value.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;
// Allocate the sock.
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family; // record the protocol family
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot; // record the protocol descriptor
sock_lock_init(sk);
// Store the namespace returned by get_net(net) in the sock.
// NOTE(review): get_net() presumably takes a reference on the namespace
// (not "increments the number of namespaces") — confirm.
sock_net_set(sk, get_net(net));
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
sock_update_netprioidx(sk);
}
return sk;
}
EXPORT_SYMBOL(sk_alloc);
在该函数内部,net为网络命名空间,get_net(net)将该命名空间的引用计数加1,然后在函数sock_net_set(...)内部将sk->sk_net指向net命名空间,最后返回sk,这样就将命名空间与sock关联起来了。这里的net命名空间对同一命名空间内的所有socket而言相当于一个共享的全局对象。
第四,通过sock获取网络层inet_sock
// View the sock as an inet_sock.
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
第五,sock参数初始化
// Initialise the generic sock fields (queues, buffer sizes, callbacks, ...).
sock_init_data(sock, sk);
/*
 * Initialise the generic fields of 'sk' and, when 'sock' is non-NULL,
 * link the two objects together (sk->sk_type, sk->sk_wq, sock->sk).
 *
 * Leaves the sock in state TCP_CLOSE with the SOCK_ZAPPED flag set, the
 * default callbacks installed, sk_refcnt == 1 and sk_drops == 0.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
skb_queue_head_init(&sk->sk_receive_queue); // receive queue
skb_queue_head_init(&sk->sk_write_queue); // write queue
skb_queue_head_init(&sk->sk_error_queue); // error queue
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
sk->sk_send_head = NULL;
init_timer(&sk->sk_timer); // initialise the sock timer
sk->sk_allocation = GFP_KERNEL;
sk->sk_rcvbuf = sysctl_rmem_default; // default receive buffer size (the article cites 256*256 — confirm)
sk->sk_sndbuf = sysctl_wmem_default;
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
} else
sk->sk_wq = NULL;
spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
// Default callbacks.
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
sk->sk_peek_off = -1;
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = ktime_set(-1L, 0);
sk->sk_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
atomic_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
第六,sock inet_hash初始化(重要!)
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk); // protocol's hash hook (the article gives inet_hash for TCP — confirm)
/*
 * Hash 'sk' through __inet_hash() with bottom halves disabled.
 * A sock whose state is TCP_CLOSE is left untouched.
 */
void inet_hash(struct sock *sk)
{
	if (sk->sk_state == TCP_CLOSE)
		return;

	local_bh_disable();
	__inet_hash(sk);
	local_bh_enable();
}
/*
 * Insert 'sk' into its protocol's hash tables.
 *
 * A sock that is not in TCP_LISTEN state is handed to
 * __inet_hash_nolisten(); a listening sock is added to the listening hash
 * bucket chosen by inet_sk_listen_hashfn(), under that bucket's lock.
 */
static void __inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; // hash tables of the owning protocol (the article names tcp_hashinfo for TCP — confirm)
struct inet_listen_hashbucket *ilb;
/* Socket is not in the listening state */
if (sk->sk_state != TCP_LISTEN) {
__inet_hash_nolisten(sk, NULL); /* per the article: the path for established connections — confirm */
return;
}
WARN_ON(!sk_unhashed(sk));
/* Pick the listening hash bucket selected by inet_sk_listen_hashfn(sk) */
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock(&ilb->lock);
/* Add the sock to that bucket's list and account it as in use */
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
}
第七,sock 传输层协议初始化(重要!)
err = sk->sk_prot->init(sk); // protocol's init hook (the article gives tcp_v4_init_sock for TCP — confirm)
/*
 * Per-sock initialisation for TCP over IPv4.
 *
 * Runs tcp_init_sock(), then installs ipv4_specific as the connection
 * sock's address-family operations (and, with CONFIG_TCP_MD5SIG,
 * tcp_sock_ipv4_specific as the TCP sock's af_specific). Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *conn;

	conn = inet_csk(sk);

	tcp_init_sock(sk);

	/* Hooks used between TCP and the IPv4 layer. */
	conn->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
/* Address-family specific operations installed as icsk->icsk_af_ops by tcp_v4_init_sock(). */
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit, // transmit hook, provided by the IP layer
.send_check = tcp_v4_send_check, // TCP/IPv4 send check hook
.rebuild_header = inet_sk_rebuild_header, // rebuild-header hook
.sk_rx_dst_set = inet_sk_rx_dst_set, // sets the receive destination on the sock
.conn_request = tcp_v4_conn_request, // TCP/IPv4 connection request hook
.syn_recv_sock = tcp_v4_syn_recv_sock, // TCP/IPv4 SYN-received sock hook
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt, // IP-level setsockopt
.getsockopt = ip_getsockopt, // IP-level getsockopt
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
.bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt, // compat variants, built only with CONFIG_COMPAT
.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);
顾名思义,上面const struct inet_connection_sock_af_ops ipv4_specific结构体内部成员描述了TCP与IP协议层之间的接口,该结构体的内部成员非常重要,后续会在connect()\bind()\recv()\send()详解!
第八,最终返回新建的 sock。
5. sock_map_fd文件描述符映射
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); // bind the socket to a file descriptor
/*
 * Attach 'sock' to a fresh file descriptor.
 *
 * Reserves an unused descriptor, wraps the socket in a struct file via
 * sock_alloc_file() and installs that file in the descriptor slot.
 *
 * Returns the descriptor, or a negative error. When sock_alloc_file()
 * fails, the reserved descriptor is handed back with put_unused_fd().
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *filp;
	int fd;

	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	filp = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(filp))) {
		/* No file to install: give the descriptor back. */
		put_unused_fd(fd);
		return PTR_ERR(filp);
	}

	fd_install(fd, filp);
	return fd;
}
在sock_map_fd函数内部主要完成以下几部分:
第一,获取一个未被使用的文件描述符fd
int fd = get_unused_fd_flags(flags); // reserve an unused file descriptor
/*
 * Reserve an unused descriptor in the current task's file table.
 *
 * Forwards to __alloc_fd() with a search start of 0 and
 * rlimit(RLIMIT_NOFILE) as the exclusive upper bound; returns its result.
 */
int get_unused_fd_flags(unsigned flags)
{
	struct files_struct *files = current->files;
	unsigned limit = rlimit(RLIMIT_NOFILE);

	return __alloc_fd(files, 0, limit, flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);
/*
 * Allocate a file descriptor in 'files'.
 *
 * Starts from the larger of 'start' and the files->next_fd hint, searches
 * the open_fds bitmap for a clear bit, fails with -EMFILE when the
 * candidate is >= end, then marks the descriptor open and sets or clears
 * its close-on-exec bit according to O_CLOEXEC in 'flags'.
 *
 * The whole operation runs under files->file_lock. Returns the descriptor
 * or a negative error.
 */
int __alloc_fd(struct files_struct *files,
unsigned start, unsigned end, unsigned flags)
{
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files); // fd table currently attached to 'files'
fd = start;
if (fd < files->next_fd)
fd = files->next_fd;
if (fd < fdt->max_fds)
fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); // first clear bit at or after fd
/*
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
if (fd >= end)
goto out;
error = expand_files(files, fd); // make room for 'fd': <0 is an error, >0 means retry (see below)
if (error < 0)
goto out;
/*
* If we needed to expand the fs array we
* might have blocked - try again.
*/
if (error)
goto repeat;
if (start <= files->next_fd)
files->next_fd = fd + 1; // advance the next_fd hint
__set_open_fd(fd, fdt); // mark fd as open
if (flags & O_CLOEXEC) // close-on-exec requested: the descriptor is closed on exec
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd;
#if 1
/* Sanity check */
if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out:
spin_unlock(&files->file_lock);
return error;
}
在__alloc_fd函数内部主要是动态分配一个描述符fd,但里面涉及的知识其实挺广的,关于fd的具体分配,详见:
点击打开链接
第二,sock动态分配一个file结构体
// Allocate the struct file for the socket.
newfile = sock_alloc_file(sock, flags, NULL);
/*
 * Create the struct file that represents 'sock'.
 *
 * The dentry name is 'dname' when given, otherwise the name of the
 * protocol that created sock->sk, or "" when the socket has no sk yet.
 * On success sock->file points at the new file and file->private_data
 * points back at the socket. Installing the file into a descriptor slot is
 * not done here.
 *
 * Returns the new file, ERR_PTR(-ENOMEM) when no dentry could be
 * allocated, or the error pointer returned by alloc_file().
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
struct qstr name = { .name = "" };
struct path path;
struct file *file;
if (dname) { // sock_map_fd() passes dname == NULL, so this branch is skipped on that path
name.name = dname;
name.len = strlen(name.name);
} else if (sock->sk) {
name.name = sock->sk->sk_prot_creator->name; // protocol name (the article gives "TCP" as an example)
name.len = strlen(name.name);
}
// Allocate a pseudo dentry on sock_mnt's superblock.
path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
if (unlikely(!path.dentry))
return ERR_PTR(-ENOMEM);
path.mnt = mntget(sock_mnt);
d_instantiate(path.dentry, SOCK_INODE(sock));
SOCK_INODE(sock)->i_fop = &socket_file_ops; // install the socket file operations on the inode
// Allocate the struct file.
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
&socket_file_ops);
if (unlikely(IS_ERR(file))) {
/* drop dentry, keep inode */
ihold(path.dentry->d_inode);
path_put(&path);
return file;
}
sock->file = file;
file->f_flags = O_RDWR | (flags & O_NONBLOCK);
file->private_data = sock; // back-pointer from the file to its socket
return file;
}
EXPORT_SYMBOL(sock_alloc_file);
在sock_alloc_file该函数内部会动态分配一个文件,如下
// Allocate the struct file.
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,&socket_file_ops);
其中有一个重要的file结构体绑定,其对应文件操作句柄,在编写驱动的时候,这个结构体是最熟悉不过的了
/*
 * File operations for socket files: passed to alloc_file() and stored in
 * the socket inode's i_fop by sock_alloc_file().
 */
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read, // asynchronous read
.aio_write = sock_aio_write,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl, // ioctl entry point for sockets
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
};
最后在sock_alloc_file内部进行绑定,这个是不是很熟悉,哈哈!
file->private_data = sock; // back-pointer from the file to its socket; this is what ties the file to the socket
第三,最后根据获取到的fd和file进行安装
fd_install(fd, newfile); // install struct file *newfile into descriptor slot fd
/*
 * Install 'file' into descriptor slot 'fd' of the current task's file
 * table by delegating to __fd_install().
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;

	__fd_install(files, fd, file);
}
/*
* Install a file pointer in the fd array.
*
* The VFS is full of places where we drop the files lock between
* setting the open_fds bitmap and installing the file in the file
* array. At any such point, we are vulnerable to a dup2() race
* installing a file in the array before us. We need to detect this and
* fput() the struct file we are about to overwrite in this case.
*
* It should never happen - if we allow dup2() do it, _really_ bad things
* will follow.
*
* NOTE: __fd_install() variant is really, really low-level; don't
* use it unless you are forced to by truly lousy API shoved down
* your throat. 'files' *MUST* be either current->files or obtained
* by get_files_struct(current) done by whoever had given it to you,
* or really bad things will happen. Normally you want to use
* fd_install() instead.
*/
void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
// The slot update is done under files->file_lock.
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
// The slot must still be empty; a non-NULL entry is treated as a fatal bug.
BUG_ON(fdt->fd[fd] != NULL);
// Publish the file pointer in slot 'fd'.
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
}
这个函数的注解很详细,这里不再赘述!
6. 总结
本篇博客主要针对socket套接字创建进行源码流程分析,通过协议族、协议类型、协议创建socket套接字,然后通过创建成功的套接字去分配一个文件描述符fd,这里涉及到两个重要的结构体:
第一,当前进程任务结构体 struct task_struct,在套接字创建里面主要涉及到该结构体成员nsproxy(通过current->nsproxy->net_ns获取当前进程的网络命名空间)
第二, 命名空间struct nsproxy,个人认为,命名空间相当于一个全局变量,在socket创建时通过sk_alloc中的sock_net_set(...)与sock关联,然后在bind,recv,send都能访问该命名空间,所以相当于一个全局变量;
最后返回文件句柄fd给应用层。