linux 服务器、客户端socket(3)

本篇博客主要记录socket创建时的流程,其它socket接口API详见:点击打开链接

1. 应用层创建socket套接字

int socket(int domain, int type, int protocol);

参数说明:

domain:协议域,又称协议族(family)。常用的协议族有AF_INET、AF_INET6、AF_LOCAL(或称AF_UNIX,Unix域Socket)、AF_ROUTE等。协议族决定了socket的地址类型,在通信中必须采用对应的地址,如AF_INET决定了要用ipv4地址(32位的)与端口号(16位的)的组合、AF_UNIX决定了要用一个绝对路径名作为地址;
type:指定Socket类型。常用的socket类型有SOCK_STREAM、SOCK_DGRAM、SOCK_RAW、SOCK_PACKET、SOCK_SEQPACKET等。流式Socket(SOCK_STREAM)是一种面向连接的Socket,针对于面向连接的TCP服务应用。数据报式Socket(SOCK_DGRAM)是一种无连接的Socket,对应于无连接的UDP服务应用;

protocol:指定协议。常用协议有IPPROTO_TCP、IPPROTO_UDP、IPPROTO_SCTP、IPPROTO_TIPC等,分别对应TCP传输协议、UDP传输协议、SCTP传输协议、TIPC传输协议;

2. socket创建时的套接字堆栈信息

CPU: 0 PID: 472 Comm: init Not tainted 3.10.32 #216
Backtrace: 
[<c0012df8>] (dump_backtrace+0x0/0x10c) from [<c0012f1c>] (show_stack+0x18/0x1c)
 r7:00000000 r6:00000001 r5:00000002 r4:00000000
[<c0012f04>] (show_stack+0x0/0x1c) from [<c03244bc>] (dump_stack+0x20/0x2c)
[<c032449c>] (dump_stack+0x0/0x2c) from [<c029e3f4>] (SyS_socket+0x28/0xcc)
[<c029e3cc>] (SyS_socket+0x0/0xcc) from [<c000f7c0>] (ret_fast_syscall+0x0/0x2c)
 r8:c000f948 r7:00000119 r6:b6fdc33c r5:00000002 r4:ffffffff

3. socket内核源码分析

根据第2点的堆栈信息分析其对应的源码如下:

/*
 * socket(2) system call entry point.
 *
 * Separates the creation flags from the socket type, creates the socket
 * with sock_create() and then binds it to a file descriptor with
 * sock_map_fd().
 *
 * Returns the new descriptor on success or a negative errno value.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int fd;
	int err;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/*
	 * Bits outside SOCK_TYPE_MASK are creation flags. Only
	 * SOCK_CLOEXEC (close the descriptor on exec) and SOCK_NONBLOCK
	 * (non-blocking mode) are accepted; anything else is rejected.
	 * Usage example: http://blog.chinaunix.net/uid-24907956-id-3969651.html
	 */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Rewrite SOCK_NONBLOCK as O_NONBLOCK when the two values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Step 1: create the socket object. */
	err = sock_create(family, type, protocol, &sock);
	if (err < 0)
		return err;

	/* Step 2: map the socket to a descriptor; drop the socket on failure. */
	fd = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (fd < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return fd;
}

在SYSCALL_DEFINE3这个函数内部主要完成两个工作:

第一,socket创建 sock_create(...);

第二,socket创建之后通过sock_map_fd映射对应的fd并返回给应用程序。

接下来,逐个分析!

4. 内核socket创建

retval = sock_create(family, type, protocol, &sock); //创建socket
/*
 * Create a socket in the network namespace of the calling task.
 *
 * Thin wrapper around __sock_create() that passes 0 for its last
 * (kern) argument.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

其中current->nsproxy->net_ns是网络命名空间,详见:点击打开链接

/*
 * Create a socket of the given family/type/protocol in namespace @net.
 *
 * On success the new socket is stored in *res and 0 is returned.
 * On failure a negative errno is returned and the socket allocated
 * here (if any) is released again. @kern is handed through to the
 * security hooks and to the family's ->create() callback.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO) // address family range check
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX) // socket type range check
		return -EINVAL;

	/* Compatibility.

	   This ugly moron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern); // security hook; a non-zero result aborts creation
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc(); // allocate the socket object
	if (!sock) {
		net_warn_ratelimited("socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type; // record the requested socket type

#ifdef CONFIG_MODULES
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	// Index the global net_families[] array by family; if the slot is
	// empty, ask for the matching "net-pf-N" module to be loaded.
	// Family registration is covered in: https://blog.csdn.net/chenliang0224/article/details/80330756
	if (rcu_access_pointer(net_families[family]) == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]); // fetch the family entry under RCU
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	// let the address family build the protocol-specific part of the socket
	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

在该函数__sock_create内部,主要完成以下几个工作,

第一,动态分配一个socket

/*
 * Allocate a new socket together with its inode.
 *
 * Returns the socket obtained from the fresh inode via SOCKET_I(), or
 * NULL when no inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
	struct inode *node = new_inode_pseudo(sock_mnt->mnt_sb);
	struct socket *new_sock;

	if (!node)
		return NULL;

	/* The socket is reached from its inode. */
	new_sock = SOCKET_I(node);

	kmemcheck_annotate_bitfield(new_sock, type);

	/* Fill in the inode: number, mode, owner and inode operations. */
	node->i_ino = get_next_ino();
	node->i_mode = S_IFSOCK | S_IRWXUGO;
	node->i_uid = current_fsuid();
	node->i_gid = current_fsgid();
	node->i_op = &sockfs_inode_ops;

	/* Account for the new socket in the per-CPU usage counter. */
	this_cpu_add(sockets_in_use, 1);
	return new_sock;
}

其中socket节点操作句柄为

/*
 * Inode operations installed on every socket inode by sock_alloc();
 * only the extended-attribute get/list handlers are provided.
 */
static const struct inode_operations sockfs_inode_ops = {
	.getxattr = sockfs_getxattr,
	.listxattr = sockfs_listxattr,
};

第二,根据协议族family,在全局协议族数组net_families[*]中查找匹配的

	//通过family索引,从全局协议族数组net_families[]中查找有效的;
	if (rcu_access_pointer(net_families[family]) == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]); //rcu的方式获取协议族
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

关于net_families[*]的创建,详见另外一篇博客:点击打开链接

第三,通过匹配成功的协议族,调用协议族的create函数创建socket

	//调用协议族的函数create socket
	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

假设现在的协议族类型为PF_INET,那么pf->create的函数指针指向inet_create

/*
 * Address-family descriptor for PF_INET: its ->create callback is
 * inet_create().
 */
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};
/*
 * PF_INET ->create callback: build the struct sock behind @sock.
 *
 * Looks up the (sock->type, protocol) pair in inetsw[], requesting a
 * protocol module and retrying up to two times if nothing matches,
 * then allocates and initialises the struct sock and finally runs the
 * protocol's own ->init hook if it has one.
 *
 * Returns 0 on success or a negative errno value.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	if (unlikely(!inet_ehash_secret))
		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
			build_ehash_secret();

	sock->state = SS_UNCONNECTED; // a new socket starts out unconnected

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	// walk the inetsw[] list for this socket type looking for a protocol entry
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) { // same protocol number
			if (protocol != IPPROTO_IP) // and not the wildcard value: done
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) { // caller passed the wildcard
				protocol = answer->protocol; // adopt this entry's protocol
				break;
			}
			if (IPPROTO_IP == answer->protocol) // the entry itself is a wildcard: accept it
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	if (unlikely(err)) { // no entry matched (err is still non-zero)
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	sock->ops = answer->ops; // protocol-specific socket ops (original note: inet_stream_ops for TCP)
	answer_prot = answer->prot; // transport protocol descriptor (original note: tcp_prot for TCP)
	answer_no_check = answer->no_check; // copied into sk->sk_no_check below
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(answer_prot->slab == NULL);

	err = -ENOBUFS;
	// allocate the struct sock that backs this socket
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	// get the inet_sock view of sk
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	// generic sock initialisation (queues, timer, buffer sizes, callbacks)
	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
	inet->rcv_tos	= 0;

	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num); 
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk); // protocol's hash hook (original note: inet_hash)
	}

	// run the transport protocol's own initialiser, if it has one
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk); // original note: tcp_v4_init_sock for TCP
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

在inet_create函数内部主要完成以下:

第一,设置socket的状态为未连接

sock->state = SS_UNCONNECTED; //设置socket的状态为未连接

第二,协议类型的判定

	//根据sock->type协议类型,从inetsw[]链表头中获取一个网络层协议
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) { //协议匹配
			if (protocol != IPPROTO_IP) //非虚拟协议
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) { //虚拟协议
				protocol = answer->protocol; //将inetsw中的协议强制赋值给protocol
				break;
			}
			if (IPPROTO_IP == answer->protocol) //answer->protocol中的协议为虚拟就直接跳出,因为检索就没有意义了
				break;
		}
		err = -EPROTONOSUPPORT;
	}

第三,动态申请一个sock

	//动态申请一个网络层表示的套接字sock
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;
/*
 * Allocate a zero-filled struct sock for @family, driven by @prot and
 * attached to network namespace @net.
 *
 * Returns the new sock, or NULL when sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	if (!newsk)
		return NULL;

	newsk->sk_family = family;
	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	newsk->sk_prot_creator = prot;
	newsk->sk_prot = prot;
	sock_lock_init(newsk);
	/*
	 * Attach the sock to its namespace; get_net() is expected to take
	 * a reference on @net before sock_net_set() stores it.
	 */
	sock_net_set(newsk, get_net(net));
	atomic_set(&newsk->sk_wmem_alloc, 1);

	sock_update_classid(newsk);
	sock_update_netprioidx(newsk);

	return newsk;
}
EXPORT_SYMBOL(sk_alloc);

在该函数内部,net为网络命名空间,get_net(net)将该命名空间的引用计数加1,然后在函数sock_net_set(...)内部将sk->sk_net指向net命名空间,这里的net命名空间相当于一个全局变量,最后返回sk,这样就将命名空间与sock关联起来了。

第四,通过sock获取网络层inet_sock

//通过sock获取inet_sock
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

第五,sock参数初始化

//sock参数初始化(包括发送、接收、错误队列,以及内存空间)
sock_init_data(sock, sk);
/*
 * Generic initialisation of a freshly allocated struct sock.
 *
 * Sets up the queues, timer, buffer sizes, locks and default callbacks
 * of @sk. When @sock is non-NULL the two objects are linked to each
 * other (sk takes sock's type and wait queue, sock->sk points at sk).
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue); // receive queue
	skb_queue_head_init(&sk->sk_write_queue); // write queue
	skb_queue_head_init(&sk->sk_error_queue); // error queue
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer); // initialise the per-sock timer

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default; // default receive buffer size
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
	} else
		sk->sk_wq	=	NULL;

	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

	sk->sk_pacing_rate = ~0U;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

第六,sock inet_hash初始化(重要!)

inet->inet_sport = htons(inet->inet_num); 
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk); //调用传输层协议 inet_hash
/*
 * Insert @sk into the protocol hash tables with bottom halves
 * disabled. A socket still in TCP_CLOSE state is left alone.
 */
void inet_hash(struct sock *sk)
{
	if (sk->sk_state == TCP_CLOSE)
		return;

	local_bh_disable();
	__inet_hash(sk);
	local_bh_enable();
}
/*
 * Hash @sk according to its state: a listening socket goes onto the
 * head of its listening_hash bucket under the bucket lock, any other
 * socket is handed to __inet_hash_nolisten().
 */
static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *tbl = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *bucket;

	if (sk->sk_state == TCP_LISTEN) {
		WARN_ON(!sk_unhashed(sk));

		/* Select the listening bucket for this socket. */
		bucket = &tbl->listening_hash[inet_sk_listen_hashfn(sk)];

		spin_lock(&bucket->lock);
		/* Link the sock at the head of the bucket's list. */
		__sk_nulls_add_node_rcu(sk, &bucket->head);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		spin_unlock(&bucket->lock);
	} else {
		/* Not listening: use the non-listening hash path. */
		__inet_hash_nolisten(sk, NULL);
	}
}

第七,sock 传输层协议初始化(重要!)

err = sk->sk_prot->init(sk); //调用传输层协议   tcp_v4_init_sock
/*
 * Protocol init hook for IPv4 TCP sockets: performs the generic TCP
 * initialisation and then installs the IPv4-specific operation tables.
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *conn = inet_csk(sk);

	tcp_init_sock(sk);

	/* Address-family operations used by the connection-socket layer. */
	conn->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
/*
 * IPv4 address-family operations installed on TCP sockets by
 * tcp_v4_init_sock(). The per-field notes are reading aids based on
 * the handler names.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit, // IP-layer transmit entry point
	.send_check	   = tcp_v4_send_check, // TCP/IPv4 send-side check
	.rebuild_header	   = inet_sk_rebuild_header, // rebuild the header for an inet sock
	.sk_rx_dst_set	   = inet_sk_rx_dst_set, // set the receive destination on the sock
	.conn_request	   = tcp_v4_conn_request, // TCP/IPv4 connection request handler
	.syn_recv_sock	   = tcp_v4_syn_recv_sock, // TCP/IPv4 SYN-received socket handler
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt, // IP-level setsockopt
	.getsockopt	   = ip_getsockopt, // IP-level getsockopt
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt, // compat variants
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

顾名思义,上面const struct inet_connection_sock_af_ops ipv4_specific结构体内部成员描述了TCP与IP协议层之间的接口,该结构体的内部成员非常重要,后续会在connect()\bind()\recv()\send()详解!

第八,最终返回新建的 sock。

5. sock_map_fd文件描述符映射

retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); //套接字映射一个描述符
/*
 * Bind @sock to a new file descriptor.
 *
 * Reserves an unused descriptor, allocates a struct file for the
 * socket and installs that file in the descriptor slot. If the file
 * cannot be allocated the reserved descriptor is given back.
 *
 * Returns the descriptor on success or a negative errno value.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *filp;
	int fd;

	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	filp = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(filp))) {
		put_unused_fd(fd);
		return PTR_ERR(filp);
	}

	fd_install(fd, filp);
	return fd;
}

在sock_map_fd函数内部主要完成以下几部分:

第一,获取一个未被使用的文件描述符fd

int fd = get_unused_fd_flags(flags); //获取一个未被使用的描述符
/*
 * Reserve an unused descriptor in the calling task's file table,
 * searching from 0 up to the RLIMIT_NOFILE limit.
 */
int get_unused_fd_flags(unsigned flags)
{
	struct files_struct *files = current->files;

	return __alloc_fd(files, 0, rlimit(RLIMIT_NOFILE), flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);
/*
 * Reserve a free descriptor number in @files within [start, end).
 *
 * Marks the chosen descriptor as open and sets or clears its
 * close-on-exec bit according to O_CLOEXEC in @flags.
 *
 * Returns the descriptor, -EMFILE when none is available below @end,
 * or the negative error reported by expand_files().
 */
int __alloc_fd(struct files_struct *files,
	       unsigned start, unsigned end, unsigned flags)
{
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files); // current fd table of this files_struct
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); // first clear bit in open_fds at or after fd

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd); // negative: failure; positive: start over (see below)
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fs array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1; // advance the search hint past this fd

	__set_open_fd(fd, fdt); // mark the descriptor as in use
	if (flags & O_CLOEXEC) // close-on-exec requested: descriptor is closed on exec
		__set_close_on_exec(fd, fdt); 
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}
在__alloc_fd函数内部主要是动态分配一个描述符fd,但里面涉及的知识其实挺广的,关于fd的具体分配,详见: 点击打开链接

第二,sock动态分配一个file结构体

//sock动态分配文件
newfile = sock_alloc_file(sock, flags, NULL);
/*
 * Allocate a struct file for @sock.
 *
 * Creates a pseudo dentry on the socket mount, attaches the socket's
 * inode to it and allocates a read/write file using socket_file_ops.
 * The file and the socket are linked through sock->file and
 * file->private_data. Installing the file into a descriptor slot is
 * not done here.
 *
 * Returns the new file, or an ERR_PTR() value on failure.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
	struct qstr name = { .name = "" };
	struct path path;
	struct file *file;

	if (dname) { // sock_map_fd() passes NULL, so this branch is skipped on that path
		name.name = dname;
		name.len = strlen(name.name);
	} else if (sock->sk) {
		name.name = sock->sk->sk_prot_creator->name; // protocol name (original note: "TCP" for a TCP socket)
		name.len = strlen(name.name);
	}
	// allocate a pseudo dentry on the socket mount
	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
	if (unlikely(!path.dentry))
		return ERR_PTR(-ENOMEM);
	path.mnt = mntget(sock_mnt);

	d_instantiate(path.dentry, SOCK_INODE(sock));
	SOCK_INODE(sock)->i_fop = &socket_file_ops; // file operations used for socket files

	// allocate the struct file itself
	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
		  &socket_file_ops);
	if (unlikely(IS_ERR(file))) {
		/* drop dentry, keep inode */
		ihold(path.dentry->d_inode);
		path_put(&path);
		return file;
	}

	sock->file = file;
	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
	file->private_data = sock; // lets file operations get back to the socket
	return file;
}
EXPORT_SYMBOL(sock_alloc_file);

在sock_alloc_file该函数内部会动态分配一个文件,如下

//动态分配一个file
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,&socket_file_ops);

其中有一个重要的file结构体绑定,其对应文件操作句柄,在编写驱动的时候,这个结构体是最熟悉不过的了

/*
 * File operations attached to every socket file by sock_alloc_file().
 */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read, // asynchronous read
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl, // ioctl handler for sockets; see its source for details
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

最后在sock_alloc_file内部进行绑定,这个是不是很熟悉,哈哈!

file->private_data = sock; //绑定file私有数据为sock,这也是串联整个socket套接字的关键数据!

第三,最后根据获取到的fd和file进行安装

fd_install(fd, newfile); //fd安装struct file *newfile
/*
 * Install @file in descriptor slot @fd of the calling task's file
 * table.
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;

	__fd_install(files, fd, file);
}
/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * NOTE: __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat.  'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen.  Normally you want to use
 * fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
		struct file *file)
{
	struct fdtable *table;

	spin_lock(&files->file_lock);

	table = files_fdtable(files);
	/* The slot must still be empty; a populated slot is treated as a bug. */
	BUG_ON(table->fd[fd] != NULL);
	/* Publish the file pointer in the slot. */
	rcu_assign_pointer(table->fd[fd], file);

	spin_unlock(&files->file_lock);
}
这个函数的注解很详细,这里不再赘述!

6. 总结

本篇博客主要针对socket套接字创建进行源码流程分析,通过协议族、协议类型、协议创建socket套接字,然后通过创建成功的套接字去分配一个文件描述符fd,这里涉及到两个重要的结构体:

第一,当前进程任务结构体 struct task_struct,在套接字创建里面主要涉及到该结构体成员nsproxy(即current->nsproxy->net_ns)

第二, 命名空间struct nsproxy,个人认为,命名空间相当于一个全局变量,在socket里面创建,然后在bind,recv,send都能访问该命名空间,所以相当于一个全局变量;

最后返回文件句柄fd给应用层。






  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值