1、socket系统调用源码剖析
系统调用原型
int socket(int domain, int type, int protocol);
domain:表示协议族,在Linux系统中,有很多个协议族,比如IPv4的协议族为AF_INET,IPv6的协议族为AF_INET6,从字面意思上看,domain是int类型的,那它在内核层面是如何帮我们定位到特定协议族的代码呢?这个问题后面再谈
Linux下所有的协议族
Name Purpose
AF_UNIX, AF_LOCAL Local communication
AF_INET IPv4 Internet protocols
AF_INET6 IPv6 Internet protocols
AF_IPX IPX - Novell protocols
AF_NETLINK Kernel user interface device
AF_X25 ITU-T X.25 / ISO-8208 protocol
AF_AX25 Amateur radio AX.25 protocol
AF_ATMPVC Access to raw ATM PVCs
AF_APPLETALK Appletalk
AF_PACKET Low level packet interface
type:表示通信的语义,最常见的就是SOCK_STREAM和SOCK_DGRAM,除了这两个比较常见的之外,还有其他的套接字类型,可以自行使用man手册查看
protocol:表示一个传输层协议号,传输层的协议包括IPPROTO_TCP,IPPROTO_UDP;或者对于SOCK_RAW来讲,它是一个通配协议号IPPROTO_IP,用于通配网络层的附属协议ICMP,IGMP等。对于传输层协议来讲,IPPROTO_TCP对应的套接字类型总是SOCK_STREAM,IPPROTO_UDP对应的套接字类型总是SOCK_DGRAM,所以在socket系统调用时,可以不必指定协议号,而直接使用通配符IPPROTO_IP
比如创建一个tcp的套接字:int sockfd = socket(AF_INET, SOCK_STREAM, 0);
这里的0就表示默认使用tcp,不同的协议用不同的数字表示,在in.h中定义了内核中所有协议号
/*
 * IP protocol numbers that may be passed as the third argument of socket().
 * Every enumerator is additionally defined as a macro that expands to itself
 * (presumably so that the names can be probed with #ifdef - confirm against
 * the header's own notes).
 */
enum {
IPPROTO_IP = 0, /* Dummy protocol for TCP */
#define IPPROTO_IP IPPROTO_IP
IPPROTO_ICMP = 1, /* Internet Control Message Protocol */
#define IPPROTO_ICMP IPPROTO_ICMP
IPPROTO_IGMP = 2, /* Internet Group Management Protocol */
#define IPPROTO_IGMP IPPROTO_IGMP
IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */
#define IPPROTO_IPIP IPPROTO_IPIP
IPPROTO_TCP = 6, /* Transmission Control Protocol */
#define IPPROTO_TCP IPPROTO_TCP
IPPROTO_EGP = 8, /* Exterior Gateway Protocol */
#define IPPROTO_EGP IPPROTO_EGP
IPPROTO_PUP = 12, /* PUP protocol */
#define IPPROTO_PUP IPPROTO_PUP
IPPROTO_UDP = 17, /* User Datagram Protocol */
#define IPPROTO_UDP IPPROTO_UDP
IPPROTO_IDP = 22, /* XNS IDP protocol */
#define IPPROTO_IDP IPPROTO_IDP
IPPROTO_TP = 29, /* SO Transport Protocol Class 4 */
#define IPPROTO_TP IPPROTO_TP
IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */
#define IPPROTO_DCCP IPPROTO_DCCP
IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */
#define IPPROTO_IPV6 IPPROTO_IPV6
IPPROTO_RSVP = 46, /* RSVP Protocol */
#define IPPROTO_RSVP IPPROTO_RSVP
IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */
#define IPPROTO_GRE IPPROTO_GRE
IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */
#define IPPROTO_ESP IPPROTO_ESP
IPPROTO_AH = 51, /* Authentication Header protocol */
#define IPPROTO_AH IPPROTO_AH
IPPROTO_MTP = 92, /* Multicast Transport Protocol */
#define IPPROTO_MTP IPPROTO_MTP
IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */
#define IPPROTO_BEETPH IPPROTO_BEETPH
IPPROTO_ENCAP = 98, /* Encapsulation Header */
#define IPPROTO_ENCAP IPPROTO_ENCAP
IPPROTO_PIM = 103, /* Protocol Independent Multicast */
#define IPPROTO_PIM IPPROTO_PIM
IPPROTO_COMP = 108, /* Compression Header Protocol */
#define IPPROTO_COMP IPPROTO_COMP
IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */
#define IPPROTO_SCTP IPPROTO_SCTP
IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */
#define IPPROTO_UDPLITE IPPROTO_UDPLITE
IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */
#define IPPROTO_MPLS IPPROTO_MPLS
IPPROTO_RAW = 255, /* Raw IP packets */
#define IPPROTO_RAW IPPROTO_RAW
/* No explicit value: follows IPPROTO_RAW, i.e. 256. inet_create() uses it as the exclusive upper bound for protocol. */
IPPROTO_MAX
};
2、struct socket 是如何创建的
socket系统调用源码:
/*
 * socket(2) system call entry point.
 *
 * Splits the flag bits out of @type, creates the struct socket through
 * sock_create() and attaches it to a file descriptor through sock_map_fd().
 *
 * Returns the new file descriptor on success, -EINVAL for unknown flag
 * bits, or the negative error reported by sock_create()/sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
int retval;// return value
struct socket *sock;// struct socket is the interface offered to the upper layer and is tied to the file system
// a second structure, struct sock, shows up later; it faces downwards, towards the protocol stack
int flags;
// compile-time sanity checks on the flag and type constants
BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
// everything in type outside SOCK_TYPE_MASK is a flag; only SOCK_CLOEXEC and SOCK_NONBLOCK are accepted
flags = type & ~SOCK_TYPE_MASK;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
type &= SOCK_TYPE_MASK;
// where the two constants differ, replace SOCK_NONBLOCK by O_NONBLOCK before the flags reach the file layer
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
// create the socket; this links the upper-level struct socket with the lower-level struct sock
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
// on success retval becomes the file descriptor and execution falls through to out
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
// no descriptor could be attached: drop the socket created above
sock_release(sock);
return retval;
}
定义好相关的结构,做完相应的检测,接着会调用retval = sock_create(family, type, protocol, &sock);
/*
 * sock_create - create a socket in the calling task's network namespace.
 * @family:   protocol family (first argument of socket())
 * @type:     socket type with the flag bits already stripped
 * @protocol: protocol number (third argument of socket())
 * @res:      receives the new struct socket on success
 *
 * Thin wrapper around __sock_create() that supplies the namespace taken from
 * current->nsproxy and passes 0 for the kern argument.
 *
 * Returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}
接着进一步调用
/*
 * __sock_create - allocate a struct socket and let the protocol family
 * initialise it.
 * @net:      network namespace handed to the family's create() hook
 * @family:   index into net_families[]
 * @type:     socket type, stored in sock->type
 * @protocol: protocol number handed to the family's create() hook
 * @res:      receives the new socket on success
 * @kern:     forwarded to the security hooks and to create()
 *
 * Returns 0 on success. Failures: -EAFNOSUPPORT (family out of range, not
 * registered, or a module reference could not be taken), -EINVAL (type out
 * of range), -ENFILE (sock_alloc() failed), or the error returned by a
 * security hook or by pf->create().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
// struct net_proto_family is the per-family management type; it is responsible for creating sockets of that family
/**
struct net_proto_family {
int family;
int (*create)(struct net *net, struct socket *sock,
int protocol, int kern);
struct module *owner;
};
*/
// reject family/type values outside the supported range
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
// compatibility path (not important here): the obsolete (PF_INET, SOCK_PACKET) pair is redirected to PF_PACKET
if (family == PF_INET && type == SOCK_PACKET) {
pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
sock = sock_alloc();// allocate the struct socket
// bail out if the allocation failed
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;// record the type passed in by the application
#ifdef CONFIG_MODULES
// nothing registered for this family yet: ask for the "net-pf-<family>" module
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();// RCU read-side section protecting the net_families[] lookup
pf = rcu_dereference(net_families[family]);// fetch the registered family handler (a pointer, not a copy)
/*static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
All family handlers live in this array; family is an int, so using it as the
index yields the handler of the requested family.
*/
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
// take a reference on the family's module before leaving the RCU section
if (!try_module_get(pf->owner))
goto out_release;
rcu_read_unlock();
// pf is the family we want; call its socket-creation function
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
// take a reference on the owner of sock->ops, then drop the one held on pf->owner
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
// error unwinding: each label falls through into the next one
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
// reached with the RCU read lock still held
rcu_read_unlock();
goto out_sock_release;
}
struct net_proto_family管理不同协议族套接字的创建方法,其中create指针指向具体协议族套接字的创建函数。在include/linux/socket.h文件中,内核用整数定义这些协议族。在初始化时,Linux系统支持的协议族被注册到数组static struct net_proto_family *net_families中。以下为内核中部分协议族的定义:
比如INET协议族初始化时,函数inet_init调用sock_register来注册INET套接字的创建方法
协议族被sock_register函数(位于net\socket.c中)注册到net_families中:
/*
 * sock_register - publish a protocol family handler in net_families[].
 * @ops: handler to register; ops->family selects the slot
 *
 * Returns 0 on success, -ENOBUFS when ops->family >= NPROTO, or -EEXIST when
 * the slot is already occupied.
 */
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
return -ENOBUFS;
}
// net_family_lock serialises writers of net_families[]
spin_lock(&net_family_lock);
// the family is registered into net_families unless the slot is already taken
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);
// NOTE: this message is printed on the -EEXIST path as well
pr_info("NET: Registered protocol family %d\n", ops->family);
return err;
}
inet_init函数则在\net\ipv4\af_inet.c下:
/*
 * inet_init - INET initialisation (excerpt; the "..." lines stand for code
 * that is not shown). The part of interest registers inet_family_ops through
 * sock_register(); the return value is explicitly discarded.
 */
static int __init inet_init(void)
{
...
(void)sock_register(&inet_family_ops);
...
}
而 inet_family_ops 则是定义在af_inet.c下的静态变量,并且已经进行了相应的初始化
/* Family handler for PF_INET; this is the object passed to sock_register() by inet_init(). */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET, // identifies the INET protocol family
.create = inet_create, // socket-creation function of the INET family
.owner = THIS_MODULE,
};
这里的逻辑就是在af_inet.c中,将INET协议族(net_proto_family)定义好,再调用sock_register函数,将其注册到net_families数组中;之后以上层传入的family作为下标访问net_families数组,就能拿到对应的net_proto_family,从而调用对应的套接字的创建方法
针对同一个协议族,但却有着不同的type,那么在inet_create内部是如何正确的调用对应的套接字的创建方法呢?
在看inet_create之前,需要了解一下inet_protosw 结构体
/*
 * One (socket type, protocol) combination supported by the INET family,
 * together with the two operation sets used for sockets of that kind.
 */
struct inet_protosw {
struct list_head list;// doubly linked list node chaining the inet_protosw entries (each pairs a socket type with its protocol)
unsigned short type; // second argument of socket()
unsigned short protocol; // third argument of socket()
struct proto *prot; // concrete implementation of the transport-protocol functions
const struct proto_ops *ops; // set of functions shared by sockets of the same type
unsigned char flags; /* See INET_PROTOSW_* below. */
};
针对struct proto和struct proto_ops,得特别强调一下两者的关系:
proto是一个传输层协议绑定的操作集,比如对于IPPROTO_TCP,它就是tcp_prot,对于IPPROTO_UDP,它就是udp_prot。而对于类型为SOCK_RAW的套接字,它没有相应的传输层协议,而是用于通配所有的网络层附属协议,所以,prot就是所有网络层附属协议共用的一个操作集raw_prot。
proto_ops是套接字类型绑定的操作集,对应于SOCK_STREAM, SOCK_DGRAM, SOCK_RAW,操作集分别为inet_stream_ops,inet_dgram_ops,inet_sockraw_ops。
举个例子,例如针对于type,假设为SOCK_STREAM,它定义了一组通用的接口(函数),tcp的type为SOCK_STREAM,那它就必须要实现这些接口(函数),所以 struct proto 就是tcp中这些接口(函数)的具体实现,如果又有人实现了其他协议,那么则需要重新实现这些接口(函数)
在初始化时,内核已经将我们常用的inet_protosw 结构体(套接字类型和对应的协议)定义好了,放在了一个名为inetsw_array的数组中,一共有四组
/*
 * Built-in (type, protocol) combinations of the INET family. Each entry
 * pairs a socket type and protocol number with the protocol's struct proto
 * and the proto_ops used for that socket type.
 */
static struct inet_protosw inetsw_array[] =
{
// this entry describes TCP
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
// UDP
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
// ICMP over SOCK_DGRAM
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},
// raw IP; protocol 0 acts as a wildcard
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
通过函数inet_register_protosw将inetsw_array中的每个inet_protosw注册到inetsw数组中:inetsw以套接字类型(type)为下标,每个元素是一个链表头,相同type的inet_protosw通过list成员链接在同一条链表上
INET套接字的创建方法inet_create也定义在af_inet.c下
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer; //内部封装了套接字类型和对应的协议,以及操作套接字的相关函数
struct inet_sock *inet; //struct inet_sock是对struct sock进一步封装
struct proto *answer_prot; //传输层协议对应函数的具体实现
unsigned char answer_flags;
int try_loading_module = 0;
int err;
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
sock->state = SS_UNCONNECTED;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
//list_for_each_entry_rcu一个宏定义,主要是用于遍历,获取inet_protosw数组中的每个元素,并赋值给answer,
//并且判断answer中protocol和上层传入的protocol是否一致,如果一致,则说明找到了对应的协议
list_for_each_entry_rcu(answer, &itsw[sock->type], list) {
err = 0;
//如果上层传入的protocl是0,inet_protosw数组的第一个元素的protocol是6(IPPROTO_TCP)
//那么就会进入else
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else { //IPPROTO_IP和protocol相等,都为0,那么就会将inet_protosw数组的第一个元素的protocol是6赋值给上层传入的protocol,此时protocol就为6了,表示IPPROTO_TCP
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}
if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
//找到正确的inet_protosw,也就是answer,就将最开始定义的变量进行赋值
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
//调用sk_alloc申请struct sock
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
err = 0;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
//用struct sock(sk) 初始化 struct socket(sock),并将它们关联起来
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
inet->inet_sport = htons(inet->inet_num);
err = sk->sk_prot->hash(sk);
if (err) {
sk_common_release(sk);
goto out;
}
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
sk_alloc函数
/*
 * sk_alloc - allocate and minimally initialise a struct sock.
 * @net:      network namespace the sock is attached to
 * @family:   protocol family, stored in sk_family
 * @priority: allocation flags; __GFP_ZERO is always added
 * @prot:     struct proto the sock is created from
 * @kern:     when non-zero the sock does not take a reference on @net
 *
 * Returns the new sock, or NULL when sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *newsk;

	/* Allocate the struct sock. */
	newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (!newsk)
		return NULL;

	/* Allocation succeeded: fill in the basic fields. */
	newsk->sk_family = family;
	newsk->sk_prot_creator = prot;
	newsk->sk_prot = prot;
	sock_lock_init(newsk);

	newsk->sk_net_refcnt = kern ? 0 : 1;
	if (likely(newsk->sk_net_refcnt))
		get_net(net);
	sock_net_set(newsk, net);
	atomic_set(&newsk->sk_wmem_alloc, 1);

	mem_cgroup_sk_alloc(newsk);
	cgroup_sk_alloc(&newsk->sk_cgrp_data);
	sock_update_classid(&newsk->sk_cgrp_data);
	sock_update_netprioidx(&newsk->sk_cgrp_data);

	return newsk;
}
/*
 * sock_init_data - excerpt showing only how the two objects are linked (the
 * "..." lines stand for code that is not shown).
 * @sock: upper-level socket; the visible code tolerates NULL
 * @sk:   lower-level sock being initialised
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
...
sk_set_socket(sk, sock);// sk->sk_socket = sock;
if (sock) {
// copy the type and wait queue from the socket, then point the socket back at sk
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
}
...
}
// Upward-facing object: provides the interface used by user space and is tied to the file system.
// (Excerpt; "..." stands for members that are not shown.)
struct socket {
...
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
};
// Downward-facing object: connects to the kernel network protocol stack.
// (Excerpt; "..." stands for members that are not shown.)
struct sock {
struct socket *sk_socket;
...
};
//通过sk->sk_socket = sock;和sock->sk = sk; 就将两者链接了起来
3、struct socket 和文件的联系
通过调用sock_create,将上层的struct socket 和下层的 struct sock 链接了起来,接着还会调用sock_map_fd,将struct socket 和文件系统关联在一起,因为我们在上层创建socket的时候,返回的是文件描述符(fd),后续进行所有的操作都是对fd进行操作的
/*
 * sock_map_fd - attach a socket to a new file descriptor.
 * @sock:  socket to expose
 * @flags: passed to get_unused_fd_flags() and sock_alloc_file()
 *
 * Returns the file descriptor on success. On failure returns the negative
 * value from get_unused_fd_flags(), or PTR_ERR() of the failed file
 * allocation after giving the reserved descriptor back.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *filp;	/* file that will back the socket */
	int fd;

	/* Reserve an unused descriptor. */
	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	/* Allocate the file; this also links the socket and the file. */
	filp = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(filp))) {
		put_unused_fd(fd);
		return PTR_ERR(filp);
	}

	/* Bind the file to the descriptor. */
	fd_install(fd, filp);
	return fd;
}
sock_alloc_file函数
/*
 * sock_alloc_file - excerpt showing only the final linking step (the "..."
 * line stands for code that is not shown, including where file comes from).
 * Returns the file that now refers to @sock.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
...
// link the socket and the file in both directions
sock->file = file;
file->private_data = sock;
return file;
}
fd_install函数
/*
 * fd_install - install @file at descriptor @fd of the calling task.
 * Forwards to __fd_install() with current->files.
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;

	__fd_install(files, fd, file);
}
/*
 * __fd_install - store @file in slot @fd of @files' descriptor table.
 * @files: descriptor table owner
 * @fd:    slot index; the slot must currently be empty (BUG_ON otherwise)
 * @file:  file to publish
 */
void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
might_sleep();
rcu_read_lock_sched();
// while a table resize is in progress, leave the RCU-sched section, wait for it to finish, then re-enter
while (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
wait_event(files->resize_wait, !files->resize_in_progress);
rcu_read_lock_sched();
}
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file); // place the file in the process's file descriptor table
rcu_read_unlock_sched();
}
//fd是给应用层看的,而内核操作的却是struct file