用户创建socket
调用内核__sock_create
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
/**
* sock_create - creates a socket
* @family: protocol family (AF_INET, ...)
* @type: communication type (SOCK_STREAM, ...)
* @protocol: protocol (0, ...)
* @res: new socket
*
* A wrapper around __sock_create().
* Returns 0 or an error. This function internally uses GFP_KERNEL.
*/
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
利用sock_ioctl控制socket
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct socket *sock;
struct sock *sk;
void __user *argp = (void __user *)arg;
int pid, err;
struct net *net;
sock = file->private_data;
sk = sock->sk;
net = sock_net(sk);
if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
struct ifreq ifr;
bool need_copyout;
if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
return -EFAULT;
err = dev_ioctl(net, cmd, &ifr, &need_copyout);
if (!err && need_copyout)
if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
return -EFAULT;
} else
#ifdef CONFIG_WEXT_CORE
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
err = wext_handle_ioctl(net, cmd, argp);
} else
#endif
switch (cmd) {
case FIOSETOWN:
case SIOCSPGRP:
err = -EFAULT;
if (get_user(pid, (int __user *)argp))
break;
err = f_setown(sock->file, pid, 1);
break;
case FIOGETOWN:
case SIOCGPGRP:
err = put_user(f_getown(sock->file),
(int __user *)argp);
break;
case SIOCGIFBR:
case SIOCSIFBR:
case SIOCBRADDBR:
case SIOCBRDELBR:
err = -ENOPKG;
if (!br_ioctl_hook)
request_module("bridge");
mutex_lock(&br_ioctl_mutex);
if (br_ioctl_hook)
err = br_ioctl_hook(net, cmd, argp);
mutex_unlock(&br_ioctl_mutex);
break;
case SIOCGIFVLAN:
case SIOCSIFVLAN:
err = -ENOPKG;
if (!vlan_ioctl_hook)
request_module("8021q");
mutex_lock(&vlan_ioctl_mutex);
if (vlan_ioctl_hook)
err = vlan_ioctl_hook(net, argp);
mutex_unlock(&vlan_ioctl_mutex);
break;
case SIOCADDDLCI:
case SIOCDELDLCI:
err = -ENOPKG;
if (!dlci_ioctl_hook)
request_module("dlci");
mutex_lock(&dlci_ioctl_mutex);
if (dlci_ioctl_hook)
err = dlci_ioctl_hook(cmd, argp);
mutex_unlock(&dlci_ioctl_mutex);
break;
case SIOCGSKNS:
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = open_related_ns(&net->ns, get_net_ns);
break;
case SIOCGSTAMP_OLD:
case SIOCGSTAMPNS_OLD:
if (!sock->ops->gettstamp) {
err = -ENOIOCTLCMD;
break;
}
err = sock->ops->gettstamp(sock, argp,
cmd == SIOCGSTAMP_OLD,
!IS_ENABLED(CONFIG_64BIT));
break;
case SIOCGSTAMP_NEW:
case SIOCGSTAMPNS_NEW:
if (!sock->ops->gettstamp) {
err = -ENOIOCTLCMD;
break;
}
err = sock->ops->gettstamp(sock, argp,
cmd == SIOCGSTAMP_NEW,
false);
break;
default:
err = sock_do_ioctl(net, sock, cmd, arg);
break;
}
return err;
}
虚拟层inet协议族的创建inet_create
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct socket *sock;
struct sock *sk;
void __user *argp = (void __user *)arg;
int pid, err;
struct net *net;
sock = file->private_data;
sk = sock->sk;
net = sock_net(sk);
if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
struct ifreq ifr;
bool need_copyout;
if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
return -EFAULT;
err = dev_ioctl(net, cmd, &ifr, &need_copyout);
if (!err && need_copyout)
if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
return -EFAULT;
} else
#ifdef CONFIG_WEXT_CORE
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
err = wext_handle_ioctl(net, cmd, argp);
} else
#endif
switch (cmd) {
case FIOSETOWN:
case SIOCSPGRP:
err = -EFAULT;
if (get_user(pid, (int __user *)argp))
break;
err = f_setown(sock->file, pid, 1);
break;
case FIOGETOWN:
case SIOCGPGRP:
err = put_user(f_getown(sock->file),
(int __user *)argp);
break;
case SIOCGIFBR:
case SIOCSIFBR:
case SIOCBRADDBR:
case SIOCBRDELBR:
err = -ENOPKG;
if (!br_ioctl_hook)
request_module("bridge");
mutex_lock(&br_ioctl_mutex);
if (br_ioctl_hook)
err = br_ioctl_hook(net, cmd, argp);
mutex_unlock(&br_ioctl_mutex);
break;
case SIOCGIFVLAN:
case SIOCSIFVLAN:
err = -ENOPKG;
if (!vlan_ioctl_hook)
request_module("8021q");
mutex_lock(&vlan_ioctl_mutex);
if (vlan_ioctl_hook)
err = vlan_ioctl_hook(net, argp);
mutex_unlock(&vlan_ioctl_mutex);
break;
case SIOCADDDLCI:
case SIOCDELDLCI:
err = -ENOPKG;
if (!dlci_ioctl_hook)
request_module("dlci");
mutex_lock(&dlci_ioctl_mutex);
if (dlci_ioctl_hook)
err = dlci_ioctl_hook(cmd, argp);
mutex_unlock(&dlci_ioctl_mutex);
break;
case SIOCGSKNS:
err = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = open_related_ns(&net->ns, get_net_ns);
break;
case SIOCGSTAMP_OLD:
case SIOCGSTAMPNS_OLD:
if (!sock->ops->gettstamp) {
err = -ENOIOCTLCMD;
break;
}
err = sock->ops->gettstamp(sock, argp,
cmd == SIOCGSTAMP_OLD,
!IS_ENABLED(CONFIG_64BIT));
break;
case SIOCGSTAMP_NEW:
case SIOCGSTAMPNS_NEW:
if (!sock->ops->gettstamp) {
err = -ENOIOCTLCMD;
break;
}
err = sock->ops->gettstamp(sock, argp,
cmd == SIOCGSTAMP_NEW,
false);
break;
default:
err = sock_do_ioctl(net, sock, cmd, arg);
break;
}
return err;
}
其中协议族的结构proto_ops结构如下
struct proto_ops {
int family;
struct module *owner;
int (*release) (struct socket *sock);
int (*bind) (struct socket *sock,
struct sockaddr *myaddr,
int sockaddr_len);
int (*connect) (struct socket *sock,
struct sockaddr *vaddr,
int sockaddr_len, int flags);
int (*socketpair)(struct socket *sock1,
struct socket *sock2);
int (*accept) (struct socket *sock,
struct socket *newsock, int flags, bool kern);
int (*getname) (struct socket *sock,
struct sockaddr *addr,
int peer);
__poll_t (*poll) (struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
#ifdef CONFIG_COMPAT
int (*compat_ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
#endif
int (*gettstamp) (struct socket *sock, void __user *userstamp,
bool timeval, bool time32);
int (*listen) (struct socket *sock, int len);
int (*shutdown) (struct socket *sock, int flags);
int (*setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen);
int (*getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
int (*compat_setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen);
int (*compat_getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
#endif
void (*show_fdinfo)(struct seq_file *m, struct socket *sock);
int (*sendmsg) (struct socket *sock, struct msghdr *m,
size_t total_len);
/* Notes for implementing recvmsg:
* ===============================
* msg->msg_namelen should get updated by the recvmsg handlers
* iff msg_name != NULL. It is by default 0 to prevent
* returning uninitialized memory to user space. The recvfrom
* handlers can assume that msg.msg_name is either NULL or has
* a minimum size of sizeof(struct sockaddr_storage).
*/
int (*recvmsg) (struct socket *sock, struct msghdr *m,
size_t total_len, int flags);
int (*mmap) (struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t (*sendpage) (struct socket *sock, struct page *page,
int offset, size_t size, int flags);
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int (*set_peek_off)(struct sock *sk, int val);
int (*peek_len)(struct socket *sock);
/* The following functions are called internally by kernel with
* sock lock already held.
*/
int (*read_sock)(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
int (*sendpage_locked)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
size_t size);
int (*set_rcvlowat)(struct sock *sk, int val);
};
其包含的functions和struct proto很类似,但是其在socket层
上图来源:https://blog.csdn.net/feiwatson/article/details/82785697
接下来分为字节流(TCP)和数据报(UDP),二者结构分别为:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
以数据报(UDP)为例,结构udp_prot结构如下:
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_lib_close,
.pre_connect = udp_pre_connect,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.init = udp_init_sock,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
.diag_destroy = udp_abort,
};
EXPORT_SYMBOL(udp_prot);
tcp_prot结构如下:
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.pre_connect = tcp_v4_pre_connect,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
.diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
其具体对应的钩子函数(回调函数)结构如下图所示:
经过一系列关系,最终进入网络接口层,改成负责完成网络驱动的功能,然后就进入额具体的网络设备。