位置:net/socket.c
首先,和目录,一般文件一样,socket也是一种文件类型,对它的处理通过VFS系统来进行,先来看初始化:
/* Register sock_init through core_initcall, i.e. as an initcall with id 1. */
core_initcall(sock_init); /* early initcall */
/* core_initcall(fn) is __define_initcall(fn, 1). */
#define core_initcall(fn) __define_initcall(fn, 1)
/*
 * Defines a static initcall_t variable named __initcall_<fn><id>, initialised
 * to fn and placed in the section ".initcall<id>.init".
 */
#define __define_initcall(fn, id) \
static initcall_t __initcall_##fn##id __used \
__attribute__((__section__(".initcall" #id ".init"))) = fn
sock_init此函数的指针被放在段 .initcall1.init 中,在内核初始化的时候会被调用到。同时此函数声明为 __init,说明此函数体所占内存在初始化过后会被释放。
/*
 * sock_init - one-time initialisation of the socket layer.
 *
 * In order: network sysctl setup, sk_buff caches, socket inode cache,
 * registration and kernel mount of the socket filesystem, then (when
 * configured) netfilter and PHY timestamping.
 *
 * Returns 0 on success, otherwise the error code of the step that failed.
 */
static int __init sock_init(void)
{
int err;
/* Initialize the network sysctl infrastructure.
 * Executed when the CONFIG_SYSCTL option (modifying the kernel while it is
 * running) is set.
 */
err = net_sysctl_init();
if (err)
goto out;
/*
* Initialize skbuff SLAB cache
*/
skb_init(); // initialise the sk_buff caches
/*
* Initialize the protocols module.
*/
init_inodecache(); // initialise the socket inode cache; its return value is not checked here
/* Register the socket filesystem; sock_fs_type specifies the filesystem name, the function run on mount, etc. */
err = register_filesystem(&sock_fs_type);
if (err)
goto out_fs;
sock_mnt = kern_mount(&sock_fs_type); // mount the filesystem; this ends up calling sockfs_mount
if (IS_ERR(sock_mnt)) {
err = PTR_ERR(sock_mnt);
goto out_mount;
}
/* The real protocol initialization is performed in later initcalls.
*/
#ifdef CONFIG_NETFILTER
err = netfilter_init();
if (err)
goto out; /* NOTE(review): this path returns without unregistering the filesystem */
#endif
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
skb_timestamping_init();
#endif
out:
return err;
/* Mount failed: undo the filesystem registration before returning. */
out_mount:
unregister_filesystem(&sock_fs_type);
/* Registration failed: nothing to undo, just return err. */
out_fs:
goto out;
}
skb_init用来初始化缓冲,分配sk_buff时使用
/*
 * skb_init - create the two slab caches used when allocating sk_buffs.
 *
 * Both caches are created with SLAB_HWCACHE_ALIGN | SLAB_PANIC and no
 * constructor; the returned pointers are not checked here.
 */
void __init skb_init(void)
{
	/* Each object in this cache is one struct sk_buff. */
	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
					      sizeof(struct sk_buff),
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					      NULL);
	/* Each object is two struct sk_buff plus one atomic_t reference count. */
	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						(2*sizeof(struct sk_buff)) +
						sizeof(atomic_t),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
}
初始化socket缓冲,分配socket时使用:
/* Slab cache from which struct socket_alloc objects are allocated. */
static struct kmem_cache *sock_inode_cachep __read_mostly;

/*
 * init_inodecache - create the slab cache used when allocating sockets.
 *
 * Each object in the cache is one struct socket_alloc, and init_once is
 * passed as the cache constructor.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int init_inodecache(void)
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
					      sizeof(struct socket_alloc), 0,
					      SLAB_HWCACHE_ALIGN |
					      SLAB_RECLAIM_ACCOUNT |
					      SLAB_MEM_SPREAD,
					      init_once);

	return sock_inode_cachep ? 0 : -ENOMEM;
}
/*
 * init_once - initialise a freshly allocated element of the socket inode cache.
 * @foo: the element, treated as a struct socket_alloc.
 *
 * Runs inode_init_once() on the VFS inode embedded in the element.
 */
static void init_once(void *foo)
{
	struct socket_alloc *sa = foo;

	inode_init_once(&sa->vfs_inode);
}
挂载文件系统的相关结构和函数:
/*
 * Filesystem type for sockets: named "sockfs", mounted through
 * sockfs_mount, with kill_anon_super as the kill_sb hook.
 */
static struct file_system_type sock_fs_type = {
.name = "sockfs",
.mount = sockfs_mount,
.kill_sb = kill_anon_super,
};
socket目录相关结构和函数:
/*
* sockfs_dname() is called from d_path().
*/
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
dentry->d_inode->i_ino);
}
/* Dentry operations for sockfs: only d_dname is provided (sockfs_dname). */
static const struct dentry_operations sockfs_dentry_operations = {
.d_dname = sockfs_dname,
};
/*
 * Superblock operations for sockfs: inodes are allocated and destroyed by
 * sock_alloc_inode / sock_destroy_inode; statfs uses simple_statfs.
 */
static const struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
.destroy_inode = sock_destroy_inode,
.statfs = simple_statfs,
};
/*
 * sockfs_mount - mount callback of sock_fs_type.
 *
 * Hands off to mount_pseudo() with the name "socket:", the sockfs super and
 * dentry operations and SOCKFS_MAGIC.  @flags, @dev_name and @data are not
 * used.
 */
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
				   int flags, const char *dev_name,
				   void *data)
{
	return mount_pseudo(fs_type, "socket:", &sockfs_ops,
			    &sockfs_dentry_operations, SOCKFS_MAGIC);
}
来看分配socket相关联的inode,此函数会分配一个socket_alloc的对象,返回其中的vfs_inode的地址,可以用SOCKET_I从返回值得到socket :
/*
 * A socket and its VFS inode, allocated together as one object.
 * Given the address of vfs_inode, container_of() recovers the enclosing
 * socket_alloc and therefore the socket.
 */
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
/* Wait queue structure attached to a socket. */
struct socket_wq {
/* Note: wait MUST be first field of socket_wq */
wait_queue_head_t wait;
struct fasync_struct *fasync_list;
struct rcu_head rcu; /* used when the structure is freed with kfree_rcu() */
} ____cacheline_aligned_in_smp;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); // 从缓冲中得到一个 socket_alloc
if (!ei)
return NULL;
/*分配一个socke_wq对象,并初始化*/
wq = kmalloc(sizeof(*wq), GFP_KERNEL);
if (!wq) {
kmem_cache_free(sock_inode_cachep, ei);
return NULL;
}
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;
return &ei->vfs_inode;
}
/*
 * sock_destroy_inode - destroy_inode hook of sockfs.
 * @inode: the VFS inode embedded in a struct socket_alloc.
 *
 * Frees the socket's socket_wq through kfree_rcu() and then returns the
 * enclosing socket_alloc to sock_inode_cachep.
 */
static void sock_destroy_inode(struct inode *inode)
{
	struct socket_alloc *sa = container_of(inode, struct socket_alloc,
					       vfs_inode);
	/* Plain fetch of the wq pointer; no reference count is involved. */
	struct socket_wq *waitq = rcu_dereference_protected(sa->socket.wq, 1);

	/* Release the wait queue first ... */
	kfree_rcu(waitq, rcu);
	/* ... then hand the socket_alloc back to the cache. */
	kmem_cache_free(sock_inode_cachep, sa);
}
先看如何创建一个socket结构:
/* Per-CPU socket counter; sock_alloc() adds 1 to it on the current CPU. */
static DEFINE_PER_CPU(int, sockets_in_use);

/*
 * sock_alloc - create a new socket together with its inode.
 *
 * Obtains an inode from the sockfs superblock (allocated by
 * sock_alloc_inode), fills in its number, mode, owner and inode operations,
 * and returns the socket that shares the allocation with that inode.
 *
 * Returns NULL if no inode could be obtained.
 */
static struct socket *sock_alloc(void)
{
	struct super_block *sb = sock_mnt->mnt_sb;
	struct inode *inode = new_inode_pseudo(sb);
	struct socket *sock;

	if (!inode)
		return NULL;

	/* The socket sits in the same socket_alloc as the inode. */
	sock = SOCKET_I(inode);
	kmemcheck_annotate_bitfield(sock, type);

	inode->i_ino = get_next_ino();
	/* Socket file type, rwx for user, group and others. */
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
	/* Owned by the filesystem uid/gid of the current task. */
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_op = &sockfs_inode_ops;

	this_cpu_add(sockets_in_use, 1);
	return sock;
}
其中的SOCKET_I是一个内联函数,定义如下(它用到的container_of宏也一并列出):
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
/*
 * container_of - get the structure that contains a given member.
 * @ptr:    pointer to the member.
 * @type:   type of the containing structure.
 * @member: name of the member inside @type.
 *
 * Subtracts offsetof(type, member) from @ptr.  The typed temporary __mptr
 * makes the compiler check that @ptr matches the member's type.
 */
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
铺垫:为解决在64位的内核上执行32位的系统调用,作为传递系统调用号的%rax高32位未被清零的问题,所有的系统调用都加上了 SYSCALL_DEFINE 的宏
如:SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) 最后替换为:
/*
 * Illustrative expansion of SYSCALL_DEFINE3(socket, ...): the entry point
 * takes every argument as long and casts each one to the declared int type
 * before calling the real implementation, SYSC_socket().
 */
asmlinkage long SyS_socket(long family, long type, long protocol)
{
	long ret = SYSC_socket((int) family, (int) type, (int) protocol);

	return ret;
}
static inline long SYSC_socket( int family, int type, int protocol)
当中的办法就是强制转换。asmlinkage告诉编译器此函数的参数通过堆栈而不是寄存器来传递(在32位x86上如此;在x86_64上该宏并不改变调用约定),这一点在系统调用时非常重要。
在调用socket接口时,第一步就是创建一个socket,这里的返回值是一个文件描述符,用户看到的只有这一个描述符,其它结构都是内部使用的:
/*
 * socket(2) - create a socket and return a file descriptor for it.
 * @family:   protocol family.
 * @type:     socket type, optionally OR-ed with SOCK_CLOEXEC / SOCK_NONBLOCK.
 * @protocol: protocol within the family.
 *
 * Returns the new file descriptor, -EINVAL if @type carries any flag other
 * than the two above, or the error from sock_create() / sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int retval;

	/* Check the SOCK_* constants for consistency. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/*
	 * For reference:
	 *   #define SOCK_TYPE_MASK 0xf
	 *   #define SOCK_CLOEXEC   O_CLOEXEC   - descriptor is closed on exec
	 *   #define SOCK_NONBLOCK  O_NONBLOCK  - socket uses non-blocking I/O
	 *
	 * Outside the bits covered by SOCK_TYPE_MASK, @type may carry only
	 * these two flags; anything else is rejected.
	 */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Where the two constants differ, convert SOCK_NONBLOCK to O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Create the socket itself; it still has to be bound to a descriptor. */
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;

	/* Bind the socket to a file descriptor; that descriptor is the result. */
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;
}
创建socket结构:
/*
 * sock_create - create a socket in the network namespace of the current task.
 * @family:   protocol family.
 * @type:     socket type.
 * @protocol: protocol within the family.
 * @res:      receives the new socket on success.
 *
 * Calls __sock_create() with kern == 0 and returns its result.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
/*
 * __sock_create - create a socket of the given family, type and protocol.
 * @net:      network namespace handed to the family's create function.
 * @family:   protocol family; must be in [0, NPROTO).
 * @type:     socket type; must be in [0, SOCK_MAX).
 * @protocol: protocol within the family.
 * @res:      receives the new socket on success.
 * @kern:     passed through to the security hooks and to pf->create.
 *
 * Returns 0 on success.  Returns -EAFNOSUPPORT for an out-of-range or
 * unregistered family (or when a module reference cannot be taken), -EINVAL
 * for an out-of-range type, -ENFILE when no socket can be allocated, or the
 * error reported by a security hook or by the family's create function.
 * On every failure after sock_alloc(), the socket is released.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) { // direct link-layer access
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
/* Look up the protocol family in net_families.
Every protocol family registers a net_proto_family structure in this array.
*/
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf->create(net, sock, protocol, kern); // each protocol family creates its own kind of socket
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
/* Could not pin the module owning sock->ops: report -EAFNOSUPPORT. */
out_module_busy:
err = -EAFNOSUPPORT;
/* Clear sock->ops and drop the reference held on the family's module. */
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
/* Common failure exit: release the socket and return err. */
out_sock_release:
sock_release(sock);
return err;
/* Failure while still inside the RCU read-side section: leave it first. */
out_release:
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
绑定socket到文件描述符:
/*
 * sock_map_fd - bind a socket to a new file descriptor.
 * @sock:  the socket to expose.
 * @flags: flags for both the descriptor and the new file.
 *
 * Reserves an unused descriptor, creates a file for the socket and installs
 * it under that descriptor.
 *
 * Returns the descriptor on success.  If no descriptor is available, returns
 * that error; if the file cannot be created, gives the descriptor back and
 * returns the file's error code.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	int fd;

	/* Reserve a free descriptor. */
	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	/* Create the file that represents the socket. */
	newfile = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(newfile))) {
		put_unused_fd(fd);
		return PTR_ERR(newfile);
	}

	/* Associate the file with the descriptor. */
	fd_install(fd, newfile);
	return fd;
}
创建与socket相关联的文件:
/*
 * File operations installed on every socket file; sock_alloc_file() sets
 * them both as the inode's i_fop and as the operations of the new file.
 */
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
};
上面结构是对文件的操作
/*
 * sock_alloc_file - create the struct file that represents a socket.
 * @sock:  the socket; its inode is attached to a new sockfs dentry.
 * @flags: only O_NONBLOCK is copied into the file's f_flags.
 * @dname: dentry name; if NULL, the protocol name is used when sock->sk is
 *         set, otherwise the name stays empty.
 *
 * On success, sock->file and file->private_data point at each other and
 * the file is returned.  On failure an ERR_PTR is returned: -ENOMEM when no
 * dentry can be allocated, or the error from alloc_file().
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
struct qstr name = { .name = "" };
struct path path;
struct file *file;
if (dname) {
name.name = dname;
name.len = strlen(name.name);
} else if (sock->sk) {
name.name = sock->sk->sk_prot_creator->name; // the name differs from protocol to protocol
name.len = strlen(name.name);
}
path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); // allocate a dentry on the socket filesystem's superblock
if (unlikely(!path.dentry))
return ERR_PTR(-ENOMEM);
path.mnt = mntget(sock_mnt); // take a reference on the mount
d_instantiate(path.dentry, SOCK_INODE(sock)); // attach the socket's inode to the dentry
SOCK_INODE(sock)->i_fop = &socket_file_ops;
/* The inode of the previously created socket is now attached to a dentry,
so it can be reached through a path. The way to reach it is to create a
file on that path. */
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
&socket_file_ops);
if (unlikely(IS_ERR(file))) {
/* drop dentry, keep inode */
/* Error path, which should almost never happen.
Take an extra reference on the inode first, then drop the path: the
dentry goes away while the inode is kept.
*/
ihold(path.dentry->d_inode);
path_put(&path);
return file;
}
sock->file = file;
file->f_flags = O_RDWR | (flags & O_NONBLOCK);
file->private_data = sock;
return file;
}
EXPORT_SYMBOL(sock_alloc_file);
前面提到每一个协议都要注册到一个全局数组 net_families中,这是通过下面的函数实现:
/* Description of one protocol family, as stored in net_families[]. */
struct net_proto_family {
int family; /* family number; used as the index into net_families[] */
int (*create)(struct net *net, struct socket *sock, int protocol, int kern); /* creates a socket of this family */
struct module *owner; /* module whose reference count is held around ->create */
};
/* Serialises updates of net_families[]. */
static DEFINE_SPINLOCK(net_family_lock);
/* Registered protocol families, indexed by family number; NPROTO is the number of slots. */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
/*
 * sock_register - register a protocol family.
 * @ops: description of the family; ops->family selects the slot.
 *
 * Returns 0 on success, -ENOBUFS if ops->family >= NPROTO, or -EEXIST if the
 * slot is already taken.
 *
 * NOTE(review): a negative ops->family is not rejected here, and the
 * "Registered" message is printed even when -EEXIST is returned.
 */
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
return -ENOBUFS;
}
spin_lock(&net_family_lock);
/* The slot is inspected and filled under net_family_lock. */
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);
printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
return err;
}
EXPORT_SYMBOL(sock_register);
注销一个协议:
/*
 * sock_unregister - remove a protocol family from net_families[].
 * @family: family number; BUG()s if outside [0, NPROTO).
 *
 * Clears the slot under net_family_lock and then calls synchronize_rcu()
 * before returning.
 */
void sock_unregister(int family)
{
BUG_ON(family < 0 || family >= NPROTO);
spin_lock(&net_family_lock);
RCU_INIT_POINTER(net_families[family], NULL);
spin_unlock(&net_family_lock);
synchronize_rcu();
printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
}
EXPORT_SYMBOL(sock_unregister);
如:对IP协议(net/ipv4/af_inet.c)
/* Protocol family description for PF_INET; sockets are created by inet_create. */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
/* Excerpt of inet_init: only the registration of inet_family_ops is shown. */
static int __init inet_init(void)
{
...
/* Register the PF_INET family; the return value is deliberately discarded. */
(void)sock_register(&inet_family_ops);
...
}
下面来看inet_create