Tcp/IP---socket

位置:net/socket.c

首先,和目录,一般文件一样,socket也是一种文件类型,对它的处理通过VFS系统来进行,先来看初始化:

/* Register sock_init() as a level-1 initcall (see the macros below). */
core_initcall(sock_init);	/* early initcall */

/* core_initcall(fn) is __define_initcall() with the id fixed to 1. */
#define core_initcall(fn)		__define_initcall(fn, 1)
/*
 * Defines a static initcall_t variable named __initcall_<fn><id>, initialised
 * to fn and placed in the linker section ".initcall<id>.init".
 */
#define __define_initcall(fn, id) \
	static initcall_t __initcall_##fn##id __used \
	__attribute__((__section__(".initcall" #id ".init"))) = fn
sock_init此函数的指针被放在段 .initcall1.init 中(core_initcall 对应的 id 为 1),在内核初始化的时候会被调用到。同时此函数声明为 __init,说明此函数体所占内存在初始化过后会被释放。

/*
 * sock_init - bring up the socket layer at boot (run as a core initcall).
 *
 * Sets up the network sysctl infrastructure, the sk_buff and socket-inode
 * slab caches, then registers and mounts the "sockfs" pseudo filesystem.
 *
 * Returns 0 on success or the negative error code of the step that failed.
 */
static int __init sock_init(void)
{
	int err;

	/*
	 * Initialize the network sysctl infrastructure.
	 * (Per the article: only does real work when CONFIG_SYSCTL, i.e.
	 * run-time tuning of kernel parameters, is enabled.)
	 */
	err = net_sysctl_init();
	if (err)
		goto out;

	/*
	 *      Initialize skbuff SLAB cache
	 */
	skb_init();

	/*
	 *      Initialize the protocols module.
	 *
	 * init_inodecache() returns -ENOMEM when the socket inode cache cannot
	 * be created; propagate that instead of carrying on with a NULL
	 * sock_inode_cachep.
	 */
	err = init_inodecache();
	if (err)
		goto out;

	/*
	 * Register the socket filesystem; sock_fs_type supplies its name
	 * and the callback run at mount time.
	 */
	err = register_filesystem(&sock_fs_type);
	if (err)
		goto out_fs;
	/* Mount it; sock_fs_type.mount is sockfs_mount(). */
	sock_mnt = kern_mount(&sock_fs_type);
	if (IS_ERR(sock_mnt)) {
		err = PTR_ERR(sock_mnt);
		goto out_mount;
	}

	/* The real protocol initialization is performed in later initcalls.
	 */

#ifdef CONFIG_NETFILTER
	err = netfilter_init();
	if (err)
		goto out;
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
	skb_timestamping_init();
#endif

out:
	return err;

out_mount:
	unregister_filesystem(&sock_fs_type);
out_fs:
	goto out;
}

skb_init用来初始化缓冲,分配sk_buff时使用

/*
 * skb_init - create the two slab caches used when allocating sk_buffs.
 *
 * Neither result is checked for NULL here; both caches are created with
 * SLAB_PANIC (presumably making a creation failure fatal at boot).
 */
void __init skb_init(void)
{
	/* Each object in this cache is a single struct sk_buff. */
	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
					      sizeof(struct sk_buff),
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					      NULL);

	/*
	 * Each object in this cache is two sk_buffs followed by an atomic_t
	 * (described by the article as a reference count).  The comma after
	 * sizeof(atomic_t) separates the object size from the alignment
	 * argument (0).
	 */
	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						(2*sizeof(struct sk_buff)) +
						sizeof(atomic_t),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
}
初始化socket缓冲,分配socket时使用:

/* Slab cache from which struct socket_alloc objects are allocated. */
static struct kmem_cache *sock_inode_cachep __read_mostly;

/*
 * init_inodecache - create the slab cache backing socket inodes.
 *
 * Every object in the cache is one struct socket_alloc; init_once() is
 * passed as the cache's constructor callback.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int init_inodecache(void)
{
	sock_inode_cachep =
		kmem_cache_create("sock_inode_cache",
				  sizeof(struct socket_alloc),
				  0,
				  SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				  init_once);

	return sock_inode_cachep ? 0 : -ENOMEM;
}

/*
 * Constructor for sock_inode_cache objects: @foo points at a
 * struct socket_alloc whose embedded VFS inode gets its one-time setup.
 */
static void init_once(void *foo)
{
    struct socket_alloc *ei = foo;

    inode_init_once(&ei->vfs_inode);
}
挂载文件系统的相关结构和函数:

/*
 * Filesystem type descriptor for the socket pseudo filesystem "sockfs".
 * sock_init() registers it and mounts it with kern_mount(); mounting is
 * handled by sockfs_mount().
 */
static struct file_system_type sock_fs_type = {
	.name =		"sockfs",
	.mount =	sockfs_mount,
	.kill_sb =	kill_anon_super,
};

socket目录相关结构和函数:

/*
 * sockfs_dname() is called from d_path().
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
    return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
                dentry->d_inode->i_ino);
}

/* Dentry operations for sockfs: only the dynamic-name hook is provided. */
static const struct dentry_operations sockfs_dentry_operations = {
    .d_dname  = sockfs_dname,
};

/*
 * Superblock operations for sockfs: inodes are allocated and destroyed
 * through the socket-specific helpers sock_alloc_inode() and
 * sock_destroy_inode().
 */
static const struct super_operations sockfs_ops = {
	.alloc_inode	= sock_alloc_inode,
	.destroy_inode	= sock_destroy_inode,
	.statfs		= simple_statfs,
};

/*
 * Mount callback of sock_fs_type.  Hands everything to mount_pseudo() with
 * the sockfs super/dentry operations and SOCKFS_MAGIC; @flags, @dev_name
 * and @data are not used.
 */
static struct dentry *sockfs_mount(struct file_system_type *fs_type, int flags,
                                   const char *dev_name, void *data)
{
    struct dentry *root;

    root = mount_pseudo(fs_type, "socket:", &sockfs_ops,
                        &sockfs_dentry_operations, SOCKFS_MAGIC);
    return root;
}


 来看分配socket相关联的inode,此函数会分配一个socket_alloc的对象,返回其中的vfs_inode的地址,可以用SOCKET_I从返回值得到socket : 
/*
 * A socket and its VFS inode allocated as one object, so that either member
 * can be reached from the other with container_of() (see SOCKET_I()).
 */
struct socket_alloc {
    struct socket socket;
    struct inode vfs_inode;
};

// Wait queue attached to a socket (allocated in sock_alloc_inode()).
struct socket_wq {
    /* Note: wait MUST be first field of socket_wq */
    wait_queue_head_t    wait;
    struct fasync_struct    *fasync_list;
    struct rcu_head        rcu;   /* handed to kfree_rcu() in sock_destroy_inode() */
} ____cacheline_aligned_in_smp;

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
	struct socket_wq *wq;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);  // 从缓冲中得到一个 socket_alloc
	if (!ei)
		return NULL;
        /*分配一个socke_wq对象,并初始化*/
        wq = kmalloc(sizeof(*wq), GFP_KERNEL);
        if (!wq) {
             kmem_cache_free(sock_inode_cachep, ei);
             return NULL;
        }

        init_waitqueue_head(&wq->wait); 

        wq->fasync_list = NULL; 

        RCU_INIT_POINTER(ei->socket.wq, wq); 

        ei->socket.state = SS_UNCONNECTED; 
        ei->socket.flags = 0; 
        ei->socket.ops = NULL; 
        ei->socket.sk = NULL; 
        ei->socket.file = NULL; 
        return &ei->vfs_inode;
} 

/*
 * sock_destroy_inode - ->destroy_inode hook of sockfs.
 *
 * Undoes sock_alloc_inode(): releases the wait queue and returns the
 * enclosing struct socket_alloc to sock_inode_cachep.
 */
static void sock_destroy_inode(struct inode *inode)
{
    struct socket_alloc *ei = container_of(inode, struct socket_alloc, vfs_inode);
    /*
     * Plain fetch of the wq pointer; the constant 1 is the caller's assertion
     * that the access is safe here.  No reference count is touched.
     */
    struct socket_wq *wq = rcu_dereference_protected(ei->socket.wq, 1);

    /* Free the wait queue through its rcu member (deferred by RCU). */
    kfree_rcu(wq, rcu);
    /* Hand the socket_alloc back to the slab cache. */
    kmem_cache_free(sock_inode_cachep, ei);
}

 
 先看如何创建一个socket结构: 

static DEFINE_PER_CPU(int, sockets_in_use); // sockets_in_use是一个Per-CPU变量,表示当前CPU上socket的数量

static struct socket *sock_alloc(void)
{
    struct inode *inode;
    struct socket *sock;

    inode = new_inode_pseudo(sock_mnt->mnt_sb);  // 这里实际调用的是sock_alloc_inode
    if (!inode)
        return NULL;

    sock = SOCKET_I(inode);  // 通过指针偏移得到socket结构

    kmemcheck_annotate_bitfield(sock, type);
    inode->i_ino = get_next_ino();  // 设置inode号
    inode->i_mode = S_IFSOCK | S_IRWXUGO;  // 设置inode为socket类型,读写权限为777
    inode->i_uid = current_fsuid(); // 当前进程的文件系统用户ID
    inode->i_gid = current_fsgid();  // 当前进程的文件系统组ID
    inode->i_op = &sockfs_inode_ops;  // 对inode的操作

    this_cpu_add(sockets_in_use, 1); // 当前CPU上的socket数量加1
    return sock;
}
其中的SOCKET_I是一个内联函数(借助container_of宏实现),定义如下:

static inline struct socket *SOCKET_I(struct inode *inode)
{
    return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

/*
 * container_of - recover a pointer to the enclosing structure.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing structure
 * @member: name of the member within @type
 *
 * The temporary __mptr makes the compiler check that @ptr matches the
 * member's type; subtracting the member's offset then yields the address
 * of the enclosing object.  __typeof__ (rather than the bare GNU keyword
 * typeof) keeps the macro usable when compiling in ISO modes such as
 * -std=c99 or -std=c11.  Requires offsetof() to be in scope.
 */
#define container_of(ptr, type, member) ({            \
    const __typeof__( ((type *)0)->member ) *__mptr = (ptr);    \
    (type *)( (char *)__mptr - offsetof(type,member) );})
铺垫:在部分64位体系结构(如 s390、powerpc、sparc64、mips)上,32位的系统调用参数是通过64位寄存器传递的,而内核无法保证用户态已经对寄存器的高32位做了正确的符号扩展(CVE-2009-0029)。为解决这个问题,所有的系统调用都加上了 SYSCALL_DEFINE 的宏

如:SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) 最后替换为:

/*
 * Illustrative expansion of
 *   SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
 *
 * SyS_socket() receives every argument as a long and casts it back to the
 * declared type before calling the real implementation, SYSC_socket().
 */
static inline long SYSC_socket(int family, int type, int protocol);

asmlinkage long SyS_socket(long family, long type, long protocol)
{
	/* C casts are written (type) expr; "int(family)" is not valid C. */
	long ret = SYSC_socket((int) family, (int) type, (int) protocol);
	return ret;
}
/* The body written after SYSCALL_DEFINE3(...) becomes the body of this function: */
static inline long SYSC_socket(int family, int type, int protocol)
当中的办法就是强制转换:参数先按 long 接收,再转换回声明的类型,由编译器完成正确的符号扩展。asmlinkage 在 x86-32 上展开为 regparm(0),使得在调用此函数时通过堆栈而不是寄存器来传递参数(在其它多数体系结构上它基本是空定义),这一点在系统调用时非常重要。

在调用socket接口时,第一步就是创建一个socket,这里的返回值是一个文件描述符,用户看到的只有这一个描述符,其它结构都是内部使用的:

/*
 * socket(2): create a socket and return a file descriptor referring to it.
 *
 * @type carries the socket type in its low bits (SOCK_TYPE_MASK) and may
 * additionally have SOCK_CLOEXEC and/or SOCK_NONBLOCK or-ed in; any other
 * bit yields -EINVAL.
 *
 * Returns the new descriptor, or a negative error code.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int fd_flags;
	int retval;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/*
	 * Split @type into the socket type proper and the extra flag bits.
	 * Only SOCK_CLOEXEC (close the descriptor on exec) and SOCK_NONBLOCK
	 * (non-blocking I/O) are accepted as flags.
	 */
	fd_flags = type & ~SOCK_TYPE_MASK;
	if (fd_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK))
		fd_flags = (fd_flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	/* Build the socket object itself ... */
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;

	/* ... then bind it to a descriptor; on failure the socket is released. */
	retval = sock_map_fd(sock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;
}
创建socket结构:
/*
 * sock_create - create a socket in the calling task's network namespace.
 *
 * Thin wrapper around __sock_create() that passes kern == 0; the result is
 * stored in *@res.  Returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
    struct net *net = current->nsproxy->net_ns;

    return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

/*
 * __sock_create - allocate a socket and let the protocol family set it up.
 * @net:      network namespace handed to the family's ->create()
 * @family:   protocol family, must be in [0, NPROTO)
 * @type:     socket type, must be in [0, SOCK_MAX)
 * @protocol: protocol number, passed through to ->create()
 * @res:      receives the new socket on success
 * @kern:     passed through to the security hooks and ->create()
 *
 * Returns 0 on success.  Otherwise returns a negative error code:
 * -EAFNOSUPPORT (family out of range, not registered, or a module reference
 * could not be taken), -EINVAL (bad type), -ENFILE (sock_alloc() failed), or
 * the error from a security hook or from ->create().  Once the socket has
 * been allocated, every failure path releases it with sock_release().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {  // direct link-layer access
 		static int warned;
 		if (!warned) {
 			warned = 1;
 			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
 			       current->comm);
		}
 		family = PF_PACKET;
	}

 	err = security_socket_create(family, type, protocol, kern);
 	if (err)
 		return err;


 	/*
	 *	Allocate the socket and allow the family to set things up. if
 	 *	the protocol is 0, the family is instructed to select an appropriate
 	 *	default.
	 */
 	sock = sock_alloc();
 	if (!sock) {
 		net_warn_ratelimited("socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
 				   closest posix thing */
 	}

 	sock->type = type;

#ifdef CONFIG_MODULES
 	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	  * Otherwise module support will break!
 	 */
	if (rcu_access_pointer(net_families[family]) == NULL)
 		request_module("net-pf-%d", family);
#endif

        /* Look up the protocol family in net_families.
           Every family registers a net_proto_family entry in this array
           (see sock_register()).
        */
 	rcu_read_lock();
 	pf = rcu_dereference(net_families[family]);
 	err = -EAFNOSUPPORT;
 	if (!pf)
 		goto out_release;

        /*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	 if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
  	rcu_read_unlock();

 	err = pf->create(net, sock, protocol, kern); // each family supplies its own create()
  	if (err < 0)
		goto out_module_put;

 	/*
	  * Now to bump the refcnt of the [loadable] module that owns this
	  * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
 		goto out_module_busy;

	/*
	  * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
 	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
 	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	/* Clear ops before dropping the family module reference and releasing. */
 	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	/* Reached with the RCU read lock still held. */
	rcu_read_unlock();
	goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
绑定socket到文件描述符:

/*
 * sock_map_fd - attach @sock to a new file descriptor.
 *
 * Reserves an unused descriptor, creates the backing file with
 * sock_alloc_file() and installs it.  Returns the descriptor, or a negative
 * error code; on failure the reserved descriptor is given back.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	int fd;

	/* Reserve a free descriptor number. */
	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	/* Create the file that represents the socket. */
	newfile = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(newfile))) {
		put_unused_fd(fd);
		return PTR_ERR(newfile);
	}

	/* Associate the file with the descriptor (adds it to the open files). */
	fd_install(fd, newfile);
	return fd;
}

创建与socket相关联的文件:

/*
 * File operations installed on every socket file by sock_alloc_file().
 * Seeking is routed to no_llseek and open() to sock_no_open; the other
 * entries point at the socket-specific handlers.
 */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.open =		sock_no_open,	/* special open code to disallow open via /proc */
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};
上面结构是对文件的操作

/*
 * sock_alloc_file - create the struct file that represents @sock.
 * @sock:  socket to wrap
 * @flags: only O_NONBLOCK is carried over into file->f_flags
 * @dname: dentry name to use; if NULL, the name of the socket's protocol
 *         (sock->sk->sk_prot_creator->name) is used when sock->sk is set,
 *         otherwise the empty string
 *
 * Returns the new file, ERR_PTR(-ENOMEM) when no dentry could be allocated,
 * or the error pointer returned by alloc_file().
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
	struct qstr name = { .name = "" };
	struct path path;
	struct file *file;

	if (dname) {
		name.name = dname;
		name.len = strlen(name.name);
	} else if (sock->sk) {
		name.name = sock->sk->sk_prot_creator->name; // the name differs per protocol
		name.len = strlen(name.name);
	}
	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);  // allocate a dentry on the sockfs superblock
	if (unlikely(!path.dentry))
 		return ERR_PTR(-ENOMEM);
 	path.mnt = mntget(sock_mnt);  // take a reference on the sockfs mount

	d_instantiate(path.dentry, SOCK_INODE(sock));  // tie the dentry to the socket's inode
	SOCK_INODE(sock)->i_fop = &socket_file_ops;

        /* The socket's inode is now attached to a dentry, so it can be reached
           through a path; access happens through a file opened on that path. */
	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
		  &socket_file_ops);
 	if (unlikely(IS_ERR(file))) {
		 /* drop dentry, keep inode */
                /* Error path, which should almost never be taken.
                   Take an extra reference on the inode first, then drop the
                   path reference: the dentry goes away while the inode is kept.
                 */
                ihold(path.dentry->d_inode);
		path_put(&path); 
		return file;
	}

	sock->file = file;
	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
	file->private_data = sock;
	return file;
}
EXPORT_SYMBOL(sock_alloc_file);
前面提到每一个协议都要注册到一个全局数组 net_families中,这是通过下面的函数实现:

/*
 * Descriptor a protocol family hands to sock_register().
 */
struct net_proto_family {
    int        family;    /* index into net_families[] */
    int        (*create)(struct net *net, struct socket *sock, int protocol, int kern);    /* called by __sock_create() to set up a new socket */
    struct module    *owner;    /* module pinned around the ->create() call */
};

/* Serialises updates of net_families[] in sock_register()/sock_unregister(). */
static DEFINE_SPINLOCK(net_family_lock);
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;  // one slot per protocol family; NPROTO is the number of slots

/*
 * sock_register - publish a protocol family's socket-creation hooks.
 * @ops: descriptor to install in net_families[ops->family]
 *
 * Returns 0 on success, -ENOBUFS when ops->family lies outside [0, NPROTO),
 * or -EEXIST when that slot is already occupied.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err;

	/*
	 * family is a signed int used as an array index: reject negative
	 * values as well, as sock_unregister() and __sock_create() do.
	 */
	if (ops->family < 0 || ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d out of range [0, NPROTO(%d))\n",
		       ops->family, NPROTO);
		return -ENOBUFS;
	}

	spin_lock(&net_family_lock);
	if (rcu_dereference_protected(net_families[ops->family],
				      lockdep_is_held(&net_family_lock)))
		err = -EEXIST;
	else {
		rcu_assign_pointer(net_families[ops->family], ops);
		err = 0;
	}
	spin_unlock(&net_family_lock);

	/* Only announce the family when it was actually installed. */
	if (!err)
		printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}
EXPORT_SYMBOL(sock_register);
注销一个协议:

/*
 * sock_unregister - remove a protocol family from net_families[].
 * @family: slot to clear; anything outside [0, NPROTO) trips BUG_ON()
 *
 * The slot is cleared under net_family_lock, then synchronize_rcu() is
 * called before the function logs and returns.
 */
void sock_unregister(int family)
{
	BUG_ON(!(family >= 0 && family < NPROTO));

	spin_lock(&net_family_lock);
	RCU_INIT_POINTER(net_families[family], NULL);
	spin_unlock(&net_family_lock);

	/* Wait for RCU readers before returning. */
	synchronize_rcu();

	printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
}
EXPORT_SYMBOL(sock_unregister);

如:对IP协议(net/ipv4/af_inet.c)

/* Protocol-family descriptor for PF_INET; sockets are set up by inet_create(). */
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};

/*
 * Excerpt only: the "..." lines stand for code the article leaves out.
 * The part shown registers inet_family_ops with sock_register() and
 * deliberately discards the return value.
 */
static int __init inet_init(void)
{
    ...
    (void)sock_register(&inet_family_ops);
    ...
}
下面来看inet_create

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值