linux内核socket实现

http://blog.chinaunix.net/uid-20788636-id-4408276.html

1.2 sock_map_fd函数

         在用户空间创建了一个socket后,返回值是一个文件描述符,下面分析一下创建socket时是怎么和文件描述符联系起来的。在SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)的最后调用sock_map_fd进行关联,其中返回的retval就是用户空间获取的文件描述符fd,sock就是调用sock_create创建成功的socket。

         sock_map_fd()主要用于对socket的*file指针进行初始化,经过sock_map_fd()操作后,socket就通过其*file指针与VFS管理的文件进行了关联,便可以进行文件的各种操作,如read、write、lseek、ioctl等。

/* Only the O_CLOEXEC and O_NONBLOCK bits of flags are passed on to sock_map_fd(). */
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));

 

/*
 * Bind a socket to a newly reserved file descriptor.
 *
 * Reserves an unused fd according to @flags (see section 1.2.1), wraps
 * @sock in a struct file via sock_alloc_file(), and installs that file
 * into the reserved fd slot.
 *
 * Returns the fd on success.  On failure returns either the negative
 * value from get_unused_fd_flags(), or PTR_ERR() of the failed file
 * allocation after the reserved fd has been released again.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *sock_file;
	int fd;

	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	sock_file = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(sock_file))) {
		/* Hand the reserved descriptor back before reporting the error. */
		put_unused_fd(fd);
		return PTR_ERR(sock_file);
	}

	fd_install(fd, sock_file);
	return fd;
}

1.2.1   get_unused_fd_flags函数

         get_unused_fd_flags()函数调用__alloc_fd分配一个新的可用的fd

/*
 * Allocate a file descriptor number in @files.
 *
 * The search begins at @start (or at files->next_fd if that is larger).
 * If the candidate is >= @end the call fails with -EMFILE.  The chosen
 * fd is marked open and its close-on-exec bit is set or cleared
 * according to O_CLOEXEC in @flags.
 *
 * files->file_lock is taken on entry and released at the "out" label.
 * Returns the allocated fd, or a negative error code.
 */
int __alloc_fd(struct files_struct *files,

                unsigned start, unsigned end, unsigned flags)

{

         unsigned int fd;

         int error;

         struct fdtable *fdt;

 

         spin_lock(&files->file_lock);

repeat:

/* Fetch the fd table belonging to this files_struct */

         fdt = files_fdtable(files);

         fd = start;// begin at start; per the original note, start is 0 on this call path

/* files->next_fd is the next free descriptor recorded by an earlier search; it speeds up the lookup. If fd is below files->next_fd, jump straight to next_fd. */

         if (fd < files->next_fd)

                   fd = files->next_fd;

/* If fd is below the table's current max_fds, scan the open_fds bitmap starting at bit fd for the next zero bit, i.e. the next free descriptor. */

         if (fd < fdt->max_fds)

                   fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

 

         /*

          * N.B. For clone tasks sharing a files structure, this test

          * will limit the total number of files that can be opened.

          */

         error = -EMFILE;

         if (fd >= end)

                   goto out;

/* Grow the fd table if needed */

         error = expand_files(files, fd);

         if (error < 0)

                   goto out;

 

         /*

          * If we needed to expand the fs array we

          * might have blocked - try again.

          */

         if (error)

                   goto repeat;

    /*

     Update next_fd so the next search for a free fd is faster.

     When start is greater than next_fd, next_fd is left untouched (per the original note: to avoid gaps in the descriptor numbering).

     */

         if (start <= files->next_fd)

                   files->next_fd = fd + 1;

 

  /* Record fd in the set of open file descriptors */

         __set_open_fd(fd, fdt);

         if (flags & O_CLOEXEC)

                   __set_close_on_exec(fd, fdt);

         else

                   __clear_close_on_exec(fd, fdt);

         error = fd;

#if 1

         /* Sanity check */

         if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {

                   printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);

                   rcu_assign_pointer(fdt->fd[fd], NULL);

         }

#endif

 

out:

         spin_unlock(&files->file_lock);

         return error;

}

1.2.2 sock_alloc_file函数

/*
 * Create the struct file that represents @sock.
 *
 * Allocates a pseudo dentry on the sockfs mount, attaches the socket's
 * inode to it, and allocates a read/write file using socket_file_ops.
 * On success sock->file and file->private_data link the two objects.
 *
 * @dname: optional dentry name; when NULL the name of the socket's
 *         creating protocol is used if sock->sk is set.
 *
 * Returns the new file, ERR_PTR(-ENOMEM) if the dentry cannot be
 * allocated, or the error pointer returned by alloc_file().
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)

{

         struct qstr name = { .name = "" };

         struct path path;

         struct file *file;

 

         if (dname) {// dname is NULL when called from sock_map_fd()

                   name.name = dname;

                   name.len = strlen(name.name);

         } else if (sock->sk) {

 /* For a TCP socket the name should be "TCP", taken from .name in struct proto tcp_prot */

                   name.name = sock->sk->sk_prot_creator->name;

                   name.len = strlen(name.name);

         }

/* Allocate a new dentry on sock_mnt->mnt_sb; per the article's earlier analysis, sock_mnt is the mount point of the sock_fs_type filesystem */

         path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);

         if (unlikely(!path.dentry))

                   return ERR_PTR(-ENOMEM);

         path.mnt = mntget(sock_mnt);

/* Attach the socket's inode to the dentry and set socket_file_ops as the inode's file operations. Per the article's earlier analysis, the dentry operations (sockfs_dentry_operations) are set up in sockfs_mount(), which runs as part of sock_init(). */

         d_instantiate(path.dentry, SOCK_INODE(sock));

         SOCK_INODE(sock)->i_fop = &socket_file_ops;

/* Allocate a new struct file, tying the path and the file operations together */

         file = alloc_file(&path, FMODE_READ | FMODE_WRITE,

                     &socket_file_ops);

         if (unlikely(IS_ERR(file))) {

                   /* drop dentry, keep inode */

                   ihold(path.dentry->d_inode);

                   path_put(&path);

                   return file;

         }

 

         sock->file = file;// link sock->file to the newly allocated file

         file->f_flags = O_RDWR | (flags & O_NONBLOCK);// file flags: always O_RDWR, plus O_NONBLOCK if requested

         file->private_data = sock;// the file's private data points back at the socket

         return file;

}

Socket创建流程图

附录:对于sk_alloc分配的内存大小问题分析

         在分析中经常看到这种类型的强制转换:inet = inet_sk(sk);,其中inet被定义为struct inet_sock *inet;。从结构体的定义来看,struct sock的大小小于struct inet_sock,按理说是无法直接进行这种强制类型转换的;但在实际分配的过程中,sk是按prot->obj_size分配的(对TCP而言就是tcp_sock的大小),而该结构足够大,因此转换是安全的。

/*
 * Allocate a struct sock for protocol @prot and do the basic setup.
 *
 * The memory comes from sk_prot_alloc() with __GFP_ZERO added to
 * @priority.  On success the family, protocol pointers, lock, network
 * namespace, initial sk_wmem_alloc count and the classid / netprioidx
 * values are initialised.
 *
 * Returns the new sock, or NULL if the allocation failed.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	if (!newsk)
		return NULL;

	newsk->sk_family = family;

	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	newsk->sk_prot_creator = prot;
	newsk->sk_prot = prot;

	sock_lock_init(newsk);
	sock_net_set(newsk, get_net(net));
	atomic_set(&newsk->sk_wmem_alloc, 1);

	sock_update_classid(newsk);
	sock_update_netprioidx(newsk);

	return newsk;
}

 

/*
 * Allocate the raw memory for a sock of protocol @prot.
 *
 * There are two allocation paths, marked (1) and (2) below to match the
 * discussion in the article:
 *   (1) prot->slab is set: allocate from that slab cache.  __GFP_ZERO is
 *       stripped from @priority for the allocation itself; if the caller
 *       asked for zeroing, the object is cleared afterwards through
 *       prot->clear_sk() when provided, else sk_prot_clear_nulls().
 *   (2) no slab cache: plain kmalloc() of prot->obj_size bytes.
 *
 * After a successful allocation the security hook is run and a reference
 * on prot->owner is taken; if either step fails the object is freed.
 *
 * Returns the new sock, or NULL on any failure.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/* (1) allocate from the protocol's slab cache */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else {
		/* (2) ordinary allocation of prot->obj_size bytes */
		sk = kmalloc(prot->obj_size, priority);
	}

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	/* Release through the same allocator the object came from. */
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

1)第一种情况:sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO)。这里的slab来自slab = prot->slab;,prot也就是函数传递过来的struct proto *prot。再看一下这个结构体是怎么来的:在inet_create函数中调用sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);,这里的answer_prot来自answer_prot = answer->prot;。再看一下answer->prot是如何来的?

         在inet_create函数中,通过遍历inetsw数组获取到struct inet_protosw *answer;

/*
 * Excerpt from inet_create(): walk the inetsw list for this socket type
 * looking for an entry that matches the requested protocol.  err is 0
 * whenever the loop breaks out on a match, and -EPROTONOSUPPORT after
 * any iteration that did not match.
 */
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

 

                   err = 0;

                   /* Check the non-wild match. */

                   if (protocol == answer->protocol) {

                            if (protocol != IPPROTO_IP)

                                     break;

                   } else {

                            /* Check for the two wild cases. */

                            if (IPPROTO_IP == protocol) {

                                     protocol = answer->protocol;

                                     break;

                            }

                            if (IPPROTO_IP == answer->protocol)

                                     break;

                   }

                   err = -EPROTONOSUPPORT;

         }

         其中inetsw各链表中的元素来自下面这个inetsw_array数组,如果是SOCK_STREAM类型的socket,这里的prot = &tcp_prot

/*
 * Table of the built-in inet_protosw entries.  Each entry pairs a socket
 * type and protocol number with the struct proto (.prot) and the
 * proto_ops (.ops) used for that combination.
 */
static struct inet_protosw inetsw_array[] =

{

         {

                   .type =       SOCK_STREAM,

                   .protocol =   IPPROTO_TCP,

                  .prot =       &tcp_prot,

                   .ops =        &inet_stream_ops,

                   .no_check =   0,

                   .flags =      INET_PROTOSW_PERMANENT |

                                  INET_PROTOSW_ICSK,

         },

 

         {

                   .type =       SOCK_DGRAM,

                   .protocol =   IPPROTO_UDP,

                   .prot =       &udp_prot,

                   .ops =        &inet_dgram_ops,

                   .no_check =   UDP_CSUM_DEFAULT,

                   .flags =      INET_PROTOSW_PERMANENT,

       },

 

       {

                   .type =       SOCK_DGRAM,

                   .protocol =   IPPROTO_ICMP,

                   .prot =       &ping_prot,

                   .ops =        &inet_dgram_ops,

                   .no_check =   UDP_CSUM_DEFAULT,

                   .flags =      INET_PROTOSW_REUSE,

       },

 

       {

                .type =       SOCK_RAW,

                .protocol =   IPPROTO_IP,       /* wild card */

                .prot =       &raw_prot,

                .ops =        &inet_sockraw_ops,

                .no_check =   UDP_CSUM_DEFAULT,

                .flags =      INET_PROTOSW_REUSE,

       }

};

         再看一下tcp_prot的定义:

/*
 * struct proto instance for TCP: the protocol name, the table of TCP
 * handlers, and the memory-accounting fields.  The .obj_size field is
 * the per-socket allocation size that sk_prot_alloc() and
 * proto_register() read as prot->obj_size.
 */
struct proto tcp_prot = {

         .name                         = "TCP",

         .owner                        = THIS_MODULE,

         .close                          = tcp_close,

         .connect            = tcp_v4_connect,

         .disconnect                = tcp_disconnect,

         .accept                       = inet_csk_accept,

         .ioctl                            = tcp_ioctl,

         .init                     = tcp_v4_init_sock,

         .destroy            = tcp_v4_destroy_sock,

         .shutdown                 = tcp_shutdown,

         .setsockopt               = tcp_setsockopt,

         .getsockopt               = tcp_getsockopt,

         .recvmsg           = tcp_recvmsg,

         .sendmsg                   = tcp_sendmsg,

         .sendpage                  = tcp_sendpage,

         .backlog_rcv              = tcp_v4_do_rcv,

         .release_cb               = tcp_release_cb,

         .mtu_reduced          = tcp_v4_mtu_reduced,

         .hash                           = inet_hash,

         .unhash                      = inet_unhash,

         .get_port          = inet_csk_get_port,

         .enter_memory_pressure       = tcp_enter_memory_pressure,

         .stream_memory_free    = tcp_stream_memory_free,

         .sockets_allocated  = &tcp_sockets_allocated,

         .orphan_count                   = &tcp_orphan_count,

         .memory_allocated = &tcp_memory_allocated,

         .memory_pressure = &tcp_memory_pressure,

         .sysctl_mem             = sysctl_tcp_mem,

         .sysctl_wmem          = sysctl_tcp_wmem,

         .sysctl_rmem            = sysctl_tcp_rmem,

         .max_header            = MAX_TCP_HEADER,

         .obj_size           = sizeof(struct tcp_sock), /* every TCP sock is allocated at the size of struct tcp_sock */

         .slab_flags                 = SLAB_DESTROY_BY_RCU,

         .twsk_prot                 = &tcp_timewait_sock_ops,

         .rsk_prot           = &tcp_request_sock_ops,

         .h.hashinfo                = &tcp_hashinfo,

         .no_autobind            = true,

#ifdef CONFIG_COMPAT

         .compat_setsockopt        = compat_tcp_setsockopt,

         .compat_getsockopt        = compat_tcp_getsockopt,

#endif

#ifdef CONFIG_MEMCG_KMEM

         .init_cgroup               = tcp_init_cgroup,

         .destroy_cgroup                = tcp_destroy_cgroup,

         .proto_cgroup          = tcp_proto_cgroup,

#endif

};

         在af_inet.c文件的inet_init函数中,通过proto_register注册了tcp_prot:

/*
 * Initialise the IPv4 protocol family.
 *
 * Allocates the reserved-ports bitmap, registers the TCP, UDP, raw and
 * ping struct proto instances, registers the inet socket family, adds
 * the base network protocols, fills the inetsw lists from inetsw_array,
 * and then calls the init routines of the individual subsystems.
 *
 * Returns 0 on success.  If the bitmap allocation fails the initial
 * -EINVAL is returned; if a proto_register() call fails, the protos
 * registered so far are unregistered, the bitmap is freed and that
 * error code is returned.
 */
static int __init inet_init(void)

{

         struct inet_protosw *q;

         struct list_head *r;

         int rc = -EINVAL;

         BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));

         sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);

         if (!sysctl_local_reserved_ports)

                   goto out;

    // Register tcp_prot; with alloc_slab set, proto_register() creates the slab cache stored in tcp_prot->slab

         rc = proto_register(&tcp_prot, 1);

         if (rc)

                   goto out_free_reserved_ports;

         rc = proto_register(&udp_prot, 1);

         if (rc)

                   goto out_unregister_tcp_proto;

         rc = proto_register(&raw_prot, 1);

         if (rc)

                   goto out_unregister_udp_proto;

         rc = proto_register(&ping_prot, 1);

         if (rc)

                   goto out_unregister_raw_proto;

         /*

          *     Tell SOCKET that we are alive...

          */

         (void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL

         ip_static_sysctl_init();

#endif

         /*

          *     Add all the base protocols.

          */

         if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)

                   pr_crit("%s: Cannot add ICMP protocol\n", __func__);

         if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)

                   pr_crit("%s: Cannot add UDP protocol\n", __func__);

         if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)

                   pr_crit("%s: Cannot add TCP protocol\n", __func__);

#ifdef CONFIG_IP_MULTICAST

         if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)

                   pr_crit("%s: Cannot add IGMP protocol\n", __func__);

#endif

         /* Register the socket-side information for inet_create. Initialise every inetsw list head. */

         for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)

                   INIT_LIST_HEAD(r);

/* Add each inetsw_array entry to the corresponding inetsw list so that inet_create() can walk it */

         for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)

                   inet_register_protosw(q);

         /*

          *     Set the ARP module up

          */

         arp_init();

         /*

          *     Set the IP module up

          */

         ip_init();

         tcp_v4_init();

         /* Setup TCP slab cache for open requests. */

         tcp_init();

         /* Setup UDP memory threshold */

         udp_init();

         /* Add UDP-Lite (RFC 3828) */

         udplite4_register();

         ping_init();

         /*

          *     Set the ICMP layer up

          */

         if (icmp_init() < 0)

                   panic("Failed to create the ICMP control socket.\n");

         /*

          *     Initialise the multicast router

          */

#if defined(CONFIG_IP_MROUTE)

         if (ip_mr_init())

                   pr_crit("%s: Cannot init ipv4 mroute\n", __func__);

#endif

         /*

          *     Initialise per-cpu ipv4 mibs

          */

         if (init_ipv4_mibs())

                   pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

         ipv4_proc_init();

         ipfrag_init();

         dev_add_pack(&ip_packet_type);

         rc = 0;

out:

         return rc;

out_unregister_raw_proto:

         proto_unregister(&raw_prot);

out_unregister_udp_proto:

         proto_unregister(&udp_prot);

out_unregister_tcp_proto:

         proto_unregister(&tcp_prot);

out_free_reserved_ports:

         kfree(sysctl_local_reserved_ports);

         goto out;

}

         在proto_register函数中,主要关注对prot->slab的初始化。

/*
 * Excerpt of proto_register() (the remainder is elided by the article).
 * When @alloc_slab is non-zero, a slab cache named prot->name with
 * objects of prot->obj_size bytes is created and stored in prot->slab.
 */
int proto_register(struct proto *prot, int alloc_slab)

{

         if (alloc_slab) {

                   prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,

                                               SLAB_HWCACHE_ALIGN | prot->slab_flags,

                                               NULL);// for tcp_prot, prot->obj_size is the .obj_size = sizeof(struct tcp_sock) initialiser

                   if (prot->slab == NULL) {

                            pr_crit("%s: Can't create sock SLAB cache!\n",

                                     prot->name);

                            goto out;

                   }

……………………..

}

2)对于第二种情况(即代码中标记2处的sk = kmalloc(prot->obj_size, priority);),主要看prot->obj_size,它就是struct proto tcp_prot中初始化的.obj_size = sizeof(struct tcp_sock)。

         下面是五个相关的数据结构,其中tcp_sock结构体占用的空间是最大的。对TCP套接字来说,在分配内存空间时是按tcp_sock的大小(即prot->obj_size)分配的,这样在后面进行强制转换的过程中可以保证正确。


相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页