Socket Kernel Source Chapter03 socket

3 socket()

3.1 sys_socket

包括三个主要函数,sock_create()负责创建socket分配内存并初始化, sock_map_fd()负责分配struct file*, sock_release()负责释放内存。

 

进入sys_socket后,调用sock_create,传入family、type、protocol(就是前面的a0、a1、a2),由sock_create创建struct socket结构

/*
 * sys_socket - entry point of the socket() system call.
 *
 * Creates a struct socket with sock_create() and binds it to a new file
 * descriptor with sock_map_fd().  Returns the descriptor on success or a
 * negative errno.  If the descriptor step fails, the freshly created
 * socket is released again with sock_release().
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
        struct socket *sock;
        int err;
        int fd;

        /* Step 1: allocate and initialise the socket itself. */
        err = sock_create(family, type, protocol, &sock);
        if (err < 0)
                return err;

        /* Step 2: attach the socket to a struct file and a descriptor. */
        fd = sock_map_fd(sock);
        if (fd < 0)
                sock_release(sock);     /* undo step 1 on failure */

        return fd;
}

 

 

3.2 sock_create

负责创建struct socket,并初始化,包括sock_alloc()负责内存分配和初始化,pf->create()负责根据协议族初始化。

 

直接调用__sock_create,添加了当前进程的struct net*参数,net_proto_family *pf->create()时需要此参数。其它参数直接传递。 

/*
 * sock_create - create a socket in the calling process's network namespace.
 *
 * Looks up the namespace through current->nsproxy and forwards every other
 * argument unchanged to __sock_create(), with the trailing "kern" argument
 * set to 0.  Returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
        struct net *caller_net = current->nsproxy->net_ns;

        return __sock_create(caller_net, family, type, protocol, res, 0);
}

 

3.2.1 struct nsproxy 

其中nsproxy用来保存进程所属的各个命名空间,包括mnt、uts、ipc、pid、user、net等;count是引用计数,记录有多少个进程共享同一个nsproxy。结构如下:

/*
 * Bundle of per-process namespace pointers.  sock_create() reads net_ns
 * through current->nsproxy to find the caller's network namespace.
 */
struct nsproxy {
    atomic_t count;                     /* reference counter; presumably the number of users sharing this nsproxy */
    struct uts_namespace *uts_ns;
    struct ipc_namespace *ipc_ns;
    struct mnt_namespace *mnt_ns;
    struct pid_namespace *pid_ns;
    struct user_namespace *user_ns;
    struct net           *net_ns;       /* network namespace handed to __sock_create() */
}; 

 

3.3 __sock_create

真正调用的是__sock_create函数:先用sock_alloc分配内存空间,然后根据协议族(family)调用对应的create函数;当协议族是PF_INET(TCP/IP)时,这个create就是inet_create,由它完成struct socket的协议相关初始化

static int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern)
{
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

       

         //检查协议族类型范围

        if (family < 0 || family >= NPROTO)
                return -EAFNOSUPPORT;
        if (type < 0 || type >= SOCK_MAX)
                return -EINVAL;

 

         //兼容性考虑,当协议族为PF_INET,协议是SOCK_PACKET时,协议族强行改为PF_PACKET,避免模块加载时出现死锁(?)

        if (family == PF_INET && type == SOCK_PACKET) {
            static int warned;
            if (!warned) {
                    warned = 1;
                    printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)/n", current->comm);
            }
            family = PF_PACKET;
        }

        err = security_socket_create(family, type, protocol, kern);    //安全方面的,先忽略
        if (err)
                return err;

        
        sock = sock_alloc();             //分配socket,内部创建新inode并关联 3.3
        if (!sock) {
                if (net_ratelimit())          //添加模块接口,可以进行扩展处理,默认是不做处理直接返回0,
                        printk(KERN_WARNING "socket: no more sockets/n");
                return -ENFILE; /* Not exactly a match, but its the closest posix thing */
        }

        sock->type = type;           //协议类型

 

#if defined(CONFIG_KMOD)
        //模块需已经加载上
        if (net_families[family] == NULL)  //如果相应的协议族的模块不存在,就报错
                request_module("net-pf-%d", family);
#endif

        rcu_read_lock();                   //读锁

        //rcu_dereference能够安全的获取需要的协议族的指针,其实就是pf = net_familits[family],获取其指针
        pf = rcu_dereference(net_families[family]);        

        err = -EAFNOSUPPORT;  //预先分配错误号,
        if (!pf)
                goto out_release;

 

         //由于协议族可能是从模块上加载的,因此要先检查其模块是否仍然存活,如果仍然存活,则计数增加,否则返回失败
         if (!try_module_get(pf->owner))      //增加此协议族模块,在此cpu上的计数
                goto out_release;

 

        //解开读锁
        rcu_read_unlock();    

        err = pf->create(net, sock, protocol);    //不同协议族调用不同的创建函数: 3.7

        if (err < 0)
                goto out_module_put;

       
        if (!try_module_get(sock->ops->owner))  //增加此协议的函数指针的模块在此cpu上的计数 3.8
                goto out_module_busy;


        module_put(pf->owner); //对应 if (!try_module_get(pf->owner))  
        err = security_socket_post_create(sock, family, type, protocol, kern);    //安全控制,忽略 
        if (err)
                goto out_sock_release;
        *res = sock;

        return 0;

out_module_busy:
        err = -EAFNOSUPPORT;
out_module_put:
        sock->ops = NULL;
        module_put(pf->owner); //对应 if (!try_module_get(pf->owner))  ,减少计数  3.9
out_sock_release:
        sock_release(sock);
        return err;
out_release:
        rcu_read_unlock();
        goto out_sock_release;
}
 

3.3.1 net_ratelimit

其中net_ratelimit用于限制内核网络代码打印日志的频率:返回非0表示本次允许打印,返回0表示应当丢弃这条消息。限速参数由net_msg_cost(5*HZ)和net_msg_burst(10)给出。

/* Rate-limit parameters that net_ratelimit() hands to __printk_ratelimit(). */
int net_msg_cost  __read_mostly = 5*HZ;
int net_msg_burst  __read_mostly = 10;

/*
 * net_ratelimit - ask whether a networking log message may be printed now.
 *
 * Forwards net_msg_cost and net_msg_burst to __printk_ratelimit() and
 * returns its verdict; callers print only when the result is non-zero.
 */
int net_ratelimit(void)
{
    int may_print = __printk_ratelimit(net_msg_cost, net_msg_burst);

    return may_print;
}

EXPORT_SYMBOL(net_ratelimit);

在没有打印支持的配置下(未启用CONFIG_PRINTK),__printk_ratelimit是一个空实现,直接返回0,也就是任何消息都不会被打印。

/*
 * Stub variant of __printk_ratelimit(): ignores both arguments and always
 * returns 0, so a caller of the form "if (net_ratelimit()) printk(...)"
 * never prints.  Presumably the variant used when printk is compiled out.
 */
static inline int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
{
        (void)ratelimit_jiffies;
        (void)ratelimit_burst;
        return 0;
}

 

3.3.2 rcu_dereference

其中rcu_dereference用于安全地获取受RCU保护的指针p。ACCESS_ONCE防止编译器对这次读取做合并或重复读取之类的优化;smp_read_barrier_depends是数据依赖读屏障,保证先读到指针p,再通过p读取它所指向的数据

/*
 * rcu_dereference - read an RCU-protected pointer for later dereferencing.
 *
 * Takes one snapshot of @p through ACCESS_ONCE(), issues
 * smp_read_barrier_depends(), and evaluates to the snapshot.  The macro is
 * a GNU statement expression, so every line except the last must end in a
 * backslash continuation.
 */
#define rcu_dereference(p)     ({ \
                                typeof(p) _________p1 = ACCESS_ONCE(p); \
                                smp_read_barrier_depends(); \
                                (_________p1); \
                                })

其中ACCESS_ONCE借助volatile强制编译器每次都真正从内存中读取一次x的值,不能使用寄存器中缓存的旧值,也不能把多次读取合并或优化掉。注意volatile只约束编译器,并不保证多处理器之间的内存访问顺序,顺序要靠上面的smp_read_barrier_depends这类内存屏障来保证。

/*
 * ACCESS_ONCE - access @x through a volatile-qualified lvalue of the same
 * type, so the compiler must perform the access as written.
 * @x must be an lvalue because its address is taken.
 */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

 

3.3.3 try_module_get

而try_module_get用于获取模块的引用:module为NULL时直接返回成功;否则检查模块是否仍然存活(没有处于卸载过程中),存活则将其在当前cpu上的引用计数加1并返回1,否则返回0表示失败。

static inline int try_module_get(struct module *module)
{
        int ret = 1;

        if (module) {
                unsigned int cpu = get_cpu();
                if (likely(module_is_live(module)))
                        local_inc(&module->ref[cpu].count);
                else
                        ret = 0;
                put_cpu();
        }
        return ret;
}

而module_is_live很简单,就是检查其状态字。
/*
 * module_is_live - report whether @mod may still be used.
 * Returns 0 when mod->state is MODULE_STATE_GOING and 1 for every other
 * state.
 */
static inline int module_is_live(struct module *mod)
{
        if (mod->state == MODULE_STATE_GOING)
                return 0;
        return 1;
}

 

 

3.4 sock_alloc

 

sock_alloc分配socket空间, 需要申请inode,实际调用的函数是new_inode,并且最终调用alloc_inode() 
static struct socket *sock_alloc(void)
{
        struct inode *inode;
        struct socket *sock;

        inode = new_inode(sock_mnt->mnt_sb);    //初始化inode
        if (!inode)
                return NULL;

        sock = SOCKET_I(inode);                         //从初始化好的inode中提取出sock指针,用于后面的设置

        inode->i_mode = S_IFSOCK | S_IRWXUGO;
        inode->i_uid = current->fsuid;
        inode->i_gid = current->fsgid;

        get_cpu_var(sockets_in_use)++;
        put_cpu_var(sockets_in_use);
        return sock;
}

 

3.5 new_inode

分配内存空间,并且对inode初始化,加入全局队列inode_in_use

struct inode *new_inode(struct super_block *sb)
{
        static unsigned int last_ino;
        struct inode * inode;

        spin_lock_prefetch(&inode_lock); //上锁,预计上锁(?),待查
        
        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
                inodes_stat.nr_inodes++;
                list_add(&inode->i_list, &inode_in_use); //加入使用队列,全局队列,定义为 LIST_HEAD(inode_in_use);
                list_add(&inode->i_sb_list, &sb->s_inodes);
                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
        }
        return inode;
}

EXPORT_SYMBOL(new_inode);

 

3.6 alloc_inode

真正的分配空间

/*
 * alloc_inode - allocate one inode for superblock @sb and set its defaults.
 *
 * Memory comes from sb->s_op->alloc_inode() when the filesystem provides
 * that hook, otherwise from the inode_cachep slab.  Returns the new inode,
 * or NULL if the allocation fails or security_inode_alloc() rejects it (in
 * which case the inode is freed again before returning).
 */
static struct inode *alloc_inode(struct super_block *sb)
{
        /* Shared, all-zero operation tables used as initial defaults. */
        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
        struct inode *inode;

        if (sb->s_op->alloc_inode)   
                inode = sb->s_op->alloc_inode(sb); /* filesystem-specific allocator */
        else
                inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); /* generic slab allocation */

        if (inode) {

                /* The inode's embedded address_space (i_data) is set up further down. */
                struct address_space * const mapping = &inode->i_data;

                inode->i_sb = sb;
                inode->i_blkbits = sb->s_blocksize_bits;
                inode->i_flags = 0;
                atomic_set(&inode->i_count, 1);         /* caller holds the first reference */
                inode->i_op = &empty_iops;
                inode->i_fop = &empty_fops;
                inode->i_nlink = 1;
                atomic_set(&inode->i_writecount, 0);
                inode->i_size = 0;
                inode->i_blocks = 0;
                inode->i_bytes = 0;
                inode->i_generation = 0;
#ifdef CONFIG_QUOTA
                memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
                inode->i_pipe = NULL;
                inode->i_bdev = NULL;
                inode->i_cdev = NULL;
                inode->i_rdev = 0;
                inode->dirtied_when = 0;
                /* Security hook failed: free through the same path that allocated. */
                if (security_inode_alloc(inode)) {
                        if (inode->i_sb->s_op->destroy_inode)
                                inode->i_sb->s_op->destroy_inode(inode);
                        else
                                kmem_cache_free(inode_cachep, (inode));
                        return NULL;
                }

                /* Locks get per-filesystem-type lockdep classes from sb->s_type. */
                spin_lock_init(&inode->i_lock);
                lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

                mutex_init(&inode->i_mutex);
                lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);

                init_rwsem(&inode->i_alloc_sem);
                lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);

                mapping->a_ops = &empty_aops;
                mapping->host = inode;
                mapping->flags = 0;
                mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
                mapping->assoc_mapping = NULL;
                mapping->backing_dev_info = &default_backing_dev_info;

                /*
                 * If the block_device provides a backing_dev_info for client
                 * inodes then use that.  Otherwise the inode share the bdev's
                 * backing_dev_info.
                 */
                if (sb->s_bdev) {
                        struct backing_dev_info *bdi;

                        bdi = sb->s_bdev->bd_inode_backing_dev_info;
                        if (!bdi)
                                bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
                        mapping->backing_dev_info = bdi;
                }
                inode->i_private = NULL;
                inode->i_mapping = mapping;
        }
        return inode;
}
 

3.7 SOCKET_I

从inode中获取socket地址

static inline struct socket *SOCKET_I(struct inode *inode)
{
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

宏container_of定义如下 

/*
 * container_of - recover the enclosing structure from a pointer to one of
 * its members.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing structure
 * @member: name of the member inside @type
 *
 * __mptr type-checks @ptr against the member's type; the result is @ptr
 * moved back by the member's offset.  Each continued line must end in a
 * backslash.
 */
#define container_of(ptr, type, member) ({                  \
    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
    (type *)( (char *)__mptr - offsetof(type,member) );})
/* offsetof - byte offset of MEMBER from the start of TYPE. */
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
可以看出宏container_of(ptr, type, member)的作用是:已知ptr指向某个type结构体内部的member成员,用ptr减去member在type中的偏移量,反推出这个type结构体本身的起始地址,

struct type

{

     //....其它成员变量

     member* ptr;

}

/*
 * A socket and its inode allocated as one object, which is what lets
 * SOCKET_I() go from the inode to the socket with container_of().
 */
struct socket_alloc {
        struct socket socket;           /* returned by SOCKET_I() */
        struct inode vfs_inode;         /* the inode handed to SOCKET_I() */
};

 


 

3.8 inet_create

 

每个协议族有自己的创建方法, net_proto_family->create方法

/*
 * Registration record of one protocol family.  __sock_create() looks it up
 * in net_families[family] and calls ->create on the new socket.
 */
struct net_proto_family {
        int             family;         /* protocol family number, e.g. PF_INET */
        int             (*create)(struct net *net, struct socket *sock, int protocol);  /* family-specific socket setup */
        struct module   *owner;         /* module pinned with try_module_get() around ->create */
};

 

inet_family_ops是struct net_proto_family结构体的实例,被初始化为:

/* PF_INET registration record: sockets of this family are set up by inet_create(). */
static struct net_proto_family inet_family_ops = {
 .family = PF_INET,
 .create = inet_create,
 .owner = THIS_MODULE,
};


因此最后调用inet_create

创建struct sock,并初始化TCP/IP协议相关数据,成功后,返回应用层,创建socket结束。

/*
 * inet_create - PF_INET implementation of net_proto_family->create.
 *
 * Finds the inet_protosw entry matching sock->type and @protocol, copies
 * its ops into sock->ops, allocates a struct sock with sk_alloc() and
 * initialises the inet_sock fields.  Returns 0 on success or a negative
 * errno.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;
        struct list_head *p;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        char answer_no_check;
        int try_loading_module = 0;
        int err;

        /* Only the initial network namespace (the global init_net) is accepted. */
        if (net != &init_net)
                return -EAFNOSUPPORT;

        /* NOTE(review): build_ehash_secret() presumably seeds inet_ehash_secret on first use; its body is not shown here. */
        if (sock->type != SOCK_RAW &&sock->type != SOCK_DGRAM && !inet_ehash_secret)
                build_ehash_secret();

        sock->state = SS_UNCONNECTED;    /* a new socket starts unconnected */

       

        /* Look for the inet_protosw entry matching the (type, protocol) pair. */
        answer = NULL;
lookup_protocol:
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();

        /* Walk the protocols registered for this socket type. */
        list_for_each_rcu(p, &inetsw[sock->type]) {
                answer = list_entry(p, struct inet_protosw, list);

                /* Exact match counts only when a concrete protocol (not IPPROTO_IP) was requested. */
                if (protocol == answer->protocol) {
                        if (protocol != IPPROTO_IP)
                                break;
                } else {
                        /* Wildcard request: adopt the protocol of the first entry found. */
                        if (IPPROTO_IP == protocol) {
                                protocol = answer->protocol;
                                break;
                        }
                        /* Wildcard entry: it accepts any requested protocol. */
                        if (IPPROTO_IP == answer->protocol)
                                break;
                }
                /* The list is non-empty but this entry did not match. */
                err = -EPROTONOSUPPORT;
                answer = NULL;
        }

        if (unlikely(answer == NULL)) {
                if (try_loading_module < 2) {
                        rcu_read_unlock();
                        /*
                         * No match: ask for a module, first by
                         * family/protocol/type and on the second attempt by
                         * family/protocol only, then search again.
                         */

                        if (++try_loading_module == 1)
                                request_module("net-pf-%d-proto-%d-type-%d",
                                               PF_INET, protocol, sock->type);
                       else
                                request_module("net-pf-%d-proto-%d",
                                               PF_INET, protocol);
                        goto lookup_protocol;
                } else
                        goto out_rcu_unlock;
        }

        /* Entries with a capability requirement are refused to callers lacking it. */
        err = -EPERM;
        if (answer->capability > 0 && !capable(answer->capability))
                goto out_rcu_unlock;

        /*
         * Copy everything needed out of the entry; answer is not touched
         * again after rcu_read_unlock().
         */
        sock->ops = answer->ops;
        answer_prot = answer->prot;
        answer_no_check = answer->no_check;
        answer_flags = answer->flags;
        rcu_read_unlock();

        BUG_TRAP(answer_prot->slab != NULL);

 

        /* Allocate the struct sock for this protocol. */

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
        if (sk == NULL)
                goto out;

        err = 0;
        sk->sk_no_check = answer_no_check;
        if (INET_PROTOSW_REUSE & answer_flags)
                sk->sk_reuse = 1;

 

        /* Set up the inet_sock view of sk. */

        inet = inet_sk(sk);
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

        /* Raw sockets record the protocol number in inet->num. */
        if (SOCK_RAW == sock->type) {
                inet->num = protocol;
                if (IPPROTO_RAW == protocol)
                        inet->hdrincl = 1;
        }

        if (ipv4_config.no_pmtu_disc)
                inet->pmtudisc = IP_PMTUDISC_DONT;
        else
                inet->pmtudisc = IP_PMTUDISC_WANT;

        inet->id = 0;

        /* Generic defaults; also links sock and sk to each other. */
        sock_init_data(sock, sk);

        /* Overrides applied after sock_init_data() has set the generic values. */
        sk->sk_destruct    = inet_sock_destruct;
        sk->sk_family      = PF_INET;
        sk->sk_protocol    = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

        /* Default unicast TTL marker and multicast settings. */
        inet->uc_ttl    = -1;
        inet->mc_loop   = 1;
        inet->mc_ttl    = 1;
        inet->mc_index  = 0;
        inet->mc_list   = NULL;

        sk_refcnt_debug_inc(sk);

        if (inet->num) {
                /* It assumes that any protocol which allows
                 * the user to assign a number at socket
                 * creation time automatically
                 * shares.
                 */
                inet->sport = htons(inet->num);
                /* Add to protocol hash chains. */
                sk->sk_prot->hash(sk);
        }

        /* Optional protocol-specific init; on failure the sock is released. */
        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err)
                        sk_common_release(sk);
        }
out:
        return err;
out_rcu_unlock:
        rcu_read_unlock();
        goto out;
}

3.8.1 inetsw

/*
 * One list head per socket type.  inet_create() walks inetsw[sock->type]
 * and treats each entry as a struct inet_protosw.
 */
static struct list_head inetsw[SOCK_MAX];

 

3.9 sk_alloc

/*
 * sk_alloc - allocate and minimally initialise a struct sock.
 *
 * Allocates zeroed memory through sk_prot_alloc() (priority | __GFP_ZERO),
 * records the family and protocol, initialises the socket lock and takes a
 * reference on @net.  Returns NULL if the allocation fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
        struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

        if (!sk)
                return NULL;

        sk->sk_family = family;
        sk->sk_prot_creator = prot;
        sk->sk_prot = prot;
        sock_lock_init(sk);             /* set up sk->sk_lock */
        sk->sk_net = get_net(net);      /* holds a reference on the namespace */

        return sk;
}

 

3.9.1 sk_prot_alloc

真正进行内存分配

/*
 * sk_prot_alloc - raw memory allocation for a struct sock.
 *
 * Allocates from prot->slab when the protocol has one, otherwise with
 * kmalloc(prot->obj_size).  The security hook must accept the object and
 * the protocol's owning module must be pinnable; if either step fails the
 * memory is freed again and NULL is returned.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct kmem_cache *cache = prot->slab;
        struct sock *sk;

        if (cache)
                sk = kmem_cache_alloc(cache, priority);
        else
                sk = kmalloc(prot->obj_size, priority);
        if (!sk)
                return NULL;

        if (security_sk_alloc(sk, family, priority) == 0) {
                if (try_module_get(prot->owner))
                        return sk;
                /* Module is going away: undo the security allocation. */
                security_sk_free(sk);
        }

        /* Free through the same allocator that produced the object. */
        if (cache)
                kmem_cache_free(cache, sk);
        else
                kfree(sk);
        return NULL;
}

3.9.2 get_net

增加struct net的计数
static inline struct net *get_net(struct net *net)
{
        atomic_inc(&net->count);
        return net;
}

3.10 inet_sk

类型强制转换为inet_sock
/*
 * inet_sk - view a struct sock as the struct inet_sock that contains it.
 * This is a plain pointer cast (it also drops the const qualifier); it
 * relies on struct sock being the first member of struct inet_sock.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
        struct inet_sock *inet = (struct inet_sock *)sk;

        return inet;
}

 

inet_sock是代表了网络层的socket

/*
 * INET-level socket state.  The embedded struct sock comes first, which is
 * what makes the pointer cast in inet_sk() valid.
 */
struct inet_sock {
        /* sk and pinet6 has to be the first two members of inet_sock */
        struct sock             sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        struct ipv6_pinfo       *pinet6;
#endif
        /* Socket demultiplex comparisons on incoming packets. */
        __be32                  daddr;
        __be32                  rcv_saddr;
        __be16                  dport;
        __u16                   num;            /* inet_create() stores the protocol number here for SOCK_RAW */
        __be32                  saddr;
        __s16                   uc_ttl;         /* inet_create() sets -1 */
        __u16                   cmsg_flags;
        struct ip_options       *opt;
        __be16                  sport;          /* inet_create() sets htons(num) when num != 0 */
        __u16                   id;
        __u8                    tos;
        __u8                    mc_ttl;         /* inet_create() sets 1 */
        __u8                    pmtudisc;       /* IP_PMTUDISC_DONT or IP_PMTUDISC_WANT, chosen in inet_create() */
        __u8                    recverr:1,
                                is_icsk:1,      /* set from the INET_PROTOSW_ICSK flag */
                                freebind:1,
                                hdrincl:1,      /* set for SOCK_RAW with IPPROTO_RAW */
                                mc_loop:1;      /* inet_create() sets 1 */
        int                     mc_index;
        __be32                  mc_addr;
        struct ip_mc_socklist   *mc_list;
        struct {
                unsigned int            flags;
                unsigned int            fragsize;
                struct ip_options       *opt;
                struct rtable           *rt;
                int                     length; /* Total length of all frames */
                __be32                  addr;
                struct flowi            fl;
        } cork;
};

 

3.11  sock_init_data

初始化struct sock* sk的一些值

/*
 * sock_init_data - fill a freshly allocated struct sock with generic
 * defaults and, when @sock is non-NULL, link the two objects together.
 *
 * @sock: owning struct socket, or NULL
 * @sk:   the struct sock to initialise
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        /* Start with empty receive, write and error queues. */
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
        skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        /* Buffer sizes default to the sysctl values. */
        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_socket           =       sock;

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a struct socket: copy its type, share its wait queue, point back at sk. */
        if (sock) {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);
        /* The lockdep class of the callback lock is selected by sk->sk_family. */
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks; inet_create() replaces sk_destruct afterwards. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = ktime_set(-1L, 0);

        /* The creator holds the first reference. */
        atomic_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}

 

 

3.12 sock_map_fd

将socket和file相关联, file是内核级别指针(由于Linux把所有的都看成是文件) 

/*
 * sock_map_fd - give @sock a file descriptor.
 *
 * Reserves a descriptor and an empty struct file with sock_alloc_fd(),
 * wires the file to the socket with sock_attach_fd(), and installs the
 * file in the descriptor table.  Returns the descriptor, or a negative
 * errno; if attaching fails, the file and the descriptor are given back.
 */
int sock_map_fd(struct socket *sock)
{
        struct file *newfile;
        int fd = sock_alloc_fd(&newfile);
        int err;

        if (unlikely(fd < 0))
                return fd;

        err = sock_attach_fd(sock, newfile);
        if (likely(err >= 0)) {
                fd_install(fd, newfile);
                return fd;
        }

        /* Attach failed: return the file and the descriptor. */
        put_filp(newfile);
        put_unused_fd(fd);
        return err;
}

 

3.12.1 sock_alloc_fd

获取描述符fd,和file*

static int sock_alloc_fd(struct file **filep)
{
        int fd;

        fd = get_unused_fd();
        if (likely(fd >= 0)) {
                struct file *file = get_empty_filp();

                *filep = file;
                if (unlikely(!file)) {
                        put_unused_fd(fd);
                        return -ENFILE;
                }
        } else
                *filep = NULL;
        return fd;
}

/*
 * get_unused_fd - reserve a descriptor number without extra flags.
 * Returns whatever get_unused_fd_flags(0) returns.
 */
int get_unused_fd(void)
{
        const int no_flags = 0;

        return get_unused_fd_flags(no_flags);
}


 

3.6 sock_attach_fd

很简单,就是sock->file = file, 同时 file->private_data = sock,内部指针互相指向

/*
 * sock_attach_fd - wire a struct file to a socket.
 *
 * Allocates a nameless dentry under the sockfs root, binds it to the
 * socket's inode, initialises @file on top of it, and makes sock->file and
 * file->private_data point at each other.  Returns 0, or -ENOMEM if the
 * dentry cannot be allocated.
 */
static int sock_attach_fd(struct socket *sock, struct file *file)
{
        struct dentry *dentry;
        struct qstr name = { .name = "" };      /* the dentry gets an empty name */

        dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
        if (unlikely(!dentry))
                return -ENOMEM;

        dentry->d_op = &sockfs_dentry_operations;
        /* Clear DCACHE_UNHASHED on the new dentry before instantiating it. */
        dentry->d_flags &= ~DCACHE_UNHASHED;
        d_instantiate(dentry, SOCK_INODE(sock));

        sock->file = file;               /* socket -> file */
        init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,    &socket_file_ops);
        SOCK_INODE(sock)->i_fop = &socket_file_ops;
        file->f_flags = O_RDWR;
        file->f_pos = 0;
        file->private_data = sock;    /* file -> socket */

        return 0;
}


 

3.7 sock_release

释放 

void sock_release(struct socket *sock)
{

        //释放调用的函数指针
        if (sock->ops) {
                struct module *owner = sock->ops->owner;

                sock->ops->release(sock);
                sock->ops = NULL;
                module_put(owner);
        }

        if (sock->fasync_list)
                printk(KERN_ERR "sock_release: fasync list not empty!/n");

        get_cpu_var(sockets_in_use)--;    //内部计数, 先忽略
        put_cpu_var(sockets_in_use);      //内部计数, 先忽略
        if (!sock->file) {
                iput(SOCK_INODE(sock));
                return;
        }
        sock->file = NULL;        //直接设为null
}

 

3.8 try_module_get

//增加模块中此cpu上的计数,

static inline int try_module_get(struct module *module)
{
        int ret = 1;

        if (module) {
                unsigned int cpu = get_cpu();
                if (likely(module_is_live(module)))
                        local_inc(&module->ref[cpu].count);
                else
                        ret = 0;
                put_cpu();
        }
        return ret;
}

 

3.9 module_put

//减少模块中在此cpu上的计数,减少后检查模块是否还存活

/*
 * module_put - drop a reference taken with try_module_get().
 *
 * Does nothing for a NULL module.  Otherwise decrements the current CPU's
 * reference counter and, if the module is no longer live, wakes the task
 * recorded in module->waiter.
 */
void module_put(struct module *module)
{
        unsigned int cpu;

        if (!module)
                return;

        cpu = get_cpu();
        local_dec(&module->ref[cpu].count);
        /* Maybe they're waiting for us to drop reference? */
        if (unlikely(!module_is_live(module)))
                wake_up_process(module->waiter);
        put_cpu();
}

 

 

 

3.10函数指针结构体的说明:

利用函数指针,使不同协议时,函数指针指向具体不同的函数,

/*
 * Socket-level operation table.  inet_create() stores the table of the
 * matched protocol in sock->ops; sock_release() calls ->release through it
 * and uses ->owner for module reference counting.
 */
struct proto_ops {
        int             family;         /* protocol family, e.g. PF_INET */
        struct module   *owner;         /* module providing these operations */
        int             (*release)   (struct socket *sock);
        int             (*bind)      (struct socket *sock,
                                      struct sockaddr *myaddr,
                                      int sockaddr_len);
        int             (*connect)   (struct socket *sock,
                                      struct sockaddr *vaddr,
                                      int sockaddr_len, int flags);
        int             (*socketpair)(struct socket *sock1,
                                      struct socket *sock2);
        int             (*accept)    (struct socket *sock,
                                      struct socket *newsock, int flags);
        int             (*getname)   (struct socket *sock,
                                      struct sockaddr *addr,
                                      int *sockaddr_len, int peer);
        unsigned int    (*poll)      (struct file *file, struct socket *sock,
                                      struct poll_table_struct *wait);
        int             (*ioctl)     (struct socket *sock, unsigned int cmd,
                                      unsigned long arg);
        int             (*compat_ioctl) (struct socket *sock, unsigned int cmd,
                                      unsigned long arg);
        int             (*listen)    (struct socket *sock, int len);
        int             (*shutdown)  (struct socket *sock, int flags);
        int             (*setsockopt)(struct socket *sock, int level,
                                      int optname, char __user *optval, int optlen);
        int             (*getsockopt)(struct socket *sock, int level,
                                      int optname, char __user *optval, int __user *optlen);
        int             (*compat_setsockopt)(struct socket *sock, int level,
                                      int optname, char __user *optval, int optlen);
        int             (*compat_getsockopt)(struct socket *sock, int level,
                                      int optname, char __user *optval, int __user *optlen);
        int             (*sendmsg)   (struct kiocb *iocb, struct socket *sock,
                                      struct msghdr *m, size_t total_len);
        int             (*recvmsg)   (struct kiocb *iocb, struct socket *sock,
                                      struct msghdr *m, size_t total_len,
                                      int flags);
        int             (*mmap)      (struct file *file, struct socket *sock,
                                      struct vm_area_struct * vma);
        ssize_t         (*sendpage)  (struct socket *sock, struct page *page,
                                      int offset, size_t size, int flags);
};

 

//TCP的操作地址 ,分别对应proto_ops中函数指针类型

/*
 * proto_ops table for PF_INET stream (TCP) sockets.  Operations with no
 * meaning for this socket kind point at sock_no_* handlers.
 */
const struct proto_ops inet_stream_ops = {
        .family            = PF_INET,
        .owner             = THIS_MODULE,
        .release           = inet_release,
        .bind              = inet_bind,
        .connect           = inet_stream_connect,
        .socketpair        = sock_no_socketpair,
        .accept            = inet_accept,
        .getname           = inet_getname,
        .poll              = tcp_poll,
        .ioctl             = inet_ioctl,
        .listen            = inet_listen,
        .shutdown          = inet_shutdown,
        .setsockopt        = sock_common_setsockopt,
        .getsockopt        = sock_common_getsockopt,
        .sendmsg           = tcp_sendmsg,
        .recvmsg           = sock_common_recvmsg,
        .mmap              = sock_no_mmap,
        .sendpage          = tcp_sendpage,
#ifdef CONFIG_COMPAT
        /* compat handlers exist only when CONFIG_COMPAT is enabled */
        .compat_setsockopt = compat_sock_common_setsockopt,
        .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};


//UDP的操作地址,分别对应proto_ops中函数指针类型

/*
 * proto_ops table for PF_INET datagram (UDP) sockets.  accept and listen
 * point at sock_no_* handlers, unlike the stream table.
 */
const struct proto_ops inet_dgram_ops = {
        .family            = PF_INET,
        .owner             = THIS_MODULE,
        .release           = inet_release,
        .bind              = inet_bind,
        .connect           = inet_dgram_connect,
        .socketpair        = sock_no_socketpair,
        .accept            = sock_no_accept,
        .getname           = inet_getname,
        .poll              = udp_poll,
        .ioctl             = inet_ioctl,
        .listen            = sock_no_listen,
        .shutdown          = inet_shutdown,
        .setsockopt        = sock_common_setsockopt,
        .getsockopt        = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = sock_common_recvmsg,
        .mmap              = sock_no_mmap,
        .sendpage          = inet_sendpage,
#ifdef CONFIG_COMPAT
        /* compat handlers exist only when CONFIG_COMPAT is enabled */
        .compat_setsockopt = compat_sock_common_setsockopt,
        .compat_getsockopt = compat_sock_common_getsockopt,
#endif
};

 

struct proto与proto_ops目的类似,也是一组函数指针

/*
 * struct proto - per-protocol operations and accounting hooks.
 *
 * A table of function pointers, almost all of which take a struct sock *,
 * followed by pointers to memory-accounting state, allocation parameters
 * and bookkeeping fields. Instances in this file (tcp_prot, udp_prot) fill
 * only the slots they need via designated initializers.
 */
struct proto {
        /* Connection lifecycle and socket-level operations. */
        void                    (*close)(struct sock *sk,
                                        long timeout);
        int                     (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                     (*disconnect)(struct sock *sk, int flags);

        /* Returns a struct sock *; an error code is passed back through *err. */
        struct sock *           (*accept) (struct sock *sk, int flags, int *err);

        int                     (*ioctl)(struct sock *sk, int cmd,
                                         unsigned long arg);
        int                     (*init)(struct sock *sk);
        int                     (*destroy)(struct sock *sk);
        void                    (*shutdown)(struct sock *sk, int how);

        /* Socket option handlers; optval (and option) are user-space pointers. */
        int                     (*setsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int optlen);
        int                     (*getsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int __user *option);    
        int                     (*compat_setsockopt)(struct sock *sk,
                                        int level,
                                        int optname, char __user *optval,
                                        int optlen);
        int                     (*compat_getsockopt)(struct sock *sk,
                                        int level,
                                        int optname, char __user *optval,
                                        int __user *option);

        /* Data transfer and address binding. */
        int                     (*sendmsg)(struct kiocb *iocb, struct sock *sk,
                                           struct msghdr *msg, size_t len);
        int                     (*recvmsg)(struct kiocb *iocb, struct sock *sk,
                                           struct msghdr *msg,
                                        size_t len, int noblock, int flags,
                                        int *addr_len);
        int                     (*sendpage)(struct sock *sk, struct page *page,
                                        int offset, size_t size, int flags);
        int                     (*bind)(struct sock *sk,
                                        struct sockaddr *uaddr, int addr_len);

        /* Handler for an skb delivered to sk (name suggests the backlog queue - confirm). */
        int                     (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);

        /* Keeping track of sk's, looking them up, and port selection methods. */
        void                    (*hash)(struct sock *sk);
        void                    (*unhash)(struct sock *sk);
        int                     (*get_port)(struct sock *sk, unsigned short snum);

        /* Memory pressure */
        void                    (*enter_memory_pressure)(void);
        atomic_t                *memory_allocated;      /* Current allocated memory. */
        atomic_t                *sockets_allocated;     /* Current number of sockets. */
        /*
         * Pressure flag: try to collapse.
         * Technical note: it is used by multiple contexts non atomically.
         * All the sk_stream_mem_schedule() is of this nature: accounting
         * is strict, actions are advisory and have some latency.
         */
        int                     *memory_pressure;
        /* Pointers to the protocol's sysctl limits (see tcp_prot/udp_prot initializers). */
        int                     *sysctl_mem;
        int                     *sysctl_wmem;
        int                     *sysctl_rmem;
        int                     max_header;

        /* Allocation parameters: cache pointer and object size (e.g. sizeof(struct tcp_sock)). */
        struct kmem_cache               *slab;
        unsigned int            obj_size;

        atomic_t                *orphan_count;

        struct request_sock_ops *rsk_prot;
        struct timewait_sock_ops *twsk_prot;

        struct module           *owner;

        char                    name[32];       /* Protocol name, e.g. "TCP" or "UDP". */

        struct list_head        node;
#ifdef SOCK_REFCNT_DEBUG
        atomic_t                socks;
#endif
        /*
         * One counter per CPU, each entry padded out to SMP_CACHE_BYTES
         * (presumably so entries for different CPUs do not share a cache line).
         */
        struct {
                int inuse;
                u8  __pad[SMP_CACHE_BYTES - sizeof(int)];
        } stats[NR_CPUS];
};

 

tcp_prot:TCP 协议对应的 struct proto 函数指针表

/*
 * tcp_prot - struct proto instance for TCP.
 *
 * Fills the struct proto slots with the tcp_* / inet_* handlers and points
 * the memory-accounting fields at the TCP-wide counters and sysctl limits.
 * No initializer for .sendmsg, .sendpage or .bind is visible in this table.
 */
struct proto tcp_prot = {
        .name                   = "TCP",
        .owner                  = THIS_MODULE,
        .close                  = tcp_close,
        .connect                = tcp_v4_connect,
        .disconnect             = tcp_disconnect,
        .accept                 = inet_csk_accept,
        .ioctl                  = tcp_ioctl,
        .init                   = tcp_v4_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown               = tcp_shutdown,
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
        .recvmsg                = tcp_recvmsg,
        .backlog_rcv            = tcp_v4_do_rcv,
        .hash                   = inet_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
        /* Memory accounting: addresses of TCP-wide counters and limits. */
        .enter_memory_pressure  = tcp_enter_memory_pressure,
        .sockets_allocated      = &tcp_sockets_allocated,
        .orphan_count           = &tcp_orphan_count,
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_mem             = sysctl_tcp_mem,
        .sysctl_wmem            = sysctl_tcp_wmem,
        .sysctl_rmem            = sysctl_tcp_rmem,
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp_sock),
        .twsk_prot              = &tcp_timewait_sock_ops,
        .rsk_prot               = &tcp_request_sock_ops,
        /*
         * NOTE(review): struct proto as listed in this file has no 'hashinfo'
         * member; this initializer presumably comes from a different kernel
         * revision than that struct listing - confirm.
         */
        .hashinfo               = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
        /* compat_* variants are only present when CONFIG_COMPAT is defined. */
        .compat_setsockopt      = compat_tcp_setsockopt,
        .compat_getsockopt      = compat_tcp_getsockopt,
#endif
        /* Macro defined elsewhere; presumably wires up the in-use counter - confirm. */
        REF_PROTO_INUSE(tcp)
};

udp_prot:UDP 协议对应的 struct proto 函数指针表
/*
 * udp_prot - struct proto instance for UDP.
 *
 * Fills the struct proto slots with the udp_* handlers. No initializer for
 * .accept, .init, .shutdown or .bind is visible in this table.
 */
struct proto udp_prot = {
        .name              = "UDP",
        .owner             = THIS_MODULE,
        .close             = udp_lib_close,
        .connect           = ip4_datagram_connect,
        .disconnect        = udp_disconnect,
        .ioctl             = udp_ioctl,
        .destroy           = udp_destroy_sock,
        .setsockopt        = udp_setsockopt,
        .getsockopt        = udp_getsockopt,
        .sendmsg           = udp_sendmsg,
        .recvmsg           = udp_recvmsg,
        .sendpage          = udp_sendpage,
        .backlog_rcv       = udp_queue_rcv_skb,
        .hash              = udp_lib_hash,
        .unhash            = udp_lib_unhash,
        .get_port          = udp_v4_get_port,
        .memory_allocated  = &udp_memory_allocated,
        .sysctl_mem        = sysctl_udp_mem,
        /*
         * wmem/rmem take the address of a single variable here (note the '&'),
         * whereas sysctl_mem is assigned without one.
         */
        .sysctl_wmem       = &sysctl_udp_wmem_min,
        .sysctl_rmem       = &sysctl_udp_rmem_min,
        .obj_size          = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
        /* compat_* variants are only present when CONFIG_COMPAT is defined. */
        .compat_setsockopt = compat_udp_setsockopt,
        .compat_getsockopt = compat_udp_getsockopt,
#endif
        /* Macro defined elsewhere; presumably wires up the in-use counter - confirm. */
        REF_PROTO_INUSE(udp)
};

 


3.11 struct sk_buff结构

/*
 * struct sk_buff - socket buffer: metadata describing one packet.
 *
 * Holds list linkage, the owning socket and device, per-layer header
 * positions, checksum/classification flags and the head/data/tail/end
 * markers of the underlying buffer. Member order matters: next/prev must
 * stay first and the trailing members must stay last (see inline comments).
 */
struct sk_buff {
        /* These two members must be first. */
        struct sk_buff          *next;   // Next buffer in list
        struct sk_buff          *prev;   // Previous buffer in list

        struct sock             *sk;     // Socket we are owned by
        ktime_t                  tstamp;   // Time we arrived
        struct net_device       *dev;   // Device we arrived on/are leaving by

        struct  dst_entry       *dst;   // Destination entry
        struct  sec_path        *sp;   // The security path, used for xfrm

        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char            cb[48];    // Control buffer. Free for use by every layer. Put private vars here

        unsigned int            len,    // Length of actual data
                                data_len;  // Data length
        __u16                   mac_len,  // Length of link layer header
                                hdr_len;  // Writable header length of cloned skb
        union {
                __wsum          csum;    // Checksum (must include start/offset pair)
                struct {
                        __u16   csum_start; // Offset from skb->head where checksumming should start
                        __u16   csum_offset; // Offset from csum_start where checksum should be stored
                };
        };
        __u32                   priority;   // Packet queueing priority
        __u8                    local_df:1,  // Allow local fragmentation
                                cloned:1,   // Head may be cloned (check refcnt to be sure)
                                ip_summed:2, // Driver fed us an IP checksum
                                nohdr:1,   // Payload reference only, must not modify header
                                nfctinfo:3;  // Relationship of this skb to the connection
        __u8                    pkt_type:3,  // Packet class
                                fclone:2,   // skbuff clone status
                                ipvs_property:1,  // skbuff is owned by ipvs
                                nf_trace:1;     // Netfilter packet trace flag
        __be16                  protocol;      // Packet protocol from driver

        void                    (*destructor)(struct sk_buff *skb);  // Destruct function
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct nf_conntrack     *nfct;      // Associated connection, if any
        struct sk_buff          *nfct_reasm;   // Netfilter conntrack re-assembly pointer
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
        struct nf_bridge_info   *nf_bridge;    // Saved data about a bridged frame - see br_netfilter.c
#endif

        int                     iif;       // ifindex of device we arrived on
        __u16                   queue_mapping;  // Queue mapping for multiqueue devices

#ifdef CONFIG_NET_SCHED
        __u16                   tc_index;       // Traffic control index
#ifdef CONFIG_NET_CLS_ACT
        __u16                   tc_verd;        // Traffic control verdict
#endif
#endif
        /* 2 byte hole */

#ifdef CONFIG_NET_DMA
        dma_cookie_t            dma_cookie; // A cookie to one of several possible DMA operations done by skb DMA functions
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32                   secmark;   // Security marking
#endif

        __u32                   mark;     // Generic packet mark

        sk_buff_data_t          transport_header;  // Transport layer header
        sk_buff_data_t          network_header;   // Network layer header
        sk_buff_data_t          mac_header;     // Link layer header
        /* These elements must be at the end, see alloc_skb() for details.  */
        sk_buff_data_t          tail;        // Tail pointer
        sk_buff_data_t          end;        // End pointer
        unsigned char           *head,       // Head of buffer
                                *data;       // Data head pointer
        unsigned int            truesize;      // Buffer size
        atomic_t                users;       // User count - see {datagram,tcp}.c
};
 

结构如下图

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值