3 socket()
3.1 sys_socket
包括三个主要函数,sock_create()负责创建socket分配内存并初始化, sock_map_fd()负责分配struct file*, sock_release()负责释放内存。
进入sys_socket后,调用sock_create,传入family和type,就是前面的a0,a1,调用socket_create创建struct socket结构
/*
 * sys_socket() - entry point of the socket() system call.
 * Creates a struct socket via sock_create(), then binds it to a file
 * descriptor via sock_map_fd(). If mapping fails, the socket is torn
 * down again with sock_release().
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
retval = sock_create(family, type, protocol, &sock); // allocate + initialize the socket, see 3.2
if (retval < 0)
goto out;
retval = sock_map_fd(sock); // attach the socket to a struct file * and an fd, see 3.5
if (retval < 0)
goto out_release;
out:
return retval; // on success this is the fd returned by sock_map_fd()
out_release:
sock_release(sock); // undo sock_create() on mapping failure, see 3.7
return retval;
}
3.2 sock_create
负责创建struct socket,并初始化,包括sock_alloc()负责内存分配和初始化,pf->create()负责根据协议族初始化。
直接调用__sock_create,添加了当前进程的struct net*参数,net_proto_family *pf->create()时需要此参数。其它参数直接传递。
/*
 * sock_create() - kernel wrapper around __sock_create().
 * Supplies the calling process's network namespace
 * (current->nsproxy->net_ns), which the family's pf->create() hook
 * needs; the trailing 0 (kern) marks this as a userspace request.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
3.2.1 struct nsproxy
其中nsproxy用来实现各个进程的命名空间,包括mnt, uts, pid, user, net等。count是引用计数:多个任务(例如通过clone共享命名空间创建的任务)可以共享同一个nsproxy,count记录共享者的数目。结构如下:
/*
 * struct nsproxy - per-task bundle of namespaces (uts, ipc, mount, pid,
 * user, net). 'count' is a reference count so tasks that share their
 * namespaces can point at the same nsproxy.
 */
struct nsproxy {
atomic_t count;
struct uts_namespace *uts_ns;
struct ipc_namespace *ipc_ns;
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns;
struct user_namespace *user_ns;
struct net *net_ns; // network namespace handed to __sock_create()
};
3.3 __sock_create
真正调用的是__sock_create函数,先通过sock_alloc分配内存空间,然后根据协议族调用其create方法。当是TCP/IP协议族时(由net_families数组决定),这个create会调用inet_create,完成struct sock的创建与初始化。
/*
 * __sock_create() - core of socket creation.
 * Validates family/type, allocates the struct socket (sock_alloc()),
 * looks up the protocol family in net_families[] under RCU, and calls
 * the family's create() hook (e.g. inet_create for PF_INET) to do the
 * protocol-specific part. On success *res holds the new socket.
 *
 * NOTE(review): the original transcription had "/n" in both printk
 * strings; fixed to "\n" to match the upstream kernel source.
 */
static int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
// range-check the protocol family and the socket type
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
// compatibility: (PF_INET, SOCK_PACKET) is obsolete -- silently switch
// the family to PF_PACKET and warn once
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern); // LSM hook
if (err)
return err;
sock = sock_alloc(); // allocate the socket and its backing inode, see 3.4
if (!sock) {
if (net_ratelimit()) // rate-limit the warning
printk(KERN_WARNING "socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the closest posix thing */
}
sock->type = type; // remember the socket type
#if defined(CONFIG_KMOD)
// if the family is not registered yet, try to load its module on demand
if (net_families[family] == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
// rcu_dereference safely fetches net_families[family] under RCU
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT; // preset the error code
if (!pf)
goto out_release;
// the family may be provided by a module: pin it while we call into it
if (!try_module_get(pf->owner))
goto out_release;
rcu_read_unlock();
err = pf->create(net, sock, protocol); // family-specific creation, see 3.8
if (err < 0)
goto out_module_put;
// pin the module providing sock->ops for the socket's lifetime
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
module_put(pf->owner); // drop the temporary reference taken above
err = security_socket_post_create(sock, family, type, protocol, kern); // LSM hook
if (err)
goto out_sock_release;
*res = sock;
return 0;
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner); // undo try_module_get(pf->owner), see 3.9
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}
3.3.1 net_ratelimit
其中net_ratelimit为模块中加载的函数,可以添加限制等扩展功能,默认是不做任何处理直接返回。
// token-bucket parameters for rate-limiting network log messages
int net_msg_cost __read_mostly = 5*HZ;
int net_msg_burst __read_mostly = 10;
/*
 * net_ratelimit() - returns nonzero when a network warning may be
 * printed; delegates to the generic printk rate limiter.
 */
int net_ratelimit(void)
{
return __printk_ratelimit(net_msg_cost, net_msg_burst);
}
EXPORT_SYMBOL(net_ratelimit);
当内核关闭了printk限速功能时,__printk_ratelimit是一个内联桩函数,不做任何处理直接返回0(即不允许输出):
static inline int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { return 0; }
3.3.2 rcu_dereference
其中rcu_dereference用于安全的获取p指针 ,其中的smp_read_barrier_depends是专门的读门槛,保证读的顺序,避免编译器优化
/*
 * rcu_dereference() - safely fetch an RCU-protected pointer.
 * ACCESS_ONCE forces a real load (no compiler caching);
 * smp_read_barrier_depends() orders dependent reads on architectures
 * (notably Alpha) that require it.
 *
 * NOTE(review): the transcription used '/' as the macro
 * line-continuation character; restored to '\'.
 */
#define rcu_dereference(p) ({ \
typeof(p) _________p1 = ACCESS_ONCE(p); \
smp_read_barrier_depends(); \
(_________p1); \
})
其中的volatile强制编译器每次都真正从内存读取x的值,而不能使用寄存器中缓存的旧值。注意volatile只约束编译器的优化,并不提供线程间的原子性或内存序保证。
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
3.3.3 try_module_get
而try_module_get是检查module是否存在,如果存在就将其计数加1,否则返回失败。
/*
 * try_module_get() - take a per-CPU reference on a module if it is
 * still live. Returns 1 on success (and for built-in code, where
 * module == NULL); returns 0 if the module is being unloaded.
 */
static inline int try_module_get(struct module *module)
{
int ret = 1;
if (module) {
unsigned int cpu = get_cpu(); // disable preemption, get this CPU's id
if (likely(module_is_live(module)))
local_inc(&module->ref[cpu].count); // bump this CPU's refcount
else
ret = 0; // module is on its way out
put_cpu();
}
return ret;
}
而module_is_live很简单,就是检查其状态字。
// A module is "live" unless it has entered the GOING (unloading) state.
static inline int module_is_live(struct module *mod)
{
return mod->state != MODULE_STATE_GOING;
}
3.4 sock_alloc
{
struct inode *inode;
struct socket *sock;
inode = new_inode(sock_mnt->mnt_sb); //初始化inode
if (!inode)
return NULL;
sock = SOCKET_I(inode); //从初始化好的inode中提取出sock指针,用于后面的设置
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use);
return sock;
}
3.5 new_inode
分配内存空间,并且对inode初始化,加入全局队列inode_in_use
/*
 * new_inode() - allocate and minimally initialize an inode for the
 * given superblock, assign it a sequence number, and put it on the
 * in-use lists under inode_lock.
 */
struct inode *new_inode(struct super_block *sb)
{
static unsigned int last_ino; // source of monotonically increasing i_ino values
struct inode * inode;
spin_lock_prefetch(&inode_lock); // prefetch the lock's cache line before taking it
inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode_lock);
inodes_stat.nr_inodes++;
list_add(&inode->i_list, &inode_in_use); // global in-use list, declared as LIST_HEAD(inode_in_use)
list_add(&inode->i_sb_list, &sb->s_inodes); // per-superblock inode list
inode->i_ino = ++last_ino;
inode->i_state = 0;
spin_unlock(&inode_lock);
}
return inode;
}
EXPORT_SYMBOL(new_inode);
3.6 alloc_inode
真正的分配空间
/*
 * alloc_inode() - allocate and default-initialize one inode for 'sb'.
 * Uses the superblock's alloc_inode() op when provided, otherwise the
 * generic inode_cachep slab. If the LSM allocation fails, the inode is
 * destroyed again and NULL is returned.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
static const struct address_space_operations empty_aops;
static struct inode_operations empty_iops;
static const struct file_operations empty_fops;
struct inode *inode;
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb); // filesystem-specific allocator hook
else
inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); // default slab allocation
if (inode) {
// default-initialize every field; callers/filesystems override later
struct address_space * const mapping = &inode->i_data;
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &empty_fops;
inode->i_nlink = 1;
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
#ifdef CONFIG_QUOTA
memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
inode->i_pipe = NULL;
inode->i_bdev = NULL;
inode->i_cdev = NULL;
inode->i_rdev = 0;
inode->dirtied_when = 0;
if (security_inode_alloc(inode)) {
// LSM refused: free the inode the same way it was allocated
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
else
kmem_cache_free(inode_cachep, (inode));
return NULL;
}
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
mutex_init(&inode->i_mutex);
lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
init_rwsem(&inode->i_alloc_sem);
lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
/*
 * If the block_device provides a backing_dev_info for client
 * inodes then use that. Otherwise the inode share the bdev's
 * backing_dev_info.
 */
if (sb->s_bdev) {
struct backing_dev_info *bdi;
bdi = sb->s_bdev->bd_inode_backing_dev_info;
if (!bdi)
bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
mapping->backing_dev_info = bdi;
}
inode->i_private = NULL;
inode->i_mapping = mapping;
}
return inode;
}
3.7 SOCKET_I
从inode中获取socket地址
// Map a sockfs inode back to the socket embedded beside it in
// struct socket_alloc (the two are allocated together, see 3.4).
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
宏container_of定义如下
/*
 * container_of(ptr, type, member): given 'ptr' pointing at the 'member'
 * field inside a 'type', return the address of the enclosing 'type'.
 * The typeof assignment also type-checks that ptr matches the member.
 *
 * NOTE(review): the transcription used '/' as the macro
 * line-continuation character; restored to '\'.
 */
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
// byte offset of MEMBER within TYPE
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
可以看出宏container_of(ptr, type, member)的作用是:已知ptr指向type结构体中的成员member,反推出整个type结构体的起始地址。例如:
struct type
{
//....其它成员变量
member_type member; // ptr == &某个type实例.member
};
此时container_of(ptr, struct type, member)返回该实例的地址。
// Allocation unit used by sockfs: the socket and its VFS inode are
// allocated together, which is what makes SOCKET_I()'s container_of
// conversion valid.
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
3.8 inet_create
每个协议族有自己的创建方法, net_proto_family->create方法
// Registration record for one protocol family: its PF_* id, the
// create() hook invoked by __sock_create(), and the owning module.
struct net_proto_family {
int family;
int (*create)(struct net *net, struct socket *sock, int protocol);
struct module *owner;
};
inet_family_ops是struct net_proto_family结构体的实例,被初始化为:
// PF_INET's registration: for IPv4 sockets __sock_create() ends up
// calling inet_create().
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
因此最后调用inet_create
创建struct sock,并初始化TCP/IP协议相关数据,成功后,返回应用层,创建socket结束。
/*
 * inet_create() - PF_INET's net_proto_family->create() hook.
 * Looks up the (type, protocol) pair in the inetsw[] tables, allocates
 * and initializes the struct sock / inet_sock, and wires it to the
 * caller's struct socket. Returning 0 completes socket() for IPv4.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
struct sock *sk;
struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
if (net != &init_net) // only the initial namespace is supported here; init_net is global: struct net init_net; EXPORT_SYMBOL(init_net);
return -EAFNOSUPPORT;
if (sock->type != SOCK_RAW &&sock->type != SOCK_DGRAM && !inet_ehash_secret)
build_ehash_secret(); // seed the hash secret once for stream sockets
sock->state = SS_UNCONNECTED; // freshly created sockets start unconnected
// find the inet_protosw entry matching this (type, protocol)
answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
// walk the per-socket-type list of registered inet protocols
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
// exact (non-wildcard) protocol match ends the search
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
// wildcard cases: caller passed IPPROTO_IP (take the entry's
// protocol), or the entry itself is the IPPROTO_IP wildcard
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
answer = NULL;
}
if (unlikely(answer == NULL)) {
if (try_loading_module < 2) {
rcu_read_unlock();
// nothing registered: request a protocol module, then retry the
// lookup (at most twice, with two alias spellings)
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
sock->ops = answer->ops; // e.g. inet_stream_ops for TCP, inet_dgram_ops for UDP
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
BUG_TRAP(answer_prot->slab != NULL);
// allocate and initialize struct sock, the central network object
err = -ENOBUFS;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); // see 3.9
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
// inet-layer view of the same object (struct sock is its first member)
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
if (SOCK_RAW == sock->type) {
inet->num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1; // raw IP sockets build their own IP header
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->id = 0;
sock_init_data(sock, sk); // generic init; links sock <-> sk, see 3.11
sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1; // unicast TTL: -1 means use the route's default
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
sk_refcnt_debug_inc(sk);
if (inet->num) {
/* It assumes that any protocol which allows
 * the user to assign a number at socket
 * creation time automatically
 * shares.
 */
inet->sport = htons(inet->num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk); // protocol-specific init, e.g. tcp_v4_init_sock
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
3.8.1
static struct list_head inetsw[SOCK_MAX];
3.9 sk_alloc
/*
 * sk_alloc() - allocate a struct sock (zero-filled via __GFP_ZERO) for
 * the given family and protocol, initialize its lock and take a
 * reference on the network namespace for the sock's lifetime.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
// remember which proto created this sock
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk); // initialize sk->sk_lock (socket_lock_t) for synchronization
sk->sk_net = get_net(net); // hold the namespace while the sock lives
}
return sk;
}
3.9.1 sk_prot_alloc
真正进行内存分配
/*
 * sk_prot_alloc() - raw memory for a struct sock: from the protocol's
 * dedicated slab cache when it has one, otherwise a plain kmalloc of
 * prot->obj_size. Also performs the LSM allocation and takes a
 * reference on the protocol's module; failures unwind via the goto
 * ladder, freeing with whichever allocator was used.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;
slab = prot->slab;
if (slab != NULL)
sk = kmem_cache_alloc(slab, priority);
else
sk = kmalloc(prot->obj_size, priority);
if (sk != NULL) {
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
goto out_free_sec;
}
return sk;
out_free_sec:
security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
else
kfree(sk);
return NULL;
}
3.9.2 get_net
增加struct net的计数
// Take a reference on a network namespace and return it.
static inline struct net *get_net(struct net *net)
{
atomic_inc(&net->count);
return net;
}
3.10
类型强制转换为inet_sock
// Downcast from struct sock to struct inet_sock; valid because
// struct sock is the first member of struct inet_sock.
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
return (struct inet_sock *)sk;
}
inet_sock是代表了网络层的socket
/*
 * struct inet_sock - the IP(network)-layer representation of a socket.
 * struct sock must come first so inet_sk() can simply cast.
 */
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
__be32 daddr;
__be32 rcv_saddr;
__be16 dport;
__u16 num; // local port in host byte order (sport = htons(num), see inet_create)
__be32 saddr;
__s16 uc_ttl; // unicast TTL; -1 = route default (set in inet_create)
__u16 cmsg_flags;
struct ip_options *opt;
__be16 sport; // local port in network byte order
__u16 id;
__u8 tos;
__u8 mc_ttl;
__u8 pmtudisc;
__u8 recverr:1,
is_icsk:1,
freebind:1,
hdrincl:1, // raw socket supplies its own IP header (set for IPPROTO_RAW)
mc_loop:1;
int mc_index;
__be32 mc_addr;
struct ip_mc_socklist *mc_list;
struct {
unsigned int flags;
unsigned int fragsize;
struct ip_options *opt;
struct rtable *rt;
int length; /* Total length of all frames */
__be32 addr;
struct flowi fl;
} cork;
};
3.11 sock_init_data
初始化struct sock* sk的一些值
/*
 * sock_init_data() - generic initialization of a freshly allocated
 * struct sock: packet queues, timer, buffer limits, default callback
 * functions, and the sock <-> sk back-pointers.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
skb_queue_head_init(&sk->sk_receive_queue);
skb_queue_head_init(&sk->sk_write_queue);
skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
sk->sk_send_head = NULL;
init_timer(&sk->sk_timer);
sk->sk_allocation = GFP_KERNEL;
sk->sk_rcvbuf = sysctl_rmem_default; // default receive buffer size
sk->sk_sndbuf = sysctl_wmem_default; // default send buffer size
sk->sk_state = TCP_CLOSE;
sk->sk_socket = sock; // sock-layer back-pointer
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
sk->sk_sleep = &sock->wait;
sock->sk = sk; // BSD socket -> network sock link
} else
sk->sk_sleep = NULL;
rwlock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
// default event callbacks; protocols may override these later
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
sk->sk_peercred.pid = 0;
sk->sk_peercred.uid = -1;
sk->sk_peercred.gid = -1;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; // blocking by default
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = ktime_set(-1L, 0);
atomic_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
3.12 sock_map_fd
将socket和file相关联, file是内核级别指针(由于Linux把所有的都看成是文件)
/*
 * sock_map_fd() - expose a socket to userspace as a file descriptor:
 * reserve an fd plus an empty struct file, attach the socket to the
 * file, then publish the pair with fd_install(). Returns the fd or a
 * negative errno; on attach failure the fd and file are released.
 */
int sock_map_fd(struct socket *sock)
{
struct file *newfile;
int fd = sock_alloc_fd(&newfile); // reserve an fd and an empty struct file
if (likely(fd >= 0)) { // likely(): branch-prediction hint, usually true
int err = sock_attach_fd(sock, newfile); // cross-link file and socket, see 3.6
if (unlikely(err < 0)) { // unlikely(): hint, usually false
put_filp(newfile);
put_unused_fd(fd);
return err;
}
fd_install(fd, newfile); // make the fd visible to the process
}
return fd;
}
3.12.1 sock_alloc_fd
获取描述符fd,和file*
/*
 * sock_alloc_fd() - reserve an unused fd and allocate an empty struct
 * file for it. If the file cannot be allocated, the fd is returned to
 * the pool and -ENFILE is reported.
 */
static int sock_alloc_fd(struct file **filep)
{
int fd;
fd = get_unused_fd();
if (likely(fd >= 0)) {
struct file *file = get_empty_filp();
*filep = file;
if (unlikely(!file)) {
put_unused_fd(fd); // give the fd back on failure
return -ENFILE;
}
} else
*filep = NULL;
return fd;
}
// Reserve the lowest free file descriptor of the current process.
int get_unused_fd(void)
{
return get_unused_fd_flags(0);
}
3.6 sock_attach_fd
很简单,就是sock->file = file, 同时 file->private_data = sock,内部指针互相指向
/*
 * sock_attach_fd() - cross-link a socket and a struct file: allocate an
 * unnamed sockfs dentry for the socket's inode, initialize the file
 * with socket_file_ops, and set sock->file / file->private_data so each
 * side can find the other.
 */
static int sock_attach_fd(struct socket *sock, struct file *file)
{
struct dentry *dentry;
struct qstr name = { .name = "" }; // sockets have no pathname
dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
if (unlikely(!dentry))
return -ENOMEM;
dentry->d_op = &sockfs_dentry_operations;
dentry->d_flags &= ~DCACHE_UNHASHED;
d_instantiate(dentry, SOCK_INODE(sock));
sock->file = file; // socket -> file
init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE, &socket_file_ops);
SOCK_INODE(sock)->i_fop = &socket_file_ops;
file->f_flags = O_RDWR;
file->f_pos = 0;
file->private_data = sock; // file -> socket
return 0;
}
3.7 sock_release
释放
/*
 * sock_release() - tear down a socket created by sock_create().
 * Calls the protocol's release op and drops the ops module reference.
 * If no struct file ever wrapped the socket, the backing inode is put
 * here (freeing the socket_alloc); otherwise the final iput happens via
 * the file's release path, so only the back-pointer is cleared.
 *
 * NOTE(review): the original transcription had "/n" in the printk
 * string; fixed to "\n" to match the upstream kernel source.
 */
void sock_release(struct socket *sock)
{
// release the protocol operations and their owning module
if (sock->ops) {
struct module *owner = sock->ops->owner;
sock->ops->release(sock);
sock->ops = NULL;
module_put(owner); // matches try_module_get(sock->ops->owner)
}
if (sock->fasync_list)
printk(KERN_ERR "sock_release: fasync list not empty!\n");
get_cpu_var(sockets_in_use)--; // per-CPU socket accounting
put_cpu_var(sockets_in_use);
if (!sock->file) {
iput(SOCK_INODE(sock)); // no file attached: drop the inode ourselves
return;
}
sock->file = NULL; // file path will do the final put
}
3.8 try_module_get
//增加模块中此cpu上的计数,
// Duplicate listing of try_module_get() (already shown in 3.3.3):
// take a per-CPU module reference if the module is still live;
// returns 0 when the module is being unloaded.
static inline int try_module_get(struct module *module)
{
int ret = 1;
if (module) {
unsigned int cpu = get_cpu(); // disable preemption, get this CPU's id
if (likely(module_is_live(module)))
local_inc(&module->ref[cpu].count);
else
ret = 0;
put_cpu();
}
return ret;
}
3.9 module_put
//减少模块中在此cpu上的计数,减少后检查模块是否还存活
/*
 * module_put() - drop a per-CPU module reference; if the module has
 * entered the GOING state, wake the thread waiting to unload it.
 */
void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
local_dec(&module->ref[cpu].count);
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
put_cpu();
}
}
3.10函数指针结构体的说明:
利用函数指针,使不同协议时,函数指针指向具体不同的函数,
/*
 * struct proto_ops - per-family socket-layer operation table. Each
 * entry backs one socket system call (bind, connect, accept, ...);
 * inet_create() installs the matching table into sock->ops so the same
 * syscall dispatches to different functions per protocol.
 */
struct proto_ops {
int family;
struct module *owner;
int (*release) (struct socket *sock);
int (*bind) (struct socket *sock,
struct sockaddr *myaddr,
int sockaddr_len);
int (*connect) (struct socket *sock,
struct sockaddr *vaddr,
int sockaddr_len, int flags);
int (*socketpair)(struct socket *sock1,
struct socket *sock2);
int (*accept) (struct socket *sock,
struct socket *newsock, int flags);
int (*getname) (struct socket *sock,
struct sockaddr *addr,
int *sockaddr_len, int peer);
unsigned int (*poll) (struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
int (*compat_ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
int (*listen) (struct socket *sock, int len);
int (*shutdown) (struct socket *sock, int flags);
int (*setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int optlen);
int (*getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
int (*compat_setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int optlen);
int (*compat_getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
int (*sendmsg) (struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len);
int (*recvmsg) (struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len,
int flags);
int (*mmap) (struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t (*sendpage) (struct socket *sock, struct page *page,
int offset, size_t size, int flags);
};
//TCP的操作地址 ,分别对应proto_ops中函数指针类型
// TCP's proto_ops table: socket-layer entry points for PF_INET stream
// sockets, matching the function-pointer slots of struct proto_ops.
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
UDP的操作地址,分别对应proto_ops中函数指针类型
// UDP's proto_ops table: socket-layer entry points for PF_INET datagram
// sockets (listen/accept are the sock_no_* stubs, as UDP has neither).
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
和proto_ops目的类似,也是函数指针
/*
 * struct proto - transport-protocol operation and accounting table,
 * one level below proto_ops; installed as sk->sk_prot by sk_alloc().
 */
struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*connect)(struct sock *sk,
struct sockaddr *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
struct sock * (*accept) (struct sock *sk, int flags, int *err);
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
int (*init)(struct sock *sk);
int (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int optlen);
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
int (*compat_setsockopt)(struct sock *sk,
int level,
int optname, char __user *optval,
int optlen);
int (*compat_getsockopt)(struct sock *sk,
int level,
int optname, char __user *optval,
int __user *option);
int (*sendmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len);
int (*recvmsg)(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg,
size_t len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
/* Keeping track of sk's, looking them up, and port selection methods. */
void (*hash)(struct sock *sk);
void (*unhash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
/* Memory pressure */
void (*enter_memory_pressure)(void);
atomic_t *memory_allocated; /* Current allocated memory. */
atomic_t *sockets_allocated; /* Current number of sockets. */
/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the sk_stream_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int *memory_pressure;
int *sysctl_mem;
int *sysctl_wmem;
int *sysctl_rmem;
int max_header;
struct kmem_cache *slab; // dedicated slab cache used by sk_prot_alloc()
unsigned int obj_size; // object size for the kmalloc fallback
atomic_t *orphan_count;
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
struct module *owner;
char name[32];
struct list_head node;
#ifdef SOCK_REFCNT_DEBUG
atomic_t socks;
#endif
struct {
int inuse;
u8 __pad[SMP_CACHE_BYTES - sizeof(int)];
} stats[NR_CPUS];
};
tcp_prot函数指针
// TCP's struct proto instance: transport-layer function pointers and
// memory-accounting knobs (init = tcp_v4_init_sock runs from inet_create).
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.hashinfo = &tcp_hashinfo, // NOTE(review): not in the struct proto listed above -- likely from a different kernel version; verify
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
REF_PROTO_INUSE(tcp)
};
udp_prot函数指针
// UDP's struct proto instance: transport-layer function pointers
// (no .init/.accept -- connectionless datagram protocol).
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_lib_close,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.backlog_rcv = udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
REF_PROTO_INUSE(udp)
};
3.11 struct sk_buff结构
/*
 * struct sk_buff - socket buffer: the kernel's representation of one
 * network packet as it moves through the stack.
 */
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next; //Next buffer in list
struct sk_buff *prev; //Previous buffer in list
struct sock *sk; //Socket we are owned by
ktime_t tstamp; //Time we arrived
struct net_device *dev; //Device we arrived on/are leaving by
struct dst_entry *dst; //destination entry
struct sec_path *sp; //the security path, used for xfrm
/*
 * This is the control buffer. It is free to use for every
 * layer. Please put your private variables there. If you
 * want to keep them across layers you have to do a skb_clone()
 * first. This is owned by whoever has the skb queued ATM.
 */
char cb[48]; //Control buffer. Free for use by every layer. Put private vars here
unsigned int len, //Length of actual data
data_len; //Data length
__u16 mac_len, //Length of link layer header
hdr_len; //writable header length of cloned skb
union {
__wsum csum; //Checksum (must include start/offset pair)
struct {
__u16 csum_start; //Offset from skb->head where checksumming should start
__u16 csum_offset; //Offset from csum_start where checksum should be stored
};
};
__u32 priority; //Packet queueing priority
__u8 local_df:1, //allow local fragmentation
cloned:1, //Head may be cloned (check refcnt to be sure)
ip_summed:2, //Driver fed us an IP checksum
nohdr:1, //Payload reference only, must not modify header
nfctinfo:3; //Relationship of this skb to the connection
__u8 pkt_type:3, //Packet class
fclone:2, //skbuff clone status
ipvs_property:1, //skbuff is owned by ipvs
nf_trace:1; //netfilter packet trace flag
__be16 protocol; //Packet protocol from driver
void (*destructor)(struct sk_buff *skb); //Destruct function
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct; //Associated connection, if any
struct sk_buff *nfct_reasm; //netfilter conntrack re-assembly pointer
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge; //Saved data about a bridged frame - see br_netfilter.c
#endif
int iif; //ifindex of device we arrived on
__u16 queue_mapping; //Queue mapping for multiqueue devices
#ifdef CONFIG_NET_SCHED
__u16 tc_index; // traffic control index
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; // traffic control verdict
#endif
#endif
/* 2 byte hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;//a cookie to one of several possible DMA operations done by skb DMA functions
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark; //security marking
#endif
__u32 mark; //Generic packet mark
sk_buff_data_t transport_header; //Transport layer header
sk_buff_data_t network_header; //Network layer header
sk_buff_data_t mac_header; //Link layer header
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail; //Tail pointer
sk_buff_data_t end; //End pointer
unsigned char *head, //Head of buffer
*data; //Data head pointer
unsigned int truesize; //Buffer size
atomic_t users; //User count - see {datagram,tcp}.c
};
结构如下图(原文此处附有sk_buff布局示意图,本文未包含)