linux socket 定时器,Linux内核Socket实现之------Socket创建(1)

前言:

本系列关于Linux内核Socket的文章均基于Linux-3.14.5版本的内核进行分析,文中的注释和问题说明也参考了网络上的经典分析文章,对他们的奉献表示感谢!

转载请标明:http://blog.chinaunix.net/uid-20788636-id-4408261.html

1. Socket内核系统调用函数SYSCALL_DEFINE3

Socket的创建是在用户空间调用socket系统函数完成的,创建一个Socket返回一个文件描述符fd,内核的系统调用接口为SYSCALL_DEFINE3(socket,

int, family, int, type, int, protocol),在net/socket.c文件中,下面我们看一下内核中的源码实现。

/*
 * socket(2) entry point.
 *
 * The upper bits of @type may carry SOCK_CLOEXEC / SOCK_NONBLOCK; they are
 * split off, validated and converted to the matching O_* file flags.  The
 * socket is then created with sock_create() and bound to a file descriptor
 * with sock_map_fd().
 *
 * Returns the value of sock_map_fd() on success, -EINVAL when unknown flag
 * bits are set in @type, or the negative error returned by sock_create() /
 * sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int fd_flags;
	int ret;

	/* Compile-time consistency checks on the SOCK_* constants. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Everything outside SOCK_TYPE_MASK is a creation flag. */
	fd_flags = type & ~SOCK_TYPE_MASK;
	if (fd_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Only needed when SOCK_NONBLOCK and O_NONBLOCK have different values. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
		fd_flags &= ~SOCK_NONBLOCK;
		fd_flags |= O_NONBLOCK;
	}

	/* Create the socket object itself. */
	ret = sock_create(family, type, protocol, &sock);
	if (ret < 0)
		return ret;

	/* Attach the socket to a descriptor; undo the creation on failure. */
	ret = sock_map_fd(sock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
	if (ret < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return ret;
}

1.1  sock_create函数

sock_create(family, type, protocol, &sock)是一个包装函数,它实际调用的是:

__sock_create(current->nsproxy->net_ns,

family, type, protocol, res, 0);

对于__sock_create函数的定义如下:

int __sock_create(struct net *net, int

family, int type, int protocol,

struct socket **res, int kern)

{

int

err;

struct

socket *sock;

const

struct net_proto_family *pf;

/*

*

Check protocol is in range检查协议的范围,现在内核定义的最大范围为41,这里的family指的是AF_INET6,AF_INET协议簇

#define NPROTO                  AF_MAX

#define AF_MAX           41     /* For now.. */

*/

if

(family < 0 || family >= NPROTO)

return

-EAFNOSUPPORT;

if

(type < 0 || type >= SOCK_MAX)//这里的type是socket的类型例如SOCK_STREAM

return

-EINVAL;

/*

Compatibility.

This uglymoron is moved from INET layer to

here to avoid

deadlock in module load.

*/

if

(family == PF_INET && type == SOCK_PACKET) {//如果是该类型的socket,对family进行重新的赋值

static

int warned;//这里自动初始化为0,

if

(!warned) {

warned

= 1;

printk(KERN_INFO

"%s uses obsolete (PF_INET,SOCK_PACKET)\n",

current->comm);

}

family

= PF_PACKET;//赋值为PF_PACKET

}

err

= security_socket_create(family, type, protocol, kern);

if

(err)

return

err;

/*

*     Allocate

the socket and allow the family to set things up. if

*     the

protocol is 0, the family is instructed to select an appropriate

*     default.这里调用sock_alloc分配sock,见下面的分析

*/

sock = sock_alloc();

if

(!sock) {

net_warn_ratelimited("socket:

no more sockets\n");

return

-ENFILE;         /* Not exactly a match,

but its the

closest posix thing */

}

sock->type

= type;

#ifdef CONFIG_MODULES

/*

Attempt to load a protocol module if the find failed.

*

* 12/09/1996 Marcin: But! this makes REALLY

only sense, if the user

* requested real, full-featured networking

support upon configuration.

* Otherwise module support will break!

*/

if

(rcu_access_pointer(net_families[family]) == NULL)

request_module("net-pf-%d",

family);

#endif

rcu_read_lock();

pf

= rcu_dereference(net_families[family]);

err

= -EAFNOSUPPORT;

if

(!pf)

goto

out_release;

/*

* We will call the ->create function, that

possibly is in a loadable

* module, so we have to bump that loadable

module refcnt first.

*/

if

(!try_module_get(pf->owner))

goto

out_release;

/*

Now protected by module ref count */

rcu_read_unlock();

/*static const struct

net_proto_family inet_family_ops = {

.family = PF_INET,

.create = inet_create,

.owner     =

THIS_MODULE,

};这里根据注册的family类型,调用不同的create函数,这里就是调用inet_ctreate*/

err = pf->create(net, sock, protocol, kern);

if

(err < 0)

goto

out_module_put;

/*

* Now to bump the refcnt of the [loadable]

module that owns this

* socket at sock_release time we decrement its

refcnt.

*/

if

(!try_module_get(sock->ops->owner))

goto

out_module_busy;

/*

* Now that we're done with the ->create

function, the [loadable]

* module can have its refcnt decremented

*/

module_put(pf->owner);

err

= security_socket_post_create(sock, family, type, protocol, kern);

if

(err)

goto

out_sock_release;

*res

= sock;

return

0;

out_module_busy:

err

= -EAFNOSUPPORT;

out_module_put:

sock->ops

= NULL;

module_put(pf->owner);

out_sock_release:

sock_release(sock);

return

err;

out_release:

rcu_read_unlock();

goto

out_sock_release;

}

1.1.1   sock_alloc函数

sock_alloc函数用于分配一个socket结构体,这里涉及inode结构,以及分配完成后返回的地址指针。

/*
 * sock_alloc - allocate a socket together with its inode.
 *
 * Obtains an inode from the sockfs super block (sock_mnt->mnt_sb), derives
 * the embedding socket with SOCKET_I() and initialises the inode fields.
 * As the article notes, the object actually allocated is a
 *
 *	struct socket_alloc {
 *		struct socket socket;
 *		struct inode vfs_inode;
 *	};
 *
 * and new_inode_pseudo() hands back the address of its vfs_inode member.
 *
 * Returns the new socket, or NULL when no inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
	struct socket *sock;
	struct inode *ino = new_inode_pseudo(sock_mnt->mnt_sb);

	if (!ino)
		return NULL;

	/* Map the inode back to the socket stored alongside it. */
	sock = SOCKET_I(ino);
	kmemcheck_annotate_bitfield(sock, type);

	/* Fill in the inode: number, mode, owner and operations. */
	ino->i_ino = get_next_ino();
	ino->i_mode = S_IFSOCK | S_IRWXUGO;
	ino->i_uid = current_fsuid();
	ino->i_gid = current_fsgid();
	ino->i_op = &sockfs_inode_ops;

	/* Account for the new socket in the per-CPU usage counter. */
	this_cpu_add(sockets_in_use, 1);

	return sock;
}

(1)对于sock_mnt->mnt_sb的赋值和分配过程如下:

在sock_init函数中对socket类型的文件系统进行注册

/*
 * File system type for "sockfs".  sock_init() registers it with
 * register_filesystem() and mounts it with kern_mount().
 */
static struct file_system_type sock_fs_type = {
	.name    = "sockfs",
	.mount   = sockfs_mount,
	.kill_sb = kill_anon_super,
};

/*
 * sock_init - boot-time initialisation of the socket layer.
 *
 * Sets up the network sysctl infrastructure, the skbuff cache and the
 * socket inode cache, then registers and mounts sockfs (the mount is kept
 * in sock_mnt).  Optional netfilter / PHY timestamping initialisation
 * follows when configured.
 *
 * Returns 0 on success or the negative error of the step that failed.
 */
static int __init sock_init(void)
{
	int err;

	/* Initialize the network sysctl infrastructure. */
	err = net_sysctl_init();
	if (err)
		return err;

	/* Initialize skbuff SLAB cache. */
	skb_init();

	/* Initialize the protocols module. */
	init_inodecache();

	/* Register the sockfs file system type ... */
	err = register_filesystem(&sock_fs_type);
	if (err)
		return err;

	/* ... and mount it; unregister again if the mount fails. */
	sock_mnt = kern_mount(&sock_fs_type);
	if (IS_ERR(sock_mnt)) {
		err = PTR_ERR(sock_mnt);
		unregister_filesystem(&sock_fs_type);
		return err;
	}

	/* The real protocol initialization is performed in later initcalls. */

#ifdef CONFIG_NETFILTER
	/*
	 * NOTE(review): on failure this returns without undoing the sockfs
	 * mount/registration done above.
	 */
	err = netfilter_init();
	if (err)
		return err;
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
	skb_timestamping_init();
#endif

	return err;
}

(2)new_inode_pseudo函数创建一个inode,并初始化inode的i_state变量和inode->i_sb_list链表,实际的分配函数为alloc_inode函数

struct inode *new_inode_pseudo(struct

super_block *sb)

{

struct inode *inode = alloc_inode(sb);

if

(inode) {

spin_lock(&inode->i_lock);

inode->i_state = 0;

spin_unlock(&inode->i_lock);

INIT_LIST_HEAD(&inode->i_sb_list);

}

return

inode;

}

alloc_inode分配一个inode节点,

/*
 * alloc_inode - allocate and initialise one inode for @sb.
 * @sb: super block the inode belongs to
 *
 * If the super block supplies its own ->alloc_inode() it is used,
 * otherwise the inode comes from the shared inode_cachep cache.  For
 * sockets the article quotes the super operations in socket.c as
 *
 *	static const struct super_operations sockfs_ops = {
 *		.alloc_inode   = sock_alloc_inode,
 *		.destroy_inode = sock_destroy_inode,
 *		.statfs        = simple_statfs,
 *	};
 *
 * so sock_alloc_inode() is the allocator in that case.
 *
 * Returns the inode, or NULL if allocation or initialisation failed.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
	struct inode *inode;

	inode = sb->s_op->alloc_inode ?
		sb->s_op->alloc_inode(sb) :
		kmem_cache_alloc(inode_cachep, GFP_KERNEL);
	if (!inode)
		return NULL;

	/* Common initialisation of the inode; the usual case succeeds. */
	if (likely(inode_init_always(sb, inode) == 0))
		return inode;

	/* Initialisation failed: give the inode back the way it was obtained. */
	if (inode->i_sb->s_op->destroy_inode)
		inode->i_sb->s_op->destroy_inode(inode);
	else
		kmem_cache_free(inode_cachep, inode);

	return NULL;
}

(3) 下面是sock_alloc_inode函数,在socket.c文件中

static struct inode

*sock_alloc_inode(struct super_block *sb)

{

struct socket_alloc *ei;

struct

socket_wq *wq;

/*下面的函数分配struct socket_alloc结构体,这里是怎么分配的呢?参考下面的说明*/

ei

= kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);

if

(!ei)

return

NULL;

wq

= kmalloc(sizeof(*wq), GFP_KERNEL);

if

(!wq) {

kmem_cache_free(sock_inode_cachep,

ei);

return

NULL;

}

init_waitqueue_head(&wq->wait);

wq->fasync_list

= NULL;

RCU_INIT_POINTER(ei->socket.wq,

wq);

ei->socket.state

= SS_UNCONNECTED;

ei->socket.flags

= 0;

ei->socket.ops

= NULL;

ei->socket.sk

= NULL;

ei->socket.file

= NULL;

return

&ei->vfs_inode; //这里返回的是struct inode vfs_inode;

}

备注说明:在分配函数sock_alloc_inode中调用了ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);这里分配的大小为socket_alloc的大小,下面分析一下是如何分配该大小的。

在init_inodecache函数中(定义在net/socket.c文件中),对其进行了高速缓存的创建操作,这里每个对象的大小为socket_alloc的大小,但sock_alloc_inode返回的是socket_alloc结构体中的struct inode vfs_inode成员的地址。该函数在sock_init函数中被调用。

/*
 * init_inodecache - create the cache that backs socket inodes.
 *
 * Each object in "sock_inode_cache" has the size of struct socket_alloc
 * and is constructed with init_once.  The cache is stored in
 * sock_inode_cachep.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int init_inodecache(void)
{
	const unsigned long cache_flags = (SLAB_HWCACHE_ALIGN |
					   SLAB_RECLAIM_ACCOUNT |
					   SLAB_MEM_SPREAD);

	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
					      sizeof(struct socket_alloc),
					      0,
					      cache_flags,
					      init_once);

	return sock_inode_cachep ? 0 : -ENOMEM;
}

1.1.2   inet_create函数

在__sock_create函数中调用pf->create,对于PF_INET协议簇,这里的函数指针为inet_create,定义在文件net/ipv4/af_inet.c中。

/*
 * inet_create - PF_INET ->create() hook: build the struct sock for @sock.
 * @net:      network namespace the socket is created in
 * @sock:     socket being set up; sock->type selects the inetsw[] list
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard and is
 *            replaced by the protocol of the matching entry
 * @kern:     non-zero skips the CAP_NET_RAW check for SOCK_RAW
 *
 * Returns 0 on success.  On failure returns -ESOCKTNOSUPPORT (no entries
 * for this socket type), -EPROTONOSUPPORT (no entry matches the protocol),
 * -EPERM (raw socket without CAP_NET_RAW), -ENOBUFS (sk_alloc() failed) or
 * the error from the protocol's ->init().
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	/*
	 * Walk inetsw[] for this socket type and find the inet_protosw
	 * (socket interface) matching the requested protocol.
	 */
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	/* Nothing matched: try loading a module, at most twice. */
	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	/* Copy what is needed from the entry before leaving the RCU section. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(answer_prot->slab == NULL);

	/*
	 * sk_alloc() nominally returns a struct sock.  According to the
	 * article, for TCP the object is really tcp_sock sized, which is why
	 * the inet_sk(sk) conversion below is valid; how that size is chosen
	 * is not shown here.
	 */
	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	/* Initialise the generic fields of sk -- see note (1), sock_init_data(). */
	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
	inet->rcv_tos	= 0;

	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {
		/* For TCP this is tcp_v4_init_sock() -- see note (2). */
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

(1)sock_init_data函数分析

void sock_init_data(struct socket *sock,

struct sock *sk)

{

skb_queue_head_init(&sk->sk_receive_queue);

skb_queue_head_init(&sk->sk_write_queue);

skb_queue_head_init(&sk->sk_error_queue);

#ifdef CONFIG_NET_DMA

skb_queue_head_init(&sk->sk_async_wait_queue);

#endif

sk->sk_send_head   =       NULL;

/*初始化sk定时器*/

init_timer(&sk->sk_timer);

sk->sk_allocation     =       GFP_KERNEL;

sk->sk_rcvbuf            =       sysctl_rmem_default;

sk->sk_sndbuf           =       sysctl_wmem_default;

sk->sk_state             =       TCP_CLOSE; //初始化sk_state=TCP_CLOSE状态,在后面是的系统调用中会进行判断

sk_set_socket(sk,

sock);// sk->sk_socket = sock;设置sk中指向socket的指针

sock_set_flag(sk,

SOCK_ZAPPED);//设置SOKCET的flag位,表明该sokcet已经绑定了一个名字,该标志位没有搞明白什么意思?

if

(sock) {

sk->sk_type      =       sock->type;

sk->sk_wq        =       sock->wq;

sock->sk   =       sk; // struct socket *sock的sk指向sock

}

else

sk->sk_wq        =       NULL;

spin_lock_init(&sk->sk_dst_lock);

rwlock_init(&sk->sk_callback_lock);

lockdep_set_class_and_name(&sk->sk_callback_lock,

af_callback_keys

+ sk->sk_family,

af_family_clock_key_strings[sk->sk_family]);

sk->sk_state_change       =       sock_def_wakeup;

sk->sk_data_ready  =       sock_def_readable;

sk->sk_write_space         =       sock_def_write_space;

sk->sk_error_report         =       sock_def_error_report;

sk->sk_destruct                 =       sock_def_destruct;

sk->sk_frag.page     =       NULL;

sk->sk_frag.offset   =       0;

sk->sk_peek_off                =       -1;

sk->sk_peer_pid

=       NULL;

sk->sk_peer_cred    =       NULL;

sk->sk_write_pending     =       0;

sk->sk_rcvlowat                =       1;

sk->sk_rcvtimeo                =       MAX_SCHEDULE_TIMEOUT;

sk->sk_sndtimeo               =       MAX_SCHEDULE_TIMEOUT;

sk->sk_stamp

= ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL

sk->sk_napi_id                   =       0;

sk->sk_ll_usec          =       sysctl_net_busy_read;

#endif

sk->sk_max_pacing_rate

= ~0U;

sk->sk_pacing_rate

= ~0U;

/*

* Before updating sk_refcnt, we must commit

prior changes to memory

* (Documentation/RCU/rculist_nulls.txt for

details)

*/

smp_wmb();

atomic_set(&sk->sk_refcnt,

1);//sk的引用计数加1

atomic_set(&sk->sk_drops,

0);

}

(2)static int

tcp_v4_init_sock(struct sock *sk)

{

struct inet_connection_sock *icsk =

inet_csk(sk);//基于上面的原因分析,其实这里可以进行强制的类型转换

tcp_init_sock(sk);//进行tcp相关变量的初始化工作

icsk->icsk_af_ops =

&ipv4_specific;

#ifdef

CONFIG_TCP_MD5SIG

tcp_sk(sk)->af_specific =

&tcp_sock_ipv4_specific;

#endif

return 0;

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值