1、socket(family,type,protocol)
当我们在开发网络应用程序时,使用该系统调用来创建一个套接字。该API所做的工作如下所示:
/*
 * struct socket - the socket object as quoted in this article.
 * The accompanying text states that creating a socket amounts to
 * initialising members such as ops, file and sk.
 */
struct socket {
socket_state state; // socket state
unsigned long flags; // flag bits, e.g. SOCK_ASYNC_NOSPACE
const struct proto_ops *ops; // protocol-specific socket operations
struct fasync_struct *fasync_list; // asynchronous wake-up list
struct file *file; // pointer to the associated file
struct sock *sk; // pointer to the sock structure of the layer below
wait_queue_head_t wait; // list of tasks waiting on this socket
short type; // socket type: __sock_create stores the socket() type argument here (the original note called it the packet type)
};
在创建socket套接字时,就是要完成ops、file和sk等这些成员的初始化。
1). 创建套接字:sock_create()
函数sock_create会调用__sock_create函数进行套接字的创建:
- /* __sock_create - create a socket for (family, type, protocol).
- * Validates family and type, allocates a struct socket, then hands it to
- * the ->create hook of the address family registered in net_families[]
- * (inet_create for the INET family, per the note further down).
- * Returns 0 and stores the new socket in *res on success, or a negative
- * errno value on failure.
- */ int __sock_create(struct net *net, int family, int type, int protocol,
- struct socket **res, int kern)
- {
- int err;
- struct socket *sock;
- const struct net_proto_family *pf;
- /*
- * Sanity checks
- */
- if (family < 0 || family >= NPROTO)
- return -EAFNOSUPPORT;
- if (type < 0 || type >= SOCK_MAX)
- return -EINVAL;
- /* Compatibility.
- This uglymoron is moved from INET layer to here to avoid
- deadlock in module load.
- */
- if (family == PF_INET && type == SOCK_PACKET) {
- static int warned;
- if (!warned) {
- warned = 1;
- printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
- current->comm);
- }
- family = PF_PACKET;
- }
- err = security_socket_create(family, type, protocol, kern);
- if (err)
- return err;
- sock = sock_alloc();//allocate an inode structure and obtain the corresponding socket structure
- if (!sock) {
- if (net_ratelimit())
- printk(KERN_WARNING "socket: no more sockets\n");
- return -ENFILE; /* Not exactly a match, but its the
- closest posix thing */
- }
- sock->type = type;
- rcu_read_lock();
- pf = rcu_dereference(net_families[family]);
- err = -EAFNOSUPPORT; /* error code used by both goto out_release paths below */
- if (!pf)
- goto out_release;
- /*
- * We will call the ->create function, that possibly is in a loadable
- * module, so we have to bump that loadable module refcnt first.
- */
- if (!try_module_get(pf->owner))//module check
- goto out_release;
- /* Now protected by module ref count */
- rcu_read_unlock();
- //for the INET protocol family this calls inet_create
- err = pf->create(net, sock, protocol, kern);
- if (err < 0)
- goto out_module_put;
- /*
- * Now to bump the refcnt of the [loadable] module that owns this
- * socket at sock_release time we decrement its refcnt.
- */
- if (!try_module_get(sock->ops->owner))
- goto out_module_busy;
- /*
- * Now that we're done with the ->create function, the [loadable]
- * module can have its refcnt decremented
- */
- module_put(pf->owner);
- err = security_socket_post_create(sock, family, type, protocol, kern);
- if (err)
- goto out_sock_release;
- *res = sock;
- return 0;
- out_module_busy:
- err = -EAFNOSUPPORT;
- out_module_put:
- sock->ops = NULL; /* NOTE(review): presumably keeps sock_release() from using ops of a failed create - confirm */
- module_put(pf->owner);
- out_sock_release:
- sock_release(sock);
- return err;
- out_release:
- rcu_read_unlock(); /* these paths jump here while still inside the RCU read section */
- goto out_sock_release;
- }
- /* Standard well-defined IP protocols. */
- enum {
- IPPROTO_IP = 0, /* Dummy protocol for TCP */ /* inet_create treats this value as the wildcard protocol */
- IPPROTO_ICMP = 1, /* Internet Control Message Protocol */
- IPPROTO_IGMP = 2, /* Internet Group Management Protocol */
- IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */
- IPPROTO_TCP = 6, /* Transmission Control Protocol */
- IPPROTO_EGP = 8, /* Exterior Gateway Protocol */
- IPPROTO_PUP = 12, /* PUP protocol */
- IPPROTO_UDP = 17, /* User Datagram Protocol */
- IPPROTO_IDP = 22, /* XNS IDP protocol */
- IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */
- IPPROTO_RSVP = 46, /* RSVP protocol */
- IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */
- IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ /* listed out of numeric order; harmless because every value is explicit */
- IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */
- IPPROTO_AH = 51, /* Authentication Header protocol */
- IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */
- IPPROTO_PIM = 103, /* Protocol Independent Multicast */
- IPPROTO_COMP = 108, /* Compression Header protocol */
- IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */
- IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */
- IPPROTO_RAW = 255, /* Raw IP packets */
- IPPROTO_MAX /* implicit value: one more than IPPROTO_RAW, i.e. 256 */
- };
函数inet_create完成了上述功能,并初始化了sock的属性值,将socket的sk属性指向sock结构
- /* inet_create - ->create hook of the INET family: build the struct sock
- * behind @sock.
- * Looks up the inetsw entry matching (sock->type, protocol), copies its
- * ops/prot/flags, allocates the sock with sk_alloc(), initialises the
- * inet-specific fields and finally runs the protocol's own init hook.
- * Returns 0 on success or a negative errno value.
- */ static int inet_create(struct net *net, struct socket *sock, int protocol,
- int kern)
- {
- struct sock *sk;
- struct inet_protosw *answer;
- struct inet_sock *inet;
- struct proto *answer_prot;
- unsigned char answer_flags;
- char answer_no_check;
- int try_loading_module = 0;
- int err;
- if (unlikely(!inet_ehash_secret))
- if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
- build_ehash_secret();
- sock->state = SS_UNCONNECTED;
- /* Look for the requested type/protocol pair. */
- lookup_protocol:
- err = -ESOCKTNOSUPPORT; /* stays in effect when the list for this type is empty */
- rcu_read_lock();
- //create the sock structure according to the transport-layer protocol type
- //walk the inetsw list
- list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
- err = 0;
- /* Check the non-wild match. */
- if (protocol == answer->protocol) {
- if (protocol != IPPROTO_IP)
- break;//found the matching inetsw[] entry
- } else {
- /* Check for the two wild cases. */
- if (IPPROTO_IP == protocol) {
- protocol = answer->protocol;
- break;
- }
- if (IPPROTO_IP == answer->protocol)
- break;
- }
- err = -EPROTONOSUPPORT;
- }
- //at this point answer points at the suitable inetsw entry; for TCP it holds the following
- /*
- * .type = SOCK_STREAM,
- * .protocol = IPPROTO_TCP,
- * .prot = &tcp_prot,
- * .ops = &inet_stream_ops,
- * .no_check = 0,
- * .flags = INET_PROTOSW_PERMANENT |
- * INET_PROTOSW_ICSK,
- */
- if (unlikely(err)) {
- if (try_loading_module < 2) { /* at most two module-load attempts before giving up */
- rcu_read_unlock();
- /*
- * Be more specific, e.g. net-pf-2-proto-132-type-1
- * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
- */
- if (++try_loading_module == 1)
- request_module("net-pf-%d-proto-%d-type-%d",
- PF_INET, protocol, sock->type);
- /*
- * Fall back to generic, e.g. net-pf-2-proto-132
- * (net-pf-PF_INET-proto-IPPROTO_SCTP)
- */
- else
- request_module("net-pf-%d-proto-%d",
- PF_INET, protocol);
- goto lookup_protocol;
- } else
- goto out_rcu_unlock;
- }
- err = -EPERM;
- if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
- goto out_rcu_unlock;
- err = -EAFNOSUPPORT;
- if (!inet_netns_ok(net, protocol))
- goto out_rcu_unlock;
- sock->ops = answer->ops;
- answer_prot = answer->prot; /* the fields needed later are copied out before rcu_read_unlock() */
- answer_no_check = answer->no_check;
- answer_flags = answer->flags;
- rcu_read_unlock();
- WARN_ON(answer_prot->slab == NULL);
- err = -ENOBUFS;
- //allocate the sock structure from the cache prepared by inet_init, then do some initialisation; analysed further below
- sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
- if (sk == NULL)
- goto out;
- err = 0;
- sk->sk_no_check = answer_no_check;
- if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
- inet = inet_sk(sk);//analysed further below: why is this cast allowed?
- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
- inet->nodefrag = 0;
- if (SOCK_RAW == sock->type) {
- inet->inet_num = protocol;
- if (IPPROTO_RAW == protocol)
- inet->hdrincl = 1;
- }
- if (ipv4_config.no_pmtu_disc)
- inet->pmtudisc = IP_PMTUDISC_DONT;
- else
- inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->inet_id = 0;
- //initialise sk and point the sk member of sock at it
- sock_init_data(sock, sk);
- //set further attributes of sk
- sk->sk_destruct = inet_sock_destruct;
- sk->sk_protocol = protocol;
- sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
- inet->uc_ttl = -1;
- inet->mc_loop = 1;
- inet->mc_ttl = 1;
- inet->mc_all = 1;
- inet->mc_index = 0;
- inet->mc_list = NULL;
- sk_refcnt_debug_inc(sk);
- if (inet->inet_num) {
- /* It assumes that any protocol which allows
- * the user to assign a number at socket
- * creation time automatically
- * shares.
- */
- inet->inet_sport = htons(inet->inet_num);
- /* Add to protocol hash chains. */
- sk->sk_prot->hash(sk);//original note: calls inet_hash. NOTE(review): inet_num is only set above for SOCK_RAW, so the raw protocol's hash hook is presumably what runs here - confirm
- }
- if (sk->sk_prot->init) {
- err = sk->sk_prot->init(sk);//for TCP this is tcp_v4_init_sock (tcp_prot.init); sk_alloc zeroed the object, so further initialisation happens here
- if (err)
- sk_common_release(sk);
- }
- out:
- return err;
- out_rcu_unlock:
- rcu_read_unlock();
- goto out;
- }
-
根据family参数值在全局数组struct net_proto_family net_families[]里找到我们所指定的地址簇。
其中inetsw_array[]是一个比较重要的数据结构,定义在af_inet.c文件中:
在初始化的时候我们会将上面数组中的元素按套接字类型插入inetsw链表数组中。其定义如下:
- static struct list_head inetsw[SOCK_MAX]; /* one list head per socket type; inet_create walks inetsw[sock->type] */
不同类型的地址簇都有一个struct net_proto_family{}类型的对象,例如我们常见的IPv4的inet_family_ops,IPv6的inet6_family_ops,AX.25协议的ax25_family_ops等。在内核初始化时,这些模块会在自己的初始化函数内部调用sock_register()接口将各自的地址簇对象注册到net_families[]数组里。
我们分析的焦点集中在IPv4协议簇,即inet_family_ops对象上。重点是inet_create函数,该函数的主要任务就是创建一个socket套接字,并对其中相关结构体成员进行必要的初始化。至于它创建套接字时的依据和原理等到我们讲协议栈时大家就明白了,这里主要是让大家对其执行流程有个感性的把握。
在sock_alloc()函数中,我们创建一个struct socket{}类型的对象,假如叫做A;随后__sock_create()将socket()系统调用的第二个参数type赋值给A->type。
在inet_create()函数中,我们根据type的值,在全局链表数组inetsw[]里找到对应的协议转换开关(struct inet_protosw)。inetsw[]数组是在inet_init()函数里被初始化的。根据type的值,就可以确定struct socket{}->ops到底是inet_stream_ops、inet_dgram_ops还是inet_sockraw_ops。
TCP协议在INET层的操作集inet_stream_ops(其后一并给出数据报套接字对应的inet_dgram_ops):
- /* inet_stream_ops - struct proto_ops table for PF_INET stream sockets.
- * The TCP inetsw entry quoted in inet_create references it
- * (.ops = &inet_stream_ops) and inet_create copies that pointer into
- * sock->ops.
- */ const struct proto_ops inet_stream_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- .poll = tcp_poll,
- .ioctl = inet_ioctl,
- .listen = inet_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
- .splice_read = tcp_splice_read,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
- .compat_ioctl = inet_compat_ioctl,
- #endif
- };
- /* inet_dgram_ops - struct proto_ops table for PF_INET datagram sockets.
- * It shares most handlers with inet_stream_ops but uses
- * inet_dgram_connect and udp_poll, installs the sock_no_* handlers for
- * accept and listen, and has no splice_read entry.
- */ const struct proto_ops inet_dgram_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_dgram_connect,
- .socketpair = sock_no_socketpair,
- .accept = sock_no_accept,
- .getname = inet_getname,
- .poll = udp_poll,
- .ioctl = inet_ioctl,
- .listen = sock_no_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
- .compat_ioctl = inet_compat_ioctl,
- #endif
- };
然后,对应地,就以tcp_prot、udp_prot或raw_prot为输入参数,
- /* tcp_prot - struct proto instance describing TCP.
- * The TCP inetsw entry quoted in inet_create references it
- * (.prot = &tcp_prot); inet_create passes that pointer to sk_alloc() and
- * later calls its init, hash and backlog_rcv members through sk->sk_prot.
- */ struct proto tcp_prot = {
- .name = "TCP",
- .owner = THIS_MODULE,
- .close = tcp_close,
- .connect = tcp_v4_connect,
- .disconnect = tcp_disconnect,
- .accept = inet_csk_accept,
- .ioctl = tcp_ioctl,
- .init = tcp_v4_init_sock,
- .destroy = tcp_v4_destroy_sock,
- .shutdown = tcp_shutdown,
- .setsockopt = tcp_setsockopt,
- .getsockopt = tcp_getsockopt,
- .recvmsg = tcp_recvmsg,
- .sendmsg = tcp_sendmsg,
- .sendpage = tcp_sendpage,
- .backlog_rcv = tcp_v4_do_rcv,
- .hash = inet_hash,
- .unhash = inet_unhash,
- .get_port = inet_csk_get_port,
- .enter_memory_pressure = tcp_enter_memory_pressure,
- .sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
- .memory_allocated = &tcp_memory_allocated,
- .memory_pressure = &tcp_memory_pressure,
- .sysctl_mem = sysctl_tcp_mem,
- .sysctl_wmem = sysctl_tcp_wmem,
- .sysctl_rmem = sysctl_tcp_rmem,
- .max_header = MAX_TCP_HEADER,
- .obj_size = sizeof(struct tcp_sock), /* allocation size: larger than struct sock, which is what makes the inet_sk() cast safe per the article */
- .slab_flags = SLAB_DESTROY_BY_RCU,
- .twsk_prot = &tcp_timewait_sock_ops,
- .rsk_prot = &tcp_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
- .no_autobind = true,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
- #endif
- };
- /* udp_prot - struct proto instance describing UDP.
- * Unlike tcp_prot it sets no .init, .accept or .shutdown member here, so
- * those pointers are left zero-initialised; inet_create only calls
- * ->init when it is non-NULL.
- */ struct proto udp_prot = {
- .name = "UDP",
- .owner = THIS_MODULE,
- .close = udp_lib_close,
- .connect = ip4_datagram_connect,
- .disconnect = udp_disconnect,
- .ioctl = udp_ioctl,
- .destroy = udp_destroy_sock,
- .setsockopt = udp_setsockopt,
- .getsockopt = udp_getsockopt,
- .sendmsg = udp_sendmsg,
- .recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
- .backlog_rcv = __udp_queue_rcv_skb,
- .hash = udp_lib_hash,
- .unhash = udp_lib_unhash,
- .rehash = udp_v4_rehash,
- .get_port = udp_v4_get_port,
- .memory_allocated = &udp_memory_allocated,
- .sysctl_mem = sysctl_udp_mem,
- .sysctl_wmem = &sysctl_udp_wmem_min,
- .sysctl_rmem = &sysctl_udp_rmem_min,
- .obj_size = sizeof(struct udp_sock), /* allocation size for UDP socks */
- .slab_flags = SLAB_DESTROY_BY_RCU,
- .h.udp_table = &udp_table,
- #ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udp_setsockopt,
- .compat_getsockopt = compat_udp_getsockopt,
- #endif
- .clear_sk = sk_prot_clear_portaddr_nulls,
- };
实例化一个struct sock{} 对象sk=sk_alloc()。
- /* struct sock - the object that struct socket's sk member points to.
- * It must start with struct sock_common (see the note just below), and the
- * sk_* macros give short names for the fields inside that member.
- * Objects are allocated by sk_alloc() with the owning protocol's obj_size.
- */ struct sock {
- /*
- * Now struct inet_timewait_sock also uses sock_common, so please just
- * don't add nothing before this first member (__sk_common) --acme
- */
- struct sock_common __sk_common;
- #define sk_node __sk_common.skc_node
- #define sk_nulls_node __sk_common.skc_nulls_node
- #define sk_refcnt __sk_common.skc_refcnt
- #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
- #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
- #define sk_dontcopy_end __sk_common.skc_dontcopy_end
- #define sk_hash __sk_common.skc_hash
- #define sk_family __sk_common.skc_family
- #define sk_state __sk_common.skc_state
- #define sk_reuse __sk_common.skc_reuse
- #define sk_bound_dev_if __sk_common.skc_bound_dev_if
- #define sk_bind_node __sk_common.skc_bind_node
- #define sk_prot __sk_common.skc_prot
- #define sk_net __sk_common.skc_net
- socket_lock_t sk_lock;
- struct sk_buff_head sk_receive_queue;
- /*
- * The backlog queue is special, it is always used with
- * the per-socket spinlock held and requires low latency
- * access. Therefore we special case it's implementation.
- * Note : rmem_alloc is in this structure to fill a hole
- * on 64bit arches, not because its logically part of
- * backlog.
- */
- struct {
- atomic_t rmem_alloc;
- int len;
- struct sk_buff *head;
- struct sk_buff *tail;
- } sk_backlog;
- #define sk_rmem_alloc sk_backlog.rmem_alloc
- int sk_forward_alloc;
- #ifdef CONFIG_RPS
- __u32 sk_rxhash;
- #endif
- atomic_t sk_drops;
- int sk_rcvbuf;
- struct sk_filter __rcu *sk_filter;
- struct socket_wq __rcu *sk_wq;
- #ifdef CONFIG_NET_DMA
- struct sk_buff_head sk_async_wait_queue;
- #endif
- #ifdef CONFIG_XFRM
- struct xfrm_policy *sk_policy[2];
- #endif
- unsigned long sk_flags;
- struct dst_entry *sk_dst_cache;
- spinlock_t sk_dst_lock;
- atomic_t sk_wmem_alloc; /* sk_alloc sets this to 1 */
- atomic_t sk_omem_alloc;
- int sk_sndbuf;
- struct sk_buff_head sk_write_queue;
- kmemcheck_bitfield_begin(flags);
- unsigned int sk_shutdown : 2,
- sk_no_check : 2,
- sk_userlocks : 4,
- sk_protocol : 8, /* inet_create stores the resolved protocol number here */
- sk_type : 16;
- kmemcheck_bitfield_end(flags);
- int sk_wmem_queued;
- gfp_t sk_allocation;
- int sk_route_caps;
- int sk_route_nocaps;
- int sk_gso_type;
- unsigned int sk_gso_max_size;
- int sk_rcvlowat;
- unsigned long sk_lingertime;
- struct sk_buff_head sk_error_queue;
- struct proto *sk_prot_creator; /* sk_alloc sets this to the same proto as sk_prot */
- rwlock_t sk_callback_lock;
- int sk_err,
- sk_err_soft;
- unsigned short sk_ack_backlog;
- unsigned short sk_max_ack_backlog;
- __u32 sk_priority;
- struct pid *sk_peer_pid;
- const struct cred *sk_peer_cred;
- long sk_rcvtimeo;
- long sk_sndtimeo;
- void *sk_protinfo;
- struct timer_list sk_timer;
- ktime_t sk_stamp;
- struct socket *sk_socket;
- void *sk_user_data;
- struct page *sk_sndmsg_page;
- struct sk_buff *sk_send_head;
- __u32 sk_sndmsg_off;
- int sk_write_pending;
- #ifdef CONFIG_SECURITY
- void *sk_security;
- #endif
- __u32 sk_mark;
- u32 sk_classid;
- void (*sk_state_change)(struct sock *sk);
- void (*sk_data_ready)(struct sock *sk, int bytes);
- void (*sk_write_space)(struct sock *sk);
- void (*sk_error_report)(struct sock *sk);
- int (*sk_backlog_rcv)(struct sock *sk,
- struct sk_buff *skb); /* inet_create copies sk_prot->backlog_rcv into this */
- void (*sk_destruct)(struct sock *sk); /* inet_create sets this to inet_sock_destruct */
- };
紧接着建立socket{}和sock{}的关联,最后将socket()系统调用的第三个参数protocol赋给sock{}对象中的属性sk_protocol。
2). 为套接字绑定文件句柄:sock_map_fd()
我们都知道网络套接字也是一种系统IO,所以不可避免的要与文件系统打交道。每个套接字都对应一个已打开的文件标识符,所以在套接字初始化完成后,就要将其和本地一个唯一的文件标识符关联起来,即建立socket{}和file{}之间的关联关系。
2、bind (sockfd, sockaddr, addrlen)
该系统调用在内核中的执行过程如下:
重点是socket->ops->bind()回调接口。我们现在已经知道了,针对IPv4而言,这里的ops无非就是inet_stream_ops、inet_dgram_ops或inet_sockraw_ops对象。碰巧的是,这三个对象中的bind函数指针均指向inet_bind()函数。只有在原始套接字的情况下,inet_bind()才会进一步去调用raw_prot对象的bind回调函数,即raw_bind()。
3、listen(sockfd, backlog)
这里我们可以看到面向无连接的套接字和原始套接字是不用listen的,只有流式套接字才有效。
4、connect(sockfd, sockaddr, addrlen)
从这幅图中我们确实看到,connect()系统调用不但可以用于面向连接的套接字,也可用于无连接及原始套接字。
5、accept(sockfd, sockaddr, addrlen)
同样地,我们看到只有面向连接的流式套接字调用accept()才有意义。最终调用的是tcp_prot对象的accept成员函数。
补充:
为什么内核中可以直接将sock结构体首地址强制转换成inet_sock的首地址?并且inet_sock的大小要大于sock,直接进行如下强制转换
- inet = inet_sk(sk); /* call site quoted from inet_create */
- /* inet_sk - reinterpret a struct sock pointer as a struct inet_sock pointer.
- * This is a plain pointer cast with no allocation and no checking. The
- * surrounding text explains it is safe because the object was allocated
- * with the protocol's obj_size rather than sizeof(struct sock).
- * NOTE(review): the cast also drops the const qualifier of the parameter.
- */ static inline struct inet_sock *inet_sk(const struct sock *sk)
- {
- return (struct inet_sock *)sk;
- }
不会发生内存非法访问吗?!那就是在分配的时候并不只是分配的struct sock结构体大小的存储空间!
可以细看sock结构体分配的代码:
- /* sk_alloc - allocate a sock for protocol @prot and do the basic setup.
- * Asks sk_prot_alloc() for zeroed memory (__GFP_ZERO is OR-ed into
- * @priority), then records the family, the protocol, the network
- * namespace and the initial sk_wmem_alloc value.
- * Returns the new sock, or the failed result of sk_prot_alloc() unchanged.
- */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
- struct proto *prot)
- {
- struct sock *sk;
- sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
- if (sk) {
- sk->sk_family = family;
- sk->sk_prot = sk->sk_prot_creator = prot;
- sock_lock_init(sk);
- sock_net_set(sk, get_net(net));
- atomic_set(&sk->sk_wmem_alloc, 1); /* starts at 1 rather than 0; the reason is not visible in this excerpt */
- sock_update_classid(sk);
- }
- return sk;
- }
- /* sk_prot_alloc - obtain raw memory for a sock of protocol @prot.
- * Uses the protocol's slab cache when one exists, otherwise kmalloc() of
- * prot->obj_size bytes. The article abridged this excerpt: the dotted
- * lines stand for code that is not shown.
- */ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
- int family)
- {
- struct sock *sk;
- struct kmem_cache *slab;
- slab = prot->slab;
- if (slab != NULL) {
- sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); /* __GFP_ZERO is masked off on the slab path; zeroing presumably happens in the elided lines - confirm */
- ..............................
- } else
- sk = kmalloc(prot->obj_size, priority); /* size comes from the protocol (e.g. sizeof(struct tcp_sock)), not sizeof(struct sock) */
- .....................
- return sk;
- ......................
- }
如果是TCP协议,tcp_prot中指明的对象大小为.obj_size = sizeof(struct tcp_sock)。
所以,程序中给struct sock指针分配的不是该结构体的实际大小,而是大于其实际大小,以便其扩展套接字的属性占用。
以图例说明tcp_sock是如何从sock强制转换来的: