学习linux网络编程时,会接触socket这个概念,它到底是什么?通过什么机制来使用协议栈?这是本文讨论的问题。
socket的实现包括用户空间和内核空间两个部分。用户空间部分是标准C库函数(实际上是系统调用的一个wrapper),内核空间部分是系统调用处理函数(sys_*)。根据不同的体系结构,具体的系统调用可能是socketcall或者直接sys_socket、sys_bind、sys_listen...(net/socket.c)。
socket应用程序,首先需要调用socket(2)创建socket(对应于kernel中的sys_socket)。socket(2)的函数原型是:
int socket(int family, int type, int protocol);
family、type、protocol用于指定socket类型。当系统调用成功时,返回一个socket对应的文件描述符;失败时,返回-1。这些在man 2 socket中都有说明。需要强调的是:
- 为了配合everything is a file的思想,用户空间通过VFS的文件描述符来索引之前创建的socket。socket layer会管理一个叫sockfs的pseudo filesystem,因此如果你使用标准IO接口来访问socket,相应的函数会被映射到对应的socket操作(recvmsg、sendmsg...)。
- family/type/protocol不是并列的关系。如果想创建一个ipv4的udp socket,sys_socket在创建socket时会先通过sock_create()创建一个与sockfs关联的struct socket实例;接着通过family(AF_INET)找到一个协议簇(struct net_proto_family inet_family_ops),然后调用对应的create函数(inet_create)创建相应的struct sock实例;每个协议簇还有一个用于存放具体协议的数组(ipv4下,数组是struct list_head inetsw[SOCK_MAX]),它由type(SOCK_DGRAM)索引,并通过protocol(0或IPPROTO_UDP)进行匹配,从而找到对应socket type和protocol的协议。
三个参数各自对应的数据结构、全局表和注册函数如下:
- family:数据结构struct net_proto_family,全局表net_families,注册函数sock_register(ipv4的create函数是inet_create)
- type:数据结构struct inet_protosw,全局表inetsw,注册函数inet_register_protosw
- protocol:数据结构struct proto,全局表proto_list,注册函数proto_register
struct socket与struct sock
当socket(2)成功返回后,在kernel中,就为对应的socket描述符创建了若干的数据结构。其中最重要的两个是struct socket和struct sock。简单地说,struct socket用于处理用户空间与socket layer之间的关系,struct sock用于处理socket layer与往下的协议栈之间的关系。每个socket描述符都会对应一组socket/sock。
/*
 * struct socket - the socket-layer object that faces user space.
 *
 * Per the surrounding article, this structure handles the relationship
 * between user space and the socket layer, and every socket descriptor
 * corresponds to one struct socket / struct sock pair.
 */
struct socket {
/* NOTE(review): presumably the connection state; socket_state is declared outside this excerpt - confirm */
socket_state state;
/* NOTE(review): presumably the type given to socket(2), e.g. SOCK_DGRAM - confirm */
short type;
unsigned long flags;
/*
* Please keep fasync_list & wait fields in the same cache line
*/
struct fasync_struct *fasync_list;
wait_queue_head_t wait;
/* NOTE(review): presumably the sockfs file that the VFS file descriptor resolves to - confirm */
struct file *file;
/* The struct sock paired with this socket (the side facing the protocol stack). */
struct sock *sk;
/* NOTE(review): presumably the operations table used to dispatch socket calls; struct proto_ops is declared outside this excerpt - confirm */
const struct proto_ops *ops;
};
在struct socket中:
- state
- type
- wait
- file
- sk
- ops
/*
 * struct sock - the socket-layer object that faces the protocol stack.
 *
 * Per the surrounding article, this structure handles the relationship
 * between the socket layer and the protocol stack below it, and every
 * socket descriptor corresponds to one struct socket / struct sock pair.
 */
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common;
/* Five bitfields packed into one unsigned int: 2 + 2 + 4 + 8 + 16 = 32 bits. */
unsigned int sk_shutdown : 2,
sk_no_check : 2,
sk_userlocks : 4,
sk_protocol : 8,
sk_type : 16;
/* NOTE(review): presumably the receive buffer size limit; sk_sndbuf below looks like the send-side counterpart - confirm */
int sk_rcvbuf;
socket_lock_t sk_lock;
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
*/
struct {
struct sk_buff *head;
struct sk_buff *tail;
int len;
int limit;
} sk_backlog;
wait_queue_head_t *sk_sleep;
struct dst_entry *sk_dst_cache;
rwlock_t sk_dst_lock;
/* NOTE(review): the three *_alloc counters below presumably track receive, write and other memory charged to this socket - confirm */
atomic_t sk_rmem_alloc;
atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
/* sk_buff queues; NOTE(review): presumably incoming packets and packets waiting to be sent, respectively - confirm */
struct sk_buff_head sk_receive_queue;
struct sk_buff_head sk_write_queue;
int sk_wmem_queued;
int sk_forward_alloc;
gfp_t sk_allocation;
int sk_route_caps;
int sk_gso_type;
unsigned int sk_gso_max_size;
int sk_rcvlowat;
unsigned long sk_flags;
unsigned long sk_lingertime;
struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
atomic_t sk_drops;
unsigned short sk_ack_backlog;
unsigned short sk_max_ack_backlog;
__u32 sk_priority;
struct ucred sk_peercred;
/* NOTE(review): presumably the receive and send timeouts - confirm units against the kernel source */
long sk_rcvtimeo;
long sk_sndtimeo;
struct sk_filter *sk_filter;
void *sk_protinfo;
struct timer_list sk_timer;
ktime_t sk_stamp;
/* Pointer back to a struct socket; NOTE(review): presumably the one paired with this sock - confirm */
struct socket *sk_socket;
void *sk_user_data;
struct page *sk_sndmsg_page;
struct sk_buff *sk_send_head;
__u32 sk_sndmsg_off;
int sk_write_pending;
__u32 sk_mark;
/* XXX 4 bytes hole on 64 bit */
/* Per-socket callback pointers; each takes the owning struct sock as its first argument. */
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk, int bytes);
void (*sk_write_space)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk);
};
在struct sock中:
__sk_common
- sk_prot
socket layer如何接收/发送报文(UDP)
TBD
源代码参考linux-2.6.34