Netlink套接字是用以实现用户进程与内核通信的一种特殊的进程间通信(IPC)机制,也是网络应用程序与内核通信最常用的接口。用户态应用使用标准的socket API就可以使用netlink提供的强大功能。
Netlink是一种特殊的socket,它是Linux所特有的,类似于BSD系统中的AF_ROUTE,但功能远比它强大。目前Linux内核中有很多子系统使用netlink与应用程序通信,包括:路由(NETLINK_ROUTE)、用户态socket协议(NETLINK_USERSOCK)、防火墙(NETLINK_FIREWALL)、Netfilter子系统(NETLINK_NETFILTER)、内核事件向用户态通知(NETLINK_KOBJECT_UEVENT)、通用netlink(NETLINK_GENERIC)等。
相对于ioctl、sysfs、proc的优势:
- 内核可以主动向用户空间发送异步消息,而不需要用户空间来触发。
- 用户空间与内核之间的通信不需要轮询:用户空间应用程序打开套接字后调用recvmsg(),如果没有来自内核的消息,就进入阻塞状态。
- 支持组播传输。
Netlink协议簇初始化
Netlink协议簇初始化代码位于net/netlink/af_netlink.c中。
/*
 * Hand netlink_proto_init() to the core_initcall() macro.
 * NOTE(review): presumably this makes it run at the "core" initcall level
 * during boot - confirm against include/linux/init.h.
 */
core_initcall(netlink_proto_init);
/*
 * netlink_proto_init - one-time set-up of the netlink protocol family.
 *
 * Registers netlink_proto, allocates nl_table with MAX_LINKS entries and
 * initialises the rhashtable of every entry, then registers the socket
 * family ops, the two per-net subsystems and rtnetlink.
 *
 * Return: 0 on success, or the non-zero error returned by proto_register()
 * or bpf_iter_register().  Failing to allocate or initialise nl_table does
 * not return at all: the function panics.
 */
static int __init netlink_proto_init(void)
{
	int i;
	/* Register the netlink protocol. */
	int err = proto_register(&netlink_proto, 0);

	if (err != 0)
		goto out;

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	err = bpf_iter_register();
	if (err)
		goto out;
#endif

	/* struct netlink_skb_parms must fit inside sk_buff's cb field. */
	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));

	/* Allocate the netlink table: MAX_LINKS zeroed entries. */
	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
	if (!nl_table)
		goto panic;

	/* Initialise the hash table of every entry. */
	for (i = 0; i < MAX_LINKS; i++) {
		if (rhashtable_init(&nl_table[i].hash,
				    &netlink_rhashtable_params) < 0) {
			/*
			 * Entries 0 .. i-1 were initialised successfully, so
			 * unwind all of them.  The condition must be ">= 0":
			 * with "> 0" the hash table of entry 0 would never be
			 * destroyed.
			 */
			while (--i >= 0)
				rhashtable_destroy(&nl_table[i].hash);
			kfree(nl_table);
			goto panic;
		}
	}

	/*
	 * NOTE(review): presumably sets up the NETLINK_USERSOCK table entry
	 * used for user-space to user-space messaging - helper not shown here.
	 */
	netlink_add_usersock_entry();

	/* Register the socket family; its .create is netlink_create(). */
	sock_register(&netlink_family_ops);
	/* Register the per-network-namespace subsystem operations. */
	register_pernet_subsys(&netlink_net_ops);
	register_pernet_subsys(&netlink_tap_net_ops);
	/* The netlink device handler may be needed early. */
	rtnetlink_init();
out:
	return err;
panic:
	panic("netlink_init: Cannot allocate nl_table\n");
}
资料直通车:最新Linux内核源码资料文档+视频资料
学习直通车:Linux内核源码/内存调优/文件系统/进程管理/设备驱动/网络协议栈
创建Netlink
/*
 * Socket-family descriptor for PF_NETLINK.  netlink_proto_init() passes it
 * to sock_register(); its .create hook is netlink_create().
 */
static const struct net_proto_family netlink_family_ops = {
.family = PF_NETLINK,
.create = netlink_create,
.owner = THIS_MODULE, /* for consistency 8) */
};
/*
 * netlink_create - .create hook of netlink_family_ops.
 * @net:      network namespace forwarded to __netlink_create()
 * @sock:     socket being created; its state is set to SS_UNCONNECTED
 * @protocol: netlink protocol number, must lie in [0, MAX_LINKS)
 * @kern:     forwarded unchanged to __netlink_create()
 *
 * Looks the protocol up in nl_table under the table lock, pins the owning
 * module, creates the sock via __netlink_create() and copies the module and
 * the bind/unbind callbacks into the new netlink_sock.
 *
 * Return: 0 on success; -ESOCKTNOSUPPORT for a socket type other than
 * SOCK_RAW/SOCK_DGRAM; -EPROTONOSUPPORT for an out-of-range or unregistered
 * protocol (or when the module reference cannot be taken); otherwise the
 * negative error from __netlink_create().
 */
static int netlink_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct module *owner = NULL;
	struct mutex *table_mutex;
	struct netlink_sock *nlk;
	int (*bind_fn)(struct net *net, int group);
	void (*unbind_fn)(struct net *net, int group);
	int rc = 0;

	sock->state = SS_UNCONNECTED;

	/* Only raw and datagram sockets are accepted. */
	switch (sock->type) {
	case SOCK_RAW:
	case SOCK_DGRAM:
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	/* Range-check the protocol, then clamp it against speculation. */
	if (protocol < 0 || protocol >= MAX_LINKS)
		return -EPROTONOSUPPORT;
	protocol = array_index_nospec(protocol, MAX_LINKS);

	netlink_lock_table();
#ifdef CONFIG_MODULES
	/*
	 * Protocol not registered yet: drop the table lock, ask for the
	 * module to be loaded, then take the lock again.
	 */
	if (!nl_table[protocol].registered) {
		netlink_unlock_table();
		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
		netlink_lock_table();
	}
#endif
	/* Snapshot module, cb_mutex, bind and unbind while the table is locked. */
	if (!nl_table[protocol].registered ||
	    !try_module_get(nl_table[protocol].module))
		rc = -EPROTONOSUPPORT;
	else
		owner = nl_table[protocol].module;
	table_mutex = nl_table[protocol].cb_mutex;
	bind_fn = nl_table[protocol].bind;
	unbind_fn = nl_table[protocol].unbind;
	netlink_unlock_table();

	if (rc < 0)
		return rc;

	rc = __netlink_create(net, sock, table_mutex, protocol, kern);
	if (rc < 0) {
		/* Creation failed: give back the module reference taken above. */
		module_put(owner);
		return rc;
	}

	/* Bump the in-use counter of the netlink proto. */
	sock_prot_inuse_add(net, &netlink_proto, 1);

	/* Finish initialising the netlink_sock. */
	nlk = nlk_sk(sock->sk);
	nlk->module = owner;
	nlk->netlink_bind = bind_fn;
	nlk->netlink_unbind = unbind_fn;
	return rc;
}
static int __netlink_create(struct net *net, struct socket *sock,
struct mutex *cb_mutex, int protocol,
int kern)
{
struct sock *sk;
struct netlink_sock *nlk;
// 注册netlink socket处理函数
sock->ops = &netlink_ops;
// 创建内核sock对象
sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
if (!sk)
return -ENOMEM;
// 使用sockt初始sk
sock_init_data(sock, sk);
// sk转netlink_sock,并初始化netlink_sock
nlk = nlk_sk(sk);
if (cb_mutex) {
nlk->cb_mutex = cb_mutex;
} else {
nlk->cb_mutex = &nlk->cb_def_mutex;
mutex_init(nlk->cb_mutex);
lockdep_set_class_and_name(nlk->cb_mutex,
nlk_cb_mutex_keys + protocol,
nlk_cb_mutex_key_strings[protocol]);
}
// 初始化netlink_sock的等待队列
init_waitqueue_head(&nlk->wait);
// sk协议和析构
sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
return 0;
}
/*
 * Socket operations table for PF_NETLINK sockets.  __netlink_create()
 * installs it on every new socket (sock->ops = &netlink_ops).
 * Entries named sock_no_* are not implemented by netlink itself;
 * NOTE(review): presumably they are the generic "not supported" stubs.
 */
static const struct proto_ops netlink_ops = {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
.bind = netlink_bind,
.connect = netlink_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = netlink_getname,
.poll = datagram_poll,
.ioctl = netlink_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = netlink_setsockopt,
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg, /* transmit path */
.recvmsg = netlink_recvmsg, /* receive path */
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
接收Netlink消息
从socket上接收数据包skb,并解析成netlink msg。
/*
 * As we do 4.4BSD message passing we use a 4.4BSD message passing
 * system, not 4.3. Thus msg_accrights(len) are now missing. They
 * belong in an obscure libc emulation or the bin.
 *
 * This is the message header type that netlink_sendmsg() and
 * netlink_recvmsg() receive as their 'msg' argument.
 */
struct msghdr {
void *msg_name; /* ptr to socket address structure */
int msg_namelen; /* size of socket address structure */
struct iov_iter msg_iter; /* data */
/*
 * Ancillary data. msg_control_user is the user buffer used for the
 * recv* side when msg_control_is_user is set, msg_control is the kernel
 * buffer used for all other cases.
 */
union {
void *msg_control;
void __user *msg_control_user;
};
bool msg_control_is_user : 1; /* selects which member of the union above is in use */
__kernel_size_t msg_controllen; /* ancillary data buffer length */
unsigned int msg_flags; /* flags on received message */
struct kiocb *msg_iocb; /* ptr to iocb for async requests */
};
/*
 * netlink_recvmsg - .recvmsg handler of netlink_ops.
 * @sock:  socket to receive from
 * @msg:   message header that receives the payload, the sender address
 *         (when msg->msg_name is set) and control messages
 * @len:   size of the caller's buffer
 * @flags: MSG_* flags; MSG_OOB is rejected
 *
 * Takes one skb from the socket, copies at most @len bytes of it into @msg
 * (setting MSG_TRUNC in msg->msg_flags when the skb is longer), fills in the
 * source address and control data, then frees the skb.
 *
 * Return: -EOPNOTSUPP for MSG_OOB; a non-zero err if one was recorded;
 * otherwise the number of bytes copied, or the full skb length when the
 * caller passed MSG_TRUNC in @flags.
 */
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags)
{
struct scm_cookie scm;
// Kernel sock object behind this socket
struct sock *sk = sock->sk;
// netlink_sock view of the same sock
struct netlink_sock *nlk = nlk_sk(sk);
int noblock = flags & MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
if (flags & MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
// Receive one skb from sk; on failure skb is NULL and err is passed by
// pointer (presumably filled in by skb_recv_datagram)
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (skb == NULL)
goto out;
data_skb = skb;
#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
if (unlikely(skb_shinfo(skb)->frag_list)) {
/*
* If this skb has a frag_list, then here that means that we
* will have to use the frag_list skb's data for compat tasks
* and the regular skb's data for normal (non-compat) tasks.
*
* If we need to send the compat skb, assign it to the
* 'data_skb' variable so that it will be used below for data
* copying. We keep 'skb' for everything else, including
* freeing both later.
*/
if (flags & MSG_CMSG_COMPAT)
data_skb = skb_shinfo(skb)->frag_list;
}
#endif
/* Record the max length of recvmsg() calls for future allocations */
nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
SKB_WITH_OVERHEAD(32768));
// Work out how many bytes to copy: the whole skb, or len if the buffer is
// smaller (in which case MSG_TRUNC is flagged on the message)
copied = data_skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
// Copy the payload from the skb into msg
err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
// Fill in the sender's netlink address if the caller asked for it
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
addr->nl_family = AF_NETLINK;
addr->nl_pad = 0;
addr->nl_pid = NETLINK_CB(skb).portid;
addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
msg->msg_namelen = sizeof(*addr);
}
// Optional control messages, enabled by per-socket flags
if (nlk->flags & NETLINK_F_RECV_PKTINFO)
netlink_cmsg_recv_pktinfo(msg, skb);
if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
netlink_cmsg_listen_all_nsid(sk, msg, skb);
// Initialise the scm_cookie and take the credentials stored in the skb
memset(&scm, 0, sizeof(scm));
scm.creds = *NETLINK_CREDS(skb);
// Caller passed MSG_TRUNC: report the full skb length, not the copied count
if (flags & MSG_TRUNC)
copied = data_skb->len;
// Release the skb; it must not be touched after this point
skb_free_datagram(sk, skb);
// With cb_running set and receive memory at most half of sk_rcvbuf, call
// netlink_dump(); a failure is stored (sign-flipped) in sk_err and reported.
// NOTE(review): cb_running presumably means a dump is in progress - confirm.
if (nlk->cb_running &&
atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
ret = netlink_dump(sk);
if (ret) {
sk->sk_err = -ret;
sk_error_report(sk);
}
}
// scm = socket-level control message processing: hand the scm_cookie
// contents to the receiver through msg
scm_recv(sock, msg, &scm, flags);
out:
// Reached on both the success path and the no-skb path
netlink_rcv_wake(sk);
// GNU "?:" shorthand: return err if it is non-zero, otherwise copied
return err ? : copied;
}
发送Netlink消息
将要发送的netlink msg构造成skb数据包,然后发送。
/*
 * netlink_sendmsg - .sendmsg handler of netlink_ops.
 * @sock: sending socket
 * @msg:  message header holding the payload, an optional destination
 *        address (msg_name/msg_namelen) and flags
 * @len:  payload length in bytes; must be non-zero
 *
 * Builds an skb from @msg and sends it: multicast to the destination group
 * when one is set, followed by a unicast to the destination port id.
 *
 * Return: the result of netlink_unicast() on the normal path, otherwise a
 * negative error: -EOPNOTSUPP (MSG_OOB), -ENODATA (zero length), the
 * scm_send() error, -EINVAL (bad address), -EPERM (send not allowed),
 * the netlink_autobind() error, -EMSGSIZE (too large for sk_sndbuf),
 * -ENOBUFS (no skb), -EFAULT (copy from msg failed) or the
 * security_netlink_send() error.
 */
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
u32 dst_portid;
u32 dst_group;
struct sk_buff *skb;
int err;
struct scm_cookie scm;
u32 netlink_skb_flags = 0;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
if (len == 0) {
pr_warn_once("Zero length message leads to an empty skb\n");
return -ENODATA;
}
// scm = socket-level control message processing: process the control data
// of msg and initialise the scm_cookie; every later exit goes through
// "out" so that scm_destroy() runs
err = scm_send(sock, msg, &scm, true);
if (err < 0)
return err;
// Destination: taken from the supplied address, or from the socket's
// stored dst_portid/dst_group when no address was given
if (msg->msg_namelen) {
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_nl))
goto out;
if (addr->nl_family != AF_NETLINK)
goto out;
dst_portid = addr->nl_pid;
// ffs() turns the group bitmask into the 1-based index of its lowest set bit
dst_group = ffs(addr->nl_groups);
err = -EPERM;
if ((dst_group || dst_portid) &&
!netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
goto out;
netlink_skb_flags |= NETLINK_SKB_DST;
} else {
dst_portid = nlk->dst_portid;
dst_group = nlk->dst_group;
}
/* Paired with WRITE_ONCE() in netlink_insert() */
if (!READ_ONCE(nlk->bound)) {
err = netlink_autobind(sock);
if (err)
goto out;
} else {
/* Ensure nlk is hashed and visible. */
smp_rmb();
}
// Reject payloads larger than the send buffer size minus 32 bytes
err = -EMSGSIZE;
if (len > sk->sk_sndbuf - 32)
goto out;
err = -ENOBUFS;
// Allocate the skb
skb = netlink_alloc_large_skb(len, dst_group);
if (skb == NULL)
goto out;
// Record sender port id, destination group, credentials and flags in the
// skb's netlink control block
NETLINK_CB(skb).portid = nlk->portid;
NETLINK_CB(skb).dst_group = dst_group;
NETLINK_CB(skb).creds = scm.creds;
NETLINK_CB(skb).flags = netlink_skb_flags;
err = -EFAULT;
// Copy the payload from msg into the skb
if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
kfree_skb(skb);
goto out;
}
// Security hook: this does not send anything, it decides whether the skb
// may be sent; on refusal the skb is freed and the hook's error returned
err = security_netlink_send(sk, skb);
if (err) {
kfree_skb(skb);
goto out;
}
// Multicast to dst_group when one is set. An extra reference is taken on
// the skb first. NOTE(review): presumably because netlink_broadcast() and
// netlink_unicast() each consume one reference - confirm.
if (dst_group) {
refcount_inc(&skb->users);
netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
}
// Unicast to dst_portid; its return value becomes the function's result
err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);
out:
scm_destroy(&scm);
return err;
}