Netlink是一种常用于内核进程与用户进程间双工通信的进程间通信机制。前段时间遇到一个netlink在双系统手机上通信失败的问题,故对netlink的源码做了梳理。
1.内核netlink创建
以MTK的电池模块与应用程序通信为例,在模块初始化阶段,调用netlink_kernel_create()创建对应的sock,netlink_kernel_create()调用__netlink_kernel_create()。
static int __init battery_init(void)
{
struct netlink_kernel_cfg cfg = {
.input = nl_data_handler,
};
int ret;
gm.daemo_nl_sk = netlink_kernel_create(&init_net, NETLINK_FGD, &cfg);
.....................................................................
__netlink_kernel_create()首先调用sock_create_lite()创建socket结构体,然后调用__netlink_create()创建netlink_sock(包含sock),并初始化成员。
struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module,
struct netlink_kernel_cfg *cfg)
{
struct socket *sock;
struct sock *sk;
struct netlink_sock *nlk;
struct listeners *listeners = NULL;
struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
unsigned int groups;
.................................................................................
if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
return NULL;
if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0)
goto out_sock_release_nosk;
sk = sock->sk;
if (!cfg || cfg->groups < 32)
groups = 32;
else
groups = cfg->groups;
listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
if (!listeners)
goto out_sock_release;
sk->sk_data_ready = netlink_data_ready;
if (cfg && cfg->input)
nlk_sk(sk)->netlink_rcv = cfg->input;
................................................................................
__netlink_kernel_create()创建并初始化netlink_sock(包含sock)后,再调用netlink_insert(),以端口号(kernel固定为0)和net构建键值,并将sock的指针存放在hash表中以便查找。之所以键值的组成除了端口号还有net,是出于资源隔离的需要,这样一个net namespace中的进程就查不到另一个net namespace中进程插入的sock。
/* Insert a netlink sock into the per-protocol hash table.
 * The lookup key is built from the sock's net namespace plus its portid,
 * so a sock bound in one net namespace is never visible from another
 * (the resource-isolation point made in the paragraph above).
 * Returns 0 on success, or an error if an entry with the same
 * (net, portid) key already exists. */
static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
struct netlink_compare_arg arg;
/* key = (net namespace, portid); kernel socks use portid 0 */
netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
/* atomically look up and insert: fails instead of overwriting a duplicate */
return rhashtable_lookup_insert_key(&table->hash, &arg,
&nlk_sk(sk)->node,
netlink_rhashtable_params);
}
netlink_kernel_create()执行成功后的结构体关系如下图所示,创建了一个struct socket,其成员sk指向struct netlink_sock,以端口号(0)和net为参数构建键值,并存放在hash表中。
2.用户程序注册并绑定
用户程序首先调用socket()系统调用创建socket。
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
int retval;
struct socket *sock;
int flags;
...........................................................
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
...........................................................
sock_create()调用__sock_create()。在__sock_create()中,首先调用sock_alloc()创建socket,然后再调用对应协议的create接口,netlink的create就是netlink_create()。
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
.................................................................
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
................................................................
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
.................................................................
rcu_read_unlock();
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
.................................................................
netlink_create()与__netlink_kernel_create()功能相似,就是调用__netlink_create()创建并初始化struct netlink_sock(包含sock)。
static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern)
{
struct module *module = NULL;
struct mutex *cb_mutex;
struct netlink_sock *nlk;
int (*bind)(struct net *net, int group);
void (*unbind)(struct net *net, int group);
int err = 0;
sock->state = SS_UNCONNECTED;
..................................................................................
if (nl_table[protocol].registered &&
try_module_get(nl_table[protocol].module))
module = nl_table[protocol].module;
else
err = -EPROTONOSUPPORT;
cb_mutex = nl_table[protocol].cb_mutex;
bind = nl_table[protocol].bind;
unbind = nl_table[protocol].unbind;
netlink_unlock_table();
err = __netlink_create(net, sock, cb_mutex, protocol, kern);
...................................................................................
用户进程创建socket后,再调用bind绑定端口,用户进程经常用自己的pid作为端口号。bind系统调用最终会调用到对应协议的bind接口,netlink对应的是netlink_bind()。netlink_bind()调用netlink_insert(),把nladdr->nl_pid作为端口号,并以端口号和net为参数构建键值,将struct netlink_sock存放在hash表中。
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
int addr_len)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
int err;
long unsigned int groups = nladdr->nl_groups;
bool bound;
...............................................................
bound = nlk->bound;
...............................................................
if (!bound) {
err = nladdr->nl_pid ?
netlink_insert(sk, nladdr->nl_pid) :
netlink_autobind(sock);
if (err) {
netlink_undo_bind(nlk->ngroups, groups, sk);
return err;
}
}
...............................................................
用户程序bind完成后的结构体关系如下所示,内核程序和用户程序都有对应的struct netlink_sock,且放在同一个hash表中。
3.数据通信过程
用户bind完成后,调用系统调用recvmsg准备接收内核进程发送的数据,对应netlink而言,recvmsg接口是netlink_recvmsg。
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,int flags)
{
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
int noblock = flags&MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
if (flags&MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (skb == NULL)
goto out;
....................................................................................
netlink_recvmsg()调用skb_recv_datagram()->__skb_recv_datagram()。__skb_recv_datagram()调用__skb_try_recv_datagram()查看sock->sk_receive_queue上是否有数据。如果有数据,skb不为空跳出循环;如果没有数据则休眠等待,休眠超时后再次查看。
/* Dequeue one datagram from sk's receive queue, optionally blocking.
 * Loop: try a non-blocking dequeue; if the queue is empty (*err == -EAGAIN)
 * and the timeout has not expired, sleep in __skb_wait_for_more_packets()
 * until data arrives or the timeout runs out, then retry.
 * Returns the skb on success, or NULL with *err set on error/timeout. */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
/* 0 if MSG_DONTWAIT was passed, otherwise the socket receive timeout */
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, &last);
if (skb)
return skb; /* got a datagram */
if (*err != -EAGAIN)
break; /* real error, not just an empty queue */
} while (timeo && !__skb_wait_for_more_packets(sk, err, &timeo, last));
return NULL;
}
3.1用户进程发送数据给内核进程
要进行netlink通信得知道对方的端口号,内核进程的端口号是0,用户进程可以直接向端口0发送数据。但是内核进程并不知道用户进程的端口号,所以通常情况下,用户进程第一个向内核进程发送的数据就是自己的端口号,内核进程获得了这个端口号后才能向用户进程发送数据。
用户进程发送数据调用的是sendmsg,netlink对应的接口是netlink_sendmsg()。netlink_sendmsg()中分配一个sk_buff再把数据拷贝到该buffer中,然后调用 netlink_unicast()发送给目的端口。
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
u32 dst_portid;
u32 dst_group;
struct sk_buff *skb;
int err;
struct scm_cookie scm;
u32 netlink_skb_flags = 0;
.........................................................................
//分配buffer
skb = netlink_alloc_large_skb(len, dst_group);
if (skb == NULL)
goto out;
.........................................................................
//拷贝数据
if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
kfree_skb(skb);
goto out;
}
........................................................................
//发送给指定端口
err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
........................................................................
netlink_unicast()先通过目的端口号在hash表中找到目标sock,如果目标sock是内核进程创建的则调用netlink_unicast_kernel(),netlink_unicast_kernel()中调用netlink_sock的netlink_rcv(),即内核注册创建netlink时参数netlink_kernel_cfg的input成员。这样就把数据发送到了内核。
int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock)
{
struct sock *sk;
int err;
long timeo;
skb = netlink_trim(skb, gfp_any());
timeo = sock_sndtimeo(ssk, nonblock);
retry:
sk = netlink_getsockbyportid(ssk, portid);
if (IS_ERR(sk)) {
kfree_skb(skb);
return PTR_ERR(sk);
}
if (netlink_is_kernel(sk)) //如果是内核sock
return netlink_unicast_kernel(sk, skb, ssk);
................................................................................
3.2内核发送信息给用户进程
内核得到用户进程的端口号后就可以发送数据给用户进程了,内核直接调用netlink_unicast()发送给目的端口。netlink_unicast()在hash表中通过端口号,找到用户进程bind的sock,然后将数据buffer链入到目标sock的内核链表sk_receive_queue。
int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock)
{
struct sock *sk;
int err;
long timeo;
skb = netlink_trim(skb, gfp_any());
timeo = sock_sndtimeo(ssk, nonblock);
retry:
sk = netlink_getsockbyportid(ssk, portid);
...............................................................................
//条件不成立
if (netlink_is_kernel(sk))
return netlink_unicast_kernel(sk, skb, ssk);
...............................................................................
return netlink_sendskb(sk, skb);
}
/* Deliver an skb to the destination sock: append it to the sock's
 * receive queue and wake up any reader sleeping in recvmsg.
 * Returns the delivered length. */
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
/* snapshot length before queueing: the receiver may consume/free skb */
int len = skb->len;
/* mirror the packet to any netlink tap (monitoring) devices */
netlink_deliver_tap(skb);
skb_queue_tail(&sk->sk_receive_queue, skb);
/* wakes the process blocked in __skb_wait_for_more_packets() */
sk->sk_data_ready(sk);
return len;
}
用户进程在前面提到的__skb_recv_datagram中检查到有数据退出循环回到netlink_recvmsg()中,再把数据复制到用户空间。
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags)
{
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
int noblock = flags&MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
int err, ret;
if (flags&MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (skb == NULL)
goto out;
data_skb = skb;
...........................................................................
err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
...........................................................................