linux内核 netlink,Linux内核中netlink协议族的实现(上)

Linux内核中netlink协议族的实现(下)

本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严

禁用于任何商业用途。

msn: yfydz_no1@hotmail.com

来源:http://yfydz.cublog.cn

5.3 连接

连接通常是针对客户端连接服务器

/*
 * netlink_connect - record a default destination for a netlink socket.
 *
 * "Connecting" a netlink socket exchanges no packets; it only stores the
 * peer pid/group in the local netlink_sock so later sends can omit an
 * explicit address.  Returns 0 on success or a negative errno.
 */
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
    int alen, int flags)
{
    int err = 0;
    struct sock *sk = sock->sk;
    struct netlink_sock *nlk = nlk_sk(sk);
    struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr;

    if (addr->sa_family == AF_UNSPEC) {
        /* AF_UNSPEC means "disconnect": clear the stored destination. */
        sk->sk_state = NETLINK_UNCONNECTED;
        nlk->dst_pid = 0;
        nlk->dst_group = 0;
        return 0;
    }
    /* Only AF_NETLINK destination addresses are accepted. */
    if (addr->sa_family != AF_NETLINK)
        return -EINVAL;

    /* Only superuser is allowed to send multicasts */
    if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND))
        return -EPERM;

    /* Socket not yet bound to a pid: auto-bind one now. */
    if (!nlk->pid)
        err = netlink_autobind(sock);

    if (err == 0) {
        /*
         * Already bound, or autobind succeeded: store the peer's
         * parameters and mark the socket connected.
         */
        sk->sk_state = NETLINK_CONNECTED;
        nlk->dst_pid = nladdr->nl_pid;
        /* ffs(): only the lowest set bit of nl_groups becomes dst_group. */
        nlk->dst_group = ffs(nladdr->nl_groups);
    }
    return err;
}

5.4 获取sock名称

/*
 * netlink_getname - fill in a sockaddr_nl for getsockname()/getpeername().
 *
 * @peer non-zero selects the peer's (dst_pid/dst_group) identity,
 * otherwise the socket's own pid/groups are reported.  Always returns 0.
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int
    peer)
{
    struct sock *sk = sock->sk;
    struct netlink_sock *nlk = nlk_sk(sk);
    struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr;

    /* Address family is always AF_NETLINK; pad must be zeroed. */
    nladdr->nl_family = AF_NETLINK;
    nladdr->nl_pad = 0;
    *addr_len = sizeof(*nladdr);

    if (peer) {
        /* Peer identity: stored destination pid and group. */
        nladdr->nl_pid = nlk->dst_pid;
        /* dst_group is a group *number*; convert back to a bitmask. */
        nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
    } else {
        /* Our own identity: local pid and first word of the group bitmap. */
        nladdr->nl_pid = nlk->pid;
        nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
    }
    return 0;
}

5.5 poll

poll是用poll(2)或select(2)系统调用选择套接口数据是否准备好时的处理函数,netlink用的是通用

的数据报的poll处理函数datagram_poll(), 说明略。

5.6 setsockopt

设置netlink sock的各种控制参数:

/*
 * netlink_setsockopt - set netlink-level socket options.
 *
 * Supports NETLINK_PKTINFO (toggle per-packet info ancillary data) and
 * NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP (join/leave a
 * multicast group).  Returns 0 or a negative errno.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
    char __user *optval, int optlen)
{
    struct sock *sk = sock->sk;
    struct netlink_sock *nlk = nlk_sk(sk);
    int val = 0, err;

    /* Only the SOL_NETLINK level is handled here. */
    if (level != SOL_NETLINK)
        return -ENOPROTOOPT;

    /* Copy the option value from user space (if one was supplied). */
    if (optlen >= sizeof(int) &&
        get_user(val, (int __user *)optval))
        return -EFAULT;

    switch (optname) {
    case NETLINK_PKTINFO:
        /* Non-zero sets the NETLINK_RECV_PKTINFO flag, zero clears it. */
        if (val)
            nlk->flags |= NETLINK_RECV_PKTINFO;
        else
            nlk->flags &= ~NETLINK_RECV_PKTINFO;
        err = 0;
        break;
    case NETLINK_ADD_MEMBERSHIP:
    case NETLINK_DROP_MEMBERSHIP: {
        /* Join or leave a multicast group; val is the 1-based group number. */
        unsigned int subscriptions;
        int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;

        /* Permission check: non-root receive must be explicitly allowed. */
        if (!netlink_capable(sock, NL_NONROOT_RECV))
            return -EPERM;

        /* Lazily allocate the group bitmap on first membership change. */
        if (nlk->groups == NULL) {
            err = netlink_alloc_groups(sk);
            if (err)
                return err;
        }

        /* Range check: group must be in [1, ngroups]. */
        if (!val || val - 1 >= nlk->ngroups)
            return -EINVAL;

        netlink_table_grab();

        /* Previous membership state of this group bit. */
        old = test_bit(val - 1, nlk->groups);
        /*
         * old=1,new=0 -> subscriptions-1
         * old=0,new=1 -> subscriptions+1
         */
        subscriptions = nlk->subscriptions - old + new;

        /* Set or clear the group bit accordingly. */
        if (new)
            __set_bit(val - 1, nlk->groups);
        else
            __clear_bit(val - 1, nlk->groups);

        /* Propagate the change to the table's subscription/listener state. */
        netlink_update_subscriptions(sk, subscriptions);
        netlink_update_listeners(sk);

        netlink_table_ungrab();
        err = 0;
        break;
    }
    default:
        err = -ENOPROTOOPT;
    }
    return err;
}

/*
 * netlink_alloc_groups - allocate the multicast-group bitmap for a socket.
 *
 * The group count is fixed per protocol at registration time (taken from
 * nl_table).  Returns 0, -ENOENT if the protocol is not registered, or
 * -ENOMEM on allocation failure.
 */
static int netlink_alloc_groups(struct sock *sk)
{
    struct netlink_sock *nlk = nlk_sk(sk);
    unsigned int groups;
    int err = 0;

    netlink_lock_table();
    /* Group count comes from the per-protocol table entry. */
    groups = nl_table[sk->sk_protocol].groups;
    if (!nl_table[sk->sk_protocol].registered)
        err = -ENOENT;
    netlink_unlock_table();

    if (err)
        return err;

    /* NLGRPSZ(groups): bitmap size in bytes; kzalloc zero-initializes. */
    nlk->groups = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
    if (nlk->groups == NULL)
        return -ENOMEM;
    nlk->ngroups = groups;
    return 0;
}

5.7 getsockopt

获取netlink sock的各种控制参数:

/*
 * netlink_getsockopt - query netlink-level socket options.
 *
 * Only NETLINK_PKTINFO is supported: returns 1 if the socket has the
 * NETLINK_RECV_PKTINFO flag set, 0 otherwise.  Returns 0 or a negative
 * errno.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
    char __user *optval, int __user *optlen)
{
    struct sock *sk = sock->sk;
    struct netlink_sock *nlk = nlk_sk(sk);
    int len, val, err;

    /* Only the SOL_NETLINK level is handled here. */
    if (level != SOL_NETLINK)
        return -ENOPROTOOPT;

    /* Fetch the caller-supplied buffer length. */
    if (get_user(len, optlen))
        return -EFAULT;
    if (len < 0)
        return -EINVAL;

    switch (optname) {
    case NETLINK_PKTINFO:
        /* The only readable option; value is a single int. */
        if (len < sizeof(int))
            return -EINVAL;
        len = sizeof(int);
        /* 1 if NETLINK_RECV_PKTINFO is set on this socket, else 0. */
        val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
        if (put_user(len, optlen) ||
            put_user(val, optval))
            return -EFAULT;
        err = 0;
        break;
    default:
        err = -ENOPROTOOPT;
    }
    return err;
}

5.8 发送消息

从用户层发送数据到内核, 内核的sock是接收方

/*
 * netlink_sendmsg - sendmsg() handler for netlink sockets.
 *
 * Copies the user message into a freshly allocated skb, stamps the
 * netlink control block (sender pid, destination pid/group, audit/
 * security ids, credentials), then delivers it: multicast first (if a
 * destination group is set), always followed by a unicast to dst_pid.
 * Returns bytes queued or a negative errno.
 */
static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
    struct msghdr *msg, size_t len)
{
    /* Socket I/O control block for this call. */
    struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
    /* socket -> sock */
    struct sock *sk = sock->sk;
    /* sock -> netlink sock */
    struct netlink_sock *nlk = nlk_sk(sk);
    struct sockaddr_nl *addr=msg->msg_name;
    u32 dst_pid;
    u32 dst_group;
    struct sk_buff *skb;
    int err;
    /* scm: socket-level control message state (credentials etc.). */
    struct scm_cookie scm;

    /* Out-of-band data is a TCP notion; netlink does not support it. */
    if (msg->msg_flags&MSG_OOB)
        return -EOPNOTSUPP;

    if (NULL == siocb->scm)
        siocb->scm = &scm;
    /* Parse SCM control messages (credential passing) from the msghdr. */
    err = scm_send(sock, msg, siocb->scm);
    if (err < 0)
        return err;

    /* Determine destination pid and group. */
    if (msg->msg_namelen) {
        /* Explicit address supplied with the message. */
        if (addr->nl_family != AF_NETLINK)
            return -EINVAL;
        dst_pid = addr->nl_pid;
        /* Only the lowest set group bit is used. */
        dst_group = ffs(addr->nl_groups);
        /* Multicast send requires privilege. */
        if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
            return -EPERM;
    } else {
        /* No address: fall back to the connect()-time destination. */
        dst_pid = nlk->dst_pid;
        dst_group = nlk->dst_group;
    }

    /* Socket still unbound: auto-bind a local pid first. */
    if (!nlk->pid) {
        err = netlink_autobind(sock);
        if (err)
            goto out;
    }

    err = -EMSGSIZE;
    /* Reject messages that cannot fit the send buffer (32-byte slack). */
    if (len > sk->sk_sndbuf - 32)
        goto out;

    err = -ENOBUFS;
    /* Allocate a new skb for the message payload. */
    skb = nlmsg_new(len, GFP_KERNEL);
    if (skb==NULL)
        goto out;

    /* Stamp the netlink control block carried with the skb. */
    NETLINK_CB(skb).pid = nlk->pid;
    NETLINK_CB(skb).dst_pid = dst_pid;
    NETLINK_CB(skb).dst_group = dst_group;
    NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
    selinux_get_task_sid(current, &(NETLINK_CB(skb).sid));
    memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));

    /* What can I do? Netlink is asynchronous, so that
       we will have to save current capabilities to
       check them, when this message will be delivered
       to corresponding kernel module. --ANK (980802)
     */

    err = -EFAULT;
    /* Copy the payload from the user iovec into the skb data area. */
    if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) {
        kfree_skb(skb);
        goto out;
    }

    /* @netlink_send:
     * Save security information for a netlink message so that permission
     * checking can be performed when the message is processed. The security
     * information can be saved using the eff_cap field of the
     * netlink_skb_parms structure. Also may be used to provide fine
     * grained control over message transmission.
     * @sk associated sock of task sending the message.,
     * @skb contains the sk_buff structure for the netlink message.
     * Return 0 if the information was successfully saved and message
     * is allowed to be transmitted.
     */
    err = security_netlink_send(sk, skb);
    if (err) {
        kfree_skb(skb);
        goto out;
    }

    /* Destination group set: broadcast first. */
    if (dst_group) {
        /* Hold an extra reference so the broadcast path's free is not final. */
        atomic_inc(&skb->users);
        netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
    }
    /* Unicast delivery to dst_pid (consumes the skb). */
    err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);

out:
    return err;
}

/*
 * netlink_broadcast - deliver an skb to every socket subscribed to @group.
 *
 * Walks the protocol's multicast list under the table lock and hands the
 * skb to each member via do_one_broadcast().  Returns 0 if at least one
 * socket got the message, -ENOBUFS if a clone failed, -ESRCH if nobody
 * was listening.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
    u32 group, gfp_t allocation)
{
    /* Per-broadcast bookkeeping shared by all member deliveries. */
    struct netlink_broadcast_data info;
    struct hlist_node *node;
    struct sock *sk;

    /* Trim excess tailroom from the skb before fan-out. */
    skb = netlink_trim(skb, allocation);

    /* Fill in the broadcast descriptor. */
    info.exclude_sk = ssk;            /* never deliver back to the sender */
    info.pid = pid;
    info.group = group;
    info.failure = 0;
    info.congested = 0;
    info.delivered = 0;
    info.allocation = allocation;
    info.skb = skb;
    info.skb2 = NULL;                 /* lazily cloned copy, see do_one_broadcast() */

    /* While we sleep in clone, do not allow to change socket list */
    netlink_lock_table();

    /* Walk the multicast member list, delivering to each socket. */
    sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
        do_one_broadcast(sk, &info);

    /* Drop our reference; the skb is only truly freed when users hits 0. */
    kfree_skb(skb);

    netlink_unlock_table();

    /* Free the leftover clone, if one was made but not consumed. */
    if (info.skb2)
        kfree_skb(info.skb2);

    if (info.delivered) {
        /* If receivers were congested and we may sleep, yield the CPU. */
        if (info.congested && (allocation & __GFP_WAIT))
            yield();
        return 0;
    }
    if (info.failure)
        return -ENOBUFS;
    return -ESRCH;
}

/*
 * do_one_broadcast - deliver a broadcast skb to one candidate socket.
 *
 * Skips the sender, non-members and the pid that already gets a unicast
 * copy.  Lazily clones the skb into p->skb2 and queues it; on clone
 * failure marks p->failure so all listeners see the overrun.  Always
 * returns 0 (errors are recorded in @p).
 */
static inline int do_one_broadcast(struct sock *sk,
    struct netlink_broadcast_data *p)
{
    struct netlink_sock *nlk = nlk_sk(sk);
    int val;

    /* Never deliver back to the originating socket. */
    if (p->exclude_sk == sk)
        goto out;

    /* Skip the unicast target's pid and sockets not in this group. */
    if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
        !test_bit(p->group - 1, nlk->groups))
        goto out;

    /* A previous clone failed: just report overrun to this socket too. */
    if (p->failure) {
        netlink_overrun(sk);
        goto out;
    }

    sock_hold(sk);
    if (p->skb2 == NULL) {
        if (skb_shared(p->skb)) {
            /* Shared skb: make a private clone for delivery. */
            p->skb2 = skb_clone(p->skb, p->allocation);
        } else {
            /* Not shared: just take a reference (cannot be NULL here). */
            p->skb2 = skb_get(p->skb);
            /*
             * skb ownership may have been set when
             * delivered to a previous socket.
             */
            skb_orphan(p->skb2);
        }
    }
    if (p->skb2 == NULL) {
        /* Still NULL: the clone above failed. */
        netlink_overrun(sk);
        /* Clone failed. Notify ALL listeners. */
        p->failure = 1;
    } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
        /* Receiver's buffer full: count an overrun for it. */
        netlink_overrun(sk);
    } else {
        /* Delivered; skb2 was consumed, remember congestion state. */
        p->congested |= val;
        p->delivered = 1;
        p->skb2 = NULL;
    }
    sock_put(sk);

out:
    return 0;
}

/*
 * netlink_broadcast_deliver - queue one broadcast skb on a receiver.
 *
 * Returns -1 if the receiver's buffer is full (caller records overrun),
 * otherwise 0/1 where 1 means the receive buffer is now over its limit
 * (congestion hint for the broadcast loop).
 */
static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
    struct netlink_sock *nlk = nlk_sk(sk);

    /* Receiver must have receive-buffer room and not be marked congested. */
    if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
        !test_bit(0, &nlk->state)) {
        skb_set_owner_r(skb, sk);
        /*
         * Netlink is purely local: the sender can reach the receiver's
         * queue directly, so the skb goes straight onto the receive queue.
         */
        skb_queue_tail(&sk->sk_receive_queue, skb);
        /*
         * Wake the receiver via sk_data_ready; for kernel-side sockets
         * this enters the protocol's netlink callback.
         */
        sk->sk_data_ready(sk, skb->len);
        /* Non-zero (congested) if the buffer is now over its limit. */
        return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf;
    }
    return -1;
}

/*
 * netlink_unicast - deliver an skb to the single socket bound to @pid.
 *
 * Looks the destination up by pid, reserves room in its receive queue
 * (possibly blocking up to the socket send timeout), then queues the
 * skb.  Returns bytes delivered or a negative errno; the skb is always
 * consumed.
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
{
    struct sock *sk;
    int err;
    long timeo;

    /* Trim excess tailroom from the skb. */
    skb = netlink_trim(skb, gfp_any());

    /* Blocking timeout (0 when nonblock is set). */
    timeo = sock_sndtimeo(ssk, nonblock);

retry:
    /* Resolve the destination socket by pid (ssk is the sender). */
    sk = netlink_getsockbypid(ssk, pid);
    if (IS_ERR(sk)) {
        kfree_skb(skb);
        return PTR_ERR(sk);
    }
    /* Charge the skb to the destination; may sleep waiting for room. */
    err = netlink_attachskb(sk, skb, nonblock, timeo, ssk);
    if (err == 1)
        /* Woken after waiting; the sock reference was dropped - re-lookup. */
        goto retry;
    if (err)
        return err;

    /* Queue the skb and wake the receiver. */
    return netlink_sendskb(sk, skb, ssk->sk_protocol);
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not send to the destination, just all
 * all error checks are performed and memory in the queue is reserved.
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
/* Note: exported kernel-global function, not static. */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
    long timeo, struct sock *ssk)
{
    struct netlink_sock *nlk;

    nlk = nlk_sk(sk);

    /*
     * Destination receive buffer full or socket marked congested:
     * either fail immediately (no timeout) or sleep until there is room.
     */
    if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
        test_bit(0, &nlk->state)) {
        /* Wait-queue entry for the current task. */
        DECLARE_WAITQUEUE(wait, current);

        if (!timeo) {
            /* Non-blocking: count an overrun for user-space senders. */
            if (!ssk || nlk_sk(ssk)->pid == 0)
                netlink_overrun(sk);
            sock_put(sk);
            kfree_skb(skb);
            return -EAGAIN;
        }

        /* Sleep interruptibly on the destination's wait queue. */
        __set_current_state(TASK_INTERRUPTIBLE);
        add_wait_queue(&nlk->wait, &wait);

        /* Re-check the condition before actually sleeping (avoids races). */
        if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
            test_bit(0, &nlk->state)) &&
            !sock_flag(sk, SOCK_DEAD))
            timeo = schedule_timeout(timeo);

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&nlk->wait, &wait);
        sock_put(sk);

        if (signal_pending(current)) {
            /* Woken by a signal rather than by buffer space: give up. */
            kfree_skb(skb);
            return sock_intr_errno(timeo);
        }
        /* Return 1: caller must re-lookup the socket and retry. */
        return 1;
    }
    /* Room available: charge the skb to this socket's receive memory. */
    skb_set_owner_r(skb, sk);
    return 0;
}

/*
 * netlink_sendskb - queue an attached skb and notify the receiver.
 *
 * Final step of unicast delivery: appends the skb to the destination's
 * receive queue, fires sk_data_ready and drops the socket reference.
 * Returns the number of bytes queued.
 * Note: exported kernel-global function, not static.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol)
{
    int len = skb->len;

    /* Append to the destination's receive queue. */
    skb_queue_tail(&sk->sk_receive_queue, skb);
    /* Wake the receiver (kernel callback or sleeping user process). */
    sk->sk_data_ready(sk, len);
    sock_put(sk);
    return len;
}

5.9 接收消息

数据是内核传向用户空间的

/*
 * netlink_recvmsg - recvmsg() handler for netlink sockets.
 *
 * Dequeues one datagram, copies it (possibly truncated) into the user
 * iovec, fills in the source address and optional PKTINFO ancillary
 * data, and continues a pending dump when the receive queue has drained
 * enough.  Returns bytes copied or a negative errno.
 */
static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
    struct msghdr *msg, size_t len,
    int flags)
{
    /* Socket I/O control block for this call. */
    struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
    /* Socket-level control message state. */
    struct scm_cookie scm;
    /* socket -> sock */
    struct sock *sk = sock->sk;
    /* sock -> netlink sock */
    struct netlink_sock *nlk = nlk_sk(sk);
    /* Whether this receive is non-blocking. */
    int noblock = flags&MSG_DONTWAIT;
    size_t copied;
    struct sk_buff *skb;
    int err;

    /* Out-of-band data is not supported on netlink. */
    if (flags&MSG_OOB)
        return -EOPNOTSUPP;

    copied = 0;

    /* Dequeue one datagram (blocks unless noblock). */
    skb = skb_recv_datagram(sk,flags,noblock,&err);
    if (skb==NULL)
        goto out;

    msg->msg_namelen = 0;

    /* Actual length of the received datagram. */
    copied = skb->len;
    /* User buffer too small: truncate and flag it. */
    if (len < copied) {
        msg->msg_flags |= MSG_TRUNC;
        copied = len;
    }

    skb->h.raw = skb->data;
    /* Copy the (possibly truncated) payload into the user iovec. */
    err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

    if (msg->msg_name) {
        /* Caller wants the source address: fill in a sockaddr_nl. */
        struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name;
        addr->nl_family = AF_NETLINK;
        addr->nl_pad = 0;
        addr->nl_pid = NETLINK_CB(skb).pid;
        addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
        msg->msg_namelen = sizeof(*addr);
    }

    /* PKTINFO requested: emit packet-info as an ancillary cmsg. */
    if (nlk->flags & NETLINK_RECV_PKTINFO)
        netlink_cmsg_recv_pktinfo(msg, skb);

    if (NULL == siocb->scm) {
        memset(&scm, 0, sizeof(scm));
        siocb->scm = &scm;
    }
    /* Pass the sender's credentials up via SCM. */
    siocb->scm->creds = *NETLINK_CREDS(skb);

    skb_free_datagram(sk, skb);

    /* Ongoing dump and queue half-drained: produce the next chunk. */
    if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
        netlink_dump(sk);

    scm_recv(sock, msg, siocb->scm, flags);

out:
    /* Wake any sender blocked waiting for receive-buffer space. */
    netlink_rcv_wake(sk);
    return err ? : copied;
}

6. 结论

netlink处理代码不是很好懂, 毕竟和其他协议不同之处是内核中同时存在服务器和客户端的sock, 因

此接收发送数据要注意数据的流向。不过在实际使用中感觉不是很稳定, 流量大时会发生各种奇异的死机现象。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值