1.netlink介绍
netlink socket是一种用于用户态进程和内核态进程之间的通信机制。它通过为内核模块提供一组特殊的API,并为用户程序提供了一组标准的socket接口的方式,实现了全双工的通讯连接。
netlink仅支持32种协议类型,目前内核中已经使用其中21个,对于用户需要定制特殊的协议类型略显不够,为此Linux设计了这种通用Netlink协议簇,用户可在此之上定义更多类型的子协议。
#define NETLINK_ROUTE 0 /* Routing/device hook */
#define NETLINK_UNUSED 1 /* Unused number */
#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */
#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */
#define NETLINK_SOCK_DIAG 4 /* socket monitoring */
#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */
#define NETLINK_XFRM 6 /* ipsec */
#define NETLINK_SELINUX 7 /* SELinux event notifications */
#define NETLINK_ISCSI 8 /* Open-iSCSI */
#define NETLINK_AUDIT 9 /* auditing */
#define NETLINK_FIB_LOOKUP 10
#define NETLINK_CONNECTOR 11
#define NETLINK_NETFILTER 12 /* netfilter subsystem */
#define NETLINK_IP6_FW 13
#define NETLINK_DNRTMSG 14 /* DECnet routing messages */
#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */
#define NETLINK_GENERIC 16
/* leave room for NETLINK_DM (DM Events) */
#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */
#define NETLINK_ECRYPTFS 19
#define NETLINK_RDMA 20
#define NETLINK_CRYPTO 21 /* Crypto layer */
#define NETLINK_SMC 22 /* SMC monitoring */
#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG
#define MAX_LINKS 32
generic netlink支持1023个子协议号,弥补了netlink协议类型较少的缺陷。#define NETLINK_GENERIC 16
2 netlink通信架构
Netlink子系统(3)是所有genl通信的基础,Netlink子系统中收到的所有Generic类型的netlink数据都被送到genl总线(2)上;从内核发出的数据也经由genl总线送至netlink子系统,再打包送至用户空间。
Generic Netlink控制器(1)作为内核的一部分,负责动态地分配genl通道(即genl family id),并管理genl任务。genl控制器是一个特殊的genl内核用户,它负责监听genl bus上的通信通道。genl通信建立在一系列的通信通道的基础上,每个genl family对应多个通道,这些通道由genl控制器动态分配。
2.1 netlink类型的消息结构
2.1.1 相关结构体
netlink 必须采用特定的格式,开头是 netlink 消息报头(struct nlmsghdr),通用 netlink 还有一个通用 netlink 消息报头(struct genlmsghdr)
struct nlmsghdr {
__u32 nlmsg_len; /* Length of message including header */
__u16 nlmsg_type; /* Message content */
__u16 nlmsg_flags; /* Additional flags */
__u32 nlmsg_seq; /* Sequence number */
__u32 nlmsg_pid; /* Sending process port ID */
};
struct genlmsghdr {
__u8 cmd;
__u8 version;
__u16 reserved;
};
struct genl_family {
int id; /* private */
unsigned int hdrsize;
char name[GENL_NAMSIZ];
unsigned int version;
unsigned int maxattr;
bool netnsok;
bool parallel_ops;
int (*pre_doit)(const struct genl_ops *ops,
struct sk_buff *skb,
struct genl_info *info);
void (*post_doit)(const struct genl_ops *ops,
struct sk_buff *skb,
struct genl_info *info);
struct nlattr ** attrbuf; /* private */
const struct genl_ops * ops; //cmd操作注册点
const struct genl_multicast_group *mcgrps;
unsigned int n_ops;
unsigned int n_mcgrps;
unsigned int mcgrp_offset; /* private */
struct module *module;
};
struct genl_ops {
const struct nla_policy *policy;
int (*doit)(struct sk_buff *skb,
struct genl_info *info);
int (*start)(struct netlink_callback *cb);
int (*dumpit)(struct sk_buff *skb,
struct netlink_callback *cb);
int (*done)(struct netlink_callback *cb);
u8 cmd;
u8 internal_flags;
u8 flags;
};
- doit:回调函数,在generic netlink收到数据时触发,运行在进程上下文
- dumpit:回调函数,当genl_ops的flag标志被添加了NLM_F_DUMP以后,每次收到genl消息即会回触发这个函数
dumpit与doit的区别是:dumpit的第一个参数skb不会携带从客户端发来的数据。相反地,开发者应该在skb中填入需要传给客户端的数据,然后,并skb的数据长度(可以用skb->len)return。skb中携带的数据会被自动送到客户端。只要dumpit的返回值大于0,dumpit函数就会再次被调用,并被要求在skb中填入数据。当服务端没有数据要传给客户端时,dumpit要返回0。如果函数中出错,要求返回一个负值。
3 netlink 在ovs中的使用
前面已经提到netlink仅支持32种协议类型,目前内核中已经使用其中21个,对于用户需要定制特殊的协议类型略显不够,为此Linux设计了这种通用Netlink协议簇,在ovs中使用的也是Generic Netlink
ovs定义的Generic Netlink协议:
static struct genl_family * const dp_genl_families[] = {
&dp_datapath_genl_family,
&dp_vport_genl_family,
&dp_flow_genl_family,
&dp_packet_genl_family,
&dp_meter_genl_family,
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
&dp_ct_limit_genl_family,
#endif
};
以dp_datapath_genl_family为例看一下注册的cmd处理函数和policy策略
static struct genl_family dp_datapath_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
.maxattr = OVS_DP_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
.ops = dp_datapath_genl_ops,
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
};
dp_datapath 类型的netlink消息支持的cmd操作
static const struct genl_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_new
},
{ .cmd = OVS_DP_CMD_DEL,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_del
},
{ .cmd = OVS_DP_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_get,
.dumpit = ovs_dp_cmd_dump
},
{ .cmd = OVS_DP_CMD_SET,
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_set,
},
};
每个cmd操作的policy
static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
[OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};
3.1 Generic Netlink注册
static int __init dp_register_genl(void)
{
int err;
int i;
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
err = genl_register_family(dp_genl_families[i]);
if (err)
goto error;
}
return 0;
error:
dp_unregister_genl(i);
return err;
}
int genl_register_family(struct genl_family *family)
{
int err, i;
int start = GENL_START_ALLOC, end = GENL_MAX_ID;
err = genl_validate_ops(family);
if (err)
return err;
genl_lock_all();
if (genl_family_find_byname(family->name)) {
err = -EEXIST;
goto errout_locked;
}
/*
* Sadly, a few cases need to be special-cased
* due to them having previously abused the API
* and having used their family ID also as their
* multicast group ID, so we use reserved IDs
* for both to be sure we can do that mapping.
*/
if (family == &genl_ctrl) {
/* and this needs to be special for initial family lookups */
start = end = GENL_ID_CTRL;
} else if (strcmp(family->name, "pmcraid") == 0) {
start = end = GENL_ID_PMCRAID;
} else if (strcmp(family->name, "VFS_DQUOT") == 0) {
start = end = GENL_ID_VFS_DQUOT;
}
if (family->maxattr && !family->parallel_ops) {
family->attrbuf = kmalloc_array(family->maxattr + 1,
sizeof(struct nlattr *),
GFP_KERNEL);
if (family->attrbuf == NULL) {
err = -ENOMEM;
goto errout_locked;
}
} else
family->attrbuf = NULL;
family->id = idr_alloc(&genl_fam_idr, family,
start, end + 1, GFP_KERNEL); //分配ID,每个netlink一个ID
if (family->id < 0) {
err = family->id;
goto errout_free;
}
err = genl_validate_assign_mc_groups(family);
if (err)
goto errout_remove;
genl_unlock_all();
/* send all events */
genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); //用于向所有的应用层加入CTRL控制器簇组播组的Generic Netlink套接字多播发送CTRL_CMD_NEWFAMILY消息,通知应用层有新的family注册了,这样应用层就可以捕获这一消息
for (i = 0; i < family->n_mcgrps; i++)
genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family,
&family->mcgrps[i], family->mcgrp_offset + i);
return 0;
errout_remove:
idr_remove(&genl_fam_idr, family->id);
errout_free:
kfree(family->attrbuf);
errout_locked:
genl_unlock_all();
return err;
}
3.2 内核空间接收用户态数据
内核端一旦收到generic netlink数据,会触发doit函数运行,通过回调函数进行处理。
注册generic netlink总线的处理函数 genl_rcv
static int __net_init genl_pernet_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = genl_rcv,
.flags = NL_CFG_F_NONROOT_RECV,
};
/* we'll bump the group number right afterwards */
net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg);
if (!net->genl_sock && net_eq(net, &init_net))
panic("GENL: Cannot initialize generic netlink\n");
if (!net->genl_sock)
return -ENOMEM;
return 0;
}
static void genl_rcv(struct sk_buff *skb)
{
down_read(&cb_lock);
netlink_rcv_skb(skb, &genl_rcv_msg);
up_read(&cb_lock);
}
static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
const struct genl_family *family;
int err;
family = genl_family_find_byid(nlh->nlmsg_type);
if (family == NULL)
return -ENOENT;
if (!family->parallel_ops)
genl_lock();
err = genl_family_rcv_msg(family, skb, nlh, extack);
if (!family->parallel_ops)
genl_unlock();
return err;
}
static int genl_family_rcv_msg(const struct genl_family *family,
struct sk_buff *skb,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
const struct genl_ops *ops;
struct net *net = sock_net(skb->sk);
struct genl_info info;
struct genlmsghdr *hdr = nlmsg_data(nlh);
struct nlattr **attrbuf;
int hdrlen, err;
/* this family doesn't exist in this netns */
if (!family->netnsok && !net_eq(net, &init_net))
return -ENOENT;
hdrlen = GENL_HDRLEN + family->hdrsize;
if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
return -EINVAL;
ops = genl_get_cmd(hdr->cmd, family); //通过generic netlink的ID获取对应的ops,family即为ovs注册各类协议
if (ops == NULL)
return -EOPNOTSUPP;
if ((ops->flags & GENL_ADMIN_PERM) &&
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
int rc;
if (ops->dumpit == NULL)
return -EOPNOTSUPP;
if (!family->parallel_ops) {
struct netlink_dump_control c = {
.module = family->module,
/* we have const, but the netlink API doesn't */
.data = (void *)ops,
.start = genl_lock_start,
.dump = genl_lock_dumpit,
.done = genl_lock_done,
};
genl_unlock();
rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
genl_lock();
} else {
struct netlink_dump_control c = {
.module = family->module,
.start = ops->start,
.dump = ops->dumpit,
.done = ops->done,
};
rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
}
return rc;
}
if (ops->doit == NULL)
return -EOPNOTSUPP;
if (family->maxattr && family->parallel_ops) {
attrbuf = kmalloc_array(family->maxattr + 1,
sizeof(struct nlattr *),
GFP_KERNEL);
if (attrbuf == NULL)
return -ENOMEM;
} else
attrbuf = family->attrbuf;
if (attrbuf) {
err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
ops->policy, extack);
if (err < 0)
goto out;
}
info.snd_seq = nlh->nlmsg_seq;
info.snd_portid = NETLINK_CB(skb).portid;
info.nlhdr = nlh;
info.genlhdr = nlmsg_data(nlh); //generic netlink消息头
info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; //指向的是ovs_header结构体,结构中只有一个成员变量dp_ifindex
info.attrs = attrbuf; //用户态传入的属性值
info.extack = extack;
genl_info_net_set(&info, net);
memset(&info.user_ptr, 0, sizeof(info.user_ptr));
if (family->pre_doit) {
err = family->pre_doit(ops, skb, &info);
if (err)
goto out;
}
err = ops->doit(skb, &info);//调用cmd处理函数
if (family->post_doit)
family->post_doit(ops, skb, &info);
out:
if (family->parallel_ops)
kfree(attrbuf);
return err;
}
3.3 用户态使用netlink协议
在使用generic netlink消息时,首先需要根据定义的名称获取其ID,ID是在注册时用ctl控制器统一分配的。
ovs在与datapath通信时同样也需要首先获取各类generic netlink的ID
static int
dpif_netlink_init(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
static int error;
if (ovsthread_once_start(&once)) {
error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
&ovs_datapath_family);
if (error) {
VLOG_INFO("Generic Netlink family '%s' does not exist. "
"The Open vSwitch kernel module is probably not loaded.",
OVS_DATAPATH_FAMILY);
}
if (!error) {
error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
}
if (!error) {
error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
}
if (!error) {
error = nl_lookup_genl_family(OVS_PACKET_FAMILY,
&ovs_packet_family);
}
if (!error) {
error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
&ovs_vport_mcgroup);
}
if (!error) {
if (nl_lookup_genl_family(OVS_METER_FAMILY, &ovs_meter_family)) {
VLOG_INFO("The kernel module does not support meters.");
}
}
if (nl_lookup_genl_family(OVS_CT_LIMIT_FAMILY,
&ovs_ct_limit_family) < 0) {
VLOG_INFO("Generic Netlink family '%s' does not exist. "
"Please update the Open vSwitch kernel module to enable "
"the conntrack limit feature.", OVS_CT_LIMIT_FAMILY);
}
ovs_tunnels_out_of_tree = dpif_netlink_rtnl_probe_oot_tunnels();
ovsthread_once_done(&once);
}
return error;
}
#define OVS_DATAPATH_FAMILY "ovs_datapath"
error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
&ovs_datapath_family);
根据name获取id,此处使用的name与在内核datapath注册时填入的name一致。
int
nl_lookup_genl_family(const char *name, int *number)
{
if (*number == 0) {
struct nlattr *attrs[ARRAY_SIZE(family_policy)];
struct ofpbuf *reply;
int error;
error = do_lookup_genl_family(name, attrs, &reply);
if (!error) {
*number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
define_genl_family(*number, name);
} else {
*number = -error;
}
ofpbuf_delete(reply);
ovs_assert(*number != 0);
}
return *number > 0 ? 0 : -*number;
}
static int
do_lookup_genl_family(const char *name, struct nlattr **attrs,
struct ofpbuf **replyp)
{
struct nl_sock *sock;
struct ofpbuf request, *reply;
int error;
*replyp = NULL;
error = nl_sock_create(NETLINK_GENERIC, &sock); //创建NETLINK_GENERIC类型的socket
if (error) {
return error;
}
ofpbuf_init(&request, 0);
nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
CTRL_CMD_GETFAMILY, 1); //填充消息头,消息发送给内核的crtl模块,cmd为CTRL_CMD_GETFAMILY,根据协议名称获取ID
nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name); //填充要获取ID的协议名称
error = nl_sock_transact(sock, &request, &reply);
ofpbuf_uninit(&request);
if (error) {
nl_sock_destroy(sock);
return error;
}
if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
family_policy, attrs, ARRAY_SIZE(family_policy))
|| nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
nl_sock_destroy(sock);
ofpbuf_delete(reply);
return EPROTO;
}
nl_sock_destroy(sock);
*replyp = reply;
return 0;
}