vswitch 与内核datapath通信机制 Generic Netlink介绍

1.netlink介绍 

netlink socket是一种用于用户态进程和内核态进程之间的通信机制。它通过为内核模块提供一组特殊的API,并为用户程序提供了一组标准的socket接口的方式,实现了全双工的通讯连接。

netlink仅支持32种协议类型,目前内核中已经使用其中21个,对于用户需要定制特殊的协议类型略显不够,为此Linux设计了这种通用Netlink协议簇,用户可在此之上定义更多类型的子协议。

#define NETLINK_ROUTE		0	/* Routing/device hook				*/
#define NETLINK_UNUSED		1	/* Unused number				*/
#define NETLINK_USERSOCK	2	/* Reserved for user mode socket protocols 	*/
#define NETLINK_FIREWALL	3	/* Unused number, formerly ip_queue		*/
#define NETLINK_SOCK_DIAG	4	/* socket monitoring				*/
#define NETLINK_NFLOG		5	/* netfilter/iptables ULOG */
#define NETLINK_XFRM		6	/* ipsec */
#define NETLINK_SELINUX		7	/* SELinux event notifications */
#define NETLINK_ISCSI		8	/* Open-iSCSI */
#define NETLINK_AUDIT		9	/* auditing */
#define NETLINK_FIB_LOOKUP	10	
#define NETLINK_CONNECTOR	11
#define NETLINK_NETFILTER	12	/* netfilter subsystem */
#define NETLINK_IP6_FW		13
#define NETLINK_DNRTMSG		14	/* DECnet routing messages */
#define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
#define NETLINK_GENERIC		16
/* leave room for NETLINK_DM (DM Events) */
#define NETLINK_SCSITRANSPORT	18	/* SCSI Transports */
#define NETLINK_ECRYPTFS	19
#define NETLINK_RDMA		20
#define NETLINK_CRYPTO		21	/* Crypto layer */
#define NETLINK_SMC		22	/* SMC monitoring */

#define NETLINK_INET_DIAG	NETLINK_SOCK_DIAG

#define MAX_LINKS 32		

generic netlink支持1023个子协议号,弥补了netlink协议类型较少的缺陷。#define NETLINK_GENERIC        16

2 netlink通信架构

Netlink子系统(3)是所有genl通信的基础,Netlink子系统中收到的所有Generic类型的netlink数据都被送到genl总线(2)上;从内核发出的数据也经由genl总线送至netlink子系统,再打包送至用户空间。

Generic Netlink控制器(1)作为内核的一部分,负责动态地分配genl通道(即genl family id),并管理genl任务。genl控制器是一个特殊的genl内核用户,它负责监听genl bus上的通信通道。genl通信建立在一系列的通信通道的基础上,每个genl family对应多个通道,这些通道由genl控制器动态分配。

2.1  netlink类型的消息结构

 2.1.1 相关结构体

netlink 必须采用特定的格式,开头是 netlink 消息报头(struct nlmsghdr),通用 netlink 还有一个通用 netlink 消息报头(struct genlmsghdr)

struct nlmsghdr {
    __u32        nlmsg_len;    /* Length of message including header */
    __u16        nlmsg_type;    /* Message content */
    __u16        nlmsg_flags;    /* Additional flags */
    __u32        nlmsg_seq;    /* Sequence number */
    __u32        nlmsg_pid;    /* Sending process port ID */
};
struct genlmsghdr {
    __u8    cmd;
    __u8    version;
    __u16    reserved;
};
struct genl_family {
    int            id;        /* private */
    unsigned int        hdrsize;
    char            name[GENL_NAMSIZ];
    unsigned int        version;
    unsigned int        maxattr;
    bool            netnsok;
    bool            parallel_ops;
    int            (*pre_doit)(const struct genl_ops *ops,
                        struct sk_buff *skb,
                        struct genl_info *info);
    void            (*post_doit)(const struct genl_ops *ops,
                         struct sk_buff *skb,
                         struct genl_info *info);
    struct nlattr **    attrbuf;    /* private */
    const struct genl_ops *    ops;  //cmd操作注册点
    const struct genl_multicast_group *mcgrps;
    unsigned int        n_ops;
    unsigned int        n_mcgrps;
    unsigned int        mcgrp_offset;    /* private */
    struct module        *module;
};
struct genl_ops {
    const struct nla_policy    *policy;
    int               (*doit)(struct sk_buff *skb,
                       struct genl_info *info);

    int               (*start)(struct netlink_callback *cb);
    int               (*dumpit)(struct sk_buff *skb,
                     struct netlink_callback *cb);

    int               (*done)(struct netlink_callback *cb);
    u8            cmd;
    u8            internal_flags;
    u8            flags;
};

  • doit:回调函数,在generic netlink收到数据时触发,运行在进程上下文
  • dumpit:回调函数,当genl_ops的flag标志被添加了NLM_F_DUMP以后,每次收到genl消息即会回触发这个函数

dumpit与doit的区别是:dumpit的第一个参数skb不会携带从客户端发来的数据。相反地,开发者应该在skb中填入需要传给客户端的数据,然后,并skb的数据长度(可以用skb->len)return。skb中携带的数据会被自动送到客户端。只要dumpit的返回值大于0,dumpit函数就会再次被调用,并被要求在skb中填入数据。当服务端没有数据要传给客户端时,dumpit要返回0。如果函数中出错,要求返回一个负值。

3 netlink 在ovs中的使用

前面已经提到netlink仅支持32种协议类型,目前内核中已经使用其中21个,对于用户需要定制特殊的协议类型略显不够,为此Linux设计了这种通用Netlink协议簇,在ovs中使用的也是Generic Netlink

ovs定义的Generic Netlink协议:

static struct genl_family * const dp_genl_families[] = {
    &dp_datapath_genl_family,
    &dp_vport_genl_family,
    &dp_flow_genl_family,
    &dp_packet_genl_family,
    &dp_meter_genl_family,
#if    IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
    &dp_ct_limit_genl_family,
#endif
};

以dp_datapath_genl_family为例看一下注册的cmd处理函数和policy策略

static struct genl_family dp_datapath_genl_family __ro_after_init = {
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_DATAPATH_FAMILY,
	.version = OVS_DATAPATH_VERSION,
	.maxattr = OVS_DP_ATTR_MAX,
	.netnsok = true,
	.parallel_ops = true,
	.ops = dp_datapath_genl_ops,
	.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
	.mcgrps = &ovs_dp_datapath_multicast_group,
	.n_mcgrps = 1,
	.module = THIS_MODULE,
};

dp_datapath 类型的netlink消息支持的cmd操作
static const struct genl_ops dp_datapath_genl_ops[] = {
	{ .cmd = OVS_DP_CMD_NEW,
	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = datapath_policy,
	  .doit = ovs_dp_cmd_new
	},
	{ .cmd = OVS_DP_CMD_DEL,
	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = datapath_policy,
	  .doit = ovs_dp_cmd_del
	},
	{ .cmd = OVS_DP_CMD_GET,
	  .flags = 0,		    /* OK for unprivileged users. */
	  .policy = datapath_policy,
	  .doit = ovs_dp_cmd_get,
	  .dumpit = ovs_dp_cmd_dump
	},
	{ .cmd = OVS_DP_CMD_SET,
	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
	  .policy = datapath_policy,
	  .doit = ovs_dp_cmd_set,
	},
};

每个cmd操作的policy
static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
	[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
	[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
	[OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};

3.1 Generic Netlink注册

static int __init dp_register_genl(void)
{
	int err;
	int i;

	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {

		err = genl_register_family(dp_genl_families[i]);
		if (err)
			goto error;
	}

	return 0;

error:
	dp_unregister_genl(i);
	return err;
}

int genl_register_family(struct genl_family *family)
{
	int err, i;
	int start = GENL_START_ALLOC, end = GENL_MAX_ID;

	err = genl_validate_ops(family);
	if (err)
		return err;

	genl_lock_all();

	if (genl_family_find_byname(family->name)) {
		err = -EEXIST;
		goto errout_locked;
	}

	/*
	 * Sadly, a few cases need to be special-cased
	 * due to them having previously abused the API
	 * and having used their family ID also as their
	 * multicast group ID, so we use reserved IDs
	 * for both to be sure we can do that mapping.
	 */
	if (family == &genl_ctrl) {
		/* and this needs to be special for initial family lookups */
		start = end = GENL_ID_CTRL;
	} else if (strcmp(family->name, "pmcraid") == 0) {
		start = end = GENL_ID_PMCRAID;
	} else if (strcmp(family->name, "VFS_DQUOT") == 0) {
		start = end = GENL_ID_VFS_DQUOT;
	}

	if (family->maxattr && !family->parallel_ops) {
		family->attrbuf = kmalloc_array(family->maxattr + 1,
						sizeof(struct nlattr *),
						GFP_KERNEL);
		if (family->attrbuf == NULL) {
			err = -ENOMEM;
			goto errout_locked;
		}
	} else
		family->attrbuf = NULL;

	family->id = idr_alloc(&genl_fam_idr, family,
			       start, end + 1, GFP_KERNEL); //分配ID,每个netlink一个ID
	if (family->id < 0) {
		err = family->id;
		goto errout_free;
	}

	err = genl_validate_assign_mc_groups(family);
	if (err)
		goto errout_remove;

	genl_unlock_all();

	/* send all events */
	genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); //用于向所有的应用层加入CTRL控制器簇组播组的Generic Netlink套接字多播发送CTRL_CMD_NEWFAMILY消息,通知应用层有新的family注册了,这样应用层就可以捕获这一消息
	for (i = 0; i < family->n_mcgrps; i++)
		genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family,
				&family->mcgrps[i], family->mcgrp_offset + i);

	return 0;

errout_remove:
	idr_remove(&genl_fam_idr, family->id);
errout_free:
	kfree(family->attrbuf);
errout_locked:
	genl_unlock_all();
	return err;
}

3.2 内核空间接收用户态数据

内核端一旦收到generic netlink数据,会触发doit函数运行,通过回调函数进行处理。

注册generic netlink总线的处理函数 genl_rcv

static int __net_init genl_pernet_init(struct net *net)
{
	struct netlink_kernel_cfg cfg = {
		.input		= genl_rcv,
		.flags		= NL_CFG_F_NONROOT_RECV,
	};

	/* we'll bump the group number right afterwards */
	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg);

	if (!net->genl_sock && net_eq(net, &init_net))
		panic("GENL: Cannot initialize generic netlink\n");

	if (!net->genl_sock)
		return -ENOMEM;

	return 0;
}
static void genl_rcv(struct sk_buff *skb)
{
	down_read(&cb_lock);
	netlink_rcv_skb(skb, &genl_rcv_msg);
	up_read(&cb_lock);
}

static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
			struct netlink_ext_ack *extack)
{
	const struct genl_family *family;
	int err;

	family = genl_family_find_byid(nlh->nlmsg_type);
	if (family == NULL)
		return -ENOENT;

	if (!family->parallel_ops)
		genl_lock();

	err = genl_family_rcv_msg(family, skb, nlh, extack);

	if (!family->parallel_ops)
		genl_unlock();

	return err;
}


static int genl_family_rcv_msg(const struct genl_family *family,
			       struct sk_buff *skb,
			       struct nlmsghdr *nlh,
			       struct netlink_ext_ack *extack)
{
	const struct genl_ops *ops;
	struct net *net = sock_net(skb->sk);
	struct genl_info info;
	struct genlmsghdr *hdr = nlmsg_data(nlh);
	struct nlattr **attrbuf;
	int hdrlen, err;

	/* this family doesn't exist in this netns */
	if (!family->netnsok && !net_eq(net, &init_net))
		return -ENOENT;

	hdrlen = GENL_HDRLEN + family->hdrsize;
	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
		return -EINVAL;

	ops = genl_get_cmd(hdr->cmd, family); //通过generic netlink的ID获取对应的ops,family即为ovs注册各类协议
	if (ops == NULL)
		return -EOPNOTSUPP;

	if ((ops->flags & GENL_ADMIN_PERM) &&
	    !netlink_capable(skb, CAP_NET_ADMIN))
		return -EPERM;

	if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
		int rc;

		if (ops->dumpit == NULL)
			return -EOPNOTSUPP;

		if (!family->parallel_ops) {
			struct netlink_dump_control c = {
				.module = family->module,
				/* we have const, but the netlink API doesn't */
				.data = (void *)ops,
				.start = genl_lock_start,
				.dump = genl_lock_dumpit,
				.done = genl_lock_done,
			};

			genl_unlock();
			rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
			genl_lock();

		} else {
			struct netlink_dump_control c = {
				.module = family->module,
				.start = ops->start,
				.dump = ops->dumpit,
				.done = ops->done,
			};

			rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
		}

		return rc;
	}

	if (ops->doit == NULL)
		return -EOPNOTSUPP;

	if (family->maxattr && family->parallel_ops) {
		attrbuf = kmalloc_array(family->maxattr + 1,
					sizeof(struct nlattr *),
					GFP_KERNEL);
		if (attrbuf == NULL)
			return -ENOMEM;
	} else
		attrbuf = family->attrbuf;

	if (attrbuf) {
		err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
				  ops->policy, extack);
		if (err < 0)
			goto out;
	}

	info.snd_seq = nlh->nlmsg_seq;
	info.snd_portid = NETLINK_CB(skb).portid;
	info.nlhdr = nlh;
	info.genlhdr = nlmsg_data(nlh); //generic netlink消息头
	info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; //指向的是ovs_header结构体,结构中只有一个成员变量dp_ifindex
	info.attrs = attrbuf; //用户态传入的属性值
	info.extack = extack;
	genl_info_net_set(&info, net);
	memset(&info.user_ptr, 0, sizeof(info.user_ptr));

	if (family->pre_doit) {
		err = family->pre_doit(ops, skb, &info);
		if (err)
			goto out;
	}

	err = ops->doit(skb, &info);//调用cmd处理函数

	if (family->post_doit)
		family->post_doit(ops, skb, &info);

out:
	if (family->parallel_ops)
		kfree(attrbuf);

	return err;
}

3.3 用户态使用netlink协议

在使用generic netlink消息时,首先需要根据定义的名称获取其ID,ID是在注册时用ctl控制器统一分配的。

ovs在与datapath通信时同样也需要首先获取各类generic netlink的ID

static int
dpif_netlink_init(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static int error;

    if (ovsthread_once_start(&once)) {
        error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
                                      &ovs_datapath_family);
        if (error) {
            VLOG_INFO("Generic Netlink family '%s' does not exist. "
                      "The Open vSwitch kernel module is probably not loaded.",
                      OVS_DATAPATH_FAMILY);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_PACKET_FAMILY,
                                          &ovs_packet_family);
        }
        if (!error) {
            error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
                                           &ovs_vport_mcgroup);
        }
        if (!error) {
            if (nl_lookup_genl_family(OVS_METER_FAMILY, &ovs_meter_family)) {
                VLOG_INFO("The kernel module does not support meters.");
            }
        }
        if (nl_lookup_genl_family(OVS_CT_LIMIT_FAMILY,
                                  &ovs_ct_limit_family) < 0) {
            VLOG_INFO("Generic Netlink family '%s' does not exist. "
                      "Please update the Open vSwitch kernel module to enable "
                      "the conntrack limit feature.", OVS_CT_LIMIT_FAMILY);
        }

        ovs_tunnels_out_of_tree = dpif_netlink_rtnl_probe_oot_tunnels();

        ovsthread_once_done(&once);
    }

    return error;
}

#define OVS_DATAPATH_FAMILY  "ovs_datapath"         

error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
                                      &ovs_datapath_family);

根据name获取id,此处使用的name与在内核datapath注册时填入的name一致。

int
nl_lookup_genl_family(const char *name, int *number)
{
    if (*number == 0) {
        struct nlattr *attrs[ARRAY_SIZE(family_policy)];
        struct ofpbuf *reply;
        int error;

        error = do_lookup_genl_family(name, attrs, &reply);
        if (!error) {
            *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
            define_genl_family(*number, name);
        } else {
            *number = -error;
        }
        ofpbuf_delete(reply);

        ovs_assert(*number != 0);
    }
    return *number > 0 ? 0 : -*number;
}

static int
do_lookup_genl_family(const char *name, struct nlattr **attrs,
                      struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    struct ofpbuf request, *reply;
    int error;

    *replyp = NULL;
    error = nl_sock_create(NETLINK_GENERIC, &sock); //创建NETLINK_GENERIC类型的socket
    if (error) {
        return error;
    }

    ofpbuf_init(&request, 0);
    nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
                          CTRL_CMD_GETFAMILY, 1); //填充消息头,消息发送给内核的crtl模块,cmd为CTRL_CMD_GETFAMILY,根据协议名称获取ID
    nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name); //填充要获取ID的协议名称
    error = nl_sock_transact(sock, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        nl_sock_destroy(sock);
        return error;
    }

    if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
                         family_policy, attrs, ARRAY_SIZE(family_policy))
        || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
        nl_sock_destroy(sock);
        ofpbuf_delete(reply);
        return EPROTO;
    }

    nl_sock_destroy(sock);
    *replyp = reply;
    return 0;
}

4 netlink收发数据—以ovs中packet为例 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值