netfilter之filter

对于filter表来说,作用在三个hook点 input, forward和output,每个hook点上默认有一条链,每个链默认有一条rule,这个rule不能被删除,永远是此链上最后一个rule,如果前面的rule没匹配上,肯定会匹配到这个默认的rule,并执行它的target(内建action:drop,accept等)。

通过iptables下发规则时,会将所有规则放在一块内存中,在kernel端整体替换,内存中的规则分布如下

image.png

hook_entry和underflow分别用于标识此链中用户添加的rule的头和尾,即第一个用户添加的rule的头和最后一个用户添加的rule的尾,这里的最后一个rule指的是用户添加的rule,不是默认rule。

上面提到的三个链(即 input, forward和output)及其默认rule是在加载filter模块时自动生成的,此时还没有用户设置rule,所以hook_entry和underflow的值是相同的。

用户自定义链添加在三个默认链后面,并且会默认添加两条rule,其中return rule用于返回调用链,error rule用于标识链的结束,可以通过计数error rule的个数获取用户自定义链的个数。

filter 模块初始化

// The filter table is built as a kernel module (loaded with e.g. `insmod iptable_filter`).
// module_init() names iptable_filter_init as the module's initialization function.
module_init(iptable_filter_init);

// Bitmask of the hook points the filter table attaches to: LOCAL_IN, FORWARD
// and LOCAL_OUT. Each hook contributes one bit at the position of its hook number.
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
                (1 << NF_INET_FORWARD) | \
                (1 << NF_INET_LOCAL_OUT))

// Template describing the IPv4 "filter" table. It is passed to
// ipt_alloc_initial_table(), ipt_register_table() and xt_hook_link(); the
// registration path duplicates it (kmemdup) rather than linking this object.
static const struct xt_table packet_filter = {
    .name       = "filter",
    .valid_hooks    = FILTER_VALID_HOOKS, // 1110 in binary (original author's note)
    .me     = THIS_MODULE,
    .af     = NFPROTO_IPV4,
    .priority   = NF_IP_PRI_FILTER, // NF_IP_PRI_FILTER = 0 (original author's note)
};

// Per-network-namespace callbacks for the filter table. Registered with
// register_pernet_subsys() from iptable_filter_init(); .init creates the
// table for a namespace and .exit is its teardown counterpart.
static struct pernet_operations iptable_filter_net_ops = {
    .init = iptable_filter_net_init,
    .exit = iptable_filter_net_exit,
};

//kernel module iptable_filter初始化函数
static int __init iptable_filter_init(void)
{
    //注册到pernet子系统,将iptable_filter_net_ops添加到链表first_device
    //如果支持多网络namespace,则对每个namespace调用ops->init(net),即iptable_filter_net_init(net)
    //如果不支持,则只对init_net调用init函数iptable_filter_net_init
    register_pernet_subsys(&iptable_filter_net_ops);

    //将filter提供三个hook函数(其实是同一个函数iptable_filter_hook)挂载到 nf_hooks[reg->pf][reg->hooknum]
    /* Register hooks */
     xt_hook_link(&packet_filter, iptable_filter_hook);
}
  1. 注册filter表及生成默认规则
    iptable_filter_net_init参数为net,用来对所有network namespace进行初始化。

/*
 * Per-namespace initialization: build the default rule blob for the filter
 * table and register the table in @net.
 *
 * Returns 0 on success, -ENOMEM if the initial table cannot be allocated,
 * or the error encoded in the pointer returned by ipt_register_table().
 */
static int __net_init iptable_filter_net_init(struct net *net)
{
    struct ipt_replace *repl;

    /*
     * 1.1 Build the ipt_replace blob from packet_filter: one default entry
     * per bit set in valid_hooks, each with its verdict initialized to
     * NF_ACCEPT and without any extension match (ipt_entry_match).
     */
    repl = ipt_alloc_initial_table(&packet_filter);
    if (repl == NULL)
        return -ENOMEM;

    /* Entry 1 is the FORWARD hook */
    /*
     * `forward` is a parameter of the filter module (default true per the
     * original notes). Loading with `insmod xxx forward=false` makes the
     * FORWARD default rule drop packets. A standard target stores its
     * verdict as a negative value (-verdict - 1); a positive value is an
     * offset to jump to.
     */
    ((struct ipt_standard *)repl->entries)[1].target.verdict =
        forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;

    /*
     * 1.2 Convert repl into a struct xt_table_info stored in the private
     * field of a struct xt_table, and link that table into
     * net->xt.tables[table->af].
     */
    net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter, repl);

    /* ipt_register_table() copies the entries out of repl, so free the blob here. */
    kfree(repl);

    return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
}

1.1 ipt_alloc_initial_table
ipt_alloc_initial_table通过宏xt_alloc_initial_table生成struct ipt_replace+默认的四个entry,对应的图示如下

image.png

/*
 * Allocate the initial ipt_replace blob for the table described by @info.
 *
 * The xt_alloc_initial_table() macro reads the parameter `info` by name and
 * itself returns NULL from this function when the allocation fails.
 */
void *ipt_alloc_initial_table(const struct xt_table *info)
{
    void *initial;

    initial = xt_alloc_initial_table(ipt, IPT);
    return initial;
}

/*
 * xt_alloc_initial_table(type, typ2) - build the default rule blob of a table.
 *
 * Statement-expression macro; it evaluates to a pointer to the new blob.
 * It expects a variable named `info` (the table template) in the caller's
 * scope, and on allocation failure it executes `return NULL` in the
 * *enclosing function*.
 *
 * Layout of the zeroed allocation: one <type>_replace header, followed by
 * one <type>_standard entry per bit set in info->valid_hooks, followed by a
 * single <type>_error terminator placed at an offset aligned for that type.
 *
 * For every valid hook, hook_entry[hooknum] and underflow[hooknum] are set
 * to the same byte offset (the hook's only entry) and the entry is
 * initialized with <typ2>_STANDARD_INIT(NF_ACCEPT). num_entries is
 * nhooks + 1 and size covers the standard entries plus the error entry.
 */
#define xt_alloc_initial_table(type, typ2) ({ \
    unsigned int hook_mask = info->valid_hooks; \
    unsigned int nhooks = hweight32(hook_mask); \
    unsigned int bytes = 0, hooknum = 0, i = 0; \
    struct { \
        struct type##_replace repl; \
        struct type##_standard entries[]; \
    } *tbl; \
    struct type##_error *term; \
    size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \
        __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \
    tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \
    if (tbl == NULL) \
        return NULL; \
    term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
    strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
    *term = (struct type##_error)typ2##_ERROR_INIT;  \
    tbl->repl.valid_hooks = hook_mask; \
    tbl->repl.num_entries = nhooks + 1; \
    tbl->repl.size = nhooks * sizeof(struct type##_standard) + \
             sizeof(struct type##_error); \
    for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
        if (!(hook_mask & 1)) \
            continue; \
        tbl->repl.hook_entry[hooknum] = bytes; \
        tbl->repl.underflow[hooknum]  = bytes; \
        tbl->entries[i++] = (struct type##_standard) \
            typ2##_STANDARD_INIT(NF_ACCEPT); \
        bytes += sizeof(struct type##_standard); \
    } \
    tbl; \
})

/*
 * Initializer for a struct ipt_entry that carries no matches: the target
 * starts right after the entry header, and the next entry starts __size
 * bytes after the beginning of this one.
 */
#define IPT_ENTRY_INIT(__size)                             \
{                                          \
    .target_offset  = sizeof(struct ipt_entry),                \
    .next_offset    = (__size),                        \
}

/*
 * Initializer for the user-visible part of a target: records the target
 * name and its size rounded up with XT_ALIGN().
 */
#define XT_TARGET_INIT(__name, __size)                         \
{                                          \
    .target.u.user = {                             \
        .target_size    = XT_ALIGN(__size),                \
        .name       = __name,                      \
    },                                     \
}

/*
 * Initializer for a struct ipt_standard (entry + standard target).
 * The verdict is stored encoded as -(__verdict) - 1, so built-in verdicts
 * are always negative inside the rule blob.
 */
#define IPT_STANDARD_INIT(__verdict)                           \
{                                          \
    .entry      = IPT_ENTRY_INIT(sizeof(struct ipt_standard)),         \
    .target     = XT_TARGET_INIT(XT_STANDARD_TARGET,               \
                     sizeof(struct xt_standard_target)),   \
    .target.verdict = -(__verdict) - 1,                    \
}

1.2 ipt_register_table

struct xt_table *ipt_register_table(struct net *net,
                    const struct xt_table *table,
                    const struct ipt_replace *repl)
{
    int ret;
    struct xt_table_info *newinfo;
    struct xt_table_info bootstrap = {0};
    void *loc_cpu_entry;
    struct xt_table *new_table;
    //分配xt_table_info结构体,大小为XT_TABLE_INFO_SZ
    //#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) + nr_cpu_ids * sizeof(char *))
    newinfo = xt_alloc_table_info(repl->size);

    /* choose the copy on our node/cpu, but dont care about preemption */
    loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
    memcpy(loc_cpu_entry, repl->entries, repl->size);

    //将repl信息转换到newinfo中。
    //转换过程中,作长度的校验
    //调用match或者target提供的check函数检查
    translate_table(net, newinfo, loc_cpu_entry, repl);

    //生成新的xt_table,将旧table中内容复制到新table
    //将newinfo赋值到新table的private
    //将新table加到链表 net->xt.tables[table->af]
    new_table = xt_register_table(net, table, &bootstrap, newinfo);
    return new_table;
}

1.2.1 xt_alloc_table_info
分配结构体内存struct xt_table_info。

/*
 * Allocate a struct xt_table_info together with one rule buffer of @size
 * bytes for every possible CPU.
 *
 * Returns the new object, or NULL if the size is implausibly large or any
 * allocation fails (buffers allocated so far are released in that case).
 */
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
    struct xt_table_info *newinfo;
    int cpu;

    /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
    if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
        return NULL;

    newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
    if (!newinfo)
        return NULL;

    newinfo->size = size;

    for_each_possible_cpu(cpu) {
        /* Small tables come from the slab allocator, larger ones from vmalloc. */
        if (size <= PAGE_SIZE)
            newinfo->entries[cpu] = kmalloc_node(size,
                            GFP_KERNEL,
                            cpu_to_node(cpu));
        else
            newinfo->entries[cpu] = vmalloc_node(size,
                            cpu_to_node(cpu));

        if (newinfo->entries[cpu] == NULL) {
            xt_free_table_info(newinfo);
            return NULL;
        }
    }

    return newinfo;
}

1.2.2 translate_table

/* Checks and translates the user-supplied table segment (held in
   newinfo) */
/*
 * @net:     network namespace, passed through to the per-entry checks
 * @newinfo: destination table info; size, number, hook_entry, underflow
 *           and stacksize are filled in here
 * @entry0:  start of the rule blob (this CPU's copy) to validate
 * @repl:    the replace request describing that blob
 *
 * Returns 0 on success, -ENOMEM if the offsets array cannot be allocated,
 * -EINVAL if the entry count or hook assignments are inconsistent, -ELOOP
 * if mark_source_chains() rejects the chains, or the error of a failing
 * per-entry check.
 */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
                const struct ipt_replace *repl)
{
    struct ipt_entry *iter;
    unsigned int *offsets;
    unsigned int i;
    int ret = 0;

    newinfo->size = repl->size;
    newinfo->number = repl->num_entries;

    /* Init all hooks to impossible value. */
    for (i = 0; i < NF_INET_NUMHOOKS; i++) {
        newinfo->hook_entry[i] = 0xFFFFFFFF;
        newinfo->underflow[i] = 0xFFFFFFFF;
    }

    duprintf("translate_table: size %u\n", newinfo->size);
    offsets = xt_alloc_entry_offsets(newinfo->number);
    if (!offsets)
        return -ENOMEM;
    i = 0;
    /* Walk through entries, checking offsets. */
    xt_entry_foreach(iter, entry0, newinfo->size) {
        ret = check_entry_size_and_hooks(iter, newinfo, entry0,
                         entry0 + repl->size,
                         repl->hook_entry,
                         repl->underflow,
                         repl->valid_hooks);
        if (ret != 0)
            goto out_free;
        /* Record the byte offset of each entry, bounded by the declared count. */
        if (i < repl->num_entries)
            offsets[i] = (void *)iter - entry0;
        ++i;
        // Count the entries whose target name is XT_ERROR_TARGET ("ERROR").
        // Per the original author's note, every user-defined chain
        // contributes one such rule, so this count stands for the number of
        // user-defined chains; it is accumulated in newinfo->stacksize.
        if (strcmp(ipt_get_target(iter)->u.user.name,
            XT_ERROR_TARGET) == 0)
            ++newinfo->stacksize;
    }

    ret = -EINVAL;
    if (i != repl->num_entries) {
        duprintf("translate_table: %u not %u entries\n",
             i, repl->num_entries);
        goto out_free;
    }

    /* Check hooks all assigned */
    for (i = 0; i < NF_INET_NUMHOOKS; i++) {
        /* Only hooks which are valid */
        if (!(repl->valid_hooks & (1 << i)))
            continue;
        if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
            duprintf("Invalid hook entry %u %u\n",
                 i, repl->hook_entry[i]);
            goto out_free;
        }
        if (newinfo->underflow[i] == 0xFFFFFFFF) {
            duprintf("Invalid underflow %u %u\n",
                 i, repl->underflow[i]);
            goto out_free;
        }
    }

    if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
        ret = -ELOOP;
        goto out_free;
    }
    /* offsets is not needed past this point; later error paths must not free it again. */
    kvfree(offsets);

    // Walk every entry (an entry holds any number of matches and exactly one
    // target). Per the original author's notes on find_check_entry():
    //  - each match is looked up in xt[af].match by m->u.user.name and
    //    m->u.user.revision; a failed lookup means the user supplied an
    //    unknown match;
    //  - on success the match's own check function is run and the match is
    //    stored in m->u.kernel.match;
    //  - the target is then looked up in xt[af].target by t->u.user.name and
    //    t->u.user.revision and stored in t->u.kernel.target, so packet
    //    matching can later use it directly.
    /* Finally, each sanity check must pass */
    i = 0;
    xt_entry_foreach(iter, entry0, newinfo->size) {
        ret = find_check_entry(iter, net, repl->name, repl->size);
        if (ret != 0)
            break;
        ++i;
    }

    if (ret != 0) {
        /* Undo only the first i entries, i.e. those that passed find_check_entry(). */
        xt_entry_foreach(iter, entry0, newinfo->size) {
            if (i-- == 0)
                break;
            cleanup_entry(iter, net);
        }
        return ret;
    }

    /* And one copy for every other CPU */
    // Replicate the validated entries into the buffer of every other CPU.
    for_each_possible_cpu(i) {
        if (newinfo->entries[i] && newinfo->entries[i] != entry0)
            memcpy(newinfo->entries[i], entry0, newinfo->size);
    }

    return ret;
 out_free:
    kvfree(offsets);
    return ret;
}

1.2.3 xt_register_table

/*
 * Register a table in @net.
 *
 * @input_table is duplicated, so the caller's template is never linked into
 * a list. @bootstrap is installed as a placeholder private pointer and is
 * then swapped for @newinfo through xt_replace_table().
 *
 * Returns the new table, or ERR_PTR(-ENOMEM) if the copy cannot be
 * allocated, ERR_PTR(-EEXIST) if a table with the same name is already
 * registered for this family, or ERR_PTR() of the error reported by
 * xt_replace_table().
 */
struct xt_table *xt_register_table(struct net *net,
                   const struct xt_table *input_table,
                   struct xt_table_info *bootstrap,
                   struct xt_table_info *newinfo)
{
    int ret;
    struct xt_table_info *private;
    struct xt_table *t, *table;

    /* Don't add one object to multiple lists. */
    table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
    if (!table) {
        ret = -ENOMEM;
        goto out;
    }

    /* The per-family mutex covers both the duplicate-name scan and the list insertion. */
    mutex_lock(&xt[table->af].mutex);
    /* Don't autoload: we'd eat our tail... */
    list_for_each_entry(t, &net->xt.tables[table->af], list) {
        if (strcmp(t->name, table->name) == 0) {
            ret = -EEXIST;
            goto unlock;
        }
    }

    /* Simplifies replace_table code. */
    table->private = bootstrap;

    /* On failure xt_replace_table() reports the error through ret. */
    if (!xt_replace_table(table, 0, newinfo, &ret))
        goto unlock;

    private = table->private;
    pr_debug("table->private->number = %u\n", private->number);

    /* save number of initial entries */
    private->initial_entries = private->number;

    list_add(&table->list, &net->xt.tables[table->af]);
    mutex_unlock(&xt[table->af].mutex);
    return table;

unlock:
    mutex_unlock(&xt[table->af].mutex);
    kfree(table);
out:
    return ERR_PTR(ret);
}

最终会把 xt_table 存储到net.ipv4.iptable_filter 和 net.xt.tables[IPV4]的链表中

image.png

image.png

xt_table_info->entries[cpuid]是所有规则的首地址。entries数组大小为cpu个数,每个entry都存放相同的内容,查找规则时只需要查找本cpu的entry即可,避免和其他cpu冲突。
在每个hook点,只需要查询此hook对应的规则即可,如何做到呢?
答案是通过hook_entry和underflow,hook_entry存放的是每个hook相对于首地址的偏移量,而underflow存放的是用户设置的最后一个规则的末尾。对于filter表来说,不在hook点prerouting和postrouting生效,所以offset设置为无效值ffffffff。

  2. 注册hook函数

/*
 * Build one nf_hook_ops per bit set in table->valid_hooks, all pointing at
 * @fn, and register them in one go.
 *
 * Returns the allocated ops array on success, ERR_PTR(-ENOMEM) if it cannot
 * be allocated, or ERR_PTR() of the registration error (the array is freed
 * in that case).
 */
struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
{
    unsigned int remaining = table->valid_hooks;
    uint8_t slot, count = hweight32(remaining);
    uint8_t hook;
    struct nf_hook_ops *ops;
    int err;

    ops = kmalloc(sizeof(*ops) * count, GFP_KERNEL);
    if (ops == NULL)
        return ERR_PTR(-ENOMEM);

    /* Walk the mask bit by bit; `hook` is the bit index, `slot` the next free op. */
    slot = 0;
    hook = 0;
    while (slot < count && remaining != 0) {
        if (remaining & 1) {
            ops[slot].hook     = fn;
            ops[slot].owner    = table->me;
            ops[slot].pf       = table->af;
            ops[slot].hooknum  = hook;
            ops[slot].priority = table->priority;
            ++slot;
        }
        remaining >>= 1;
        ++hook;
    }

    err = nf_register_hooks(ops, count);
    if (err >= 0)
        return ops;

    kfree(ops);
    return ERR_PTR(err);
}

/*
 * Register the @n hooks in the array @reg, in order.
 *
 * Stops at the first failure, unregisters the hooks registered so far and
 * returns that error; returns 0 when every hook was registered.
 */
int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
{
    unsigned int done;
    int rc = 0;

    for (done = 0; done < n; done++) {
        rc = nf_register_hook(&reg[done]);
        if (!rc)
            continue;

        /* Roll back only the hooks that were actually registered. */
        if (done > 0)
            nf_unregister_hooks(reg, done);
        return rc;
    }

    return rc;
}

/*
 * Register one hook on the list nf_hooks[reg->pf][reg->hooknum].
 * The list is modified under nf_hook_mutex. Always returns 0.
 */
int nf_register_hook(struct nf_hook_ops *reg)
{
    struct nf_hook_ops *elem;

    mutex_lock(&nf_hook_mutex);
    /* Stop at the first registered hook whose priority value is larger than ours. */
    list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
        if (reg->priority < elem->priority)
            break;
    }
    // Insert right before that element (i.e. after elem->list.prev), which
    // keeps the list ordered by ascending priority value.
    list_add_rcu(&reg->list, elem->list.prev);
    mutex_unlock(&nf_hook_mutex);
#ifdef HAVE_JUMP_LABEL
    static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
    return 0;
}

将nf_hook_ops注册到nf_hooks[reg->pf][reg->hooknum]上,filter生效的三个hook点使用同一个hook函数iptable_filter_hook。

image.png

报文匹配

数据包匹配时,根据pf和hook点找到对应的nf_hooks链表头,循环执行此链表上注册的hook函数,对于filter来说hook函数为iptable_filter_hook。
在hook函数iptable_filter_hook中,调用ipt_do_table遍历filter规则,其参数net->ipv4.iptable_filter为filter的xt_table,存储了filter表相关的规则。

/*
 * Netfilter hook function shared by all hook points of the filter table.
 *
 * Locally generated packets too short to hold an IP header are accepted
 * without consulting the rules; everything else is run through the filter
 * table of the namespace owning the input (or, failing that, output) device.
 */
static unsigned int
iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
            const struct net_device *in, const struct net_device *out,
            int (*okfn)(struct sk_buff *))
{
    const struct net_device *dev;
    const struct net *net;

    if (ops->hooknum == NF_INET_LOCAL_OUT) {
        /* root is playing with raw sockets. */
        if (skb->len < sizeof(struct iphdr) ||
            ip_hdrlen(skb) < sizeof(struct iphdr))
            return NF_ACCEPT;
    }

    dev = (in != NULL) ? in : out;
    net = dev_net(dev);

    return ipt_do_table(skb, ops->hooknum, in, out,
                net->ipv4.iptable_filter);
}

遍历filter规则进行匹配,重点地方都做了详细的注释。

/* Returns one of the generic firewall policies, like NF_ACCEPT. */
/*
 * Run @skb through the rules of @table attached to @hook.
 *
 * Traversal starts at private->hook_entry[hook] in this CPU's copy of the
 * rules and continues until a rule yields a final verdict or a match sets
 * acpar.hotdrop (which forces NF_DROP). Jumps into other chains made
 * without IPT_F_GOTO push the calling rule on a per-CPU jump stack so that
 * RETURN can resume after it.
 */
unsigned int
ipt_do_table(struct sk_buff *skb,
         unsigned int hook,
         const struct net_device *in,
         const struct net_device *out,
         struct xt_table *table)
{
    static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
    const struct iphdr *ip;
    /* Initializing verdict to NF_DROP keeps gcc happy. */
    unsigned int verdict = NF_DROP;
    const char *indev, *outdev;
    const void *table_base;
    struct ipt_entry *e, **jumpstack;
    unsigned int *stackptr, origptr, cpu;
    const struct xt_table_info *private;
    struct xt_action_param acpar;
    unsigned int addend;

    /* Initialization */
    ip = ip_hdr(skb);
    indev = in ? in->name : nulldevname;
    outdev = out ? out->name : nulldevname;
    /* We handle fragments by dealing with the first fragment as
     * if it was a normal packet.  All other fragments are treated
     * normally, except that they will NEVER match rules that ask
     * things we don't know, ie. tcp syn flag or ports).  If the
     * rule is also a fragment-specific rule, non-fragments won't
     * match it. */
    acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
    acpar.thoff   = ip_hdrlen(skb);
    acpar.hotdrop = false;
    acpar.in      = in;
    acpar.out     = out;
    acpar.family  = NFPROTO_IPV4;
    acpar.hooknum = hook;

    IP_NF_ASSERT(table->valid_hooks & (1 << hook));
    local_bh_disable();
    addend = xt_write_recseq_begin();
    private = table->private;
    cpu      = smp_processor_id();
    /*
     * Ensure we load private-> members after we've fetched the base
     * pointer.
     */
    smp_read_barrier_depends();
    // Start address of this CPU's copy of the table's rules.
    table_base = private->entries[cpu];
    jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
    stackptr   = per_cpu_ptr(private->stackptr, cpu);
    origptr    = *stackptr;
    // First entry of the chain attached to this hook.
    e = get_entry(table_base, private->hook_entry[hook]);

    do {
        const struct xt_entry_target *t;
        const struct xt_entry_match *ematch;

        // Standard match: source/destination IP, in/out interface,
        // layer-4 protocol, and so on.
        if (!ip_packet_match(ip, indev, outdev,
            &e->ip, acpar.fragoff)) {
 no_match:
            // Not matched: move on to the next entry and keep going.
            e = ipt_next_entry(e);
            continue;
        }

        // The standard match succeeded; now run every extension match of
        // this entry. Example: in
        //   iptables -A INPUT -p tcp --dport 22 -j testssh
        // the "--dport 22" part is an extension match.
        xt_ematch_foreach(ematch, e) {
            // For tcp, kernel.match is tcp_mt (per the original note).
            acpar.match     = ematch->u.kernel.match; 
            // The extension's parameters live in data; tcp_mt interprets
            // them as struct xt_tcp *tcpinfo (per the original note).
            acpar.matchinfo = ematch->data;
            // An extension returning false means no match: jump to
            // no_match and continue with the next entry.
            if (!acpar.match->match(skb, &acpar))
                goto no_match;
        }
        // Both the standard and the extension matches succeeded: update the
        // rule's packet/byte counters,
        ADD_COUNTER(e->counters, skb->len, 1);
        // and then apply the rule's target. Per the original note,
        // ipt_get_target() returns e + e->target_offset; an entry has
        // exactly one target, so no iteration is needed as for matches.
        t = ipt_get_target(e);
        IP_NF_ASSERT(t->u.kernel.target);

        /* Standard target? */
        // A NULL ->target callback means this is a standard target.
        if (!t->u.kernel.target->target) {
            int v;
            // For a standard target, a negative verdict is either a
            // built-in action (accept, drop, ...) or a RETURN from a
            // user-defined chain.
            v = ((struct xt_standard_target *)t)->verdict;
            if (v < 0) {
                /* Pop from stack? */
                // Not RETURN, hence a built-in action: leave the loop and
                // stop matching. The first rule that yields a final
                // verdict ends the traversal; later rules are not tried.
                if (v != XT_RETURN) {
                    verdict = (unsigned int)(-v) - 1;
                    break;
                }
                // Condition true: nothing was pushed when the chain was
                // entered. Per the original note this means the chain
                // was reached with -g (goto) from a built-in chain such
                // as INPUT, so on RETURN go straight to that built-in
                // chain's final default rule.
                if (*stackptr <= origptr) {
                    e = get_entry(table_base,
                        private->underflow[hook]);
                    pr_debug("Underflow (this is normal) "
                         "to %p\n", e);
                } else {
                    // Otherwise the chain was entered with -j: pop the
                    // saved calling rule and resume with the rule that
                    // follows it.
                    e = jumpstack[--*stackptr];
                    pr_debug("Pulled %p out from pos %u\n",
                         e, *stackptr);
                    e = ipt_next_entry(e);
                }
                continue;
            }

            // v is non-negative here: it is the offset of the chain to
            // jump to.
            // Without IPT_F_GOTO this is a -j jump, so the current rule is
            // saved in order to continue after it once the chain returns.
            // With IPT_F_GOTO this is a -g jump; nothing is saved, so a
            // RETURN does not come back to this rule.
            // The target must also differ from ipt_next_entry(e): when the
            // offset is simply the next entry, the rule was added without
            // -j or -g and there is nothing to push.
            if (table_base + v != ipt_next_entry(e) &&
                !(e->ip.flags & IPT_F_GOTO)) {
                // Jump stack is full (too many nested jumps): drop.
                if (*stackptr >= private->stacksize) {
                    verdict = NF_DROP;
                    break;
                }
                jumpstack[(*stackptr)++] = e;
                pr_debug("Pushed %p into pos %u\n",
                     e, *stackptr - 1);
            }
            // Continue at the first entry of the target chain.
            e = get_entry(table_base, v);
            continue;
        }

        acpar.target   = t->u.kernel.target;
        acpar.targinfo = t->data;
        // Run the extension target.
        verdict = t->u.kernel.target->target(skb, &acpar);
        /* Target might have changed stuff. */
        ip = ip_hdr(skb);
        if (verdict == XT_CONTINUE)
            e = ipt_next_entry(e);
        else
            /* Verdict */
            break;
    } while (!acpar.hotdrop);
    pr_debug("Exiting %s; resetting sp from %u to %u\n",
         __func__, *stackptr, origptr);
    *stackptr = origptr;
    xt_write_recseq_end(addend);
    local_bh_enable();

#ifdef DEBUG_ALLOW_ALL
    return NF_ACCEPT;
#else
    if (acpar.hotdrop)
        return NF_DROP;
    else return verdict;
#endif
}

标准target,包含如下内建target和跳转到自定义链的target,
t->u.kernel.target->target为空。

/* Built-in verdicts. */
#define NF_DROP 0
#define NF_ACCEPT 1
#define NF_STOLEN 2
#define NF_QUEUE 3
#define NF_REPEAT 4
/* RETURN is encoded just below the largest verdict: -NF_MAX_VERDICT - 1 = -4 - 1 = -5. */
#define RETURN     IPT_RETURN
#define IPT_RETURN     (-NF_MAX_VERDICT - 1)
#define NF_MAX_VERDICT NF_REPEAT 

//标准target,verdict为负值
iptables -A INPUT -i eth0 -p udp --dport 137:138 -j ACCEPT
//跳转到自定义链的target,verdict为正值,因为verdict保存的是到自定义链的偏移量。
iptables -A INPUT -p tcp --dport 22 -j testssh

扩展target,t->u.kernel.target->target不为空。

iptables -t nat -A POSTROUTING -s 192.168.10.10 -o eth1 -j SNAT --to-source 111.196.221.212

iptables命令行操作

自定义链
内建链上默认都有一个rule,而添加自定义链时,会默认添加两个rule,一个用于return回调用链,另一个用于错误处理。
entry172和entry173是添加新链test生成的rule,entry171是在新链上添加规则生成的,其target为DROP。
假如数据包匹配到了entry171则丢弃,否则匹配默认的entry172,其target为return,返回上一个链继续执行。

//创建新链test
#iptables -N test
//在新链test上添加一条规则
#iptables -A test -p tcp -j DROP
//查看当前iptable规则
#iptables -vvv -nL
Entry 171 (66304):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 6
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_DROP

Entry 172 (66456):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `' [40]
verdict=RETURN

Entry 173 (66608):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `ERROR' [64]
error=`ERROR'

//先清空自定义链test上的规则,否则-X删除不了链
#iptables -F test
//删除自定义链
#iptables -X test

不指定-j或-g
如果用户设置rule时,没有指定 -j或者-g,则verdict为指定下一个rule的偏移量。
entry175和entry176是添加新链test默认生成的rule,
entry174是在新链上添加规则生成的,没有指定target,默认为空,则verdict=66936即下一个entry175。
假如数据包匹配到了entry174,只是增加计数,然后通过verdict指定的偏移量跳转到规则175,继续执行默认的entry175,其target为return,返回上一个链继续执行。

#iptables -N test1
//添加规则,但是没有指定target
#iptables -A test1 -p tcp
#iptables -vvv -nL
Entry 174 (66784):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 6
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `' [40]
verdict=66936

Entry 175 (66936):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `' [40]
verdict=RETURN

Entry 176 (67088):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `ERROR' [64]
error=`ERROR'

-j和-g跳转的区别
添加rule时,-j和-g的区别是: -g会设置标志位IPT_F_GOTO。
如果没有此标志IPT_F_GOTO,则跳转到自定义链时会保存当前rule,当从自定义链返回时,如果verdict为return,则会获取保存的rule,获取调用链下一个rule继续匹配(保存rule的下一个rule)。如果有此标志IPT_F_GOTO,跳转时不会保存rule,从自定义链返回时也就不会返回到调用链,此时还要分为两种情况:
a. 在内建chain上添加rule时使用 -g

#iptables -N test
#iptables -A INPUT -g test
走if分支,获取此内建chain的默认rule,执行默认的action。

b. 在自定义链上添加rule时使用 -g

#iptables -N test
#iptables -N test1
#iptables -A INPUT -j test
#iptables -A test -g test1
走else分支,在跳转到test时,会保存chain INPUT的rule,再跳转
到test1时,不会保存任何地址,当从test1返回时,取出保存的rule,
获取INPUT chain上下一个rule继续匹配。

对应的代码如下

                // Condition true: nothing was pushed when the chain was
                // entered. Per the original note this means the chain
                // was reached with -g (goto) from a built-in chain such
                // as INPUT, so on RETURN go straight to that built-in
                // chain's final default rule.
                if (*stackptr <= origptr) {
                    // underflow[hook] holds the offset of the hook's
                    // default rule, i.e. the end of the user-added rules.
                    e = get_entry(table_base,
                        private->underflow[hook]);
                    pr_debug("Underflow (this is normal) "
                         "to %p\n", e);
                } else {
                    // Otherwise the chain was entered with -j: pop the
                    // saved calling rule and resume with the rule that
                    // follows it.
                    e = jumpstack[--*stackptr];
                    pr_debug("Pulled %p out from pos %u\n",
                         e, *stackptr);
                    e = ipt_next_entry(e);
                }

下面根据-g和-j的区别做如下几个实验

a. 从内建链INPUT通过-j跳转到自定义链,返回后,仍然可以执行INPUT链上后面的rule
root@ubuntu:~# iptables -N test
root@ubuntu:~# iptables -A INPUT -p tcp -j test -->跳转到test链
root@ubuntu:~# iptables -A INPUT -p tcp         -->从test链返回后,仍然可以执行此rule

root@ubuntu:~# iptables -L -vn
Chain INPUT (policy ACCEPT 62 packets, 4096 bytes)
 pkts bytes target     prot opt in     out     source               destination
   66  4340 test       tcp  --  *      *       0.0.0.0/0            0.0.0.0/0
   62  4096            tcp  --  *      *       0.0.0.0/0            0.0.0.0/0

Chain FORWARD (policy ACCEPT 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 40 packets, 8328 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain test (1 references)
 pkts bytes target     prot opt in     out     source               destination

b. 从内建链INPUT通过-g跳转到自定义链,返回后,不会执行INPUT链上后面的rule,而是直接执行INPUT链上的默认rule policy ACCEPT
root@ubuntu:~# iptables -N test
root@ubuntu:~# iptables -A INPUT -p tcp -g test -->跳转到test链
root@ubuntu:~# iptables -A INPUT -p tcp         -->从test链返回后,不会执行此rule,直接执行INPUT的默认rule policy ACCEPT

root@ubuntu:~# iptables -L -vn
Chain INPUT (policy ACCEPT 31 packets, 2064 bytes)
 pkts bytes target     prot opt in     out     source               destination
   35  2308 test       tcp  --  *      *       0.0.0.0/0            0.0.0.0/0           [goto]
    0     0            tcp  --  *      *       0.0.0.0/0            0.0.0.0/0   -->计数为0,说明没有执行 iptables -A INPUT -p tcp

Chain FORWARD (policy ACCEPT 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 20 packets, 4232 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain test (1 references)
 pkts bytes target     prot opt in     out     source               destination

c. 从内建链INPUT通过-j跳转到自定义链test,再通过-j跳转到自定义链test1,返回后,仍然可以执行test和INPUT链上后面的rule。
root@ubuntu:~# iptables -N test
root@ubuntu:~# iptables -N test1
root@ubuntu:~# iptables -A INPUT -p tcp -j test -->跳转到test链
root@ubuntu:~# iptables -A INPUT -p tcp         -->从test1链返回后,会执行此rule
root@ubuntu:~# iptables -A test -p tcp -j test1 -->通过-j跳转到test1链
root@ubuntu:~# iptables -A test -p tcp          -->从test1链返回后,会执行此rule

root@ubuntu:~# iptables -L -vn
Chain INPUT (policy ACCEPT 31 packets, 2116 bytes)
 pkts bytes target     prot opt in     out     source               destination
   39  2568 test       tcp  --  *      *       0.0.0.0/0            0.0.0.0/0
   37  2464            tcp  --  *      *       0.0.0.0/0            0.0.0.0/0

Chain FORWARD (policy ACCEPT 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 19 packets, 4432 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain test (1 references)
 pkts bytes target     prot opt in     out     source               destination
   35  2360 test1      tcp  --  *      *       0.0.0.0/0            0.0.0.0/0
   31  2116            tcp  --  *      *       0.0.0.0/0            0.0.0.0/0

Chain test1 (1 references)
 pkts bytes target     prot opt in     out     source               destination

d. 从内建链INPUT通过-j跳转到自定义链test,再通过-g跳转到自定义链test1,返回后,不会执行test链上后面的rule,而是执行INPUT链上后面的rule。
root@ubuntu:~# iptables -N test
root@ubuntu:~# iptables -N test1
root@ubuntu:~# iptables -A INPUT -p tcp -j test -->跳转到test链
root@ubuntu:~# iptables -A INPUT -p tcp         -->从test1链返回后,会执行此rule
root@ubuntu:~# iptables -A test -p tcp -g test1 -->通过-g跳转到test1链
root@ubuntu:~# iptables -A test -p tcp          -->从test1链返回后,不会执行此rule

root@ubuntu:~# iptables -L -vn
Chain INPUT (policy ACCEPT 156 packets, 9692 bytes)
 pkts bytes target     prot opt in     out     source               destination
  164 10144 test       tcp  --  *      *       0.0.0.0/0            0.0.0.0/0
  162 10040            tcp  --  *      *       0.0.0.0/0            0.0.0.0/0

Chain FORWARD (policy ACCEPT 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 118 packets, 26867 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain test (1 references)
 pkts bytes target     prot opt in     out     source               destination
  160  9936 test1      tcp  --  *      *       0.0.0.0/0            0.0.0.0/0           [goto]
    0     0            tcp  --  *      *       0.0.0.0/0            0.0.0.0/0   -->计数为0,说明没有执行 iptables -A test -p tcp

Chain test1 (1 references)
 pkts bytes target     prot opt in     out     source               destination

选项-vvv的作用
iptables命令的选项-vvv的作用是显示每个rule的详细信息。
从下面代码可知,verbose初始值为0,只有当verbose大于1时才会调用dump_entries显示详细信息,而verbose的值根据-v选项而来,如果只有一个-v,verbose就是1,两个-v,verbose就是2,以此类推。
所以至少指定两个-v才可显示详细信息。

iptables-1.8.1/iptables/iptables.c:
int do_command4(int argc, char *argv[], char **table,
    int verbose = 0;
        struct xtc_handle **handle, bool restore)
        case 'v':
            if (!verbose)
                set_option(&cs.options, OPT_VERBOSE,
                       &cs.fw.ip.invflags, cs.invert);
            verbose++;
            break;
    if (verbose > 1)
        dump_entries(*handle);

hooks和Underflows的作用
对于filter表来说,默认有三个chain: input, forward和output。
每个chain有一个默认的rule(大小为152字节),这个rule不能被删除,永远是此chain上最后一个rule,如果前面的rule没匹配上,肯定会匹配到这个默认的rule,并执行它的target(内建action:drop,accept等)。除了这三个默认的rule,最后还有一个处理error的rule。

另外从下面命令执行结果还能得到关于hooks和Underflows的重要信息,他们分别用于标识此chain中用户设置的rule的头和尾,即第一个用户设置rule的头和最后一个用户设置rule的尾,这里的最后一个rule指的是用户设置的rule,不是最后面的默认rule。underflow减去hook等于用户设置的所有rule的大小。
而且只显示内建chain的偏移,不显示用户自定义chain的偏移。
这三个chain和其默认rule是加载filter模块自动生成的,此时还没有用户设置rule,所以hook和underflow的值是相同的。

Hooks: pre/in/fwd/out/post = ffffffff/0/98/130/ffffffff
Underflows: pre/in/fwd/out/post = ffffffff/0/98/130/ffffffff

在此输出中,分别显示了pre/in/fwd/out/post这五个chain的偏移(98/130为十六进制),其中ffffffff为无效值,因为filter表只应用在in/fwd/out这三个chain上。
in chain的起始偏移为0,因为in上没有用户设置的rule,所以结束偏移也为0,最后默认rule大小为152, in chain所有rule大小为152;
forward chain起始偏移为152,也没有用户设置的rule,所以结束偏移也为152,最后默认rule大小为152,forward chain所有rule大小为152;
out chain起始偏移为304,也没有用户设置的rule,所以结束偏移也为304,最后默认rule大小为152.

root@master:~# iptables -L -vvv
Chain INPUT (policy ACCEPT 162 packets, 16830 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain FORWARD (policy DROP 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 158 packets, 21254 bytes)
 pkts bytes target     prot opt in     out     source               destination
libiptc vlibxtables.so.12. 632 bytes.
Table `filter'
//此处的98/130等数字是十六进制的,换算成十进制为152/304
Hooks: pre/in/fwd/out/post = ffffffff/0/98/130/ffffffff
Underflows: pre/in/fwd/out/post = ffffffff/0/98/130/ffffffff
//input chain
Entry 0 (0):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 162 packets, 16830 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_ACCEPT

//forward chain
Entry 1 (152):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_DROP

//output chain
Entry 2 (304):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 158 packets, 21254 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_ACCEPT

Entry 3 (456):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `ERROR' [64]
error=`ERROR'

从iptables源码中可知
一个chain包含N个用户设置的rule和一个默认的rule,
c->head_offset是这个chain里的第一个rule的偏移,
c->foot_offset是这个chain里最后一个用户设置的rule的结束位置(也就是默认rule的起始偏移),
最后一个默认rule大小为size = sizeof(STRUCT_ENTRY)+ ALIGN(sizeof(STRUCT_STANDARD_TARGET)) = 152
整个chain的大小为c->foot_offset-c->head_offset+size

iptables-1.8.1/libiptc/libiptc.c:
/*
 * Put the pieces back together again: preparation pass of table compilation.
 *
 * Visits every chain in h->chains and lets iptcc_compile_chain_offsets()
 * assign offsets and indexes, accumulating the running byte offset and the
 * running entry count. One extra entry carrying an error target is then
 * accounted for after the last chain.
 *
 * On success the total ruleset size in bytes is stored in *size and the
 * total number of entries is returned. If iptcc_compile_chain_offsets()
 * returns a negative value, that value is returned immediately and *size is
 * left untouched.
 */
static int iptcc_compile_table_prep(struct xtc_handle *h, unsigned int *size)
{
    unsigned int total_bytes = 0;
    unsigned int entry_count = 0;
    struct chain_head *chain;

    /* First pass: calculate offset for every rule, chain by chain. */
    list_for_each_entry(chain, &h->chains, list) {
        int rc = iptcc_compile_chain_offsets(h, chain,
                             &total_bytes, &entry_count);

        if (rc < 0)
            return rc;
    }

    /* Account for the single error entry that terminates the whole table. */
    total_bytes += sizeof(STRUCT_ENTRY)
               + ALIGN(sizeof(struct xt_error_target));
    entry_count += 1;

    /* The accumulated offset is now the size of the complete ruleset. */
    *size = total_bytes;
    return entry_count;
}

/*
 * Calculate offset and number for every rule of one chain in the cache.
 *
 * Layout accounted for, in order:
 *   - for chains that are not built-in: one header entry carrying an error
 *     target;
 *   - every rule on c->rules, each contributing its own r->size;
 *   - one closing entry carrying a standard target (the chain's foot).
 *
 * c->head_offset is the offset at which the chain starts.
 * c->foot_offset is the offset reached after the last rule on c->rules,
 * i.e. where the closing entry begins. The bytes occupied by the chain are
 * therefore (c->foot_offset - c->head_offset) plus the size of the closing
 * entry, sizeof(STRUCT_ENTRY) + ALIGN(sizeof(STRUCT_STANDARD_TARGET)).
 *
 * *offset and *num are running totals shared across chains; both are
 * advanced past this chain. h is not used here. Always returns 1.
 */
static int iptcc_compile_chain_offsets(struct xtc_handle *h, struct chain_head *c,
                       unsigned int *offset, unsigned int *num)
{
    struct rule_head *rule;

    /* The chain begins wherever the previous one stopped. */
    c->head_offset = *offset;
    DEBUGP("%s: chain_head %u, offset=%u\n", c->name, *num, *offset);

    /* Only chains that are not built-in carry a header entry. */
    if (!iptcc_is_builtin(c))  {
        *offset += sizeof(STRUCT_ENTRY)
                 + ALIGN(sizeof(struct xt_error_target));
        *num += 1;
    }

    /* Stamp each rule with its position, then step past it. */
    list_for_each_entry(rule, &c->rules, list) {
        DEBUGP("rule %u, offset=%u, index=%u\n", *num, *offset, *num);
        rule->offset = *offset;
        rule->index = *num;
        *offset += rule->size;
        *num += 1;
    }

    DEBUGP("%s; chain_foot %u, offset=%u, index=%u\n", c->name, *num,
        *offset, *num);

    /* Record where the closing entry sits, then account for its size. */
    c->foot_offset = *offset;
    c->foot_index = *num;
    *offset += sizeof(STRUCT_ENTRY)
           + ALIGN(sizeof(STRUCT_STANDARD_TARGET));
    *num += 1;

    return 1;
}

下面添加一个用户自定义chain test,并在input chain设置一个rule,跳转到test chain。观察下偏移量的变化。
Hooks: pre/in/fwd/out/post = ffffffff/0/130/1c8/ffffffff
Underflows: pre/in/fwd/out/post = ffffffff/98/130/1c8/ffffffff
input chain: 起始偏移hook为0,因为添加了一个rule,大小为152,所以结束偏移underflow为152(十六进制98),还有一个默认rule,大小为152,所以input chain大小为304(十六进制为130)。
forward chain:起始偏移hook为304(十六进制为130),没有用户设置的rule,所以结束偏移underflow仍然为304,默认rule大小为152,所以forward chain大小为152。
output chain:起始偏移hook为456(十六进制为1c8),没有用户设置的rule,所以结束偏移underflow仍然为456,默认rule大小为152,所以output chain大小为152。

#iptables -N test
#iptables -A INPUT -j test
root@master:~# iptables -L -vvv
Chain INPUT (policy ACCEPT 448 packets, 57326 bytes)
 pkts bytes target     prot opt in     out     source               destination
  448 57326 test       all  --  any    any     anywhere             anywhere

Chain FORWARD (policy DROP 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 446 packets, 61170 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain test (1 references)
 pkts bytes target     prot opt in     out     source               destination
libiptc vlibxtables.so.12. 1112 bytes.
Table `filter'
Hooks: pre/in/fwd/out/post = ffffffff/0/130/1c8/ffffffff
Underflows: pre/in/fwd/out/post = ffffffff/98/130/1c8/ffffffff
//input chain
//用户设置的rule
Entry 0 (0):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 448 packets, 57326 bytes
Cache: 00000000
Target name: `' [40]
verdict=784  //784为自定义chain test中第一个rule的偏移,对应下面的Entry 5 (784)
//默认rule 
Entry 1 (152):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 448 packets, 57326 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_ACCEPT

//forward chain
Entry 2 (304):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_DROP

//output chain
Entry 3 (456):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 446 packets, 61170 bytes
Cache: 00000000
Target name: `' [40]
verdict=NF_ACCEPT

Entry 4 (608):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `ERROR' [64]
error=`test'

//自定义chain
Entry 5 (784):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 448 packets, 57326 bytes
Cache: 00000000
Target name: `' [40]
verdict=RETURN

Entry 6 (936):
SRC IP: 0.0.0.0/0.0.0.0
DST IP: 0.0.0.0/0.0.0.0
Interface: `'/................to `'/................
Protocol: 0
Flags: 00
Invflags: 00
Counters: 0 packets, 0 bytes
Cache: 00000000
Target name: `ERROR' [64]
error=`ERROR'

也可参考:netfilter之filter - 简书 (jianshu.com) 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值