This article walks through the kernel networking source code related to the Netfilter subsystem. The source is based on kernel 4.14.
The Netfilter subsystem covers packet selection, filtering, and mangling, connection tracking, and Network Address Translation (NAT).
Netfilter hook points
In the previous article, "Linux kernel source walkthrough: IPv4 and IPv6", we already encountered these hook points in the IPv4 and IPv6 receive and transmit paths.
- NF_INET_PRE_ROUTING: In IPv4, this hook point sits in ip_rcv(). It is the first hook point every inbound packet hits, before any routing decision is made.
- NF_INET_LOCAL_IN: In IPv4, this hook point sits in ip_local_deliver(). Every inbound packet destined for the local host reaches it after passing NF_INET_PRE_ROUTING and the routing subsystem.
- NF_INET_FORWARD: In IPv4, this hook point sits in ip_forward(). Every packet to be forwarded reaches it after passing NF_INET_PRE_ROUTING and the routing subsystem.
- NF_INET_POST_ROUTING: In IPv4, this hook point sits in ip_output(). All forwarded packets reach it after passing NF_INET_FORWARD; in addition, packets generated by the local host reach it after passing NF_INET_LOCAL_OUT.
- NF_INET_LOCAL_OUT: In IPv4, this hook point sits in __ip_local_out(). All outbound packets generated by the local host pass the routing lookup and then this hook point before reaching NF_INET_POST_ROUTING.
In the kernel networking code, the callbacks registered at a hook point are generally invoked through the NF_HOOK macro:

```c
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
        struct sk_buff *skb, struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
    int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
    if (ret == 1)
        ret = okfn(net, sk, skb);
    return ret;
}

/* nf_hook does not call the okfn callback itself; the NF_HOOK macro calls
 * okfn when nf_hook returns 1 (meaning the packet is allowed through). */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
    switch (pf)
    case NFPROTO_IPV4:
        hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
    struct nf_hook_state state;
    nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn);
    ret = nf_hook_slow(skb, &state, hook_head, 0);
    return ret;

int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
                 const struct nf_hook_entries *e, unsigned int s)
    for (; s < e->num_hook_entries; s++)
        /* Run the registered hook functions in order; NF_ACCEPT means
         * the caller may go on to call okfn. */
        verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
            return entry->hook(entry->priv, skb, state);
```
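As a concrete example of how a hook point is entered, here is (abridged) how ip_rcv() hands every inbound IPv4 packet to NF_INET_PRE_ROUTING in 4.14. ip_rcv_finish() is the okfn: it runs, and the packet proceeds to the routing decision, only if no registered callback vetoes the packet:

```c
// net/ipv4/ip_input.c (abridged)
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev)
    ...
    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
                   net, NULL, skb, dev, NULL,
                   ip_rcv_finish);
```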
A Netfilter hook callback must return one of the following five values, known as netfilter verdicts (a minimal callback sketch follows the list):
- NF_DROP: discard the packet silently
- NF_ACCEPT: the packet continues its traversal through the kernel network stack
- NF_STOLEN: the packet does not continue its traversal; it is taken over by the hook method
- NF_QUEUE: queue the packet for userspace processing
- NF_REPEAT: call the hook function again
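As a minimal sketch of a callback returning these verdicts (hypothetical module code, not from the kernel tree; it assumes a linear skb for brevity): drop inbound UDP packets destined to port 5001 and accept everything else.

```c
#include <linux/netfilter.h>
#include <linux/ip.h>
#include <linux/udp.h>

/* Hypothetical hook callback; its signature matches the nf_hookfn
 * typedef shown in the next section. */
static unsigned int drop_udp_5001(void *priv, struct sk_buff *skb,
                                  const struct nf_hook_state *state)
{
    struct iphdr *iph = ip_hdr(skb);

    if (iph->protocol == IPPROTO_UDP) {
        /* The UDP header sits right after the IP header. */
        struct udphdr *udph = (struct udphdr *)((u8 *)iph + iph->ihl * 4);

        if (udph->dest == htons(5001))
            return NF_DROP;     /* discarded silently */
    }
    return NF_ACCEPT;           /* continue through the stack */
}
```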
Registering Netfilter hook callbacks
There are two functions for registering Netfilter hook callbacks: nf_register_net_hook and nf_register_net_hooks. Kernels before 4.13 also offered two more registration interfaces, nf_register_hook and nf_register_hooks; these were removed in 4.13, and they ultimately called nf_register_net_hook anyway. Let's look at nf_register_net_hook:
```c
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
    __nf_register_net_hook(net, reg->pf, reg)
        struct nf_hook_entries *p, *new_hooks;
        struct nf_hook_entries __rcu **pp;
        pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev)
            /* Taking pf == NFPROTO_IPV4 as an example: the hook entries
             * are stored in the struct net object. */
            return net->nf.hooks_ipv4 + hooknum;
        p = nf_entry_dereference(*pp);
        /* Insert the new nf_hook_ops into the hook entries, ordered by priority. */
        new_hooks = nf_hook_entries_grow(p, reg);
```

As we can see, nf_register_net_hook takes a struct nf_hook_ops argument. Here is that structure:
```c
typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb,
                               const struct nf_hook_state *state);

struct nf_hook_ops {
    /* User fills in from here down. */
    nf_hookfn          *hook;      /* the hook callback to register */
    struct net_device  *dev;
    void               *priv;
    u_int8_t           pf;         /* protocol family: NFPROTO_IPV4 for IPv4, NFPROTO_IPV6 for IPv6 */
    bool               nat_hook;
    unsigned int       hooknum;    /* one of netfilter's five hook points */
    /* Hooks are ordered in ascending priority. */
    int                priority;   /* callbacks run in ascending priority order:
                                    * the smaller the value, the earlier the callback runs */
};
```
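Putting the two together, here is a sketch of how a module could register the hypothetical drop_udp_5001 callback from the verdicts section at NF_INET_PRE_ROUTING (illustrative only; everything besides the kernel APIs is made up):

```c
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

static const struct nf_hook_ops my_ops = {
    .hook     = drop_udp_5001,       /* callback sketched earlier */
    .pf       = NFPROTO_IPV4,
    .hooknum  = NF_INET_PRE_ROUTING,
    .priority = NF_IP_PRI_FIRST,     /* smallest value: run first */
};

static int __init my_module_init(void)
{
    /* Register in the initial network namespace. */
    return nf_register_net_hook(&init_net, &my_ops);
}

static void __exit my_module_exit(void)
{
    nf_unregister_net_hook(&init_net, &my_ops);
}

module_init(my_module_init);
module_exit(my_module_exit);
MODULE_LICENSE("GPL");
```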
Connection tracking
In modern networks, filtering traffic based only on the L3 and L4 headers is not enough; packets should also be handled based on the session they belong to. Connection tracking lets the kernel keep track of sessions, and its main goal is to serve as the basis for NAT.
Connection tracking initialization
First, look at the array of netfilter hook objects defined by the connection tracking module, i.e. a struct nf_hook_ops array that defines the handlers for the various netfilter hook points:
```c
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
    {
        .hook     = ipv4_conntrack_in,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook     = ipv4_conntrack_local,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook     = ipv4_helper,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook     = ipv4_confirm,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
    {
        .hook     = ipv4_helper,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook     = ipv4_confirm,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
};
```
The most important conntrack callbacks registered here are ipv4_conntrack_in at NF_INET_PRE_ROUTING and ipv4_conntrack_local at NF_INET_LOCAL_OUT. Both run at priority NF_IP_PRI_CONNTRACK (-200), a high priority (lower values run earlier). Both end up calling nf_conntrack_in(), which is walked through in a later subsection.
Now let's look at where ipv4_conntrack_ops gets registered. In kernel 4.9 and earlier, nf_register_hooks was called directly from nf_conntrack_l3proto_ipv4_init. From 4.10 on, ipv4_conntrack_ops is no longer registered directly in nf_conntrack_l3proto_ipv4_init; here is the relevant code:
```c
// nf_conntrack_l3proto_ipv4.c
// nf_conntrack_l3proto_ipv4_init is the init function of nf_conntrack_ipv4.ko
module_init(nf_conntrack_l3proto_ipv4_init);

static int __init nf_conntrack_l3proto_ipv4_init(void)
    ...
    ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
        /* registered into the global array nf_ct_l3protos */
        rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);

struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
    .l3proto         = PF_INET,
    .pkt_to_tuple    = ipv4_pkt_to_tuple,
    .invert_tuple    = ipv4_invert_tuple,
    .get_l4proto     = ipv4_get_l4proto,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .tuple_to_nlattr = ipv4_tuple_to_nlattr,
    .nlattr_to_tuple = ipv4_nlattr_to_tuple,
    .nla_policy      = ipv4_nla_policy,
    .nla_size        = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) +  /* CTA_IP_V4_SRC */
                       NLA_ALIGN(NLA_HDRLEN + sizeof(u32)),   /* CTA_IP_V4_DST */
#endif
    .net_ns_get      = ipv4_hooks_register,   /* this is what registers conntrack's netfilter hooks */
    .net_ns_put      = ipv4_hooks_unregister,
    .me              = THIS_MODULE,
};

// First, ipv4_hooks_register
static int ipv4_hooks_register(struct net *net)
    struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
    cnet->users++;
    if (cnet->users > 1)
        goto out_unlock;   /* only the first call falls through;
                            * later calls just bump the users count */
    /* register conntrack's netfilter hooks */
    nf_register_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));

// Now the caller of nf_conntrack_l3proto_ipv4.net_ns_get
int nf_ct_netns_get(struct net *net, u8 nfproto)
    if (nfproto == NFPROTO_INET)
        nf_ct_netns_do_get(net, NFPROTO_IPV4)
        nf_ct_netns_do_get(net, NFPROTO_IPV6)

static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
    const struct nf_conntrack_l3proto *l3proto;
    /* for NFPROTO_IPV4, this returns nf_conntrack_l3proto_ipv4 */
    l3proto = __nf_ct_l3proto_find(nfproto);
    l3proto->net_ns_get(net);   /* call net_ns_get */

// nf_ct_netns_get has many callers, mainly reached via nft_ct_get_init and nft_nat_init
```
The figure below shows where the IPv4 conntrack hook callbacks sit in the IPv4 receive/transmit flow. The green boxes are netfilter's five hook points; the blue boxes are the hook callbacks registered by the connection tracking module:

Connection tracking netfilter hook points
The structure used to identify a flow in one particular direction is struct nf_conntrack_tuple:
```c
struct nf_conntrack_tuple {
    struct nf_conntrack_man src;   /* the manipulable part of the tuple */
    /* The fixed part of the tuple follows. */
    struct {
        union nf_inet_addr u3;
        union {
            /* Add other protocols here. */
            __be16 all;
            struct { __be16 port; } tcp;
            struct { __be16 port; } udp;
            struct { u_int8_t type, code; } icmp;
            struct { __be16 port; } dccp;
            struct { __be16 port; } sctp;
            struct { __be16 key; } gre;
        } u;
        u_int8_t protonum;   /* protocol */
        u_int8_t dir;
    } dst;
};
```
Connection tracking entries
struct nf_conn represents a connection tracking entry, i.e. the node stored in the conntrack hash table.
```c
struct nf_conn {
    struct nf_conntrack ct_general;
    spinlock_t lock;
    u16 cpu;
    struct nf_conntrack_zone zone;
    struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];   /* hash list nodes */
    unsigned long status;
    u32 timeout;
    possible_net_t ct_net;
    struct hlist_node nat_bysource;
    /* all members below initialized via memset */
    struct { } __nfct_init_offset;
    struct nf_conn *master;
    u_int32_t mark;
    u_int32_t secmark;
    struct nf_ct_ext *ext;
    union nf_conntrack_proto proto;
};
```
Next, let's look at the nf_conntrack_in() method:
```c
unsigned int nf_conntrack_in(struct net *net, u_int8_t pf,
                             unsigned int hooknum, struct sk_buff *skb)
    /* for pf == PF_INET, this returns the global nf_conntrack_l3proto_ipv4 */
    l3proto = __nf_ct_l3proto_find(pf);
    l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum);
        /* for IPv4, ->get_l4proto == ipv4_get_l4proto */
        *dataoff = nhoff + (iph->ihl << 2);
        *protonum = iph->protocol;   /* protonum is the L4 protocol */
    /* taking IPPROTO_TCP as an example, this returns the global nf_conntrack_l4proto_tcp4 */
    l4proto = __nf_ct_l4proto_find(pf, protonum);
    resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l3proto, l4proto);
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_tuple_hash *h;
        nf_ct_get_tuple()   /* fill in the tuple */
        /* hash the tuple, using the kernel's jhash2() */
        hash = hash_conntrack_raw(&tuple, net);
        /* look up the connection in the global nf_conntrack_hash table */
        h = __nf_conntrack_find_get(net, zone, &tuple, hash);
        if (!h)   /* if the connection does not exist, create one and put it on the unconfirmed list */
            h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, skb, dataoff, hash);
        ct = nf_ct_tuplehash_to_ctrack(h);   /* container_of back to the real connection object */
        ...   /* a series of ctinfo assignments; for a new connection ctinfo = IP_CT_NEW */
        nf_ct_set(skb, ct, ctinfo);   /* store the connection object and state in the skb */
            /* the low 4 bits of the pointer are always 0, so they hold the state value */
            skb->_nfct = (unsigned long)ct | info;
    timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
    /* taking TCP as an example, ->packet == tcp_packet() */
    l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts);
```
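The container_of step above deserves a note: tuplehash[] is embedded inside struct nf_conn, so a tuple hash node found in the table maps straight back to its owning entry. The 4.14 helper looks like this:

```c
static inline struct nf_conn *
nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
{
    /* dst.dir says which of the two per-direction nodes this is,
     * so the same arithmetic works for both directions. */
    return container_of(hash, struct nf_conn,
                        tuplehash[hash->tuple.dst.dir]);
}
```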
Now the code of ipv4_confirm():

```c
ipv4_confirm
    nf_conntrack_confirm

static inline int nf_conntrack_confirm(struct sk_buff *skb)
    ct = nf_ct_get(skb, &ctinfo);
    ...
    /* remove the connection from the unconfirmed or dying list */
    nf_ct_del_from_dying_or_unconfirmed_list(ct);
    ...
    __nf_conntrack_hash_insert(ct, hash, reply_hash);   /* insert into nf_conntrack_hash */
    ...
```
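Once nf_conntrack_in has attached the connection to the skb (and ipv4_confirm has confirmed it), any hook running later in the path can query it via nf_ct_get(). A minimal sketch (hypothetical hook code, not from the kernel tree):

```c
#include <net/netfilter/nf_conntrack.h>

/* Hypothetical callback: read the connection state back out of the skb. */
static unsigned int inspect_ct(void *priv, struct sk_buff *skb,
                               const struct nf_hook_state *state)
{
    enum ip_conntrack_info ctinfo;
    struct nf_conn *ct = nf_ct_get(skb, &ctinfo);   /* decodes skb->_nfct */

    if (ct && ctinfo == IP_CT_ESTABLISHED)
        ;   /* the connection has been seen in both directions */

    return NF_ACCEPT;
}
```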
iptables
iptables consists of a kernel part and a userspace part; the kernel part is the core.
As the name suggests, iptables is about tables of IP rules; each table is represented by struct xt_table. In IPv4, tables are registered and unregistered with ipt_register_table() and ipt_unregister_table().
```c
struct xt_table {
    struct list_head list;

    /* What hooks you will enter on */
    unsigned int valid_hooks;

    /* Man behind the curtain... */
    struct xt_table_info *private;

    struct module *me;

    u_int8_t af;      /* address/protocol family */
    int priority;     /* hook order */

    /* called when table is needed in the given netns */
    int (*table_init)(struct net *net);

    const char name[XT_TABLE_MAXNAMELEN];
};

int ipt_register_table(struct net *net, const struct xt_table *table,
                       const struct ipt_replace *repl,
                       const struct nf_hook_ops *ops, struct xt_table **res)
    xt_register_table(net, table, &bootstrap, newinfo);
        /* link the table into net->xt.tables */
        list_add(&table->list, &net->xt.tables[table->af]);
    /* register the netfilter hooks */
    nf_register_net_hooks(net, ops, hweight32(table->valid_hooks))
```
The struct net object contains the IPv4- and IPv6-specific objects netns_ipv4 and netns_ipv6, which in turn hold pointers to xt_table objects. For example, netns_ipv4 has pointers for iptable_filter, iptable_mangle, iptable_raw, arptable_filter, and nat_table.
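An abridged excerpt of those fields (from include/net/netns/ipv4.h in the 4.14 era; the config-option guards around the pointers are omitted here):

```c
struct netns_ipv4 {
    ...
    struct xt_table *iptable_filter;
    struct xt_table *iptable_mangle;
    struct xt_table *iptable_raw;
    struct xt_table *arptable_filter;
    struct xt_table *nat_table;
    ...
};
```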
Let's take the filter table, iptable_filter, as an example to dig further into how iptables works.
```c
// definition of the filter table
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
                            (1 << NF_INET_FORWARD) | \
                            (1 << NF_INET_LOCAL_OUT))

static const struct xt_table packet_filter = {
    .name        = "filter",
    /* per FILTER_VALID_HOOKS, hooks are attached at 3 of netfilter's hook points */
    .valid_hooks = FILTER_VALID_HOOKS,
    .me          = THIS_MODULE,
    .af          = NFPROTO_IPV4,
    .priority    = NF_IP_PRI_FILTER,
    .table_init  = iptable_filter_table_init,
};

// initialization
static int __init iptable_filter_init(void)
    /* allocate the netfilter hook ops; the callback at all 3 hook points is iptable_filter_hook */
    filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
    register_pernet_subsys(&iptable_filter_net_ops)
        iptable_filter_net_init
            iptable_filter_table_init(net)
                /* register the filter table */
                ipt_register_table(net, &packet_filter, repl, filter_ops,
                                   &net->ipv4.iptable_filter);
```
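The shared callback itself is essentially a dispatcher: apart from a sanity check for raw sockets at LOCAL_OUT, it hands the packet to the generic rule-traversal engine ipt_do_table() with the per-namespace filter table. In 4.14 it looks roughly like this:

```c
static unsigned int
iptable_filter_hook(void *priv, struct sk_buff *skb,
                    const struct nf_hook_state *state)
{
    if (state->hook == NF_INET_LOCAL_OUT &&
        (skb->len < sizeof(struct iphdr) ||
         ip_hdrlen(skb) < sizeof(struct iphdr)))
        /* root is playing with raw sockets. */
        return NF_ACCEPT;

    /* Walk the filter table's rules for this hook point and return the verdict. */
    return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
}
```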
To sum up: the kernel provides the tables, and the entries in those tables are set by userspace programs.
Here is an example of a userspace iptables command:
```
iptables -A INPUT -p udp --dport=5001 -j LOG --log-level 1
```
This rule appends an entry to the filter table that dumps inbound UDP packets with destination port 5001 to the system log. With the iptables command, the -t modifier selects the table to operate on; if it is omitted, the filter table is used by default.
Here is another rule:
```
iptables -A INPUT -p tcp -m conntrack --ctstate ESTABLISHED -j LOG --log-level 1
```
This rule filters packets by connection tracking state: packets whose connection state is ESTABLISHED are dumped to the system log.
This article focuses on the kernel source; the userspace iptables command deserves a separate article later.
NAT
NAT (Network Address Translation) translates IP addresses and/or ports. One of its most common uses is to let a group of LAN hosts with private IP addresses reach the Internet through the gateway's public IP.
NAT initialization
Like the filter table introduced in the previous section, the NAT table is an xt_table object.
```c
static const struct xt_table nf_nat_ipv4_table = {
    .name        = "nat",
    .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
                   (1 << NF_INET_POST_ROUTING) |
                   (1 << NF_INET_LOCAL_OUT) |
                   (1 << NF_INET_LOCAL_IN),
    .me          = THIS_MODULE,
    .af          = NFPROTO_IPV4,
    .table_init  = iptable_nat_table_init,
};
```
The NAT table's netfilter hook callbacks:
```c
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
    /* Before packet filtering, change destination */
    {
        .hook     = iptable_nat_ipv4_in,
        .pf       = NFPROTO_IPV4,
        .nat_hook = true,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_NAT_DST,
    },
    /* After packet filtering, change source */
    {
        .hook     = iptable_nat_ipv4_out,
        .pf       = NFPROTO_IPV4,
        .nat_hook = true,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_NAT_SRC,
    },
    /* Before packet filtering, change destination */
    {
        .hook     = iptable_nat_ipv4_local_fn,
        .pf       = NFPROTO_IPV4,
        .nat_hook = true,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_NAT_DST,
    },
    /* After packet filtering, change source */
    {
        .hook     = iptable_nat_ipv4_fn,
        .pf       = NFPROTO_IPV4,
        .nat_hook = true,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_NAT_SRC,
    },
};
```
NAT table initialization:

```c
static int __init iptable_nat_init(void)
    iptable_nat_table_init(&init_net)
        struct ipt_replace *repl;
        repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
        /* register the nat table via ipt_register_table */
        ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
                                 nf_nat_ipv4_ops, &net->ipv4.nat_table);
```
NAT hook callbacks
The core NAT implementation lives in net/netfilter/nf_nat_core.c. Its basic building blocks are the structures nf_nat_l4proto and nf_nat_l3proto (kernels before 3.7 used struct nf_nat_protocol instead). Both structures contain a manip_pkt() function pointer that modifies the packet headers. Let's look at both:
```c
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
    .l3proto         = NFPROTO_IPV4,
    .in_range        = nf_nat_ipv4_in_range,
    .secure_port     = nf_nat_ipv4_secure_port,
    .manip_pkt       = nf_nat_ipv4_manip_pkt,   /* rewrites the IP header */
    .csum_update     = nf_nat_ipv4_csum_update,
    .csum_recalc     = nf_nat_ipv4_csum_recalc,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
#endif
#ifdef CONFIG_XFRM
    .decode_session  = nf_nat_ipv4_decode_session,
#endif
};

// The header-rewriting function nf_nat_ipv4_manip_pkt deserves a closer look
static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
                                  const struct nf_nat_l4proto *l4proto,
                                  const struct nf_conntrack_tuple *target,
                                  enum nf_nat_manip_type maniptype)
    ...
    if (maniptype == NF_NAT_MANIP_SRC) {
        csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
        iph->saddr = target->src.u3.ip;   /* rewrite the source IP */
    } else {
        csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
        iph->daddr = target->dst.u3.ip;   /* rewrite the destination IP */
    }

// TCP
const struct nf_nat_l4proto nf_nat_l4proto_tcp = {
    .l4proto         = IPPROTO_TCP,
    .manip_pkt       = tcp_manip_pkt,   /* rewrites the TCP header */
    .in_range        = nf_nat_l4proto_in_range,
    .unique_tuple    = tcp_unique_tuple,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};

// tcp_manip_pkt; the UDP version is similar
static bool tcp_manip_pkt(struct sk_buff *skb,
                          const struct nf_nat_l3proto *l3proto,
                          unsigned int iphdroff, unsigned int hdroff,
                          const struct nf_conntrack_tuple *tuple,
                          enum nf_nat_manip_type maniptype)
    ...
    if (maniptype == NF_NAT_MANIP_SRC) {
        /* Get rid of src port */
        newport = tuple->src.u.tcp.port;
        portptr = &hdr->source;
    } else {
        /* Get rid of dst port */
        newport = tuple->dst.u.tcp.port;
        portptr = &hdr->dest;
    }
    oldport = *portptr;
    *portptr = newport;   /* rewrite the port */
```
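Note that nf_nat_ipv4_manip_pkt does not recompute the IP checksum from scratch: csum_replace4() patches it incrementally, subtracting the old 32-bit word and adding the new one (the RFC 1624 technique). Its definition in include/net/checksum.h is roughly:

```c
static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to)
{
    /* Unfold the folded sum, drop the old word, add the new one, refold. */
    __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from);

    *sum = csum_fold(csum_add(tmp, (__force __wsum)to));
}
```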
Finally, the netfilter hooks registered by the NAT module. The IPv4 NAT module registers callbacks at four hook points, and all four of them eventually call nf_nat_ipv4_fn():
```c
unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
                            const struct nf_hook_state *state,
                            unsigned int (*do_chain)(void *priv,
                                                     struct sk_buff *skb,
                                                     const struct nf_hook_state *state,
                                                     struct nf_conn *ct))
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;
    ct = nf_ct_get(skb, &ctinfo);
    if (!ct)
        return NF_ACCEPT;   /* no conntrack entry, nothing to do */
    switch (ctinfo)
    case IP_CT_NEW:
        if (!nf_nat_initialized(ct, maniptype))
            /* do_chain eventually calls ipt_do_table to look up a matching
             * entry in the nat table; if one is found, its target callback runs */
            do_chain(priv, skb, state, ct);
    /* perform the packet rewrite */
    nf_nat_packet(ct, ctinfo, state->hook, skb);
        /* l3proto here is the nf_nat_l3proto_ipv4 described above */
        l3proto = __nf_nat_l3proto_find(target.src.l3num);
        /* for TCP, l4proto is nf_nat_l4proto_tcp */
        l4proto = __nf_nat_l4proto_find(target.src.l3num, target.dst.protonum)
        l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)   /* call manip_pkt */
```