环境:
- 版本:kernel-5.4.54 amd64 双核 ubuntu18.04
- k8s集群网络组件:flannel,kube-proxy: ipvs
- 代码工具:vs code
1.概述
- SNAT(源地址转换)是IPTABLES的NAT表的核心功能,广泛应用于路由器,云服务器,K8S集群等内网环境中,是内核网络子系统中不可或缺的功能
- IPTABLES的NAT完全依赖于netfilter的conntrack,对于没有进行conntrack的数据包无法进行NAT
- 在K8S集群中DNAT用于负载均衡,SNAT用来保证节点转发的数据包能回到节点去完成de-DNAT还原,而不是直接发给客户端。
- 客户端访问的是负载均衡IP,后端IP直接回包给客户端的话,客户端无法识别;
- 后端IP回包先转给负载均衡器,将后端IP还原成负载均衡IP之后再发给客户端
- IPTABLES和IPVS都可以实现DNAT负载均衡的功能,但是SNAT只能由IPTABLES实现
- 查看集群中IPTABLES的SNAT规则
root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL
Chain INPUT (policy ACCEPT)
target prot opt source destination
Chain OUTPUT (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 !127.0.0.0/8 ADDRTYPE match dst-type LOCAL
Chain POSTROUTING (policy ACCEPT)
target prot opt source destination
KUBE-POSTROUTING all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
MASQUERADE all -- 172.17.0.0/16 0.0.0.0/0
RETURN all -- 10.244.0.0/16 10.244.0.0/16
MASQUERADE all -- 10.244.0.0/16 !224.0.0.0/4
RETURN all -- !10.244.0.0/16 10.244.2.0/24
MASQUERADE all -- !10.244.0.0/16 10.244.0.0/16
...
Chain KUBE-POSTROUTING (1 references)
target prot opt source destination
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0 match-set KUBE-LOOP-BACK dst,dst,src
RETURN all -- 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
MARK all -- 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
/* kubernetes service traffic requiring SNAT */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0
...
分析MASQUERADE是如何做SNAT的,对于我们了解集群间网络通信很有帮助
2.概念
2.1 de-SNAT
为什么要做de-SNAT? 假设本机将POD1发出的包进行了SNAT,源IP从POD1-IP变成了HOST-IP;这样服务端回包目的地是HOST-IP,但是需要收包的是POD1,如果不de-SNAT把回包的目的地改为POD1-IP,POD1就无法收到数据包
2.2 netfilter中的与SNAT有关的钩子点
K8S集群的SNAT规则是在POST_ROUTING做SNAT,在PRE_ROUTING做de-SNAT
3.代码分析
3.1 MASQUERADE在NAT表中注册的钩子函数
/* xtables target registrations for "MASQUERADE": an IPv6 entry followed by
 * an IPv4 entry. Both are bound to the "nat" table and set only the
 * NF_INET_POST_ROUTING bit in .hooks.
 *
 * The first opening brace sits outside the #if on purpose: when CONFIG_IPV6
 * is disabled, the IPv6 fields and the "}, {" separator disappear and that
 * brace opens the IPv4 entry instead.
 */
static struct xt_target masquerade_tg_reg[] __read_mostly = {
{
#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 entry, compiled only when CONFIG_IPV6 is enabled. */
.name = "MASQUERADE",
.family = NFPROTO_IPV6,
.target = masquerade_tg6,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg6_checkentry,
.destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
}, {
#endif
/* IPv4 entry: masquerade_tg is the per-packet handler; the rule payload
 * has the size of struct nf_nat_ipv4_multi_range_compat.
 */
.name = "MASQUERADE",
.family = NFPROTO_IPV4,
.target = masquerade_tg,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
.destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
}
};
3.2 masquerade_tg分析
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_nat_range2 range;
const struct nf_nat_ipv4_multi_range_compat *mr;
/* 获取规则的配置和SNAT的可用端口范围 */
mr = par->targinfo;
range.flags = mr->range[0].flags;
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
/* 核心函数 */
return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
xt_out(par));
}
3.2.1 nf_nat_masquerade_ipv4分析
/* Masquerade an IPv4 connection: select a source address based on the
 * output device and next hop, then hand a single-address range to
 * nf_nat_setup_info() as a source manipulation.
 *
 * @skb:     packet being processed; its conntrack entry and route are read
 * @hooknum: hook the target runs at; any value other than
 *           NF_INET_POST_ROUTING triggers a WARN_ON
 * @range:   rule-supplied range; only flags, min_proto and max_proto are used
 * @out:     output device, used for address selection and stored in
 *           nat->masq_index
 *
 * Returns NF_ACCEPT without setting up NAT when the original-direction
 * source address is 0, NF_DROP when no source address could be selected,
 * otherwise the verdict of nf_nat_setup_info().
 */
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
const struct nf_nat_range2 *range,
const struct net_device *out)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
enum ip_conntrack_info ctinfo;
struct nf_nat_range2 newrange;
const struct rtable *rt;
__be32 newsrc, nh;
WARN_ON(hooknum != NF_INET_POST_ROUTING);
/* Fetch the conntrack entry attached to the skb. */
ct = nf_ct_get(skb, &ctinfo);
/* NOTE(review): this check has no early return; ct is dereferenced
 * below regardless of its outcome.
 */
WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
/* Source address is 0.0.0.0 - locally generated packet that is
 * probably not supposed to be masqueraded.
 */
if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
return NF_ACCEPT;
/* Route entry associated with the skb. */
rt = skb_rtable(skb);
/* Next-hop address towards the packet's destination. */
nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
/* Choose the address to masquerade as, from the output device and
 * the next hop.
 */
newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
if (!newsrc) {
pr_info("%s ate my IP address\n", out->name);
return NF_DROP;
}
/* Record the output interface index in the NAT extension, if one
 * could be added.
 */
nat = nf_ct_nat_ext_add(ct);
if (nat)
nat->masq_index = out->ifindex;
/* Transfer from original range. */
/* Build the new range: a single source address (min == max == newsrc)
 * with NF_NAT_RANGE_MAP_IPS forced on, plus the caller's proto bounds.
 */
memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
newrange.min_addr.ip = newsrc;
newrange.max_addr.ip = newsrc;
newrange.min_proto = range->min_proto;
newrange.max_proto = range->max_proto;
/* Hand modified range to generic setup. */
/* The generic code picks the mapping from this range and updates the
 * conntrack entry.
 */
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
3.2.2 nf_nat_setup_info分析
/* Set up the NAT mapping of type @maniptype on conntrack entry @ct, taking
 * the new tuple from @range.
 *
 * Returns NF_ACCEPT immediately, without changes, when @ct is already
 * confirmed; NF_DROP when this manip type was already initialized on @ct
 * or when the seqadj extension cannot be added; NF_ACCEPT once the mapping
 * has been recorded.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
/* Can't setup nat info for confirmed ct. */
if (nf_ct_is_confirmed(ct))
return NF_ACCEPT;
WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
maniptype != NF_NAT_MANIP_DST);
if (WARN_ON(nf_nat_initialized(ct, maniptype)))
return NF_DROP;
/* What we've got will look like inverse of reply. Normally
 * this is what is in the conntrack, except for prior
 * manipulations (future optimization: if num_manips == 0,
 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
 */
nf_ct_invert_tuple(&curr_tuple,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* Pick a unique tuple within the allowed range. */
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
/* Alter conntrack table so will recognize replies. */
/* The reply-direction tuple becomes the inverse of the new tuple. */
nf_ct_invert_tuple(&reply, &new_tuple);
nf_conntrack_alter_reply(ct, &reply);
/* Non-atomic: we own this at the moment. */
/* Flag which kind of NAT this connection needs. */
if (maniptype == NF_NAT_MANIP_SRC)
ct->status |= IPS_SRC_NAT;
else
ct->status |= IPS_DST_NAT;
/* A connection with a helper but no seqadj extension gets one
 * added here; failure to add it drops the packet.
 */
if (nfct_help(ct) && !nfct_seqadj(ct))
if (!nfct_seqadj_ext_add(ct))
return NF_DROP;
}
/* Source manips are linked into nf_nat_bysource, hashed on the
 * original-direction tuple, under the lock selected by
 * srchash % CONNTRACK_LOCKS.
 */
if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
spinlock_t *lock;
srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
spin_unlock_bh(lock);
}
/* It's done. */
/* The *_DONE bit is set whether or not the tuple was changed above. */
if (maniptype == NF_NAT_MANIP_DST)
ct->status |= IPS_DST_NAT_DONE;
else
ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
3.2.3 get_unique_tuple分析
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 *
 * @tuple:      output; receives the chosen tuple
 * @orig_tuple: tuple to start from
 * @range:      allowed range and flags
 * @ct:         conntrack entry the tuple is being chosen for
 * @maniptype:  source or destination manipulation
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
const struct nf_nat_range2 *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
struct net *net = nf_ct_net(ct);
zone = nf_ct_zone(ct);
/* 1) If this srcip/proto/src-proto-part is currently mapped,
 * and that same mapping gives a unique tuple within the given
 * range, use that.
 *
 * This is only required for source (ie. NAT/masq) mappings.
 * So far, we don't do local source mappings, so multiple
 * manips not an issue.
 */
/* First see whether the unmodified tuple already fits the range;
 * failing that, try to reuse a source mapping from an existing
 * connection.
 */
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* Reached only for source manips without any of the
 * NF_NAT_RANGE_PROTO_RANDOM_ALL flags.
 */
/* try the original tuple first */
/* Does the original tuple fit the range as-is? */
if (in_range(orig_tuple, range)) {
/* Use it unchanged if no other connection holds it. */
if (!nf_nat_used_tuple(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
/* Otherwise look for an existing source mapping that fits. */
} else if (find_appropriate_src(net, zone,
orig_tuple, tuple, range)) {
pr_debug("get_unique_tuple: Found current src map\n");
/* Use the found mapping if the resulting tuple is unused. */
if (!nf_nat_used_tuple(tuple, ct))
return;
}
}
/* Reached with random port selection, or when neither attempt above
 * produced a usable tuple.
 */
/* 2) Select the least-used IP/proto combination in the given range */
*tuple = *orig_tuple;
/* Choose the address part of the tuple from the range. */
find_best_ips_proto(zone, tuple, range, ct, maniptype);
/* 3) The per-protocol part of the manip is made to map into
 * the range to make a unique tuple.
 */
/* Only bother mapping if it's not already in range and unique */
/* Before touching the port, check whether the tuple is already
 * acceptable.
 */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
&range->min_proto,
&range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
/* Not random && proto range specified && no offset flag &&
 * port within range && (single-port range || tuple unused):
 * keep the tuple as it is.
 */
return;
} else if (!nf_nat_used_tuple(tuple, ct)) {
/* Not random && no proto range specified && tuple unused:
 * keep the tuple as it is.
 */
return;
}
}
/* Last chance: get protocol to try to obtain unique tuple. */
/* Let the l4 protocol code choose a port within the range that
 * yields a unique tuple.
 */
nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
这里先不修改数据包,只修改conntrack连接记录,后续再根据连接记录对数据包进行修改。对数据包的修改和de-SNAT的分析见NAT分析文档:IPTABLES的连接跟踪与NAT分析
3.3 SNAT与MASQ区别
3.3.1 SNAT钩子函数
/* xtables target registrations for the NAT targets; only the first entry
 * (IPv4 "SNAT", revision 0) is shown in this excerpt. It is bound to the
 * "nat" table and, unlike a POST_ROUTING-only target, sets both the
 * NF_INET_POST_ROUTING and NF_INET_LOCAL_IN bits in .hooks.
 */
static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "SNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
.destroy = xt_nat_destroy,
.target = xt_snat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
.table = "nat",
.hooks = (1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
...
3.3.2 xt_snat_target_v0分析
static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
WARN_ON(!(ct != NULL &&
(ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
/* 获取范围 */
xt_nat_convert_range(&range, &mr->range[0]);
/* 根据可用范围确定SNAT源地址,并修改连接记录 */
return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
可以看到SNAT和MASQ最后都调用了nf_nat_setup_info,区别是MASQ在前面有一个选择最合适源IP的步骤。