1.Netfilter & CONNTRACK & IPVS结构图
2.IPVS
ipvs只有DNAT和de-DNAT功能 ,它独立与iptables和conntrack,实现了自己的一套连接跟踪表和NAT机制
2.1 ipvs与conntrack的联系:
ipvs仅仅在做DNAT后对conntrack连接进行更新,防止回包因为没有记录而被丢弃 ipvs在TUNNEL模式下,会调用nf_conntrack_confirm函数对连接进行确认
2.2 ipvs注册的钩子函数
static const struct nf_hook_ops ip_vs_ops[] = {
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4, /* ip_vs_out */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
.hook = ip_vs_remote_request4,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply4,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
};
2.3 IPVS中tcp协议的状态转换表
/*
* Timeout table[state]
*/
static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
[IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
[IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
[IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
[IP_VS_TCP_S_CLOSE] = 10*HZ,
[IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
[IP_VS_TCP_S_LAST_ACK] = 30*HZ,
[IP_VS_TCP_S_LISTEN] = 2*60*HZ,
[IP_VS_TCP_S_SYNACK] = 120*HZ,
[IP_VS_TCP_S_LAST] = 2*HZ,
};
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK
static struct tcp_states_t tcp_states[] = {
/* INPUT ip_vs_in调用 */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA 初始状态 */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT ip_vs_out调用 */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA 初始状态 */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY ip_vs_in在没有收到回包时调用*/
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
2.4 ip_vs_in
为目的地为虚拟服务器的数据包确认连接,并为连接分配后端,然后转发数据包
- 数据包四元组匹配到了连接记录
- 连接不复用 - 释放连接
- 连接复用
- 复用连接
- 数据包四元组没有匹配到连接记录,或者连接被释放
- 目的地是虚拟服务器 - 分配后端,新建连接
- 目的地不是虚拟服务器
- 返回ACCEPT
- 统计计数,更新四层协议连接状态
- 执行DNAT,转发数据包到LOCAL_OUT
- 更新连接保持时间
源码分析:
/*
* Check if it's for virtual services, look it up,
* and send it on its way...
*/
static unsigned int
ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
int conn_reuse_mode;
struct sock *sk;
/* 已经被ipvs处理过则不处理 */
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/*
* Big tappo:
* - remote client: only PACKET_HOST
* - route: used for struct net when skb->dev is unset
*/
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
ip_vs_fill_iph_skb(af, skb, false, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
/* 获取ip头 */
ip_vs_fill_iph_skb(af, skb, false, &iph);
/* 获取数据包所属sock */
/* Bad... Do not break raw sockets */
sk = skb_to_full_sk(skb);
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
hooknum, &iph);
if (related)
return verdict;
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
int verdict = ip_vs_in_icmp(ipvs, skb, &related,
hooknum);
if (related)
return verdict;
}
/* Protocol supported? */
/* 判断是否为ipvs支持的协议 */
pd = ip_vs_proto_data_get(ipvs, iph.protocol);
if (unlikely(!pd)) {
/* The only way we'll see this packet again is if it's
* encapsulated, so mark it with ipvs_property=1 so we
* skip it if we're ignoring tunneled packets
*/
if (sysctl_ignore_tunneled(ipvs))
skb->ipvs_property = 1;
return NF_ACCEPT;
}
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
/* 在ipvs连接跟踪表里查找数据包所属连接 */
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, af, skb, &iph);
/* conn_reuse_mode是ipvs连接复用参数
* frag是分片偏移量
* is_new_conn()是判断tcp头的syn标志位
*/
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
/* 找到了所属连接并且是SYN,非分片,reuse_mode==1,时会走到这里 */
bool uses_ct = false, resched = false;
/* 判断expire_nodest_conn和连接的目的地的weight */
if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight))) {
/* expire_nodest_conn表示释放不可用后端的连接
* 后端不可用会走到这里
*/
resched = true;
/* 是否使用了nf_conntrack */
uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
/* 判断之前的连接是否可以释放 */
} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
/* 是否使用了nf_conntrack */
uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
if (!atomic_read(&cp->n_control)) {
resched = true;
} else {
/* Do not reschedule controlling connection
* that uses conntrack while it is still
* referenced by controlled connection(s).
*/
resched = !uses_ct;
}
}
if (resched) {
/* 提前释放之前的连接 */
if (!atomic_read(&cp->n_control))
ip_vs_conn_expire_now(cp);
__ip_vs_conn_put(cp);
/* 这里有一个bug,如果使用了conntrack,直接丢包,客户端必须重传
* 重传导致产生1s延迟 */
if (uses_ct)
return NF_DROP;
cp = NULL;
}
}
if (unlikely(!cp)) {
/* 没有连接记录和不复用连接记录会走到这里 */
int v;
/* 进行连接记录的创建和目的地的确认 */
if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
/* 没有匹配到service的不属于ipvs的数据包返回ACCEPT */
return v;
}
/* 属于IPVS的service的数据包会走到这里 */
IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
__u32 flags = cp->flags;
/* when timer already started, silently drop the packet.*/
if (timer_pending(&cp->timer))
__ip_vs_conn_put(cp);
else
ip_vs_conn_put(cp);
if (sysctl_expire_nodest_conn(ipvs) &&
!(flags & IP_VS_CONN_F_ONE_PACKET)) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
return NF_DROP;
}
/* 统计计数 */
ip_vs_in_stats(cp, skb);
/* 更新四层协议连接状态 */
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
/* DNAT之后,发送数据包到local_out
* 发送成功ret = NF_STOLEN
*/
ret = cp->packet_xmit(skb, cp, pp, &iph);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
ret = NF_ACCEPT;
}
/* Increase its packet counter and check if it is needed
* to be synchronized
*
* Sync connection if it is about to close to
* encorage the standby servers to update the connections timeout
*
* For ONE_PKT let ip_vs_sync_conn() do the filter work.
*/
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
pkts = sysctl_sync_threshold(ipvs);
else
pkts = atomic_add_return(1, &cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
/* increment is done inside ip_vs_sync_conn too */
atomic_inc(&cp->control->in_pkts);
/* 更新连接记录保持时间 */
ip_vs_conn_put(cp);
return ret;
}
2.5 ip_vs_out
为回包确认所属连接,并将回包做还原处理 源码分析:
/*
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
struct sock *sk;
EnterFunction(11);
/* 已经被ipvs处理过 */
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
/* 获取所属连接?? */
sk = skb_to_full_sk(skb);
/* Bad... Do not break raw sockets */
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
if (!ipvs->enable)
return NF_ACCEPT;
/* 获取ip协议头 */
ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
hooknum, &iph);
if (related)
return verdict;
}
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related;
int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);
if (related)
return verdict;
}
/* 判断是否是ipvs支持的四层协议类型 */
pd = ip_vs_proto_data_get(ipvs, iph.protocol);
if (unlikely(!pd))
return NF_ACCEPT;
pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET)
#endif
if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
if (ip_vs_gather_frags(ipvs, skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
/* 是否属于现有连接 */
cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
ipvs, af, skb, &iph);
if (likely(cp)) {
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
goto ignore_cp;
/* de-DNAT和连接状态更新 */
return handle_response(af, skb, pd, cp, &iph, hooknum);
}
/* UDP协议 */
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
/* Currently only for UDP:
* connection oriented protocols typically use
* ephemeral ports for outgoing connections, so
* related incoming responses would not match any VS
*/
if (pp->protocol == IPPROTO_UDP) {
cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph,
hooknum);
}
}
/* icmp协议 */
if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
__be16 _ports[2], *pptr;
pptr = frag_safe_skb_hp(skb, iph.len,
sizeof(_ports), _ports);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
pptr[0])) {
/*
* Notify the real server: there is no
* existing entry if it is not RST
* packet or not TCP packet.
*/
if ((iph.protocol != IPPROTO_TCP &&
iph.protocol != IPPROTO_SCTP)
|| ((iph.protocol == IPPROTO_TCP
&& !is_tcp_reset(skb, iph.len))
|| (iph.protocol == IPPROTO_SCTP
&& !is_sctp_abort(skb,
iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (!skb->dev)
skb->dev = ipvs->net->loopback_dev;
icmpv6_send(skb,
ICMPV6_DEST_UNREACH,
ICMPV6_PORT_UNREACH,
0);
} else
#endif
icmp_send(skb,
ICMP_DEST_UNREACH,
ICMP_PORT_UNREACH, 0);
return NF_DROP;
}
}
}
out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
ignore_cp:
__ip_vs_conn_put(cp);
goto out;
}
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
unsigned int hooknum)
{
struct ip_vs_protocol *pp = pd->pp;
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
if (skb_ensure_writable(skb, iph->len))
goto drop;
/* mangle the packet */
/* 四层de-DNAT */
if (pp->snat_handler &&
!SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
goto drop;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
ipv6_hdr(skb)->saddr = cp->vaddr.in6;
else
#endif
{
/* 三层de-DNAT */
ip_hdr(skb)->saddr = cp->vaddr.ip;
ip_send_check(ip_hdr(skb));
}
/*
* nf_iterate does not expect change in the skb->dst->dev.
* It looks like it is not fatal to enable this code for hooks
* where our handlers are at the end of the chain list and
* when all next handlers use skb->dst->dev and not outdev.
* It will definitely route properly the inout NAT traffic
* when multiple paths are used.
*/
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
* if it came from this machine itself. So re-compute
* the routing information.
*/
/* 重新路由,由snat_reroute参数决定 */
if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto drop;
IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
/* 统计计数 */
ip_vs_out_stats(cp, skb);
/* 更新四层协议状态 */
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
/* 更新连接记录保持时间 */
ip_vs_conn_put(cp);
LeaveFunction(11);
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
LeaveFunction(11);
return NF_STOLEN;
}