From: Neil Horman <nhorman@redhat.com> Date: Thu, 22 Oct 2009 10:33:21 -0400 Subject: [net] lvs: adjust sync protocol handling for ipvsadm -2 Message-id: 20091022143321.GB25788@shamino.rdu.redhat.com O-Subject: Re: [RHEL 5.5 PATCH] lvs: adjust sync protocol handling so timeout values are taken correctly from ipvsadm (bz 524129) Bugzilla: 524129 RH-Acked-by: Thomas Graf <tgraf@redhat.com> RH-Acked-by: Jiri Pirko <jpirko@redhat.com> RH-Acked-by: Cong Wang <amwang@redhat.com> RH-Acked-by: Danny Feng <dfeng@redhat.com> Ok, in response to congs concern over the check on that state variable, I've gone and backported the entirety of this LVS upstream commit. I've built it in brew and booted it, and that all works fine. I need to point out two things: 1) The functionality of this patch is completely untested. I'm posting it here for early review to save time, but we're expecting test results from NASA next week, so we need to wait until at least then to incorporate the patch. Early reviews would be appreciated however 2) The extra bits that this patch takes in constitute an ABI break. The affected symbols aren't on the whitelist, so from a strict ABI standpoint, we're ok, but because of how REGISTER_PROTOCOL works in LVS, any 3rd party modules out there that register LVS protocol support are going to start scribbling over memory with this patch. If anyone knows of any partners/vendors/users doing this sort of thing, please let me know now, so we can give them an early heads up. Thanks Neil diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3b57b15..6f0c2ea 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -436,7 +436,8 @@ struct sk_buff; struct ip_vs_protocol { struct ip_vs_protocol *next; char *name; - __u16 protocol; + u16 protocol; + u16 num_states; int dont_defrag; atomic_t appcnt; /* counter of proto app incs */ int *timeout_table; /* protocol timeout table */ diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 867d4e9..4bd7dd9 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -154,7 +154,7 @@ const char * ip_vs_state_name(__u16 proto, int state) struct ip_vs_protocol *pp = ip_vs_proto_get(proto); if (pp == NULL || pp->state_name == NULL) - return "ERR!"; + return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; return pp->state_name(state); } diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 8b0505b..5a90372 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -160,6 +160,7 @@ static void ah_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, + .num_states = 1, .dont_defrag = 1, .init = ah_init, .exit = ah_exit, diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index c36ccf0..2cdc6a5 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -159,6 +159,7 @@ static void esp_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_esp = { .name = "ESP", .protocol = IPPROTO_ESP, + .num_states = 1, .dont_defrag = 1, .init = esp_init, .exit = esp_exit, diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index bc28b11..7e8364b 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -597,6 +597,7 @@ static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .appcnt = ATOMIC_INIT(0), .init = ip_vs_tcp_init, diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 89d9175..c96ae99 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -410,6 +410,7 @@ static void udp_exit(struct ip_vs_protocol *pp) struct ip_vs_protocol ip_vs_protocol_udp = { .name = "UDP", .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, .init = udp_init, .exit = udp_exit, diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 07da8e4..03fcde8 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -278,15 +278,21 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; struct ip_vs_sync_conn *s; struct ip_vs_sync_conn_options *opt; + struct ip_vs_protocol *pp; struct ip_vs_conn *cp; char *p; int i; + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_ERR_RL("sync message header too short\n"); + return; + } + /* Convert size back to host byte order */ m->size = ntohs(m->size); if (buflen != m->size) { - IP_VS_ERR("bogus message\n"); + IP_VS_ERR_RL("bogus sync message size\n"); return; } @@ -299,10 +305,52 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); for (i=0; i<m->nr_conns; i++) { - unsigned flags; + unsigned flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("bogus conn in sync message\n"); + return; + } + s = (struct ip_vs_sync_conn *) p; + + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("bogus conn options in sync message\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } - s = (struct ip_vs_sync_conn *)p; - flags = ntohs(s->flags); + state = ntohs(s->state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + state); + state = 0; + } + } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(s->protocol, s->caddr, s->cport, @@ -321,7 +369,6 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) IP_VS_ERR("ip_vs_conn_new failed\n"); return; } - cp->state = ntohs(s->state); } else if (!cp->dest) { /* it is an entry created by the synchronization */ cp->state = ntohs(s->state); @@ -329,21 +376,23 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } /* Note that we don't touch its state and flags if it is a normal entry. */ - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; + if (opt) memcpy(&cp->in_seq, opt, sizeof(*opt)); - p += FULL_CONN_SIZE; - } else - p += SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; + cp->state = state; + /* + * We can not recover the right timeout for templates + * in all cases, we can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + */ + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); ip_vs_conn_put(cp); - if (p > buffer+buflen) { - IP_VS_ERR("bogus message\n"); - return; - } } }