2.4.36-stable kernel tree
修订版 | 0cf06e61768d8a77aa28cc1444e111496a6553e4 (tree) |
---|---|
时间 | 2008-09-22 13:32:20 |
作者 | Gilles Espinasse <g.esp@free...> |
Committer | Willy Tarreau |
tcp: Clear probes_out more aggressively in tcp_ack().
backport of 2.6 commit 4b53fb67e385b856a991d402096379dab462170a
Test conditions : 2.4.36 kernel using this iptables configuration
iptables -N SLOWLO
iptables -A SLOWLO -m limit --limit 2/sec --limit-burst 1 -j ACCEPT
iptables -A SLOWLO -j DROP
iptables -A OUTPUT -o lo -p tcp --dport 12000 -j SLOWLO
Using ss borrowed from iproute2-2.4.7-now-ss020116-try.tar.gz and Eric Dumazet's test program,
I got the same result on 2.4.36.7 without the patch as he did on 2.6.25.
This is based upon an excellent bug report from Eric Dumazet.
tcp_ack() should clear ->icsk_probes_out even if there are packets
outstanding. Otherwise if we get a sequence of ACKs while we do have
packets outstanding over and over again, we'll never clear the
probes_out value and eventually think the connection is too sick and
we'll reset it.
This appears to be some "optimization" added to tcp_ack() in the 2.4.x
timeframe. In 2.2.x, probes_out is pretty much always cleared by
tcp_ack().
Here is Eric's original report:
Apparently, we can in some situations reset TCP connections in a couple of seconds when some frames are lost.
In order to reproduce the problem, please try the following program on linux-2.6.25.*
Setup some iptables rules to allow two frames per second sent on loopback interface to tcp destination port 12000
...
Then run the attached program and see the output :
./test_tcp-input
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,1)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,3)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,5)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,7)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,9)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,11)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,13)
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 40 127.0.0.1:32769 127.0.0.1:12000 timer:(persist,180ms,15)
write(): Connection timed out
wrote 880 bytes but was interrupted after 10 seconds
ESTAB 0 0 127.0.0.1:12000 127.0.0.1:32769
Exiting read() because no data available (4000 ms timeout).
read 860 bytes
While this TCP session is making progress (sending frames with 50 bytes of payload every 500 ms), the Linux TCP stack decides to reset it once tcp_retries2 is reached (default value: 15).
...
Source of program :
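/*
 * Small producer/consumer test program: listens on 127.0.0.1:12000 and
 * forks; the child connects and writes 10 bytes every 100 ms while the
 * parent accepts the connection and reads the data.
 */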
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
/* TCP port on the loopback address that the listener binds to and the child connects to. */
int port = 12000;
/* Receive buffer handed to read() by the parent (reader) process. */
char buffer[4096];
/*
 * Reproducer: a slow writer and a reader talking over loopback TCP.
 *
 * The parent listens on 127.0.0.1:port and forks.  The child connects,
 * writes 10 bytes every 100 ms and runs "ss" after every tenth write,
 * until write() fails or returns 0.  The parent accepts the connection
 * and reads until EOF, an error, or 4000 ms without any data.
 *
 * Returns 0 after a normal run (including the child's write() failure,
 * which is the expected outcome on an affected kernel), 1 when socket
 * setup, fork(), connect() or accept() fails.
 */
int main(int argc, char *argv[])
{
	/* Zero-initialise so that padding (sin_zero) is not stack garbage. */
	struct sockaddr_in socket_address = { 0 };
	socklen_t alen = sizeof(socket_address);
	unsigned long total = 0;
	time_t t0, t1;
	int on = 1, lfd, sfd;
	ssize_t res;
	pid_t pid;

	(void)argc;
	(void)argv;

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	time(&t0);

	socket_address.sin_family = AF_INET;
	socket_address.sin_port = htons(port);
	socket_address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (lfd == -1) {
		perror("socket()");
		return 1;
	}

	/* Best effort: lets the test be re-run immediately on the same port. */
	setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));

	if (bind(lfd, (struct sockaddr *)&socket_address, sizeof(socket_address)) == -1) {
		perror("bind");
		close(lfd);
		return 1;
	}
	if (listen(lfd, 1) == -1) {
		perror("listen()");
		close(lfd);
		return 1;
	}

	pid = fork();
	if (pid == -1) {
		/* Without a writer the parent would block in accept() forever. */
		perror("fork()");
		close(lfd);
		return 1;
	}

	if (pid == 0) {
		/* Child: the writer side. */
		int i = 0;
		int cfd = socket(AF_INET, SOCK_STREAM, 0);

		close(lfd);
		if (cfd == -1) {
			perror("socket()");
			return 1;
		}
		if (connect(cfd, (struct sockaddr *)&socket_address, sizeof(socket_address)) == -1) {
			perror("connect()");
			close(cfd);
			return 1;
		}

		for (;;) {
			res = write(cfd, "blablabla\n", 10);
			if (res <= 0) {
				if (res == -1)
					perror("write()");
				break;
			}
			total += (unsigned long)res;

			usleep(100000);
			/* Once per second (10 writes x 100 ms) show the socket timers. */
			if (++i == 10) {
				system("ss -on dst 127.0.0.1:12000");
				i = 0;
			}
		}

		time(&t1);
		fprintf(stderr, "wrote %lu bytes but was interrupted after %g seconds\n", total, difftime(t1, t0));
		system("ss -on | grep 127.0.0.1:12000");
		close(cfd);
		return 0;
	}

	/* Parent: the reader side. */
	sfd = accept(lfd, (struct sockaddr *)&socket_address, &alen);
	if (sfd == -1) {
		perror("accept");
		close(lfd);
		return 1;
	}
	close(lfd);

	for (;;) {
		struct pollfd pfd[1];
		int ready;

		pfd[0].fd = sfd;
		pfd[0].events = POLLIN;
		pfd[0].revents = 0;

		ready = poll(pfd, 1, 4000);
		if (ready == 0) {
			fprintf(stderr, "Exiting read() because no data available (4000 ms timeout).\n");
			break;
		}
		if (ready == -1) {
			/* Do not fall through into a read() that could block. */
			perror("poll()");
			break;
		}

		res = read(sfd, buffer, sizeof(buffer));
		if (res > 0) {
			total += (unsigned long)res;
		} else if (res == 0) {
			break;
		} else {
			/* Stop on error instead of polling the broken socket again. */
			perror("read()");
			break;
		}
	}

	fprintf(stderr, "read %lu bytes\n", total);
	close(sfd);
	return 0;
}
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Gilles Espinasse <g.esp@free.fr>
Signed-off-by: Willy Tarreau <w@1wt.eu>
@@ -2814,6 +2814,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | ||
2814 | 2814 | * log. Something worked... |
2815 | 2815 | */ |
2816 | 2816 | sk->err_soft = 0; |
2817 | + tp->probes_out = 0; | |
2817 | 2818 | tp->rcv_tstamp = tcp_time_stamp; |
2818 | 2819 | if ((prior_packets = tp->packets_out) == 0) |
2819 | 2820 | goto no_queue; |
@@ -2845,8 +2846,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | ||
2845 | 2846 | return 1; |
2846 | 2847 | |
2847 | 2848 | no_queue: |
2848 | - tp->probes_out = 0; | |
2849 | - | |
2850 | 2849 | /* If this ack opens up a zero window, clear backoff. It was |
2851 | 2850 | * being used to time the probes, and is probably far higher than |
2852 | 2851 | * it needs to be for normal retransmission. |