Comment 7 for bug 411349

Revision history for this message
Frits Jalvingh (fjalvingh) wrote :

I did a full strace on the server thread for such a run. It seems to do a lot of reading for a long, long time; then it looks like it is dying:
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0

lots of repeats of this, with intermittent something like:
fstat64(5, {st_mode=S_IFREG|0664, st_size=27403786, ...}) = 0
mmap2(NULL, 32768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x931a4000
_llseek(5, 0, [0], SEEK_SET) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
read(5, "Bazaar pack format 1 (introduced"..., 32768) = 32768
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
_llseek(5, 9961472, [9961472], SEEK_SET) = 0
read(5, "\331b\261\241\0\0\0B122\n\n\37\213\10\0\0\0\0\0\2\377\235"..., 32768) = 32768

Then near the end it goes:
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
mmap2(NULL, 5746688, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x8bea9000
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
send(6, "b\0\0\3|B874\ninventories\n\nknit-delt"..., 65536, 0) = 65536
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
send(6, "\313z\263|\304\325\372\31\363\242.r\17B{\2\376\355\214"..., 65536, 0) = 29648
send(6, "\242&\303}\335MV6\311&\207GJ\257\373\362\357\207\7\227"..., 35888, 0) = -1 ECONNRESET (Connection reset by peer)
futex(0x8ac2260, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/message.py", {st_mode=S_IFREG|0644, st_size=13718, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/medium.py", {st_mode=S_IFREG|0644, st_size=35183, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/osutils.py", {st_mode=S_IFREG|0644, st_size=61815, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/osutils.py", {st_mode=S_IFREG|0644, st_size=61815, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64("/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", {st_mode=S_IFREG|0644, st_size=50748, ...}) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
gettimeofday({1249995453, 349641}, NULL) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
write(3, "91094.887 Traceback (most recen"..., 2055) = 2055
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
recv(6, "", 65536, 0) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
recv(6, "", 65536, 0) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0xbbf5418, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8b669f8, FUTEX_WAKE, 1) = 0
futex(0x8ac2198, FUTEX_WAKE, 1) = 0
futex(0x8adca10, FUTEX_WAKE, 1) = 0
futex(0x8adca10, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
_exit(0) = ?
Process 9703 detached

At this point the log shows:
91094.887 Traceback (most recent call last):
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 893, in accept_bytes
    _StatefulDecoder.accept_bytes(self, bytes)
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 396, in accept_bytes
    self.state_accept()
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 998, in _state_accept_expecting_message_part
    self.done()
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1036, in done
    raise errors.SmartMessageHandlerError(sys.exc_info())
SmartMessageHandlerError: The message handler raised an exception:
Traceback (most recent call last):
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1034, in done
    self.message_handler.end_received()
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/message.py", line 167, in end_received
    self.responder.send_response(self.request_handler.response)
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1174, in send_response
    self._write_prefixed_body(chunk)
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1110, in _write_prefixed_body
    self._write_func(struct.pack('!L', len(bytes)))
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1069, in _write_func
    self.flush()
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1073, in flush
    self._real_write_func(''.join(self._buf))
  File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/medium.py", line 298, in _write_out
    osutils.send_all(self.socket, bytes, self._report_activity)
  File "/usr/local/lib/python2.4/site-packages/bzrlib/osutils.py", line 1724, in send_all
    until_no_eintr(socket.sendall, block)
  File "/usr/local/lib/python2.4/site-packages/bzrlib/osutils.py", line 1803, in until_no_eintr
    return f(*a, **kw)
  File "<string>", line 1, in sendall
error: (104, 'Connection reset by peer')

I think I know what at least is causing this: when this happens the client has not seen data for > 5 minutes. At this point appearently the firewall closes the connection (because no keepalive is active). Then when the server is finally ready to continue it gets the indication the connection is gone and aborts. The client gets no such indication and waits forever.

The base problem seems to be that because of the other problems we have querying the repository takes so long that the connection gets dropped by the firewall.