I did a full strace on the server thread for such a run. It seems to do a lot of reading for a long, long time; then it looks like it is dying:
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
At this point the log shows:
91094.887 Traceback (most recent call last):
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 893, in accept_bytes
_StatefulDecoder.accept_bytes(self, bytes)
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 396, in accept_bytes
self.state_accept()
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 998, in _state_accept_expecting_message_part
self.done()
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1036, in done
raise errors.SmartMessageHandlerError(sys.exc_info())
SmartMessageHandlerError: The message handler raised an exception:
Traceback (most recent call last):
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1034, in done
self.message_handler.end_received()
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/message.py", line 167, in end_received
self.responder.send_response(self.request_handler.response)
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1174, in send_response
self._write_prefixed_body(chunk)
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1110, in _write_prefixed_body
self._write_func(struct.pack('!L', len(bytes)))
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1069, in _write_func
self.flush()
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/protocol.py", line 1073, in flush
self._real_write_func(''.join(self._buf))
File "/usr/local/lib/python2.4/site-packages/bzrlib/smart/medium.py", line 298, in _write_out
osutils.send_all(self.socket, bytes, self._report_activity)
File "/usr/local/lib/python2.4/site-packages/bzrlib/osutils.py", line 1724, in send_all
until_no_eintr(socket.sendall, block)
File "/usr/local/lib/python2.4/site-packages/bzrlib/osutils.py", line 1803, in until_no_eintr
return f(*a, **kw)
File "<string>", line 1, in sendall
error: (104, 'Connection reset by peer')
I think I know what at least is causing this: when this happens the client has not seen data for > 5 minutes. At this point appearently the firewall closes the connection (because no keepalive is active). Then when the server is finally ready to continue it gets the indication the connection is gone and aborts. The client gets no such indication and waits forever.
The base problem seems to be that because of the other problems we have querying the repository takes so long that the connection gets dropped by the firewall.
I did a full strace on the server thread for such a run. It seems to do a lot of reading for a long, long time; then it looks like it is dying:
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
lots of repeats of this, with intermittent something like: S_IFREG| 0664, st_size=27403786, ...}) = 0 PROT_WRITE, MAP_PRIVATE| MAP_ANONYMOUS, -1, 0) = 0x931a4000 261\241\ 0\0\0B122\ n\n\37\ 213\10\ 0\0\0\0\ 0\2\377\ 235"... , 32768) = 32768
fstat64(5, {st_mode=
mmap2(NULL, 32768, PROT_READ|
_llseek(5, 0, [0], SEEK_SET) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
read(5, "Bazaar pack format 1 (introduced"..., 32768) = 32768
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
_llseek(5, 9961472, [9961472], SEEK_SET) = 0
read(5, "\331b\
Then near the end it goes: PROT_WRITE, MAP_PRIVATE| MAP_ANONYMOUS, -1, 0) = 0x8bea9000 3|B874\ ninventories\ n\nknit- delt".. ., 65536, 0) = 65536 263|\304\ 325\372\ 31\363\ 242.r\17B{ \2\376\ 355\214" ..., 65536, 0) = 29648 303}\335MV6\ 311&\207GJ\ 257\373\ 362\357\ 207\7\227" ..., 35888, 0) = -1 ECONNRESET (Connection reset by peer) "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/message. py", {st_mode= S_IFREG| 0644, st_size=13718, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/medium. py", {st_mode= S_IFREG| 0644, st_size=35183, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ osutils. py", {st_mode= S_IFREG| 0644, st_size=61815, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ osutils. py", {st_mode= S_IFREG| 0644, st_size=61815, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 "/usr/local/ lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", {st_mode= S_IFREG| 0644, st_size=50748, ...}) = 0 {1249995453, 349641}, NULL) = 0
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8bf03a8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
mmap2(NULL, 5746688, PROT_READ|
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
send(6, "b\0\0\
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
send(6, "\313z\
send(6, "\242&\
futex(0x8ac2260, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
stat64(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
gettimeofday(
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
write(3, "91094.887 Traceback (most recen"..., 2055) = 2055
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
recv(6, "", 65536, 0) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
recv(6, "", 65536, 0) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0xbbf5418, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
futex(0x8b669f8, FUTEX_WAKE, 1) = 0
futex(0x8ac2198, FUTEX_WAKE, 1) = 0
futex(0x8adca10, FUTEX_WAKE, 1) = 0
futex(0x8adca10, FUTEX_WAKE, 1) = 0
futex(0x8c96cf8, FUTEX_WAKE, 1) = 0
_exit(0) = ?
Process 9703 detached
At this point the log shows: lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 893, in accept_bytes coder.accept_ bytes(self, bytes) lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 396, in accept_bytes state_accept( ) lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 998, in _state_ accept_ expecting_ message_ part lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 1036, in done SmartMessageHan dlerError( sys.exc_ info()) dlerError: The message handler raised an exception: lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 1034, in done message_ handler. end_received( ) lib/python2. 4/site- packages/ bzrlib/ smart/message. py", line 167, in end_received responder. send_response( self.request_ handler. response) lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 1174, in send_response _write_ prefixed_ body(chunk) lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 1110, in _write_ prefixed_ body _write_ func(struct. pack('! L', len(bytes))) lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 1069, in _write_func lib/python2. 4/site- packages/ bzrlib/ smart/protocol. py", line 1073, in flush _real_write_ func('' .join(self. _buf)) lib/python2. 4/site- packages/ bzrlib/ smart/medium. py", line 298, in _write_out send_all( self.socket, bytes, self._report_ activity) lib/python2. 4/site- packages/ bzrlib/ osutils. py", line 1724, in send_all no_eintr( socket. sendall, block) lib/python2. 4/site- packages/ bzrlib/ osutils. py", line 1803, in until_no_eintr
91094.887 Traceback (most recent call last):
File "/usr/local/
_StatefulDe
File "/usr/local/
self.
File "/usr/local/
self.done()
File "/usr/local/
raise errors.
SmartMessageHan
Traceback (most recent call last):
File "/usr/local/
self.
File "/usr/local/
self.
File "/usr/local/
self.
File "/usr/local/
self.
File "/usr/local/
self.flush()
File "/usr/local/
self.
File "/usr/local/
osutils.
File "/usr/local/
until_
File "/usr/local/
return f(*a, **kw)
File "<string>", line 1, in sendall
error: (104, 'Connection reset by peer')
I think I know what at least is causing this: when this happens the client has not seen data for > 5 minutes. At this point appearently the firewall closes the connection (because no keepalive is active). Then when the server is finally ready to continue it gets the indication the connection is gone and aborts. The client gets no such indication and waits forever.
The base problem seems to be that because of the other problems we have querying the repository takes so long that the connection gets dropped by the firewall.