This bug has also been reproduced with a tar-pipe-tar process, using
ssh remote tar cfP - /path/to/dir | tar xf -
After some investigation, it seems there is an issue with the ssh process. When the rsync process is stuck in state 'D', the ssh process is in state 'S+':
The strace output for the ssh process on each side of the network link shows that the processes try to access file descriptors that are not listed in /proc:
client:
=================
root@mail01:/var/log# strace -p 8212
Process 8212 attached - interrupt to quit
select(7, [3 4], [5], NULL, NULL
root@mail01:/var/log# ls -l /proc/8212/fd
total 0
lrwx------ 1 root root 64 May 20 10:28 0 -> socket:[11967]
lrwx------ 1 root root 64 May 20 10:28 1 -> socket:[11970]
l-wx------ 1 root root 64 May 20 10:16 2 -> /var/log/rsync_strace
lrwx------ 1 root root 64 May 20 10:28 3 -> socket:[11980]
lrwx------ 1 root root 64 May 20 10:28 4 -> socket:[11967]
lrwx------ 1 root root 64 May 20 10:28 5 -> socket:[11970]
l-wx------ 1 root root 64 May 20 10:28 6 -> /var/log/rsync_strace
=================
server:
=================
[PROD/MASTER] gforge01:~# strace -p 26107
Process 26107 attached - interrupt to quit
select(14, [3 6], [], NULL, NULL
This bug has also been reproduced with a tar-pipe-tar process, using
ssh remote tar cfP - /path/to/dir | tar xf -
After some investigation, it seems there is an issue with the ssh process. When the rsync process is stuck in state 'D', the ssh process is in state 'S+':
client:
=================
root 651 0.0 0.0 49252 1080 ? Ss 09:53 0:00 /usr/sbin/sshd
root 2681 0.0 0.0 70572 3308 ? Ss 09:53 0:00 \_ sshd: root@pts/0
root 2693 0.0 0.0 19340 3496 pts/0 Ss 09:53 0:00 | \_ -bash
root 8210 0.6 0.0 4328 800 pts/0 S+ 10:00 0:12 | \_ strace rsync -av --ignore-existing gforge01.vprod.infra.s1.p.fti.net:/var/lib/mailman/ /var/lib/mailman
root 8211 0.2 0.3 124076 14536 pts/0 D+ 10:00 0:04 | \_ rsync -av --ignore-existing gforge01.vprod.infra.s1.p.fti.net:/var/lib/mailman/ /var/lib/mailman
root 8212 5.0 0.1 44556 6756 pts/0 S+ 10:00 1:39 | \_ ssh gforge01.vprod.infra.s1.p.fti.net rsync --server --sender -vlogDtpre.iLsf . /var/lib/mailman/
root 8213 3.3 0.4 285744 19876 pts/0 D+ 10:00 1:05 | \_ rsync -av --ignore-existing gforge01.vprod.infra.s1.p.fti.net:/var/lib/mailman/ /var/lib/mailman
=================
server:
=================
root 26107 5.4 0.2 13452 6616 ? Ss 10:00 1:55 \_ sshd: root@notty
root 26154 2.5 1.3 93880 41428 ? Ss 10:00 0:54 | \_ rsync --server --sender -vlogDtpre.iLsf . /var/lib/mailman/
=================
The strace output for the ssh process on each side of the network link shows that the processes try to access file descriptors that are not listed in /proc:
client:
=================
root@mail01:/var/log# strace -p 8212
Process 8212 attached - interrupt to quit
select(7, [3 4], [5], NULL, NULL
root@mail01:/var/log# ls -l /proc/8212/fd
total 0
lrwx------ 1 root root 64 May 20 10:28 0 -> socket:[11967]
lrwx------ 1 root root 64 May 20 10:28 1 -> socket:[11970]
l-wx------ 1 root root 64 May 20 10:16 2 -> /var/log/rsync_strace
lrwx------ 1 root root 64 May 20 10:28 3 -> socket:[11980]
lrwx------ 1 root root 64 May 20 10:28 4 -> socket:[11967]
lrwx------ 1 root root 64 May 20 10:28 5 -> socket:[11970]
l-wx------ 1 root root 64 May 20 10:28 6 -> /var/log/rsync_strace
=================
server:
=================
[PROD/MASTER] gforge01:~# strace -p 26107
Process 26107 attached - interrupt to quit
select(14, [3 6], [], NULL, NULL
[PROD/MASTER] gforge01:~# ls -l /proc/26107/fd/
total 0
lrwx------ 1 root root 64 2010-05-20 10:36 0 -> /dev/null
lrwx------ 1 root root 64 2010-05-20 10:36 1 -> /dev/null
l-wx------ 1 root root 64 2010-05-20 10:36 10 -> pipe:[4066236232]
lr-x------ 1 root root 64 2010-05-20 10:36 11 -> pipe:[4066236233]
lr-x------ 1 root root 64 2010-05-20 10:36 13 -> pipe:[4066236234]
lrwx------ 1 root root 64 2010-05-20 10:36 2 -> /dev/null
lrwx------ 1 root root 64 2010-05-20 10:36 3 -> socket:[4066235605]
lrwx------ 1 root root 64 2010-05-20 10:36 4 -> socket:[4066235803]
lrwx------ 1 root root 64 2010-05-20 10:36 5 -> socket:[4066236227]
lr-x------ 1 root root 64 2010-05-20 10:36 6 -> pipe:[4066236229]
lr-x------ 1 root root 64 2010-05-20 10:36 7 -> /dev/urandom
l-wx------ 1 root root 64 2010-05-20 10:36 8 -> pipe:[4066236229]
=================