Tried on Fedora 20 server and client, updated (on 2014/03/07), with bluster 3.4.2. Create a Gluster filesystem (fails on distributed and replicated fs) mount -t glusterfs <GLUSTER SHARE> /mnt cd /mnt mkdir test d=1 while [ $d -le 24 ]; do touch test/$d; d=$(( $d + 1 )); done ls test ls: reading directory test: Input/output error cd / umount /mnt mount -t nfs <GLUSTER SHARE> /mnt cd /mnt ls test success cd / umount /mnt mount -t glusterfs <GLUSTER SHARE> /mnt cd /mnt rm -f test/1 ls test success The number of maximum files depends on the filename size.
glusters package version: 3.4.2-1.fc20 kernel version: 3.13.5-202.fc20.x86_64 fuse version: 2.9.3-2.fc20
Additional notes from the IRC channel: * Client is an x86_64, server is an arm7hl (32-bit) * Number of dirents before failure changes based on the length of the filenames, like it's hitting some buffer limit. * Client and brick logs show no errors * Can mount via nfs without issue
strace ls test execve("/usr/bin/ls", ["ls", "test/"], [/* 22 vars */]) = 0 brk(0) = 0x1807000 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20b000 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 fstat(3, {st_mode=S_IFREG|0644, st_size=90898, ...}) = 0 mmap(NULL, 90898, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f487e1f4000 close(3) = 0 open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d@\223=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=144952, ...}) = 0 mmap(0x3d93400000, 2242712, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d93400000 mprotect(0x3d93421000, 2093056, PROT_NONE) = 0 mmap(0x3d93620000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x20000) = 0x3d93620000 mmap(0x3d93622000, 6296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d93622000 close(3) = 0 open("/lib64/libcap.so.2", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 \26\300\251=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=21424, ...}) = 0 mmap(0x3da9c00000, 2114112, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da9c00000 mprotect(0x3da9c04000, 2093056, PROT_NONE) = 0 mmap(0x3da9e03000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x3da9e03000 close(3) = 0 open("/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\37@\250=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=39200, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f3000 mmap(0x3da8400000, 2130560, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da8400000 mprotect(0x3da8407000, 2097152, PROT_NONE) = 0 mmap(0x3da8607000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x7000) = 0x3da8607000 close(3) = 0 open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\36\202\221=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=2100672, ...}) = 0 mmap(0x3d91800000, 3924576, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d91800000 mprotect(0x3d919b4000, 2097152, PROT_NONE) = 0 mmap(0x3d91bb4000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b4000) = 0x3d91bb4000 mmap(0x3d91bba000, 16992, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d91bba000 close(3) = 0 open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\27\0\223=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=421144, ...}) = 0 mmap(0x3d93000000, 2511368, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d93000000 mprotect(0x3d93065000, 2093056, PROT_NONE) = 0 mmap(0x3d93264000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x64000) = 0x3d93264000 close(3) = 0 open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\3400\300\222=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=155400, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f2000 mmap(0x3d92c00000, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d92c00000 mprotect(0x3d92c24000, 2093056, PROT_NONE) = 0 mmap(0x3d92e23000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x3d92e23000 close(3) = 0 open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\222=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=22440, ...}) = 0 mmap(0x3d92000000, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d92000000 mprotect(0x3d92003000, 2093056, PROT_NONE) = 0 mmap(0x3d92202000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x3d92202000 close(3) = 0 open("/lib64/libattr.so.1", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\23\200\250=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=22160, ...}) = 0 mmap(0x3da8800000, 2113904, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da8800000 mprotect(0x3da8804000, 2093056, PROT_NONE) = 0 mmap(0x3da8a03000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x3da8a03000 close(3) = 0 open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340m\300\221=\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=150800, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f1000 mmap(0x3d91c00000, 2213104, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d91c00000 mprotect(0x3d91c18000, 2093056, PROT_NONE) = 0 mmap(0x3d91e17000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17000) = 0x3d91e17000 mmap(0x3d91e19000, 13552, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d91e19000 close(3) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f0000 mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1ee000 arch_prctl(ARCH_SET_FS, 0x7f487e1ee840) = 0 mprotect(0x3d91bb4000, 16384, PROT_READ) = 0 mprotect(0x3d91e17000, 4096, PROT_READ) = 0 mprotect(0x3da8a03000, 4096, PROT_READ) = 0 mprotect(0x3d92202000, 4096, PROT_READ) = 0 mprotect(0x3d92e23000, 4096, PROT_READ) = 0 mprotect(0x3d93264000, 4096, PROT_READ) = 0 mprotect(0x3da8607000, 4096, PROT_READ) = 0 mprotect(0x3da9e03000, 4096, PROT_READ) = 0 mprotect(0x3d93620000, 4096, PROT_READ) = 0 mprotect(0x61a000, 4096, PROT_READ) = 0 mprotect(0x3d9161f000, 4096, PROT_READ) = 0 munmap(0x7f487e1f4000, 90898) = 0 set_tid_address(0x7f487e1eeb10) = 13298 set_robust_list(0x7f487e1eeb20, 24) = 0 rt_sigaction(SIGRTMIN, {0x3d91c068c0, [], SA_RESTORER|SA_SIGINFO, 0x3d91c0f750}, NULL, 8) = 0 rt_sigaction(SIGRT_1, {0x3d91c06950, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x3d91c0f750}, NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0 getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0 statfs("/sys/fs/selinux", 0x7fff120969a0) = -1 ENOENT (No such file or directory) statfs("/selinux", 0x7fff120969a0) = -1 ENOENT (No such file or directory) brk(0) = 0x1807000 brk(0x1828000) = 0x1828000 open("/proc/filesystems", O_RDONLY) = 3 fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20a000 read(3, "nodev\tsysfs\nnodev\trootfs\nnodev\tr"..., 1024) = 390 read(3, "", 1024) = 0 close(3) = 0 munmap(0x7f487e20a000, 4096) = 0 open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3 fstat(3, {st_mode=S_IFREG|0644, st_size=106070960, ...}) = 0 mmap(NULL, 106070960, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f4877cc5000 close(3) = 0 ioctl(1, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(1, TIOCGWINSZ, {ws_row=48, ws_col=164, ws_xpixel=0, ws_ypixel=0}) = 0 stat("test/", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0 openat(AT_FDCWD, "test/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3 getdents(3, 0x180cc90, 32768) = -1 EIO (Input/output error) open("/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 4 fstat(4, {st_mode=S_IFREG|0644, st_size=2492, ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20a000 read(4, "# Locale name alias data base.\n#"..., 4096) = 2492 read(4, "", 4096) = 0 close(4) = 0 munmap(0x7f487e20a000, 4096) = 0 open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory) write(2, "ls: ", 4ls: ) = 4 write(2, "reading directory test/", 23reading directory test/) = 23 open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory) write(2, ": Input/output error", 20: Input/output error) = 20 write(2, "\n", 1 ) = 1 close(3) = 0 close(1) = 0 close(2) = 0 exit_group(2) = ? +++ exited with 2 +++
From the strace above, these are the most important lines: stat("test/", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0 openat(AT_FDCWD, "test/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3 getdents(3, 0x180cc90, 32768) = -1 EIO (Input/output error) Unfortunately, this happens to me too :-/ My environment: - 32-bit armv7hl storage servers - glusterfs-3.4.2-1.fc20.armv7hl - x86_64 workstation - glusterfs-3.4.0.57rhs-1.el6_5.x86_64 Mounting over glusterfs on the storage servers does not have a problem. When mounting over glusterfs from the workstation, -EIO is returned. When mounting over nfs from the workstation, all is working fine. tcpdump/wireshark of the GlusterFS protocol when the error is seen, does not contain an error. This suggests that the problem is in the glusterfs-fuse client, in the kernel/fuse module or in the kernel/vfs.
One way reproduce (same package versions as in comment #4): 1. install a Fedora 20 i386 Gluster server (hostname vm130-12) 2. create and start a volume with one brick 3. install a RHEL-6 x86_64 Gluster client (hostname vm130-11) 4. mount the volume on the client over NFS # mount -t nfs vm130-12:/bug-1074023 /mnt/nfs 5. populate the volume with some files # for I in {0..50} ; do touch /mnt/nfs/$(uuidgen) ; done 6. mount the volume over glusterfs # mount -t glusterfs vm130-12:/bug-1074023 /mnt/glusterfs 7. check for the contents over the glusterfs mountpoint # ls /mnt/glusterfs ls: reading directory /mnt/glusterfs: Input/output error This seems to be a problem in the READDIRPLUS implementation. I can not reproduce the problem (yet) when I disable READDIRPLUS on the client side. # mount -t glusterfs -o use-readdirp=no vm130-12:/bug-1074023 /mnt/glusterfs
Created attachment 872404 [details] systemtap script for debugging readdir returning -EIO Usage: - written for RHEL6.5 with glusterfs-fuse-3.4.0.59rhs-1.el6_4.x86_64 - make sure to have the matching glusterfs+kernel -debuginfo packages installed - make sure to have /usr/bin/stap available - run ./bug-1074023.stp on the glusterfs-client (vm130-11) When mounting from a 32-bit system: [root@vm130-11 ~]# ./bug-1074023.stp Starting probe, press CTRL+C to exit... send_fuse_data: size = 24 send_fuse_data returns 0 send_fuse_data: size = 104 send_fuse_data returns 0 send_fuse_data: size = 16 send_fuse_data returns 0 send_fuse_data: size = 4160 fuse_dev_write returns -22 send_fuse_data returns 22 fuse_readdir:1231: err=-5 0xffffffff81196380 returns -5, stack follows: Returning from: 0xffffffffa042e940 : fuse_readdir+0x0/0x5e0 [fuse] Returning to : 0xffffffff81196380 0xffffffff81196509 (inexact) 0xffffffff8100b229 (inexact) 0xffffffff8100b288 (inexact) ^C...exiting! When mounting from a 64-bit server: [root@vm130-11 ~]# ./bug-1074023.stp Starting probe, press CTRL+C to exit... send_fuse_data: size = 24 send_fuse_data returns 0 send_fuse_data: size = 104 send_fuse_data returns 0 send_fuse_data: size = 16 send_fuse_data returns 0 send_fuse_data: size = 3808 send_fuse_data returns 0 fuse_readdir:1231: err=0 send_fuse_data: size = 104 send_fuse_data returns 0 send_fuse_data: size = 3808 send_fuse_data returns 0 fuse_readdir:1231: err=0 send_fuse_data: size = 104 send_fuse_data returns 0 send_fuse_data: size = 2496 send_fuse_data returns 0 fuse_readdir:1231: err=0 send_fuse_data: size = 104 send_fuse_data returns 0 send_fuse_data: size = 0 send_fuse_data returns 0 fuse_readdir:1231: err=0 ^C...exiting! Note: 64-bit: send_fuse_data: size = 3808 [several call,s 3808 bytes max] 32-bit: send_fuse_data: size = 4160 [4160 is bigger than PAGESIZE=4096] This is very surprising! Why would a 32-bit server cause a bigger buffer than the 64-bit server? Lets compare the replies that come in over the network. Captured teo tcpdumps like this: [root@vm130-11 ~]# tcpdump -i any -s 0 -w /tmp/bug-1074023_32bit.pcap \ tcp and not port 22 [root@vm130-11 ~]# tcpdump -i any -s 0 -w /tmp/bug-1074023_64bit.pcap \ tcp and not port 22 Filter only the READDIRP procedures, and we're only interested in the replies (rpc.msgtyp == 1). Combine this with an additional 'grep' to show only some of the lines that interest us: $ tshark -r bug-1074023_32bit.pcap \ -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \ | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \ -e 'Program' -e 'Procedure' Frame 48: 1024 bytes on wire (8192 bits), 1024 bytes captured (8192 bits) Remote Procedure Call, Type:Reply XID:0x0000000c Fragment header: Last fragment, 3848 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] $ tshark -r /tmp/bug-1074023_64bit.pcap \ -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \ | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \ -e 'Program' -e 'Procedure' Frame 48: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits) Remote Procedure Call, Type:Reply XID:0x0000000c Fragment header: Last fragment, 3528 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] Frame 55: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits) Remote Procedure Call, Type:Reply XID:0x0000000e Fragment header: Last fragment, 3528 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] Frame 60: 952 bytes on wire (7616 bits), 952 bytes captured (7616 bits) Remote Procedure Call, Type:Reply XID:0x00000010 Fragment header: Last fragment, 2328 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] Frame 65: 112 bytes on wire (896 bits), 112 bytes captured (896 bits) Remote Procedure Call, Type:Reply XID:0x00000012 Fragment header: Last fragment, 40 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] Conclusions: The 64-bit Gluster server returns multiple READDIRP replies, and the result always fits in one PAGESIZE. The 32-bit Gluster server sends a first reply which does not fit in the PAGESIZE. In the glusterfs-fuse client, a writev() to /dev/fuse is done (through send_fuse_data()). I currently suspect that the 'struct iovec' passed to writev() is limited to buffers of PAGESIZE. The underlying cause is probably related to the aligning of the structures returned by the local/posix readdir() call on the Gluster server. The 32-bit architecture can align the structure more efficiently than the 64-bit architecture (bigger gaps between the dentries). When receiving the READDIRP response, the structures get converted and aligned one-by-one (the network protocol does not use aligning, it's a stream). Checking in the tcpdump again: $ tshark -r /tmp/bug-1074023_32bit.pcap \ -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \ | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \ -e 'Program' -e 'Procedure' -e 'Entries returned' Frame 48: 1056 bytes on wire (8448 bits), 1056 bytes captured (8448 bits) Remote Procedure Call, Type:Reply XID:0x0000000c Fragment header: Last fragment, 3880 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] Entries returned: 22 $ tshark -r /tmp/bug-1074023_64bit.pcap \ -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \ | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \ -e 'Program' -e 'Procedure' -e 'Entries returned' Frame 48: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits) Remote Procedure Call, Type:Reply XID:0x0000000c Fragment header: Last fragment, 3528 bytes [Program: GlusterFS (1298437)] [Program Version: 330] [Procedure: READDIRP (40)] [Program Version: 330] Entries returned: 20 ... The 32-bit Gluster server returns 22 dentries in the 1st READDIPR, where the 64-bit Gluster server returns only 20. For both the READDIRP Call contains a maximum size that the Reply may contain (max size is set to 4096). Planned solution to test: PAGESIZE and alignments can differ per architecture. There is no reason to make the Gluster servers aware of these details from the client side. Gluster should send big replies where possible, which is a little more efficient than sending multiple smaller replies. Splitting the list of 'struct iovec' that is passed to writev() into buffers with a maximum of PAGESIZE might work. This would also make it possible to receive more dentries on any architecture.
Created attachment 872408 [details] Initial patch: write max PAGESIZE bytes to /dev/fuse This works for me. I'll do some more verifications later and probably propose it for inclusion over the next few days.
REVIEW: http://review.gluster.org/7278 (fuse: support bigger than PAGESIZE writes to /dev/fuse) posted (#1) for review on master by Niels de Vos (ndevos)
REVIEW: http://review.gluster.org/7278 (fuse: support bigger than PAGESIZE writes to /dev/fuse) posted (#2) for review on master by Niels de Vos (ndevos)
Created attachment 878702 [details] call send_fuse_data multiple times in readdir callback if neccessary ndevos's patch does not work for me. The attached patch does solve at least my immediate problem.
(In reply to thomas higdon from comment #10) > Created attachment 878702 [details] > call send_fuse_data multiple times in readdir callback if neccessary After re-checking, I do not think this is a correct fix. Without testing it myself, I expect it to result in missing directory entries. My initial patch indeed does not correctly address the problem either. It seems that a fuse-request for readdir(p) includes a maximum size that the response may contain. Any bigger responses will return in writev()-to-"/dev/fuse" errors, which are relatively silently ignored (some log-messages only, no errors returned). The next patch should solve the issue for readdir(p). Other FOPS could be affected in a similar way, so some logging was added to help in debugging future issues.
REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#3) for review on master by Niels de Vos (ndevos)
REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#4) for review on master by Niels de Vos (ndevos)
*** Bug 1085425 has been marked as a duplicate of this bug. ***
REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#5) for review on master by Niels de Vos (ndevos)
COMMIT: http://review.gluster.org/7278 committed in master by Anand Avati (avati) ------ commit 20e317011af7c0f075819bf0648b225f6dc42350 Author: Niels de Vos <ndevos> Date: Thu Mar 27 20:34:44 2014 +0100 fuse: prevent READDIR(P) from writing to much data to /dev/fuse In an environment with mixed architectures (32-bit servers, 64-bit client), it is possible that the on-wire Reply on a READDIR(P) procedure contains more direntries than the client can fit in the maximum size that the fuse-request indicated. A direntry is a dynamically sized structure, because the structure contains the name of the entry. The client sends a maximum size in the READDIR(P) Call to the server, and the server uses this size to limit the number of direntries to return. In case the server can pack more direntries in the requested maximum size (due to alignment differences between the architectures), it can happen that the client unpacks the list of direntries into a buffer that exceeds the maximum size that was given in the initial fuse-request. This change introduces a check for the maximum requested size of the fuse-response in fuse_readdir_cbk() and fuse_readdirp_cbk(). When the conversion from gluster-direntries to the fuse-direntry format takes place, the maximum size is checked, and the 'extra' direntries are discarded. The next readdir()/getdents() that is done, will fetch the just discarded direntries again. In addition to this bugfix, some extra logging in send_fuse_iov() and send_fuse_data() has been added to help diagnosing similar issues. Change-Id: If2eecfcdf9c248f3820035601446d2c89ff9d1a1 BUG: 1074023 Signed-off-by: Niels de Vos <ndevos> Reviewed-on: http://review.gluster.org/7278 Tested-by: Gluster Build System <jenkins.com> Reviewed-by: Xavier Hernandez <xhernandez> Reviewed-by: Anand Avati <avati>
REVIEW: http://review.gluster.org/7547 (fuse: minor improvements for readdir(plus)) posted (#1) for review on master by Niels de Vos (ndevos)
COMMIT: http://review.gluster.org/7547 committed in master by Anand Avati (avati) ------ commit 76ab97169f63da78c9e83daf040d7b09766497cf Author: Niels de Vos <ndevos> Date: Thu Apr 24 13:38:31 2014 +0200 fuse: minor improvements for readdir(plus) Instead of using 'int' for the sizes, use a 'size_t' as it is more correct. Save the size of a fuse_dirent in a temporary variable so that strlen() on the filename is called fewer times. Also correcting some typos in comments. Change-Id: Ic62d9d729a86a1a6a53ed1354fce153bac01d860 BUG: 1074023 Reported-by: Kaleb S. KEITHLEY <kkeithle> Signed-off-by: Niels de Vos <ndevos> Reviewed-on: http://review.gluster.org/7547 Tested-by: Gluster Build System <jenkins.com> Reviewed-by: Anand Avati <avati>