Bug 1089934 - list dir with more than N files results in Input/output error
Summary: list dir with more than N files results in Input/output error
Keywords:
Status: CLOSED CURRENTRELEASE
Alias: None
Product: GlusterFS
Classification: Community
Component: fuse
Version: 3.5.0
Hardware: All
OS: Linux
unspecified
high
Target Milestone: ---
Assignee: Niels de Vos
QA Contact:
URL:
Whiteboard:
Depends On: 1074023
Blocks:
TreeView+ depends on / blocked
 
Reported: 2014-04-22 09:09 UTC by Niels de Vos
Modified: 2014-06-24 11:05 UTC (History)
7 users (show)

Fixed In Version: glusterfs-3.5.1beta
Doc Type: Bug Fix
Doc Text:
Clone Of: 1074023
Environment:
Last Closed: 2014-06-24 11:05:01 UTC
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:
Embargoed:


Attachments (Terms of Use)

Description Niels de Vos 2014-04-22 09:09:41 UTC
Clone for tracking the fix for glusterfs-3.5.

+++ This bug was initially created as a clone of Bug #1074023 +++

Tried on Fedora 20 server and client, updated (on 2014/03/07), with gluster 3.4.2.

Create a Gluster filesystem (fails on distributed and replicated fs)
mount -t glusterfs <GLUSTER SHARE> /mnt
cd /mnt
mkdir test
d=1
while [ $d -le 24 ]; do touch test/$d; d=$(( $d + 1 )); done
ls test
   ls: reading directory test: Input/output error
cd /
umount /mnt
mount -t nfs <GLUSTER SHARE> /mnt
cd /mnt
ls test
   success
cd /
umount /mnt
mount -t glusterfs <GLUSTER SHARE> /mnt
cd /mnt
rm -f test/1
ls test
   success

The number of maximum files depends on the filename size.

--- Additional comment from Luis Flores on 2014-03-07 17:47:07 CET ---

glusterfs package version: 3.4.2-1.fc20
kernel version: 3.13.5-202.fc20.x86_64
fuse version: 2.9.3-2.fc20

--- Additional comment from Joe Julian on 2014-03-07 17:59:40 CET ---

Additional notes from the IRC channel:
* Client is an x86_64, server is an arm7hl (32-bit)
* Number of dirents before failure changes based on the length of the filenames, like it's hitting some buffer limit.
* Client and brick logs show no errors
* Can mount via nfs without issue

--- Additional comment from Luis Flores on 2014-03-07 18:05:27 CET ---

strace ls test
execve("/usr/bin/ls", ["ls", "test/"], [/* 22 vars */]) = 0
brk(0)                                  = 0x1807000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20b000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=90898, ...}) = 0
mmap(NULL, 90898, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f487e1f4000
close(3)                                = 0
open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240d@\223=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=144952, ...}) = 0
mmap(0x3d93400000, 2242712, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d93400000
mprotect(0x3d93421000, 2093056, PROT_NONE) = 0
mmap(0x3d93620000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x20000) = 0x3d93620000
mmap(0x3d93622000, 6296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d93622000
close(3)                                = 0
open("/lib64/libcap.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0 \26\300\251=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=21424, ...}) = 0
mmap(0x3da9c00000, 2114112, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da9c00000
mprotect(0x3da9c04000, 2093056, PROT_NONE) = 0
mmap(0x3da9e03000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x3da9e03000
close(3)                                = 0
open("/lib64/libacl.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\200\37@\250=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=39200, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f3000
mmap(0x3da8400000, 2130560, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da8400000
mprotect(0x3da8407000, 2097152, PROT_NONE) = 0
mmap(0x3da8607000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x7000) = 0x3da8607000
close(3)                                = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\36\202\221=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2100672, ...}) = 0
mmap(0x3d91800000, 3924576, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d91800000
mprotect(0x3d919b4000, 2097152, PROT_NONE) = 0
mmap(0x3d91bb4000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b4000) = 0x3d91bb4000
mmap(0x3d91bba000, 16992, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d91bba000
close(3)                                = 0
open("/lib64/libpcre.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0p\27\0\223=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=421144, ...}) = 0
mmap(0x3d93000000, 2511368, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d93000000
mprotect(0x3d93065000, 2093056, PROT_NONE) = 0
mmap(0x3d93264000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x64000) = 0x3d93264000
close(3)                                = 0
open("/lib64/liblzma.so.5", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\3400\300\222=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=155400, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f2000
mmap(0x3d92c00000, 2245240, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d92c00000
mprotect(0x3d92c24000, 2093056, PROT_NONE) = 0
mmap(0x3d92e23000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23000) = 0x3d92e23000
close(3)                                = 0
open("/lib64/libdl.so.2", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\16\0\222=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=22440, ...}) = 0
mmap(0x3d92000000, 2109744, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d92000000
mprotect(0x3d92003000, 2093056, PROT_NONE) = 0
mmap(0x3d92202000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x3d92202000
close(3)                                = 0
open("/lib64/libattr.so.1", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\320\23\200\250=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=22160, ...}) = 0
mmap(0x3da8800000, 2113904, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3da8800000
mprotect(0x3da8804000, 2093056, PROT_NONE) = 0
mmap(0x3da8a03000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3000) = 0x3da8a03000
close(3)                                = 0
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\340m\300\221=\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=150800, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f1000
mmap(0x3d91c00000, 2213104, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3d91c00000
mprotect(0x3d91c18000, 2093056, PROT_NONE) = 0
mmap(0x3d91e17000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x17000) = 0x3d91e17000
mmap(0x3d91e19000, 13552, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3d91e19000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1f0000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e1ee000
arch_prctl(ARCH_SET_FS, 0x7f487e1ee840) = 0
mprotect(0x3d91bb4000, 16384, PROT_READ) = 0
mprotect(0x3d91e17000, 4096, PROT_READ) = 0
mprotect(0x3da8a03000, 4096, PROT_READ) = 0
mprotect(0x3d92202000, 4096, PROT_READ) = 0
mprotect(0x3d92e23000, 4096, PROT_READ) = 0
mprotect(0x3d93264000, 4096, PROT_READ) = 0
mprotect(0x3da8607000, 4096, PROT_READ) = 0
mprotect(0x3da9e03000, 4096, PROT_READ) = 0
mprotect(0x3d93620000, 4096, PROT_READ) = 0
mprotect(0x61a000, 4096, PROT_READ)     = 0
mprotect(0x3d9161f000, 4096, PROT_READ) = 0
munmap(0x7f487e1f4000, 90898)           = 0
set_tid_address(0x7f487e1eeb10)         = 13298
set_robust_list(0x7f487e1eeb20, 24)     = 0
rt_sigaction(SIGRTMIN, {0x3d91c068c0, [], SA_RESTORER|SA_SIGINFO, 0x3d91c0f750}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x3d91c06950, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x3d91c0f750}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
statfs("/sys/fs/selinux", 0x7fff120969a0) = -1 ENOENT (No such file or directory)
statfs("/selinux", 0x7fff120969a0)      = -1 ENOENT (No such file or directory)
brk(0)                                  = 0x1807000
brk(0x1828000)                          = 0x1828000
open("/proc/filesystems", O_RDONLY)     = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20a000
read(3, "nodev\tsysfs\nnodev\trootfs\nnodev\tr"..., 1024) = 390
read(3, "", 1024)                       = 0
close(3)                                = 0
munmap(0x7f487e20a000, 4096)            = 0
open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=106070960, ...}) = 0
mmap(NULL, 106070960, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f4877cc5000
close(3)                                = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, TIOCGWINSZ, {ws_row=48, ws_col=164, ws_xpixel=0, ws_ypixel=0}) = 0
stat("test/", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
openat(AT_FDCWD, "test/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
getdents(3, 0x180cc90, 32768)           = -1 EIO (Input/output error)
open("/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=2492, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f487e20a000
read(4, "# Locale name alias data base.\n#"..., 4096) = 2492
read(4, "", 4096)                       = 0
close(4)                                = 0
munmap(0x7f487e20a000, 4096)            = 0
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, "ls: ", 4ls: )                     = 4
write(2, "reading directory test/", 23reading directory test/) = 23
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/libc.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, ": Input/output error", 20: Input/output error)    = 20
write(2, "\n", 1
)                       = 1
close(3)                                = 0
close(1)                                = 0
close(2)                                = 0
exit_group(2)                           = ?
+++ exited with 2 +++

--- Additional comment from Niels de Vos on 2014-03-07 18:15:18 CET ---

From the strace above, these are the most important lines:

stat("test/", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
openat(AT_FDCWD, "test/", O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC) = 3
getdents(3, 0x180cc90, 32768)           = -1 EIO (Input/output error)



Unfortunately, this happens to me too :-/

My environment:
- 32-bit armv7hl storage servers - glusterfs-3.4.2-1.fc20.armv7hl
- x86_64 workstation - glusterfs-3.4.0.57rhs-1.el6_5.x86_64

Mounting over glusterfs on the storage servers does not have a problem.
When mounting over glusterfs from the workstation, -EIO is returned.
When mounting over nfs from the workstation, all is working fine.

tcpdump/wireshark of the GlusterFS protocol when the error is seen, does not contain an error. This suggests that the problem is in the glusterfs-fuse client, in the kernel/fuse module or in the kernel/vfs.

--- Additional comment from Niels de Vos on 2014-03-08 16:18:11 CET ---

One way to reproduce (same package versions as in comment #4):

1. install a Fedora 20 i386 Gluster server (hostname vm130-12)
2. create and start a volume with one brick
3. install a RHEL-6 x86_64 Gluster client (hostname vm130-11)
4. mount the volume on the client over NFS
   # mount -t nfs vm130-12:/bug-1074023 /mnt/nfs
5. populate the volume with some files
   # for I in {0..50} ; do touch /mnt/nfs/$(uuidgen) ; done
6. mount the volume over glusterfs
   # mount -t glusterfs vm130-12:/bug-1074023 /mnt/glusterfs
7. check for the contents over the glusterfs mountpoint
   # ls /mnt/glusterfs
   ls: reading directory /mnt/glusterfs: Input/output error


This seems to be a problem in the READDIRPLUS implementation. I can not reproduce the problem (yet) when I disable READDIRPLUS on the client side.

   # mount -t glusterfs -o use-readdirp=no vm130-12:/bug-1074023 /mnt/glusterfs

--- Additional comment from Niels de Vos on 2014-03-09 13:38:30 CET ---

Usage:
- written for RHEL6.5 with glusterfs-fuse-3.4.0.59rhs-1.el6_4.x86_64
- make sure to have the matching glusterfs+kernel -debuginfo packages installed
- make sure to have /usr/bin/stap available
- run ./bug-1074023.stp on the glusterfs-client (vm130-11)

When mounting from a 32-bit system:
[root@vm130-11 ~]# ./bug-1074023.stp
Starting probe, press CTRL+C to exit...
send_fuse_data: size = 24
send_fuse_data returns 0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 16
send_fuse_data returns 0
send_fuse_data: size = 4160
fuse_dev_write returns -22
send_fuse_data returns 22
fuse_readdir:1231: err=-5
0xffffffff81196380 returns -5, stack follows:
Returning from:  0xffffffffa042e940 : fuse_readdir+0x0/0x5e0 [fuse]
Returning to  :  0xffffffff81196380
 0xffffffff81196509 (inexact)
 0xffffffff8100b229 (inexact)
 0xffffffff8100b288 (inexact)
^C...exiting!

When mounting from a 64-bit server:
[root@vm130-11 ~]# ./bug-1074023.stp
Starting probe, press CTRL+C to exit...
send_fuse_data: size = 24
send_fuse_data returns 0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 16
send_fuse_data returns 0
send_fuse_data: size = 3808
send_fuse_data returns 0
fuse_readdir:1231: err=0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 3808
send_fuse_data returns 0
fuse_readdir:1231: err=0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 2496
send_fuse_data returns 0
fuse_readdir:1231: err=0
send_fuse_data: size = 104
send_fuse_data returns 0
send_fuse_data: size = 0
send_fuse_data returns 0
fuse_readdir:1231: err=0
^C...exiting!


Note:
64-bit: send_fuse_data: size = 3808 [several calls, 3808 bytes max]
32-bit: send_fuse_data: size = 4160 [4160 is bigger than PAGESIZE=4096]

This is very surprising! Why would a 32-bit server cause a bigger buffer than the 64-bit server?

Lets compare the replies that come in over the network.

Captured two tcpdumps like this:
[root@vm130-11 ~]# tcpdump -i any -s 0 -w /tmp/bug-1074023_32bit.pcap \
    tcp and not port 22
[root@vm130-11 ~]# tcpdump -i any -s 0 -w /tmp/bug-1074023_64bit.pcap \
    tcp and not port 22


Filter only the READDIRP procedures, and we're only interested in the replies (rpc.msgtyp == 1). Combine this with an additional 'grep' to show only some of the lines that interest us:

$ tshark -r bug-1074023_32bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure'
Frame 48: 1024 bytes on wire (8192 bits), 1024 bytes captured (8192 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3848 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]

$ tshark -r /tmp/bug-1074023_64bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure'
Frame 48: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3528 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
Frame 55: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits)
Remote Procedure Call, Type:Reply XID:0x0000000e
    Fragment header: Last fragment, 3528 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
Frame 60: 952 bytes on wire (7616 bits), 952 bytes captured (7616 bits)
Remote Procedure Call, Type:Reply XID:0x00000010
    Fragment header: Last fragment, 2328 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
Frame 65: 112 bytes on wire (896 bits), 112 bytes captured (896 bits)
Remote Procedure Call, Type:Reply XID:0x00000012
    Fragment header: Last fragment, 40 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]


Conclusions:
The 64-bit Gluster server returns multiple READDIRP replies, and the result always fits in one PAGESIZE. The 32-bit Gluster server sends a first reply which does not fit in the PAGESIZE.

In the glusterfs-fuse client, a writev() to /dev/fuse is done (through send_fuse_data()). I currently suspect that the 'struct iovec' passed to writev() is limited to buffers of PAGESIZE.

The underlying cause is probably related to the aligning of the structures returned by the local/posix readdir() call on the Gluster server. The 32-bit architecture can align the structure more efficiently than the 64-bit architecture (bigger gaps between the dentries). When receiving the READDIRP response, the structures get converted and aligned one-by-one (the network protocol does not use aligning, it's a stream).

Checking in the tcpdump again:
$ tshark -r /tmp/bug-1074023_32bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure' -e 'Entries returned'
Frame 48: 1056 bytes on wire (8448 bits), 1056 bytes captured (8448 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3880 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
    Entries returned: 22

$ tshark -r /tmp/bug-1074023_64bit.pcap \
    -Y 'glusterfs.proc == READDIRP && rpc.msgtyp == 1' -V \
    | grep -e '^Frame' -e '^Remote Procedure' -e 'Fragment header' \
           -e 'Program' -e 'Procedure' -e 'Entries returned'
Frame 48: 704 bytes on wire (5632 bits), 704 bytes captured (5632 bits)
Remote Procedure Call, Type:Reply XID:0x0000000c
    Fragment header: Last fragment, 3528 bytes
    [Program: GlusterFS (1298437)]
    [Program Version: 330]
    [Procedure: READDIRP (40)]
    [Program Version: 330]
    Entries returned: 20
...

The 32-bit Gluster server returns 22 dentries in the 1st READDIRP, where the 64-bit Gluster server returns only 20. For both the READDIRP Call contains a maximum size that the Reply may contain (max size is set to 4096).


Planned solution to test:
PAGESIZE and alignments can differ per architecture. There is no reason to make the Gluster servers aware of these details from the client side. Gluster should send big replies where possible, which is a little more efficient than sending multiple smaller replies.

Splitting the list of 'struct iovec' that is passed to writev() into buffers with a maximum of PAGESIZE might work. This would also make it possible to receive more dentries on any architecture.

--- Additional comment from Niels de Vos on 2014-03-09 14:03:20 CET ---

This works for me. I'll do some more verifications later and probably propose it for inclusion over the next few days.

--- Additional comment from Anand Avati on 2014-03-14 16:28:33 CET ---

REVIEW: http://review.gluster.org/7278 (fuse: support bigger than PAGESIZE writes to /dev/fuse) posted (#1) for review on master by Niels de Vos (ndevos)

--- Additional comment from Anand Avati on 2014-03-24 22:35:50 CET ---

REVIEW: http://review.gluster.org/7278 (fuse: support bigger than PAGESIZE writes to /dev/fuse) posted (#2) for review on master by Niels de Vos (ndevos)

--- Additional comment from thomas higdon on 2014-03-25 22:38:05 CET ---

ndevos's patch does not work for me. The attached patch does solve at least my immediate problem.

--- Additional comment from Niels de Vos on 2014-03-27 21:11:21 CET ---

(In reply to thomas higdon from comment #10)
> Created attachment 878702 [details]
> call send_fuse_data multiple times in readdir callback if neccessary

After re-checking, I do not think this is a correct fix. Without testing it myself, I expect it to result in missing directory entries.

My initial patch indeed does not correctly address the problem either. It seems that a fuse-request for readdir(p) includes a maximum size that the response may contain. Any bigger responses will result in writev()-to-"/dev/fuse" errors, which are relatively silently ignored (some log-messages only, no errors returned).

The next patch should solve the issue for readdir(p). Other FOPS could be affected in a similar way, so some logging was added to help in debugging future issues.

--- Additional comment from Anand Avati on 2014-03-27 21:16:29 CET ---

REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#3) for review on master by Niels de Vos (ndevos)

--- Additional comment from Anand Avati on 2014-03-31 15:03:34 CEST ---

REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#4) for review on master by Niels de Vos (ndevos)

--- Additional comment from Niels de Vos on 2014-04-09 13:44:11 CEST ---



--- Additional comment from Anand Avati on 2014-04-20 17:44:46 CEST ---

REVIEW: http://review.gluster.org/7278 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#5) for review on master by Niels de Vos (ndevos)

--- Additional comment from Anand Avati on 2014-04-21 19:25:27 CEST ---

COMMIT: http://review.gluster.org/7278 committed in master by Anand Avati (avati) 
------
commit 20e317011af7c0f075819bf0648b225f6dc42350
Author: Niels de Vos <ndevos>
Date:   Thu Mar 27 20:34:44 2014 +0100

    fuse: prevent READDIR(P) from writing to much data to /dev/fuse
    
    In an environment with mixed architectures (32-bit servers, 64-bit
    client), it is possible that the on-wire Reply on a READDIR(P) procedure
    contains more direntries than the client can fit in the maximum size
    that the fuse-request indicated.
    
    A direntry is a dynamically sized structure, because the structure
    contains the name of the entry. The client sends a maximum size in the
    READDIR(P) Call to the server, and the server uses this size to limit
    the number of direntries to return. In case the server can pack more
    direntries in the requested maximum size (due to alignment differences
    between the architectures), it can happen that the client unpacks the
    list of direntries into a buffer that exceeds the maximum size that was
    given in the initial fuse-request.
    
    This change introduces a check for the maximum requested size of the
    fuse-response in fuse_readdir_cbk() and fuse_readdirp_cbk(). When the
    conversion from gluster-direntries to the fuse-direntry format takes
    place, the maximum size is checked, and the 'extra' direntries are
    discarded. The next readdir()/getdents() that is done, will fetch the
    just discarded direntries again.
    
    In addition to this bugfix, some extra logging in send_fuse_iov() and
    send_fuse_data() has been added to help diagnosing similar issues.
    
    Change-Id: If2eecfcdf9c248f3820035601446d2c89ff9d1a1
    BUG: 1074023
    Signed-off-by: Niels de Vos <ndevos>
    Reviewed-on: http://review.gluster.org/7278
    Tested-by: Gluster Build System <jenkins.com>
    Reviewed-by: Xavier Hernandez <xhernandez>
    Reviewed-by: Anand Avati <avati>

Comment 1 Anand Avati 2014-04-22 10:37:05 UTC
REVIEW: http://review.gluster.org/7522 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#1) for review on release-3.5 by Niels de Vos (ndevos)

Comment 2 Anand Avati 2014-04-27 18:50:26 UTC
REVIEW: http://review.gluster.org/7522 (fuse: prevent READDIR(P) from writing to much data to /dev/fuse) posted (#2) for review on release-3.5 by Niels de Vos (ndevos)

Comment 3 Anand Avati 2014-05-08 17:59:03 UTC
COMMIT: http://review.gluster.org/7522 committed in release-3.5 by Niels de Vos (ndevos) 
------
commit 94282fcae8ecb0d53399baa4e96f3aaa443979f0
Author: Niels de Vos <ndevos>
Date:   Thu Apr 24 13:38:31 2014 +0200

    fuse: prevent READDIR(P) from writing to much data to /dev/fuse
    
    In an environment with mixed architectures (32-bit servers, 64-bit
    client), it is possible that the on-wire Reply on a READDIR(P) procedure
    contains more direntries than the client can fit in the maximum size
    that the fuse-request indicated.
    
    A direntry is a dynamically sized structure, because the structure
    contains the name of the entry. The client sends a maximum size in the
    READDIR(P) Call to the server, and the server uses this size to limit
    the number of direntries to return. In case the server can pack more
    direntries in the requested maximum size (due to alignment differences
    between the architectures), it can happen that the client unpacks the
    list of direntries into a buffer that exceeds the maximum size that was
    given in the initial fuse-request.
    
    This change introduces a check for the maximum requested size of the
    fuse-response in fuse_readdir_cbk() and fuse_readdirp_cbk(). When the
    conversion from gluster-direntries to the fuse-direntry format takes
    place, the maximum size is checked, and the 'extra' direntries are
    discarded. The next readdir()/getdents() that is done, will fetch the
    just discarded direntries again.
    
    In addition to this bugfix, some extra logging in send_fuse_iov() and
    send_fuse_data() has been added to help diagnosing similar issues.
    
    This change combines two commits from master:
    > Commit: 20e317011af7c0f075819bf0648b225f6dc42350
    > Change-Id: If2eecfcdf9c248f3820035601446d2c89ff9d1a1
    > BUG: 1074023
    > Signed-off-by: Niels de Vos <ndevos>
    > Reviewed-on: http://review.gluster.org/7278
    > Tested-by: Gluster Build System <jenkins.com>
    > Reviewed-by: Xavier Hernandez <xhernandez>
    > Reviewed-by: Anand Avati <avati>
    >
    > Commit: 76ab97169f63da78c9e83daf040d7b09766497cf
    > Change-Id: Ic62d9d729a86a1a6a53ed1354fce153bac01d860
    > BUG: 1074023
    > Reported-by: Kaleb S. KEITHLEY <kkeithle>
    > Signed-off-by: Niels de Vos <ndevos>
    > Reviewed-on: http://review.gluster.org/7547
    > Tested-by: Gluster Build System <jenkins.com>
    > Reviewed-by: Anand Avati <avati>
    
    Change-Id: Id847db26d861a9805041ff3367e848da7be05545
    BUG: 1089934
    Signed-off-by: Niels de Vos <ndevos>
    Reviewed-on: http://review.gluster.org/7522
    Tested-by: Gluster Build System <jenkins.com>
    Reviewed-by: Xavier Hernandez <xhernandez>

Comment 4 Niels de Vos 2014-05-25 09:07:58 UTC
The first (and last?) Beta for GlusterFS 3.5.1 has been released [1]. Please verify if the release solves this bug report for you. In case the glusterfs-3.5.1beta release does not have a resolution for this issue, leave a comment in this bug and move the status to ASSIGNED. If this release fixes the problem for you, leave a note and change the status to VERIFIED.

Packages for several distributions should become available in the near future. Keep an eye on the Gluster Users mailinglist [2] and the update (possibly an "updates-testing" repository) infrastructure for your distribution.

[1] http://supercolony.gluster.org/pipermail/gluster-users/2014-May/040377.html
[2] http://supercolony.gluster.org/pipermail/gluster-users/

Comment 5 Niels de Vos 2014-06-24 11:05:01 UTC
This bug is getting closed because a release has been made available that should address the reported issue. In case the problem is still not fixed with glusterfs-3.5.1, please reopen this bug report.

glusterfs-3.5.1 has been announced on the Gluster Users mailinglist [1], packages for several distributions should become available in the near future. Keep an eye on the Gluster Users mailinglist [2] and the update infrastructure for your distribution.

[1] http://supercolony.gluster.org/pipermail/gluster-users/2014-June/040723.html
[2] http://thread.gmane.org/gmane.comp.file-systems.gluster.user


Note You need to log in before you can comment on or make changes to this bug.