Description of problem:

While running tests on 2.6.18-128.1.1.el5, I saw unaligned access warnings on our ia64 node:

kernel unaligned access to 0xe0000040c13e3537, ip=0xa0000002031ca110

This is down inside gfs2_bitfit():

0xa0000002031c9fa0 <gfs2_bitfit>:	[MII] nop.m 0x0
0xa0000002031c9fa1 <gfs2_bitfit+1>:	extr.u r15=r34,2,30
0xa0000002031c9fa2 <gfs2_bitfit+2>:	zxt1 r35=r35
0xa0000002031c9fb0 <gfs2_bitfit+16>:	[MMB] and r17=3,r34
0xa0000002031c9fb1 <gfs2_bitfit+17>:	addp4 r33=r33,r0
0xa0000002031c9fb2 <gfs2_bitfit+18>:	nop.b 0x0;;
0xa0000002031c9fc0 <gfs2_bitfit+32>:	[MMI] adds r14=7,r15
0xa0000002031c9fc1 <gfs2_bitfit+33>:	addp4 r16=r15,r0
0xa0000002031c9fc2 <gfs2_bitfit+34>:	tbit.z p8,p9=r35,0
0xa0000002031c9fd0 <gfs2_bitfit+48>:	[MMI] shladd r17=r17,1,r0
0xa0000002031c9fd1 <gfs2_bitfit+49>:	add r20=r32,r33
0xa0000002031c9fd2 <gfs2_bitfit+50>:	mov r23=r35;;
0xa0000002031c9fe0 <gfs2_bitfit+64>:	[MMI] nop.m 0x0
0xa0000002031c9fe1 <gfs2_bitfit+65>:	and r14=-8,r14
0xa0000002031c9fe2 <gfs2_bitfit+66>:	add r21=r32,r16
0xa0000002031c9ff0 <gfs2_bitfit+80>:	[MLX] nop.m 0x0
0xa0000002031c9ff1 <gfs2_bitfit+81>:	(p08) movl r22=0x5555555555555555;;
0xa0000002031ca000 <gfs2_bitfit+96>:	[MII] sub r18=r14,r15
0xa0000002031ca001 <gfs2_bitfit+97>:	(p09) mov r22=r0;;
0xa0000002031ca002 <gfs2_bitfit+98>:	cmp4.eq p7,p6=0,r18;;
0xa0000002031ca010 <gfs2_bitfit+112>:	[MIB] (p06) mov r32=r21
0xa0000002031ca011 <gfs2_bitfit+113>:	(p06) mov r16=r17
0xa0000002031ca012 <gfs2_bitfit+114>:	(p06) br.cond.dptk.few 0xa0000002031ca0d0;;
0xa0000002031ca020 <gfs2_bitfit+128>:	[MII] addp4 r2=r14,r0
0xa0000002031ca021 <gfs2_bitfit+129>:	mov r16=r17;;
0xa0000002031ca022 <gfs2_bitfit+130>:	add r14=r32,r2;;
0xa0000002031ca030 <gfs2_bitfit+144>:	[MIB] nop.m 0x0
0xa0000002031ca031 <gfs2_bitfit+145>:	mov r18=r14
0xa0000002031ca032 <gfs2_bitfit+146>:	br.few 0xa0000002031ca130
0xa0000002031ca040 <gfs2_bitfit+160>:	[MMI] addp4 r10=r16,r0
0xa0000002031ca041 <gfs2_bitfit+161>:	ld1 r9=[r32]
0xa0000002031ca042 <gfs2_bitfit+162>:	nop.i 0x0;;
0xa0000002031ca050 <gfs2_bitfit+176>:	[MII] nop.m 0x0
0xa0000002031ca051 <gfs2_bitfit+177>:	shr r8=r9,r10;;
0xa0000002031ca052 <gfs2_bitfit+178>:	and r3=3,r8;;
0xa0000002031ca060 <gfs2_bitfit+192>:	[MIB] nop.m 0x0
0xa0000002031ca061 <gfs2_bitfit+193>:	cmp4.eq p11,p10=r3,r23
0xa0000002031ca062 <gfs2_bitfit+194>:	(p11) br.cond.dpnt.few 0xa0000002031ca180
0xa0000002031ca070 <gfs2_bitfit+208>:	[MII] nop.m 0x0
0xa0000002031ca071 <gfs2_bitfit+209>:	adds r16=2,r16;;
0xa0000002031ca072 <gfs2_bitfit+210>:	nop.i 0x0
0xa0000002031ca080 <gfs2_bitfit+224>:	[MMB] cmp4.lt p13,p12=7,r16
0xa0000002031ca081 <gfs2_bitfit+225>:	nop.m 0x0
0xa0000002031ca082 <gfs2_bitfit+226>:	(p12) br.cond.dptk.few 0xa0000002031ca0d0
0xa0000002031ca090 <gfs2_bitfit+240>:	[MMI] adds r18=-1,r18
0xa0000002031ca091 <gfs2_bitfit+241>:	nop.m 0x0
0xa0000002031ca092 <gfs2_bitfit+242>:	adds r32=1,r32;;
0xa0000002031ca0a0 <gfs2_bitfit+256>:	[MIB] nop.m 0x0
0xa0000002031ca0a1 <gfs2_bitfit+257>:	cmp4.eq p15,p14=0,r18
0xa0000002031ca0a2 <gfs2_bitfit+258>:	(p14) br.cond.dptk.few 0xa0000002031ca0c0
0xa0000002031ca0b0 <gfs2_bitfit+272>:	[MIB] mov r14=r32
0xa0000002031ca0b1 <gfs2_bitfit+273>:	mov r16=r0
0xa0000002031ca0b2 <gfs2_bitfit+274>:	br.few 0xa0000002031ca030;;
0xa0000002031ca0c0 <gfs2_bitfit+288>:	[MMI] mov r16=r0
0xa0000002031ca0c1 <gfs2_bitfit+289>:	nop.m 0x0
0xa0000002031ca0c2 <gfs2_bitfit+290>:	nop.i 0x0
0xa0000002031ca0d0 <gfs2_bitfit+304>:	[MIB] nop.m 0x0
0xa0000002031ca0d1 <gfs2_bitfit+305>:	cmp.ltu p6,p7=r32,r20
0xa0000002031ca0d2 <gfs2_bitfit+306>:	(p06) br.cond.dptk.few 0xa0000002031ca040
0xa0000002031ca0e0 <gfs2_bitfit+320>:	[MIB] nop.m 0x0
0xa0000002031ca0e1 <gfs2_bitfit+321>:	nop.i 0x0
0xa0000002031ca0e2 <gfs2_bitfit+322>:	br.few 0xa0000002031ca1b0
0xa0000002031ca0f0 <gfs2_bitfit+336>:	[MII] lfetch [r15]
0xa0000002031ca0f1 <gfs2_bitfit+337>:	nop.i 0x0
0xa0000002031ca0f2 <gfs2_bitfit+338>:	nop.i 0x0;;
0xa0000002031ca100 <gfs2_bitfit+352>:	[MLX] nop.m 0x0
0xa0000002031ca101 <gfs2_bitfit+353>:	movl r25=0x5555555555555555
0xa0000002031ca110 <gfs2_bitfit+368>:	[MII] ld8 r24=[r32]	<-- HERE
0xa0000002031ca111 <gfs2_bitfit+369>:	adds r18=8,r18;;
0xa0000002031ca112 <gfs2_bitfit+370>:	and r11=r25,r24;;
0xa0000002031ca120 <gfs2_bitfit+384>:	[MIB] nop.m 0x0
0xa0000002031ca121 <gfs2_bitfit+385>:	cmp.eq p9,p8=r22,r11
0xa0000002031ca122 <gfs2_bitfit+386>:	(p08) br.cond.dpnt.few 0xa0000002031ca150
0xa0000002031ca130 <gfs2_bitfit+400>:	[MMI] adds r19=-8,r20
0xa0000002031ca131 <gfs2_bitfit+401>:	mov r32=r18
0xa0000002031ca132 <gfs2_bitfit+402>:	adds r15=8,r18;;
0xa0000002031ca140 <gfs2_bitfit+416>:	[MIB] cmp.ltu p10,p11=r18,r19
0xa0000002031ca141 <gfs2_bitfit+417>:	mov r19=r18
0xa0000002031ca142 <gfs2_bitfit+418>:	(p10) br.cond.dptk.few 0xa0000002031ca0f0;;
0xa0000002031ca150 <gfs2_bitfit+432>:	[MII] nop.m 0x0
0xa0000002031ca151 <gfs2_bitfit+433>:	cmp.ltu p13,p12=r19,r20;;
0xa0000002031ca152 <gfs2_bitfit+434>:	nop.i 0x0
0xa0000002031ca160 <gfs2_bitfit+448>:	[MMB] (p13) mov r18=7
0xa0000002031ca161 <gfs2_bitfit+449>:	nop.m 0x0
0xa0000002031ca162 <gfs2_bitfit+450>:	(p13) br.cond.dptk.few 0xa0000002031ca0d0
0xa0000002031ca170 <gfs2_bitfit+464>:	[MIB] nop.m 0x0
0xa0000002031ca171 <gfs2_bitfit+465>:	nop.i 0x0
0xa0000002031ca172 <gfs2_bitfit+466>:	br.few 0xa0000002031ca1b0
0xa0000002031ca180 <gfs2_bitfit+480>:	[MMI] sub r22=r16,r17
0xa0000002031ca181 <gfs2_bitfit+481>:	nop.m 0x0
0xa0000002031ca182 <gfs2_bitfit+482>:	sub r20=r32,r21;;
0xa0000002031ca190 <gfs2_bitfit+496>:	[MII] nop.m 0x0
0xa0000002031ca191 <gfs2_bitfit+497>:	extr r21=r22,1,31;;
0xa0000002031ca192 <gfs2_bitfit+498>:	add r18=r21,r34;;
0xa0000002031ca1a0 <gfs2_bitfit+512>:	[MIB] nop.m 0x0
0xa0000002031ca1a1 <gfs2_bitfit+513>:	shladd r8=r20,2,r18
0xa0000002031ca1a2 <gfs2_bitfit+514>:	br.ret.sptk.many b0;;
0xa0000002031ca1b0 <gfs2_bitfit+528>:	[MIB] nop.m 0x0
0xa0000002031ca1b1 <gfs2_bitfit+529>:	mov r8=-1
0xa0000002031ca1b2 <gfs2_bitfit+530>:	br.ret.sptk.many b0;;

Version-Release number of selected component (if applicable):
kernel-2.6.18-128.1.1.el5

How reproducible:

Steps to Reproduce:
1.
2.
3.

Actual results:

Expected results:

Additional info:
Several of these messages hit around the time that a few coherency test cases failed when several d_doio processes stalled.
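For what it's worth, the instruction marked "<-- HERE" is an 8-byte load (ld8) through a pointer that is only byte-aligned. In C terms the trap comes from the familiar pattern below - an illustration of the failure mode only, not the actual gfs2 source:

#include <stddef.h>

/* Illustration only: dereferencing a byte pointer as an unsigned long.
 * On ia64 the resulting ld8 traps with "kernel unaligned access"
 * whenever the address is not a multiple of sizeof(unsigned long). */
static unsigned long read_word_at(const unsigned char *buffer, size_t offset)
{
	const unsigned char *byte = buffer + offset;	/* any byte address */
	const unsigned long *plong = (const unsigned long *)byte;

	return *plong;	/* this is the ld8 at gfs2_bitfit+368 above */
}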
Shouldn't be too hard to fix.
I'm not sure if it's relevant or not without checking back through the logs, but the coherency failure happened while running on a 1k file system block size. I'm not sure if the unaligned messages occurred with 4k block size.
It shouldn't matter what the block size is, since we have two bits per block in the bitmaps, so there are still multiple blocks per byte (see the sketch below). It is possible that there is some strange effect that means we always allocate blocks on certain boundaries with different block sizes, but that's unlikely. Maybe it's something to do with the sizes of the files you are creating. Try giving each file a prime number of blocks, I guess, in order to get the best scattering of different starting offsets.
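To make the "two bits per block" point concrete, here is a minimal sketch of the packing. The GFS2_* constant names match the GFS2 on-disk headers; the helper itself is just for illustration:

#include <stdint.h>

#define GFS2_BIT_SIZE	2
#define GFS2_BIT_MASK	0x3
#define GFS2_NBBY	4	/* blocks per byte: 8 bits / 2 bits each */

/* Return the two-bit allocation state of block number "block". */
static inline unsigned char bitmap_get_state(const unsigned char *buf,
					     uint32_t block)
{
	const unsigned char *byte = buf + block / GFS2_NBBY;
	unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;
}

So even with a 512-byte block size, one bitmap byte still covers four file system blocks.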
I think the offending line of code is probably this one:

	misaligned += sizeof(unsigned long) - 1;

I suspect that the -1 is incorrect, since this would appear to make the "unaligned" loop terminate one byte too early and jump back to the "aligned" loop. I need to look at this some more - there seems to be scope for reducing the amount of code somewhat in this function.
I haven't looked at the algorithm lately, but IIRC, this line of code was used when transitioning from byte-at-a-time mode to word-at-a-time mode in the algorithm, so the "- 1" is likely there to adjust for the byte that's currently being examined. In other words, I put it there for a reason, so approach with caution. :7)
Yes - and I think it works ok in all cases bar one, which is when we actually do have a whole unsigned long left, but we got sent to the unaligned code because it was the final unsigned long in the buffer and we didn't want to use the aligned code, because it has a prefetch() in it. So it's a pretty unlikely case all in all, but I can't see any other way in which we could land up with something that's not aligned, unless we have something broken in the code that's calling this function.
Created attachment 332034 [details]
Test prog including new version of bitfit

This is my first go at sorting out the code... basically the plan is to eliminate the unaligned branch of the current code. The test program does the following:

for each block size in (512, 1024, 2048, 4096):
    for each of the two possible bitmap sizes:
        for each "two bit" field in the bitmap (plus the case of not found):
            create a bitmap, set the bits, test that gfs2_bitfit() finds it in the correct place

Needless to say, it's rather slow when it gets to 4096 byte blocks. There are still a couple of issues to check up on:

1. If prefetch() really doesn't mind being pointed into thin air, then we can remove one special case at the end of the block.
2. It would be nice to work out a way to reduce the two tests for the termination of the main loop down to a single test.

Also I've not yet done any speed comparison between the current & new code.

Other thoughts:

1. Is it better to prefetch the words just ahead of time, or run a loop just before we scan the block?
2. Is it better to prefetch on every word, or just when a cache line is crossed?

I have a feeling some of those questions might have been answered the last time we looked at this aspect of performance.
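A rough model of that test loop, for anyone following along - this is a hypothetical harness, not the attached program, and it assumes a gfs2_bitfit() with the usual buffer/length/goal/state signature linked in from the version under test:

#include <assert.h>
#include <stdint.h>
#include <string.h>

extern uint32_t gfs2_bitfit(const unsigned char *buf, unsigned int buflen,
			    uint32_t goal, unsigned char state);

#define GFS2_NBBY	4		/* blocks per byte, two bits each */
#define BFITNOENT	((uint32_t)~0)

static void test_one_size(unsigned int bmsize)	/* bmsize <= 4096 bytes */
{
	unsigned char bitmap[4096];
	uint32_t blk, found;

	for (blk = 0; blk < bmsize * GFS2_NBBY; blk++) {
		memset(bitmap, 0x55, bmsize);	/* every entry "used" (01) */
		/* clear the two bits for blk, i.e. mark just it free */
		bitmap[blk / GFS2_NBBY] &= ~(0x3 << ((blk % GFS2_NBBY) * 2));
		found = gfs2_bitfit(bitmap, bmsize, 0, 0 /* free */);
		assert(found == blk);
	}
	/* and the "not found" case: a bitmap with nothing free */
	memset(bitmap, 0x55, bmsize);
	assert(gfs2_bitfit(bitmap, bmsize, 0, 0) == BFITNOENT);
}

int main(void)
{
	unsigned int sizes[] = { 128, 383, 512 };	/* example sizes */
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		test_one_size(sizes[i]);
	return 0;
}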
I've made a few changes since the previous version in this bz:

1. I've removed prefetch - it was slower with prefetch in my test, so I'm not sure that it's doing anything useful.
2. I've added an extra test at the end to deal with the case that the end of the rgrp is not on a u64 boundary.
3. I've been able to remove one special case.

So the code is now, yet again, smaller and, I hope with the addition of some more comments, easier to understand.
Created attachment 332214 [details]
Upstream patch

This is the patch I posted upstream.
I've been looking into porting this patch to RHEL5 and have a few concerns.

First, there are some typos which I fixed:

--- /home/bob/Download/bitfit.patch.~1~	2009-02-19 15:47:35.000000000 -0600
+++ /home/bob/Download/bitfit.patch	2009-02-19 15:51:07.000000000 -0600
@@ -11,7 +11,7 @@ index a068ac9..c0abe69 100644
 + * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
 + * @state: The state we are searching for
 + *
-+ * We xor the bitmap data with a patter which is the bitwise opposite
++ * We xor the bitmap data with a pattern which is the bitwise opposite
 + * of what we are looking for, this gives rise to a pattern of ones
 + * wherever there is a match. Since we have two bits per entry, we
 + * take this pattern, shift it down by one place and then and it with
@@ -57,7 +57,7 @@ index a068ac9..c0abe69 100644
 + *
 + * The size of the buffer is in bytes, but is it assumed that it is
 + * always ok to to read a complete multiple of 64 bits at the end
-+ * of the block in case the end is no aligned to a natural boundary.
++ * of the block in case the end is not aligned to a natural boundary.
  *
  * Return: the block number (bitmap buffer scope) that was found
  */

Second, the old algorithm was designed to optimize speed on all archs by gearing everything around sizeof(long). The new algorithm uses u64, which matches sizeof(long) on x86_64 but not on 32-bit archs. I'm concerned that performance may suffer on other archs such as 32-bit boxes. Have there been any performance comparisons against the previous algorithm (using scenarios that don't run into the bz problem, that is)? I've got the old and new versions on a 32-bit box, so maybe I'll try to do some comparisons.

Third, the old algorithm had boundary-crossing issues on some archs but not others. Since this one is geared around 64-bit boxes, can we get into other boundary problems with some archs? I personally can't think of a case, but it's worth asking the question anyway.

Fourth, the bit /= 2 is probably optimized into a shift operation by the compiler, but does it make sense to code it as a shift?
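For reference, here is a sketch of the match trick that patch comment describes, using the 0x55... convention quoted above. The function shape is my reconstruction of the idea, not the literal patch code:

#include <stdint.h>

/* One match step: two bits per entry in a 64-bit bitmap word.
 * Returns non-zero iff some entry in "word" has the wanted state. */
static inline uint64_t bit_search(uint64_t word, uint64_t mask,
				  unsigned int state)
{
	/* Bitwise opposite of each two-bit state repeated across a word. */
	static const uint64_t search[] = {
		0xffffffffffffffffULL,	/* looking for 00 (free) */
		0xaaaaaaaaaaaaaaaaULL,	/* looking for 01 */
		0x5555555555555555ULL,	/* looking for 10 */
		0x0000000000000000ULL,	/* looking for 11 */
	};
	uint64_t tmp = word ^ search[state];	/* 11 where entry matches */
	tmp &= (tmp >> 1);	/* low bit survives only if both bits set */
	tmp &= mask;		/* normally 0x5555...; masks search start */
	return tmp;
}

As for the bit /= 2 question: written as a shift it is simply bit >>= 1, and for an unsigned operand gcc generates the same code either way, so it is purely a style choice.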
Yes, I thought about that a bit... to be honest it shouldn't be any slower on 32 bit really; only the shift would need special attention, since it crosses the mid-64-bit word boundary (see the sketch below). There is also the question of how many people will really be running this on 32 bit anyway... probably very few, so I'm not sure it's worth spending too much time optimising for that case. I could have used unsigned longs, I suppose, but it does add a number of extra cases to the code, and each extra case needs testing.

I did try a number of tests to see if certain changes made any difference - mostly they were down in the noise really. Removing the prefetch() did seem to speed things up, but it is tricky to test since the chances are everything was in cache anyway with my test prog. We can always put it back if it turns out that it is needed later on.

There should be no boundary crossing issues, since the pointer will always be aligned to its natural (64-bit) boundary, and the test program verifies that. I guess we could optimise the /= 2 in case the compiler doesn't, but it's really not a big issue as it doesn't occur in the main loop, which is where all the time is spent. The doc changes look good though, and it would be nice to get that fixed up.

We could spend more time looking into the performance, but at the moment it's probably more important that we have the alignment issue fixed. There isn't a big difference in the number of instructions in the inner loop, so I hope that it will not make too much difference to the speed, which I would expect to be dominated by the cache performance and the bandwidth to memory in most cases. We do gain something by only exiting the main loop when we've really found something, whereas in the original there were a few odd corner cases which could cause a jump to the slow path. It's tricky to say how much we gain though.
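On the 32-bit shift point: a 64-bit logical shift by one has to carry a bit across the two 32-bit halves, along the lines of this sketch (which is what the shrdl/shrl pair in the i386 disassembly later in this bug is doing):

#include <stdint.h>

/* Sketch: 64-bit right-shift by one on a 32-bit machine, using the
 * value split into hi/lo words. Bit 32 crosses into bit 31 of lo,
 * which is why gcc emits shrdl $1 followed by shrl on i386. */
static inline void shr64_by1(uint32_t *hi, uint32_t *lo)
{
	*lo = (*lo >> 1) | (*hi << 31);	/* carry the boundary bit across */
	*hi >>= 1;
}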
I have not tested the performance of this algorithm on x86_64, but I can tell you for sure that performance is unacceptable on i386. I ran bonnie++ on my trin-10 system using the same system, same hw, same everything, both times after a clean reboot and clean mkfs.gfs2. The writes are ~23 TIMES (not percent) slower with this algorithm. With the old algorithm, the test took approximately 5 minutes to run. With the new algorithm, the test took more than an hour to run.

I was so surprised by the results that I double-checked that I did everything correctly and I completely redid the bonnie++ test with the new algorithm a second time. In other words, I:

1. Compiled gfs2 with the new algorithm and installed it into /lib
2. Rebooted
3. mkfs
4. Ran bonnie++ with the new algorithm (> 1 hour)
5. Recompiled and installed the old algorithm
6. Rebooted
7. mkfs
8. Ran bonnie++ with the old algorithm (5 minutes)
9. Recompiled and installed the new algorithm again
10. Rebooted
11. mkfs
12. Ran bonnie++ with the new algorithm a second time (> 1 hour)

The results for the new algorithm were the same. Here are their respective outputs:

(after fresh reboot and fresh mkfs) With the current (old) bitfit:

[root@trin-10 /mnt/bob]# /home/bob/bonnie++-1.03e/bonnie++ -u root
Version 1.03e       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
trin-10          1G 26719  76 28747  10 14193   4 32256  89 36706   3 160.0   0
                    ------Sequential Create------ --------Random Create--------
                    -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
              files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
                 16  2255  17 +++++ +++ 23662  97  1211  25 +++++ +++ 22936  95
trin-10,1G,26719,76,28747,10,14193,4,32256,89,36706,3,160.0,0,16,2255,17,+++++,+++,23662,97,1211,25,+++++,+++,22936,95

(after fresh reboot and fresh mkfs) With the proposed (new) bitfit patch:

[root@trin-10 /mnt/bob]# /home/bob/bonnie++-1.03e/bonnie++ -u root
Version 1.03e       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
trin-10          1G  1277   3  1224   0   543   0  1139   3  1146   0 105.2   0
                    ------Sequential Create------ --------Random Create--------
                    -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
              files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
                 16   736   5 +++++ +++ 23880  95   347   7 +++++ +++   708   2
trin-10,1G,1277,3,1224,0,543,0,1139,3,1146,0,105.2,0,16,736,5,+++++,+++,23880,95,347,7,+++++,+++,708,2

Again, this is only on i386. I haven't tested x86_64 performance.
There are some very odd results in the bonnie output.... Firstly, bitfit is only used when allocating blocks, so I wonder why the figures for rewrite and read have both taken a dive. Secondly, it seems that the delete figure is faster with the new patch, but only in the sequential case; it seems to be much slower in the random case. That again makes no sense, as bitfit would not be used during deletion.

Did you try looking at the assembler output on i386 and comparing it with the x86_64 code? Maybe we are a bit short of registers in the main loop, which is why it goes slower, but even then a factor of 23 seems a bit unlikely. I'm just running another postmark test on x86_64 atm to see if there has been any change, but I don't remember seeing any particular change in the performance before I went away - best to check again I guess....
So postmark on x86_64 does look as if it might be a little slower, but not a huge amount. Certainly nothing like the bonnie runs that you've seen. Also I have a lot of options like quotas, etc. turned on at the moment, so it might be partly to do with that. I think this will take some more investigation to get to the bottom of the problem.
I've gone back to my test program, and I've been running the original bitfit in it to see if I can work out why it fails to give correct answers, and I think I've made some progress at least. The first failure is this:

[steve@men-an-tol ~]$ ./a.out
Testing block size: 511 Bitmap size 383
Search pattern: 0
Failure: bit=32 of 1533, state=0 goal=1 rv=4294967295

So that's the 32nd block (i.e. the first set of two bits in the 9th byte), so it's the second 64-bit word (this is on x86_64, btw). I think I can see what's going on too. In bitfit itself the "bit" variable is set at the start according to:

	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;

however, if we then "goto ulong_aligned" we eventually land up returned to the "misaligned" code, but without updating "bit", so that we land up looking at an offset (in this case, 2 bits) into the byte and thus miss spotting that the first two bits were in fact a valid pattern. If I update the fast path to:

	while ((unsigned char *)plong < end - sizeof(unsigned long)) {
		prefetch(plong + 1);
		if (((*plong) & LBITMASK) != lskipval)
			break;
		plong++;
		bit = 0;	/* <--- now sets bit to zero after one iteration */
	}

then it seems to pass the test - will have some performance results shortly.
Results of the original algorithm:

real	29m5.749s
user	29m4.457s
sys	0m0.060s

Results of the new algorithm:

real	28m12.130s
user	28m10.876s
sys	0m0.051s

So it looks like the new code is slightly faster, although there is not a lot in it really. Might be a good idea to repeat this test on i386, I suppose.... Either way I think that any change in benchmark performance that we see is more likely to be related to incorrect generation of the "goal" blocks rather than anything relating to bitfit.
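For the record, that is 1745.7 s versus 1692.1 s of user time, so the new code is about (1745.7 - 1692.1) / 1745.7 = 3% faster here.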
Created attachment 333989 [details]
Latest test prog

This includes the small fix to the original algorithm; you have to edit the #if in the source to select which algorithm to use. Remember to compile with -O2 as well if you use this (i.e. the same as the kernel).
Results on i386 for the new algorithm:

real	56m53.794s
user	56m47.976s
sys	0m2.533s

now running the old one...
Results on i386 for the old algorithm:

real	45m28.770s
user	43m55.937s
sys	0m1.654s

So the old code is faster by 11 min or so - about 20% of the new code's runtime (2728.8 s vs 3413.8 s). I think, looking at the asm produced, that the issue is that the compiler has missed out on a number of possible optimisations along the way, and has thus run out of registers in the main loop:

.L50:
	movl	-24(%ebp), %ecx
	movl	-20(%ebp), %ebx
	xorl	(%esi), %ecx
	xorl	4(%esi), %ebx
	movl	%ecx, %eax
	movl	%ebx, %edx
	andl	$1431655765, %eax
	andl	$1431655765, %edx
	jmp	.L46

It hasn't spotted that the two halves of the "search" value will be the same as each other, and can thus be in the same register. If you look near .L46, it does at least manage to do the shifting in one instruction, so that's not as bad as I thought:

.L46:
	shrdl	$1, %ebx, %ecx
	movl	%eax, %edi
	andl	%ecx, %edi
	addl	$8, %esi
	shrl	%ebx
	movl	%edi, -32(%ebp)
	movl	%edx, %edi
	andl	%ebx, %edi
	movl	%edi, %eax
	orl	-32(%ebp), %eax
	movl	%edi, -28(%ebp)
	jne	.L6
	cmpl	%esi, -36(%ebp)
	ja	.L50

I wonder if the rest of this couldn't be tidied up too, but I'm not sure it would be easy to do something in C which would be generic across arches. Need to think about that a bit. Maybe the while loop needs to be marked "likely" so that we know that looping is the common case, as that might remove the "jmp .L46" from the loop.

Anyway, the conclusions are:
a) This needs a bit more thought to optimise for i386.
b) There is nowhere near a 23x speed difference, even on i386, so we should look carefully at how "goal" is generated and which rgrps we choose to look at in the allocation code, since this will likely be the answer for any benchmark performance changes that we see.
c) The new code does fix the alignment issue on ia64 (from earlier tests on ia64 and also the careful design/test to avoid such problems).
Created attachment 334516 [details]
Patch to skip the scanning of full bitmap blocks

This patch sets a flag on each bitmap when we have scanned the complete bitmap and discovered that it is full. We reset the flag (a) when we get a glock on an rgrp and (b) when we have deleted something in that rgrp bitmap. A sketch of the idea follows.

This has reminded me what a mess the rgrp code is. I've added a bunch of other ideas for improvements to the (rather old) rgrp bug. There is still a lot of scope for clean up though. It would be nice to unroll the loop in rgblk_search so that in the common case we can always call gfs2_bitfit with a constant 0 argument for "goal". Also we should separate the "looking for free blocks" case from the "looking for unlinked inodes" case, since there isn't really a lot in common between the two cases.
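In outline, the idea looks something like this. This is a user-space model; the flag name and helpers are illustrative, not the patch's actual identifiers:

/* Model of the skip-full-bitmaps idea: one flag per bitmap block. */
struct bitmap_blk {
	unsigned long flags;
#define BIF_FULL	0x1	/* assumed flag name, for illustration */
	/* ... bitmap data, length, offsets ... */
};

/* Allocator's scan loop asks this before scanning a bitmap at all. */
static int worth_scanning(const struct bitmap_blk *bi)
{
	return !(bi->flags & BIF_FULL);
}

/* Called after a complete scan found nothing free. */
static void scan_found_nothing(struct bitmap_blk *bi)
{
	bi->flags |= BIF_FULL;	/* skip this bitmap on subsequent scans */
}

/* Called when the rgrp glock is acquired or blocks are freed here. */
static void may_have_space_again(struct bitmap_blk *bi)
{
	bi->flags &= ~BIF_FULL;
}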
The patch from comment #20 won't apply to a recent RHEL5 kernel because the rgrp.c is too different from the upstream version. Can you post a RHEL5 version of the patch, or else a patch that catches rgrp.c up to the upstream version?
Ok, let's leave it for the moment. I guess there are other things which are more important right now. I had been assuming that you were using an upstream kernel to test. The patch does assume that the original bitfit patch and a couple of updates have been applied. I want to have a look at the rgrp code in a bit more detail anyway, so it's not a big issue.
Created attachment 340501 [details]
RHEL5 port of upstream patch
The patch from comment #23 applies to the kernel correctly, but it still has the performance problem I noted in comment #12. I upgraded my i686 test system to pre-release RHEL5.4, on the 2.6.18-140.el5 kernel for i686.

Before the patch:
--------------------------------------------
Version 1.03e       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
trin-10          1G 32747  93 57612  20 30258  11 36205  90 81191   7 214.0   0
                    ------Sequential Create------ --------Random Create--------
                    -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
              files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
                 16  8675  44 +++++ +++ 32242 100  3624  54 +++++ +++ 31556  99
trin-10,1G,32747,93,57612,20,30258,11,36205,90,81191,7,214.0,0,16,8675,44,+++++,+++,32242,100,3624,54,+++++,+++,31556,99

real	3m0.700s
user	0m51.385s
sys	0m18.691s

After the patch:
---------------------------------------------
Version 1.03e       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
trin-10          1G  1501   4  1483   0   908   0  2217   6  2538   0 156.6   0
                    ------Sequential Create------ --------Random Create--------
                    -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
              files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
                 16  3302  16 +++++ +++ 32767  99  2315  34 +++++ +++ 31531 100
trin-10,1G,1501,4,1483,0,908,0,2217,6,2538,0,156.6,0,16,3302,16,+++++,+++,32767,99,2315,34,+++++,+++,31531,100

real	58m38.119s
user	0m55.769s
sys	0m22.641s

The only thing that's changed between these two runs is the patch. Both runs were done on the same kernel, same hardware, same software, after a fresh reboot and after a fresh mkfs.
We could do with some x86_64 results too, as more people are likely to be using that than i686 these days. Bearing in mind the other tests we've done, it's very unlikely that this patch is causing the problems seen here; the more likely explanation is that the numbers which we are feeding it are not quite right somehow. Perhaps I need to dig out the other patch so that we can try that on RHEL as well. Also I'd really like to do some tests with tracepoints, which means using an upstream kernel, just to see what is really going on here.

The fact that the read speeds drop suggests that something odd is going on wrt allocation. It would be worth trying some other benchmarks too, just in case bonnie++ is doing something which makes it a worst case for some reason. Maybe we could even use fiemap, if the files are left around after the test, in order to see how fragmented they are (see the sketch below).
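For the fragmentation check, something along these lines would do - a minimal FIEMAP sketch, which of course needs a kernel with fiemap support, i.e. upstream rather than 2.6.18:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

/* Print the first 32 extents of the file open on fd. */
static void dump_extents(int fd)
{
	unsigned int i;
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				   32 * sizeof(struct fiemap_extent));

	if (!fm)
		return;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 32;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("Extent %u: start: %llu length: %llu flags 0x%x\n",
			       i,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	free(fm);
}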
Here is a fiemap of the file which Abhi created yesterday on exxon-01, which shows the issue nicely:

[root@exxon-01 gfs2]# ~/fiemap -r foo
Extent 0: start: 135745536 length: 4096 flags 0x1000
Extent 1: start: 135876608 length: 4096 flags 0x1000
Extent 2: start: 136007680 length: 4096 flags 0x1000
Extent 3: start: 136138752 length: 4096 flags 0x1000
Extent 4: start: 136269824 length: 4096 flags 0x1000
Extent 5: start: 136400896 length: 4096 flags 0x1000
Extent 6: start: 136531968 length: 4096 flags 0x1000
Extent 7: start: 136663040 length: 4096 flags 0x1000
Extent 8: start: 136794112 length: 4096 flags 0x1000
Extent 9: start: 136925184 length: 4096 flags 0x1000
Extent 10: start: 137056256 length: 4096 flags 0x1000
Extent 11: start: 137187328 length: 4096 flags 0x1000
Extent 12: start: 137318400 length: 4096 flags 0x1000
Extent 13: start: 137449472 length: 4096 flags 0x1000
Extent 14: start: 137580544 length: 4096 flags 0x1000
Extent 15: start: 137711616 length: 4096 flags 0x1000
Extent 16: start: 137842688 length: 4096 flags 0x1000
Extent 17: start: 137973760 length: 4096 flags 0x1000
Extent 18: start: 138104832 length: 4096 flags 0x1000
Extent 19: start: 138235904 length: 4096 flags 0x1000
Extent 20: start: 138366976 length: 4096 flags 0x1000
Extent 21: start: 138498048 length: 4096 flags 0x1000
Extent 22: start: 138629120 length: 4096 flags 0x1000
Extent 23: start: 138760192 length: 4096 flags 0x1000
Extent 24: start: 138891264 length: 4096 flags 0x1000
Extent 25: start: 139022336 length: 4096 flags 0x1000
Extent 26: start: 139153408 length: 4096 flags 0x1000
Extent 27: start: 139284480 length: 4096 flags 0x1000
Extent 28: start: 139415552 length: 4096 flags 0x1000
Extent 29: start: 139546624 length: 4096 flags 0x1000
Extent 30: start: 139677696 length: 4096 flags 0x1000
Extent 31: start: 139808768 length: 4096 flags 0x1000

Note the start address in each case and the fact that each extent is only 4096 bytes long - there are no longer extents as one might expect.
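The stride is easy to read off: 135876608 - 135745536 = 131072 bytes = 32 x 4096, so only one block in every 32 is being allocated, with a 31-block hole after each one.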
Some further info: the inode number of that inode is 33133, and gfs2_edit says:

Dinode:
  mh_magic            0x01161970 (hex)
  mh_type             4               0x4
  mh_format           400             0x190
  no_formal_ino       25              0x19
  no_addr             33133           0x816d
  di_mode             0100644 (decimal)
  di_uid              0               0x0
  di_gid              0               0x0
  di_nlink            1               0x1
  di_size             4194304         0x400000
  di_blocks           1028            0x404
  di_atime            1240391618      0x49eedfc2
  di_mtime            1240378570      0x49eeacca
  di_ctime            1240378570      0x49eeacca
  di_major            0               0x0
  di_minor            0               0x0
  di_goal_meta        33140           0x8174
  di_goal_data        33140           0x8174

Note that even after allocating all those blocks, the goal is still set to exactly one block _before_ the first allocated block in the extent list above (135745536 / 4096 = 33141), so it looks like the goal is not being updated correctly for some reason. That still doesn't explain the fragmentation though.
There is also a strange pattern in the allocation - gaps of 32 blocks, which is rather odd.
I've found the cause of the gaps now. Should have an updated patch shortly.
Same test again on a patched upstream:

[root@men-an-tol gfs0]# ~steve/fiemap -r ./test
Extent 0: start: 135675904 length: 409600 flags 0x1001

That looks much better :-)
Created attachment 340731 [details]
Fix to upstream patch - preliminary

Since there is no __ffs64 operation in the upstream code, I need to look into how to add that. This corrects both of the issues that I found earlier, so I hope that we can at least use this for RHEL5, even if we put the __ffs64 into the asm headers where it belongs for upstream.
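For reference, a generic __ffs64 can be built from a 32-bit find-first-set. A sketch of the shape it might take, assuming the argument is non-zero (the usual __ffs contract; my_ffs32 stands in for the arch-optimised __ffs):

#include <stdint.h>

/* Portable stand-in for the arch __ffs: index of lowest set bit. */
static inline unsigned int my_ffs32(uint32_t x)
{
	unsigned int n = 0;

	while (!(x & 1)) {	/* x must be non-zero */
		x >>= 1;
		n++;
	}
	return n;
}

/* Generic 64-bit version: check the low word first. */
static inline unsigned int my_ffs64(uint64_t word)
{
	if ((uint32_t)word)
		return my_ffs32((uint32_t)word);
	return my_ffs32((uint32_t)(word >> 32)) + 32;
}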
I tested the "2.6.18-140.gfs2abhi.004" kernel on x86_64 and found bonnie++ performance to exactly match the stock 2.6.18-140 kernel, so the x86_64 version apparently has no performance loss. I haven't tried 32-bit yet, but I likely can't get to that for a few hours.
Created attachment 340967 [details]
combined RHEL5 patch

Posted this patch to rhkernel-list. It combines the previous RHEL5 patch along with a RHEL5 port of the previous (upstream fix) patch.
I re-tested performance on i686 with this patch and it's back to normal, so it's got my buy-in.
in kernel-2.6.18-141.el5
You can download this test kernel from http://people.redhat.com/dzickus/el5

Please do NOT transition this bugzilla state to VERIFIED until our QE team has sent specific instructions indicating when to do so. However feel free to provide a comment indicating that this fix has been verified.
An advisory has been issued which should help the problem described in this bug report. This report is therefore being closed with a resolution of ERRATA. For more information on the solution and/or where to find the updated files, please follow the link below. You may reopen this bug report if the solution does not work for you.

http://rhn.redhat.com/errata/RHSA-2009-1243.html