Login
[x]
Log in using an account from:
Fedora Account System
Red Hat Associate
Red Hat Customer
Or log in using a Red Hat Bugzilla account
Forgot Password
Login:
Hide Forgot
Create an Account
Red Hat Bugzilla – Attachment 599320 Details for
Bug 826997
grep -i (case-insensitive) is broken with UTF8
[?]
New
Simple Search
Advanced Search
My Links
Browse
Requests
Reports
Current State
Search
Tabular reports
Graphical reports
Duplicates
Other Reports
User Changes
Plotly Reports
Bug Status
Bug Severity
Non-Defaults
|
Product Dashboard
Help
Page Help!
Bug Writing Guidelines
What's new
Browser Support Policy
5.0.4.rh83 Release notes
FAQ
Guides index
User guide
Web Services
Contact
Legal
This site requires JavaScript to be enabled to function correctly; please enable it.
[patch]
Backported fix (including tests)
grep-2.6.3-i-fix.patch (text/plain), 14.13 KB, created by
Jaroslav Škarvada
on 2012-07-20 07:24:26 UTC
(
hide
)
Description:
Backported fix (including tests)
Filename:
MIME Type:
Creator:
Jaroslav Škarvada
Created:
2012-07-20 07:24:26 UTC
Size:
14.13 KB
patch
obsolete
>diff --git a/THANKS b/THANKS >index 9e05912..e21ec72 100644 >--- a/THANKS >+++ b/THANKS >@@ -78,6 +78,7 @@ Shannon Hill <hill@synnet.com> > Sotiris Vassilopoulos <Sotiris.Vassilopoulos@betatech.gr> > Standish Parsley <adsspamtrap01@yahoo.com> > Stewart Levin <stew@sep.stanford.edu> >+Strahinja Kustudic <kustodian@gmail.com> > Sven Joachim <svenjoac@gmx.de> > Sydoruk Stepan <step@unitex.kiev.ua> > Tapani Tarvainen <tt@mit.jyu.fi> >diff --git a/src/dfasearch.c b/src/dfasearch.c >index 5de40b6..0b358f3 100644 >--- a/src/dfasearch.c >+++ b/src/dfasearch.c >@@ -65,11 +65,12 @@ kwsincr_case (const char *must) > { > const char *buf; > size_t n; >+ mb_len_map_t *map = NULL; > > n = strlen (must); > #ifdef MBS_SUPPORT > if (match_icase && MB_CUR_MAX > 1) >- buf = mbtolower (must, &n); >+ buf = mbtolower (must, &n, &map); > else > #endif > buf = must; >@@ -204,9 +205,13 @@ EGexecute (char const *buf, size_t size, size_t *match_size, > { > char const *buflim, *beg, *end, *match, *best_match, *mb_start; > char eol = eolbyte; >- int backref, start, len, best_len; >+ int backref; >+ regoff_t start; >+ ptrdiff_t len, best_len; > struct kwsmatch kwsm; > size_t i, ret_val; >+ mb_len_map_t *map = NULL; >+ > #ifdef MBS_SUPPORT > if (MB_CUR_MAX > 1) > { >@@ -214,7 +219,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, > { > /* mbtolower adds a NUL byte at the end. That will provide > space for the sentinel byte dfaexec may add. 
*/ >- char *case_buf = mbtolower (buf, &size); >+ char *case_buf = mbtolower (buf, &size, &map); > if (start_ptr) > start_ptr = case_buf + (start_ptr - buf); > buf = case_buf; >@@ -385,9 +390,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, > > success: > len = end - beg; >- success_in_len: >+ success_in_len:; >+ size_t off = beg - buf; >+ mb_case_map_apply (map, &off, &len); > *match_size = len; >- ret_val = beg - buf; >+ ret_val = off; > out: > return ret_val; > } >diff --git a/src/kwsearch.c b/src/kwsearch.c >index 973eb60..9d44f08 100644 >--- a/src/kwsearch.c >+++ b/src/kwsearch.c >@@ -34,12 +34,13 @@ Fcompile (char const *pattern, size_t size) > { > char const *beg, *end, *lim, *err, *pat; > size_t psize; >+ mb_len_map_t *map = NULL; > > kwsinit (&kwset); > psize = size; > #ifdef MBS_SUPPORT > if (match_icase && MB_CUR_MAX > 1) >- pat = mbtolower (pattern, &psize); >+ pat = mbtolower (pattern, &psize, &map); > else > #endif > pat = pattern; >@@ -81,16 +82,18 @@ Fexecute (char const *buf, size_t size, size_t *match_size, > char const *start_ptr) > { > char const *beg, *try, *end, *mb_start; >- size_t len; >+ ptrdiff_t len; > char eol = eolbyte; > struct kwsmatch kwsmatch; > size_t ret_val; >+ mb_len_map_t *map = NULL; >+ > #ifdef MBS_SUPPORT > if (MB_CUR_MAX > 1) > { > if (match_icase) > { >- char *case_buf = mbtolower (buf, &size); >+ char *case_buf = mbtolower (buf, &size, &map); > if (start_ptr) > start_ptr = case_buf + (start_ptr - buf); > buf = case_buf; >@@ -166,9 +169,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size, > while (buf < beg && beg[-1] != eol) > --beg; > len = end - beg; >- success_in_beg_and_len: >+ success_in_beg_and_len:; >+ size_t off = beg - buf; >+ mb_case_map_apply (map, &off, &len); >+ > *match_size = len; >- ret_val = beg - buf; >+ ret_val = off; > out: > return ret_val; > } >diff --git a/src/search.h b/src/search.h >index e9049a9..db982e8 100644 >--- a/src/search.h >+++ b/src/search.h >@@ -22,6 +22,7 
@@ > #include <config.h> > > #include <sys/types.h> >+#include <stdint.h> > > #include "mbsupport.h" > #ifdef MBS_SUPPORT >@@ -37,11 +38,17 @@ > #include "kwset.h" > #include "xalloc.h" > >+/* This must be a signed type. Each value is the difference in the size >+ of a character (in bytes) induced by converting to lower case. >+ The vast majority of values are 0, but a few are 1 or -1, so >+ technically, two bits may be sufficient. */ >+typedef signed char mb_len_map_t; >+ > /* searchutils.c */ > void kwsinit (kwset_t *); > > #ifdef MBS_SUPPORT >-char * mbtolower (const char *, size_t *); >+char * mbtolower (const char *, size_t *, mb_len_map_t **); > bool is_mb_middle(const char **, const char *, const char *, size_t); > #endif > >@@ -58,4 +65,23 @@ void Pcompile (char const *, size_t); > size_t Pexecute (char const *, size_t, size_t *, char const *); > > >+/* Apply a non-NULL MAP from mbtolower to the lowercase-buffer-relative >+ *OFF and *LEN, converting them to be relative to the original buffer. */ >+static inline void >+mb_case_map_apply (mb_len_map_t const *map, size_t *off, ptrdiff_t *len) >+{ >+ if (map) >+ { >+ size_t off_incr = 0; >+ size_t len_incr = 0; >+ size_t k; >+ for (k = 0; k < *off; k++) >+ off_incr += map[k]; >+ for (k = *off; k < *off + *len; k++) >+ len_incr += map[k]; >+ *off += off_incr; >+ *len += len_incr; >+ } >+} >+ > #endif /* GREP_SEARCH_H */ >diff --git a/src/searchutils.c b/src/searchutils.c >index 99f3c2e..d3a4af3 100644 >--- a/src/searchutils.c >+++ b/src/searchutils.c >@@ -46,30 +46,53 @@ kwsinit (kwset_t *kwset) > } > > #ifdef MBS_SUPPORT >-/* Convert the *N-byte string, BEG, to lowercase, and write the >+/* Convert the *N-byte string, BEG, to lower-case, and write the > NUL-terminated result into malloc'd storage. Upon success, set *N > to the length (in bytes) of the resulting string (not including the >- trailing NUL byte), and return a pointer to the lowercase string. 
>+ trailing NUL byte), and return a pointer to the lower-case string. > Upon memory allocation failure, this function exits. > > Note that while this function returns a pointer to malloc'd storage, > the caller must not free it, since this function retains a pointer > to the buffer and reuses it on any subsequent call. As a consequence, >- this function is not thread-safe. */ >+ this function is not thread-safe. >+ >+ When each character in the lower-case result string has the same length >+ as the corresponding character in the input string, set *LEN_MAP_P >+ to NULL. Otherwise, set it to a malloc'd buffer (like the returned >+ buffer, this must not be freed by caller) of the same length as the >+ result string. (*LEN_MAP_P)[J] is the change in byte-length of the >+ character in BEG that formed byte J of the result as it was converted to >+ lower-case. It is usually zero. For the upper-case Turkish I-with-dot >+ it is -1, since the upper-case character occupies two bytes, while the >+ lower-case one occupies only one byte. For the Turkish-I-without-dot >+ in the tr_TR.utf8 locale, it is 1 because the lower-case representation >+ is one byte longer than the original. When that happens, we have two >+ or more slots in *LEN_MAP_P for each such character. We store the >+ difference in the first one and 0's in any remaining slots. >+ >+ This map is used by the caller to convert offset,length pairs that >+ reference the lower-case result to numbers that refer to the matched >+ part of the original buffer. 
*/ >+ > char * >-mbtolower (const char *beg, size_t *n) >+mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) > { > static char *out; >+ static mb_len_map_t *len_map; > static size_t outalloc; > size_t outlen, mb_cur_max; > mbstate_t is, os; > const char *end; > char *p; >+ mb_len_map_t *m; >+ bool lengths_differ = false; > > if (*n > outalloc || outalloc == 0) > { > outalloc = MAX(1, *n); > out = xrealloc (out, outalloc); >+ len_map = xrealloc (len_map, outalloc); > } > > if (*n == 0) >@@ -81,21 +104,26 @@ mbtolower (const char *beg, size_t *n) > > mb_cur_max = MB_CUR_MAX; > p = out; >+ m = len_map; > outlen = 0; > while (beg < end) > { > wchar_t wc; >- size_t mbclen = mbrtowc(&wc, beg, end - beg, &is); >+ size_t mbclen = mbrtowc (&wc, beg, end - beg, &is); > if (outlen + mb_cur_max >= outalloc) > { >+ size_t dm = m - len_map; > out = x2nrealloc (out, &outalloc, 1); >+ len_map = xrealloc (len_map, outalloc); > p = out + outlen; >+ m = len_map + dm; > } > > if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) > { > /* An invalid sequence, or a truncated multi-octet character. > We treat it as a single-octet character. */ >+ *m++ = 0; > *p++ = *beg++; > outlen++; > memset (&is, 0, sizeof (is)); >@@ -104,14 +132,19 @@ mbtolower (const char *beg, size_t *n) > else > { > beg += mbclen; >- mbclen = wcrtomb (p, towlower ((wint_t) wc), &os); >- p += mbclen; >- outlen += mbclen; >+ size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os); >+ *m = mbclen - ombclen; >+ memset (m + 1, 0, ombclen - 1); >+ m += ombclen; >+ p += ombclen; >+ outlen += ombclen; >+ lengths_differ |= (mbclen != ombclen); > } > } > >+ *len_map_p = lengths_differ ? 
len_map : NULL; > *n = p - out; >- *p++ = 0; >+ *p = 0; > return out; > } > >diff --git a/tests/Makefile.am b/tests/Makefile.am >index 0658de9..df894c5 100644 >--- a/tests/Makefile.am >+++ b/tests/Makefile.am >@@ -57,6 +57,8 @@ TESTS = \ > status.sh \ > prefix-of-multibyte \ > warning.sh \ >+ turkish-I \ >+ turkish-I-without-dot \ > word-multi-file \ > yesno.sh > >diff --git a/tests/Makefile.in b/tests/Makefile.in >index b97c038..6c7f718 100644 >--- a/tests/Makefile.in >+++ b/tests/Makefile.in >@@ -824,6 +824,8 @@ TESTS = \ > status.sh \ > prefix-of-multibyte \ > warning.sh \ >+ turkish-I \ >+ turkish-I-without-dot \ > word-multi-file \ > yesno.sh > >@@ -1239,6 +1241,10 @@ prefix-of-multibyte.log: prefix-of-multibyte > @p='prefix-of-multibyte'; $(am__check_pre) $(LOG_COMPILE) "$$tst" $(am__check_post) > warning.sh.log: warning.sh > @p='warning.sh'; $(am__check_pre) $(LOG_COMPILE) "$$tst" $(am__check_post) >+turkish-I.log: turkish-I >+ @p='turkish-I'; $(am__check_pre) $(LOG_COMPILE) "$$tst" $(am__check_post) >+turkish-I-without-dot.log: turkish-I-without-dot >+ @p='turkish-I-without-dot'; $(am__check_pre) $(LOG_COMPILE) "$$tst" $(am__check_post) > word-multi-file.log: word-multi-file > @p='word-multi-file'; $(am__check_pre) $(LOG_COMPILE) "$$tst" $(am__check_post) > yesno.sh.log: yesno.sh >diff --git a/tests/init.cfg b/tests/init.cfg >index 8fc8c32..515a48f 100644 >--- a/tests/init.cfg >+++ b/tests/init.cfg >@@ -49,3 +49,28 @@ require_en_utf8_locale_() > *) skip_ 'en_US.UTF-8 locale not found' ;; > esac > } >+ >+require_tr_utf8_locale_() >+{ >+ path_prepend_ . >+ case $(get-mb-cur-max tr_TR.UTF-8) in >+ [3456]) ;; >+ *) skip_ 'en_US.UTF-8 locale not found' ;; >+ esac >+} >+ >+require_ru_RU_koi8_r() >+{ >+ path_prepend_ . 
>+ case $(get-mb-cur-max ru_RU.KOI8-R) in >+ 1) ;; >+ *) skip_ 'ru_RU.KOI8-R locale not found' ;; >+ esac >+} >+ >+require_compiled_in_MB_support() >+{ >+ require_en_utf8_locale_ >+ printf 'é' | LC_ALL=en_US.UTF-8 grep '[[:lower:]]' \ >+ || skip_ this test requires MBS support >+} >diff --git a/tests/turkish-I b/tests/turkish-I >new file mode 100755 >index 0000000..35a3753 >--- /dev/null >+++ b/tests/turkish-I >@@ -0,0 +1,33 @@ >+#!/bin/sh >+# grep -i in UTF-8: missing NL in output on line containing I WITH DOT (U+0130) >+ >+# Copyright (C) 2011-2012 Free Software Foundation, Inc. >+ >+# This program is free software: you can redistribute it and/or modify >+# it under the terms of the GNU General Public License as published by >+# the Free Software Foundation, either version 3 of the License, or >+# (at your option) any later version. >+ >+# This program is distributed in the hope that it will be useful, >+# but WITHOUT ANY WARRANTY; without even the implied warranty of >+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >+# GNU General Public License for more details. >+ >+# You should have received a copy of the GNU General Public License >+# along with this program. If not, see <http://www.gnu.org/licenses/>. >+ >+. "${srcdir=.}/init.sh"; path_prepend_ ../src >+ >+require_en_utf8_locale_ >+require_compiled_in_MB_support >+ >+fail=0 >+ >+i='\xC4\xB0' >+printf "$i$i$i$i$i$i$i\n" > in || framework_failure_ >+ >+LC_ALL=en_US.UTF-8 grep -i .... in > out || fail=1 >+ >+compare out in || fail=1 >+ >+Exit $fail >diff --git a/tests/turkish-I-without-dot b/tests/turkish-I-without-dot >new file mode 100755 >index 0000000..9f92502 >--- /dev/null >+++ b/tests/turkish-I-without-dot >@@ -0,0 +1,55 @@ >+#!/bin/sh >+# grep -i would misbehave for any matched line containing a character >+# (like "I" in the tr_TR.utf8 locale) whose lower-case representation >+# occupies more bytes (two in this case, for 0xc4b1, aka U+0131). 
>+ >+# Copyright (C) 2011-2012 Free Software Foundation, Inc. >+ >+# This program is free software: you can redistribute it and/or modify >+# it under the terms of the GNU General Public License as published by >+# the Free Software Foundation, either version 3 of the License, or >+# (at your option) any later version. >+ >+# This program is distributed in the hope that it will be useful, >+# but WITHOUT ANY WARRANTY; without even the implied warranty of >+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >+# GNU General Public License for more details. >+ >+# You should have received a copy of the GNU General Public License >+# along with this program. If not, see <http://www.gnu.org/licenses/>. >+ >+. "${srcdir=.}/init.sh"; path_prepend_ ../src >+ >+require_tr_utf8_locale_ >+require_compiled_in_MB_support >+ >+# Before this change, grep could print a lot of uninitialized memory: >+# $ printf "IIIIIII\n" > in >+# $ for i in $(seq 10); do LC_ALL=tr_TR.utf8 src/grep -i . in|wc -c; done >+# 760 >+# 754 >+# 585 >+# 298 >+# 273 >+# 458 >+# 660 >+# 552 >+# 936 >+# 678 >+ >+fail=0 >+ >+printf "IIIIIII\n" > in || framework_failure_ >+LC_ALL=tr_TR.utf8 grep -i .... in > out || fail=1 >+compare out in || fail=1 >+ >+# Also exercise the case in which the original string and the lower-case >+# buffer have precisely the same length (22 bytes here), yet internal >+# offsets do differ. Lengths are the same because while some bytes shrink >+# when converted to lower case, others grow, and here they balance out. >+i='I\xC4\xB0' >+printf "$i$i$i$i$i$i$i\n" > in || framework_failure_ >+LC_ALL=tr_TR.utf8 grep -i .... in > out || fail=1 >+compare out in || fail=1 >+ >+Exit $fail
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 826997
:
597568
| 599320