To: vim-dev@vim.org Subject: Patch 6.2.506 (extra) Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 8bit ------------ Patch 6.2.506 (extra) Problem: Win32: When 'encoding' is a codepage then reading a utf-8 file only works when iconv is available. Writing a file in another codepage uses the wrong kind of conversion. Solution: Use internal conversion functions. Enable reading and writing files with 'fileencoding' different from 'encoding' for all valid codepages and utf-8 without the need for iconv. Files: src/fileio.c, src/testdir/Make_dos.mak, src/testdir/test52.in, src/testdir/test52.ok *** ../vim-6.2.505/src/fileio.c Sun Apr 25 16:26:29 2004 --- src/fileio.c Tue Apr 27 15:31:34 2004 *************** *** 939,947 **** # ifdef WIN3264 /* ! * Conversion from an MS-Windows codepage to UTF-8 is handled here. */ ! if (fio_flags == 0 && enc_utf8) fio_flags = get_win_fio_flags(fenc); # endif --- 939,948 ---- # ifdef WIN3264 /* ! * Conversion from an MS-Windows codepage to UTF-8 or another codepage ! * is handled with MultiByteToWideChar(). */ ! if (fio_flags == 0) fio_flags = get_win_fio_flags(fenc); # endif *************** *** 1329,1388 **** if (fio_flags & FIO_CODEPAGE) { /* ! * Conversion from an MS-Windows codepage to UTF-8, using ! * standard MS-Windows functions. */ char_u *ucsp; ! size_t from_size; int needed; char_u *p; int u8c; /* ! * We can't tell if the last byte of an MBCS string is valid ! * and MultiByteToWideChar() returns zero if it isn't. ! * Try the whole string, and if that fails, bump the last byte ! * into conv_rest and try again. */ ! from_size = size; ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, ! NULL, 0); ! if (needed == 0) { ! conv_rest[0] = ptr[from_size - 1]; ! conv_restlen = 1; ! --from_size; needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, NULL, 0); ! } ! /* If there really is a conversion error, try using another ! * conversion. */ ! if (needed == 0) ! goto rewind_retry; ! /* Put the result of conversion to UCS-2 at the end of the ! * buffer, then convert from UCS-2 to UTF-8 into the start of ! * the buffer. If there is not enough space just fail, there ! * is probably something wrong. */ ucsp = ptr + real_size - (needed * sizeof(WCHAR)); if (ucsp < ptr + size) goto rewind_retry; ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, (LPWSTR)ucsp, needed); ! /* Now go from UCS-2 to UTF-8. */ ! p = ptr; ! for (; needed > 0; --needed) ! { ! u8c = *ucsp++; ! u8c += (*ucsp++ << 8); ! p += utf_char2bytes(u8c, p); } - size = p - ptr; } else # endif --- 1330,1462 ---- if (fio_flags & FIO_CODEPAGE) { /* ! * Conversion from an MS-Windows codepage or UTF-8 to UTF-8 or ! * a codepage, using standard MS-Windows functions. ! * 1. find out how many ucs-2 characters there are. ! * 2. convert from 'fileencoding' to ucs-2 ! * 3. convert from ucs-2 to 'encoding' */ char_u *ucsp; ! size_t from_size = size; int needed; char_u *p; int u8c; + int l, len; /* ! * 1. find out how many ucs-2 characters there are. */ ! if (FIO_GET_CP(fio_flags) == CP_UTF8) { ! /* Handle CP_UTF8 ourselves to be able to handle trailing ! * bytes properly. First find out the number of ! * characters and check for trailing bytes. */ ! needed = 0; ! p = ptr; ! for (len = from_size; len > 0; len -= l) ! { ! l = utf_ptr2len_check_len(p, len); ! if (l > len) /* incomplete char */ ! { ! if (l > CONV_RESTLEN) ! /* weird overlong byte sequence */ ! goto rewind_retry; ! mch_memmove(conv_rest, p, len); ! conv_restlen = len; ! from_size -= len; ! break; ! } ! if (l == 1 && *p >= 0x80) /* illegal byte */ ! goto rewind_retry; ! ++needed; ! p += l; ! } ! } ! else ! { ! /* We can't tell if the last byte of an MBCS string is ! * valid and MultiByteToWideChar() returns zero if it ! * isn't. Try the whole string, and if that fails, bump ! * the last byte into conv_rest and try again. */ needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, NULL, 0); ! if (needed == 0) ! { ! conv_rest[0] = ptr[from_size - 1]; ! conv_restlen = 1; ! --from_size; ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), ! MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, ! NULL, 0); ! } ! /* If there really is a conversion error, try using another ! * conversion. */ ! if (needed == 0) ! goto rewind_retry; ! } ! /* ! * 2. convert from 'fileencoding' to ucs-2 ! * ! * Put the result of conversion to UCS-2 at the end of the ! * buffer, then convert from UCS-2 to UTF-8 or "enc_codepage" ! * into the start of the buffer. If there is not enough space ! * just fail, there is probably something wrong. ! */ ucsp = ptr + real_size - (needed * sizeof(WCHAR)); if (ucsp < ptr + size) goto rewind_retry; ! ! if (FIO_GET_CP(fio_flags) == CP_UTF8) ! { ! /* Convert from utf-8 to ucs-2. */ ! needed = 0; ! p = ptr; ! for (len = from_size; len > 0; len -= l) ! { ! l = utf_ptr2len_check_len(p, len); ! u8c = utf_ptr2char(p); ! ucsp[needed * 2] = (u8c & 0xff); ! ucsp[needed * 2 + 1] = (u8c >> 8); ! ++needed; ! p += l; ! } ! } ! else ! needed = MultiByteToWideChar(FIO_GET_CP(fio_flags), MB_ERR_INVALID_CHARS, (LPCSTR)ptr, from_size, (LPWSTR)ucsp, needed); ! /* ! * 3. convert from ucs-2 to 'encoding' ! */ ! if (enc_utf8) ! { ! /* From UCS-2 to UTF-8. Cannot fail. */ ! p = ptr; ! for (; needed > 0; --needed) ! { ! u8c = *ucsp++; ! u8c += (*ucsp++ << 8); ! p += utf_char2bytes(u8c, p); ! } ! size = p - ptr; ! } ! else ! { ! BOOL bad = FALSE; ! ! /* From UCS-2 to "enc_codepage". If the conversion uses ! * the default character "?", the data doesn't fit in this ! * encoding, so fail (unless forced). */ ! size = WideCharToMultiByte(enc_codepage, 0, ! (LPCWSTR)ucsp, needed, ! (LPSTR)ptr, real_size, "?", &bad); ! if (bad && !keep_dest_enc) ! goto rewind_retry; } } else # endif *************** *** 3442,3451 **** } # ifdef WIN3264 ! if (converted && wb_flags == 0 && get_win_fio_flags(fenc)) { - wb_flags = get_win_fio_flags(fenc); - /* Convert UTF-8 -> UCS-2 and UCS-2 -> DBCS. Worst-case * 4: */ write_info.bw_conv_buflen = bufsize * 4; write_info.bw_conv_buf --- 3516,3523 ---- } # ifdef WIN3264 ! if (converted && wb_flags == 0 && (wb_flags = get_win_fio_flags(fenc)) != 0) { /* Convert UTF-8 -> UCS-2 and UCS-2 -> DBCS. Worst-case * 4: */ write_info.bw_conv_buflen = bufsize * 4; write_info.bw_conv_buf *************** *** 4474,4486 **** else if (flags & FIO_CODEPAGE) { /* ! * Convert UTF-8 to UCS-2 and then to MS-Windows codepage. */ char_u *from; size_t fromlen; char_u *to; int u8c; BOOL bad = FALSE; if (ip->bw_restlen > 0) { --- 4546,4560 ---- else if (flags & FIO_CODEPAGE) { /* ! * Convert UTF-8 or codepage to UCS-2 and then to MS-Windows ! * codepage. */ char_u *from; size_t fromlen; char_u *to; int u8c; BOOL bad = FALSE; + int needed; if (ip->bw_restlen > 0) { *************** *** 4498,4535 **** fromlen = len; } - /* Convert from UTF-8 to UCS-2, to the start of the buffer. - * The buffer has been allocated to be big enough. */ to = ip->bw_conv_buf; ! while (fromlen > 0) { ! n = utf_ptr2len_check_len(from, fromlen); ! if (n > (int)fromlen) ! break; ! u8c = utf_ptr2char(from); ! *to++ = (u8c & 0xff); ! *to++ = (u8c >> 8); ! fromlen -= n; ! from += n; ! } ! /* copy remainder to ip->bw_rest[] to be used for the next call. */ ! mch_memmove(ip->bw_rest, from, fromlen); ! ip->bw_restlen = fromlen; - /* Convert from UCS-2 to the codepage, using the remainder of the - * conversion buffer. If the conversion uses the default - * character "0", the data doesn't fit in this encoding, so fail. */ fromlen = to - ip->bw_conv_buf; ! len = WideCharToMultiByte(FIO_GET_CP(flags), 0, ! (LPCWSTR)ip->bw_conv_buf, (int)fromlen / sizeof(WCHAR), ! (LPSTR)to, ip->bw_conv_buflen - fromlen, 0, &bad); ! if (bad) { ! ip->bw_conv_error = TRUE; ! return FAIL; } - buf = to; } # endif --- 4572,4675 ---- fromlen = len; } to = ip->bw_conv_buf; ! if (enc_utf8) { ! /* Convert from UTF-8 to UCS-2, to the start of the buffer. ! * The buffer has been allocated to be big enough. */ ! while (fromlen > 0) ! { ! n = utf_ptr2len_check_len(from, fromlen); ! if (n > (int)fromlen) /* incomplete byte sequence */ ! break; ! u8c = utf_ptr2char(from); ! *to++ = (u8c & 0xff); ! *to++ = (u8c >> 8); ! fromlen -= n; ! from += n; ! } ! /* Copy remainder to ip->bw_rest[] to be used for the next ! * call. */ ! if (fromlen > CONV_RESTLEN) ! { ! /* weird overlong sequence */ ! ip->bw_conv_error = TRUE; ! return FAIL; ! } ! mch_memmove(ip->bw_rest, from, fromlen); ! ip->bw_restlen = fromlen; ! } ! else ! { ! /* Convert from enc_codepage to UCS-2, to the start of the ! * buffer. The buffer has been allocated to be big enough. */ ! ip->bw_restlen = 0; ! needed = MultiByteToWideChar(enc_codepage, ! MB_ERR_INVALID_CHARS, (LPCSTR)from, fromlen, ! NULL, 0); ! if (needed == 0) ! { ! /* When conversion fails there may be a trailing byte. */ ! ip->bw_restlen = 1; ! needed = MultiByteToWideChar(enc_codepage, ! MB_ERR_INVALID_CHARS, (LPCSTR)from, fromlen, ! NULL, 0); ! if (needed == 0) ! { ! /* Conversion doesn't work. */ ! ip->bw_conv_error = TRUE; ! return FAIL; ! } ! /* Save the trailing byte for the next call. */ ! *ip->bw_rest = from[fromlen - 1]; ! } ! needed = MultiByteToWideChar(enc_codepage, MB_ERR_INVALID_CHARS, ! (LPCSTR)from, fromlen - ip->bw_restlen, ! (LPWSTR)to, needed); ! if (needed == 0) ! { ! /* Safety check: Conversion doesn't work. */ ! ip->bw_conv_error = TRUE; ! return FAIL; ! } ! to += needed * 2; ! } fromlen = to - ip->bw_conv_buf; ! buf = to; ! if (FIO_GET_CP(flags) == CP_UTF8) { ! /* Convert from UCS-2 to UTF-8, using the remainder of the ! * conversion buffer. Fails when out of space. */ ! for (from = ip->bw_conv_buf; fromlen > 1; fromlen -= 2) ! { ! u8c = *from++; ! u8c += (*from++ << 8); ! to += utf_char2bytes(u8c, to); ! if (to + 6 >= ip->bw_conv_buf + ip->bw_conv_buflen) ! { ! ip->bw_conv_error = TRUE; ! return FAIL; ! } ! } ! len = to - buf; ! } ! else ! { ! /* Convert from UCS-2 to the codepage, using the remainder of ! * the conversion buffer. If the conversion uses the default ! * character "0", the data doesn't fit in this encoding, so ! * fail. */ ! len = WideCharToMultiByte(FIO_GET_CP(flags), 0, ! (LPCWSTR)ip->bw_conv_buf, (int)fromlen / sizeof(WCHAR), ! (LPSTR)to, ip->bw_conv_buflen - fromlen, 0, &bad); ! if (bad) ! { ! ip->bw_conv_error = TRUE; ! return FAIL; ! } } } # endif *************** *** 4775,4789 **** #ifdef WIN3264 /* * Check "ptr" for a MS-Windows codepage name and return the FIO_ flags needed ! * for the conversion MS-Windows can do for us. */ static int get_win_fio_flags(ptr) char_u *ptr; { ! if (ptr[0] == 'c' && ptr[1] == 'p' && VIM_ISDIGIT(ptr[2])) ! return FIO_PUT_CP(atoi(ptr + 2)) | FIO_CODEPAGE; ! return 0; } #endif --- 4915,4942 ---- #ifdef WIN3264 /* * Check "ptr" for a MS-Windows codepage name and return the FIO_ flags needed ! * for the conversion MS-Windows can do for us. Also accept "utf-8". ! * Used for conversion between 'encoding' and 'fileencoding'. */ static int get_win_fio_flags(ptr) char_u *ptr; { ! int cp; ! ! /* Cannot do this when 'encoding' is not utf-8 and not a codepage. */ ! if (!enc_utf8 && enc_codepage <= 0) ! return 0; ! ! cp = encname2codepage(ptr); ! if (cp == 0) ! { ! if (STRCMP(ptr, "utf-8") == 0) ! cp = CP_UTF8; ! else ! return 0; ! } ! return FIO_PUT_CP(cp) | FIO_CODEPAGE; } #endif *** ../vim-6.2.505/src/testdir/Make_dos.mak Mon Mar 22 17:28:47 2004 --- src/testdir/Make_dos.mak Tue Apr 27 15:51:03 2004 *************** *** 24,30 **** test15.out test17.out test18.out test21.out test26.out \ test30.out test31.out test32.out test33.out test34.out \ test37.out test38.out test39.out test40.out test41.out \ ! test42.out SCRIPTS32 = test50.out --- 24,30 ---- test15.out test17.out test18.out test21.out test26.out \ test30.out test31.out test32.out test33.out test34.out \ test37.out test38.out test39.out test40.out test41.out \ ! test42.out test52.out SCRIPTS32 = test50.out *************** *** 51,56 **** --- 51,57 ---- -del tiny.vim -del mbyte.vim -del X* + -del viminfo .in.out: copy $*.ok test.ok *************** *** 60,62 **** --- 61,64 ---- rename test.out $*.out -del X* -del test.ok + -del viminfo *** ../vim-6.2.505/src/testdir/test52.in Tue Apr 27 16:24:44 2004 --- src/testdir/test52.in Tue Apr 27 16:20:18 2004 *************** *** 0 **** --- 1,65 ---- + Tests for reading and writing files with conversion for Win32. + + STARTTEST + :so mbyte.vim + :" make this a dummy test for non-Win32 systems + :if !has("win32") | e! testk.ok | wq! test.out | endif + :" + :" write tests: + :" combine three values for 'encoding' with three values for 'fileencoding' + :" also write files for read tests + /^1 + :set encoding=utf-8 + :.w! ++enc=utf-8 test.out + :.w ++enc=cp1251 >>test.out + :.w ++enc=cp866 >>test.out + :.w! ++enc=utf-8 Xutf8 + /^2 + :set encoding=cp1251 + :.w ++enc=utf-8 >>test.out + :.w ++enc=cp1251 >>test.out + :.w ++enc=cp866 >>test.out + :.w! ++enc=cp1251 Xcp1251 + /^3 + :set encoding=cp866 + :.w ++enc=utf-8 >>test.out + :.w ++enc=cp1251 >>test.out + :.w ++enc=cp866 >>test.out + :.w! ++enc=cp866 Xcp866 + :" + :" read three 'fileencoding's with utf-8 'encoding' + :set encoding=utf-8 fencs=utf-8,cp1251 + :e Xutf8 + :.w ++enc=utf-8 >>test.out + :e Xcp1251 + :.w ++enc=utf-8 >>test.out + :set fencs=utf-8,cp866 + :e Xcp866 + :.w ++enc=utf-8 >>test.out + :" + :" read three 'fileencoding's with cp1251 'encoding' + :set encoding=utf-8 fencs=utf-8,cp1251 + :e Xutf8 + :.w ++enc=cp1251 >>test.out + :e Xcp1251 + :.w ++enc=cp1251 >>test.out + :set fencs=utf-8,cp866 + :e Xcp866 + :.w ++enc=cp1251 >>test.out + :" + :" read three 'fileencoding's with cp866 'encoding' + :set encoding=cp866 fencs=utf-8,cp1251 + :e Xutf8 + :.w ++enc=cp866 >>test.out + :e Xcp1251 + :.w ++enc=cp866 >>test.out + :set fencs=utf-8,cp866 + :e Xcp866 + :.w ++enc=cp866 >>test.out + :" + :qa! + ENDTEST + + 1 utf-8 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 2 cp1251 text: Vim version 6.2. : 1970 Jan 01 + 3 cp866 text: Vim version 6.2. ᫥ : 1970 Jan 01 *** ../vim-6.2.505/src/testdir/test52.ok Tue Apr 27 16:24:44 2004 --- src/testdir/test52.ok Tue Apr 27 16:20:56 2004 *************** *** 0 **** --- 1,18 ---- + 1 utf-8 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 1 utf-8 text: Vim version 6.2. : 1970 Jan 01 + 1 utf-8 text: Vim version 6.2. ᫥ : 1970 Jan 01 + 2 cp1251 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 2 cp1251 text: Vim version 6.2. : 1970 Jan 01 + 2 cp1251 text: Vim version 6.2. ᫥ : 1970 Jan 01 + 3 cp866 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 3 cp866 text: Vim version 6.2. : 1970 Jan 01 + 3 cp866 text: Vim version 6.2. ᫥ : 1970 Jan 01 + 1 utf-8 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 2 cp1251 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 3 cp866 text: Для Vim version 6.2. Последнее изменение: 1970 Jan 01 + 1 utf-8 text: Vim version 6.2. : 1970 Jan 01 + 2 cp1251 text: Vim version 6.2. : 1970 Jan 01 + 3 cp866 text: Vim version 6.2. : 1970 Jan 01 + 1 utf-8 text: Vim version 6.2. ᫥ : 1970 Jan 01 + 2 cp1251 text: Vim version 6.2. ᫥ : 1970 Jan 01 + 3 cp866 text: Vim version 6.2. ᫥ : 1970 Jan 01 *** ../vim-6.2.505/src/version.c Tue Apr 27 10:03:32 2004 --- src/version.c Tue Apr 27 16:23:35 2004 *************** *** 639,640 **** --- 639,642 ---- { /* Add new patch number below this line */ + /**/ + 506, /**/ -- hundred-and-one symptoms of being an internet addict: 34. You laugh at people with 14400 baud modems. /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// Sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ Project leader for A-A-P -- http://www.A-A-P.org /// \\\ Buy at Amazon and help AIDS victims -- http://ICCF.nl/click1.html ///