230 lines
6.4 KiB
C

/*
* Copyright (C) 1999-2008, 2011 Free Software Foundation, Inc.
* This file is part of the GNU LIBICONV Library.
*
* The GNU LIBICONV Library is free software; you can redistribute it
* and/or modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* The GNU LIBICONV Library is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with the GNU LIBICONV Library; see the file COPYING.LIB.
* If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
* Fifth Floor, Boston, MA 02110-1301, USA.
*/
/* Part 1 of iconv_open.
Input: const char* tocode, const char* fromcode.
Output:
unsigned int from_index;
int from_wchar;
unsigned int to_index;
int to_wchar;
int transliterate;
int discard_ilseq;
Jumps to 'invalid' in case of error.
*/
{
  /* Scratch copy of the encoding name being resolved.  The copy loops below
     reject any input that does not fit, terminating NUL included. */
  char buf[MAX_WORD_LENGTH+10+1];
  const char* src;             /* read position in tocode / fromcode */
  char* end;                   /* write position in buf; rests on the NUL */
  const struct alias * entry;  /* alias table entry found for buf */
  unsigned int room;           /* non-NUL bytes that may still be stored */

  transliterate = 0;
  discard_ilseq = 0;

  /* The alias lookup expects an upper-case, pure ASCII key (gperf is called
   * with option "-7" to achieve a smaller table).  Each pass therefore
   * copies the name into buf while upper-casing it, and jumps to 'invalid'
   * as soon as it meets a non-ASCII byte or runs out of room.
   */

  /* ---- Resolve tocode into to_index / to_wchar. ----
     The loop is re-entered with 'continue' whenever tocode is replaced by
     the locale's charset name; 'break' leaves it with to_index set. */
  to_wchar = 0;
  for (;;) {
    /* Upper-case copy of tocode into buf. */
    src = tocode;
    end = buf;
    room = MAX_WORD_LENGTH+10+1;
    for (;;) {
      unsigned char ch = * (const unsigned char *) src;
      if (ch >= 0x80)
        goto invalid;
      if (ch >= 'a' && ch <= 'z')
        ch = (unsigned char) (ch - ('a'-'A'));
      *end = ch;
      if (ch == '\0')
        break;
      if (--room == 0)
        goto invalid;
      src++;
      end++;
    }
    /* Peel off trailing "//TRANSLIT" and "//IGNORE" suffixes, in any order
       and any number of times, recording each one in its flag. */
    for (;;) {
      if (end-buf >= 10 && memcmp(end-10,"//TRANSLIT",10)==0) {
        end -= 10;
        transliterate = 1;
      } else if (end-buf >= 8 && memcmp(end-8,"//IGNORE",8)==0) {
        end -= 8;
        discard_ilseq = 1;
      } else
        break;
      *end = '\0';
    }
    /* Nothing left of the name: use the locale's charset instead. */
    if (end == buf) {
      tocode = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (tocode[0] == '\0')
        goto invalid;
      continue;
    }
    /* Look the name up in the primary table, then in the secondary one. */
    entry = aliases_lookup(buf,end-buf);
    if (entry == NULL)
      entry = aliases2_lookup(buf);
    if (entry == NULL)
      goto invalid;
    /* The name stands for the locale's char encoding: resolve that. */
    if (entry->encoding_index == ei_local_char) {
      tocode = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (tocode[0] == '\0')
        goto invalid;
      continue;
    }
    /* The name stands for the platform's wchar_t encoding. */
    if (entry->encoding_index == ei_local_wchar_t) {
      /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
         This is also the case on native Woe32 systems and Cygwin >= 1.7,
         where we know that it is UTF-16. */
#if ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
      if (sizeof(wchar_t) == 4) {
        to_index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
# if WORDS_LITTLEENDIAN
        to_index = ei_utf16le;
# else
        to_index = ei_utf16be;
# endif
        break;
      }
#elif __STDC_ISO_10646__
      if (sizeof(wchar_t) == 4) {
        to_index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        to_index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        to_index = ei_iso8859_1;
        break;
      }
#endif
#if HAVE_MBRTOWC
      /* No fixed mapping applied: flag wchar_t output and resolve the
         locale's charset on the next pass. */
      to_wchar = 1;
      tocode = locale_charset();
      continue;
#endif
      goto invalid;
    }
    /* An ordinary encoding. */
    to_index = entry->encoding_index;
    break;
  }

  /* ---- Resolve fromcode into from_index / from_wchar. ----
     Same procedure as for tocode, except that the "//TRANSLIT" and
     "//IGNORE" suffixes are stripped without being recorded. */
  from_wchar = 0;
  for (;;) {
    /* Upper-case copy of fromcode into buf. */
    src = fromcode;
    end = buf;
    room = MAX_WORD_LENGTH+10+1;
    for (;;) {
      unsigned char ch = * (const unsigned char *) src;
      if (ch >= 0x80)
        goto invalid;
      if (ch >= 'a' && ch <= 'z')
        ch = (unsigned char) (ch - ('a'-'A'));
      *end = ch;
      if (ch == '\0')
        break;
      if (--room == 0)
        goto invalid;
      src++;
      end++;
    }
    /* Drop trailing suffixes; they have no effect on the source side. */
    for (;;) {
      if (end-buf >= 10 && memcmp(end-10,"//TRANSLIT",10)==0)
        end -= 10;
      else if (end-buf >= 8 && memcmp(end-8,"//IGNORE",8)==0)
        end -= 8;
      else
        break;
      *end = '\0';
    }
    /* Nothing left of the name: use the locale's charset instead. */
    if (end == buf) {
      fromcode = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (fromcode[0] == '\0')
        goto invalid;
      continue;
    }
    /* Look the name up in the primary table, then in the secondary one. */
    entry = aliases_lookup(buf,end-buf);
    if (entry == NULL)
      entry = aliases2_lookup(buf);
    if (entry == NULL)
      goto invalid;
    /* The name stands for the locale's char encoding: resolve that. */
    if (entry->encoding_index == ei_local_char) {
      fromcode = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (fromcode[0] == '\0')
        goto invalid;
      continue;
    }
    /* The name stands for the platform's wchar_t encoding. */
    if (entry->encoding_index == ei_local_wchar_t) {
      /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
         This is also the case on native Woe32 systems and Cygwin >= 1.7,
         where we know that it is UTF-16. */
#if ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
      if (sizeof(wchar_t) == 4) {
        from_index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
# if WORDS_LITTLEENDIAN
        from_index = ei_utf16le;
# else
        from_index = ei_utf16be;
# endif
        break;
      }
#elif __STDC_ISO_10646__
      if (sizeof(wchar_t) == 4) {
        from_index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        from_index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        from_index = ei_iso8859_1;
        break;
      }
#endif
#if HAVE_WCRTOMB
      /* No fixed mapping applied: flag wchar_t input and resolve the
         locale's charset on the next pass. */
      from_wchar = 1;
      fromcode = locale_charset();
      continue;
#endif
      goto invalid;
    }
    /* An ordinary encoding. */
    from_index = entry->encoding_index;
    break;
  }
}