| 1 /* |
|
| 2 * (C) Copyright 2008-2009 Jakub Zawadzki <darkjames@darkjames.ath.cx> |
|
| 3 * Wojtek Kaniewski <wojtekka@irc.pl> |
|
| 4 * |
|
| 5 * This program is free software; you can redistribute it and/or modify |
|
| 6 * it under the terms of the GNU Lesser General Public License Version |
|
| 7 * 2.1 as published by the Free Software Foundation. |
|
| 8 * |
|
| 9 * This program is distributed in the hope that it will be useful, |
|
| 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 12 * GNU Lesser General Public License for more details. |
|
| 13 * |
|
| 14 * You should have received a copy of the GNU Lesser General Public |
|
| 15 * License along with this program; if not, write to the Free Software |
|
| 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, |
|
| 17 * USA. |
|
| 18 */ |
|
| 19 |
|
| 20 #include "strman.h" |
|
| 21 #include <stdlib.h> |
|
| 22 #include <errno.h> |
|
| 23 |
|
| 24 #include "libgadu.h" |
|
| 25 #include "encoding.h" |
|
| 26 |
|
| 27 /** |
|
| 28 * \file encoding.c |
|
| 29 * |
|
| 30 * \brief Funkcje konwersji kodowania tekstu |
|
| 31 */ |
|
| 32 |
|
/**
 * \internal CP1250 to Unicode conversion table.
 *
 * Covers the upper half of the code page only: entry \c n is the Unicode
 * code point for the CP1250 byte <tt>0x80 + n</tt>. Five slots (bytes 0x81,
 * 0x83, 0x88, 0x90 and 0x98) hold '?' instead of a code point.
 */
static const uint16_t table_cp1250[] = {
	/* 0x80 */ 0x20ac, '?', 0x201a, '?', 0x201e, 0x2026, 0x2020, 0x2021,
	/* 0x88 */ '?', 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
	/* 0x90 */ '?', 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	/* 0x98 */ '?', 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
	/* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
	/* 0xa8 */ 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
	/* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	/* 0xb8 */ 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
	/* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
	/* 0xc8 */ 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
	/* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
	/* 0xd8 */ 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
	/* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
	/* 0xe8 */ 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
	/* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
	/* 0xf8 */ 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
};
|
| 55 |
|
| 56 /** |
|
| 57 * \internal Zamienia tekst kodowany CP1250 na UTF-8. |
|
| 58 * |
|
| 59 * \param src Tekst źródłowy w CP1250. |
|
| 60 * \param src_length Długość ciągu źródłowego (nigdy ujemna). |
|
| 61 * \param dst_length Długość ciągu docelowego (jeśli -1, nieograniczona). |
|
| 62 * |
|
| 63 * \return Zaalokowany bufor z tekstem w UTF-8. |
|
| 64 */ |
|
| 65 static char *gg_encoding_convert_cp1250_utf8(const char *src, int src_length, int dst_length) |
|
| 66 { |
|
| 67 int i, j, len; |
|
| 68 char *result = NULL; |
|
| 69 |
|
| 70 for (i = 0, len = 0; (src[i] != 0) && (i < src_length); i++) { |
|
| 71 uint16_t uc; |
|
| 72 |
|
| 73 if ((unsigned char) src[i] < 0x80) |
|
| 74 uc = (unsigned char) src[i]; |
|
| 75 else |
|
| 76 uc = table_cp1250[(unsigned char) src[i] - 128]; |
|
| 77 |
|
| 78 if (uc < 0x80) |
|
| 79 len += 1; |
|
| 80 else if (uc < 0x800) |
|
| 81 len += 2; |
|
| 82 else |
|
| 83 len += 3; |
|
| 84 } |
|
| 85 |
|
| 86 if ((dst_length != -1) && (len > dst_length)) |
|
| 87 len = dst_length; |
|
| 88 |
|
| 89 result = malloc(len + 1); |
|
| 90 |
|
| 91 if (result == NULL) |
|
| 92 return NULL; |
|
| 93 |
|
| 94 for (i = 0, j = 0; (src[i] != 0) && (i < src_length) && (j < len); i++) { |
|
| 95 uint16_t uc; |
|
| 96 |
|
| 97 if ((unsigned char) src[i] < 0x80) |
|
| 98 uc = (unsigned char) src[i]; |
|
| 99 else |
|
| 100 uc = table_cp1250[(unsigned char) src[i] - 128]; |
|
| 101 |
|
| 102 if (uc < 0x80) |
|
| 103 result[j++] = (char) uc; |
|
| 104 else if (uc < 0x800) { |
|
| 105 if (j + 1 > len) |
|
| 106 break; |
|
| 107 result[j++] = 0xc0 | ((uc >> 6) & 0x1f); |
|
| 108 result[j++] = 0x80 | (uc & 0x3f); |
|
| 109 } else { |
|
| 110 if (j + 2 > len) |
|
| 111 break; |
|
| 112 result[j++] = 0xe0 | ((uc >> 12) & 0x1f); |
|
| 113 result[j++] = 0x80 | ((uc >> 6) & 0x3f); |
|
| 114 result[j++] = 0x80 | (uc & 0x3f); |
|
| 115 } |
|
| 116 } |
|
| 117 |
|
| 118 result[j] = 0; |
|
| 119 |
|
| 120 return result; |
|
| 121 } |
|
| 122 |
|
| 123 /** |
|
| 124 * \internal Zamienia tekst kodowany UTF-8 na CP1250. |
|
| 125 * |
|
| 126 * \param src Tekst źródłowy w UTF-8. |
|
| 127 * \param src_length Długość ciągu źródłowego (nigdy ujemna). |
|
| 128 * \param dst_length Długość ciągu docelowego (jeśli -1, nieograniczona). |
|
| 129 * |
|
| 130 * \return Zaalokowany bufor z tekstem w CP1250. |
|
| 131 */ |
|
| 132 static char *gg_encoding_convert_utf8_cp1250(const char *src, int src_length, int dst_length) |
|
| 133 { |
|
| 134 char *result; |
|
| 135 int i, j, len, uc_left = 0; |
|
| 136 uint32_t uc = 0, uc_min = 0; |
|
| 137 |
|
| 138 for (i = 0, len = 0; (src[i] != 0) && (i < src_length); i++) { |
|
| 139 if ((src[i] & 0xc0) != 0x80) |
|
| 140 len++; |
|
| 141 } |
|
| 142 |
|
| 143 if ((dst_length != -1) && (len > dst_length)) |
|
| 144 len = dst_length; |
|
| 145 |
|
| 146 result = malloc(len + 1); |
|
| 147 |
|
| 148 if (result == NULL) |
|
| 149 return NULL; |
|
| 150 |
|
| 151 for (i = 0, j = 0; (src[i] != 0) && (i < src_length) && (j < len); i++) { |
|
| 152 if ((unsigned char) src[i] >= 0xf5) { |
|
| 153 if (uc_left != 0) |
|
| 154 result[j++] = '?'; |
|
| 155 /* Restricted sequences */ |
|
| 156 result[j++] = '?'; |
|
| 157 uc_left = 0; |
|
| 158 } else if ((src[i] & 0xf8) == 0xf0) { |
|
| 159 if (uc_left != 0) |
|
| 160 result[j++] = '?'; |
|
| 161 uc = src[i] & 0x07; |
|
| 162 uc_left = 3; |
|
| 163 uc_min = 0x10000; |
|
| 164 } else if ((src[i] & 0xf0) == 0xe0) { |
|
| 165 if (uc_left != 0) |
|
| 166 result[j++] = '?'; |
|
| 167 uc = src[i] & 0x0f; |
|
| 168 uc_left = 2; |
|
| 169 uc_min = 0x800; |
|
| 170 } else if ((src[i] & 0xe0) == 0xc0) { |
|
| 171 if (uc_left != 0) |
|
| 172 result[j++] = '?'; |
|
| 173 uc = src[i] & 0x1f; |
|
| 174 uc_left = 1; |
|
| 175 uc_min = 0x80; |
|
| 176 } else if ((src[i] & 0xc0) == 0x80) { |
|
| 177 if (uc_left > 0) { |
|
| 178 uc <<= 6; |
|
| 179 uc |= src[i] & 0x3f; |
|
| 180 uc_left--; |
|
| 181 |
|
| 182 if (uc_left == 0) { |
|
| 183 int valid = 0; |
|
| 184 int k; |
|
| 185 |
|
| 186 if (uc >= uc_min) { |
|
| 187 for (k = 0; k < 128; k++) { |
|
| 188 if (uc == table_cp1250[k]) { |
|
| 189 result[j++] = k + 128; |
|
| 190 valid = 1; |
|
| 191 break; |
|
| 192 } |
|
| 193 } |
|
| 194 } |
|
| 195 |
|
| 196 if (!valid && uc != 0xfeff) /* Byte Order Mark */ |
|
| 197 result[j++] = '?'; |
|
| 198 } |
|
| 199 } |
|
| 200 } else { |
|
| 201 if (uc_left != 0) { |
|
| 202 result[j++] = '?'; |
|
| 203 uc_left = 0; |
|
| 204 } |
|
| 205 result[j++] = src[i]; |
|
| 206 } |
|
| 207 } |
|
| 208 |
|
| 209 if ((uc_left != 0) && (src[i] == 0)) |
|
| 210 result[j++] = '?'; |
|
| 211 |
|
| 212 result[j] = 0; |
|
| 213 |
|
| 214 return result; |
|
| 215 } |
|
| 216 |
|
| 217 /** |
|
| 218 * \internal Zamienia kodowanie tekstu. |
|
| 219 * |
|
| 220 * \param src Tekst źródłowy. |
|
| 221 * \param src_encoding Kodowanie tekstu źródłowego. |
|
| 222 * \param dst_encoding Kodowanie tekstu docelowego. |
|
| 223 * \param src_length Długość ciągu źródłowego w bajtach (jeśli -1, zostanie obliczona na podstawie zawartości \p src). |
|
| 224 * \param dst_length Długość ciągu docelowego w bajtach (jeśli -1, nieograniczona). |
|
| 225 * |
|
| 226 * \return Zaalokowany bufor z tekstem w kodowaniu docelowym. |
|
| 227 */ |
|
| 228 char *gg_encoding_convert(const char *src, gg_encoding_t src_encoding, |
|
| 229 gg_encoding_t dst_encoding, int src_length, int dst_length) |
|
| 230 { |
|
| 231 char *result; |
|
| 232 |
|
| 233 if (src == NULL) { |
|
| 234 errno = EINVAL; |
|
| 235 return NULL; |
|
| 236 } |
|
| 237 |
|
| 238 /* specjalny przypadek obsługiwany ekspresowo */ |
|
| 239 if ((dst_encoding == src_encoding) && (dst_length == -1) && (src_length == -1)) |
|
| 240 return strdup(src); |
|
| 241 |
|
| 242 if (src_length == -1) |
|
| 243 src_length = strlen(src); |
|
| 244 |
|
| 245 if (dst_encoding == src_encoding) { |
|
| 246 int len; |
|
| 247 |
|
| 248 if (dst_length == -1) |
|
| 249 len = src_length; |
|
| 250 else |
|
| 251 len = (src_length < dst_length) ? src_length : dst_length; |
|
| 252 |
|
| 253 result = malloc(len + 1); |
|
| 254 |
|
| 255 if (result == NULL) |
|
| 256 return NULL; |
|
| 257 |
|
| 258 strncpy(result, src, len); |
|
| 259 result[len] = 0; |
|
| 260 |
|
| 261 return result; |
|
| 262 } |
|
| 263 |
|
| 264 if (dst_encoding == GG_ENCODING_CP1250 && src_encoding == GG_ENCODING_UTF8) |
|
| 265 return gg_encoding_convert_utf8_cp1250(src, src_length, dst_length); |
|
| 266 |
|
| 267 if (dst_encoding == GG_ENCODING_UTF8 && src_encoding == GG_ENCODING_CP1250) |
|
| 268 return gg_encoding_convert_cp1250_utf8(src, src_length, dst_length); |
|
| 269 |
|
| 270 errno = EINVAL; |
|
| 271 return NULL; |
|
| 272 } |
|