| |
1 /* |
| |
2 * Purple - Internet Messaging Library |
| |
3 * Copyright (C) Pidgin Developers <devel@pidgin.im> |
| |
4 * |
| |
5 * Purple is the legal property of its developers, whose names are too numerous |
| |
6 * to list here. Please refer to the COPYRIGHT file distributed with this |
| |
7 * source distribution. |
| |
8 * |
| |
9 * This program is free software; you can redistribute it and/or modify |
| |
10 * it under the terms of the GNU General Public License as published by |
| |
11 * the Free Software Foundation; either version 2 of the License, or |
| |
12 * (at your option) any later version. |
| |
13 * |
| |
14 * This program is distributed in the hope that it will be useful, |
| |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| |
17 * GNU General Public License for more details. |
| |
18 * |
| |
19 * You should have received a copy of the GNU General Public License |
| |
20 * along with this program; if not, see <https://www.gnu.org/licenses/>. |
| |
21 */ |
| |
22 |
| |
23 #include "purplemarkup.h" |
| |
24 |
| |
25 #include "util.h" |
| |
26 |
| |
27 /* |
| |
28 * This function is stolen from glib's gmarkup.c and modified to not |
| |
29 * replace ' with ' |
| |
30 */ |
| |
31 static void append_escaped_text(GString *str, |
| |
32 const gchar *text, gssize length) |
| |
33 { |
| |
34 const gchar *p; |
| |
35 const gchar *end; |
| |
36 gunichar c; |
| |
37 |
| |
38 p = text; |
| |
39 end = text + length; |
| |
40 |
| |
41 while (p != end) |
| |
42 { |
| |
43 const gchar *next; |
| |
44 next = g_utf8_next_char (p); |
| |
45 |
| |
46 switch (*p) |
| |
47 { |
| |
48 case '&': |
| |
49 g_string_append (str, "&"); |
| |
50 break; |
| |
51 |
| |
52 case '<': |
| |
53 g_string_append (str, "<"); |
| |
54 break; |
| |
55 |
| |
56 case '>': |
| |
57 g_string_append (str, ">"); |
| |
58 break; |
| |
59 |
| |
60 case '"': |
| |
61 g_string_append (str, """); |
| |
62 break; |
| |
63 |
| |
64 default: |
| |
65 c = g_utf8_get_char (p); |
| |
66 if ((0x1 <= c && c <= 0x8) || |
| |
67 (0xb <= c && c <= 0xc) || |
| |
68 (0xe <= c && c <= 0x1f) || |
| |
69 (0x7f <= c && c <= 0x84) || |
| |
70 (0x86 <= c && c <= 0x9f)) |
| |
71 g_string_append_printf (str, "&#x%x;", c); |
| |
72 else |
| |
73 g_string_append_len (str, p, next - p); |
| |
74 break; |
| |
75 } |
| |
76 |
| |
77 p = next; |
| |
78 } |
| |
79 } |
| |
80 |
| |
81 /* This function is stolen from glib's gmarkup.c */ |
| |
82 gchar *purple_markup_escape_text(const gchar *text, gssize length) |
| |
83 { |
| |
84 GString *str; |
| |
85 |
| |
86 g_return_val_if_fail(text != NULL, NULL); |
| |
87 |
| |
88 if (length < 0) |
| |
89 length = strlen(text); |
| |
90 |
| |
91 /* prealloc at least as long as original text */ |
| |
92 str = g_string_sized_new(length); |
| |
93 append_escaped_text(str, text, length); |
| |
94 |
| |
95 return g_string_free(str, FALSE); |
| |
96 } |
| |
97 |
| |
98 const char * |
| |
99 purple_markup_unescape_entity(const char *text, int *length) |
| |
100 { |
| |
101 const char *pln; |
| |
102 int len; |
| |
103 |
| |
104 if (!text || *text != '&') |
| |
105 return NULL; |
| |
106 |
| |
107 #define IS_ENTITY(s) (!g_ascii_strncasecmp(text, s, (len = sizeof(s) - 1))) |
| |
108 |
| |
109 if(IS_ENTITY("&")) |
| |
110 pln = "&"; |
| |
111 else if(IS_ENTITY("<")) |
| |
112 pln = "<"; |
| |
113 else if(IS_ENTITY(">")) |
| |
114 pln = ">"; |
| |
115 else if(IS_ENTITY(" ")) |
| |
116 pln = " "; |
| |
117 else if(IS_ENTITY("©")) |
| |
118 pln = "\302\251"; /* or use g_unichar_to_utf8(0xa9); */ |
| |
119 else if(IS_ENTITY(""")) |
| |
120 pln = "\""; |
| |
121 else if(IS_ENTITY("®")) |
| |
122 pln = "\302\256"; /* or use g_unichar_to_utf8(0xae); */ |
| |
123 else if(IS_ENTITY("'")) |
| |
124 pln = "\'"; |
| |
125 else if(text[1] == '#' && (g_ascii_isxdigit(text[2]) || text[2] == 'x')) { |
| |
126 static char buf[7]; |
| |
127 const char *start = text + 2; |
| |
128 char *end; |
| |
129 guint64 pound; |
| |
130 int base = 10; |
| |
131 int buflen; |
| |
132 |
| |
133 if (*start == 'x') { |
| |
134 base = 16; |
| |
135 start++; |
| |
136 } |
| |
137 |
| |
138 pound = g_ascii_strtoull(start, &end, base); |
| |
139 if (pound == 0 || pound > INT_MAX || *end != ';') { |
| |
140 return NULL; |
| |
141 } |
| |
142 |
| |
143 len = (end - text) + 1; |
| |
144 |
| |
145 buflen = g_unichar_to_utf8((gunichar)pound, buf); |
| |
146 buf[buflen] = '\0'; |
| |
147 pln = buf; |
| |
148 } |
| |
149 else |
| |
150 return NULL; |
| |
151 |
| |
152 if (length) |
| |
153 *length = len; |
| |
154 return pln; |
| |
155 } |
| |
156 |
| |
157 char * |
| |
158 purple_markup_get_css_property(const gchar *style, |
| |
159 const gchar *opt) |
| |
160 { |
| |
161 const gchar *css_str = style; |
| |
162 const gchar *css_value_start; |
| |
163 const gchar *css_value_end; |
| |
164 gchar *tmp; |
| |
165 gchar *ret; |
| |
166 |
| |
167 g_return_val_if_fail(opt != NULL, NULL); |
| |
168 |
| |
169 if (!css_str) |
| |
170 return NULL; |
| |
171 |
| |
172 /* find the CSS property */ |
| |
173 while (1) |
| |
174 { |
| |
175 /* skip whitespace characters */ |
| |
176 while (*css_str && g_ascii_isspace(*css_str)) |
| |
177 css_str++; |
| |
178 if (!g_ascii_isalpha(*css_str)) |
| |
179 return NULL; |
| |
180 if (g_ascii_strncasecmp(css_str, opt, strlen(opt))) |
| |
181 { |
| |
182 /* go to next css property positioned after the next ';' */ |
| |
183 while (*css_str && *css_str != '"' && *css_str != ';') |
| |
184 css_str++; |
| |
185 if(*css_str != ';') |
| |
186 return NULL; |
| |
187 css_str++; |
| |
188 } |
| |
189 else |
| |
190 break; |
| |
191 } |
| |
192 |
| |
193 /* find the CSS value position in the string */ |
| |
194 css_str += strlen(opt); |
| |
195 while (*css_str && g_ascii_isspace(*css_str)) |
| |
196 css_str++; |
| |
197 if (*css_str != ':') |
| |
198 return NULL; |
| |
199 css_str++; |
| |
200 while (*css_str && g_ascii_isspace(*css_str)) |
| |
201 css_str++; |
| |
202 if (*css_str == '\0' || *css_str == '"' || *css_str == ';') |
| |
203 return NULL; |
| |
204 |
| |
205 /* mark the CSS value */ |
| |
206 css_value_start = css_str; |
| |
207 while (*css_str && *css_str != '"' && *css_str != ';') |
| |
208 css_str++; |
| |
209 css_value_end = css_str - 1; |
| |
210 |
| |
211 /* Removes trailing whitespace */ |
| |
212 while (css_value_end > css_value_start && g_ascii_isspace(*css_value_end)) |
| |
213 css_value_end--; |
| |
214 |
| |
215 tmp = g_strndup(css_value_start, css_value_end - css_value_start + 1); |
| |
216 ret = purple_unescape_html(tmp); |
| |
217 g_free(tmp); |
| |
218 |
| |
219 return ret; |
| |
220 } |
| |
221 |
| |
222 gboolean purple_markup_is_rtl(const char *html) |
| |
223 { |
| |
224 GData *attributes; |
| |
225 const gchar *start, *end; |
| |
226 gboolean res = FALSE; |
| |
227 |
| |
228 if (purple_markup_find_tag("span", html, &start, &end, &attributes)) |
| |
229 { |
| |
230 /* tmp is a member of attributes and is free with g_datalist_clear call */ |
| |
231 const char *tmp = g_datalist_get_data(&attributes, "dir"); |
| |
232 if (tmp && !g_ascii_strcasecmp(tmp, "RTL")) |
| |
233 res = TRUE; |
| |
234 if (!res) |
| |
235 { |
| |
236 tmp = g_datalist_get_data(&attributes, "style"); |
| |
237 if (tmp) |
| |
238 { |
| |
239 char *tmp2 = purple_markup_get_css_property(tmp, "direction"); |
| |
240 if (tmp2 && !g_ascii_strcasecmp(tmp2, "RTL")) |
| |
241 res = TRUE; |
| |
242 g_free(tmp2); |
| |
243 } |
| |
244 |
| |
245 } |
| |
246 g_datalist_clear(&attributes); |
| |
247 } |
| |
248 return res; |
| |
249 } |
| |
250 |
| |
251 gboolean |
| |
252 purple_markup_find_tag(const char *needle, const char *haystack, |
| |
253 const char **start, const char **end, GData **attributes) |
| |
254 { |
| |
255 GData *attribs; |
| |
256 const char *cur = haystack; |
| |
257 char *name = NULL; |
| |
258 gboolean found = FALSE; |
| |
259 gboolean in_tag = FALSE; |
| |
260 gboolean in_attr = FALSE; |
| |
261 const char *in_quotes = NULL; |
| |
262 size_t needlelen; |
| |
263 |
| |
264 g_return_val_if_fail( needle != NULL, FALSE); |
| |
265 g_return_val_if_fail( *needle != '\0', FALSE); |
| |
266 g_return_val_if_fail( haystack != NULL, FALSE); |
| |
267 g_return_val_if_fail( start != NULL, FALSE); |
| |
268 g_return_val_if_fail( end != NULL, FALSE); |
| |
269 g_return_val_if_fail(attributes != NULL, FALSE); |
| |
270 |
| |
271 needlelen = strlen(needle); |
| |
272 g_datalist_init(&attribs); |
| |
273 |
| |
274 while (*cur && !found) { |
| |
275 if (in_tag) { |
| |
276 if (in_quotes) { |
| |
277 const char *close = cur; |
| |
278 |
| |
279 while (*close && *close != *in_quotes) |
| |
280 close++; |
| |
281 |
| |
282 /* if we got the close quote, store the value and carry on from * |
| |
283 * after it. if we ran to the end of the string, point to the NULL * |
| |
284 * and we're outta here */ |
| |
285 if (*close) { |
| |
286 /* only store a value if we have an attribute name */ |
| |
287 if (name) { |
| |
288 size_t len = close - cur; |
| |
289 char *val = g_strndup(cur, len); |
| |
290 |
| |
291 g_datalist_set_data_full(&attribs, name, val, g_free); |
| |
292 g_free(name); |
| |
293 name = NULL; |
| |
294 } |
| |
295 |
| |
296 in_quotes = NULL; |
| |
297 cur = close + 1; |
| |
298 } else { |
| |
299 cur = close; |
| |
300 } |
| |
301 } else if (in_attr) { |
| |
302 const char *close = cur; |
| |
303 |
| |
304 while (*close && *close != '>' && *close != '"' && |
| |
305 *close != '\'' && *close != ' ' && *close != '=') |
| |
306 close++; |
| |
307 |
| |
308 /* if we got the equals, store the name of the attribute. if we got |
| |
309 * the quote, save the attribute and go straight to quote mode. |
| |
310 * otherwise the tag closed or we reached the end of the string, |
| |
311 * so we can get outta here */ |
| |
312 switch (*close) { |
| |
313 case '"': |
| |
314 case '\'': |
| |
315 in_quotes = close; |
| |
316 /* fall through */ |
| |
317 case '=': |
| |
318 { |
| |
319 size_t len = close - cur; |
| |
320 |
| |
321 /* don't store a blank attribute name */ |
| |
322 if (len) { |
| |
323 g_free(name); |
| |
324 name = g_ascii_strdown(cur, len); |
| |
325 } |
| |
326 |
| |
327 in_attr = FALSE; |
| |
328 cur = close + 1; |
| |
329 } |
| |
330 break; |
| |
331 case ' ': |
| |
332 case '>': |
| |
333 in_attr = FALSE; |
| |
334 /* fall through */ |
| |
335 default: |
| |
336 cur = close; |
| |
337 break; |
| |
338 } |
| |
339 } else { |
| |
340 switch (*cur) { |
| |
341 case ' ': |
| |
342 /* swallow extra spaces inside tag */ |
| |
343 while (*cur && *cur == ' ') cur++; |
| |
344 in_attr = TRUE; |
| |
345 break; |
| |
346 case '>': |
| |
347 found = TRUE; |
| |
348 *end = cur; |
| |
349 break; |
| |
350 case '"': |
| |
351 case '\'': |
| |
352 in_quotes = cur; |
| |
353 /* fall through */ |
| |
354 default: |
| |
355 cur++; |
| |
356 break; |
| |
357 } |
| |
358 } |
| |
359 } else { |
| |
360 /* if we hit a < followed by the name of our tag... */ |
| |
361 if (*cur == '<' && !g_ascii_strncasecmp(cur + 1, needle, needlelen)) { |
| |
362 *start = cur; |
| |
363 cur = cur + needlelen + 1; |
| |
364 |
| |
365 /* if we're pointing at a space or a >, we found the right tag. if * |
| |
366 * we're not, we've found a longer tag, so we need to skip to the * |
| |
367 * >, but not being distracted by >s inside quotes. */ |
| |
368 if (*cur == ' ' || *cur == '>') { |
| |
369 in_tag = TRUE; |
| |
370 } else { |
| |
371 while (*cur && *cur != '"' && *cur != '\'' && *cur != '>') { |
| |
372 if (*cur == '"') { |
| |
373 cur++; |
| |
374 while (*cur && *cur != '"') |
| |
375 cur++; |
| |
376 } else if (*cur == '\'') { |
| |
377 cur++; |
| |
378 while (*cur && *cur != '\'') |
| |
379 cur++; |
| |
380 } else { |
| |
381 cur++; |
| |
382 } |
| |
383 } |
| |
384 } |
| |
385 } else { |
| |
386 cur++; |
| |
387 } |
| |
388 } |
| |
389 } |
| |
390 |
| |
391 /* clean up any attribute name from a premature termination */ |
| |
392 g_free(name); |
| |
393 |
| |
394 if (found) { |
| |
395 *attributes = attribs; |
| |
396 } else { |
| |
397 *start = NULL; |
| |
398 *end = NULL; |
| |
399 *attributes = NULL; |
| |
400 } |
| |
401 |
| |
402 return found; |
| |
403 } |
| |
404 |
| |
/* Record of one tag opened while converting HTML in
 * purple_markup_html_to_xhtml(); kept in a list until the tag is closed. */
struct purple_parse_tag {
	char *src_tag;   /* tag name as matched in the input; compared against closing tags */
	char *dest_tag;  /* tag name written in the emitted closing tag */
	gboolean ignore; /* when TRUE, no closing tag is emitted for this entry */
};
| |
410 |
| |
/* NOTE: Do not put `do {} while(0)` around this macro (as this is the method
         recommended in the GCC docs). It contains 'continue's that should
         affect the while-loop in purple_markup_html_to_xhtml and doing the
         above would break that.
         Also, remember to put braces in constructs that require them for
         multiple statements when using this macro.

   ALLOW_TAG_ALT(x, y) recognises the opening tag <x> at the cursor 'c',
   with or without attributes, and emits it to 'xhtml' as <y>. x and y must
   be string literals. The macro expects 'c', 'xhtml', 'plain' and 'tags'
   to be in scope. Unless the tag is self-closing, a purple_parse_tag entry
   is pushed onto 'tags' so that the matching closing tag can be emitted
   later. When a tag with attributes cannot be parsed (no '>' found, or a
   stray '<' inside it), only its leading '<' is consumed and written as
   escaped text. */
#define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \
						const char *o = c + strlen("<" x); \
						const char *p = NULL, *q = NULL, *r = NULL; \
						/* o = iterating over full tag \
						 * p = > (end of tag) \
						 * q = start of quoted bit \
						 * r = < inside tag \
						 */ \
						GString *innards = g_string_new(""); \
						while(o && *o) { \
							if(!q && (*o == '\"' || *o == '\'') ) { \
								q = o; \
							} else if(q) { \
								if(*o == *q) { /* end of quoted bit */ \
									char *unescaped = g_strndup(q+1, o-q-1); \
									char *escaped = g_markup_escape_text(unescaped, -1); \
									g_string_append_printf(innards, "%c%s%c", *q, escaped, *q); \
									g_free(unescaped); \
									g_free(escaped); \
									q = NULL; \
								/* NOTE(review): c still points at the tag's '<' here, \
								 * so this test never succeeds; *o may have been \
								 * intended -- confirm before changing. */ \
								} else if(*c == '\\') { \
									o++; \
								} \
							} else if(*o == '<') { \
								r = o; \
							} else if(*o == '>') { \
								p = o; \
								break; \
							} else { \
								innards = g_string_append_c(innards, *o); \
							} \
							o++; \
						} \
						if(p && !r) { /* got an end of tag and no other < earlier */\
							if(*(p-1) != '/') { \
								struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \
								pt->src_tag = x; \
								pt->dest_tag = y; \
								tags = g_list_prepend(tags, pt); \
							} \
							if(xhtml) { \
								xhtml = g_string_append(xhtml, "<" y); \
								xhtml = g_string_append(xhtml, innards->str); \
								xhtml = g_string_append_c(xhtml, '>'); \
							} \
							c = p + 1; \
						} else { /* got end of tag with earlier < *or* didn't get anything */ \
							/* the '<' is plain text here, so it has to be \
							 * escaped in the XHTML output */ \
							if(xhtml) \
								xhtml = g_string_append(xhtml, "&lt;"); \
							if(plain) \
								plain = g_string_append_c(plain, '<'); \
							c++; \
						} \
						g_string_free(innards, TRUE); \
						continue; \
					} \
					if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \
							(*(c+strlen("<" x)) == '>' || \
							 !g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \
						if(xhtml) \
							xhtml = g_string_append(xhtml, "<" y); \
						c += strlen("<" x); \
						if(*c != '/') { \
							struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \
							pt->src_tag = x; \
							pt->dest_tag = y; \
							tags = g_list_prepend(tags, pt); \
							if(xhtml) \
								xhtml = g_string_append_c(xhtml, '>'); \
						} else { \
							if(xhtml) \
								xhtml = g_string_append(xhtml, "/>");\
						} \
						/* the condition above guarantees a '>' follows */ \
						c = strchr(c, '>') + 1; \
						continue; \
					}
/* Don't forget to check the note above for ALLOW_TAG_ALT. */
#define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x)
| |
495 void |
| |
496 purple_markup_html_to_xhtml(const char *html, char **xhtml_out, |
| |
497 char **plain_out) |
| |
498 { |
| |
499 GString *xhtml = NULL; |
| |
500 GString *plain = NULL; |
| |
501 GString *url = NULL; |
| |
502 GString *cdata = NULL; |
| |
503 GList *tags = NULL, *tag; |
| |
504 const char *c = html; |
| |
505 char quote = '\0'; |
| |
506 |
| |
507 #define CHECK_QUOTE(ptr) if (*(ptr) == '\'' || *(ptr) == '\"') \ |
| |
508 quote = *(ptr++); \ |
| |
509 else \ |
| |
510 quote = '\0'; |
| |
511 |
| |
512 #define VALID_CHAR(ptr) (*(ptr) && *(ptr) != quote && (quote || (*(ptr) != ' ' && *(ptr) != '>'))) |
| |
513 |
| |
514 g_return_if_fail(xhtml_out != NULL || plain_out != NULL); |
| |
515 |
| |
516 if(xhtml_out) |
| |
517 xhtml = g_string_new(""); |
| |
518 if(plain_out) |
| |
519 plain = g_string_new(""); |
| |
520 |
| |
521 while(c && *c) { |
| |
522 if(*c == '<') { |
| |
523 if(*(c+1) == '/') { /* closing tag */ |
| |
524 tag = tags; |
| |
525 while(tag) { |
| |
526 struct purple_parse_tag *pt = tag->data; |
| |
527 if(!g_ascii_strncasecmp((c+2), pt->src_tag, strlen(pt->src_tag)) && *(c+strlen(pt->src_tag)+2) == '>') { |
| |
528 c += strlen(pt->src_tag) + 3; |
| |
529 break; |
| |
530 } |
| |
531 tag = tag->next; |
| |
532 } |
| |
533 if(tag) { |
| |
534 while(tags) { |
| |
535 struct purple_parse_tag *pt = tags->data; |
| |
536 if(xhtml && !pt->ignore) |
| |
537 g_string_append_printf(xhtml, "</%s>", pt->dest_tag); |
| |
538 if(plain && purple_strequal(pt->src_tag, "a")) { |
| |
539 /* if this is a link, we have to add the url to the plaintext, too */ |
| |
540 if (cdata && url && |
| |
541 (!g_string_equal(cdata, url) && (g_ascii_strncasecmp(url->str, "mailto:", 7) != 0 || |
| |
542 g_utf8_collate(url->str + 7, cdata->str) != 0))) |
| |
543 g_string_append_printf(plain, " <%s>", g_strstrip(purple_unescape_html(url->str))); |
| |
544 if (cdata) { |
| |
545 g_string_free(cdata, TRUE); |
| |
546 cdata = NULL; |
| |
547 } |
| |
548 |
| |
549 } |
| |
550 if(tags == tag) |
| |
551 break; |
| |
552 tags = g_list_delete_link(tags, tags); |
| |
553 g_free(pt); |
| |
554 } |
| |
555 g_free(tag->data); |
| |
556 tags = g_list_delete_link(tags, tag); |
| |
557 } else { |
| |
558 /* a closing tag we weren't expecting... |
| |
559 * we'll let it slide, if it's really a tag...if it's |
| |
560 * just a </ we'll escape it properly */ |
| |
561 const char *end = c+2; |
| |
562 while(*end && g_ascii_isalpha(*end)) |
| |
563 end++; |
| |
564 if(*end == '>') { |
| |
565 c = end+1; |
| |
566 } else { |
| |
567 if(xhtml) |
| |
568 xhtml = g_string_append(xhtml, "<"); |
| |
569 if(plain) |
| |
570 plain = g_string_append_c(plain, '<'); |
| |
571 c++; |
| |
572 } |
| |
573 } |
| |
574 } else { /* opening tag */ |
| |
575 ALLOW_TAG("blockquote"); |
| |
576 ALLOW_TAG("cite"); |
| |
577 ALLOW_TAG("div"); |
| |
578 ALLOW_TAG("em"); |
| |
579 ALLOW_TAG("h1"); |
| |
580 ALLOW_TAG("h2"); |
| |
581 ALLOW_TAG("h3"); |
| |
582 ALLOW_TAG("h4"); |
| |
583 ALLOW_TAG("h5"); |
| |
584 ALLOW_TAG("h6"); |
| |
585 /* we only allow html to start the message */ |
| |
586 if(c == html) { |
| |
587 ALLOW_TAG("html"); |
| |
588 } |
| |
589 ALLOW_TAG_ALT("i", "em"); |
| |
590 ALLOW_TAG_ALT("italic", "em"); |
| |
591 ALLOW_TAG("li"); |
| |
592 ALLOW_TAG("ol"); |
| |
593 ALLOW_TAG("p"); |
| |
594 ALLOW_TAG("pre"); |
| |
595 ALLOW_TAG("q"); |
| |
596 ALLOW_TAG("span"); |
| |
597 ALLOW_TAG("ul"); |
| |
598 |
| |
599 |
| |
600 /* we skip <HR> because it's not legal in XHTML-IM. However, |
| |
601 * we still want to send something sensible, so we put a |
| |
602 * linebreak in its place. <BR> also needs special handling |
| |
603 * because putting a </BR> to close it would just be dumb. */ |
| |
604 if((!g_ascii_strncasecmp(c, "<br", 3) |
| |
605 || !g_ascii_strncasecmp(c, "<hr", 3)) |
| |
606 && (*(c+3) == '>' || |
| |
607 !g_ascii_strncasecmp(c+3, "/>", 2) || |
| |
608 !g_ascii_strncasecmp(c+3, " />", 3))) { |
| |
609 c = strchr(c, '>') + 1; |
| |
610 if(xhtml) |
| |
611 xhtml = g_string_append(xhtml, "<br/>"); |
| |
612 if(plain && *c != '\n') |
| |
613 plain = g_string_append_c(plain, '\n'); |
| |
614 continue; |
| |
615 } |
| |
616 if(!g_ascii_strncasecmp(c, "<b>", 3) || !g_ascii_strncasecmp(c, "<bold>", strlen("<bold>")) || !g_ascii_strncasecmp(c, "<strong>", strlen("<strong>"))) { |
| |
617 struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); |
| |
618 if (*(c+2) == '>') |
| |
619 pt->src_tag = "b"; |
| |
620 else if (*(c+2) == 'o') |
| |
621 pt->src_tag = "bold"; |
| |
622 else |
| |
623 pt->src_tag = "strong"; |
| |
624 pt->dest_tag = "span"; |
| |
625 tags = g_list_prepend(tags, pt); |
| |
626 c = strchr(c, '>') + 1; |
| |
627 if(xhtml) |
| |
628 xhtml = g_string_append(xhtml, "<span style='font-weight: bold;'>"); |
| |
629 continue; |
| |
630 } |
| |
631 if(!g_ascii_strncasecmp(c, "<u>", 3) || !g_ascii_strncasecmp(c, "<underline>", strlen("<underline>"))) { |
| |
632 struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); |
| |
633 pt->src_tag = *(c+2) == '>' ? "u" : "underline"; |
| |
634 pt->dest_tag = "span"; |
| |
635 tags = g_list_prepend(tags, pt); |
| |
636 c = strchr(c, '>') + 1; |
| |
637 if (xhtml) |
| |
638 xhtml = g_string_append(xhtml, "<span style='text-decoration: underline;'>"); |
| |
639 continue; |
| |
640 } |
| |
641 if(!g_ascii_strncasecmp(c, "<s>", 3) || !g_ascii_strncasecmp(c, "<strike>", strlen("<strike>"))) { |
| |
642 struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); |
| |
643 pt->src_tag = *(c+2) == '>' ? "s" : "strike"; |
| |
644 pt->dest_tag = "span"; |
| |
645 tags = g_list_prepend(tags, pt); |
| |
646 c = strchr(c, '>') + 1; |
| |
647 if(xhtml) |
| |
648 xhtml = g_string_append(xhtml, "<span style='text-decoration: line-through;'>"); |
| |
649 continue; |
| |
650 } |
| |
651 if(!g_ascii_strncasecmp(c, "<sub>", 5)) { |
| |
652 struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); |
| |
653 pt->src_tag = "sub"; |
| |
654 pt->dest_tag = "span"; |
| |
655 tags = g_list_prepend(tags, pt); |
| |
656 c = strchr(c, '>') + 1; |
| |
657 if(xhtml) |
| |
658 xhtml = g_string_append(xhtml, "<span style='vertical-align:sub;'>"); |
| |
659 continue; |
| |
660 } |
| |
661 if(!g_ascii_strncasecmp(c, "<sup>", 5)) { |
| |
662 struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); |
| |
663 pt->src_tag = "sup"; |
| |
664 pt->dest_tag = "span"; |
| |
665 tags = g_list_prepend(tags, pt); |
| |
666 c = strchr(c, '>') + 1; |
| |
667 if(xhtml) |
| |
668 xhtml = g_string_append(xhtml, "<span style='vertical-align:super;'>"); |
| |
669 continue; |
| |
670 } |
| |
671 if (!g_ascii_strncasecmp(c, "<img", 4) && (*(c+4) == '>' || *(c+4) == ' ')) { |
| |
672 const char *p = c + 4; |
| |
673 GString *src = NULL, *alt = NULL; |
| |
674 #define ESCAPE(from, to) \ |
| |
675 CHECK_QUOTE(from); \ |
| |
676 while (VALID_CHAR(from)) { \ |
| |
677 int len; \ |
| |
678 if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \ |
| |
679 to = g_string_append(to, "&"); \ |
| |
680 else if (*from == '\'') \ |
| |
681 to = g_string_append(to, "'"); \ |
| |
682 else \ |
| |
683 to = g_string_append_c(to, *from); \ |
| |
684 from++; \ |
| |
685 } |
| |
686 |
| |
687 while (*p && *p != '>') { |
| |
688 if (!g_ascii_strncasecmp(p, "src=", 4)) { |
| |
689 const char *q = p + 4; |
| |
690 if (src) |
| |
691 g_string_free(src, TRUE); |
| |
692 src = g_string_new(""); |
| |
693 ESCAPE(q, src); |
| |
694 p = q; |
| |
695 } else if (!g_ascii_strncasecmp(p, "alt=", 4)) { |
| |
696 const char *q = p + 4; |
| |
697 if (alt) |
| |
698 g_string_free(alt, TRUE); |
| |
699 alt = g_string_new(""); |
| |
700 ESCAPE(q, alt); |
| |
701 p = q; |
| |
702 } else { |
| |
703 p++; |
| |
704 } |
| |
705 } |
| |
706 #undef ESCAPE |
| |
707 if ((c = strchr(p, '>')) != NULL) |
| |
708 c++; |
| |
709 else |
| |
710 c = p; |
| |
711 /* src and alt are required! */ |
| |
712 if(src && xhtml) |
| |
713 g_string_append_printf(xhtml, "<img src='%s' alt='%s' />", g_strstrip(src->str), alt ? alt->str : ""); |
| |
714 if(alt) { |
| |
715 if(plain) |
| |
716 plain = g_string_append(plain, purple_unescape_html(alt->str)); |
| |
717 if(!src && xhtml) |
| |
718 xhtml = g_string_append(xhtml, alt->str); |
| |
719 g_string_free(alt, TRUE); |
| |
720 } |
| |
721 g_string_free(src, TRUE); |
| |
722 continue; |
| |
723 } |
| |
724 if (!g_ascii_strncasecmp(c, "<a", 2) && (*(c+2) == '>' || *(c+2) == ' ')) { |
| |
725 const char *p = c + 2; |
| |
726 struct purple_parse_tag *pt; |
| |
727 while (*p && *p != '>') { |
| |
728 if (!g_ascii_strncasecmp(p, "href=", 5)) { |
| |
729 const char *q = p + 5; |
| |
730 if (url) |
| |
731 g_string_free(url, TRUE); |
| |
732 url = g_string_new(""); |
| |
733 if (cdata) |
| |
734 g_string_free(cdata, TRUE); |
| |
735 cdata = g_string_new(""); |
| |
736 CHECK_QUOTE(q); |
| |
737 while (VALID_CHAR(q)) { |
| |
738 int len; |
| |
739 if ((*q == '&') && (purple_markup_unescape_entity(q, &len) == NULL)) |
| |
740 url = g_string_append(url, "&"); |
| |
741 else if (*q == '"') |
| |
742 url = g_string_append(url, """); |
| |
743 else |
| |
744 url = g_string_append_c(url, *q); |
| |
745 q++; |
| |
746 } |
| |
747 p = q; |
| |
748 } else { |
| |
749 p++; |
| |
750 } |
| |
751 } |
| |
752 if ((c = strchr(p, '>')) != NULL) |
| |
753 c++; |
| |
754 else |
| |
755 c = p; |
| |
756 pt = g_new0(struct purple_parse_tag, 1); |
| |
757 pt->src_tag = "a"; |
| |
758 pt->dest_tag = "a"; |
| |
759 tags = g_list_prepend(tags, pt); |
| |
760 if(xhtml) |
| |
761 g_string_append_printf(xhtml, "<a href=\"%s\">", url ? g_strstrip(url->str) : ""); |
| |
762 continue; |
| |
763 } |
| |
764 #define ESCAPE(from, to) \ |
| |
765 CHECK_QUOTE(from); \ |
| |
766 while (VALID_CHAR(from)) { \ |
| |
767 int len; \ |
| |
768 if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \ |
| |
769 to = g_string_append(to, "&"); \ |
| |
770 else if (*from == '\'') \ |
| |
771 to = g_string_append_c(to, '\"'); \ |
| |
772 else \ |
| |
773 to = g_string_append_c(to, *from); \ |
| |
774 from++; \ |
| |
775 } |
| |
776 if(!g_ascii_strncasecmp(c, "<font", 5) && (*(c+5) == '>' || *(c+5) == ' ')) { |
| |
777 const char *p = c + 5; |
| |
778 GString *style = g_string_new(""); |
| |
779 struct purple_parse_tag *pt; |
| |
780 while (*p && *p != '>') { |
| |
781 if (!g_ascii_strncasecmp(p, "back=", 5)) { |
| |
782 const char *q = p + 5; |
| |
783 GString *color = g_string_new(""); |
| |
784 ESCAPE(q, color); |
| |
785 g_string_append_printf(style, "background: %s; ", color->str); |
| |
786 g_string_free(color, TRUE); |
| |
787 p = q; |
| |
788 } else if (!g_ascii_strncasecmp(p, "color=", 6)) { |
| |
789 const char *q = p + 6; |
| |
790 GString *color = g_string_new(""); |
| |
791 ESCAPE(q, color); |
| |
792 g_string_append_printf(style, "color: %s; ", color->str); |
| |
793 g_string_free(color, TRUE); |
| |
794 p = q; |
| |
795 } else if (!g_ascii_strncasecmp(p, "face=", 5)) { |
| |
796 const char *q = p + 5; |
| |
797 GString *face = g_string_new(""); |
| |
798 ESCAPE(q, face); |
| |
799 g_string_append_printf(style, "font-family: %s; ", g_strstrip(face->str)); |
| |
800 g_string_free(face, TRUE); |
| |
801 p = q; |
| |
802 } else if (!g_ascii_strncasecmp(p, "size=", 5)) { |
| |
803 const char *q = p + 5; |
| |
804 int sz; |
| |
805 const char *size = "medium"; |
| |
806 CHECK_QUOTE(q); |
| |
807 sz = atoi(q); |
| |
808 switch (sz) |
| |
809 { |
| |
810 case 1: |
| |
811 size = "xx-small"; |
| |
812 break; |
| |
813 case 2: |
| |
814 size = "small"; |
| |
815 break; |
| |
816 case 3: |
| |
817 size = "medium"; |
| |
818 break; |
| |
819 case 4: |
| |
820 size = "large"; |
| |
821 break; |
| |
822 case 5: |
| |
823 size = "x-large"; |
| |
824 break; |
| |
825 case 6: |
| |
826 case 7: |
| |
827 size = "xx-large"; |
| |
828 break; |
| |
829 default: |
| |
830 break; |
| |
831 } |
| |
832 g_string_append_printf(style, "font-size: %s; ", size); |
| |
833 p = q; |
| |
834 } else { |
| |
835 p++; |
| |
836 } |
| |
837 } |
| |
838 if ((c = strchr(p, '>')) != NULL) |
| |
839 c++; |
| |
840 else |
| |
841 c = p; |
| |
842 pt = g_new0(struct purple_parse_tag, 1); |
| |
843 pt->src_tag = "font"; |
| |
844 pt->dest_tag = "span"; |
| |
845 tags = g_list_prepend(tags, pt); |
| |
846 if(style->len && xhtml) |
| |
847 g_string_append_printf(xhtml, "<span style='%s'>", g_strstrip(style->str)); |
| |
848 else |
| |
849 pt->ignore = TRUE; |
| |
850 g_string_free(style, TRUE); |
| |
851 continue; |
| |
852 } |
| |
853 #undef ESCAPE |
| |
854 if (!g_ascii_strncasecmp(c, "<body ", 6)) { |
| |
855 const char *p = c + 6; |
| |
856 gboolean did_something = FALSE; |
| |
857 while (*p && *p != '>') { |
| |
858 if (!g_ascii_strncasecmp(p, "bgcolor=", 8)) { |
| |
859 const char *q = p + 8; |
| |
860 struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); |
| |
861 GString *color = g_string_new(""); |
| |
862 CHECK_QUOTE(q); |
| |
863 while (VALID_CHAR(q)) { |
| |
864 color = g_string_append_c(color, *q); |
| |
865 q++; |
| |
866 } |
| |
867 if (xhtml) |
| |
868 g_string_append_printf(xhtml, "<span style='background: %s;'>", g_strstrip(color->str)); |
| |
869 g_string_free(color, TRUE); |
| |
870 if ((c = strchr(p, '>')) != NULL) |
| |
871 c++; |
| |
872 else |
| |
873 c = p; |
| |
874 pt->src_tag = "body"; |
| |
875 pt->dest_tag = "span"; |
| |
876 tags = g_list_prepend(tags, pt); |
| |
877 did_something = TRUE; |
| |
878 break; |
| |
879 } |
| |
880 p++; |
| |
881 } |
| |
882 if (did_something) continue; |
| |
883 } |
| |
884 /* this has to come after the special case for bgcolor */ |
| |
885 ALLOW_TAG("body"); |
| |
886 if(!g_ascii_strncasecmp(c, "<!--", strlen("<!--"))) { |
| |
887 char *p = strstr(c + strlen("<!--"), "-->"); |
| |
888 if(p) { |
| |
889 if(xhtml) |
| |
890 xhtml = g_string_append(xhtml, "<!--"); |
| |
891 c += strlen("<!--"); |
| |
892 continue; |
| |
893 } |
| |
894 } |
| |
895 |
| |
896 if(xhtml) |
| |
897 xhtml = g_string_append(xhtml, "<"); |
| |
898 if(plain) |
| |
899 plain = g_string_append_c(plain, '<'); |
| |
900 c++; |
| |
901 } |
| |
902 } else if(*c == '&') { |
| |
903 char buf[7]; |
| |
904 const char *pln; |
| |
905 int len; |
| |
906 |
| |
907 if ((pln = purple_markup_unescape_entity(c, &len)) == NULL) { |
| |
908 len = 1; |
| |
909 g_snprintf(buf, sizeof(buf), "%c", *c); |
| |
910 pln = buf; |
| |
911 } |
| |
912 if(xhtml) |
| |
913 xhtml = g_string_append_len(xhtml, c, len); |
| |
914 if(plain) |
| |
915 plain = g_string_append(plain, pln); |
| |
916 if(cdata) |
| |
917 cdata = g_string_append_len(cdata, c, len); |
| |
918 c += len; |
| |
919 } else { |
| |
920 if(xhtml) |
| |
921 xhtml = g_string_append_c(xhtml, *c); |
| |
922 if(plain) |
| |
923 plain = g_string_append_c(plain, *c); |
| |
924 if(cdata) |
| |
925 cdata = g_string_append_c(cdata, *c); |
| |
926 c++; |
| |
927 } |
| |
928 } |
| |
929 if(xhtml) { |
| |
930 for (tag = tags; tag ; tag = tag->next) { |
| |
931 struct purple_parse_tag *pt = tag->data; |
| |
932 if(!pt->ignore) |
| |
933 g_string_append_printf(xhtml, "</%s>", pt->dest_tag); |
| |
934 } |
| |
935 } |
| |
936 g_list_free(tags); |
| |
937 if(xhtml_out) |
| |
938 *xhtml_out = g_string_free(xhtml, FALSE); |
| |
939 if(plain_out) |
| |
940 *plain_out = g_string_free(plain, FALSE); |
| |
941 if(url) |
| |
942 g_string_free(url, TRUE); |
| |
943 if (cdata) |
| |
944 g_string_free(cdata, TRUE); |
| |
945 #undef CHECK_QUOTE |
| |
946 #undef VALID_CHAR |
| |
947 } |
| |
948 |
| |
949 /* The following are probably reasonable changes: |
| |
950 * - \n should be converted to a normal space |
| |
951 * - in addition to <br>, <p> and <div> etc. should also be converted into \n |
| |
952 * - We want to turn </td>#whitespace<td> sequences into a single tab |
| |
953 * - We want to turn </tr>#whitespace<tr> sequences into a single \n |
| |
954 * - <script>...</script> and <style>...</style> should be completely removed |
| |
955 */ |
| |
956 |
| |
957 char * |
| |
958 purple_markup_strip_html(const char *str) |
| |
959 { |
| |
960 int i, j, k, entlen; |
| |
961 gboolean visible = TRUE; |
| |
962 gboolean closing_td_p = FALSE; |
| |
963 gchar *str2; |
| |
964 const gchar *cdata_close_tag = NULL, *ent; |
| |
965 gchar *href = NULL; |
| |
966 int href_st = 0; |
| |
967 |
| |
968 if(!str) |
| |
969 return NULL; |
| |
970 |
| |
971 str2 = g_strdup(str); |
| |
972 |
| |
973 for (i = 0, j = 0; str2[i]; i++) |
| |
974 { |
| |
975 if (str2[i] == '<') |
| |
976 { |
| |
977 if (cdata_close_tag) |
| |
978 { |
| |
979 /* Note: Don't even assume any other tag is a tag in CDATA */ |
| |
980 if (g_ascii_strncasecmp(str2 + i, cdata_close_tag, |
| |
981 strlen(cdata_close_tag)) == 0) |
| |
982 { |
| |
983 i += strlen(cdata_close_tag) - 1; |
| |
984 cdata_close_tag = NULL; |
| |
985 } |
| |
986 continue; |
| |
987 } |
| |
988 else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p) |
| |
989 { |
| |
990 str2[j++] = '\t'; |
| |
991 visible = TRUE; |
| |
992 } |
| |
993 else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0) |
| |
994 { |
| |
995 closing_td_p = TRUE; |
| |
996 visible = FALSE; |
| |
997 } |
| |
998 else |
| |
999 { |
| |
1000 closing_td_p = FALSE; |
| |
1001 visible = TRUE; |
| |
1002 } |
| |
1003 |
| |
1004 k = i + 1; |
| |
1005 |
| |
1006 if(g_ascii_isspace(str2[k])) |
| |
1007 visible = TRUE; |
| |
1008 else if (str2[k]) |
| |
1009 { |
| |
1010 /* Scan until we end the tag either implicitly (closed start |
| |
1011 * tag) or explicitly, using a sloppy method (i.e., < or > |
| |
1012 * inside quoted attributes will screw us up) |
| |
1013 */ |
| |
1014 while (str2[k] && str2[k] != '<' && str2[k] != '>') |
| |
1015 { |
| |
1016 k++; |
| |
1017 } |
| |
1018 |
| |
1019 /* If we've got an <a> tag with an href, save the address |
| |
1020 * to print later. */ |
| |
1021 if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 && |
| |
1022 g_ascii_isspace(str2[i+2])) |
| |
1023 { |
| |
1024 int st; /* start of href, inclusive [ */ |
| |
1025 int end; /* end of href, exclusive ) */ |
| |
1026 char delim = ' '; |
| |
1027 /* Find start of href */ |
| |
1028 for (st = i + 3; st < k; st++) |
| |
1029 { |
| |
1030 if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0) |
| |
1031 { |
| |
1032 st += 5; |
| |
1033 if (str2[st] == '"' || str2[st] == '\'') |
| |
1034 { |
| |
1035 delim = str2[st]; |
| |
1036 st++; |
| |
1037 } |
| |
1038 break; |
| |
1039 } |
| |
1040 } |
| |
1041 /* find end of address */ |
| |
1042 for (end = st; end < k && str2[end] != delim; end++) |
| |
1043 { |
| |
1044 /* All the work is done in the loop construct above. */ |
| |
1045 } |
| |
1046 |
| |
1047 /* If there's an address, save it. If there was |
| |
1048 * already one saved, kill it. */ |
| |
1049 if (st < k) |
| |
1050 { |
| |
1051 char *tmp; |
| |
1052 g_free(href); |
| |
1053 tmp = g_strndup(str2 + st, end - st); |
| |
1054 href = purple_unescape_html(tmp); |
| |
1055 g_free(tmp); |
| |
1056 href_st = j; |
| |
1057 } |
| |
1058 } |
| |
1059 |
| |
1060 /* Replace </a> with an ascii representation of the |
| |
1061 * address the link was pointing to. */ |
| |
1062 else if (href != NULL && g_ascii_strncasecmp(str2 + i, "</a>", 4) == 0) |
| |
1063 { |
| |
1064 size_t hrlen = strlen(href); |
| |
1065 |
| |
1066 /* Only insert the href if it's different from the CDATA. */ |
| |
1067 if ((hrlen != (gsize)(j - href_st) || |
| |
1068 strncmp(str2 + href_st, href, hrlen)) && |
| |
1069 (hrlen != (gsize)(j - href_st + 7) || /* 7 == strlen("http://") */ |
| |
1070 strncmp(str2 + href_st, href + 7, hrlen - 7))) |
| |
1071 { |
| |
1072 str2[j++] = ' '; |
| |
1073 str2[j++] = '('; |
| |
1074 memmove(str2 + j, href, hrlen); |
| |
1075 j += hrlen; |
| |
1076 str2[j++] = ')'; |
| |
1077 g_free(href); |
| |
1078 href = NULL; |
| |
1079 } |
| |
1080 } |
| |
1081 |
| |
1082 /* Check for tags which should be mapped to newline (but ignore some of |
| |
1083 * the tags at the beginning of the text) */ |
| |
1084 else if ((j && (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0 |
| |
1085 || g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0 |
| |
1086 || g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0 |
| |
1087 || g_ascii_strncasecmp(str2 + i, "<li", 3) == 0 |
| |
1088 || g_ascii_strncasecmp(str2 + i, "<div", 4) == 0)) |
| |
1089 || g_ascii_strncasecmp(str2 + i, "<br", 3) == 0 |
| |
1090 || g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0) |
| |
1091 { |
| |
1092 str2[j++] = '\n'; |
| |
1093 } |
| |
1094 /* Check for tags which begin CDATA and need to be closed */ |
| |
1095 else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0) |
| |
1096 { |
| |
1097 cdata_close_tag = "</script>"; |
| |
1098 } |
| |
1099 else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0) |
| |
1100 { |
| |
1101 cdata_close_tag = "</style>"; |
| |
1102 } |
| |
1103 /* Update the index and continue checking after the tag */ |
| |
1104 i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k; |
| |
1105 continue; |
| |
1106 } |
| |
1107 } |
| |
1108 else if (cdata_close_tag) |
| |
1109 { |
| |
1110 continue; |
| |
1111 } |
| |
1112 else if (!g_ascii_isspace(str2[i])) |
| |
1113 { |
| |
1114 visible = TRUE; |
| |
1115 } |
| |
1116 |
| |
1117 if (str2[i] == '&' && (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL) |
| |
1118 { |
| |
1119 while (*ent) |
| |
1120 str2[j++] = *ent++; |
| |
1121 i += entlen - 1; |
| |
1122 continue; |
| |
1123 } |
| |
1124 |
| |
1125 if (visible) |
| |
1126 str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i]; |
| |
1127 } |
| |
1128 |
| |
1129 g_free(href); |
| |
1130 |
| |
1131 str2[j] = '\0'; |
| |
1132 |
| |
1133 return str2; |
| |
1134 } |
| |
1135 |
| |
1136 static gboolean |
| |
1137 badchar(char c) |
| |
1138 { |
| |
1139 switch (c) { |
| |
1140 case ' ': |
| |
1141 case ',': |
| |
1142 case '\0': |
| |
1143 case '\n': |
| |
1144 case '\r': |
| |
1145 case '<': |
| |
1146 case '>': |
| |
1147 case '"': |
| |
1148 return TRUE; |
| |
1149 default: |
| |
1150 return FALSE; |
| |
1151 } |
| |
1152 } |
| |
1153 |
| |
1154 static gboolean |
| |
1155 badentity(const char *c) |
| |
1156 { |
| |
1157 if (!g_ascii_strncasecmp(c, "<", 4) || |
| |
1158 !g_ascii_strncasecmp(c, ">", 4) || |
| |
1159 !g_ascii_strncasecmp(c, """, 6)) { |
| |
1160 return TRUE; |
| |
1161 } |
| |
1162 return FALSE; |
| |
1163 } |
| |
1164 |
| |
1165 static const char * |
| |
1166 process_link(GString *ret, |
| |
1167 const char *start, const char *c, |
| |
1168 int matchlen, |
| |
1169 const char *urlprefix, |
| |
1170 int inside_paren) |
| |
1171 { |
| |
1172 char *url_buf, *tmpurlbuf; |
| |
1173 const char *t; |
| |
1174 |
| |
1175 for (t = c;; t++) { |
| |
1176 if (!badchar(*t) && !badentity(t)) |
| |
1177 continue; |
| |
1178 |
| |
1179 if (t - c == matchlen) |
| |
1180 break; |
| |
1181 |
| |
1182 if (*t == ',' && *(t + 1) != ' ') { |
| |
1183 continue; |
| |
1184 } |
| |
1185 |
| |
1186 if (t > start && *(t - 1) == '.') |
| |
1187 t--; |
| |
1188 if (t > start && *(t - 1) == ')' && inside_paren > 0) |
| |
1189 t--; |
| |
1190 |
| |
1191 url_buf = g_strndup(c, t - c); |
| |
1192 tmpurlbuf = purple_unescape_html(url_buf); |
| |
1193 g_string_append_printf(ret, "<A HREF=\"%s%s\">%s</A>", |
| |
1194 urlprefix, |
| |
1195 tmpurlbuf, url_buf); |
| |
1196 g_free(tmpurlbuf); |
| |
1197 g_free(url_buf); |
| |
1198 return t; |
| |
1199 } |
| |
1200 |
| |
1201 return c; |
| |
1202 } |
| |
1203 |
| |
1204 char * |
| |
1205 purple_markup_linkify(const char *text) |
| |
1206 { |
| |
1207 const char *c, *t, *q = NULL; |
| |
1208 char *tmpurlbuf, *url_buf; |
| |
1209 gunichar g; |
| |
1210 gboolean inside_html = FALSE; |
| |
1211 int inside_paren = 0; |
| |
1212 GString *ret; |
| |
1213 |
| |
1214 if (text == NULL) |
| |
1215 return NULL; |
| |
1216 |
| |
1217 ret = g_string_new(""); |
| |
1218 |
| |
1219 c = text; |
| |
1220 while (*c) { |
| |
1221 |
| |
1222 if(*c == '(' && !inside_html) { |
| |
1223 inside_paren++; |
| |
1224 ret = g_string_append_c(ret, *c); |
| |
1225 c++; |
| |
1226 } |
| |
1227 |
| |
1228 if(inside_html) { |
| |
1229 if(*c == '>') { |
| |
1230 inside_html = FALSE; |
| |
1231 } else if(!q && (*c == '\"' || *c == '\'')) { |
| |
1232 q = c; |
| |
1233 } else if(q) { |
| |
1234 if(*c == *q) |
| |
1235 q = NULL; |
| |
1236 } |
| |
1237 } else if(*c == '<') { |
| |
1238 inside_html = TRUE; |
| |
1239 if (!g_ascii_strncasecmp(c, "<A", 2)) { |
| |
1240 while (1) { |
| |
1241 if (!g_ascii_strncasecmp(c, "/A>", 3)) { |
| |
1242 inside_html = FALSE; |
| |
1243 break; |
| |
1244 } |
| |
1245 ret = g_string_append_c(ret, *c); |
| |
1246 c++; |
| |
1247 if (!(*c)) |
| |
1248 break; |
| |
1249 } |
| |
1250 } |
| |
1251 } else if (!g_ascii_strncasecmp(c, "http://", 7)) { |
| |
1252 c = process_link(ret, text, c, 7, "", inside_paren); |
| |
1253 } else if (!g_ascii_strncasecmp(c, "https://", 8)) { |
| |
1254 c = process_link(ret, text, c, 8, "", inside_paren); |
| |
1255 } else if (!g_ascii_strncasecmp(c, "ftp://", 6)) { |
| |
1256 c = process_link(ret, text, c, 6, "", inside_paren); |
| |
1257 } else if (!g_ascii_strncasecmp(c, "sftp://", 7)) { |
| |
1258 c = process_link(ret, text, c, 7, "", inside_paren); |
| |
1259 } else if (!g_ascii_strncasecmp(c, "file://", 7)) { |
| |
1260 c = process_link(ret, text, c, 7, "", inside_paren); |
| |
1261 } else if (!g_ascii_strncasecmp(c, "www.", 4) && c[4] != '.' && (c == text || badchar(c[-1]) || badentity(c-1))) { |
| |
1262 c = process_link(ret, text, c, 4, "http://", inside_paren); |
| |
1263 } else if (!g_ascii_strncasecmp(c, "ftp.", 4) && c[4] != '.' && (c == text || badchar(c[-1]) || badentity(c-1))) { |
| |
1264 c = process_link(ret, text, c, 4, "ftp://", inside_paren); |
| |
1265 } else if (!g_ascii_strncasecmp(c, "xmpp:", 5) && (c == text || badchar(c[-1]) || badentity(c-1))) { |
| |
1266 c = process_link(ret, text, c, 5, "", inside_paren); |
| |
1267 } else if (!g_ascii_strncasecmp(c, "mailto:", 7)) { |
| |
1268 t = c; |
| |
1269 while (1) { |
| |
1270 if (badchar(*t) || badentity(t)) { |
| |
1271 char *d; |
| |
1272 if (t - c == 7) { |
| |
1273 break; |
| |
1274 } |
| |
1275 if (t > text && *(t - 1) == '.') |
| |
1276 t--; |
| |
1277 if ((d = strstr(c + 7, "?")) != NULL && d < t) |
| |
1278 url_buf = g_strndup(c + 7, d - c - 7); |
| |
1279 else |
| |
1280 url_buf = g_strndup(c + 7, t - c - 7); |
| |
1281 if (!purple_email_is_valid(url_buf)) { |
| |
1282 g_free(url_buf); |
| |
1283 break; |
| |
1284 } |
| |
1285 g_free(url_buf); |
| |
1286 url_buf = g_strndup(c, t - c); |
| |
1287 tmpurlbuf = purple_unescape_html(url_buf); |
| |
1288 g_string_append_printf(ret, "<A HREF=\"%s\">%s</A>", |
| |
1289 tmpurlbuf, url_buf); |
| |
1290 g_free(url_buf); |
| |
1291 g_free(tmpurlbuf); |
| |
1292 c = t; |
| |
1293 break; |
| |
1294 } |
| |
1295 t++; |
| |
1296 } |
| |
1297 } else if (c != text && (*c == '@')) { |
| |
1298 int flag; |
| |
1299 GString *gurl_buf = NULL; |
| |
1300 const char illegal_chars[] = "!@#$%^&*()[]{}/|\\<>\":;\r\n \0"; |
| |
1301 |
| |
1302 if (strchr(illegal_chars,*(c - 1)) || strchr(illegal_chars, *(c + 1))) |
| |
1303 flag = 0; |
| |
1304 else { |
| |
1305 flag = 1; |
| |
1306 gurl_buf = g_string_new(""); |
| |
1307 } |
| |
1308 |
| |
1309 t = c; |
| |
1310 while (flag) { |
| |
1311 /* iterate backwards grabbing the local part of an email address */ |
| |
1312 g = g_utf8_get_char(t); |
| |
1313 if (badchar(*t) || (g >= 127) || (*t == '(') || |
| |
1314 ((*t == ';') && ((t > (text+2) && (!g_ascii_strncasecmp(t - 3, "<", 4) || |
| |
1315 !g_ascii_strncasecmp(t - 3, ">", 4))) || |
| |
1316 (t > (text+4) && (!g_ascii_strncasecmp(t - 5, """, 6)))))) { |
| |
1317 /* local part will already be part of ret, strip it out */ |
| |
1318 ret = g_string_truncate(ret, ret->len - (c - t)); |
| |
1319 ret = g_string_append_unichar(ret, g); |
| |
1320 break; |
| |
1321 } else { |
| |
1322 g_string_prepend_unichar(gurl_buf, g); |
| |
1323 t = g_utf8_find_prev_char(text, t); |
| |
1324 if (t < text) { |
| |
1325 ret = g_string_assign(ret, ""); |
| |
1326 break; |
| |
1327 } |
| |
1328 } |
| |
1329 } |
| |
1330 |
| |
1331 t = g_utf8_find_next_char(c, NULL); |
| |
1332 |
| |
1333 while (flag) { |
| |
1334 /* iterate forwards grabbing the domain part of an email address */ |
| |
1335 g = g_utf8_get_char(t); |
| |
1336 if (badchar(*t) || (g >= 127) || (*t == ')') || badentity(t)) { |
| |
1337 char *d; |
| |
1338 |
| |
1339 url_buf = g_string_free(gurl_buf, FALSE); |
| |
1340 gurl_buf = NULL; |
| |
1341 |
| |
1342 /* strip off trailing periods */ |
| |
1343 if (*url_buf) { |
| |
1344 for (d = url_buf + strlen(url_buf) - 1; *d == '.'; d--, t--) |
| |
1345 *d = '\0'; |
| |
1346 } |
| |
1347 |
| |
1348 tmpurlbuf = purple_unescape_html(url_buf); |
| |
1349 if (purple_email_is_valid(tmpurlbuf)) { |
| |
1350 g_string_append_printf(ret, "<A HREF=\"mailto:%s\">%s</A>", |
| |
1351 tmpurlbuf, url_buf); |
| |
1352 } else { |
| |
1353 g_string_append(ret, url_buf); |
| |
1354 } |
| |
1355 g_free(url_buf); |
| |
1356 g_free(tmpurlbuf); |
| |
1357 c = t; |
| |
1358 |
| |
1359 break; |
| |
1360 } else { |
| |
1361 g_string_append_unichar(gurl_buf, g); |
| |
1362 t = g_utf8_find_next_char(t, NULL); |
| |
1363 } |
| |
1364 } |
| |
1365 |
| |
1366 if (gurl_buf) { |
| |
1367 g_string_free(gurl_buf, TRUE); |
| |
1368 } |
| |
1369 } |
| |
1370 |
| |
1371 if(*c == ')' && !inside_html) { |
| |
1372 inside_paren--; |
| |
1373 ret = g_string_append_c(ret, *c); |
| |
1374 c++; |
| |
1375 } |
| |
1376 |
| |
1377 if (*c == 0) |
| |
1378 break; |
| |
1379 |
| |
1380 ret = g_string_append_c(ret, *c); |
| |
1381 c++; |
| |
1382 |
| |
1383 } |
| |
1384 return g_string_free(ret, FALSE); |
| |
1385 } |
| |
1386 |
| |
1387 char *purple_unescape_text(const char *in) |
| |
1388 { |
| |
1389 GString *ret; |
| |
1390 const char *c = in; |
| |
1391 |
| |
1392 if (in == NULL) |
| |
1393 return NULL; |
| |
1394 |
| |
1395 ret = g_string_new(""); |
| |
1396 while (*c) { |
| |
1397 int len; |
| |
1398 const char *ent; |
| |
1399 |
| |
1400 if ((ent = purple_markup_unescape_entity(c, &len)) != NULL) { |
| |
1401 g_string_append(ret, ent); |
| |
1402 c += len; |
| |
1403 } else { |
| |
1404 g_string_append_c(ret, *c); |
| |
1405 c++; |
| |
1406 } |
| |
1407 } |
| |
1408 |
| |
1409 return g_string_free(ret, FALSE); |
| |
1410 } |
| |
1411 |
| |
1412 char *purple_unescape_html(const char *html) |
| |
1413 { |
| |
1414 GString *ret; |
| |
1415 const char *c = html; |
| |
1416 |
| |
1417 if (html == NULL) |
| |
1418 return NULL; |
| |
1419 |
| |
1420 ret = g_string_new(""); |
| |
1421 while (*c) { |
| |
1422 int len; |
| |
1423 const char *ent; |
| |
1424 |
| |
1425 if ((ent = purple_markup_unescape_entity(c, &len)) != NULL) { |
| |
1426 g_string_append(ret, ent); |
| |
1427 c += len; |
| |
1428 } else if (!strncmp(c, "<br>", 4)) { |
| |
1429 g_string_append_c(ret, '\n'); |
| |
1430 c += 4; |
| |
1431 } else { |
| |
1432 g_string_append_c(ret, *c); |
| |
1433 c++; |
| |
1434 } |
| |
1435 } |
| |
1436 |
| |
1437 return g_string_free(ret, FALSE); |
| |
1438 } |
| |
1439 |
| |
1440 char * |
| |
1441 purple_markup_slice(const char *str, guint x, guint y) |
| |
1442 { |
| |
1443 GString *ret; |
| |
1444 GQueue *q; |
| |
1445 guint z = 0; |
| |
1446 gboolean appended = FALSE; |
| |
1447 gunichar c; |
| |
1448 char *tag; |
| |
1449 |
| |
1450 g_return_val_if_fail(str != NULL, NULL); |
| |
1451 g_return_val_if_fail(x <= y, NULL); |
| |
1452 |
| |
1453 if (x == y) |
| |
1454 return g_strdup(""); |
| |
1455 |
| |
1456 ret = g_string_new(""); |
| |
1457 q = g_queue_new(); |
| |
1458 |
| |
1459 while (*str && (z < y)) { |
| |
1460 c = g_utf8_get_char(str); |
| |
1461 |
| |
1462 if (c == '<') { |
| |
1463 char *end = strchr(str, '>'); |
| |
1464 |
| |
1465 if (!end) { |
| |
1466 g_string_free(ret, TRUE); |
| |
1467 while ((tag = g_queue_pop_head(q))) |
| |
1468 g_free(tag); |
| |
1469 g_queue_free(q); |
| |
1470 return NULL; |
| |
1471 } |
| |
1472 |
| |
1473 if (!g_ascii_strncasecmp(str, "<img ", 5)) { |
| |
1474 z += strlen("[Image]"); |
| |
1475 } else if (!g_ascii_strncasecmp(str, "<br", 3)) { |
| |
1476 z += 1; |
| |
1477 } else if (!g_ascii_strncasecmp(str, "<hr>", 4)) { |
| |
1478 z += strlen("\n---\n"); |
| |
1479 } else if (!g_ascii_strncasecmp(str, "</", 2)) { |
| |
1480 /* pop stack */ |
| |
1481 char *tmp; |
| |
1482 |
| |
1483 tmp = g_queue_pop_head(q); |
| |
1484 g_free(tmp); |
| |
1485 /* z += 0; */ |
| |
1486 } else { |
| |
1487 /* push it unto the stack */ |
| |
1488 char *tmp; |
| |
1489 |
| |
1490 tmp = g_strndup(str, end - str + 1); |
| |
1491 g_queue_push_head(q, tmp); |
| |
1492 /* z += 0; */ |
| |
1493 } |
| |
1494 |
| |
1495 if (z >= x) { |
| |
1496 g_string_append_len(ret, str, end - str + 1); |
| |
1497 } |
| |
1498 |
| |
1499 str = end; |
| |
1500 } else if (c == '&') { |
| |
1501 char *end = strchr(str, ';'); |
| |
1502 if (!end) { |
| |
1503 g_string_free(ret, TRUE); |
| |
1504 while ((tag = g_queue_pop_head(q))) |
| |
1505 g_free(tag); |
| |
1506 g_queue_free(q); |
| |
1507 |
| |
1508 return NULL; |
| |
1509 } |
| |
1510 |
| |
1511 if (z >= x) |
| |
1512 g_string_append_len(ret, str, end - str + 1); |
| |
1513 |
| |
1514 z++; |
| |
1515 str = end; |
| |
1516 } else { |
| |
1517 if (z == x && z > 0 && !appended) { |
| |
1518 GList *l = q->tail; |
| |
1519 |
| |
1520 while (l) { |
| |
1521 tag = l->data; |
| |
1522 g_string_append(ret, tag); |
| |
1523 l = l->prev; |
| |
1524 } |
| |
1525 appended = TRUE; |
| |
1526 } |
| |
1527 |
| |
1528 if (z >= x) |
| |
1529 g_string_append_unichar(ret, c); |
| |
1530 z++; |
| |
1531 } |
| |
1532 |
| |
1533 str = g_utf8_next_char(str); |
| |
1534 } |
| |
1535 |
| |
1536 while ((tag = g_queue_pop_head(q))) { |
| |
1537 char *name; |
| |
1538 |
| |
1539 name = purple_markup_get_tag_name(tag); |
| |
1540 g_string_append_printf(ret, "</%s>", name); |
| |
1541 g_free(name); |
| |
1542 g_free(tag); |
| |
1543 } |
| |
1544 |
| |
1545 g_queue_free(q); |
| |
1546 return g_string_free(ret, FALSE); |
| |
1547 } |
| |
1548 |
| |
/*
 * Extracts the element name from a tag string such as "<font size=2>".
 *
 * tag: text starting with '<'.
 *
 * Returns: a newly allocated copy of the characters after '<' up to (not
 *          including) the first '>', ' ' or '/', or up to the end of the
 *          string when none of those occurs. The caller must g_free() it.
 *          NULL when tag is NULL or does not start with '<'.
 */
char *
purple_markup_get_tag_name(const char *tag)
{
	const char *name;

	g_return_val_if_fail(tag != NULL, NULL);
	g_return_val_if_fail(*tag == '<', NULL);

	name = tag + 1;

	return g_strndup(name, strcspn(name, "> /"));
}