--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libpurple/purplemarkup.c Fri Oct 16 02:27:21 2020 -0500 @@ -0,0 +1,1561 @@ +/* + * Purple - Internet Messenging Library + * Copyright (C) Pidgin Developers <devel@pidgin.im> + * + * Purple is the legal property of its developers, whose names are too numerous + * to list here. Please refer to the COPYRIGHT file distributed with this + * source distribution. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <https://www.gnu.org/licenses/>. + */ + +#include "purplemarkup.h" + +#include "util.h" + +/* + * This function is stolen from glib's gmarkup.c and modified to not + * replace ' with ' + */ +static void append_escaped_text(GString *str, + const gchar *text, gssize length) +{ + const gchar *p; + const gchar *end; + gunichar c; + + p = text; + end = text + length; + + while (p != end) + { + const gchar *next; + next = g_utf8_next_char (p); + + switch (*p) + { + case '&': + g_string_append (str, "&"); + break; + + case '<': + g_string_append (str, "<"); + break; + + case '>': + g_string_append (str, ">"); + break; + + case '"': + g_string_append (str, """); + break; + + default: + c = g_utf8_get_char (p); + if ((0x1 <= c && c <= 0x8) || + (0xb <= c && c <= 0xc) || + (0xe <= c && c <= 0x1f) || + (0x7f <= c && c <= 0x84) || + (0x86 <= c && c <= 0x9f)) + g_string_append_printf (str, "&#x%x;", c); + else + g_string_append_len (str, p, next - p); + break; + } + + p = next; + } +} + +/* This function is stolen from glib's gmarkup.c */ +gchar *purple_markup_escape_text(const gchar *text, gssize length) +{ + GString *str; + + g_return_val_if_fail(text != NULL, NULL); + + if (length < 0) + length = strlen(text); + + /* prealloc at least as long as original text */ + str = g_string_sized_new(length); + append_escaped_text(str, text, length); + + return g_string_free(str, FALSE); +} + +const char * +purple_markup_unescape_entity(const char *text, int *length) +{ + const char *pln; + int len; + + if (!text || *text != '&') + return NULL; + +#define IS_ENTITY(s) (!g_ascii_strncasecmp(text, s, (len = sizeof(s) - 1))) + + if(IS_ENTITY("&")) + pln = "&"; + else if(IS_ENTITY("<")) + pln = "<"; + else if(IS_ENTITY(">")) + pln = ">"; + else if(IS_ENTITY(" ")) + pln = " "; + else if(IS_ENTITY("©")) + pln = "\302\251"; /* or use g_unichar_to_utf8(0xa9); */ + else if(IS_ENTITY(""")) + pln = "\""; + else if(IS_ENTITY("®")) + pln = "\302\256"; /* or use g_unichar_to_utf8(0xae); */ + else if(IS_ENTITY("'")) + pln = "\'"; + else if(text[1] == '#' && (g_ascii_isxdigit(text[2]) || text[2] == 'x')) { + static char buf[7]; + const char *start = text + 2; + char *end; + guint64 pound; + int base = 10; + int buflen; + + if (*start == 'x') { + base = 16; + start++; + } + + pound = g_ascii_strtoull(start, &end, base); + if (pound == 0 || pound > INT_MAX || *end != ';') { + return NULL; + } + + len = (end - text) + 1; + + buflen = g_unichar_to_utf8((gunichar)pound, buf); + buf[buflen] = '\0'; + pln = buf; + } + else + return NULL; + + if (length) + *length = len; + return pln; +} + +char * +purple_markup_get_css_property(const gchar *style, + const gchar *opt) +{ + const gchar *css_str = style; + const gchar *css_value_start; + const gchar *css_value_end; + gchar *tmp; + gchar *ret; + + g_return_val_if_fail(opt != NULL, NULL); + + if (!css_str) + return NULL; + + /* find the CSS property */ + while (1) + { + /* skip whitespace characters */ + while (*css_str && g_ascii_isspace(*css_str)) + css_str++; + if (!g_ascii_isalpha(*css_str)) + return NULL; + if (g_ascii_strncasecmp(css_str, opt, strlen(opt))) + { + /* go to next css property positioned after the next ';' */ + while (*css_str && *css_str != '"' && *css_str != ';') + css_str++; + if(*css_str != ';') + return NULL; + css_str++; + } + else + break; + } + + /* find the CSS value position in the string */ + css_str += strlen(opt); + while (*css_str && g_ascii_isspace(*css_str)) + css_str++; + if (*css_str != ':') + return NULL; + css_str++; + while (*css_str && g_ascii_isspace(*css_str)) + css_str++; + if (*css_str == '\0' || *css_str == '"' || *css_str == ';') + return NULL; + + /* mark the CSS value */ + css_value_start = css_str; + while (*css_str && *css_str != '"' && *css_str != ';') + css_str++; + css_value_end = css_str - 1; + + /* Removes trailing whitespace */ + while (css_value_end > css_value_start && g_ascii_isspace(*css_value_end)) + css_value_end--; + + tmp = g_strndup(css_value_start, css_value_end - css_value_start + 1); + ret = purple_unescape_html(tmp); + g_free(tmp); + + return ret; +} + +gboolean purple_markup_is_rtl(const char *html) +{ + GData *attributes; + const gchar *start, *end; + gboolean res = FALSE; + + if (purple_markup_find_tag("span", html, &start, &end, &attributes)) + { + /* tmp is a member of attributes and is free with g_datalist_clear call */ + const char *tmp = g_datalist_get_data(&attributes, "dir"); + if (tmp && !g_ascii_strcasecmp(tmp, "RTL")) + res = TRUE; + if (!res) + { + tmp = g_datalist_get_data(&attributes, "style"); + if (tmp) + { + char *tmp2 = purple_markup_get_css_property(tmp, "direction"); + if (tmp2 && !g_ascii_strcasecmp(tmp2, "RTL")) + res = TRUE; + g_free(tmp2); + } + + } + g_datalist_clear(&attributes); + } + return res; +} + +gboolean +purple_markup_find_tag(const char *needle, const char *haystack, + const char **start, const char **end, GData **attributes) +{ + GData *attribs; + const char *cur = haystack; + char *name = NULL; + gboolean found = FALSE; + gboolean in_tag = FALSE; + gboolean in_attr = FALSE; + const char *in_quotes = NULL; + size_t needlelen; + + g_return_val_if_fail( needle != NULL, FALSE); + g_return_val_if_fail( *needle != '\0', FALSE); + g_return_val_if_fail( haystack != NULL, FALSE); + g_return_val_if_fail( start != NULL, FALSE); + g_return_val_if_fail( end != NULL, FALSE); + g_return_val_if_fail(attributes != NULL, FALSE); + + needlelen = strlen(needle); + g_datalist_init(&attribs); + + while (*cur && !found) { + if (in_tag) { + if (in_quotes) { + const char *close = cur; + + while (*close && *close != *in_quotes) + close++; + + /* if we got the close quote, store the value and carry on from * + * after it. if we ran to the end of the string, point to the NULL * + * and we're outta here */ + if (*close) { + /* only store a value if we have an attribute name */ + if (name) { + size_t len = close - cur; + char *val = g_strndup(cur, len); + + g_datalist_set_data_full(&attribs, name, val, g_free); + g_free(name); + name = NULL; + } + + in_quotes = NULL; + cur = close + 1; + } else { + cur = close; + } + } else if (in_attr) { + const char *close = cur; + + while (*close && *close != '>' && *close != '"' && + *close != '\'' && *close != ' ' && *close != '=') + close++; + + /* if we got the equals, store the name of the attribute. if we got + * the quote, save the attribute and go straight to quote mode. + * otherwise the tag closed or we reached the end of the string, + * so we can get outta here */ + switch (*close) { + case '"': + case '\'': + in_quotes = close; + /* fall through */ + case '=': + { + size_t len = close - cur; + + /* don't store a blank attribute name */ + if (len) { + g_free(name); + name = g_ascii_strdown(cur, len); + } + + in_attr = FALSE; + cur = close + 1; + } + break; + case ' ': + case '>': + in_attr = FALSE; + /* fall through */ + default: + cur = close; + break; + } + } else { + switch (*cur) { + case ' ': + /* swallow extra spaces inside tag */ + while (*cur && *cur == ' ') cur++; + in_attr = TRUE; + break; + case '>': + found = TRUE; + *end = cur; + break; + case '"': + case '\'': + in_quotes = cur; + /* fall through */ + default: + cur++; + break; + } + } + } else { + /* if we hit a < followed by the name of our tag... */ + if (*cur == '<' && !g_ascii_strncasecmp(cur + 1, needle, needlelen)) { + *start = cur; + cur = cur + needlelen + 1; + + /* if we're pointing at a space or a >, we found the right tag. if * + * we're not, we've found a longer tag, so we need to skip to the * + * >, but not being distracted by >s inside quotes. */ + if (*cur == ' ' || *cur == '>') { + in_tag = TRUE; + } else { + while (*cur && *cur != '"' && *cur != '\'' && *cur != '>') { + if (*cur == '"') { + cur++; + while (*cur && *cur != '"') + cur++; + } else if (*cur == '\'') { + cur++; + while (*cur && *cur != '\'') + cur++; + } else { + cur++; + } + } + } + } else { + cur++; + } + } + } + + /* clean up any attribute name from a premature termination */ + g_free(name); + + if (found) { + *attributes = attribs; + } else { + *start = NULL; + *end = NULL; + *attributes = NULL; + } + + return found; +} + +struct purple_parse_tag { + char *src_tag; + char *dest_tag; + gboolean ignore; +}; + +/* NOTE: Do not put `do {} while(0)` around this macro (as this is the method + recommended in the GCC docs). It contains 'continue's that should + affect the while-loop in purple_markup_html_to_xhtml and doing the + above would break that. + Also, remember to put braces in constructs that require them for + multiple statements when using this macro. */ +#define ALLOW_TAG_ALT(x, y) if(!g_ascii_strncasecmp(c, "<" x " ", strlen("<" x " "))) { \ + const char *o = c + strlen("<" x); \ + const char *p = NULL, *q = NULL, *r = NULL; \ + /* o = iterating over full tag \ + * p = > (end of tag) \ + * q = start of quoted bit \ + * r = < inside tag \ + */ \ + GString *innards = g_string_new(""); \ + while(o && *o) { \ + if(!q && (*o == '\"' || *o == '\'') ) { \ + q = o; \ + } else if(q) { \ + if(*o == *q) { /* end of quoted bit */ \ + char *unescaped = g_strndup(q+1, o-q-1); \ + char *escaped = g_markup_escape_text(unescaped, -1); \ + g_string_append_printf(innards, "%c%s%c", *q, escaped, *q); \ + g_free(unescaped); \ + g_free(escaped); \ + q = NULL; \ + } else if(*c == '\\') { \ + o++; \ + } \ + } else if(*o == '<') { \ + r = o; \ + } else if(*o == '>') { \ + p = o; \ + break; \ + } else { \ + innards = g_string_append_c(innards, *o); \ + } \ + o++; \ + } \ + if(p && !r) { /* got an end of tag and no other < earlier */\ + if(*(p-1) != '/') { \ + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \ + pt->src_tag = x; \ + pt->dest_tag = y; \ + tags = g_list_prepend(tags, pt); \ + } \ + if(xhtml) { \ + xhtml = g_string_append(xhtml, "<" y); \ + xhtml = g_string_append(xhtml, innards->str); \ + xhtml = g_string_append_c(xhtml, '>'); \ + } \ + c = p + 1; \ + } else { /* got end of tag with earlier < *or* didn't get anything */ \ + if(xhtml) \ + xhtml = g_string_append(xhtml, "<"); \ + if(plain) \ + plain = g_string_append_c(plain, '<'); \ + c++; \ + } \ + g_string_free(innards, TRUE); \ + continue; \ + } \ + if(!g_ascii_strncasecmp(c, "<" x, strlen("<" x)) && \ + (*(c+strlen("<" x)) == '>' || \ + !g_ascii_strncasecmp(c+strlen("<" x), "/>", 2))) { \ + if(xhtml) \ + xhtml = g_string_append(xhtml, "<" y); \ + c += strlen("<" x); \ + if(*c != '/') { \ + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); \ + pt->src_tag = x; \ + pt->dest_tag = y; \ + tags = g_list_prepend(tags, pt); \ + if(xhtml) \ + xhtml = g_string_append_c(xhtml, '>'); \ + } else { \ + if(xhtml) \ + xhtml = g_string_append(xhtml, "/>");\ + } \ + c = strchr(c, '>') + 1; \ + continue; \ + } +/* Don't forget to check the note above for ALLOW_TAG_ALT. */ +#define ALLOW_TAG(x) ALLOW_TAG_ALT(x, x) +void +purple_markup_html_to_xhtml(const char *html, char **xhtml_out, + char **plain_out) +{ + GString *xhtml = NULL; + GString *plain = NULL; + GString *url = NULL; + GString *cdata = NULL; + GList *tags = NULL, *tag; + const char *c = html; + char quote = '\0'; + +#define CHECK_QUOTE(ptr) if (*(ptr) == '\'' || *(ptr) == '\"') \ + quote = *(ptr++); \ + else \ + quote = '\0'; + +#define VALID_CHAR(ptr) (*(ptr) && *(ptr) != quote && (quote || (*(ptr) != ' ' && *(ptr) != '>'))) + + g_return_if_fail(xhtml_out != NULL || plain_out != NULL); + + if(xhtml_out) + xhtml = g_string_new(""); + if(plain_out) + plain = g_string_new(""); + + while(c && *c) { + if(*c == '<') { + if(*(c+1) == '/') { /* closing tag */ + tag = tags; + while(tag) { + struct purple_parse_tag *pt = tag->data; + if(!g_ascii_strncasecmp((c+2), pt->src_tag, strlen(pt->src_tag)) && *(c+strlen(pt->src_tag)+2) == '>') { + c += strlen(pt->src_tag) + 3; + break; + } + tag = tag->next; + } + if(tag) { + while(tags) { + struct purple_parse_tag *pt = tags->data; + if(xhtml && !pt->ignore) + g_string_append_printf(xhtml, "</%s>", pt->dest_tag); + if(plain && purple_strequal(pt->src_tag, "a")) { + /* if this is a link, we have to add the url to the plaintext, too */ + if (cdata && url && + (!g_string_equal(cdata, url) && (g_ascii_strncasecmp(url->str, "mailto:", 7) != 0 || + g_utf8_collate(url->str + 7, cdata->str) != 0))) + g_string_append_printf(plain, " <%s>", g_strstrip(purple_unescape_html(url->str))); + if (cdata) { + g_string_free(cdata, TRUE); + cdata = NULL; + } + + } + if(tags == tag) + break; + tags = g_list_delete_link(tags, tags); + g_free(pt); + } + g_free(tag->data); + tags = g_list_delete_link(tags, tag); + } else { + /* a closing tag we weren't expecting... + * we'll let it slide, if it's really a tag...if it's + * just a </ we'll escape it properly */ + const char *end = c+2; + while(*end && g_ascii_isalpha(*end)) + end++; + if(*end == '>') { + c = end+1; + } else { + if(xhtml) + xhtml = g_string_append(xhtml, "<"); + if(plain) + plain = g_string_append_c(plain, '<'); + c++; + } + } + } else { /* opening tag */ + ALLOW_TAG("blockquote"); + ALLOW_TAG("cite"); + ALLOW_TAG("div"); + ALLOW_TAG("em"); + ALLOW_TAG("h1"); + ALLOW_TAG("h2"); + ALLOW_TAG("h3"); + ALLOW_TAG("h4"); + ALLOW_TAG("h5"); + ALLOW_TAG("h6"); + /* we only allow html to start the message */ + if(c == html) { + ALLOW_TAG("html"); + } + ALLOW_TAG_ALT("i", "em"); + ALLOW_TAG_ALT("italic", "em"); + ALLOW_TAG("li"); + ALLOW_TAG("ol"); + ALLOW_TAG("p"); + ALLOW_TAG("pre"); + ALLOW_TAG("q"); + ALLOW_TAG("span"); + ALLOW_TAG("ul"); + + + /* we skip <HR> because it's not legal in XHTML-IM. However, + * we still want to send something sensible, so we put a + * linebreak in its place. <BR> also needs special handling + * because putting a </BR> to close it would just be dumb. */ + if((!g_ascii_strncasecmp(c, "<br", 3) + || !g_ascii_strncasecmp(c, "<hr", 3)) + && (*(c+3) == '>' || + !g_ascii_strncasecmp(c+3, "/>", 2) || + !g_ascii_strncasecmp(c+3, " />", 3))) { + c = strchr(c, '>') + 1; + if(xhtml) + xhtml = g_string_append(xhtml, "<br/>"); + if(plain && *c != '\n') + plain = g_string_append_c(plain, '\n'); + continue; + } + if(!g_ascii_strncasecmp(c, "<b>", 3) || !g_ascii_strncasecmp(c, "<bold>", strlen("<bold>")) || !g_ascii_strncasecmp(c, "<strong>", strlen("<strong>"))) { + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); + if (*(c+2) == '>') + pt->src_tag = "b"; + else if (*(c+2) == 'o') + pt->src_tag = "bold"; + else + pt->src_tag = "strong"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + c = strchr(c, '>') + 1; + if(xhtml) + xhtml = g_string_append(xhtml, "<span style='font-weight: bold;'>"); + continue; + } + if(!g_ascii_strncasecmp(c, "<u>", 3) || !g_ascii_strncasecmp(c, "<underline>", strlen("<underline>"))) { + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); + pt->src_tag = *(c+2) == '>' ? "u" : "underline"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + c = strchr(c, '>') + 1; + if (xhtml) + xhtml = g_string_append(xhtml, "<span style='text-decoration: underline;'>"); + continue; + } + if(!g_ascii_strncasecmp(c, "<s>", 3) || !g_ascii_strncasecmp(c, "<strike>", strlen("<strike>"))) { + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); + pt->src_tag = *(c+2) == '>' ? "s" : "strike"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + c = strchr(c, '>') + 1; + if(xhtml) + xhtml = g_string_append(xhtml, "<span style='text-decoration: line-through;'>"); + continue; + } + if(!g_ascii_strncasecmp(c, "<sub>", 5)) { + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); + pt->src_tag = "sub"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + c = strchr(c, '>') + 1; + if(xhtml) + xhtml = g_string_append(xhtml, "<span style='vertical-align:sub;'>"); + continue; + } + if(!g_ascii_strncasecmp(c, "<sup>", 5)) { + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); + pt->src_tag = "sup"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + c = strchr(c, '>') + 1; + if(xhtml) + xhtml = g_string_append(xhtml, "<span style='vertical-align:super;'>"); + continue; + } + if (!g_ascii_strncasecmp(c, "<img", 4) && (*(c+4) == '>' || *(c+4) == ' ')) { + const char *p = c + 4; + GString *src = NULL, *alt = NULL; +#define ESCAPE(from, to) \ + CHECK_QUOTE(from); \ + while (VALID_CHAR(from)) { \ + int len; \ + if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \ + to = g_string_append(to, "&"); \ + else if (*from == '\'') \ + to = g_string_append(to, "'"); \ + else \ + to = g_string_append_c(to, *from); \ + from++; \ + } + + while (*p && *p != '>') { + if (!g_ascii_strncasecmp(p, "src=", 4)) { + const char *q = p + 4; + if (src) + g_string_free(src, TRUE); + src = g_string_new(""); + ESCAPE(q, src); + p = q; + } else if (!g_ascii_strncasecmp(p, "alt=", 4)) { + const char *q = p + 4; + if (alt) + g_string_free(alt, TRUE); + alt = g_string_new(""); + ESCAPE(q, alt); + p = q; + } else { + p++; + } + } +#undef ESCAPE + if ((c = strchr(p, '>')) != NULL) + c++; + else + c = p; + /* src and alt are required! */ + if(src && xhtml) + g_string_append_printf(xhtml, "<img src='%s' alt='%s' />", g_strstrip(src->str), alt ? alt->str : ""); + if(alt) { + if(plain) + plain = g_string_append(plain, purple_unescape_html(alt->str)); + if(!src && xhtml) + xhtml = g_string_append(xhtml, alt->str); + g_string_free(alt, TRUE); + } + g_string_free(src, TRUE); + continue; + } + if (!g_ascii_strncasecmp(c, "<a", 2) && (*(c+2) == '>' || *(c+2) == ' ')) { + const char *p = c + 2; + struct purple_parse_tag *pt; + while (*p && *p != '>') { + if (!g_ascii_strncasecmp(p, "href=", 5)) { + const char *q = p + 5; + if (url) + g_string_free(url, TRUE); + url = g_string_new(""); + if (cdata) + g_string_free(cdata, TRUE); + cdata = g_string_new(""); + CHECK_QUOTE(q); + while (VALID_CHAR(q)) { + int len; + if ((*q == '&') && (purple_markup_unescape_entity(q, &len) == NULL)) + url = g_string_append(url, "&"); + else if (*q == '"') + url = g_string_append(url, """); + else + url = g_string_append_c(url, *q); + q++; + } + p = q; + } else { + p++; + } + } + if ((c = strchr(p, '>')) != NULL) + c++; + else + c = p; + pt = g_new0(struct purple_parse_tag, 1); + pt->src_tag = "a"; + pt->dest_tag = "a"; + tags = g_list_prepend(tags, pt); + if(xhtml) + g_string_append_printf(xhtml, "<a href=\"%s\">", url ? g_strstrip(url->str) : ""); + continue; + } +#define ESCAPE(from, to) \ + CHECK_QUOTE(from); \ + while (VALID_CHAR(from)) { \ + int len; \ + if ((*from == '&') && (purple_markup_unescape_entity(from, &len) == NULL)) \ + to = g_string_append(to, "&"); \ + else if (*from == '\'') \ + to = g_string_append_c(to, '\"'); \ + else \ + to = g_string_append_c(to, *from); \ + from++; \ + } + if(!g_ascii_strncasecmp(c, "<font", 5) && (*(c+5) == '>' || *(c+5) == ' ')) { + const char *p = c + 5; + GString *style = g_string_new(""); + struct purple_parse_tag *pt; + while (*p && *p != '>') { + if (!g_ascii_strncasecmp(p, "back=", 5)) { + const char *q = p + 5; + GString *color = g_string_new(""); + ESCAPE(q, color); + g_string_append_printf(style, "background: %s; ", color->str); + g_string_free(color, TRUE); + p = q; + } else if (!g_ascii_strncasecmp(p, "color=", 6)) { + const char *q = p + 6; + GString *color = g_string_new(""); + ESCAPE(q, color); + g_string_append_printf(style, "color: %s; ", color->str); + g_string_free(color, TRUE); + p = q; + } else if (!g_ascii_strncasecmp(p, "face=", 5)) { + const char *q = p + 5; + GString *face = g_string_new(""); + ESCAPE(q, face); + g_string_append_printf(style, "font-family: %s; ", g_strstrip(face->str)); + g_string_free(face, TRUE); + p = q; + } else if (!g_ascii_strncasecmp(p, "size=", 5)) { + const char *q = p + 5; + int sz; + const char *size = "medium"; + CHECK_QUOTE(q); + sz = atoi(q); + switch (sz) + { + case 1: + size = "xx-small"; + break; + case 2: + size = "small"; + break; + case 3: + size = "medium"; + break; + case 4: + size = "large"; + break; + case 5: + size = "x-large"; + break; + case 6: + case 7: + size = "xx-large"; + break; + default: + break; + } + g_string_append_printf(style, "font-size: %s; ", size); + p = q; + } else { + p++; + } + } + if ((c = strchr(p, '>')) != NULL) + c++; + else + c = p; + pt = g_new0(struct purple_parse_tag, 1); + pt->src_tag = "font"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + if(style->len && xhtml) + g_string_append_printf(xhtml, "<span style='%s'>", g_strstrip(style->str)); + else + pt->ignore = TRUE; + g_string_free(style, TRUE); + continue; + } +#undef ESCAPE + if (!g_ascii_strncasecmp(c, "<body ", 6)) { + const char *p = c + 6; + gboolean did_something = FALSE; + while (*p && *p != '>') { + if (!g_ascii_strncasecmp(p, "bgcolor=", 8)) { + const char *q = p + 8; + struct purple_parse_tag *pt = g_new0(struct purple_parse_tag, 1); + GString *color = g_string_new(""); + CHECK_QUOTE(q); + while (VALID_CHAR(q)) { + color = g_string_append_c(color, *q); + q++; + } + if (xhtml) + g_string_append_printf(xhtml, "<span style='background: %s;'>", g_strstrip(color->str)); + g_string_free(color, TRUE); + if ((c = strchr(p, '>')) != NULL) + c++; + else + c = p; + pt->src_tag = "body"; + pt->dest_tag = "span"; + tags = g_list_prepend(tags, pt); + did_something = TRUE; + break; + } + p++; + } + if (did_something) continue; + } + /* this has to come after the special case for bgcolor */ + ALLOW_TAG("body"); + if(!g_ascii_strncasecmp(c, "<!--", strlen("<!--"))) { + char *p = strstr(c + strlen("<!--"), "-->"); + if(p) { + if(xhtml) + xhtml = g_string_append(xhtml, "<!--"); + c += strlen("<!--"); + continue; + } + } + + if(xhtml) + xhtml = g_string_append(xhtml, "<"); + if(plain) + plain = g_string_append_c(plain, '<'); + c++; + } + } else if(*c == '&') { + char buf[7]; + const char *pln; + int len; + + if ((pln = purple_markup_unescape_entity(c, &len)) == NULL) { + len = 1; + g_snprintf(buf, sizeof(buf), "%c", *c); + pln = buf; + } + if(xhtml) + xhtml = g_string_append_len(xhtml, c, len); + if(plain) + plain = g_string_append(plain, pln); + if(cdata) + cdata = g_string_append_len(cdata, c, len); + c += len; + } else { + if(xhtml) + xhtml = g_string_append_c(xhtml, *c); + if(plain) + plain = g_string_append_c(plain, *c); + if(cdata) + cdata = g_string_append_c(cdata, *c); + c++; + } + } + if(xhtml) { + for (tag = tags; tag ; tag = tag->next) { + struct purple_parse_tag *pt = tag->data; + if(!pt->ignore) + g_string_append_printf(xhtml, "</%s>", pt->dest_tag); + } + } + g_list_free(tags); + if(xhtml_out) + *xhtml_out = g_string_free(xhtml, FALSE); + if(plain_out) + *plain_out = g_string_free(plain, FALSE); + if(url) + g_string_free(url, TRUE); + if (cdata) + g_string_free(cdata, TRUE); +#undef CHECK_QUOTE +#undef VALID_CHAR +} + +/* The following are probably reasonable changes: + * - \n should be converted to a normal space + * - in addition to <br>, <p> and <div> etc. should also be converted into \n + * - We want to turn </td>#whitespace<td> sequences into a single tab + * - We want to turn </tr>#whitespace<tr> sequences into a single \n + * - <script>...</script> and <style>...</style> should be completely removed + */ + +char * +purple_markup_strip_html(const char *str) +{ + int i, j, k, entlen; + gboolean visible = TRUE; + gboolean closing_td_p = FALSE; + gchar *str2; + const gchar *cdata_close_tag = NULL, *ent; + gchar *href = NULL; + int href_st = 0; + + if(!str) + return NULL; + + str2 = g_strdup(str); + + for (i = 0, j = 0; str2[i]; i++) + { + if (str2[i] == '<') + { + if (cdata_close_tag) + { + /* Note: Don't even assume any other tag is a tag in CDATA */ + if (g_ascii_strncasecmp(str2 + i, cdata_close_tag, + strlen(cdata_close_tag)) == 0) + { + i += strlen(cdata_close_tag) - 1; + cdata_close_tag = NULL; + } + continue; + } + else if (g_ascii_strncasecmp(str2 + i, "<td", 3) == 0 && closing_td_p) + { + str2[j++] = '\t'; + visible = TRUE; + } + else if (g_ascii_strncasecmp(str2 + i, "</td>", 5) == 0) + { + closing_td_p = TRUE; + visible = FALSE; + } + else + { + closing_td_p = FALSE; + visible = TRUE; + } + + k = i + 1; + + if(g_ascii_isspace(str2[k])) + visible = TRUE; + else if (str2[k]) + { + /* Scan until we end the tag either implicitly (closed start + * tag) or explicitly, using a sloppy method (i.e., < or > + * inside quoted attributes will screw us up) + */ + while (str2[k] && str2[k] != '<' && str2[k] != '>') + { + k++; + } + + /* If we've got an <a> tag with an href, save the address + * to print later. */ + if (g_ascii_strncasecmp(str2 + i, "<a", 2) == 0 && + g_ascii_isspace(str2[i+2])) + { + int st; /* start of href, inclusive [ */ + int end; /* end of href, exclusive ) */ + char delim = ' '; + /* Find start of href */ + for (st = i + 3; st < k; st++) + { + if (g_ascii_strncasecmp(str2+st, "href=", 5) == 0) + { + st += 5; + if (str2[st] == '"' || str2[st] == '\'') + { + delim = str2[st]; + st++; + } + break; + } + } + /* find end of address */ + for (end = st; end < k && str2[end] != delim; end++) + { + /* All the work is done in the loop construct above. */ + } + + /* If there's an address, save it. If there was + * already one saved, kill it. */ + if (st < k) + { + char *tmp; + g_free(href); + tmp = g_strndup(str2 + st, end - st); + href = purple_unescape_html(tmp); + g_free(tmp); + href_st = j; + } + } + + /* Replace </a> with an ascii representation of the + * address the link was pointing to. */ + else if (href != NULL && g_ascii_strncasecmp(str2 + i, "</a>", 4) == 0) + { + size_t hrlen = strlen(href); + + /* Only insert the href if it's different from the CDATA. */ + if ((hrlen != (gsize)(j - href_st) || + strncmp(str2 + href_st, href, hrlen)) && + (hrlen != (gsize)(j - href_st + 7) || /* 7 == strlen("http://") */ + strncmp(str2 + href_st, href + 7, hrlen - 7))) + { + str2[j++] = ' '; + str2[j++] = '('; + memmove(str2 + j, href, hrlen); + j += hrlen; + str2[j++] = ')'; + g_free(href); + href = NULL; + } + } + + /* Check for tags which should be mapped to newline (but ignore some of + * the tags at the beginning of the text) */ + else if ((j && (g_ascii_strncasecmp(str2 + i, "<p>", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<tr", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<hr", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<li", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "<div", 4) == 0)) + || g_ascii_strncasecmp(str2 + i, "<br", 3) == 0 + || g_ascii_strncasecmp(str2 + i, "</table>", 8) == 0) + { + str2[j++] = '\n'; + } + /* Check for tags which begin CDATA and need to be closed */ + else if (g_ascii_strncasecmp(str2 + i, "<script", 7) == 0) + { + cdata_close_tag = "</script>"; + } + else if (g_ascii_strncasecmp(str2 + i, "<style", 6) == 0) + { + cdata_close_tag = "</style>"; + } + /* Update the index and continue checking after the tag */ + i = (str2[k] == '<' || str2[k] == '\0')? k - 1: k; + continue; + } + } + else if (cdata_close_tag) + { + continue; + } + else if (!g_ascii_isspace(str2[i])) + { + visible = TRUE; + } + + if (str2[i] == '&' && (ent = purple_markup_unescape_entity(str2 + i, &entlen)) != NULL) + { + while (*ent) + str2[j++] = *ent++; + i += entlen - 1; + continue; + } + + if (visible) + str2[j++] = g_ascii_isspace(str2[i])? ' ': str2[i]; + } + + g_free(href); + + str2[j] = '\0'; + + return str2; +} + +static gboolean +badchar(char c) +{ + switch (c) { + case ' ': + case ',': + case '\0': + case '\n': + case '\r': + case '<': + case '>': + case '"': + return TRUE; + default: + return FALSE; + } +} + +static gboolean +badentity(const char *c) +{ + if (!g_ascii_strncasecmp(c, "<", 4) || + !g_ascii_strncasecmp(c, ">", 4) || + !g_ascii_strncasecmp(c, """, 6)) { + return TRUE; + } + return FALSE; +} + +static const char * +process_link(GString *ret, + const char *start, const char *c, + int matchlen, + const char *urlprefix, + int inside_paren) +{ + char *url_buf, *tmpurlbuf; + const char *t; + + for (t = c;; t++) { + if (!badchar(*t) && !badentity(t)) + continue; + + if (t - c == matchlen) + break; + + if (*t == ',' && *(t + 1) != ' ') { + continue; + } + + if (t > start && *(t - 1) == '.') + t--; + if (t > start && *(t - 1) == ')' && inside_paren > 0) + t--; + + url_buf = g_strndup(c, t - c); + tmpurlbuf = purple_unescape_html(url_buf); + g_string_append_printf(ret, "<A HREF=\"%s%s\">%s</A>", + urlprefix, + tmpurlbuf, url_buf); + g_free(tmpurlbuf); + g_free(url_buf); + return t; + } + + return c; +} + +char * +purple_markup_linkify(const char *text) +{ + const char *c, *t, *q = NULL; + char *tmpurlbuf, *url_buf; + gunichar g; + gboolean inside_html = FALSE; + int inside_paren = 0; + GString *ret; + + if (text == NULL) + return NULL; + + ret = g_string_new(""); + + c = text; + while (*c) { + + if(*c == '(' && !inside_html) { + inside_paren++; + ret = g_string_append_c(ret, *c); + c++; + } + + if(inside_html) { + if(*c == '>') { + inside_html = FALSE; + } else if(!q && (*c == '\"' || *c == '\'')) { + q = c; + } else if(q) { + if(*c == *q) + q = NULL; + } + } else if(*c == '<') { + inside_html = TRUE; + if (!g_ascii_strncasecmp(c, "<A", 2)) { + while (1) { + if (!g_ascii_strncasecmp(c, "/A>", 3)) { + inside_html = FALSE; + break; + } + ret = g_string_append_c(ret, *c); + c++; + if (!(*c)) + break; + } + } + } else if (!g_ascii_strncasecmp(c, "http://", 7)) { + c = process_link(ret, text, c, 7, "", inside_paren); + } else if (!g_ascii_strncasecmp(c, "https://", 8)) { + c = process_link(ret, text, c, 8, "", inside_paren); + } else if (!g_ascii_strncasecmp(c, "ftp://", 6)) { + c = process_link(ret, text, c, 6, "", inside_paren); + } else if (!g_ascii_strncasecmp(c, "sftp://", 7)) { + c = process_link(ret, text, c, 7, "", inside_paren); + } else if (!g_ascii_strncasecmp(c, "file://", 7)) { + c = process_link(ret, text, c, 7, "", inside_paren); + } else if (!g_ascii_strncasecmp(c, "www.", 4) && c[4] != '.' && (c == text || badchar(c[-1]) || badentity(c-1))) { + c = process_link(ret, text, c, 4, "http://", inside_paren); + } else if (!g_ascii_strncasecmp(c, "ftp.", 4) && c[4] != '.' && (c == text || badchar(c[-1]) || badentity(c-1))) { + c = process_link(ret, text, c, 4, "ftp://", inside_paren); + } else if (!g_ascii_strncasecmp(c, "xmpp:", 5) && (c == text || badchar(c[-1]) || badentity(c-1))) { + c = process_link(ret, text, c, 5, "", inside_paren); + } else if (!g_ascii_strncasecmp(c, "mailto:", 7)) { + t = c; + while (1) { + if (badchar(*t) || badentity(t)) { + char *d; + if (t - c == 7) { + break; + } + if (t > text && *(t - 1) == '.') + t--; + if ((d = strstr(c + 7, "?")) != NULL && d < t) + url_buf = g_strndup(c + 7, d - c - 7); + else + url_buf = g_strndup(c + 7, t - c - 7); + if (!purple_email_is_valid(url_buf)) { + g_free(url_buf); + break; + } + g_free(url_buf); + url_buf = g_strndup(c, t - c); + tmpurlbuf = purple_unescape_html(url_buf); + g_string_append_printf(ret, "<A HREF=\"%s\">%s</A>", + tmpurlbuf, url_buf); + g_free(url_buf); + g_free(tmpurlbuf); + c = t; + break; + } + t++; + } + } else if (c != text && (*c == '@')) { + int flag; + GString *gurl_buf = NULL; + const char illegal_chars[] = "!@#$%^&*()[]{}/|\\<>\":;\r\n \0"; + + if (strchr(illegal_chars,*(c - 1)) || strchr(illegal_chars, *(c + 1))) + flag = 0; + else { + flag = 1; + gurl_buf = g_string_new(""); + } + + t = c; + while (flag) { + /* iterate backwards grabbing the local part of an email address */ + g = g_utf8_get_char(t); + if (badchar(*t) || (g >= 127) || (*t == '(') || + ((*t == ';') && ((t > (text+2) && (!g_ascii_strncasecmp(t - 3, "<", 4) || + !g_ascii_strncasecmp(t - 3, ">", 4))) || + (t > (text+4) && (!g_ascii_strncasecmp(t - 5, """, 6)))))) { + /* local part will already be part of ret, strip it out */ + ret = g_string_truncate(ret, ret->len - (c - t)); + ret = g_string_append_unichar(ret, g); + break; + } else { + g_string_prepend_unichar(gurl_buf, g); + t = g_utf8_find_prev_char(text, t); + if (t < text) { + ret = g_string_assign(ret, ""); + break; + } + } + } + + t = g_utf8_find_next_char(c, NULL); + + while (flag) { + /* iterate forwards grabbing the domain part of an email address */ + g = g_utf8_get_char(t); + if (badchar(*t) || (g >= 127) || (*t == ')') || badentity(t)) { + char *d; + + url_buf = g_string_free(gurl_buf, FALSE); + gurl_buf = NULL; + + /* strip off trailing periods */ + if (*url_buf) { + for (d = url_buf + strlen(url_buf) - 1; *d == '.'; d--, t--) + *d = '\0'; + } + + tmpurlbuf = purple_unescape_html(url_buf); + if (purple_email_is_valid(tmpurlbuf)) { + g_string_append_printf(ret, "<A HREF=\"mailto:%s\">%s</A>", + tmpurlbuf, url_buf); + } else { + g_string_append(ret, url_buf); + } + g_free(url_buf); + g_free(tmpurlbuf); + c = t; + + break; + } else { + g_string_append_unichar(gurl_buf, g); + t = g_utf8_find_next_char(t, NULL); + } + } + + if (gurl_buf) { + g_string_free(gurl_buf, TRUE); + } + } + + if(*c == ')' && !inside_html) { + inside_paren--; + ret = g_string_append_c(ret, *c); + c++; + } + + if (*c == 0) + break; + + ret = g_string_append_c(ret, *c); + c++; + + } + return g_string_free(ret, FALSE); +} + +char *purple_unescape_text(const char *in) +{ + GString *ret; + const char *c = in; + + if (in == NULL) + return NULL; + + ret = g_string_new(""); + while (*c) { + int len; + const char *ent; + + if ((ent = purple_markup_unescape_entity(c, &len)) != NULL) { + g_string_append(ret, ent); + c += len; + } else { + g_string_append_c(ret, *c); + c++; + } + } + + return g_string_free(ret, FALSE); +} + +char *purple_unescape_html(const char *html) +{ + GString *ret; + const char *c = html; + + if (html == NULL) + return NULL; + + ret = g_string_new(""); + while (*c) { + int len; + const char *ent; + + if ((ent = purple_markup_unescape_entity(c, &len)) != NULL) { + g_string_append(ret, ent); + c += len; + } else if (!strncmp(c, "<br>", 4)) { + g_string_append_c(ret, '\n'); + c += 4; + } else { + g_string_append_c(ret, *c); + c++; + } + } + + return g_string_free(ret, FALSE); +} + +char * +purple_markup_slice(const char *str, guint x, guint y) +{ + GString *ret; + GQueue *q; + guint z = 0; + gboolean appended = FALSE; + gunichar c; + char *tag; + + g_return_val_if_fail(str != NULL, NULL); + g_return_val_if_fail(x <= y, NULL); + + if (x == y) + return g_strdup(""); + + ret = g_string_new(""); + q = g_queue_new(); + + while (*str && (z < y)) { + c = g_utf8_get_char(str); + + if (c == '<') { + char *end = strchr(str, '>'); + + if (!end) { + g_string_free(ret, TRUE); + while ((tag = g_queue_pop_head(q))) + g_free(tag); + g_queue_free(q); + return NULL; + } + + if (!g_ascii_strncasecmp(str, "<img ", 5)) { + z += strlen("[Image]"); + } else if (!g_ascii_strncasecmp(str, "<br", 3)) { + z += 1; + } else if (!g_ascii_strncasecmp(str, "<hr>", 4)) { + z += strlen("\n---\n"); + } else if (!g_ascii_strncasecmp(str, "</", 2)) { + /* pop stack */ + char *tmp; + + tmp = g_queue_pop_head(q); + g_free(tmp); + /* z += 0; */ + } else { + /* push it unto the stack */ + char *tmp; + + tmp = g_strndup(str, end - str + 1); + g_queue_push_head(q, tmp); + /* z += 0; */ + } + + if (z >= x) { + g_string_append_len(ret, str, end - str + 1); + } + + str = end; + } else if (c == '&') { + char *end = strchr(str, ';'); + if (!end) { + g_string_free(ret, TRUE); + while ((tag = g_queue_pop_head(q))) + g_free(tag); + g_queue_free(q); + + return NULL; + } + + if (z >= x) + g_string_append_len(ret, str, end - str + 1); + + z++; + str = end; + } else { + if (z == x && z > 0 && !appended) { + GList *l = q->tail; + + while (l) { + tag = l->data; + g_string_append(ret, tag); + l = l->prev; + } + appended = TRUE; + } + + if (z >= x) + g_string_append_unichar(ret, c); + z++; + } + + str = g_utf8_next_char(str); + } + + while ((tag = g_queue_pop_head(q))) { + char *name; + + name = purple_markup_get_tag_name(tag); + g_string_append_printf(ret, "</%s>", name); + g_free(name); + g_free(tag); + } + + g_queue_free(q); + return g_string_free(ret, FALSE); +} + +char * +purple_markup_get_tag_name(const char *tag) +{ + int i; + g_return_val_if_fail(tag != NULL, NULL); + g_return_val_if_fail(*tag == '<', NULL); + + for (i = 1; tag[i]; i++) + if (tag[i] == '>' || tag[i] == ' ' || tag[i] == '/') + break; + + return g_strndup(tag+1, i-1); +}