2022-07-19 22:37:41 +02:00
|
|
|
#include "autolink.h"
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
|
|
|
|
#ifndef _MSC_VER
|
|
|
|
#include <strings.h>
|
|
|
|
#else
|
|
|
|
#define strncasecmp _strnicmp
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* hoedown_autolink_is_safe: check a link target against a whitelist of
 * URI prefixes.
 *
 * data, size - the candidate link; it does not need to be NUL-terminated.
 *
 * Returns 1 when the buffer starts (case-insensitively) with one of the
 * prefixes listed below AND the byte right after the prefix is
 * alphanumeric; returns 0 otherwise. */
int
hoedown_autolink_is_safe(const uint8_t *data, size_t size)
{
    static const char *const valid_uris[] = {
        "http://", "https://", "/", "#", "ftp://", "mailto:"
    };
    size_t i;

    /* The entry count and each prefix length are derived from the table
     * itself, so adding or editing a prefix cannot leave a stale length. */
    for (i = 0; i < sizeof valid_uris / sizeof valid_uris[0]; ++i) {
        size_t len = strlen(valid_uris[i]);

        /* size > len guarantees that data[0..len] is readable, both for
         * the prefix comparison and for the byte following the prefix. */
        if (size > len &&
            strncasecmp((const char *)data, valid_uris[i], len) == 0 &&
            isalnum(data[len]))
            return 1;
    }

    return 0;
}
|
|
|
|
|
|
|
|
/* autolink_delim: trim trailing delimiters from a candidate link.
 *
 * data       - first byte of the candidate link
 * link_end   - tentative length of the link, in bytes
 * max_rewind - unused; kept so the signature stays unchanged
 * size       - unused; kept so the signature stays unchanged
 *
 * Returns the trimmed length, or 0 when nothing is left. */
static size_t
autolink_delim(uint8_t *data, size_t link_end, size_t max_rewind, size_t size)
{
    uint8_t cclose, copen = 0;
    size_t i;

    (void)max_rewind;
    (void)size;

    /* The first '<' inside the candidate cuts the link short. */
    for (i = 0; i < link_end; ++i) {
        if (data[i] == '<') {
            link_end = i;
            break;
        }
    }

    /* Strip trailing punctuation and trailing "&name;" entities.
     * Note: strchr() also matches the string terminator, so a trailing
     * 0 byte is stripped just like punctuation. */
    while (link_end > 0) {
        if (strchr("?!.,:", data[link_end - 1]) != NULL) {
            link_end--;
        } else if (data[link_end - 1] == ';') {
            /* entity_start walks back from the ';' over the letters of a
             * possible entity name. It only ever looks at index
             * entity_start - 1 after checking entity_start > 0, so it
             * cannot run below the start of the buffer (the previous
             * `link_end - 2` wrapped around when link_end was 1). */
            size_t entity_start = link_end - 1;

            while (entity_start > 0 && isalpha(data[entity_start - 1]))
                entity_start--;

            /* Drop the whole entity only if at least one letter was seen
             * and the letters are preceded by '&'. */
            if (entity_start < link_end - 1 && entity_start > 0 &&
                data[entity_start - 1] == '&')
                link_end = entity_start - 1;
            else
                link_end--;
        } else {
            break;
        }
    }

    if (link_end == 0)
        return 0;

    cclose = data[link_end - 1];

    switch (cclose) {
    case '"':  copen = '"';  break;
    case '\'': copen = '\''; break;
    case ')':  copen = '(';  break;
    case ']':  copen = '[';  break;
    case '}':  copen = '{';  break;
    default:   break;
    }

    if (copen != 0) {
        size_t closing = 0;
        size_t opening = 0;

        /* Try to close the final punctuation sign in this same line;
         * if we managed to close it outside of the URL, that means that it's
         * not part of the URL. If it closes inside the URL, that means it
         * is part of the URL.
         *
         * Examples:
         *
         *  foo http://www.pokemon.com/Pikachu_(Electric) bar
         *      => http://www.pokemon.com/Pikachu_(Electric)
         *
         *  foo (http://www.pokemon.com/Pikachu_(Electric)) bar
         *      => http://www.pokemon.com/Pikachu_(Electric)
         *
         *  foo http://www.pokemon.com/Pikachu_(Electric)) bar
         *      => http://www.pokemon.com/Pikachu_(Electric))
         *
         *  (foo http://www.pokemon.com/Pikachu_(Electric)) bar
         *      => foo http://www.pokemon.com/Pikachu_(Electric)
         */
        for (i = 0; i < link_end; ++i) {
            /* For quotes copen == cclose, so every quote counts as an
             * opener (the first comparison wins). */
            if (data[i] == copen)
                opening++;
            else if (data[i] == cclose)
                closing++;
        }

        /* Unbalanced: the final closer is not part of the link. */
        if (closing != opening)
            link_end--;
    }

    return link_end;
}
|
|
|
|
|
|
|
|
/* check_domain: measure the run of domain-like bytes at the start of data.
 *
 * data        - start of the candidate domain
 * size        - number of readable bytes at data
 * allow_short - non-zero to accept a name without any '.' or ':'
 *
 * The first byte must be alphanumeric. The scan then covers indices
 * 1 .. size-2 (the last byte is never examined) and stops at the first
 * byte that is not alphanumeric, '-', '.' or ':'.
 *
 * Returns the index where the scan stopped, or 0 when the input is empty,
 * does not start with an alphanumeric byte, or (with allow_short == 0)
 * contains no '.' or ':' separator. */
static size_t
check_domain(uint8_t *data, size_t size, int allow_short)
{
    size_t i, np = 0;

    /* An empty buffer has no data[0] to look at, and `size - 1` below
     * would wrap around to a huge bound. */
    if (size == 0 || !isalnum(data[0]))
        return 0;

    for (i = 1; i < size - 1; ++i) {
        /* Compare explicitly instead of strchr(".:", c): strchr also
         * matches the terminating NUL, which made a 0 byte count as a
         * separator. */
        if (data[i] == '.' || data[i] == ':')
            np++;
        else if (!isalnum(data[i]) && data[i] != '-')
            break;
    }

    if (allow_short) {
        /* We don't need a valid domain in the strict sense (with at
         * least one dot); so just make sure it's composed of valid
         * domain characters and return the length of the valid
         * sequence. */
        return i;
    } else {
        /* A valid domain needs to have at least one separator ('.' or
         * ':'); that's as far as we check. */
        return np ? i : 0;
    }
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
hoedown_autolink__www(
|
|
|
|
size_t *rewind_p,
|
|
|
|
hoedown_buffer *link,
|
|
|
|
uint8_t *data,
|
|
|
|
size_t max_rewind,
|
|
|
|
size_t size,
|
2024-06-18 10:17:25 +02:00
|
|
|
hoedown_autolink_flags flags)
|
2022-07-19 22:37:41 +02:00
|
|
|
{
|
|
|
|
size_t link_end;
|
|
|
|
|
|
|
|
if (max_rewind > 0 && !ispunct(data[-1]) && !isspace(data[-1]))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
link_end = check_domain(data, size, 0);
|
|
|
|
|
|
|
|
if (link_end == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while (link_end < size && !isspace(data[link_end]))
|
|
|
|
link_end++;
|
|
|
|
|
|
|
|
link_end = autolink_delim(data, link_end, max_rewind, size);
|
|
|
|
|
|
|
|
if (link_end == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
hoedown_buffer_put(link, data, link_end);
|
|
|
|
*rewind_p = 0;
|
|
|
|
|
|
|
|
return (int)link_end;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
hoedown_autolink__email(
|
|
|
|
size_t *rewind_p,
|
|
|
|
hoedown_buffer *link,
|
|
|
|
uint8_t *data,
|
|
|
|
size_t max_rewind,
|
|
|
|
size_t size,
|
2024-06-18 10:17:25 +02:00
|
|
|
hoedown_autolink_flags flags)
|
2022-07-19 22:37:41 +02:00
|
|
|
{
|
|
|
|
size_t link_end, rewind;
|
|
|
|
int nb = 0, np = 0;
|
|
|
|
|
|
|
|
for (rewind = 0; rewind < max_rewind; ++rewind) {
|
|
|
|
uint8_t c = data[-1 - rewind];
|
|
|
|
|
|
|
|
if (isalnum(c))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (strchr(".+-_", c) != NULL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rewind == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
for (link_end = 0; link_end < size; ++link_end) {
|
|
|
|
uint8_t c = data[link_end];
|
|
|
|
|
|
|
|
if (isalnum(c))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (c == '@')
|
|
|
|
nb++;
|
|
|
|
else if (c == '.' && link_end < size - 1)
|
|
|
|
np++;
|
|
|
|
else if (c != '-' && c != '_')
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (link_end < 2 || nb != 1 || np == 0 ||
|
|
|
|
!isalpha(data[link_end - 1]))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
link_end = autolink_delim(data, link_end, max_rewind, size);
|
|
|
|
|
|
|
|
if (link_end == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
hoedown_buffer_put(link, data - rewind, link_end + rewind);
|
|
|
|
*rewind_p = rewind;
|
|
|
|
|
|
|
|
return link_end;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
hoedown_autolink__url(
|
|
|
|
size_t *rewind_p,
|
|
|
|
hoedown_buffer *link,
|
|
|
|
uint8_t *data,
|
|
|
|
size_t max_rewind,
|
|
|
|
size_t size,
|
2024-06-18 10:17:25 +02:00
|
|
|
hoedown_autolink_flags flags)
|
2022-07-19 22:37:41 +02:00
|
|
|
{
|
|
|
|
size_t link_end, rewind = 0, domain_len;
|
|
|
|
|
|
|
|
if (size < 4 || data[1] != '/' || data[2] != '/')
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while (rewind < max_rewind && isalpha(data[-1 - rewind]))
|
|
|
|
rewind++;
|
|
|
|
|
|
|
|
if (!hoedown_autolink_is_safe(data - rewind, size + rewind))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
link_end = strlen("://");
|
|
|
|
|
|
|
|
domain_len = check_domain(
|
|
|
|
data + link_end,
|
|
|
|
size - link_end,
|
|
|
|
flags & HOEDOWN_AUTOLINK_SHORT_DOMAINS);
|
|
|
|
|
|
|
|
if (domain_len == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
link_end += domain_len;
|
|
|
|
while (link_end < size && !isspace(data[link_end]))
|
|
|
|
link_end++;
|
|
|
|
|
|
|
|
link_end = autolink_delim(data, link_end, max_rewind, size);
|
|
|
|
|
|
|
|
if (link_end == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
hoedown_buffer_put(link, data - rewind, link_end + rewind);
|
|
|
|
*rewind_p = rewind;
|
|
|
|
|
|
|
|
return link_end;
|
|
|
|
}
|