Commit 679fdfcd authored by Dries's avatar Dries

- Patch #161217 by sun, hingo: URL filter fixes and tests.

parent fb300d1f
......@@ -1298,46 +1298,202 @@ function _filter_url_settings($form, &$form_state, $filter, $format, $defaults)
}
/**
* URL filter. Automatically converts text web addresses (URLs, e-mail addresses,
* ftp links, etc.) into hyperlinks.
* URL filter. Automatically converts text into hyperlinks.
*
* This filter identifies and makes clickable three types of "links".
* - URLs like http://example.com.
* - E-mail addresses like name@example.com.
* - Web addresses without the "http://" protocol defined, like www.example.com.
* Each type must be processed separately, as there is no one regular
* expression that could possibly match all of the cases in one pass.
*/
function _filter_url($text, $filter) {
// Pass length to regexp callback
// Tags to skip and not recurse into.
$ignore_tags = 'a|script|style|code|pre';
// Pass length to regexp callback.
_filter_url_trim(NULL, $filter->settings['filter_url_length']);
$text = ' ' . $text . ' ';
// Create an array which contains the regexps for each type of link.
// The key to the regexp is the name of a function that is used as
// callback function to process matches of the regexp. The callback function
// is to return the replacement for the match. The array is used and
// matching/replacement done below inside some loops.
$tasks = array();
// Prepare protocols pattern for absolute URLs.
// check_url() will replace any bad protocols with HTTP, so we need to support
// the identical list. While '//' is technically optional for MAILTO only,
// we cannot cleanly differ between protocols here without hard-coding MAILTO,
// so '//' is optional for all protocols.
// @see filter_xss_bad_protocol()
$protocols = variable_get('filter_allowed_protocols', array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'mailto', 'irc', 'ssh', 'sftp', 'webcal', 'rtsp'));
$protocols = implode(':(?://)?|', $protocols) . ':(?://)?';
// Prepare domain name pattern.
// The ICANN seems to be on track towards accepting more diverse top level
// domains, so this pattern has been "future-proofed" to allow for TLDs
// of length 2-64.
$domain = '(?:[A-Za-z0-9._+-]+\.)?[A-Za-z]{2,64}\b';
$ip = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}';
$auth = '[a-zA-Z0-9:%_+*~#?&=.,/;-]+@';
$trail = '[a-zA-Z0-9:%_+*~#&\[\]=/;?\.,-]*[a-zA-Z0-9:%_+*~#&\[\]=/;-]';
// Prepare pattern for optional trailing punctuation.
// Even these characters could have a valid meaning for the URL, such usage is
// rare compared to using a URL at the end of or within a sentence, so these
// trailing characters are optionally excluded.
$punctuation = '[\.,?!]*?';
// Match absolute URLs.
$text = preg_replace_callback("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_full_links', $text);
$url_pattern = "(?:$auth)?(?:$domain|$ip)/?(?:$trail)?";
$pattern = "`((?:$protocols)(?:$url_pattern))($punctuation)`";
$tasks['_filter_url_parse_full_links'] = $pattern;
// Match e-mail addresses.
$text = preg_replace("`(<p>|<li>|<br\s*/?>|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '\1<a href="mailto:\2">\2</a>\3', $text);
$url_pattern = "[A-Za-z0-9._-]+@(?:$domain)";
$pattern = "`($url_pattern)`";
$tasks['_filter_url_parse_email_links'] = $pattern;
// Match www domains.
$url_pattern = "www\.(?:$domain)/?(?:$trail)?";
$pattern = "`($url_pattern)($punctuation)`";
$tasks['_filter_url_parse_partial_links'] = $pattern;
// Each type of URL needs to be processed separately. The text is joined and
// re-split after each task, since all injected HTML tags must be correctly
// protected before the next task.
foreach ($tasks as $task => $pattern) {
// HTML comments need to be handled separately, as they may contain HTML
// markup, especially a '>'. Therefore, remove all comment contents and add
// them back later.
_filter_url_escape_comments('', TRUE);
$text = preg_replace_callback('`<!--(.*?)-->`s', '_filter_url_escape_comments', $text);
// Split at all tags; ensures that no tags or attributes are processed.
$chunks = preg_split('/(<.+?>)/is', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
// PHP ensures that the array consists of alternating delimiters and
// literals, and begins and ends with a literal (inserting NULL as
// required). Therefore, the first chunk is always text:
$chunk_type = 'text';
// If a tag of $ignore_tags is found, it is stored in $open_tag and only
// removed when the closing tag is found. Until the closing tag is found,
// no replacements are made.
$open_tag = '';
for ($i = 0; $i < count($chunks); $i++) {
if ($chunk_type == 'text') {
// Only process this text if there are no unclosed $ignore_tags.
if ($open_tag == '') {
// If there is a match, inject a link into this chunk via the callback
// function contained in $task.
$chunks[$i] = preg_replace_callback($pattern, $task, $chunks[$i]);
}
// Text chunk is done, so next chunk must be a tag.
$chunk_type = 'tag';
}
else {
// Only process this tag if there are no unclosed $ignore_tags.
if ($open_tag == '') {
// Check whether this tag is contained in $ignore_tags.
if (preg_match("`<($ignore_tags)(?:\s|>)`i", $chunks[$i], $matches)) {
$open_tag = $matches[1];
}
}
// Otherwise, check whether this is the closing tag for $open_tag.
else {
if (preg_match("`<\/$open_tag>`i", $chunks[$i], $matches)) {
$open_tag = '';
}
}
// Tag chunk is done, so next chunk must be text.
$chunk_type = 'text';
}
}
// Match www domains/addresses.
$text = preg_replace_callback("`(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))`i", '_filter_url_parse_partial_links', $text);
$text = substr($text, 1, -1);
$text = implode($chunks);
// Revert back to the original comment contents
_filter_url_escape_comments('', FALSE);
$text = preg_replace_callback('`<!--(.*?)-->`', '_filter_url_escape_comments', $text);
}
return $text;
}
/**
* Make links out of absolute URLs.
* preg_replace callback to make links out of absolute URLs.
*/
function _filter_url_parse_full_links($match) {
$match[2] = decode_entities($match[2]);
$caption = check_plain(_filter_url_trim($match[2]));
$match[2] = check_url($match[2]);
return $match[1] . '<a href="' . $match[2] . '">' . $caption . '</a>' . $match[5];
// The $i:th parenthesis in the regexp contains the URL.
$i = 1;
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
return '<a href="' . $match[$i] . '">' . $caption . '</a>' . $match[$i + 1];
}
/**
* preg_replace callback to make links out of e-mail addresses.
*/
function _filter_url_parse_email_links($match) {
// The $i:th parenthesis in the regexp contains the URL.
$i = 0;
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
return '<a href="mailto:' . $match[$i] . '">' . $caption . '</a>';
}
/**
* Make links out of domain names starting with "www."
* preg_replace callback to make links out of domain names starting with "www."
*/
function _filter_url_parse_partial_links($match) {
$match[2] = decode_entities($match[2]);
$caption = check_plain(_filter_url_trim($match[2]));
$match[2] = check_plain($match[2]);
return $match[1] . '<a href="http://' . $match[2] . '">' . $caption . '</a>' . $match[3];
// The $i:th parenthesis in the regexp contains the URL.
$i = 1;
$match[$i] = decode_entities($match[$i]);
$caption = check_plain(_filter_url_trim($match[$i]));
$match[$i] = check_plain($match[$i]);
return '<a href="http://' . $match[$i] . '">' . $caption . '</a>' . $match[$i + 1];
}
/**
* preg_replace callback to escape contents of HTML comments
*
* @param $match
* An array containing matches to replace from preg_replace_callback(),
* whereas $match[1] is expected to contain the content to be filtered.
* @param $escape
* (optional) Boolean whether to escape (TRUE) or unescape comments (FALSE).
* Defaults to neither. If TRUE, statically cached $comments are reset.
*/
function _filter_url_escape_comments($match, $escape = NULL) {
static $mode, $comments = array();
if (isset($escape)) {
$mode = $escape;
if ($escape){
$comments = array();
}
return;
}
// Replace all HTML coments with a '<!-- [hash] -->' placeholder.
if ($mode) {
$content = $match[1];
$hash = md5($content);
$comments[$hash] = $content;
return "<!-- $hash -->";
}
// Or replace placeholders with actual comment contents.
else {
$hash = $match[1];
$hash = trim($hash);
$content = $comments[$hash];
return "<!--$content-->";
}
}
/**
......@@ -1350,7 +1506,7 @@ function _filter_url_trim($text, $length = NULL) {
}
// Use +3 for '...' string length.
if (strlen($text) > $_length + 3) {
if ($_length && strlen($text) > $_length + 3) {
$text = substr($text, 0, $_length) . '...';
}
......
This diff is collapsed.
This is just a www.test.com. paragraph with person@test.com. some http://www.test.com. urls thrown in and also <code>using www.test.com the code tag</code>.
<blockquote>
This is just a www.test.com. paragraph with person@test.com. some http://www.test.com. urls thrown in and also <code>using www.test.com the code tag</code>.
</blockquote>
<code>Testing code tag http://www.test.com abc</code>
http://www.test.com
www.test.com
person@test.com
<code>www.test.com</code>
What about tags that don't exist <x>like x say www.test.com</x>? And what about tag <pooh>beginning www.test.com with p?</pooh>
Test &lt;br/&gt;: This is just a www.test.com. paragraph <strong>with</strong> some http://www.test.com urls thrown in. *<br/> This is just a www.test.com paragraph *<br/> with some http://www.test.com urls thrown in. *<br/>This is just a www.test.com paragraph person@test.com with some http://www.test.com urls *img*<img/> thrown in. This is just a www.test.com paragraph with some http://www.test.com urls thrown in. This is just a www.test.com paragraph person@test.com with some http://www.test.com urls thrown in.
This is just a www.test.com paragraph <strong>with</strong> some http://www.test.com urls thrown in. <br /> This is just a www.test.com paragraph with some http://www.test.com urls thrown in. This is just a www.test.com paragraph person@test.com with some http://www.test.com urls thrown in. This is just a www.test.com paragraph with some http://www.test.com urls thrown in. This is just a www.test.com paragraph person@test.com with some http://www.test.com urls thrown in.
The old URL filter has problems with <a title="kind of link www.example.com with text" href="http://www.example.com">this kind of link</a> with www address as part of text in title. www.test.com
<!-- This url www.test.com is inside a comment -->
<dl>
<dt>www.test.com</dt>
<dd>http://www.test.com</dd>
<dd>person@test.com</dd>
<dt>check www.test.com</dt>
<dd>this with some text around: http://www.test.com not so easy person@test.com now?</dd>
</dl>
<!-- <p>This url http://www.test.com is
inside a comment containing newlines and
<em>html</em> tags.</p> -->
This is the end!
\ No newline at end of file
This is just a <a href="http://www.test.com">www.test.com</a>. paragraph with <a href="mailto:person@test.com">person@test.com</a>. some <a href="http://www.test.com">http://www.test.com</a>. urls thrown in and also <code>using www.test.com the code tag</code>.
<blockquote>
This is just a <a href="http://www.test.com">www.test.com</a>. paragraph with <a href="mailto:person@test.com">person@test.com</a>. some <a href="http://www.test.com">http://www.test.com</a>. urls thrown in and also <code>using www.test.com the code tag</code>.
</blockquote>
<code>Testing code tag http://www.test.com abc</code>
<a href="http://www.test.com">http://www.test.com</a>
<a href="http://www.test.com">www.test.com</a>
<a href="mailto:person@test.com">person@test.com</a>
<code>www.test.com</code>
What about tags that don't exist <x>like x say <a href="http://www.test.com">www.test.com</a></x>? And what about tag <pooh>beginning <a href="http://www.test.com">www.test.com</a> with p?</pooh>
Test &lt;br/&gt;: This is just a <a href="http://www.test.com">www.test.com</a>. paragraph <strong>with</strong> some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. *<br/> This is just a <a href="http://www.test.com">www.test.com</a> paragraph *<br/> with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. *<br/>This is just a <a href="http://www.test.com">www.test.com</a> paragraph <a href="mailto:person@test.com">person@test.com</a> with some <a href="http://www.test.com">http://www.test.com</a> urls *img*<img/> thrown in. This is just a <a href="http://www.test.com">www.test.com</a> paragraph with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. This is just a <a href="http://www.test.com">www.test.com</a> paragraph <a href="mailto:person@test.com">person@test.com</a> with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in.
This is just a <a href="http://www.test.com">www.test.com</a> paragraph <strong>with</strong> some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. <br /> This is just a <a href="http://www.test.com">www.test.com</a> paragraph with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. This is just a <a href="http://www.test.com">www.test.com</a> paragraph <a href="mailto:person@test.com">person@test.com</a> with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. This is just a <a href="http://www.test.com">www.test.com</a> paragraph with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in. This is just a <a href="http://www.test.com">www.test.com</a> paragraph <a href="mailto:person@test.com">person@test.com</a> with some <a href="http://www.test.com">http://www.test.com</a> urls thrown in.
The old URL filter has problems with <a title="kind of link www.example.com with text" href="http://www.example.com">this kind of link</a> with www address as part of text in title. <a href="http://www.test.com">www.test.com</a>
<!-- This url www.test.com is inside a comment -->
<dl>
<dt><a href="http://www.test.com">www.test.com</a></dt>
<dd><a href="http://www.test.com">http://www.test.com</a></dd>
<dd><a href="mailto:person@test.com">person@test.com</a></dd>
<dt>check <a href="http://www.test.com">www.test.com</a></dt>
<dd>this with some text around: <a href="http://www.test.com">http://www.test.com</a> not so easy <a href="mailto:person@test.com">person@test.com</a> now?</dd>
</dl>
<!-- <p>This url http://www.test.com is
inside a comment containing newlines and
<em>html</em> tags.</p> -->
This is the end!
\ No newline at end of file
......@@ -411,6 +411,24 @@ protected function error($message = '', $group = 'Other', array $caller = NULL)
return $this->assert('exception', $message, $group, $caller);
}
/**
* Logs verbose message in a text file.
*
* The a link to the vebose message will be placed in the test results via
* as a passing assertion with the text '[verbose message]'.
*
* @param $message
* The verbose message to be stored.
*
* @see simpletest_verbose()
*/
protected function verbose($message) {
if ($id = simpletest_verbose($message)) {
$url = file_create_url($this->originalFileDirectory . '/simpletest/verbose/' . get_class($this) . '-' . $id . '.html');
$this->error(l(t('Verbose message'), $url, array('attributes' => array('target' => '_blank'))), 'User notice');
}
}
/**
* Run all tests in this class.
*/
......@@ -3071,24 +3089,6 @@ protected function verboseEmail($count = 1) {
$this->verbose(t('Email:') . '<pre>' . print_r($mail, TRUE) . '</pre>');
}
}
/**
* Logs verbose message in a text file.
*
* The a link to the vebose message will be placed in the test results via
* as a passing assertion with the text '[verbose message]'.
*
* @param $message
* The verbose message to be stored.
*
* @see simpletest_verbose()
*/
protected function verbose($message) {
if ($id = simpletest_verbose($message)) {
$url = file_create_url($this->originalFileDirectory . '/simpletest/verbose/' . get_class($this) . '-' . $id . '.html');
$this->error(l(t('Verbose message'), $url, array('attributes' => array('target' => '_blank'))), 'User notice');
}
}
}
/**
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment