fixes of RegExps and behaviour of htmlFormatMsg() and slightly filterLang()

This commit is contained in:
Simon Grim 2015-05-12 20:38:41 +05:00
parent 257aedf61a
commit c149e8bbc1
2 changed files with 36 additions and 23 deletions

View File

@ -506,13 +506,13 @@ function filterLang(string) {
// before detection attempts we cut out any mentions and links, and replace _ with space
langFilterSubj = string.replace(/@\S\w*|https?:\/\/\S*/g, '').replace(/_+/g, ' ')
// cut out common frequently used words FIXME I believe there is a list of similar international stuff somewhere outside which is waiting for us, we should just find it
.replace(/\btwister|github|google|twitter\b/g, '')
.replace(/\btwister|tox|github|linux|ubuntu|debian|windows|google|twitter|facebook|microsoft|ping|pong|email|javascript\b/ig, '')
// replace zero-width word boundaries, such as between letters from different alphabets [or other symbols], with spaces
// FIXME not so good idea because 'Za pomocą białej listy' may turn into 'Za pomoc ą bia ł ej listy' for e.g.
// FIXME but first one was recognized as 'hrv' and second as 'pol' and you know it's 'pol' actually
.replace(/\b/g, ' ')
// cut out some more symbols
.replace(/[#\[\]\(\)\{\}\-\+\=\^\:\;\\\/0-9]/g, '')
.replace(/[#<>\.,:;\?\!\*\[\]\(\)\{\}\-\+\=\^\\\/0-9\u201C\u201D\u2026\u2014\u4E00\u3002\uFF0C\uFF1A\uFF1F\uFF01\u3010\u3011]/g, '') // unicode escaped stuff is '“”…—一。,:?!【】'
// clear unwanted spaces
.replace(/\s+/g, ' ').trim();

View File

@ -183,60 +183,73 @@ function htmlFormatMsg(msg, mentions) {
function htmlMention(str, pre) {
str = str.replace(new RegExp(['^', pre, '@'].join('')), '').toLowerCase();
mentions.push(str); // FIXME feel the scope
mentions.push(str); // FIXME feel the pain of the scope chain
// FIXME we're trying to not interact with DOM, coz' we want to run really fast [to hell of RegExps]
// FIXME actually we should avoid it by dropping a template idea and construct html right here
return $('#msg-user-link-template')[0].outerHTML
html.push($('#msg-user-link-template')[0].outerHTML
.replace(/\bid\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('id')
//.replace(/\bhref\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('href')
.replace(/<a\s+(?=[^>]*?\bclass\s*=\s*"(?=[^"]*?\bopen-profile-modal\b))/ig, [pre, '<a href="', $.MAL.userUrl(str), '" '].join('')) // $().closest('a.open-profile-modal').attr('href', $.MAL.userUrl(username))
.replace(/<a\s+(?=[^>]*?\bclass\s*=\s*"(?=[^"]*?\bopen-profile-modal\b))/ig, ['<a href="', $.MAL.userUrl(str), '" '].join('')) // $().closest('a.open-profile-modal').attr('href', $.MAL.userUrl(username))
.replace(/(<a\s+(?=[^>]*?\bclass\s*=\s*"(?=[^"]*?\bopen-profile-modal\b))[^]*?>)[^]*?(<\/a>)/ig, [pre, '$1@', str, '$2'].join('')) // $().closest('a.open-profile-modal').text('@'+username)
;
);
return ['>', html.length - 1, '<'].join('');
}
function htmlHashtag(str, pre) {
str = str.replace(new RegExp(['^', pre, '#'].join('')), '');
return $('#hashtag-link-template')[0].outerHTML
html.push($('#hashtag-link-template')[0].outerHTML
.replace(/\bid\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('id')
//.replace(/\bhref\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('href')
.replace(/<a\s+(?=[^>]*?\bclass\s*=\s*"(?=[^"]*?\bopen-hashtag-modal\b))/ig, ['<a href="', $.MAL.hashtagUrl(str.toLowerCase()), '" '].join('')) // $().closest('a.open-profile-modal').attr('href', $.MAL.hashtagUrl(hashtag))
.replace(/(<a\s+(?=[^>]*?\bclass\s*=\s*"(?=[^"]*?\bopen-hashtag-modal\b))[^]*?>)[^]*?(<\/a>)/ig, [pre, '$1#', str, '$2'].join('')) // $().closest('a.open-profile-modal').text('#'+hashtag)
;
);
return ['>', html.length - 1, '<'].join('');
}
function htmlHttp(str) {
return $('#external-page-link-template')[0].outerHTML
html.push($('#external-page-link-template')[0].outerHTML
.replace(/\bid\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('id')
//.replace(/\bhref\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('href')
.replace(/<a\s+/ig, ['<a href="', proxyURL(str), '" '].join('')) // $().closest('a').attr('href', proxyURL(url))
.replace(/(<a\s+[^]*?>)[^]*?(<\/a>)/ig, ['$1', str, '$2'].join('')) // $().closest('a').text(url)
;
);
return ['>', html.length - 1, '<'].join('');
}
function htmlEmail(str) {
return $('#external-page-link-template')[0].outerHTML
function htmlEmail(str, pre) {
str = str.replace(new RegExp(['^', pre].join('')), '');
html.push($('#external-page-link-template')[0].outerHTML
.replace(/\bid\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('id')
//.replace(/\bhref\s*=\s*"[^]*?"+/ig, '') // $().removeAttr('href')
.replace(/<a\s+/ig, ['<a href="mailto:', str.toLowerCase(), '" '].join('')) // $().closest('a').attr('href', 'mailto:'+url)
.replace(/(<a\s+[^]*?>)[^]*?(<\/a>)/ig, ['$1', str, '$2'].join('')) // $().closest('a').text(url)
;
.replace(/(<a\s+[^]*?>)[^]*?(<\/a>)/ig, [pre, '$1', str, '$2'].join('')) // $().closest('a').text(url)
);
return ['>', html.length - 1, '<'].join('');
}
function htmlSplitCounter(str) {
return ['<span class="splited-post-counter">', str, '</span>'].join('');
html.push(['<span class="splited-post-counter">', str, '</span>'].join(''));
return ['>', html.length - 1, '<'].join('');
}
msg = escapeHtmlEntities(msg)
.replace(/(^|\s|\w)@\S\w*/g, htmlMention)
.replace(/(^|\s|\w)#\S\w*/g, htmlHashtag)
.replace(/\bhttps?:\/\/\S+/ig, htmlHttp)
.replace(/\S+@\S+\.\S+/g, htmlEmail)
.replace(/\(\d{1,2}\/\d{1,2}\)$/, htmlSplitCounter)
;
var html = [];
return _formatText(msg);
return _formatText(escapeHtmlEntities(msg)
.replace(/(^|[^\/]\B(?!\S*:\/\/\S*@))@\w+\b/g, htmlMention)
.replace(/(^|[^<\/]\B(?!\S*:\/\/\S*#))#[^#\\\/\.,:;\?\!\*\[\]\(\)\{\}\-\+\=\^\|%'"\u201C\u201D\u2026\u2014\u4E00\u3002\uFF0C\uFF1A\uFF1F\uFF01\u3010\u3011>\s]+/g, htmlHashtag) // unicode escaped stuff is '“”…—一。,:?!【】' for our chinese friends
.replace(/\bhttps?:\/\/\S[^>\s]+/ig, htmlHttp)
.replace(/([^<\/])\b(?!\S*:\/\/\S*@)\S+@\S+\.\S[^>\s]+/g, htmlEmail)
.replace(/\(\d{1,2}\/\d{1,2}\)$/, htmlSplitCounter)
.replace(/>(\d+)</g, function(candy, core) {return html[core]})
);
}
function proxyURL(url) {