Support for international characters on hashtags

Non-exhaustive list of accented characters taken from the [Twitter text handling libraries][1]

[1]: 431b6ad441/src/com/twitter/Regex.java
This commit is contained in:
Júlio César 2014-08-17 20:57:35 -03:00
parent 160ae729dd
commit d9ffe5d324

View File

@ -3049,8 +3049,35 @@ angular.module('myApp.services', [])
emojiUtf.push(emojiData[emojiCode][0]); emojiUtf.push(emojiData[emojiCode][0]);
emojiMap[emojiData[emojiCode][0]] = emojiCode; emojiMap[emojiData[emojiCode][0]] = emojiCode;
} }
// Character ranges accepted as "letters", written as the *inside* of a RegExp
// character class (they are meant to be spliced between "[" and "]").
// Backslashes are doubled so that the RegExp constructor, not the string
// literal, interprets the \uXXXX escapes.
// Every fragment must be joined with "+": a stray ";" in the middle of the
// chain silently ends the declaration and drops all ranges that follow it.
var regexAlphaChars = "a-z" +
  "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin-1
  "\\u0100-\\u024f" + // Latin Extended A and B
  "\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // IPA Extensions
  "\\u02bb" + // Hawaiian
  "\\u0300-\\u036f" + // Combining diacritics
  "\\u1e00-\\u1eff" + // Latin Extended Additional (mostly for Vietnamese)
  "\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
  "\\u2de0-\\u2dff\\ua640-\\ua69f" + // Cyrillic Extended A/B
  "\\u0591-\\u05bf\\u05c1-\\u05c2\\u05c4-\\u05c5\\u05c7" +
  "\\u05d0-\\u05ea\\u05f0-\\u05f4" + // Hebrew
  "\\ufb1d-\\ufb28\\ufb2a-\\ufb36\\ufb38-\\ufb3c\\ufb3e\\ufb40-\\ufb41" +
  "\\ufb43-\\ufb44\\ufb46-\\ufb4f" + // Hebrew Pres. Forms
  "\\u0610-\\u061a\\u0620-\\u065f\\u066e-\\u06d3\\u06d5-\\u06dc" +
  "\\u06de-\\u06e8\\u06ea-\\u06ef\\u06fa-\\u06fc\\u06ff" + // Arabic
  "\\u0750-\\u077f\\u08a0\\u08a2-\\u08ac\\u08e4-\\u08fe" + // Arabic Supplement and Extended A
  "\\ufb50-\\ufbb1\\ufbd3-\\ufd3d\\ufd50-\\ufd8f\\ufd92-\\ufdc7\\ufdf0-\\ufdfb" + // Pres. Forms A
  "\\ufe70-\\ufe74\\ufe76-\\ufefc" + // Pres. Forms B
  "\\u200c" + // Zero-Width Non-Joiner
  "\\u0e01-\\u0e3a\\u0e40-\\u0e4e" + // Thai
  "\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
  "\\u3003\\u3005\\u303b" + // Kanji/Han iteration marks
  "\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
  "\\uff66-\\uff9f" + // half width Katakana
  "\\uffa1-\\uffdc"; // half width Hangul (Korean)

// Letters plus ASCII digits, dot and underscore; also a character-class body.
// "." and "_" need no escaping inside a character class, and "\." / "\_" in a
// string literal evaluate to the bare characters anyway.
var regexAlphaNumericChars = "0-9._" + regexAlphaChars;
var regExp = new RegExp('((?:(ftp|https?)://|(?:mailto:)?([A-Za-z0-9._%+-]+@))(\\S*\\.\\S*[^\\s.;,(){}<>"\']))|(\\n)|(' + emojiUtf.join('|') + ')|(^|\\s)(#[A-Za-z0-9\_\.]{4,20})', 'i'); var regExp = new RegExp('((?:(ftp|https?)://|(?:mailto:)?([A-Za-z0-9._%+-]+@))(\\S*\\.\\S*[^\\s.;,(){}<>"\']))|(\\n)|(' + emojiUtf.join('|') + ')|(^|\\s)(#[' + regexAlphaNumericChars + ']{4,20})', 'i');
var youtubeRegex = /(?:https?:\/\/)?(?:www\.)?youtu(?:|.be|be.com|.b)(?:\/v\/|\/watch\\?v=|e\/|\/watch(?:.+)v=)(.{11})(?:\&[^\s]*)?/; var youtubeRegex = /(?:https?:\/\/)?(?:www\.)?youtu(?:|.be|be.com|.b)(?:\/v\/|\/watch\\?v=|e\/|\/watch(?:.+)v=)(.{11})(?:\&[^\s]*)?/;
return { return {