From d9ffe5d324b8ad52609b6a6977f7384ec88dd24c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20C=C3=A9sar?=
Date: Sun, 17 Aug 2014 20:57:35 -0300
Subject: [PATCH] Support for international characters on hashtags

Non-exhaustive list of accented characters taken from [Twitter text
handling libraries][1]

[1]: https://github.com/twitter/twitter-text-java/blob/431b6ad44158c4a4a38d80d34df5b101ae2559fc/src/com/twitter/Regex.java
---
 app/js/services.js | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/app/js/services.js b/app/js/services.js
index cf9b8cf3..e806095b 100644
--- a/app/js/services.js
+++ b/app/js/services.js
@@ -3049,8 +3049,39 @@ angular.module('myApp.services', [])
       emojiUtf.push(emojiData[emojiCode][0]);
       emojiMap[emojiData[emojiCode][0]] = emojiCode;
     }
-
-    var regExp = new RegExp('((?:(ftp|https?)://|(?:mailto:)?([A-Za-z0-9._%+-]+@))(\\S*\\.\\S*[^\\s.;,(){}<>"\']))|(\\n)|(' + emojiUtf.join('|') + ')|(^|\\s)(#[A-Za-z0-9\_\.]{4,20})', 'i');
+
+    // Body of a regex character class (without the surrounding brackets) listing
+    // the letters accepted inside a hashtag. Every fragment must be joined with
+    // "+": a stray ";" would silently drop all the ranges that follow it.
+    var regexAlphaChars = "a-z" +
+                          "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin-1
+                          "\\u0100-\\u024f" + // Latin Extended A and B
+                          "\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // IPA Extensions
+                          "\\u02bb" + // Hawaiian
+                          "\\u0300-\\u036f" + // Combining diacritics
+                          "\\u1e00-\\u1eff" + // Latin Extended Additional (mostly for Vietnamese)
+                          "\\u0400-\\u04ff\\u0500-\\u0527" + // Cyrillic
+                          "\\u2de0-\\u2dff\\ua640-\\ua69f" + // Cyrillic Extended A/B
+                          "\\u0591-\\u05bf\\u05c1-\\u05c2\\u05c4-\\u05c5\\u05c7" +
+                          "\\u05d0-\\u05ea\\u05f0-\\u05f4" + // Hebrew
+                          "\\ufb1d-\\ufb28\\ufb2a-\\ufb36\\ufb38-\\ufb3c\\ufb3e\\ufb40-\\ufb41" +
+                          "\\ufb43-\\ufb44\\ufb46-\\ufb4f" + // Hebrew Pres. Forms
+                          "\\u0610-\\u061a\\u0620-\\u065f\\u066e-\\u06d3\\u06d5-\\u06dc" +
+                          "\\u06de-\\u06e8\\u06ea-\\u06ef\\u06fa-\\u06fc\\u06ff" + // Arabic
+                          "\\u0750-\\u077f\\u08a0\\u08a2-\\u08ac\\u08e4-\\u08fe" + // Arabic Supplement and Extended A
+                          "\\ufb50-\\ufbb1\\ufbd3-\\ufd3d\\ufd50-\\ufd8f\\ufd92-\\ufdc7\\ufdf0-\\ufdfb" + // Pres. Forms A
+                          "\\ufe70-\\ufe74\\ufe76-\\ufefc" + // Pres. Forms B
+                          "\\u200c" + // Zero-Width Non-Joiner
+                          "\\u0e01-\\u0e3a\\u0e40-\\u0e4e" + // Thai
+                          "\\u1100-\\u11ff\\u3130-\\u3185\\uA960-\\uA97F\\uAC00-\\uD7AF\\uD7B0-\\uD7FF" + // Hangul (Korean)
+                          "\\u3003\\u3005\\u303b" + // Kanji/Han iteration marks
+                          "\\uff21-\\uff3a\\uff41-\\uff5a" + // full width Alphabet
+                          "\\uff66-\\uff9f" + // half width Katakana
+                          "\\uffa1-\\uffdc"; // half width Hangul (Korean)
+    // Full hashtag character set: digits, "." and "_" plus the letters above.
+    var regexAlphaNumericChars = "0-9\.\_" + regexAlphaChars;
+
+    var regExp = new RegExp('((?:(ftp|https?)://|(?:mailto:)?([A-Za-z0-9._%+-]+@))(\\S*\\.\\S*[^\\s.;,(){}<>"\']))|(\\n)|(' + emojiUtf.join('|') + ')|(^|\\s)(#[' + regexAlphaNumericChars + ']{4,20})', 'i');
     var youtubeRegex = /(?:https?:\/\/)?(?:www\.)?youtu(?:|.be|be.com|.b)(?:\/v\/|\/watch\\?v=|e\/|\/watch(?:.+)v=)(.{11})(?:\&[^\s]*)?/;
 
     return {