/**
 * Used to compose unicode character classes.
 *
 * Each value is a fragment of regular-expression source intended to be placed
 * between `[` and `]`. Backslashes are doubled so that escapes such as
 * `\ud800` reach the RegExp engine as text instead of being resolved by the
 * string literal.
 */
var rsAstralRange = '\\ud800-\\udfff',
    rsComboMarksRange = '\\u0300-\\u036f',
    rsComboHalfMarksRange = '\\ufe20-\\ufe2f',
    rsComboSymbolsRange = '\\u20d0-\\u20ff',
    // All three combining ranges, concatenated for use in a single class.
    rsComboRange = rsComboMarksRange + rsComboHalfMarksRange + rsComboSymbolsRange,
    rsDingbatRange = '\\u2700-\\u27bf',
    // `\xf7` is skipped here; it is listed in `rsMathOpRange` instead.
    rsLowerRange = 'a-z\\xdf-\\xf6\\xf8-\\xff',
    rsMathOpRange = '\\xac\\xb1\\xd7\\xf7',
    // Everything up to `\xbf` except the ASCII digits and ASCII letters.
    rsNonCharRange = '\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf',
    rsPunctuationRange = '\\u2000-\\u206f',
    rsSpaceRange = ' \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000',
    // `\xd7` is skipped here; it is listed in `rsMathOpRange` instead.
    rsUpperRange = 'A-Z\\xc0-\\xd6\\xd8-\\xde',
    rsVarRange = '\\ufe0e\\ufe0f',
    // Characters that separate words: math operators, non-characters,
    // punctuation and whitespace.
    rsBreakRange = rsMathOpRange + rsNonCharRange + rsPunctuationRange + rsSpaceRange;
| 16 | |
/**
 * Used to compose unicode capture groups.
 *
 * Each value is a complete piece of regular-expression source (a character
 * class, a group, or an escape) built from the `*Range` fragments.
 */
var rsApos = "['\u2019]",
    rsBreak = '[' + rsBreakRange + ']',
    rsCombo = '[' + rsComboRange + ']',
    rsDigits = '\\d+',
    rsDingbat = '[' + rsDingbatRange + ']',
    rsLower = '[' + rsLowerRange + ']',
    // Anything that is not astral, a break, a digit, a dingbat or a cased
    // letter. `\d` is written out here instead of reusing `rsDigits`: inside
    // a character class the `+` of `\d+` is a literal plus sign, not a
    // quantifier. The set is unchanged because `rsBreakRange` already covers
    // `+`.
    rsMisc = '[^' + rsAstralRange + rsBreakRange + '\\d' + rsDingbatRange + rsLowerRange + rsUpperRange + ']',
    rsFitz = '\\ud83c[\\udffb-\\udfff]',
    rsModifier = '(?:' + rsCombo + '|' + rsFitz + ')',
    rsNonAstral = '[^' + rsAstralRange + ']',
    // Exactly two code points, each `\ud83c` plus a unit in `\udde6-\uddff`.
    rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}',
    rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]',
    rsUpper = '[' + rsUpperRange + ']',
    rsZWJ = '\\u200d';
| 32 | |
/**
 * Used to compose unicode regexes.
 *
 * These are the larger building blocks that are later joined into the
 * word-matching pattern.
 */
var rsMiscLower = '(?:' + rsLower + '|' + rsMisc + ')',
    rsMiscUpper = '(?:' + rsUpper + '|' + rsMisc + ')',
    // Optional contraction: an apostrophe followed by one of the listed
    // suffixes.
    rsOptContrLower = '(?:' + rsApos + '(?:d|ll|m|re|s|t|ve))?',
    rsOptContrUpper = '(?:' + rsApos + '(?:D|LL|M|RE|S|T|VE))?',
    rsOptMod = rsModifier + '?',
    rsOptVar = '[' + rsVarRange + ']?',
    // Zero or more joiner-linked continuations, each with its own optional
    // variation selector and modifier.
    rsOptJoin = '(?:' + rsZWJ + '(?:' + [rsNonAstral, rsRegional, rsSurrPair].join('|') + ')' + rsOptVar + rsOptMod + ')*',
    // Ordinals. `(?![123])\dth` rejects `1th`, `2th` and `3th`, but on its own
    // it also rejects the teens, so `11th`, `12th` and `13th` get an explicit
    // alternative. The trailing lookahead requires a word boundary or the
    // start of a differently-cased word.
    rsOrdLower = '\\d*(?:1st|2nd|3rd|1[123]th|(?![123])\\dth)(?=\\b|[A-Z_])',
    rsOrdUpper = '\\d*(?:1ST|2ND|3RD|1[123]TH|(?![123])\\dTH)(?=\\b|[a-z_])',
    rsSeq = rsOptVar + rsOptMod + rsOptJoin,
    rsEmoji = '(?:' + [rsDingbat, rsRegional, rsSurrPair].join('|') + ')' + rsSeq;
| 45 | |
/**
 * Used to match complex or compound words.
 *
 * The pattern is a single alternation compiled with the `g` flag. At each
 * position the alternatives are attempted in the order listed below.
 */
var reUnicodeWord = (function() {
  // Lookaheads that end a word at a break, at the next capitalised part, or
  // at the end of the input.
  var beforeBreakOrUpper = '(?=' + [rsBreak, rsUpper, '$'].join('|') + ')';
  var beforeBreakOrNextWord = '(?=' + [rsBreak, rsUpper + rsMiscLower, '$'].join('|') + ')';

  var alternatives = [
    // Optional capital followed by lowercase letters and a contraction.
    rsUpper + '?' + rsLower + '+' + rsOptContrLower + beforeBreakOrUpper,
    // Run of uppercase/misc characters with an uppercase contraction.
    rsMiscUpper + '+' + rsOptContrUpper + beforeBreakOrNextWord,
    // Same shapes as above without the trailing lookahead.
    rsUpper + '?' + rsMiscLower + '+' + rsOptContrLower,
    rsUpper + '+' + rsOptContrUpper,
    // Ordinals, plain digit runs, and emoji sequences.
    rsOrdUpper,
    rsOrdLower,
    rsDigits,
    rsEmoji
  ];

  return new RegExp(alternatives.join('|'), 'g');
}());
| 57 | |
/**
 * Collects the words contained in a Unicode `string`.
 *
 * @private
 * @param {string} string The string to inspect.
 * @returns {Array} Returns the words of `string`; an empty array when the
 *  pattern finds nothing.
 */
function unicodeWords(string) {
  var found = string.match(reUnicodeWord);
  // Normalise a "no match" result to an empty array so callers always
  // receive something array-like.
  if (found) {
    return found;
  }
  return [];
}
| 68 | |
// Expose `unicodeWords` as the value exported by this CommonJS module.
module.exports = unicodeWords;