I'm trying to write a function that counts the number of words when the content is in Chinese or Japanese. The count should exclude special characters, punctuation and whitespace.
I tried creating a regex for each locale and finding the words based on it. I also looked for existing regexes on the internet, but none of them seem to work. My approach:
/**
 * Counts the words in `text` using a locale-specific regular expression.
 *
 * - `'ja'`: each contiguous run of Hiragana, Katakana or Han characters
 *   (plus the prolonged sound mark `ー`) counts as one word.
 * - `'zh'`: each contiguous run of Han characters counts as one word.
 * - any other locale: each contiguous run of letters, combining marks,
 *   digits or underscores counts as one word.
 *
 * Punctuation, symbols and whitespace never match any of the patterns, so
 * they only act as separators and are not counted.
 *
 * NOTE: for `'ja'` and `'zh'` this counts script runs, not linguistic words;
 * text written without spaces or punctuation yields a single match.
 *
 * @param {string} text - The text whose words are counted.
 * @param {string} locale - Locale code; only `'ja'` and `'zh'` get special handling.
 * @returns {number} The number of pattern matches found in `text`.
 */
function countWords(text, locale) {
  // Default: match whole words. A bare `\b` would match the zero-width
  // boundary at BOTH ends of every word and therefore double the count.
  let wordPattern = '[\\p{L}\\p{M}\\p{N}_]+';
  if (locale === 'ja') {
    // Japanese: runs of kana/kanji; `ー` is listed explicitly in addition
    // to the three script classes.
    wordPattern = '[\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Han}ー]+';
  } else if (locale === 'zh') {
    // Chinese: runs of Han characters.
    wordPattern = '[\\p{Script=Han}]+';
  }
  // `u` enables the \p{…} property escapes, `g` is required by matchAll.
  const regex = new RegExp(wordPattern, 'gu');
  return [...text.matchAll(regex)].length;
}
I thought this should work, but when I compare its result with the word count reported by MS Word for the same text, the two numbers differ.
A possible word-count approach could be based on the text segments returned by the segment method of an Intl.Segmenter instance that was created with word granularity.
Each segment item features properties such as ...
{ segment: 'words', index: 9, input: 'How many words ...', isWordLike: true }
... thus, in order to get the total word count, one could reduce
the array of text segment items to a number by adding 1 for every item whose isWordLike
value is true ...
/**
 * Counts the word-like segments of `text` as determined by
 * `Intl.Segmenter` with word granularity for the given locale.
 *
 * Segments whose `isWordLike` flag is false are skipped, so only
 * word-like segments contribute to the total.
 *
 * @param {string} text - The text to segment.
 * @param {string} locale - Locale passed through to `Intl.Segmenter`.
 * @returns {number} The number of segments flagged as word-like.
 */
function countWords(text, locale) {
  const wordSegmenter = new Intl.Segmenter(locale, { granularity: 'word' });

  let total = 0;
  for (const part of wordSegmenter.segment(text)) {
    if (part.isWordLike) {
      total += 1;
    }
  }
  return total;
}
// Demo: log the word count for one sample sentence per locale. Each entry
// is a [text, locale] pair; the label printed in front of the result shows
// the equivalent countWords(...) call.
[
  ['How many words does the text contain?', 'en'],
  ['Combien de mots contient ce texte ?', 'fr'],
  ['そのテキストには何語含まれていますか?', 'ja'],
  ['该文本包含多少个单词?', 'zh'],
].forEach(([sampleText, sampleLocale]) => {
  console.log(
    `countWords('${sampleText}', '${sampleLocale}') ?..`,
    countWords(sampleText, sampleLocale),
  );
});
/* Forces elements with the class as-console-wrapper to be at least full
   height and anchored at the top (presumably the snippet's console output
   panel; confirm against the hosting page). */
.as-console-wrapper { min-height: 100%!important; top: 0; }
Note: at the time of writing, Firefox did not yet implement Intl.Segmenter, so check current browser compatibility before relying on it.