Make number_of_words respect CJK characters (#7813)

Merge pull request 7813
This commit is contained in:
iBug ♦ 2020-05-22 23:01:17 +08:00 committed by GitHub
parent 2e80c557e3
commit 13b7291649
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 5 deletions

View File

@ -200,10 +200,25 @@
#
- name: Number of Words
description: Count the number of words in some text.
description: >-
Count the number of words in some text.<br/>
From <span class="version-badge">v4.1.0</span>, this filter takes an optional
argument to control the handling of Chinese-Japanese-Korean (CJK) characters
in the <code>input</code> string.<br/>
Passing <code>'cjk'</code> as the argument will count each detected CJK
character as one word, regardless of whether it is separated by whitespace.<br/>
Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code>
but is more performant when the filter is applied to a variable string that may
or may not contain CJK characters.
examples:
- input: '{{ page.content | number_of_words }}'
output: 1337
- input: '{{ "Hello world!" | number_of_words }}'
output: 2
- input: '{{ "你好hello世界world" | number_of_words }}'
output: 1
- input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
output: 6
- input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
output: 6
#

View File

@ -121,9 +121,21 @@ module Jekyll
# input - The String on which to operate.
#
# Returns the Integer word count.
def number_of_words(input)
def number_of_words(input, mode = nil)
  # mode - (optional) String selecting how CJK text is counted:
  #        "cjk"  - every Han, Katakana, Hiragana or Hangul character counts
  #                 as one word, plus each run of other non-whitespace
  #                 characters.
  #        "auto" - same as "cjk" when the input contains at least one CJK
  #                 character; otherwise plain whitespace splitting.
  #        Any other value (including nil) counts whitespace-separated tokens.
  return input.split.length unless %w(cjk auto).include?(mode)

  cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
  cjk_chars = input.scan(%r![#{cjk_charset}]!o)

  # "auto" falls back to whitespace splitting when no CJK character was found.
  return input.split.length if mode == "auto" && cjk_chars.empty?

  # Runs of characters that are neither CJK nor whitespace. Punctuation sitting
  # between CJK characters therefore forms a run of its own.
  other_runs = input.scan(%r![^#{cjk_charset}\s]+!o)
  cjk_chars.length + other_runs.length
end
# Join an array of things into a string by separating with commas and the
# word "and" for the last one.

View File

@ -1513,5 +1513,27 @@ class TestFilters < JekyllUnitTest
end
end
end
context "number_of_words filter" do
  should "return the number of words for Latin-only text" do
    latin_text = "hello world and taoky strong!"
    # Five whitespace-separated tokens, whichever CJK-aware mode is used.
    %w(auto cjk).each do |mode|
      assert_equal 5, @filter.number_of_words(latin_text, mode)
    end
  end

  should "return the number of characters for CJK-only text" do
    cjk_text = "こんにちは、世界!안녕하세요 세상!"
    # 14 kana/Han/Hangul characters plus 3 separate punctuation runs,
    # each of which is counted as a word of its own.
    %w(auto cjk).each do |mode|
      assert_equal 17, @filter.number_of_words(cjk_text, mode)
    end
  end

  should "process Latin and CJK independently" do
    # Deliberately no whitespace between the Latin and CJK parts:
    # 4 CJK characters + 2 Latin words.
    mixed_text = "你好hello世界world"
    %w(auto cjk).each do |mode|
      assert_equal 6, @filter.number_of_words(mixed_text, mode)
    end
  end

  should "maintain original behavior unless specified" do
    # Without a mode the text is only split on whitespace, giving one token.
    assert_equal 1, @filter.number_of_words("你好hello世界world")
  end
end
end
end