Make number_of_words respect CJK characters (#7813)
Merge pull request 7813
This commit is contained in:
parent
2e80c557e3
commit
13b7291649
|
@ -200,10 +200,25 @@
|
|||
#
|
||||
|
||||
- name: Number of Words
|
||||
description: Count the number of words in some text.
|
||||
description: >-
|
||||
Count the number of words in some text.<br/>
|
||||
From <span class="version-badge">v4.1.0</span>, this filter takes an optional
|
||||
argument to control the handling of Chinese-Japanese-Korean (CJK) characters
|
||||
in the <code>input</code> string.<br/>
|
||||
Passing <code>'cjk'</code> as the argument will count every CJK character
|
||||
detected as one word irrespective of being separated by whitespace.<br/>
|
||||
Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code>
|
||||
but is more performant if the filter is used on a variable string that may
|
||||
or may not contain CJK chars.
|
||||
examples:
|
||||
- input: '{{ page.content | number_of_words }}'
|
||||
output: 1337
|
||||
- input: '{{ "Hello world!" | number_of_words }}'
|
||||
output: 2
|
||||
- input: '{{ "你好hello世界world" | number_of_words }}'
|
||||
output: 1
|
||||
- input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
|
||||
output: 6
|
||||
- input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
|
||||
output: 6
|
||||
|
||||
#
|
||||
|
||||
|
|
|
@ -121,8 +121,20 @@ module Jekyll
|
|||
# input - The String on which to operate.
|
||||
#
|
||||
# Returns the Integer word count.
|
||||
def number_of_words(input)
|
||||
input.split.length
|
||||
def number_of_words(input, mode = nil)
  # input - The value whose words are counted. It is coerced with `to_s`, so
  #         nil (e.g. an unset template variable) counts as 0 words instead
  #         of raising NoMethodError.
  # mode  - Optional String selecting how CJK characters are handled:
  #         "cjk"  - every Han/Katakana/Hiragana/Hangul character counts as
  #                  one word; every remaining run of non-whitespace,
  #                  non-CJK characters counts as one word.
  #         "auto" - same result as "cjk" when at least one CJK character is
  #                  present; otherwise a plain whitespace split is used.
  #         anything else (including nil) - plain whitespace split.
  #
  # Returns the Integer word count.
  text = input.to_s

  cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
  # The interpolated charset never changes, so the `o` flag lets each
  # pattern be compiled only once.
  cjk_regex = %r![#{cjk_charset}]!o
  word_regex = %r![^#{cjk_charset}\s]+!o

  case mode
  when "cjk"
    text.scan(cjk_regex).length + text.scan(word_regex).length
  when "auto"
    cjk_count = text.scan(cjk_regex).length
    # No CJK characters found: fall back to the plain whitespace split.
    cjk_count.zero? ? text.split.length : cjk_count + text.scan(word_regex).length
  else
    text.split.length
  end
end
|
||||
|
||||
# Join an array of things into a string by separating with commas and the
|
||||
|
|
|
@ -1513,5 +1513,27 @@ class TestFilters < JekyllUnitTest
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
context "number_of_words filter" do
|
||||
should "return the number of words for Latin-only text" do
|
||||
assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "auto")
|
||||
assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "cjk")
|
||||
end
|
||||
|
||||
should "return the number of characters for CJK-only text" do
|
||||
assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "auto")
|
||||
assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "cjk")
|
||||
end
|
||||
|
||||
should "process Latin and CJK independently" do
|
||||
# Intentional: No space between Latin and CJK
|
||||
assert_equal 6, @filter.number_of_words("你好hello世界world", "auto")
|
||||
assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk")
|
||||
end
|
||||
|
||||
should "maintain original behavior unless specified" do
|
||||
assert_equal 1, @filter.number_of_words("你好hello世界world")
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue