Make number_of_words respect CJK characters (#7813)
Merge pull request 7813
This commit is contained in:
parent
2e80c557e3
commit
13b7291649
|
@ -200,10 +200,25 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
- name: Number of Words
|
- name: Number of Words
|
||||||
description: Count the number of words in some text.
|
description: >-
|
||||||
|
Count the number of words in some text.<br/>
|
||||||
|
From <span class="version-badge">v4.1.0</span>, this filter takes an optional
|
||||||
|
argument to control the handling of Chinese-Japanese-Korean (CJK) characters
|
||||||
|
in the <code>input</code> string.<br/>
|
||||||
|
Passing <code>'cjk'</code> as the argument will count every CJK character
|
||||||
|
detected as one word irrespective of being separated by whitespace.<br/>
|
||||||
|
Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code>
|
||||||
|
but is more performant if the filter is used on a variable string that may
|
||||||
|
or may not contain CJK chars.
|
||||||
examples:
|
examples:
|
||||||
- input: '{{ page.content | number_of_words }}'
|
- input: '{{ "Hello world!" | number_of_words }}'
|
||||||
output: 1337
|
output: 2
|
||||||
|
- input: '{{ "你好hello世界world" | number_of_words }}'
|
||||||
|
output: 1
|
||||||
|
- input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
|
||||||
|
output: 6
|
||||||
|
- input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
|
||||||
|
output: 6
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
|
@ -121,8 +121,20 @@ module Jekyll
|
||||||
# input - The String on which to operate.
|
# input - The String on which to operate.
|
||||||
#
|
#
|
||||||
# Returns the Integer word count.
|
# Returns the Integer word count.
|
||||||
def number_of_words(input, mode = nil)
  # mode - (optional) String selecting how CJK text is counted:
  #        "cjk"  - every CJK character counts as one word, plus each run of
  #                 non-CJK, non-whitespace characters.
  #        "auto" - same as "cjk" when at least one CJK character is found,
  #                 otherwise a plain whitespace split.
  #        anything else (including nil) - plain whitespace split.

  # Coerce to a String so that nil (e.g. an undefined Liquid variable) or any
  # other non-String input yields a count instead of raising NoMethodError.
  text = input.to_s

  # Scripts whose characters are each counted as a word of their own.
  cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
  # The `o` flag performs the interpolation only once, so the regexes are not
  # rebuilt on every call.
  cjk_regex = %r![#{cjk_charset}]!o
  word_regex = %r![^#{cjk_charset}\s]+!o

  case mode
  when "cjk"
    text.scan(cjk_regex).length + text.scan(word_regex).length
  when "auto"
    cjk_count = text.scan(cjk_regex).length
    # Without any CJK character the cheaper whitespace split is sufficient.
    cjk_count.zero? ? text.split.length : cjk_count + text.scan(word_regex).length
  else
    text.split.length
  end
end
|
||||||
|
|
||||||
# Join an array of things into a string by separating with commas and the
|
# Join an array of things into a string by separating with commas and the
|
||||||
|
|
|
@ -1513,5 +1513,27 @@ class TestFilters < JekyllUnitTest
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context "number_of_words filter" do
  # Both CJK-aware modes are expected to agree on every input below.
  should "return the number of words for Latin-only text" do
    %w(auto cjk).each do |mode|
      assert_equal 5, @filter.number_of_words("hello world and taoky strong!", mode)
    end
  end

  should "return the number of characters for CJK-only text" do
    %w(auto cjk).each do |mode|
      assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", mode)
    end
  end

  should "process Latin and CJK independently" do
    # Deliberately no whitespace between the Latin and the CJK runs.
    text = "你好hello世界world"
    %w(auto cjk).each do |mode|
      assert_equal 6, @filter.number_of_words(text, mode)
    end
  end

  should "maintain original behavior unless specified" do
    # Without a mode the whole unspaced string counts as a single word.
    assert_equal 1, @filter.number_of_words("你好hello世界world")
  end
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue