Make number_of_words respect CJK characters (#7813)

Merge pull request 7813
2020-05-22 23:01:17 +08:00 · 2020-05-22 23:01:17 +08:00 · 13b7291649
parent 2e80c557e3
commit 13b7291649
3 changed files with 54 additions and 5 deletions
--- a/docs/_data/jekyll_filters.yml
+++ b/docs/_data/jekyll_filters.yml
@ -200,10 +200,25 @@
 #
 - name: Number of Words
-  description: Count the number of words in some text.
+  description: >-
    Count the number of words in some text.<br/>
    From <span class="version-badge">v4.1.0</span>, this filter takes an optional
    argument to control the handling of Chinese-Japanese-Korean (CJK) characters
    in the <code>input</code> string.<br/>
    Passing <code>'cjk'</code> as the argument counts every detected CJK
    character as one word, regardless of whether it is separated by whitespace.<br/>
    Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code>
    but is more performant if the filter is used on a variable string that may
    or may not contain CJK characters.
  examples:
-    - input: '{{ page.content | number_of_words }}'
+    - input: '{{ "Hello world!" | number_of_words }}'
-      output: 1337
+      output: 2
    - input: '{{ "你好hello世界world" | number_of_words }}'
      output: 1
    - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
      output: 6
    - input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
      output: 6
 #
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@ -121,8 +121,20 @@ module Jekyll
    # input - The String on which to operate.
    # mode  - Optional String selecting how Chinese-Japanese-Korean (CJK)
    #         characters (Han, Katakana, Hiragana, Hangul) are counted
    #         (default: nil):
    #         "cjk"  - every CJK character counts as one word, and every run
    #                  of characters that are neither CJK nor whitespace
    #                  counts as one word.
    #         "auto" - counts like "cjk" when the input contains at least one
    #                  CJK character; otherwise counts whitespace-separated
    #                  words.
    #         Any other value counts whitespace-separated words only.
    #
    # Returns the Integer word count.
-    def number_of_words(input)
+    def number_of_words(input, mode = nil)
-      input.split.length
+      cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
      # The "o" flag makes Ruby interpolate cjk_charset only once, so the two
      # patterns are not rebuilt on every call.
      # cjk_regex matches a single CJK character; word_regex matches a run of
      # characters that are neither CJK nor whitespace (punctuation included).
      cjk_regex = %r![#{cjk_charset}]!o
      word_regex = %r![^#{cjk_charset}\s]+!o
      case mode
      when "cjk"
        input.scan(cjk_regex).length + input.scan(word_regex).length
      when "auto"
        # Only pay for the second regex scan when CJK characters are present;
        # otherwise fall back to plain whitespace splitting.
        cjk_count = input.scan(cjk_regex).length
        cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length
      else
        # No (or unrecognized) mode: original whitespace-delimited behavior.
        input.split.length
      end
    end
    # Join an array of things into a string by separating with commas and the
--- a/test/test_filters.rb
+++ b/test/test_filters.rb
@ -1513,5 +1513,27 @@ class TestFilters < JekyllUnitTest
        end
      end
    end
    # Exercises number_of_words in its "auto" and "cjk" modes, plus the
    # default (mode-less) call that must keep whitespace-only splitting.
    context "number_of_words filter" do
      should "return the number of words for Latin-only text" do
        latin_text = "hello world and taoky strong!"

        assert_equal 5, @filter.number_of_words(latin_text, "auto")
        assert_equal 5, @filter.number_of_words(latin_text, "cjk")
      end

      should "return the number of characters for CJK-only text" do
        # 14 CJK characters plus 3 punctuation runs, each counted as a word.
        cjk_text = "こんにちは、世界！안녕하세요 세상!"

        assert_equal 17, @filter.number_of_words(cjk_text, "auto")
        assert_equal 17, @filter.number_of_words(cjk_text, "cjk")
      end

      should "process Latin and CJK independently" do
        # Intentional: No space between Latin and CJK
        mixed_text = "你好hello世界world"

        assert_equal 6, @filter.number_of_words(mixed_text, "auto")
        assert_equal 6, @filter.number_of_words(mixed_text, "cjk")
      end

      should "maintain original behavior unless specified" do
        mixed_text = "你好hello世界world"

        assert_equal 1, @filter.number_of_words(mixed_text)
      end
    end
  end
 end