Make number_of_words respect CJK characters (#7813)
Merge pull request 7813
This commit is contained in:
		
							parent
							
								
									2e80c557e3
								
							
						
					
					
						commit
						13b7291649
					
				|  | @ -200,10 +200,25 @@ | |||
| # | ||||
| 
 | ||||
| - name: Number of Words | ||||
|   description: Count the number of words in some text. | ||||
|   description: >- | ||||
|     Count the number of words in some text.<br/> | ||||
|     From <span class="version-badge">v4.1.0</span>, this filter takes an optional | ||||
|     argument to control the handling of Chinese-Japanese-Korean (CJK) characters | ||||
|     in the <code>input</code> string.<br/> | ||||
|     Passing <code>'cjk'</code> as the argument counts every detected CJK | ||||
|     character as one word, regardless of whether it is separated by whitespace.<br/> | ||||
|     Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code> | ||||
|     but is more performant if the filter is used on a variable string that may | ||||
|     or may not contain CJK characters. | ||||
|   examples: | ||||
|     - input: '{{ page.content | number_of_words }}' | ||||
|       output: 1337 | ||||
|     - input: '{{ "Hello world!" | number_of_words }}' | ||||
|       output: 2 | ||||
|     - input: '{{ "你好hello世界world" | number_of_words }}' | ||||
|       output: 1 | ||||
|     - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}' | ||||
|       output: 6 | ||||
|     - input: '{{ "你好hello世界world" | number_of_words: "auto" }}' | ||||
|       output: 6 | ||||
| 
 | ||||
| # | ||||
| 
 | ||||
|  |  | |||
|  | @ -121,9 +121,21 @@ module Jekyll | |||
# input - The String on which to operate.
# mode  - Optional String selecting how CJK characters are handled
#         (default: nil):
#         "cjk"  - every Han, Katakana, Hiragana or Hangul character counts
#                  as one word, plus one word for each run of remaining
#                  non-whitespace characters.
#         "auto" - counts like "cjk" when at least one such character is
#                  present; otherwise counts whitespace-separated tokens.
#         Any other value counts whitespace-separated tokens only.
#
# Returns the Integer word count.
def number_of_words(input, mode = nil)
  cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
  # The `o` flag interpolates the charset once, so the patterns are not
  # rebuilt on every call.
  cjk_regex = %r![#{cjk_charset}]!o
  # A "word" here is any run of characters that are neither CJK nor
  # whitespace, so punctuation between CJK characters forms its own run.
  word_regex = %r![^#{cjk_charset}\s]+!o

  case mode
  when "cjk"
    input.scan(cjk_regex).length + input.scan(word_regex).length
  when "auto"
    cjk_count = input.scan(cjk_regex).length
    # Without any CJK characters a plain whitespace split is sufficient and
    # avoids a second regex scan.
    cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length
  else
    input.split.length
  end
end
| 
 | ||||
|     # Join an array of things into a string by separating with commas and the | ||||
|     # word "and" for the last one. | ||||
|  |  | |||
|  | @ -1513,5 +1513,27 @@ class TestFilters < JekyllUnitTest | |||
|         end | ||||
|       end | ||||
|     end | ||||
| 
 | ||||
context "number_of_words filter" do
  # The "auto" and "cjk" modes are expected to agree on every sample, so each
  # expectation is checked against both, in that order.
  should "return the number of words for Latin-only text" do
    %w(auto cjk).each do |mode|
      assert_equal 5, @filter.number_of_words("hello world and taoky strong!", mode)
    end
  end

  should "return the number of characters for CJK-only text" do
    %w(auto cjk).each do |mode|
      assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", mode)
    end
  end

  should "process Latin and CJK independently" do
    # Intentional: No space between Latin and CJK
    %w(auto cjk).each do |mode|
      assert_equal 6, @filter.number_of_words("你好hello世界world", mode)
    end
  end

  should "maintain original behavior unless specified" do
    # With no mode argument the input is one whitespace-delimited token.
    assert_equal 1, @filter.number_of_words("你好hello世界world")
  end
end
|   end | ||||
| end | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue