Make number_of_words respect CJK characters (#7813)
Merge pull request 7813
This commit is contained in:
		
							parent
							
								
									2e80c557e3
								
							
						
					
					
						commit
						13b7291649
					
				|  | @ -200,10 +200,25 @@ | ||||||
| # | # | ||||||
| 
 | 
 | ||||||
| - name: Number of Words | - name: Number of Words | ||||||
|   description: Count the number of words in some text. |   description: >- | ||||||
|  |     Count the number of words in some text.<br/> | ||||||
|  |     From <span class="version-badge">v4.1.0</span>, this filter takes an optional | ||||||
|  |     argument to control the handling of Chinese-Japanese-Korean (CJK) characters | ||||||
|  |     in the <code>input</code> string.<br/> | ||||||
|  |     Passing <code>'cjk'</code> as the argument will count every detected CJK | ||||||
|  |     character as one word, irrespective of whether it is separated by whitespace.<br/> | ||||||
|  |     Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code> | ||||||
|  |     but is more performant if the filter is used on a variable string that may | ||||||
|  |     or may not contain CJK characters. | ||||||
|   examples: |   examples: | ||||||
|     - input: '{{ page.content | number_of_words }}' |     - input: '{{ "Hello world!" | number_of_words }}' | ||||||
|       output: 1337 |       output: 2 | ||||||
|  |     - input: '{{ "你好hello世界world" | number_of_words }}' | ||||||
|  |       output: 1 | ||||||
|  |     - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}' | ||||||
|  |       output: 6 | ||||||
|  |     - input: '{{ "你好hello世界world" | number_of_words: "auto" }}' | ||||||
|  |       output: 6 | ||||||
| 
 | 
 | ||||||
| # | # | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -121,8 +121,20 @@ module Jekyll | ||||||
|     # input - The String on which to operate. |     # input - The String on which to operate. | ||||||
|     # |     # | ||||||
|     # Returns the Integer word count. |     # Returns the Integer word count. | ||||||
|     def number_of_words(input) |     def number_of_words(input, mode = nil) | ||||||
|       input.split.length |       cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' | ||||||
|  |       cjk_regex = %r![#{cjk_charset}]!o | ||||||
|  |       word_regex = %r![^#{cjk_charset}\s]+!o | ||||||
|  | 
 | ||||||
|  |       case mode | ||||||
|  |       when "cjk" | ||||||
|  |         input.scan(cjk_regex).length + input.scan(word_regex).length | ||||||
|  |       when "auto" | ||||||
|  |         cjk_count = input.scan(cjk_regex).length | ||||||
|  |         cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length | ||||||
|  |       else | ||||||
|  |         input.split.length | ||||||
|  |       end | ||||||
|     end |     end | ||||||
| 
 | 
 | ||||||
|     # Join an array of things into a string by separating with commas and the |     # Join an array of things into a string by separating with commas and the | ||||||
|  |  | ||||||
|  | @ -1513,5 +1513,27 @@ class TestFilters < JekyllUnitTest | ||||||
|         end |         end | ||||||
|       end |       end | ||||||
|     end |     end | ||||||
|  | 
 | ||||||
|  |     context "number_of_words filter" do | ||||||
|  |       should "return the number of words for Latin-only text" do | ||||||
|  |         assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "auto") | ||||||
|  |         assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "cjk") | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       should "return the number of characters for CJK-only text" do | ||||||
|  |         assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "auto") | ||||||
|  |         assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "cjk") | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       should "process Latin and CJK independently" do | ||||||
|  |         # Intentional: No space between Latin and CJK | ||||||
|  |         assert_equal 6, @filter.number_of_words("你好hello世界world", "auto") | ||||||
|  |         assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk") | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       should "maintain original behavior unless specified" do | ||||||
|  |         assert_equal 1, @filter.number_of_words("你好hello世界world") | ||||||
|  |       end | ||||||
|  |     end | ||||||
|   end |   end | ||||||
| end | end | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue