From 2037836c4f4f712975c2f7adf555f4064675bb1a Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Fri, 26 Dec 2014 23:51:25 +0530 Subject: [PATCH 01/10] Split token regex into understandable chunks --- lib/truncate_html/html_string.rb | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 76d82e9..5a83ef7 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -3,7 +3,17 @@ module TruncateHtml class HtmlString < String UNPAIRED_TAGS = %w(br hr img).freeze - REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze + REGEX = %r{ + (?:.*<\/script>)+ # Match script tags. They aren't counted in length. + | + <\/?[^>]+> # Match html tags + | + [[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body + | + \s+ # Match consecutive spaces. They are later truncated to a single space. + | + [[:punct:]] # Don't gobble up Chinese punctuation characters + }x.freeze def initialize(original_html) super(original_html) From 954a66a6e2300ccc155d442f32528cd743a92248 Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:02:21 +0530 Subject: [PATCH 02/10] Add test to ensure non alphabetic unicode characters are not gobbled up --- spec/truncate_html/html_truncator_spec.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spec/truncate_html/html_truncator_spec.rb b/spec/truncate_html/html_truncator_spec.rb index 0da14d6..a2528fe 100644 --- a/spec/truncate_html/html_truncator_spec.rb +++ b/spec/truncate_html/html_truncator_spec.rb @@ -204,4 +204,9 @@ def truncate(html, opts = {}) '

hello and ...

' end end + + it "doesn't gobble up non alphabetical unicode characters" do + truncate('+




a (double-byte space)







><


「」

、。', length: 100).should == + '+




a (double-byte space)







><


「」

、。' + end end From bd7ddbf1c60c955e2211e1842bc88ec572ae3caa Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:04:02 +0530 Subject: [PATCH 03/10] Add support for unicode currency characters --- lib/truncate_html/html_string.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 5a83ef7..306f831 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -8,7 +8,7 @@ class HtmlString < String | <\/?[^>]+> # Match html tags | - [[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body + [[[:alpha:]]\p{Sc}[0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body | \s+ # Match consecutive spaces. They are later truncated to a single space. | From 8cbfff19c12cafaeb5140eeac99963c8b0994879 Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:05:24 +0530 Subject: [PATCH 04/10] Add support for unicode math characters --- lib/truncate_html/html_string.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 306f831..daf9599 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -8,7 +8,7 @@ class HtmlString < String | <\/?[^>]+> # Match html tags | - [[[:alpha:]]\p{Sc}[0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body + [[[:alpha:]]\p{Sc}[\p{Sm}&&[^<]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body | \s+ # Match consecutive spaces. They are later truncated to a single space. 
| From 350712a42dd465323b1b0422062a10ffcaa84ba6 Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:08:46 +0530 Subject: [PATCH 05/10] Add support for unicode space characters --- lib/truncate_html/html_string.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index daf9599..d9f4ffa 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -8,7 +8,7 @@ class HtmlString < String | <\/?[^>]+> # Match html tags | - [[[:alpha:]]\p{Sc}[\p{Sm}&&[^<]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body + [[[:alpha:]]\p{Sc}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body | \s+ # Match consecutive spaces. They are later truncated to a single space. | From 3d2591a4d67deb4d888644981361ed43f0636391 Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:11:44 +0530 Subject: [PATCH 06/10] Add support for other unicode symbols http://www.fileformat.info/info/unicode/category/So/list.htm --- lib/truncate_html/html_string.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index d9f4ffa..2f1c71c 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -8,7 +8,7 @@ class HtmlString < String | <\/?[^>]+> # Match html tags | - [[[:alpha:]]\p{Sc}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body + [[[:alpha:]]\p{Sc}\p{So}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body | \s+ # Match consecutive spaces. They are later truncated to a single space. 
| From e3b3efaa6032f184f9a4db8bd1d6829d4aa3cbe3 Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:15:43 +0530 Subject: [PATCH 07/10] Refactor token regex --- lib/truncate_html/html_string.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 2f1c71c..838cbff 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -8,11 +8,9 @@ class HtmlString < String | <\/?[^>]+> # Match html tags | - [[[:alpha:]]\p{Sc}\p{So}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+ # Match tag body - | \s+ # Match consecutive spaces. They are later truncated to a single space. | - [[:punct:]] # Don't gobble up Chinese punctuation characters + [[[:alpha:]]\p{Sc}\p{So}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?[[:punct:]]]+ # Match tag body }x.freeze def initialize(original_html) From c950cf764cb3628a0692ad0c746e205dfa560de2 Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sat, 27 Dec 2014 00:22:14 +0530 Subject: [PATCH 08/10] Add comments explaining the token regex --- lib/truncate_html/html_string.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 838cbff..0e03e86 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -10,6 +10,15 @@ class HtmlString < String | \s+ # Match consecutive spaces. They are later truncated to a single space. | + # [[:alpha]] - Match unicode alphabetical characters + # \p{Sc} - Match unicode currency characters + # \p{So} - Match unicode other symbols + # [\p{Sm}&&[^<]] - Match unicode math characters except ASCII <. < opens html tags. + # [\p{Zs}&&[^\s]] - Match unicode space characters except \s+. We truncate consecutive normal spaces. + # [0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/? 
- Match digits, few more characters + # [[:punct]] - Don't gobble up chinese punctuation characters + # + # Refer to ruby's regex docs (http://www.ruby-doc.org/core-1.9.3/Regexp.html) for more info [[[:alpha:]]\p{Sc}\p{So}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?[[:punct:]]]+ # Match tag body }x.freeze From 9000c427985d570ed2168419380203403e7c3ccd Mon Sep 17 00:00:00 2001 From: Nisanth Chunduru Date: Sun, 28 Dec 2014 15:41:50 +0530 Subject: [PATCH 09/10] Extract valid tag body characters into a separate variable --- lib/truncate_html/html_string.rb | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 0e03e86..3f51c3e 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -3,6 +3,15 @@ module TruncateHtml class HtmlString < String UNPAIRED_TAGS = %w(br hr img).freeze + TAG_BODY_CHARACTERS = + '[[:alpha:]]' + # Match unicode alphabetical characters + '\p{Sc}' + # Match unicode currency characters + '\p{So}' + # Match unicode other symbols + '[\p{Sm}&&[^<]]' + # Match unicode math symbols except ascii <. < opens html tags. + '[\p{Zs}&&[^\s]]' + # Match unicode space characters except \s+ + '[0-9]' + # Match digits + %q(\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?) + # Match some special characters + '[[:punct:]]' # Don't gobble up chinese punctuation characters REGEX = %r{ (?:.*<\/script>)+ # Match script tags. They aren't counted in length. | @@ -10,16 +19,7 @@ class HtmlString < String | \s+ # Match consecutive spaces. They are later truncated to a single space. | - # [[:alpha]] - Match unicode alphabetical characters - # \p{Sc} - Match unicode currency characters - # \p{So} - Match unicode other symbols - # [\p{Sm}&&[^<]] - Match unicode math characters except ASCII <. < opens html tags. - # [\p{Zs}&&[^\s]] - Match unicode space characters except \s+. We truncate consecutive normal spaces. 
- # [0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/? - Match digits, few more characters - # [[:punct]] - Don't gobble up chinese punctuation characters - # - # Refer to ruby's regex docs (http://www.ruby-doc.org/core-1.9.3/Regexp.html) for more info - [[[:alpha:]]\p{Sc}\p{So}[\p{Sm}&&[^<]][\p{Zs}&&[^\s]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?[[:punct:]]]+ # Match tag body + [#{TAG_BODY_CHARACTERS}]+ # Match tag body }x.freeze def initialize(original_html) From bf787cc37a4d5c6d5d6d98f4789425ac6ace3a0e Mon Sep 17 00:00:00 2001 From: Avinasha Shastry Date: Mon, 18 May 2015 14:38:45 +0530 Subject: [PATCH 10/10] Adds support for HalfWidth and FullWidth forms of unicode characters --- lib/truncate_html/html_string.rb | 5 ++--- spec/truncate_html/html_truncator_spec.rb | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 3f51c3e..5be96dc 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -4,13 +4,12 @@ class HtmlString < String UNPAIRED_TAGS = %w(br hr img).freeze TAG_BODY_CHARACTERS = - '[[:alpha:]]' + # Match unicode alphabetical characters + '[[:alnum:]]' + # Match unicode alphanumeric characters '\p{Sc}' + # Match unicode currency characters '\p{So}' + # Match unicode other symbols '[\p{Sm}&&[^<]]' + # Match unicode math symbols except ascii <. < opens html tags. '[\p{Zs}&&[^\s]]' + # Match unicode space characters except \s+ - '[0-9]' + # Match digits - %q(\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?) + # Match some special characters + %q(\|^` ̄`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?) + # Match some special characters '[[:punct:]]' # Don't gobble up chinese punctuation characters REGEX = %r{ (?:.*<\/script>)+ # Match script tags. They aren't counted in length. 
diff --git a/spec/truncate_html/html_truncator_spec.rb b/spec/truncate_html/html_truncator_spec.rb index a2528fe..f326078 100644 --- a/spec/truncate_html/html_truncator_spec.rb +++ b/spec/truncate_html/html_truncator_spec.rb @@ -209,4 +209,10 @@ def truncate(html, opts = {}) truncate('+




a (double-byte space)







><


「」

、。', length: 100).should == '+




a (double-byte space)







><


「」

、。' end + + it "doesn't gobble up halfwidth and fullwidth forms of unicode charecters" do + input = '!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~⦅⦆。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙ᄀᄁᆪᄂᆬᆭᄃᄄᄅᆰᆱᆲᆳᆴᆵᄚᄆᄇᄈᄡᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵ¢£¬ ̄¦¥₩│←↑→↓■○0123456789' + output = truncate(input, length: 300) + output.should == input + end end