Skip to content
Open
18 changes: 17 additions & 1 deletion lib/truncate_html/html_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,23 @@ module TruncateHtml
class HtmlString < String

# Tag names treated as unpaired, i.e. written without a closing tag.
UNPAIRED_TAGS = %w[br hr img].freeze

# Character-class fragment describing what may appear in a run of tag body
# text. It is written to be interpolated between `[` and `]` (see REGEX).
#
# The member list is wrapped in its own bracket and then intersected with
# `[^<\s]`, so the exclusion applies to the whole set. This matters because
# newer Onigmo releases let `[[:punct:]]` match ASCII `<`, and because the
# special-character list below contains a literal space; without the outer
# exclusion a body run would swallow the start of the next tag or join
# neighbouring words.
TAG_BODY_CHARACTERS = [
  '[',
  '[[:alnum:]]', # Match unicode alphanumeric characters
  '\p{Sc}',      # Match unicode currency characters
  '\p{Sk}',      # Match unicode modifier symbols (^ ` and their fullwidth forms)
  '\p{So}',      # Match unicode other symbols
  '\p{Sm}',      # Match unicode math symbols (ascii < is removed below)
  '\p{Zs}',      # Match unicode space characters (ascii space is removed below)
  %q(\|^` ̄`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?), # Match some special characters
  '[[:punct:]]', # Don't gobble up chinese punctuation characters
  ']',
  '&&[^<\s]'     # Never match ascii < (it opens html tags) or \s whitespace
].join.freeze

# Tokenizer pattern. Alternatives are tried in order: script blocks, html
# tags, whitespace runs, then runs of body text.
REGEX = %r{
  (?:<script.*>.*<\/script>)+  # Match script tags. They aren't counted in length.
  |
  <\/?[^>]+>                   # Match html tags
  |
  \s+                          # Match consecutive spaces. They are later truncated to a single space.
  |
  [#{TAG_BODY_CHARACTERS}]+    # Match tag body
}x.freeze

def initialize(original_html)
super(original_html)
Expand Down
11 changes: 11 additions & 0 deletions spec/truncate_html/html_truncator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,15 @@ def truncate(html, opts = {})
'<h1>hello <!-- stuff --> and <!-- la -->...</h1>'
end
end

it "doesn't gobble up non alphabetical unicode characters" do
  # Symbols and CJK punctuation separated by <br /> tags. The expectation is
  # that truncating with length: 100 hands the markup back unchanged.
  # NOTE(review): "a (double-byte space)" holds a plain ASCII space here;
  # presumably U+3000 was intended - confirm against the upstream spec.
  html = '+<br />ー<br />〜<br />=<br />─<br />a (double-byte space)<br />¥<br />&<br />%<br />#<br />$<br />!<br />?<br />><<br />・<br />/<br />「」<br />@<br />、。'
  truncate(html, length: 100).should == html
end

it "doesn't gobble up halfwidth and fullwidth forms of unicode characters" do
  # U+FF01..U+FF5E are the fullwidth counterparts of printable ASCII. They are
  # built from code points because spelling the run out as plain ASCII puts a
  # bare ' inside a single-quoted literal (ending it early) and a literal <=>
  # sequence, which an html tag pattern would pick up as a tag.
  fullwidth_ascii = (0xFF01..0xFF5E).to_a.pack('U*')
  # NOTE(review): the remaining characters are kept exactly as found; they look
  # like compatibility-normalised halfwidth forms - confirm against upstream.
  other_forms = '⦅⦆。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙ᄀᄁᆪᄂᆬᆭᄃᄄᄅᆰᆱᆲᆳᆴᆵᄚᄆᄇᄈᄡᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵ¢£¬ ̄¦¥₩│←↑→↓■○0123456789'
  input = fullwidth_ascii + other_forms
  output = truncate(input, length: 300)
  output.should == input
end
end