forked from levinishka/html2text
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml2text.go
More file actions
148 lines (137 loc) · 3.78 KB
/
html2text.go
File metadata and controls
148 lines (137 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package html2text
import (
"fmt"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// Building blocks of the generated plain text.
const (
	// newLine is the line separator written to the output.
	newLine = "\n"
	// spaceRune is the blank used to separate adjacent pieces of text. It is
	// deliberately left untyped so it can be compared with bytes as well as
	// written as a rune.
	spaceRune = ' '
)
// HTML2Text extracts the readable text from htmlString.
//
// The input is scanned token by token with html.NewTokenizer; no document tree
// is built. Text tokens are copied to the result and a few tags are turned into
// plain-text layout:
//
//   - <br> (and self-closing <br/>, <li/>) always starts a new line;
//   - <p>, <h1>..<h6>, </ul>, </ol> and other start tags carrying
//     role="heading" start a new line, but only if text has been written since
//     the previous conditional line break;
//   - <li> is prefixed with "N. " when the innermost open list is an <ol> and
//     with "• " otherwise;
//   - everything between <head>, <script> or <style> and the corresponding end
//     tag is dropped;
//   - the raw content of <noscript> is converted by a recursive call.
//
// Tokenizing stops at the first html.ErrorToken (which includes end of input)
// and the text collected so far is returned.
func HTML2Text(htmlString string) string {
	// listState describes one currently open <ol> or <ul> element.
	type listState struct {
		ordered bool // true for <ol>, false for <ul>
		index   int  // number of the last item written for an ordered list
	}

	var (
		// prevents many new lines in a row
		canPrintNewline = false
		// prevents many spaces in a row
		isSpaceNeeded = false
		// tells whether there was whitespace-only text between tags
		wasSpace = false
		// number of currently open unwanted tags (head, script, style)
		skipTags = 0
		// stack of open lists, innermost last, so nested lists keep their own
		// kind and numbering
		openLists []listState
	)

	// use the tokenizer, not the parser, because it is faster and no html tree is needed
	tokenizer := html.NewTokenizer(strings.NewReader(htmlString))
	clearString := strings.Builder{}
	clearString.Grow(len(htmlString))

	// writeString appends text to the result; whitespace-only text is not
	// written, it is only remembered so that a single space can separate the
	// surrounding pieces of text.
	writeString := func(text string) {
		if len(strings.TrimSpace(text)) == 0 {
			wasSpace = true
			return
		}
		if isSpaceNeeded && wasSpace && text[0] != spaceRune {
			clearString.WriteRune(spaceRune)
			isSpaceNeeded = false
		}
		clearString.WriteString(text)
		canPrintNewline = true
		isSpaceNeeded = text[len(text)-1] != spaceRune
		wasSpace = false
	}

	// writeNewLine writes a new line unconditionally, e.g. because of a <br> tag
	writeNewLine := func() {
		if skipTags == 0 {
			clearString.WriteString(newLine)
			isSpaceNeeded = false
		}
	}

	// writeNewLineConditional writes a new line only if text was written since
	// the previous conditional new line
	writeNewLineConditional := func() {
		if skipTags == 0 && canPrintNewline {
			clearString.WriteString(newLine)
			canPrintNewline = false
			isSpaceNeeded = false
		}
	}

	for tokenType := tokenizer.Next(); tokenType != html.ErrorToken; tokenType = tokenizer.Next() {
		token := tokenizer.Token()
		switch tokenType {
		// if the token is text - write it (empty strings are skipped by writeString)
		case html.TextToken:
			// skipTags is checked here, not in writeString, because writeString
			// is also used for text that must bypass this check's call site
			if skipTags == 0 {
				writeString(token.Data)
			}

		// add a new line instead of some tags
		case html.StartTagToken:
			switch token.DataAtom {
			case atom.Br:
				writeNewLine()
			case atom.Li:
				// do not leak list markers out of skipped regions
				if skipTags == 0 {
					writeNewLineConditional()
					if n := len(openLists); n > 0 && openLists[n-1].ordered {
						openLists[n-1].index++
						writeString(fmt.Sprintf("%d. ", openLists[n-1].index))
					} else {
						writeString("• ")
					}
				}
			case atom.Ol:
				openLists = append(openLists, listState{ordered: true})
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			case atom.Noscript:
				// The tokenizer returns everything inside <noscript> as a single
				// raw text token, so that token is fetched here and its data is
				// converted by a recursive call. Next is always called so the raw
				// content is consumed even inside a skipped region; the recursion
				// itself is avoided there.
				if tokenizer.Next() == html.TextToken && skipTags == 0 {
					writeString(HTML2Text(tokenizer.Token().Data))
				}
			// content of these tags is not wanted, so skip it
			case atom.Head, atom.Script, atom.Style:
				skipTags++
			default:
				if token.DataAtom == atom.Ul {
					// <ul> needs no output of its own, but it must shadow an
					// enclosing <ol> so that its items get bullets
					openLists = append(openLists, listState{})
				}
				// write a new line if the tag has the heading role
				for _, attr := range token.Attr {
					if attr.Key == "role" && attr.Val == "heading" {
						writeNewLineConditional()
						break
					}
				}
			}

		// add a new line instead of some tags
		case html.EndTagToken:
			switch token.DataAtom {
			case atom.Ul, atom.Ol:
				writeNewLineConditional()
				// leave the innermost list; a stray end tag is ignored
				if n := len(openLists); n > 0 {
					openLists = openLists[:n-1]
				}
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			// end of unwanted tags
			case atom.Head, atom.Script, atom.Style:
				// an end tag without a matching start tag must not drive the
				// counter negative, otherwise all following text would be skipped
				if skipTags > 0 {
					skipTags--
				}
			}

		case html.SelfClosingTagToken:
			switch token.DataAtom {
			case atom.Br, atom.Li:
				writeNewLine()
			}
		}
	}

	return clearString.String()
}