// html2text/html2text.go at main · AlsoAsked/html2text · GitHub

package html2text

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

const (
	// spaceRune is the blank character used when checking for, and writing,
	// a separator between pieces of text.
	spaceRune = ' '
	// newLine is the line break written to the output.
	newLine = "\n"
)

// HTML2Text extracts the readable text from htmlString.
//
// The markup is scanned token by token and converted as follows:
//   - text is copied to the result; a single space is inserted between two
//     pieces of text that were separated only by whitespace;
//   - <br> always produces a line break;
//   - <p>, <h1>..<h6>, elements with role="heading", <li>, </ul> and </ol>
//     produce a line break only if text was written since the previous
//     conditional break;
//   - list items are prefixed with "• ", or with "N. " inside <ol>;
//   - everything inside <head>, <script> and <style> is dropped;
//   - the content of <noscript> is converted by a nested HTML2Text call.
//
// Known limitations: the list state is a single flag and counter, so nested
// lists are not tracked independently, and a skipped element only ends at
// its explicit end tag.
func HTML2Text(htmlString string) string {
	var (
		// prevents many new lines in a row
		canPrintNewline = false
		// prevents many spaces in a row
		isSpaceNeeded = false
		// tells whether there was whitespace-only text since the last written text
		wasSpace = false
		// nesting depth of unwanted tags; text is written only while it is 0
		skipTags = 0
		// determines if the current list is ordered
		isOrderedList = false
		// the index of the last ordered list item
		orderedListIndex = 0
	)

	// use the tokenizer, not the parser: it is faster and no html tree is needed
	tokenizer := html.NewTokenizer(strings.NewReader(htmlString))
	clearString := strings.Builder{}
	clearString.Grow(len(htmlString))

	// writeString appends text to the result; whitespace-only text is not
	// written, it is only remembered so a separating space can be added later
	writeString := func(text string) {
		if len(strings.TrimSpace(text)) == 0 {
			wasSpace = true
			return
		}
		if isSpaceNeeded && wasSpace && text[0] != spaceRune {
			clearString.WriteRune(spaceRune)
		}
		clearString.WriteString(text)
		canPrintNewline = true
		isSpaceNeeded = text[len(text)-1] != spaceRune
		wasSpace = false
	}

	// writeNewLine writes a new line without conditions, e.g. because of a <br> tag
	writeNewLine := func() {
		if skipTags == 0 {
			clearString.WriteString(newLine)
			isSpaceNeeded = false
		}
	}

	// writeNewLineConditional writes a new line only if text was written
	// since the previous conditional new line
	writeNewLineConditional := func() {
		if skipTags == 0 && canPrintNewline {
			clearString.WriteString(newLine)
			canPrintNewline = false
			isSpaceNeeded = false
		}
	}

	// walk the tokens until the tokenizer reports an error (normally the end of input)
	for tokenType := tokenizer.Next(); tokenType != html.ErrorToken; tokenType = tokenizer.Next() {
		token := tokenizer.Token()
		switch tokenType {
		// if token is text - write it (skip empty strings)
		case html.TextToken:
			if skipTags == 0 {
				writeString(token.Data)
			}
		// add new line instead of some tags
		case html.StartTagToken:
			switch token.DataAtom {
			case atom.Br:
				writeNewLine()
			case atom.Li:
				// an item inside a skipped element must neither emit a marker
				// nor advance the ordered list counter
				if skipTags == 0 {
					writeNewLineConditional()
					if isOrderedList {
						orderedListIndex++
						writeString(fmt.Sprintf("%d. ", orderedListIndex))
					} else {
						writeString("• ")
					}
				}
			case atom.Ol:
				isOrderedList = true
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			case atom.Noscript:
				// golang.org/x/net/html reports everything inside <noscript> as
				// text, so the markup in it has to be converted one more time.
				// The content is the token that follows the start tag: advance
				// to it and read its data (the start tag token itself only
				// carries the tag name). Next is called first so the token is
				// consumed even when the output is being skipped, and the
				// skipTags check avoids an unnecessary recursion.
				if tokenizer.Next() == html.TextToken && skipTags == 0 {
					writeString(HTML2Text(tokenizer.Token().Data))
				}
			// we do not want to parse content from these tags, so skip them
			case atom.Head, atom.Script, atom.Style:
				skipTags++
			default:
				// write a new line if the tag has the heading role
				for _, attr := range token.Attr {
					if attr.Key == "role" && attr.Val == "heading" {
						writeNewLineConditional()
						break
					}
				}
			}
		// add new line instead of some tags
		case html.EndTagToken:
			switch token.DataAtom {
			case atom.Ul:
				writeNewLineConditional()
			case atom.Ol:
				writeNewLineConditional()
				orderedListIndex = 0
				isOrderedList = false
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			// end of unwanted tags
			case atom.Head, atom.Script, atom.Style:
				// never go below zero: an unmatched end tag would otherwise
				// suppress all the text that follows it
				if skipTags > 0 {
					skipTags--
				}
			}
		case html.SelfClosingTagToken:
			switch token.DataAtom {
			case atom.Br, atom.Li:
				writeNewLine()
			}
		}
	}

	return clearString.String()
}