Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# golangci-lint configuration for this module.
run:
  # Number of threads the linters may use.
  concurrency: 4
  # Abort the whole lint run if it takes longer than this.
  timeout: 1m

linters-settings:
  wrapcheck:
    # Signatures containing any of these substrings are not reported as
    # returning an unwrapped error.
    ignoreSigs:
      - .Errorf(
      - errors.New(
      - errors.Unwrap(
      - .Wrap(
      - .Wrapf(
      - .WithMessage(
      - .WithMessagef(
      - .WithStack(
      - .WrapError(
    # Same as ignoreSigs, but matched as regular expressions.
    ignoreSigRegexps:
      - \.New.*Error\(
    # Errors returned from packages matching these globs are not reported.
    ignorePackageGlobs:
      - encoding/*
      - github.com/pkg/*
  revive:
    ignore-generated-header: true
    severity: warning
    # Each rule is listed exactly once; all are reported as warnings.
    rules:
      - name: exported
        severity: warning
      - name: error-return
        severity: warning
      - name: error-naming
        severity: warning
      - name: if-return
        severity: warning
      - name: var-naming
        severity: warning
      - name: var-declaration
        severity: warning
      - name: receiver-naming
        severity: warning
      - name: errorf
        severity: warning
      - name: empty-block
        severity: warning
      - name: unused-parameter
        severity: warning
      - name: unreachable-code
        severity: warning
      - name: redefines-builtin-id
        severity: warning
      - name: superfluous-else
        severity: warning
      - name: unexported-return
        severity: warning
      - name: indent-error-flow
        severity: warning
      - name: blank-imports
        severity: warning
      - name: range
        severity: warning
      - name: time-naming
        severity: warning
      - name: context-as-argument
        severity: warning
      - name: context-keys-type
        severity: warning

linters:
  # Start from an empty set; only the linters and presets named below run.
  disable-all: true
  enable:
    - asciicheck
    - durationcheck
    - errcheck
    - errorlint
    - exhaustive
    - gosec
    - govet
    - makezero
    - nilerr
    - rowserrcheck
    - exportloopref
    - sqlclosecheck
    - staticcheck
    - typecheck
    - bodyclose
    - noctx
    - prealloc
    - gosimple
  presets:
    - comment
    - error
    - format
    - metalinter
    - unused

issues:
  # Do not apply golangci-lint's built-in default exclusion patterns.
  exclude-use-default: false
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module github.com/kennygrant/sanitize

go 1.19

require golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b h1:ZmngSVLe/wycRns9MKikG9OWIEjGcGAkacif7oYQaUY=
golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
105 changes: 75 additions & 30 deletions sanitize.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
// Package sanitize provides functions for sanitizing text.
/*
Package sanitize provides functions for sanitizing text.
*/

//nolint:wrapcheck
package sanitize

import (
"bytes"
"errors"
"html"
"html/template"
"io"
Expand All @@ -14,17 +19,55 @@ import (
)

var (
ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
ignoreTags = []string{
"title",
"script",
"style",
"iframe",
"frame",
"frameset",
"noframes",
"noembed",
"embed",
"applet",
"object",
"base",
}

defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
defaultTags = []string{
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"div",
"span",
"hr",
"p",
"br",
"b",
"i",
"strong",
"em",
"ol",
"ul",
"li",
"a",
"img",
"pre",
"code",
"blockquote",
"article",
"section",
}

defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)

// HTMLAllowing sanitizes html, allowing some tags.
// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
func HTMLAllowing(s string, args ...[]string) (string, error) {

allowedTags := defaultTags
if len(args) > 0 {
allowedTags = args[0]
Expand All @@ -48,7 +91,7 @@ func HTMLAllowing(s string, args ...[]string) (string, error) {

case parser.ErrorToken:
err := tokenizer.Err()
if err == io.EOF {
if errors.Is(err, io.EOF) {
return buffer.String(), nil
}
return "", err
Expand Down Expand Up @@ -94,28 +137,26 @@ func HTMLAllowing(s string, args ...[]string) (string, error) {
}

}

}

// HTML strips html tags, replaces common entities, and escapes <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
func HTML(s string) (output string) {

// Shortcut strings with no tags in them
if !strings.ContainsAny(s, "<>") {
output = s
} else {

// First remove line breaks etc as these have no meaning outside html tags (except pre)
// this means pre sections will lose formatting... but will result in less unintentional paras.
s = strings.Replace(s, "\n", "", -1)
s = strings.ReplaceAll(s, "\n", "")

// Then replace line breaks with newlines, to preserve that formatting
s = strings.Replace(s, "</p>", "\n", -1)
s = strings.Replace(s, "<br>", "\n", -1)
s = strings.Replace(s, "</br>", "\n", -1)
s = strings.Replace(s, "<br/>", "\n", -1)
s = strings.Replace(s, "<br />", "\n", -1)
s = strings.ReplaceAll(s, "</p>", "\n")
s = strings.ReplaceAll(s, "<br>", "\n")
s = strings.ReplaceAll(s, "</br>", "\n")
s = strings.ReplaceAll(s, "<br/>", "\n")
s = strings.ReplaceAll(s, "<br />", "\n")

// Walk through the string removing all tags
b := bytes.NewBufferString("")
Expand All @@ -136,13 +177,13 @@ func HTML(s string) (output string) {
}

// Remove a few common harmless entities, to arrive at something more like plain text
output = strings.Replace(output, "&#8216;", "'", -1)
output = strings.Replace(output, "&#8217;", "'", -1)
output = strings.Replace(output, "&#8220;", "\"", -1)
output = strings.Replace(output, "&#8221;", "\"", -1)
output = strings.Replace(output, "&nbsp;", " ", -1)
output = strings.Replace(output, "&quot;", "\"", -1)
output = strings.Replace(output, "&apos;", "'", -1)
output = strings.ReplaceAll(output, "&#8216;", "'")
output = strings.ReplaceAll(output, "&#8217;", "'")
output = strings.ReplaceAll(output, "&#8220;", "\"")
output = strings.ReplaceAll(output, "&#8221;", "\"")
output = strings.ReplaceAll(output, "&nbsp;", " ")
output = strings.ReplaceAll(output, "&quot;", "\"")
output = strings.ReplaceAll(output, "&apos;", "'")

// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
output = html.UnescapeString(output)
Expand All @@ -151,15 +192,15 @@ func HTML(s string) (output string) {
output = template.HTMLEscapeString(output)

// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
output = strings.Replace(output, "&#34;", "\"", -1)
output = strings.Replace(output, "&#39;", "'", -1)
output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after
output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
output = strings.ReplaceAll(output, "&#34;", "\"")
output = strings.ReplaceAll(output, "&#39;", "'")
output = strings.ReplaceAll(output, "&amp; ", "& ") // NB space after
output = strings.ReplaceAll(output, "&amp;amp; ", "& ") // NB space after

return output
}

// illegalPath matches every character that is not alphanumeric, '~', '-', '.' or '/'.
// We are very restrictive as this is intended for ascii url slugs.
var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)

// Path makes a string safe to use as a URL path,
Expand All @@ -169,7 +210,7 @@ var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
func Path(s string) string {
// Start with lowercase string
filePath := strings.ToLower(s)
filePath = strings.Replace(filePath, "..", "", -1)
filePath = strings.ReplaceAll(filePath, "..", "")
filePath = path.Clean(filePath)

// Remove illegal characters for paths, flattening accents
Expand All @@ -180,7 +221,7 @@ func Path(s string) string {
return filePath
}

// illegalName matches every character that is not alphanumeric, '-' or '.'.
var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)

// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
Expand All @@ -196,13 +237,12 @@ func Name(s string) string {
return fileName
}

// baseNameSeparators matches the separators '.' and '/', which BaseName replaces with '-'.
var baseNameSeparators = regexp.MustCompile(`[./]`)

// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
// No attempt is made to normalise a path or normalise case.
func BaseName(s string) string {

// Replace certain joining characters with a dash
baseName := baseNameSeparators.ReplaceAllString(s, "-")

Expand All @@ -228,6 +268,8 @@ var transliterations = map[rune]string{
'É': "E",
'Ê': "E",
'Ë': "E",
'Ğ': "G",
'İ': "I",
'Ì': "I",
'Í': "I",
'Î': "I",
Expand All @@ -240,6 +282,7 @@ var transliterations = map[rune]string{
'Ô': "O",
'Õ': "O",
'Ö': "OE",
'Ş': "S",
'Ø': "OE",
'Œ': "OE",
'Ù': "U",
Expand All @@ -261,6 +304,8 @@ var transliterations = map[rune]string{
'é': "e",
'ê': "e",
'ë': "e",
'ğ': "g",
'ı': "i",
'ì': "i",
'í': "i",
'î': "i",
Expand All @@ -277,6 +322,7 @@ var transliterations = map[rune]string{
'ö': "oe",
'ø': "oe",
'œ': "oe",
'ş': "s",
'ś': "s",
'ù': "u",
'ú': "u",
Expand Down Expand Up @@ -358,7 +404,6 @@ var (
// cleanString replaces separators with - and removes characters listed in the regexp provided from string.
// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
func cleanString(s string, r *regexp.Regexp) string {

// Remove any trailing space to avoid ending on -
s = strings.Trim(s, " ")

Expand Down
Loading