diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..9a19d6a --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,98 @@ +run: + concurrency: 4 + timeout: 1m + +linters-settings: + wrapcheck: + ignoreSigs: + - .Errorf( + - errors.New( + - errors.Unwrap( + - .Wrap( + - .Wrapf( + - .WithMessage( + - .WithMessagef( + - .WithStack( + - .WrapError( + ignoreSigRegexps: + - \.New.*Error\( + ignorePackageGlobs: + - encoding/* + - github.com/pkg/* + revive: + ignore-generated-header: true + severity: warning + rules: + - name: exported + severity: warning + - name: error-return + severity: warning + - name: error-naming + severity: warning + - name: if-return + severity: warning + - name: var-naming + severity: warning + - name: var-declaration + severity: warning + - name: receiver-naming + severity: warning + - name: errorf + severity: warning + - name: empty-block + severity: warning + - name: unused-parameter + severity: warning + - name: unreachable-code + severity: warning + - name: redefines-builtin-id + severity: warning + - name: superfluous-else + severity: warning + - name: unexported-return + severity: warning + - name: indent-error-flow + severity: warning + - name: blank-imports + severity: warning + - name: range + severity: warning + - name: time-naming + severity: warning + - name: context-as-argument + severity: warning + - name: context-keys-type + severity: warning + - name: indent-error-flow + severity: warning + +linters: + disable-all: true + enable: + - asciicheck + - durationcheck + - errcheck + - errorlint + - exhaustive + - gosec + - govet + - makezero + - nilerr + - rowserrcheck + - exportloopref + - sqlclosecheck + - staticcheck + - typecheck + - bodyclose + - noctx + - prealloc + - gosimple + presets: + - comment + - error + - format + - metalinter + - unused + +issues: + exclude-use-default: false \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..b0384e1 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/kennygrant/sanitize + +go 1.19 + +require golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..8fca440 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b h1:ZmngSVLe/wycRns9MKikG9OWIEjGcGAkacif7oYQaUY= +golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= diff --git a/sanitize.go b/sanitize.go index 2932209..5955273 100755 --- a/sanitize.go +++ b/sanitize.go @@ -1,8 +1,13 @@ -// Package sanitize provides functions for sanitizing text. +/* +Package sanitize provides functions for sanitizing text. +*/ + +//nolint:wrapcheck package sanitize import ( "bytes" + "errors" "html" "html/template" "io" @@ -14,9 +19,48 @@ import ( ) var ( - ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"} + ignoreTags = []string{ + "title", + "script", + "style", + "iframe", + "frame", + "frameset", + "noframes", + "noembed", + "embed", + "applet", + "object", + "base", + } - defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"} + defaultTags = []string{ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "div", + "span", + "hr", + "p", + "br", + "b", + "i", + "strong", + "em", + "ol", + "ul", + "li", + "a", + "img", + "pre", + "code", + "blockquote", + "article", + "section", + } defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"} ) @@ -24,7 +68,6 @@ var ( // HTMLAllowing sanitizes html, allowing some tags. // Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments. func HTMLAllowing(s string, args ...[]string) (string, error) { - allowedTags := defaultTags if len(args) > 0 { allowedTags = args[0] @@ -48,7 +91,7 @@ func HTMLAllowing(s string, args ...[]string) (string, error) { case parser.ErrorToken: err := tokenizer.Err() - if err == io.EOF { + if errors.Is(err, io.EOF) { return buffer.String(), nil } return "", err @@ -94,13 +137,11 @@ func HTMLAllowing(s string, args ...[]string) (string, error) { } } - } // HTML strips html tags, replace common entities, and escapes <>&;'" in the result. // Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated. func HTML(s string) (output string) { - // Shortcut strings with no tags in them if !strings.ContainsAny(s, "<>") { output = s @@ -108,14 +149,14 @@ func HTML(s string) (output string) { // First remove line breaks etc as these have no meaning outside html tags (except pre) // this means pre sections will lose formatting... but will result in less unintentional paras. - s = strings.Replace(s, "\n", "", -1) + s = strings.ReplaceAll(s, "\n", "") // Then replace line breaks with newlines, to preserve that formatting - s = strings.Replace(s, "

", "\n", -1) - s = strings.Replace(s, "
", "\n", -1) - s = strings.Replace(s, "
", "\n", -1) - s = strings.Replace(s, "
", "\n", -1) - s = strings.Replace(s, "
", "\n", -1) + s = strings.ReplaceAll(s, "

", "\n") + s = strings.ReplaceAll(s, "
", "\n") + s = strings.ReplaceAll(s, "
", "\n") + s = strings.ReplaceAll(s, "
", "\n") + s = strings.ReplaceAll(s, "
", "\n") // Walk through the string removing all tags b := bytes.NewBufferString("") @@ -136,13 +177,13 @@ func HTML(s string) (output string) { } // Remove a few common harmless entities, to arrive at something more like plain text - output = strings.Replace(output, "‘", "'", -1) - output = strings.Replace(output, "’", "'", -1) - output = strings.Replace(output, "“", "\"", -1) - output = strings.Replace(output, "”", "\"", -1) - output = strings.Replace(output, " ", " ", -1) - output = strings.Replace(output, """, "\"", -1) - output = strings.Replace(output, "'", "'", -1) + output = strings.ReplaceAll(output, "‘", "'") + output = strings.ReplaceAll(output, "’", "'") + output = strings.ReplaceAll(output, "“", "\"") + output = strings.ReplaceAll(output, "”", "\"") + output = strings.ReplaceAll(output, " ", " ") + output = strings.ReplaceAll(output, """, "\"") + output = strings.ReplaceAll(output, "'", "'") // Translate some entities into their plain text equivalent (for example accents, if encoded as entities) output = html.UnescapeString(output) @@ -151,15 +192,15 @@ func HTML(s string) (output string) { output = template.HTMLEscapeString(output) // After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString - output = strings.Replace(output, """, "\"", -1) - output = strings.Replace(output, "'", "'", -1) - output = strings.Replace(output, "& ", "& ", -1) // NB space after - output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after + output = strings.ReplaceAll(output, """, "\"") + output = strings.ReplaceAll(output, "'", "'") + output = strings.ReplaceAll(output, "& ", "& ") // NB space after + output = strings.ReplaceAll(output, "&amp; ", "& ") // NB space after return output } -// We are very restrictive as this is intended for ascii url slugs +// We are very restrictive as this is intended for ascii url slugs. var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`) // Path makes a string safe to use as a URL path, @@ -169,7 +210,7 @@ var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`) func Path(s string) string { // Start with lowercase string filePath := strings.ToLower(s) - filePath = strings.Replace(filePath, "..", "", -1) + filePath = strings.ReplaceAll(filePath, "..", "") filePath = path.Clean(filePath) // Remove illegal characters for paths, flattening accents @@ -180,7 +221,7 @@ func Path(s string) string { return filePath } -// Remove all other unrecognised characters apart from +// Remove all other unrecognised characters apart from. var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`) // Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters. @@ -196,13 +237,12 @@ func Name(s string) string { return fileName } -// Replace these separators with - +// Replace these separators with -. var baseNameSeparators = regexp.MustCompile(`[./]`) // BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -. // No attempt is made to normalise a path or normalise case. func BaseName(s string) string { - // Replace certain joining characters with a dash baseName := baseNameSeparators.ReplaceAllString(s, "-") @@ -228,6 +268,8 @@ var transliterations = map[rune]string{ 'É': "E", 'Ê': "E", 'Ë': "E", + 'Ğ': "G", + 'İ': "I", 'Ì': "I", 'Í': "I", 'Î': "I", @@ -240,6 +282,7 @@ var transliterations = map[rune]string{ 'Ô': "O", 'Õ': "O", 'Ö': "OE", + 'Ş': "S", 'Ø': "OE", 'Œ': "OE", 'Ù': "U", @@ -261,6 +304,8 @@ var transliterations = map[rune]string{ 'é': "e", 'ê': "e", 'ë': "e", + 'ğ': "g", + 'ı': "i", 'ì': "i", 'í': "i", 'î': "i", @@ -277,6 +322,7 @@ var transliterations = map[rune]string{ 'ö': "oe", 'ø': "oe", 'œ': "oe", + 'ş': "s", 'ś': "s", 'ù': "u", 'ú': "u", @@ -358,7 +404,6 @@ var ( // cleanString replaces separators with - and removes characters listed in the regexp provided from string. // Accents, spaces, and all characters not in A-Za-z0-9 are replaced. func cleanString(s string, r *regexp.Regexp) string { - // Remove any trailing space to avoid ending on - s = strings.Trim(s, " ") diff --git a/sanitize_test.go b/sanitize_test.go index a2242dc..23cdef8 100644 --- a/sanitize_test.go +++ b/sanitize_test.go @@ -12,7 +12,8 @@ type Test struct { expected string } -// NB the treatment of accents - they are removed and replaced with ascii transliterations +// NB the treatment of accents - they are removed and replaced with +// ascii transliterations. var urls = []Test{ {"ReAd ME.md", `read-me.md`}, {"E88E08A7-279C-4CC1-8B90-86DE0D7044_3C.html", `e88e08a7-279c-4cc1-8b90-86de0d7044-3c.html`}, @@ -54,6 +55,8 @@ var fileNames = []Test{ {"../4 icon-testé *8%^\"'\".jpg ", `4-icon-teste-8.jpg`}, {"Überfluß an Döner macht schöner.JPEG", `ueberfluss-an-doener-macht-schoener.jpeg`}, {"Ä-_-Ü_:()_Ö-_-ä-_-ü-_-ö-_ß.webm", `ae-ue-oe-ae-ue-oe-ss.webm`}, + {"uğur özyılmazel şukela", `ugur-oezyilmazel-sukela`}, + {"uĞur Özyılmazel İğneli Şukela", `ugur-oezyilmazel-igneli-sukela`}, } func TestName(t *testing.T) { @@ -97,7 +100,8 @@ func TestBaseName(t *testing.T) { // NB because we remove all tokens after a < until the next > // and do not attempt to parse, we should be safe from invalid html, // but will sometimes completely empty the string if we have invalid input -// Note we sometimes use " in order to keep things on one line and use the ` character +// Note we sometimes use " in order to keep things on one line and use +// the ` character. var htmlTests = []Test{ {` `, " "}, {`&#x000D;`, `&#x000D;`}, @@ -106,13 +110,22 @@ var htmlTests = []Test{ {`FOO ZOO`, "FOO\rZOO"}, {`">`, `alert("XSS")"`}, {``, ``}, @@ -122,8 +135,11 @@ var htmlTests = []Test{ {`> & test <`, `> & test <`}, {``, ``}, {`“hello” it’s for ‘real’`, `"hello" it's for 'real'`}, - {``, ``}, + { + ``, + ``, + }, {`'';!--"=&{()}`, `'';!--"=&{()}`}, {"LINE 1
\nLINE 2", "LINE 1\nLINE 2"}, @@ -157,8 +173,14 @@ var htmlTestsAllowing = []Test{ {``, ``}, {`hello world`, `hello world`}, {`hello



rulers`, `hello



rulers`}, - {`

Span

`, `

Span

`}, - {`
Div

test

invalid

test

`, `
Div

test

invalid

test

`}, + { + `

Span

`, + `

Span

`, + }, + { + `
Div

test

invalid

test

`, + `
Div

test

invalid

test

`, + }, {`

Some text

`, `

Some text

`}, {`hello world`, `hello world`}, {`text

inside

too`, `text

inside

too`}, @@ -167,11 +189,20 @@ var htmlTestsAllowing = []Test{ {"

Bold Not bold

\nAlso not bold.", "

Bold Not bold

\nAlso not bold."}, {"`FOO ZOO", "`FOO ZOO"}, {`PT SRC="http://ha.ckers.org/xss.js">`, `PT SRC="http://ha.ckers.org/xss.js">`}, - {``, ``}, + { + ` +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-`, + ` +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-`, + }, + { + `PT SRC="http://ha.ckers.org/xss.js">`, + `PT SRC="http://ha.ckers.org/xss.js">`, + }, + { + ``, + ``, + }, {`'';!--"=&{()}`, `'';!--"=&{()}`}, {`">`, `">`}, - {``, ``}, - {`cool guy`, `cool guy`}, + { + ``, + ``, + }, + { + `cool guy`, + `cool guy`, + }, } func TestHTMLAllowed(t *testing.T) { - for _, test := range htmlTestsAllowing { output, err := HTMLAllowing(test.input) if err != nil {