From e23c99ea2869140e446ec74e787777160827078f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?U=C4=9Fur=20=C3=96zy=C4=B1lmazel?=
Date: Sun, 4 Sep 2022 17:32:45 +0300
Subject: [PATCH] add go-mod support and turkish chars + golangci-lint
---
.golangci.yml | 98 ++++++++++++++++++++++++++++++++++++++++++
go.mod | 5 +++
go.sum | 2 +
sanitize.go | 105 ++++++++++++++++++++++++++++++++-------------
sanitize_test.go | 109 ++++++++++++++++++++++++++++++++++++-----------
5 files changed, 264 insertions(+), 55 deletions(-)
create mode 100644 .golangci.yml
create mode 100644 go.mod
create mode 100644 go.sum
diff --git a/.golangci.yml b/.golangci.yml
new file mode 100644
index 0000000..9a19d6a
--- /dev/null
+++ b/.golangci.yml
@@ -0,0 +1,98 @@
+run:
+ concurrency: 4
+ timeout: 1m
+
+linters-settings:
+ wrapcheck:
+ ignoreSigs:
+ - .Errorf(
+ - errors.New(
+ - errors.Unwrap(
+ - .Wrap(
+ - .Wrapf(
+ - .WithMessage(
+ - .WithMessagef(
+ - .WithStack(
+ - .WrapError(
+ ignoreSigRegexps:
+ - \.New.*Error\(
+ ignorePackageGlobs:
+ - encoding/*
+ - github.com/pkg/*
+ revive:
+ ignore-generated-header: true
+ severity: warning
+ rules:
+ - name: exported
+ severity: warning
+ - name: error-return
+ severity: warning
+ - name: error-naming
+ severity: warning
+ - name: if-return
+ severity: warning
+ - name: var-naming
+ severity: warning
+ - name: var-declaration
+ severity: warning
+ - name: receiver-naming
+ severity: warning
+ - name: errorf
+ severity: warning
+ - name: empty-block
+ severity: warning
+ - name: unused-parameter
+ severity: warning
+ - name: unreachable-code
+ severity: warning
+ - name: redefines-builtin-id
+ severity: warning
+ - name: superfluous-else
+ severity: warning
+ - name: unexported-return
+ severity: warning
+ - name: indent-error-flow
+ severity: warning
+ - name: blank-imports
+ severity: warning
+ - name: range
+ severity: warning
+ - name: time-naming
+ severity: warning
+ - name: context-as-argument
+ severity: warning
+ - name: context-keys-type
+ severity: warning
+ - name: indent-error-flow
+ severity: warning
+
+linters:
+ disable-all: true
+ enable:
+ - asciicheck
+ - durationcheck
+ - errcheck
+ - errorlint
+ - exhaustive
+ - gosec
+ - govet
+ - makezero
+ - nilerr
+ - rowserrcheck
+ - exportloopref
+ - sqlclosecheck
+ - staticcheck
+ - typecheck
+ - bodyclose
+ - noctx
+ - prealloc
+ - gosimple
+ presets:
+ - comment
+ - error
+ - format
+ - metalinter
+ - unused
+
+issues:
+ exclude-use-default: false
\ No newline at end of file
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..b0384e1
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module github.com/kennygrant/sanitize
+
+go 1.19
+
+require golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..8fca440
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b h1:ZmngSVLe/wycRns9MKikG9OWIEjGcGAkacif7oYQaUY=
+golang.org/x/net v0.0.0-20220826154423-83b083e8dc8b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
diff --git a/sanitize.go b/sanitize.go
index 2932209..5955273 100755
--- a/sanitize.go
+++ b/sanitize.go
@@ -1,8 +1,13 @@
-// Package sanitize provides functions for sanitizing text.
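+// Package sanitize provides functions for sanitizing text: stripping HTML
+// tags or allowing only a chosen set of them, and making strings safe to use
+// as URL paths and file names.
+//
+//nolint:wrapcheck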
package sanitize
import (
"bytes"
+ "errors"
"html"
"html/template"
"io"
@@ -14,9 +19,48 @@ import (
)
var (
- ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
+ ignoreTags = []string{
+ "title",
+ "script",
+ "style",
+ "iframe",
+ "frame",
+ "frameset",
+ "noframes",
+ "noembed",
+ "embed",
+ "applet",
+ "object",
+ "base",
+ }
- defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
+ defaultTags = []string{
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "div",
+ "span",
+ "hr",
+ "p",
+ "br",
+ "b",
+ "i",
+ "strong",
+ "em",
+ "ol",
+ "ul",
+ "li",
+ "a",
+ "img",
+ "pre",
+ "code",
+ "blockquote",
+ "article",
+ "section",
+ }
defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)
@@ -24,7 +68,6 @@ var (
// HTMLAllowing sanitizes html, allowing some tags.
// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
func HTMLAllowing(s string, args ...[]string) (string, error) {
-
allowedTags := defaultTags
if len(args) > 0 {
allowedTags = args[0]
@@ -48,7 +91,7 @@ func HTMLAllowing(s string, args ...[]string) (string, error) {
case parser.ErrorToken:
err := tokenizer.Err()
- if err == io.EOF {
+ if errors.Is(err, io.EOF) {
return buffer.String(), nil
}
return "", err
@@ -94,13 +137,11 @@ func HTMLAllowing(s string, args ...[]string) (string, error) {
}
}
-
}
// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
func HTML(s string) (output string) {
-
// Shortcut strings with no tags in them
if !strings.ContainsAny(s, "<>") {
output = s
@@ -108,14 +149,14 @@ func HTML(s string) (output string) {
// First remove line breaks etc as these have no meaning outside html tags (except pre)
// this means pre sections will lose formatting... but will result in less unintentional paras.
- s = strings.Replace(s, "\n", "", -1)
+ s = strings.ReplaceAll(s, "\n", "")
// Then replace line breaks with newlines, to preserve that formatting
- s = strings.Replace(s, "
", "\n", -1)
- s = strings.Replace(s, "
", "\n", -1)
- s = strings.Replace(s, "", "\n", -1)
- s = strings.Replace(s, "
", "\n", -1)
- s = strings.Replace(s, "
", "\n", -1)
+ s = strings.ReplaceAll(s, "", "\n")
+ s = strings.ReplaceAll(s, "
", "\n")
+ s = strings.ReplaceAll(s, "", "\n")
+ s = strings.ReplaceAll(s, "
", "\n")
+ s = strings.ReplaceAll(s, "
", "\n")
// Walk through the string removing all tags
b := bytes.NewBufferString("")
@@ -136,13 +177,13 @@ func HTML(s string) (output string) {
}
// Remove a few common harmless entities, to arrive at something more like plain text
- output = strings.Replace(output, "‘", "'", -1)
- output = strings.Replace(output, "’", "'", -1)
- output = strings.Replace(output, "“", "\"", -1)
- output = strings.Replace(output, "”", "\"", -1)
- output = strings.Replace(output, " ", " ", -1)
- output = strings.Replace(output, """, "\"", -1)
- output = strings.Replace(output, "'", "'", -1)
+ output = strings.ReplaceAll(output, "‘", "'")
+ output = strings.ReplaceAll(output, "’", "'")
+ output = strings.ReplaceAll(output, "“", "\"")
+ output = strings.ReplaceAll(output, "”", "\"")
+ output = strings.ReplaceAll(output, " ", " ")
+ output = strings.ReplaceAll(output, """, "\"")
+ output = strings.ReplaceAll(output, "'", "'")
// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
output = html.UnescapeString(output)
@@ -151,15 +192,15 @@ func HTML(s string) (output string) {
output = template.HTMLEscapeString(output)
// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
- output = strings.Replace(output, """, "\"", -1)
- output = strings.Replace(output, "'", "'", -1)
- output = strings.Replace(output, "& ", "& ", -1) // NB space after
- output = strings.Replace(output, "& ", "& ", -1) // NB space after
+ output = strings.ReplaceAll(output, """, "\"")
+ output = strings.ReplaceAll(output, "'", "'")
+ output = strings.ReplaceAll(output, "& ", "& ") // NB space after
+ output = strings.ReplaceAll(output, "& ", "& ") // NB space after
return output
}
-// We are very restrictive as this is intended for ascii url slugs
+// We are very restrictive as this is intended for ascii url slugs.
var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
// Path makes a string safe to use as a URL path,
@@ -169,7 +210,7 @@ var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
func Path(s string) string {
// Start with lowercase string
filePath := strings.ToLower(s)
- filePath = strings.Replace(filePath, "..", "", -1)
+ filePath = strings.ReplaceAll(filePath, "..", "")
filePath = path.Clean(filePath)
// Remove illegal characters for paths, flattening accents
@@ -180,7 +221,7 @@ func Path(s string) string {
return filePath
}
-// Remove all other unrecognised characters apart from
+// illegalName matches every character other than alphanumerics, dash and dot.
var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
@@ -196,13 +237,12 @@ func Name(s string) string {
return fileName
}
-// Replace these separators with -
+// baseNameSeparators matches the separators (dot and slash) that BaseName replaces with a dash.
var baseNameSeparators = regexp.MustCompile(`[./]`)
// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
// No attempt is made to normalise a path or normalise case.
func BaseName(s string) string {
-
// Replace certain joining characters with a dash
baseName := baseNameSeparators.ReplaceAllString(s, "-")
@@ -228,6 +268,8 @@ var transliterations = map[rune]string{
'É': "E",
'Ê': "E",
'Ë': "E",
+ 'Ğ': "G",
+ 'İ': "I",
'Ì': "I",
'Í': "I",
'Î': "I",
@@ -240,6 +282,7 @@ var transliterations = map[rune]string{
'Ô': "O",
'Õ': "O",
'Ö': "OE",
+ 'Ş': "S",
'Ø': "OE",
'Œ': "OE",
'Ù': "U",
@@ -261,6 +304,8 @@ var transliterations = map[rune]string{
'é': "e",
'ê': "e",
'ë': "e",
+ 'ğ': "g",
+ 'ı': "i",
'ì': "i",
'í': "i",
'î': "i",
@@ -277,6 +322,7 @@ var transliterations = map[rune]string{
'ö': "oe",
'ø': "oe",
'œ': "oe",
+ 'ş': "s",
'ś': "s",
'ù': "u",
'ú': "u",
@@ -358,7 +404,6 @@ var (
// cleanString replaces separators with - and removes characters listed in the regexp provided from string.
// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
func cleanString(s string, r *regexp.Regexp) string {
-
// Remove any trailing space to avoid ending on -
s = strings.Trim(s, " ")
diff --git a/sanitize_test.go b/sanitize_test.go
index a2242dc..23cdef8 100644
--- a/sanitize_test.go
+++ b/sanitize_test.go
@@ -12,7 +12,8 @@ type Test struct {
expected string
}
-// NB the treatment of accents - they are removed and replaced with ascii transliterations
+// NB the treatment of accents - they are removed and replaced with
+// ascii transliterations.
var urls = []Test{
{"ReAd ME.md", `read-me.md`},
{"E88E08A7-279C-4CC1-8B90-86DE0D7044_3C.html", `e88e08a7-279c-4cc1-8b90-86de0d7044-3c.html`},
@@ -54,6 +55,8 @@ var fileNames = []Test{
{"../4 icon-testé *8%^\"'\".jpg ", `4-icon-teste-8.jpg`},
{"Überfluß an Döner macht schöner.JPEG", `ueberfluss-an-doener-macht-schoener.jpeg`},
{"Ä-_-Ü_:()_Ö-_-ä-_-ü-_-ö-_ß.webm", `ae-ue-oe-ae-ue-oe-ss.webm`},
+ {"uğur özyılmazel şukela", `ugur-oezyilmazel-sukela`},
+ {"uĞur Özyılmazel İğneli Şukela", `ugur-oezyilmazel-igneli-sukela`},
}
func TestName(t *testing.T) {
@@ -97,7 +100,8 @@ func TestBaseName(t *testing.T) {
// NB because we remove all tokens after a < until the next >
// and do not attempt to parse, we should be safe from invalid html,
// but will sometimes completely empty the string if we have invalid input
-// Note we sometimes use " in order to keep things on one line and use the ` character
+// Note we sometimes use " in order to keep things on one line and use
+// the ` character.
var htmlTests = []Test{
{` `, " "},
{`
`, `
`},
@@ -106,13 +110,22 @@ var htmlTests = []Test{
{`FOO
ZOO`, "FOO\rZOO"},
{`">`, `alert("XSS")"`},
{`
`, ``},
@@ -122,8 +135,11 @@ var htmlTests = []Test{
{`> & test <`, `> & test <`},
{`
`, ``},
{`“hello” it’s for ‘real’`, `"hello" it's for 'real'`},
- {`
`, ``},
+ {
+ `
`,
+ ``,
+ },
{`'';!--"=&{()}`, `'';!--"=&{()}`},
{"LINE 1
\nLINE 2", "LINE 1\nLINE 2"},
@@ -157,8 +173,14 @@ var htmlTestsAllowing = []Test{
{`
`, `
`},
{`hello world`, `hello world`},
{`hello
rulers`, `hello
rulers`},
- {`Span
`, `Span
`},
- {`Div
test
invalidtest
`, `Div
test
invalidtest
`},
+ {
+ `Span
`,
+ `Span
`,
+ },
+ {
+ `Div
test
invalidtest
`,
+ `Div
test
invalidtest
`,
+ },
{`Some text
`, `Some text
`},
{`hello world`, `hello world`},
{`textinside
too`, `textinside
too`},
@@ -167,11 +189,20 @@ var htmlTestsAllowing = []Test{
{"Bold Not bold
\nAlso not bold.", "Bold Not bold
\nAlso not bold."},
{"`FOO
ZOO", "`FOO
ZOO"},
{`PT SRC="http://ha.ckers.org/xss.js">`, `PT SRC="http://ha.ckers.org/xss.js">`},
- {``, ``},
+ {
+ ` +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-`,
+ ` +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-`,
+ },
+ {
+ `PT SRC="http://ha.ckers.org/xss.js">`,
+ `PT SRC="http://ha.ckers.org/xss.js">`,
+ },
+ {
+ ``,
+ ``,
+ },
{`'';!--"=&{()}`, `'';!--"=&{()}`},
{`
">`, `
">`},
- {`
`, `
`},
- {`cool guy`, `cool guy`},
+ {
+ `
`,
+ `
`,
+ },
+ {
+ `cool guy`,
+ `cool guy`,
+ },
}
func TestHTMLAllowed(t *testing.T) {
-
for _, test := range htmlTestsAllowing {
output, err := HTMLAllowing(test.input)
if err != nil {