Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/extractor/pics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
log "github.com/go-pkgz/lgr"
)

func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) {
func (f *UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) {
images := make(map[int]string)

type imgInfo struct {
Expand Down Expand Up @@ -58,7 +58,7 @@ func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainI
}

// getImageSize loads image to get size
func (f UReadability) getImageSize(url string) (size int) {
func (f *UReadability) getImageSize(url string) (size int) {
httpClient := &http.Client{Timeout: time.Second * 30}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
Expand Down
10 changes: 5 additions & 5 deletions backend/extractor/readability.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,17 @@ var (
const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15"

// Extract fetches page and retrieves article
func (f UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) {
func (f *UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) {
return f.extractWithRules(ctx, reqURL, nil)
}

// ExtractByRule fetches page and retrieves article using a specific rule
func (f UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
func (f *UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
return f.extractWithRules(ctx, reqURL, rule)
}

// extractWithRules is the core function that handles extraction with or without a specific rule
func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
log.Printf("[INFO] extract %s", reqURL)
rb := &Response{}

Expand Down Expand Up @@ -140,7 +140,7 @@ func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule
// getContent retrieves content from the raw body string, both content (text only) and rich (with html tags).
// If a rule is provided, it uses that custom rule; otherwise it tries to retrieve one from the storage,
// and finally tries to use the general readability parser.
func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
// general parser
genParser := func(body, _ string) (content, rich string, err error) {
doc, err := readability.NewDocument(body)
Expand Down Expand Up @@ -192,7 +192,7 @@ func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule
}

// makes all links absolute and returns all found links
func (f UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
func (f *UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
absoluteLink := func(link string) (absLink string, changed bool) {
if r, err := reqContext.URL.Parse(link); err == nil {
return r.String(), r.String() != link
Expand Down
6 changes: 3 additions & 3 deletions backend/extractor/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
)

// get clean text from html content
func (f UReadability) getText(content, title string) string {
func (f *UReadability) getText(content, title string) string {
cleanText := sanitize.HTML(content)
cleanText = strings.Replace(cleanText, title, "", 1) // get rid of title in snippet
cleanText = strings.ReplaceAll(cleanText, "\t", " ")
Expand All @@ -32,7 +32,7 @@ func (f UReadability) getText(content, title string) string {
}

// get snippet from clean text content
func (f UReadability) getSnippet(cleanText string) string {
func (f *UReadability) getSnippet(cleanText string) string {
cleanText = strings.ReplaceAll(cleanText, "\n", " ")
size := len([]rune(cleanText))
if size > f.SnippetSize {
Expand All @@ -50,7 +50,7 @@ func (f UReadability) getSnippet(cleanText string) string {
}

// detect encoding, content type and convert content to utf8
func (f UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) {
func (f *UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) {
getContentTypeAndEncoding := func(str string) (contentType, encoding string) { // from "text/html; charset=windows-1251"
elems := strings.Split(str, ";")
contentType = strings.TrimSpace(elems[0])
Expand Down