From dd1250446490241f26d2cd79279c0edc2e75504b Mon Sep 17 00:00:00 2001 From: Dmitry Verkhoturov Date: Sat, 22 Mar 2025 22:07:03 +0100 Subject: [PATCH] Refactor UReadability methods to use pointer receiver Needed for OpenAI support later on --- backend/extractor/pics.go | 4 ++-- backend/extractor/readability.go | 10 +++++----- backend/extractor/text.go | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/backend/extractor/pics.go b/backend/extractor/pics.go index abe583aa..7117dc7f 100644 --- a/backend/extractor/pics.go +++ b/backend/extractor/pics.go @@ -11,7 +11,7 @@ import ( log "github.com/go-pkgz/lgr" ) -func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) { +func (f *UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) { images := make(map[int]string) type imgInfo struct { @@ -58,7 +58,7 @@ func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainI } // getImageSize loads image to get size -func (f UReadability) getImageSize(url string) (size int) { +func (f *UReadability) getImageSize(url string) (size int) { httpClient := &http.Client{Timeout: time.Second * 30} req, err := http.NewRequest("GET", url, nil) if err != nil { diff --git a/backend/extractor/readability.go b/backend/extractor/readability.go index 89afb8b0..d837f484 100644 --- a/backend/extractor/readability.go +++ b/backend/extractor/readability.go @@ -59,17 +59,17 @@ var ( const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15" // Extract fetches page and retrieves article -func (f UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) { +func (f *UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) { return f.extractWithRules(ctx, reqURL, nil) } // ExtractByRule fetches page and retrieves article using a specific rule -func (f UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) { +func (f *UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) { return f.extractWithRules(ctx, reqURL, rule) } // ExtractWithRules is the core function that handles extraction with or without a specific rule -func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) { +func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) { log.Printf("[INFO] extract %s", reqURL) rb := &Response{} @@ -140,7 +140,7 @@ func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule // getContent retrieves content from raw body string, both content (text only) and rich (with html tags) // if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage, // and at last tries to use general readability parser -func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) { +func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) { // general parser genParser := func(body, _ string) (content, rich string, err error) { doc, err := readability.NewDocument(body) @@ -192,7 +192,7 @@ func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule } // makes all links absolute and returns all found links -func (f UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) { +func (f *UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) { absoluteLink := func(link string) (absLink string, changed bool) { if r, err := reqContext.URL.Parse(link); err == nil { return r.String(), r.String() != link diff --git a/backend/extractor/text.go b/backend/extractor/text.go index 0f8c2dfe..ba21cefe 100644 --- a/backend/extractor/text.go +++ b/backend/extractor/text.go @@ -12,7 +12,7 @@ import ( ) // get clean text from html content -func (f UReadability) getText(content, title string) string { +func (f *UReadability) getText(content, title string) string { cleanText := sanitize.HTML(content) cleanText = strings.Replace(cleanText, title, "", 1) // get rid of title in snippet cleanText = strings.ReplaceAll(cleanText, "\t", " ") @@ -32,7 +32,7 @@ func (f UReadability) getText(content, title string) string { } // get snippet from clean text content -func (f UReadability) getSnippet(cleanText string) string { +func (f *UReadability) getSnippet(cleanText string) string { cleanText = strings.ReplaceAll(cleanText, "\n", " ") size := len([]rune(cleanText)) if size > f.SnippetSize { @@ -50,7 +50,7 @@ func (f UReadability) getSnippet(cleanText string) string { } // detect encoding, content type and convert content to utf8 -func (f UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) { +func (f *UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) { getContentTypeAndEncoding := func(str string) (contentType, encoding string) { // from "text/html; charset=windows-1251" elems := strings.Split(str, ";") contentType = strings.TrimSpace(elems[0])