diff --git a/bin/goose b/bin/goose index 84387bb..c6e8799 100755 Binary files a/bin/goose and b/bin/goose differ diff --git a/go.mod b/go.mod index e939be3..adf644c 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,8 @@ module github.com/advancedlogic/GoOse -go 1.21.1 +go 1.24.3 -toolchain go1.24.5 +toolchain go1.24.10 require ( github.com/PuerkitoBio/goquery v1.4.1 @@ -10,22 +10,28 @@ require ( github.com/fatih/set v0.2.1 github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 github.com/go-resty/resty/v2 v2.0.0 - github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 + github.com/inbucket/html2text v1.0.0 github.com/pkg/errors v0.9.1 github.com/spf13/cobra v1.9.1 github.com/spf13/viper v1.20.1 - golang.org/x/net v0.33.0 - golang.org/x/text v0.21.0 + golang.org/x/net v0.41.0 + golang.org/x/text v0.26.0 ) require ( github.com/andybalholm/cascadia v1.0.0 // indirect + github.com/fatih/color v1.18.0 // indirect github.com/fsnotify/fsnotify v1.8.0 // indirect github.com/go-viper/mapstructure/v2 v2.2.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/mattn/go-runewidth v0.0.3 // indirect - github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84 // indirect + github.com/mattn/go-colorable v0.1.14 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.16 // indirect + github.com/olekukonko/errors v1.1.0 // indirect + github.com/olekukonko/ll v0.0.9 // indirect + github.com/olekukonko/tablewriter v1.0.7 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect + github.com/rivo/uniseg v0.4.7 // indirect github.com/rogpeppe/go-internal v1.12.0 // indirect github.com/sagikazarmark/locafero v0.7.0 // indirect github.com/simplereach/timeutils v1.2.0 // indirect @@ -37,7 +43,7 @@ require ( github.com/subosito/gotenv v1.6.0 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect - golang.org/x/sys v0.29.0 // indirect + golang.org/x/sys v0.33.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 18f3e91..8ef99b3 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6N github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA= github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -22,10 +24,10 @@ github.com/go-viper/mapstructure/v2 v2.2.1 h1:ZAaOCxANMuZx5RCeg0mBdEZk7DZasvvZIx github.com/go-viper/mapstructure/v2 v2.2.1/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/inbucket/html2text v1.0.0 h1:N5kza++4uBBDJ2Z3KUnTRyPNoBcW+YfOgNiNmNB+sgs= +github.com/inbucket/html2text v1.0.0/go.mod h1:5TrhXQKGU+LXurODaSm55Y9eXoPBRnYiOz4x2XfUoJU= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 h1:xqgexXAGQgY3HAjNPSaCqn5Aahbo5TKsmhp8VRfr1iQ= -github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -33,16 +35,27 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/mattn/go-runewidth v0.0.3 h1:a+kO+98RDGEfo6asOGMmpodZq4FNtnGP54yps8BzLR4= -github.com/mattn/go-runewidth v0.0.3/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= -github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84 h1:fiKJgB4JDUd43CApkmCeTSQlWjtTtABrU2qsgbuP0BI= -github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= +github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= +github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= +github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/olekukonko/errors v1.1.0 h1:RNuGIh15QdDenh+hNvKrJkmxxjV4hcS50Db478Ou5sM= +github.com/olekukonko/errors v1.1.0/go.mod h1:ppzxA5jBKcO1vIpCXQ9ZqgDh8iwODz6OXIGKU8r5m4Y= +github.com/olekukonko/ll v0.0.9 h1:Y+1YqDfVkqMWuEQMclsF9HUR5+a82+dxJuL1HHSRpxI= +github.com/olekukonko/ll v0.0.9/go.mod h1:En+sEW0JNETl26+K8eZ6/W4UQ7CYSrrgg/EdIYT2H8g= +github.com/olekukonko/tablewriter v1.0.7 h1:HCC2e3MM+2g72M81ZcJU11uciw6z/p82aEnm4/ySDGw= +github.com/olekukonko/tablewriter v1.0.7/go.mod h1:H428M+HzoUXC6JU2Abj9IT9ooRmdq9CxuDmKMtrOCMs= github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -77,14 +90,15 @@ go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTV golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= +golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/internal/extractor/extractor.go b/internal/extractor/extractor.go index f557e2f..764c44d 100644 --- a/internal/extractor/extractor.go +++ b/internal/extractor/extractor.go @@ -14,7 +14,7 @@ import ( "github.com/araddon/dateparse" "github.com/fatih/set" "github.com/gigawattio/window" - "github.com/jaytaylor/html2text" + "github.com/inbucket/html2text" "golang.org/x/net/html" "golang.org/x/net/html/atom" @@ -125,16 +125,16 @@ func (extr *ContentExtractor) splitTitle(titles []string) string { for i := range titles { titles[i] = strings.TrimSpace(titles[i]) } - + // Check if last part looks like a site name (common pattern) lastPart := titles[len(titles)-1] // Common site name patterns - if len(titles) == 2 && (strings.Contains(lastPart, "News") || - strings.Contains(lastPart, "BBC") || - strings.Contains(lastPart, "CNN") || + if len(titles) == 2 && (strings.Contains(lastPart, "News") || + strings.Contains(lastPart, "BBC") || + strings.Contains(lastPart, "CNN") || strings.Contains(lastPart, "ABC") || - strings.Contains(lastPart, "Times") || - strings.Contains(lastPart, "Post") || + strings.Contains(lastPart, "Times") || + strings.Contains(lastPart, "Post") || strings.Contains(lastPart, "Journal") || len(lastPart) < 20) { // Return the first part @@ -142,7 +142,7 @@ func (extr *ContentExtractor) splitTitle(titles []string) string { return title } } - + // Fallback to the original logic - choose the longest part largeTextLength := 0 largeTextIndex := 0 @@ -417,7 +417,7 @@ func (extr *ContentExtractor) CalculateBestNode(document *goquery.Document) *goq if siteSpecificNode := extr.tryNewsSelectors(document); siteSpecificNode != nil { return siteSpecificNode } - + var topNode *goquery.Selection nodesToCheck := extr.nodesToCheck(document) if extr.config.Debug { @@ -432,11 +432,11 @@ func (extr *ContentExtractor) CalculateBestNode(document *goquery.Document) *goq textNode := node.Text() ws := extr.config.StopWords.StopWordsCount(extr.config.TargetLanguage, textNode) highLinkDensity := extr.isHighLinkDensity(node) - + // Boost scoring for nodes that look like article content articleBoost := extr.getArticleContentBoost(node) adjustedWs := ws + articleBoost - + if adjustedWs > 2 && !highLinkDensity { nodesWithText.PushBack(node) } @@ -619,90 +619,90 @@ func (extr *ContentExtractor) isLikelyNonContent(node *goquery.Selection) bool { for parent := node.Parent(); parent != nil && parent.Length() > 0; parent = parent.Parent() { class, hasClass := parent.Attr("class") id, hasId := parent.Attr("id") - + if hasClass { class = strings.ToLower(class) - if strings.Contains(class, "nav") || strings.Contains(class, "menu") || - strings.Contains(class, "header") || strings.Contains(class, "footer") || - strings.Contains(class, "sidebar") || strings.Contains(class, "aside") || - strings.Contains(class, "ad") || strings.Contains(class, "banner") || - strings.Contains(class, "breadcrumb") || strings.Contains(class, "related") { + if strings.Contains(class, "nav") || strings.Contains(class, "menu") || + strings.Contains(class, "header") || strings.Contains(class, "footer") || + strings.Contains(class, "sidebar") || strings.Contains(class, "aside") || + strings.Contains(class, "ad") || strings.Contains(class, "banner") || + strings.Contains(class, "breadcrumb") || strings.Contains(class, "related") { return true } } - + if hasId { id = strings.ToLower(id) - if strings.Contains(id, "nav") || strings.Contains(id, "menu") || - strings.Contains(id, "header") || strings.Contains(id, "footer") || - strings.Contains(id, "sidebar") || strings.Contains(id, "aside") || - strings.Contains(id, "ad") || strings.Contains(id, "banner") { + if strings.Contains(id, "nav") || strings.Contains(id, "menu") || + strings.Contains(id, "header") || strings.Contains(id, "footer") || + strings.Contains(id, "sidebar") || strings.Contains(id, "aside") || + strings.Contains(id, "ad") || strings.Contains(id, "banner") { return true } } - + // Check tag type tagName := parent.Get(0).DataAtom.String() if tagName == "nav" || tagName == "header" || tagName == "footer" || tagName == "aside" { return true } } - + // Check if the node itself has very short text (likely navigation link) text := strings.TrimSpace(node.Text()) if len(text) < 10 { return true } - + return false } // getArticleContentBoost provides additional scoring for nodes that appear to be article content func (extr *ContentExtractor) getArticleContentBoost(node *goquery.Selection) int { boost := 0 - + // Check parent hierarchy for article-related classes/ids for parent := node.Parent(); parent != nil && parent.Length() > 0; parent = parent.Parent() { class, hasClass := parent.Attr("class") id, hasId := parent.Attr("id") - + if hasClass { class = strings.ToLower(class) if strings.Contains(class, "article") || strings.Contains(class, "content") || - strings.Contains(class, "story") || strings.Contains(class, "post") || - strings.Contains(class, "entry") || strings.Contains(class, "main") || - strings.Contains(class, "body") || strings.Contains(class, "text") { + strings.Contains(class, "story") || strings.Contains(class, "post") || + strings.Contains(class, "entry") || strings.Contains(class, "main") || + strings.Contains(class, "body") || strings.Contains(class, "text") { boost += 10 } } - + if hasId { id = strings.ToLower(id) if strings.Contains(id, "article") || strings.Contains(id, "content") || - strings.Contains(id, "story") || strings.Contains(id, "post") || - strings.Contains(id, "entry") || strings.Contains(id, "main") { + strings.Contains(id, "story") || strings.Contains(id, "post") || + strings.Contains(id, "entry") || strings.Contains(id, "main") { boost += 10 } } - + // Check for semantic HTML5 tags tagName := parent.Get(0).DataAtom.String() if tagName == "article" || tagName == "main" { boost += 15 } } - + // Penalize nodes that seem to be in navigation or sidebars text := strings.TrimSpace(node.Text()) if len(text) > 100 { // Long text is more likely to be content boost += 5 } - + // Look for paragraph length - articles typically have substantial paragraphs if len(text) > 200 { boost += 5 } - + return boost } @@ -716,7 +716,7 @@ func (extr *ContentExtractor) tryNewsSelectors(document *goquery.Document) *goqu ".article-body", ".story-body", ".post-content", - ".entry-content", + ".entry-content", ".content-body", "main article", "[role='main'] article", @@ -731,7 +731,7 @@ func (extr *ContentExtractor) tryNewsSelectors(document *goquery.Document) *goqu "[data-testid='article-body']", "[data-testid='story-body']", } - + for _, selector := range selectors { selection := document.Find(selector) if selection.Length() > 0 { @@ -744,7 +744,7 @@ func (extr *ContentExtractor) tryNewsSelectors(document *goquery.Document) *goqu // Additional validation: ensure it's not mostly navigation if !extr.isHighLinkDensity(selection) && extr.hasGoodContentSignals(selection) { if extr.config.Debug { - log.Printf("Found article content using selector: %s (text length: %d, paragraphs: %d)\n", + log.Printf("Found article content using selector: %s (text length: %d, paragraphs: %d)\n", selector, len(text), paragraphs.Length()) } // Extract only the paragraph content, not the entire container @@ -754,23 +754,23 @@ func (extr *ContentExtractor) tryNewsSelectors(document *goquery.Document) *goqu } } } - + // Try looking for elements with substantial text content that aren't navigation var bestCandidate *goquery.Selection var bestScore int - + document.Find("div, article, section").Each(func(i int, s *goquery.Selection) { class, _ := s.Attr("class") id, _ := s.Attr("id") - + // Look for likely content containers - if strings.Contains(strings.ToLower(class), "content") || - strings.Contains(strings.ToLower(class), "article") || - strings.Contains(strings.ToLower(class), "story") || - strings.Contains(strings.ToLower(id), "content") || - strings.Contains(strings.ToLower(id), "article") || - strings.Contains(strings.ToLower(id), "story") { - + if strings.Contains(strings.ToLower(class), "content") || + strings.Contains(strings.ToLower(class), "article") || + strings.Contains(strings.ToLower(class), "story") || + strings.Contains(strings.ToLower(id), "content") || + strings.Contains(strings.ToLower(id), "article") || + strings.Contains(strings.ToLower(id), "story") { + text := strings.TrimSpace(s.Text()) if len(text) > 500 { // Substantial content paragraphs := s.Find("p") @@ -782,7 +782,7 @@ func (extr *ContentExtractor) tryNewsSelectors(document *goquery.Document) *goqu bestCandidate = s bestScore = score if extr.config.Debug { - log.Printf("Found potential article content by class/id: %s %s (text length: %d, score: %d)\n", + log.Printf("Found potential article content by class/id: %s %s (text length: %d, score: %d)\n", class, id, len(text), score) } } @@ -791,20 +791,20 @@ func (extr *ContentExtractor) tryNewsSelectors(document *goquery.Document) *goqu } } }) - + return bestCandidate } // hasGoodContentSignals checks if a node contains signals that indicate it's article content func (extr *ContentExtractor) hasGoodContentSignals(node *goquery.Selection) bool { text := strings.TrimSpace(node.Text()) - + // Check for article-like sentence structure (sentences ending with periods) sentences := strings.Split(text, ".") if len(sentences) < 3 { return false // Too few sentences for an article } - + // Check average sentence length (articles have substantial sentences) totalLength := 0 validSentences := 0 @@ -815,16 +815,16 @@ func (extr *ContentExtractor) hasGoodContentSignals(node *goquery.Selection) boo validSentences++ } } - + if validSentences < 3 { return false } - + avgSentenceLength := totalLength / validSentences if avgSentenceLength < 50 { // Articles typically have longer sentences return false } - + // Check for common navigation patterns to exclude lowerText := strings.ToLower(text) navigationWords := []string{ @@ -835,19 +835,19 @@ func (extr *ContentExtractor) hasGoodContentSignals(node *goquery.Selection) boo "calculators", "markets", "investing", "fashion", "beauty", "games", "crossword", "photos", "investigations", "profiles", } - + navigationCount := 0 for _, word := range navigationWords { if strings.Contains(lowerText, word) { navigationCount++ } } - + // If it contains many navigation words, it's likely not article content if navigationCount > 5 { return false } - + return true } @@ -856,26 +856,26 @@ func (extr *ContentExtractor) extractParagraphContent(selection *goquery.Selecti // Create a new document fragment with only the article paragraphs paragraphs := selection.Find("p") var cleanParagraphs []*goquery.Selection - + paragraphs.Each(func(i int, p *goquery.Selection) { text := strings.TrimSpace(p.Text()) // Only include paragraphs with substantial content if len(text) > 30 { // Skip paragraphs that look like metadata or navigation lowerText := strings.ToLower(text) - if !strings.Contains(lowerText, "updated") && - !strings.Contains(lowerText, "published") && - !strings.Contains(lowerText, "min read") && - !strings.Contains(lowerText, "follow") && - !strings.Contains(lowerText, "subscribe") && - !strings.Contains(lowerText, "sign in") && - !strings.Contains(lowerText, "analysis by") && - !strings.Contains(lowerText, "see all topics") { + if !strings.Contains(lowerText, "updated") && + !strings.Contains(lowerText, "published") && + !strings.Contains(lowerText, "min read") && + !strings.Contains(lowerText, "follow") && + !strings.Contains(lowerText, "subscribe") && + !strings.Contains(lowerText, "sign in") && + !strings.Contains(lowerText, "analysis by") && + !strings.Contains(lowerText, "see all topics") { cleanParagraphs = append(cleanParagraphs, p) } } }) - + // If we found good paragraphs, return the first one's parent and modify it if len(cleanParagraphs) > 0 { // Return the original selection but with filtered content @@ -897,7 +897,7 @@ func (extr *ContentExtractor) extractParagraphContent(selection *goquery.Selecti }) return selection } - + return selection } @@ -920,12 +920,12 @@ func (extr *ContentExtractor) isHighLinkDensity(node *goquery.Selection) bool { linkWords := strings.Split(linkText, " ") nlinkWords := len(linkWords) nlinks := links.Size() - + // Avoid division by zero if nwords == 0 { return true } - + linkDivisor := float64(nlinkWords) / float64(nwords) score := linkDivisor * float64(nlinks) @@ -934,7 +934,7 @@ func (extr *ContentExtractor) isHighLinkDensity(node *goquery.Selection) bool { if nlinks > 5 && linkDivisor > 0.3 { return true } - + // If more than 60% of words are in links, it's likely navigation if linkDivisor > 0.6 { return true @@ -949,7 +949,7 @@ func (extr *ContentExtractor) isHighLinkDensity(node *goquery.Selection) bool { } log.Printf("Calculated link density score as %1.5f for node %s (links: %d, linkDivisor: %1.3f)\n", score, logText, nlinks, linkDivisor) } - if score > 0.8 { // Lowered from 1.0 to be more aggressive + if score > 0.8 { // Lowered from 1.0 to be more aggressive return true } return false