From 4b68bc8916ae0340a73c15028a882de0a084e093 Mon Sep 17 00:00:00 2001 From: l3akage Date: Mon, 9 Jun 2025 12:10:45 +0200 Subject: [PATCH 1/3] Fix br tag conversion to newlines in goodreads book descriptions - Replace <br /> tags with <br> before html2text processing - html2text library doesn't handle <br />
(with space) format - Add test to verify br tag replacement works correctly Fixes #16 --- goodreads/book.go | 4 +++- goodreads/book_test.go | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/goodreads/book.go b/goodreads/book.go index 2e82830..455bdec 100644 --- a/goodreads/book.go +++ b/goodreads/book.go @@ -141,7 +141,9 @@ func (e *Edition) Sanitise() { // Break tags need to be specially handled to add new lines as html2text does // not convert them to new lines properly e.Description = descriptionAlternativeCoverRegex.ReplaceAllString(e.Description, "") - e.Description = breakTagRegex.ReplaceAllString(e.Description, "\n") + // HACK: html2text only handles
<br> and <br/>, goodreads uses <br /> + // replace unsupported br tag is a supported one + e.Description = breakTagRegex.ReplaceAllString(e.Description, "<br>
") e.Description = html2text.HTML2TextWithOptions(e.Description, html2text.WithUnixLineBreaks()) e.Description = strings.TrimSpace(e.Description) diff --git a/goodreads/book_test.go b/goodreads/book_test.go index aed5d4d..006f351 100644 --- a/goodreads/book_test.go +++ b/goodreads/book_test.go @@ -37,3 +37,40 @@ func TestUnmarshalGenres(t *testing.T) { expectedGenres := goodreads.Genres{"Fantasy", "Classic", "Fiction"} require.Equal(t, expectedGenres, genres) } + +func TestBookUnmarshalBrTagReplacement(t *testing.T) { + testXML := ` + + + 123 + Test Book + This should be on a new line.
<br />Another line.<br />Final line.]]>
+ + Test Book + 100 + 10 + + + + +
+
+ ` + + var response struct { + Book goodreads.Book `xml:"book"` + } + + err := xml.Unmarshal([]byte(testXML), &response) + require.NoError(t, err) + + description := response.Book.BestEdition.Description + t.Logf("Description after processing: %q", description) + + // Verify that <br />
tags have been correctly converted to newlines + require.Contains(t, description, "test description.\nThis should be on a new line.\nAnother line.\nFinal line.") + + // Verify that no HTML br tags remain + require.NotContains(t, description, "") +} From 49b3624288cc2de664878e5d77b25619d6e4a7dd Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 11 Jun 2025 21:18:10 +0200 Subject: [PATCH 2/3] Update goodreads/book.go Co-authored-by: Arran Hobson Sayers <32173585+ahobsonsayers@users.noreply.github.com> --- goodreads/book.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goodreads/book.go b/goodreads/book.go index 455bdec..e52a35d 100644 --- a/goodreads/book.go +++ b/goodreads/book.go @@ -142,7 +142,7 @@ func (e *Edition) Sanitise() { // not convert them to new lines properly e.Description = descriptionAlternativeCoverRegex.ReplaceAllString(e.Description, "") // HACK: html2text only handles
<br> and <br/>, goodreads uses <br /> - // replace unsupported br tag is a supported one + // replace unsupported br tag with a supported one e.Description = breakTagRegex.ReplaceAllString(e.Description, "<br>") e.Description = html2text.HTML2TextWithOptions(e.Description, html2text.WithUnixLineBreaks()) e.Description = strings.TrimSpace(e.Description) From ee49bc731da0808894908c7333ca0da13949d032 Mon Sep 17 00:00:00 2001 From: l3akage Date: Fri, 13 Jun 2025 07:07:19 +0200 Subject: [PATCH 3/3] Fix make linter happy and shortened a line --- goodreads/book_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/goodreads/book_test.go b/goodreads/book_test.go index 006f351..a57a466 100644 --- a/goodreads/book_test.go +++ b/goodreads/book_test.go @@ -44,7 +44,7 @@ func TestBookUnmarshalBrTagReplacement(t *testing.T) { 123 Test Book - This should be on a new line.
<br />Another line.<br />Final line.]]>
+ 2. line
<br />3. line<br />4. line]]>
Test Book 100 @@ -68,7 +68,7 @@ func TestBookUnmarshalBrTagReplacement(t *testing.T) { t.Logf("Description after processing: %q", description) // Verify that <br />
tags have been correctly converted to newlines - require.Contains(t, description, "test description.\nThis should be on a new line.\nAnother line.\nFinal line.") + require.Contains(t, description, "Test description\n2. line\n3. line\n4. line") // Verify that no HTML br tags remain require.NotContains(t, description, "