diff --git a/core/loader/atom.ts b/core/loader/atom.ts index 88b834dc..62a6eace 100644 --- a/core/loader/atom.ts +++ b/core/loader/atom.ts @@ -4,8 +4,9 @@ import { createPostsList } from '../posts-list.ts' import type { Loader } from './index.ts' import { findAnchorHrefs, + findDocumentLinks, + findHeaderLinks, findImageByAttr, - findLinksByType, isHTML, toTime } from './utils.ts' @@ -37,9 +38,12 @@ function parsePosts(text: TextResponse): OriginPost[] { export const atom: Loader = { getMineLinksFromText(text) { - if (!isHTML(text)) return [] + let type = 'application/atom+xml' + let headerLinks = findHeaderLinks(text, type) + if (!isHTML(text)) return headerLinks let links = [ - ...findLinksByType(text, 'application/atom+xml'), + ...headerLinks, + ...findDocumentLinks(text, type), ...findAnchorHrefs(text, /feeds\.|feed\.|\.atom|\/atom/i, /feed|atom/i) ] if (links.length > 0) { diff --git a/core/loader/json-feed.ts b/core/loader/json-feed.ts index 2d64d6cd..e7cc534b 100644 --- a/core/loader/json-feed.ts +++ b/core/loader/json-feed.ts @@ -5,7 +5,8 @@ import { createPostsList } from '../posts-list.ts' import type { Loader } from './index.ts' import { findAnchorHrefs, - findLinksByType, + findDocumentLinks, + findHeaderLinks, isHTML, toTime, unique @@ -131,10 +132,12 @@ function parsePosts(text: TextResponse): OriginPost[] { export const jsonFeed: Loader = { getMineLinksFromText(text) { - if (!isHTML(text)) return [] - let linksByType = findLinksByType(text, 'application/feed+json') + let type = 'application/feed+json' + let headerLinks = findHeaderLinks(text, type) + if (!isHTML(text)) return headerLinks + let linksByType = [...headerLinks, ...findDocumentLinks(text, type)] if (linksByType.length === 0) { - linksByType = findLinksByType(text, 'application/json') + linksByType = findDocumentLinks(text, 'application/json') } return [...linksByType, ...findAnchorHrefs(text, /feed\.json/i)] }, diff --git a/core/loader/rss.ts b/core/loader/rss.ts index ee299e4c..8123893f 
100644 --- a/core/loader/rss.ts +++ b/core/loader/rss.ts @@ -4,8 +4,9 @@ import { createPostsList } from '../posts-list.ts' import type { Loader } from './index.ts' import { findAnchorHrefs, + findDocumentLinks, + findHeaderLinks, findImageByAttr, - findLinksByType, isHTML, toTime, unique @@ -48,9 +49,12 @@ function parsePosts(text: TextResponse): OriginPost[] { export const rss: Loader = { getMineLinksFromText(text) { - if (!isHTML(text)) return [] + let type = 'application/rss+xml' + let headerLinks = findHeaderLinks(text, type) + if (!isHTML(text)) return headerLinks return [ - ...findLinksByType(text, 'application/rss+xml'), + ...headerLinks, + ...findDocumentLinks(text, type), ...findAnchorHrefs(text, /\.rss|\/rss/i, /rss/i) ] }, diff --git a/core/loader/utils.ts b/core/loader/utils.ts index 7e281597..368fca71 100644 --- a/core/loader/utils.ts +++ b/core/loader/utils.ts @@ -30,7 +30,7 @@ function buildFullURL( ) } -export function findLinksByType(text: TextResponse, type: string): string[] { +export function findDocumentLinks(text: TextResponse, type: string): string[] { let document = text.parseXml() if (!document) return [] return [...document.querySelectorAll('link')] @@ -61,6 +61,53 @@ export function findAnchorHrefs( .map(a => buildFullURL(a, text.url)) } +/** + * Returns feed URLs advertised in the `Link` HTTP header of a response, + * or an empty array if the header is absent. + * Only entries whose `rel` contains `alternate` and whose `type` equals the + * requested MIME type are returned. An example of a header with two entries: + * <https://example.com/rss>; rel="alternate"; type="application/rss+xml", + * </blog/rss>; rel="alternate"; type="application/rss+xml" + * Absolute http(s) URLs are returned as is; other references are resolved + * against `response.url`. Entries whose URL cannot be resolved are skipped. 
+ */ +export function findHeaderLinks( + response: TextResponse, + type: string +): string[] { + let linkHeader = response.headers.get('Link') + if (!linkHeader) return [] + // Take `<url>` with its parameters as one unit, so a comma inside the URL + // or inside a quoted parameter value does not split an entry. + let entryPattern = /<([^>]*)>((?:[^,"]|"[^"]*")*)/g + let paramPattern = /;\s*([^=;\s"]+)\s*=\s*(?:"([^"]*)"|([^;"\s]*))/g + let wanted = type.toLowerCase() + let urls: string[] = [] + for (let [, url = '', params = ''] of linkHeader.matchAll(entryPattern)) { + let rel: string | undefined + let mime: string | undefined + for (let [, name = '', quoted, token = ''] of params.matchAll(paramPattern)) { + let key = name.toLowerCase() + let value = (quoted ?? token).toLowerCase() + // Per RFC 8288 only the first `rel` and `type` of an entry count. + if (key === 'rel' && rel === undefined) rel = value + else if (key === 'type' && mime === undefined) mime = value + } + let isAlternate = rel !== undefined && rel.split(/\s+/).includes('alternate') + if (!url || !isAlternate || mime !== wanted) continue + if (/^https?:\/\//i.test(url)) { + urls.push(url) + continue + } + try { + urls.push(new URL(url, response.url).href) + } catch { + // Malformed reference: skip it instead of failing the whole lookup. + } + } + return urls +} + export function toTime(date: null | string | undefined): number | undefined { if (!date) return undefined let time = new Date(date).getTime() / 1000 diff --git a/core/test/loader/atom.test.ts b/core/test/loader/atom.test.ts index 1aca5efd..0d628ba9 100644 --- a/core/test/loader/atom.test.ts +++ b/core/test/loader/atom.test.ts @@ -134,11 +134,15 @@ test('finds atom links in elements', () => { `, { + headers: new Headers({ + Link: '; rel="alternate"; type="application/atom+xml"' + }), url: 'https://example.com/news' } ) ), [ + 'https://example.com/news/feed', 'https://example.com/news/atom', 'https://example.com/blog/feed.xml', 'https://example.com/something.atom', diff --git a/core/test/loader/json-feed.test.ts b/core/test/loader/json-feed.test.ts index b05a4194..3bed987d 100644 --- a/core/test/loader/json-feed.test.ts +++ b/core/test/loader/json-feed.test.ts @@ -55,11 +55,15 @@ test('detects links', () => { `, { + headers: new Headers({ + Link: '; rel="alternate"; type="application/feed+json"' + }), url: 'https://example.com/news/' } ) ), [ + 'https://example.com/news/json', 'https://example.com/a', 'https://example.com/news/b', 'https://example.com/c', diff --git a/core/test/loader/rss.test.ts b/core/test/loader/rss.test.ts index 4aeb9839..0df57f73 100644 --- a/core/test/loader/rss.test.ts +++ b/core/test/loader/rss.test.ts @@ -39,11 +39,15 @@ test('detects links', 
() => { `, { + headers: new Headers({ + Link: '; rel="alternate"; type="application/rss+xml"' + }), url: 'https://example.com/news/' } ) ), [ + 'https://example.com/news/rss', 'https://example.com/a', 'https://example.com/news/b', 'https://example.com/c', diff --git a/core/test/loader/utils.test.ts b/core/test/loader/utils.test.ts new file mode 100644 index 00000000..9ad61239 --- /dev/null +++ b/core/test/loader/utils.test.ts @@ -0,0 +1,51 @@ +import { deepStrictEqual } from 'node:assert' +import { test } from 'node:test' + +import { createTextResponse } from '../../download.js' +import { findHeaderLinks } from '../../loader/utils.js' + +test('returns urls from link http header', () => { + deepStrictEqual( + findHeaderLinks( + createTextResponse(``, { + headers: new Headers({ + Link: '<https://one.example.com>; rel="alternate"; type="application/rss+xml"' + + ', <https://two.example.com>; rel="alternate"; type="application/rss+xml"' + }), + url: 'https://example.com' + }), + 'application/rss+xml' + ), + ['https://one.example.com', 'https://two.example.com'] + ) +}) + +test('handles root-relative urls in http header', () => { + deepStrictEqual( + findHeaderLinks( + createTextResponse(``, { + headers: new Headers({ + Link: '</rss>; rel="alternate"; type="application/atom+xml"' + }), + url: 'https://example.com/blog' + }), + 'application/atom+xml' + ), + ['https://example.com/rss'] + ) +}) + +test('handles relative urls in http header', () => { + deepStrictEqual( + findHeaderLinks( + createTextResponse(``, { + headers: new Headers({ + Link: '<./rss>; rel="alternate"; type="application/atom+xml"' + }), + url: 'https://example.com/blog/' + }), + 'application/atom+xml' + ), + ['https://example.com/blog/rss'] + ) +})