Skip to content
This repository was archived by the owner on Jan 17, 2021. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/fancy-sitemap/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

maxCacheDays - Maximum age, in days, of a cached sitemap. Only applies when {path} is provided: if a sitemap.xml newer than this already exists on your filesystem, it is served instead of being regenerated.
3 changes: 3 additions & 0 deletions packages/fancy-sitemap/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package entry point: re-exports the implementation in ./lib.
// NOTE(review): mixes an ESM `import` with a CommonJS `module.exports`;
// this only works under a transpiler (e.g. Babel) — confirm the build setup.
import lib from './lib'

module.exports = lib
203 changes: 203 additions & 0 deletions packages/fancy-sitemap/lib/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import fs from 'fs';
import util from 'util';
import moment from 'moment';

const SitemapCrawler = require('sitemap-generator');
const Generator = require('sitemap')

// Valid <changefreq> values defined by the sitemaps.org protocol.
const CHANGE_FREQ_ALWAYS = 'always'
const CHANGE_FREQ_HOURLY = 'hourly'
const CHANGE_FREQ_DAILY = 'daily'
const CHANGE_FREQ_WEEKLY = 'weekly'
const CHANGE_FREQ_MONTHLY = 'monthly'
const CHANGE_FREQ_YEARLY = 'yearly'
const CHANGE_FREQ_NEVER = 'never'

// Fallbacks used when neither a matching rule nor the caller supplies a value.
const DEFAULT_CHANGE_FREQ = CHANGE_FREQ_MONTHLY
const DEFAULT_PRIORITY = 0.5

// Cache purge period (in ms) handed to the `sitemap` package (600 s).
const CACHE_TIME = 600000

/**
 * Gets all the URLs that a website has accessible by users.
 *
 * Crawls `options.hostname` with the sitemap-generator crawler and collects
 * every discovered URL (and any crawl errors) in memory instead of writing
 * an XML file.
 *
 * @param {Object} options - Must contain `hostname`; may contain
 *   `stripQuerystring` (boolean, defaults to true).
 * @returns {Promise<[string[], Error[]]>} Resolves with `[urls, errors]`
 *   once the crawl completes. Crawl errors are reported in the tuple and
 *   never reject the promise.
 */
const runCrawler = (options) => {
    const urls = [];
    const errors = [];

    const {hostname} = options;

    // NB: We use the SitemapCrawler generator to walk the whole site tree.
    // filepath must be null so it doesn't generate an xml file itself.
    const crawlerOptions = {
        // `!== false` (not `|| true`): the old `|| true` evaluated to true
        // unconditionally, silently ignoring an explicit
        // `stripQuerystring: false` from the caller.
        stripQuerystring: options.stripQuerystring !== false,
        filepath: null,
    }

    const crawler = SitemapCrawler(hostname, crawlerOptions);

    return new Promise((resolve) => {
        crawler.start();

        crawler.on('add', (url) => urls.push(url));
        crawler.on('error', (error) => errors.push(error));

        // The promise only settles on 'done'.
        crawler.on('done', () => resolve([urls, errors]));
    })
}

/**
 * Maps crawled URLs to sitemap entry objects.
 *
 * @param {Object} options - May contain `rules` (array of
 *   {path, changeFreq, priority}, where `path` is a regex source matched
 *   against each URL), `defaultPriority`, and `defaultChangeFreq`.
 * @param {string[]} generatedUrls - URLs discovered by the crawler.
 * @returns {Array<{url: string, changefreq: string, priority: number}>}
 */
const getXmlUrls = (options, generatedUrls = []) => {
    const {rules = [], defaultPriority, defaultChangeFreq} = options;

    return generatedUrls.map((url) => {
        // First rule whose `path` regex matches the URL wins.
        // (The previous `'g'` flag was pointless on a fresh regex per call.)
        const foundRule = rules.find(({path}) => RegExp(path).test(url)) || {}

        // `??` (not `||`) so explicitly-set falsy-but-valid values — in
        // particular a sitemap priority of 0 — are honored rather than
        // silently replaced by the defaults.
        return {
            url,
            changefreq: foundRule.changeFreq ?? defaultChangeFreq ?? DEFAULT_CHANGE_FREQ,
            priority: foundRule.priority ?? defaultPriority ?? DEFAULT_PRIORITY,
        }
    })
}

/**
 * Builds the sitemap XML document for the crawled URLs.
 *
 * @param {Object} options - Rule/default options forwarded to getXmlUrls.
 * @param {string[]} generatedUrls - URLs discovered by the crawler.
 * @returns {Promise<string>} Resolves with the serialized XML, rejects with
 *   the `sitemap` library error.
 */
const generateXmlSitemap = (options, generatedUrls = []) => {
    const sitemap = Generator.createSitemap({
        cacheTime: CACHE_TIME, // 600 sec - cache purge period
        urls: getXmlUrls(options, generatedUrls),
    })

    return new Promise((resolve, reject) => {
        sitemap.toXML((err, xml) => (
            err ? reject(err) : resolve(xml)
        ))
    })
}

/**
 * Writes the generated sitemap XML to disk.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} fileContent - Serialized sitemap XML.
 * @returns {Promise<void>} Resolves on success, rejects with the fs error.
 */
const createSitemapFile = (path, fileContent) => (
    // fs.promises replaces the hand-rolled callback-to-Promise wrapper.
    fs.promises.writeFile(path, fileContent)
)

/**
 * Decides whether a previously generated sitemap file is fresh enough to
 * serve from cache.
 *
 * @param {string} path - Location of the cached sitemap file.
 * @param {number} maxCacheDays - Maximum age in whole days.
 * @returns {Promise<boolean>} Resolves true when the file exists and was
 *   modified fewer than `maxCacheDays` whole days ago; resolves false when
 *   the file is missing/unreadable or too old. Never rejects.
 */
const shouldReturnCachedFile = (path, maxCacheDays) => (
    new Promise((resolve) => (
        fs.stat(path, (err, stats) => {
            if (err) {
                // A missing or unreadable file simply means "no usable cache".
                return resolve(false)
            }

            // Whole days elapsed since the file was last modified.
            // (Replaces the old moment/util.inspect round-trip, which
            // re-parsed the mtime through its inspected string form.)
            const msPerDay = 24 * 60 * 60 * 1000
            const elapsedDays = Math.trunc((Date.now() - stats.mtime.getTime()) / msPerDay)

            return resolve(elapsedDays < maxCacheDays)
        })
    ))
)

/**
 * Reads a previously generated sitemap file from disk.
 *
 * @param {string} path - Location of the sitemap file.
 * @returns {Promise<Buffer>} Resolves with the raw file contents (a Buffer;
 *   no encoding is applied, matching the previous behavior), rejects with
 *   the fs error.
 */
const getFileXmlSitemap = (path) => (
    // fs.promises replaces the hand-rolled callback-to-Promise wrapper.
    fs.promises.readFile(path)
)

/**
 * Entry point: resolves with the sitemap XML for `options.hostname`.
 *
 * When `options.path` and a positive `options.maxCacheDays` are given, a
 * sufficiently fresh cached file is served instead of re-crawling; otherwise
 * the site is crawled, the XML is generated, and (when `path` is set) the
 * result is written back to disk.
 *
 * @param {Object} options - {hostname, path, maxCacheDays, stripQuerystring,
 *   rules, defaultPriority, defaultChangeFreq}.
 * @returns {Promise<string|Buffer>} The sitemap XML (a Buffer when served
 *   from the cached file, a string when freshly generated).
 */
const start = async (options) => {
    // TODO: Check validity of the options object — e.g. with a JSON-schema
    // validator such as Joi (https://github.com/hapijs/joi).
    // Example: hostname is provided and required!
    // TODO: Also check that user default values are valid:
    //   - priority should be [0-1]
    //   - changeFreq should be one of the constants above
    //   otherwise fall back to DEFAULT_CHANGE_FREQ and DEFAULT_PRIORITY.
    const {maxCacheDays, path} = options;

    // Serve the cached sitemap file when one exists and is recent enough.
    if (maxCacheDays && maxCacheDays > 0) {
        try {
            const hasCachedFile = await shouldReturnCachedFile(path, maxCacheDays)

            if (hasCachedFile) {
                return await getFileXmlSitemap(path)
            }
            // Cache miss: fall through and generate the file again.
        } catch (error) {
            // Best-effort cache read: on any failure, regenerate the sitemap.
            console.log('Trying to get file from cached failed...')
            console.log('error', error)
        }
    }

    // Crawl the site to discover every reachable URL. A crawler failure
    // rejects the returned promise, as before.
    // TODO: Do something with the crawl errors???
    const [urls] = await runCrawler(options)

    try {
        const generatedXmlSitemap = await generateXmlSitemap(options, urls);

        if (path) {
            // TODO(review): consider writing the cache file without awaiting
            // it, so a slow/failed write doesn't delay the response.
            await createSitemapFile(path, generatedXmlSitemap);
        }

        return generatedXmlSitemap;
    } catch (error) {
        console.log(error)
        throw error;
    }
}

const SitemapGenerator = {
start,
}

module.exports = SitemapGenerator
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are the tests at? 😂

23 changes: 23 additions & 0 deletions packages/fancy-sitemap/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"name": "fancy-sitemap",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should include a main, that way it doesn't have to guess where the entry file is.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"main": "./lib"

"description": "PUT A FANCY DESCRIPTION",
"homepage": "http://www.github.com/ferreiro/fancy-sitemap",
"version": "0.0.9",
"private": false,
"license": "MIT",
"keywords": [],
"author": {
"name": "Jorge Ferreiro",
"email": "jorge@ferreiro.me",
"url": "https://www.ferreiro.me/"
},
"scripts": {
"start": ""
},
"dependencies": {
"lodash": "4.17.11",
"sitemap": "2.1.0",
"sitemap-generator": "8.3.3"
},
"devDependencies": {}
}
1 change: 1 addition & 0 deletions packages/ferreiro-server/env/development.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module.exports = {
ADMIN_EMAIL: 'admin',
ADMIN_PASS: 'admin',
MAILCHIMP_API_TOKEN: process.env.MAILCHIMP_API_TOKEN,
MAX_CACHE_DAYS_SITEMAP: 0,

// AMAZON WEB SERVICES
S3_REGION: process.env.S3_REGION,
Expand Down
1 change: 1 addition & 0 deletions packages/ferreiro-server/env/production.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module.exports = {
ADMIN_EMAIL: process.env.ADMIN_EMAIL,
ADMIN_PASS: process.env.ADMIN_PASS,
MAILCHIMP_API_TOKEN: process.env.MAILCHIMP_API_TOKEN,
MAX_CACHE_DAYS_SITEMAP: process.env.MAX_CACHE_DAYS_SITEMAP,

// AMAZON WEB SERVICES
S3_REGION: process.env.S3_REGION,
Expand Down
2 changes: 2 additions & 0 deletions packages/ferreiro-server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"cookie-parser": "^1.4.4",
"debug": "^3.1.0",
"express": "^4.16.1",
"express-rate-limit": "3.5.0",
"express-recaptcha": "^3.0.0",
"express-session": "^1.15.6",
"force-ssl-heroku": "^1.0.2",
Expand All @@ -56,6 +57,7 @@
"lodash": "^4.17.11",
"mailchimp-api-v3": "^1.7.1",
"marked": "^0.6.1",
"moment": "2.24.0",
"mongoose": "^4.11.13",
"mongoose-paginate": "^5.0.3",
"mongoose-permalink": "^2.0.0",
Expand Down
Binary file added packages/ferreiro-server/pene.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 57 additions & 3 deletions packages/ferreiro-server/setup/setupMiddlewares.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ const helmet = require('helmet')
const session = require('express-session')
const express = require('express')
const compression = require('compression')
const rateLimit = require('express-rate-limit');

// Throttle the crawler-facing endpoints (robots.txt / sitemap.xml) so that
// sitemap regeneration can't be triggered in a tight loop by a single client.
const crawlersRateLimit = rateLimit({
    windowMs: 15 * 60 * 1000, // 15 minutes
    max: 15 // limit each IP to 15 requests per windowMs
});

// TODO: Move to its own repository and deploy...
// NOTE(review): this lives in the same workspace and is declared as a
// package — prefer requiring it by package name ('fancy-sitemap') and
// letting workspace resolution find it, instead of this relative path.
const sitemapGenerator = require('../../fancy-sitemap/index')

const env = require('../env')

Expand All @@ -18,11 +27,56 @@ module.exports = (app) => {
app.use(compression())

// Search engines
// Serve robots.txt for search engines (rate-limited).
app.get('/robots.txt', crawlersRateLimit, (req, res) => {
    // path.join takes separate segments; the old `__dirname + '/...'`
    // concatenation bypassed the join entirely.
    res.sendFile(path.join(__dirname, '..', 'robots.txt'))
})
// Serve sitemap.xml, generated on demand (rate-limited).
// TODO(review): consider generating the sitemap at build time, or from a
// webhook fired on post create/delete, instead of on each request window.
app.get('/sitemap.xml', crawlersRateLimit, (req, res) => {
    // TODO: Move the sitemap configuration to its own file...
    sitemapGenerator
        .start({
            hostname: process.env.NODE_ENV === 'DEV' ? 'localhost:3000' : 'https://www.ferreiro.me',
            // Where the generated sitemap is cached on disk.
            path: 'sitemap.xml',
            maxCacheDays: env.MAX_CACHE_DAYS_SITEMAP || 0,
            stripQuerystring: true,
            // NOTE(review): `filepath` appears unused by fancy-sitemap (it
            // hardcodes filepath: null for the crawler); `path` above is what
            // controls the output file — confirm and remove if so.
            filepath: 'sitemap.xml',
            defaultPriority: 0.8,
            defaultChangeFreq: 'yearly',
            // Each rule's `path` is a regex matched against crawled URLs;
            // the first matching rule supplies changeFreq/priority.
            rules: [
                {
                    path: '/$',
                    changeFreq: 'monthly',
                    priority: 1,
                },
                {
                    path: '/about$',
                    changeFreq: 'monthly',
                    priority: 1,
                },
                {
                    path: '/blog$',
                    changeFreq: 'weekly',
                    priority: 1,
                },
                {
                    path: '/blog/*',
                    changeFreq: 'monthly',
                    priority: 0.9,
                },
                {
                    path: '/talks$',
                    changeFreq: 'monthly',
                    priority: 1,
                },
            ],
        })
        .then((sitemap) => {
            res.header('Content-Type', 'application/xml');
            return res.send(sitemap)
        })
        .catch((error) => (
            res.status(500).send(error).end()
        ))
})

// Serve static bower: http://goo.gl/e2nTBf
Expand Down
Loading