Skip to content
This repository was archived by the owner on Jan 17, 2021. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/fancy-sitemap/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

maxCacheDays - Maximum age, in days, of a cached sitemap. Only applies when {path} is provided: if a sitemap.xml newer than this already exists on your filesystem, it is served instead of being regenerated.
3 changes: 3 additions & 0 deletions packages/fancy-sitemap/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package entry point: re-exports the implementation in ./lib.
// NOTE(review): mixes an ESM `import` with a CommonJS `module.exports`;
// this only works under a transpiler (e.g. Babel) — confirm the build setup.
import lib from './lib'

module.exports = lib
203 changes: 203 additions & 0 deletions packages/fancy-sitemap/lib/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import fs from 'fs';
import util from 'util';
import moment from 'moment';

const SitemapCrawler = require('sitemap-generator');
const Generator = require('sitemap')

// Valid <changefreq> values defined by the sitemaps.org protocol.
const CHANGE_FREQ_ALWAYS = 'always'
const CHANGE_FREQ_HOURLY = 'hourly'
const CHANGE_FREQ_DAILY = 'daily'
const CHANGE_FREQ_WEEKLY = 'weekly'
const CHANGE_FREQ_MONTHLY = 'monthly'
const CHANGE_FREQ_YEARLY = 'yearly'
const CHANGE_FREQ_NEVER = 'never'

// Fallbacks used when neither a matching rule nor the caller supplies a value.
const DEFAULT_CHANGE_FREQ = CHANGE_FREQ_MONTHLY
const DEFAULT_PRIORITY = 0.5

// Cache purge period (in ms) handed to the `sitemap` package (600 s).
const CACHE_TIME = 600000

/**
 * Gets all the URLs that a website has accessible by users.
 *
 * Crawls `options.hostname` with the sitemap-generator crawler and collects
 * every discovered URL (and any crawl errors) in memory instead of writing
 * an XML file.
 *
 * @param {Object} options - Must contain `hostname`; may contain
 *   `stripQuerystring` (boolean, defaults to true).
 * @returns {Promise<[string[], Error[]]>} Resolves with `[urls, errors]`
 *   once the crawl completes. Crawl errors are reported in the tuple and
 *   never reject the promise.
 */
const runCrawler = (options) => {
    const urls = [];
    const errors = [];

    const {hostname} = options;

    // NB: We use the SitemapCrawler generator to walk the whole site tree.
    // filepath must be null so it doesn't generate an xml file itself.
    const crawlerOptions = {
        // `!== false` (not `|| true`): the old `|| true` evaluated to true
        // unconditionally, silently ignoring an explicit
        // `stripQuerystring: false` from the caller.
        stripQuerystring: options.stripQuerystring !== false,
        filepath: null,
    }

    const crawler = SitemapCrawler(hostname, crawlerOptions);

    return new Promise((resolve) => {
        crawler.start();

        crawler.on('add', (url) => urls.push(url));
        crawler.on('error', (error) => errors.push(error));

        // The promise only settles on 'done'.
        crawler.on('done', () => resolve([urls, errors]));
    })
}

/**
 * Maps crawled URLs to sitemap entry objects.
 *
 * @param {Object} options - May contain `rules` (array of
 *   {path, changeFreq, priority}, where `path` is a regex source matched
 *   against each URL), `defaultPriority`, and `defaultChangeFreq`.
 * @param {string[]} generatedUrls - URLs discovered by the crawler.
 * @returns {Array<{url: string, changefreq: string, priority: number}>}
 */
const getXmlUrls = (options, generatedUrls = []) => {
    const {rules = [], defaultPriority, defaultChangeFreq} = options;

    return generatedUrls.map((url) => {
        // First rule whose `path` regex matches the URL wins.
        // (The previous `'g'` flag was pointless on a fresh regex per call.)
        const foundRule = rules.find(({path}) => RegExp(path).test(url)) || {}

        // `??` (not `||`) so explicitly-set falsy-but-valid values — in
        // particular a sitemap priority of 0 — are honored rather than
        // silently replaced by the defaults.
        return {
            url,
            changefreq: foundRule.changeFreq ?? defaultChangeFreq ?? DEFAULT_CHANGE_FREQ,
            priority: foundRule.priority ?? defaultPriority ?? DEFAULT_PRIORITY,
        }
    })
}

/**
 * Builds the sitemap XML document for the crawled URLs.
 *
 * @param {Object} options - Rule/default options forwarded to getXmlUrls.
 * @param {string[]} generatedUrls - URLs discovered by the crawler.
 * @returns {Promise<string>} Resolves with the serialized XML, rejects with
 *   the `sitemap` library error.
 */
const generateXmlSitemap = (options, generatedUrls = []) => {
    const sitemap = Generator.createSitemap({
        cacheTime: CACHE_TIME, // 600 sec - cache purge period
        urls: getXmlUrls(options, generatedUrls),
    })

    return new Promise((resolve, reject) => {
        sitemap.toXML((err, xml) => (
            err ? reject(err) : resolve(xml)
        ))
    })
}

/**
 * Writes the generated sitemap XML to disk.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} fileContent - Serialized sitemap XML.
 * @returns {Promise<void>} Resolves on success, rejects with the fs error.
 */
const createSitemapFile = (path, fileContent) => (
    // fs.promises replaces the hand-rolled callback-to-Promise wrapper.
    fs.promises.writeFile(path, fileContent)
)

/**
 * Decides whether a previously generated sitemap file is fresh enough to
 * serve from cache.
 *
 * @param {string} path - Location of the cached sitemap file.
 * @param {number} maxCacheDays - Maximum age in whole days.
 * @returns {Promise<boolean>} Resolves true when the file exists and was
 *   modified fewer than `maxCacheDays` whole days ago; resolves false when
 *   the file is missing/unreadable or too old. Never rejects.
 */
const shouldReturnCachedFile = (path, maxCacheDays) => (
    new Promise((resolve) => (
        fs.stat(path, (err, stats) => {
            if (err) {
                // A missing or unreadable file simply means "no usable cache".
                return resolve(false)
            }

            // Whole days elapsed since the file was last modified.
            // (Replaces the old moment/util.inspect round-trip, which
            // re-parsed the mtime through its inspected string form.)
            const msPerDay = 24 * 60 * 60 * 1000
            const elapsedDays = Math.trunc((Date.now() - stats.mtime.getTime()) / msPerDay)

            return resolve(elapsedDays < maxCacheDays)
        })
    ))
)

/**
 * Reads a previously generated sitemap file from disk.
 *
 * @param {string} path - Location of the sitemap file.
 * @returns {Promise<Buffer>} Resolves with the raw file contents (a Buffer;
 *   no encoding is applied, matching the previous behavior), rejects with
 *   the fs error.
 */
const getFileXmlSitemap = (path) => (
    // fs.promises replaces the hand-rolled callback-to-Promise wrapper.
    fs.promises.readFile(path)
)

/**
 * Entry point: resolves with the sitemap XML for `options.hostname`.
 *
 * When `options.path` and a positive `options.maxCacheDays` are given, a
 * sufficiently fresh cached file is served instead of re-crawling; otherwise
 * the site is crawled, the XML is generated, and (when `path` is set) the
 * result is written back to disk.
 *
 * @param {Object} options - {hostname, path, maxCacheDays, stripQuerystring,
 *   rules, defaultPriority, defaultChangeFreq}.
 * @returns {Promise<string|Buffer>} The sitemap XML (a Buffer when served
 *   from the cached file, a string when freshly generated).
 */
const start = async (options) => {
    // TODO: Check validity of the options object — e.g. with a JSON-schema
    // validator such as Joi (https://github.com/hapijs/joi).
    // Example: hostname is provided and required!
    // TODO: Also check that user default values are valid:
    //   - priority should be [0-1]
    //   - changeFreq should be one of the constants above
    //   otherwise fall back to DEFAULT_CHANGE_FREQ and DEFAULT_PRIORITY.
    const {maxCacheDays, path} = options;

    // Serve the cached sitemap file when one exists and is recent enough.
    if (maxCacheDays && maxCacheDays > 0) {
        try {
            const hasCachedFile = await shouldReturnCachedFile(path, maxCacheDays)

            if (hasCachedFile) {
                return await getFileXmlSitemap(path)
            }
            // Cache miss: fall through and generate the file again.
        } catch (error) {
            // Best-effort cache read: on any failure, regenerate the sitemap.
            console.log('Trying to get file from cached failed...')
            console.log('error', error)
        }
    }

    // Crawl the site to discover every reachable URL. A crawler failure
    // rejects the returned promise, as before.
    // TODO: Do something with the crawl errors???
    const [urls] = await runCrawler(options)

    try {
        const generatedXmlSitemap = await generateXmlSitemap(options, urls);

        if (path) {
            // TODO(review): consider writing the cache file without awaiting
            // it, so a slow/failed write doesn't delay the response.
            await createSitemapFile(path, generatedXmlSitemap);
        }

        return generatedXmlSitemap;
    } catch (error) {
        console.log(error)
        throw error;
    }
}

const SitemapGenerator = {
start,
}

module.exports = SitemapGenerator
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are the tests at? 😂

23 changes: 23 additions & 0 deletions packages/fancy-sitemap/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"name": "fancy-sitemap",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should include a main, that way it doesn't have to guess where the entry file is.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"main": "./lib"

"description": "PUT A FANCY DESCRIPTION",
"homepage": "http://www.github.com/ferreiro/fancy-sitemap",
"version": "0.0.9",
"private": false,
"license": "MIT",
"keywords": [],
"author": {
"name": "Jorge Ferreiro",
"email": "jorge@ferreiro.me",
"url": "https://www.ferreiro.me/"
},
"scripts": {
"start": ""
},
"dependencies": {
"lodash": "4.17.11",
"sitemap": "2.1.0",
"sitemap-generator": "8.3.3"
},
"devDependencies": {}
}
1 change: 1 addition & 0 deletions packages/ferreiro-server/env/development.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module.exports = {
ADMIN_EMAIL: 'admin',
ADMIN_PASS: 'admin',
MAILCHIMP_API_TOKEN: process.env.MAILCHIMP_API_TOKEN,
MAX_CACHE_DAYS_SITEMAP: 0,

// AMAZON WEB SERVICES
S3_REGION: process.env.S3_REGION,
Expand Down
1 change: 1 addition & 0 deletions packages/ferreiro-server/env/production.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module.exports = {
ADMIN_EMAIL: process.env.ADMIN_EMAIL,
ADMIN_PASS: process.env.ADMIN_PASS,
MAILCHIMP_API_TOKEN: process.env.MAILCHIMP_API_TOKEN,
MAX_CACHE_DAYS_SITEMAP: process.env.MAX_CACHE_DAYS_SITEMAP,

// AMAZON WEB SERVICES
S3_REGION: process.env.S3_REGION,
Expand Down
2 changes: 2 additions & 0 deletions packages/ferreiro-server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"cookie-parser": "^1.4.4",
"debug": "^3.1.0",
"express": "^4.16.1",
"express-rate-limit": "3.5.0",
"express-recaptcha": "^3.0.0",
"express-session": "^1.15.6",
"force-ssl-heroku": "^1.0.2",
Expand All @@ -56,6 +57,7 @@
"lodash": "^4.17.11",
"mailchimp-api-v3": "^1.7.1",
"marked": "^0.6.1",
"moment": "2.24.0",
"mongoose": "^4.11.13",
"mongoose-paginate": "^5.0.3",
"mongoose-permalink": "^2.0.0",
Expand Down
Binary file added packages/ferreiro-server/pene.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 57 additions & 3 deletions packages/ferreiro-server/setup/setupMiddlewares.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ const helmet = require('helmet')
const session = require('express-session')
const express = require('express')
const compression = require('compression')
const rateLimit = require('express-rate-limit');

// Throttle the crawler-facing endpoints (robots.txt / sitemap.xml) so that
// sitemap regeneration can't be triggered in a tight loop by a single client.
const crawlersRateLimit = rateLimit({
    windowMs: 15 * 60 * 1000, // 15 minutes
    max: 15 // limit each IP to 15 requests per windowMs
});

// TODO: Move to its own repository and deploy...
// NOTE(review): this lives in the same workspace and is declared as a
// package — prefer requiring it by package name ('fancy-sitemap') and
// letting workspace resolution find it, instead of this relative path.
const sitemapGenerator = require('../../fancy-sitemap/index')

const env = require('../env')

Expand All @@ -18,11 +27,56 @@ module.exports = (app) => {
app.use(compression())

// Search engines
// Serve robots.txt for search engines (rate-limited).
app.get('/robots.txt', crawlersRateLimit, (req, res) => {
    // path.join takes separate segments; the old `__dirname + '/...'`
    // concatenation bypassed the join entirely.
    res.sendFile(path.join(__dirname, '..', 'robots.txt'))
})
// Serve sitemap.xml, generated on demand (rate-limited).
// TODO(review): consider generating the sitemap at build time, or from a
// webhook fired on post create/delete, instead of on each request window.
app.get('/sitemap.xml', crawlersRateLimit, (req, res) => {
    // TODO: Move the sitemap configuration to its own file...
    sitemapGenerator
        .start({
            hostname: process.env.NODE_ENV === 'DEV' ? 'localhost:3000' : 'https://www.ferreiro.me',
            // Where the generated sitemap is cached on disk.
            path: 'sitemap.xml',
            maxCacheDays: env.MAX_CACHE_DAYS_SITEMAP || 0,
            stripQuerystring: true,
            // NOTE(review): `filepath` appears unused by fancy-sitemap (it
            // hardcodes filepath: null for the crawler); `path` above is what
            // controls the output file — confirm and remove if so.
            filepath: 'sitemap.xml',
            defaultPriority: 0.8,
            defaultChangeFreq: 'yearly',
            // Each rule's `path` is a regex matched against crawled URLs;
            // the first matching rule supplies changeFreq/priority.
            rules: [
                {
                    path: '/$',
                    changeFreq: 'monthly',
                    priority: 1,
                },
                {
                    path: '/about$',
                    changeFreq: 'monthly',
                    priority: 1,
                },
                {
                    path: '/blog$',
                    changeFreq: 'weekly',
                    priority: 1,
                },
                {
                    path: '/blog/*',
                    changeFreq: 'monthly',
                    priority: 0.9,
                },
                {
                    path: '/talks$',
                    changeFreq: 'monthly',
                    priority: 1,
                },
            ],
        })
        .then((sitemap) => {
            res.header('Content-Type', 'application/xml');
            return res.send(sitemap)
        })
        .catch((error) => (
            res.status(500).send(error).end()
        ))
})

// Serve static bower: http://goo.gl/e2nTBf
Expand Down
Loading