From abf37d265f9ad4b0e84cd7f6cda9212eeefad47d Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Mon, 13 Apr 2020 06:05:30 -0500 Subject: [PATCH 01/15] prevent creation of package-lock.json and ignore it just in case --- .gitignore | 1 + .npmrc | 1 + 2 files changed, 2 insertions(+) create mode 100644 .npmrc diff --git a/.gitignore b/.gitignore index f10ef49..1c0471d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ pids # Dependency directory # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git node_modules +package-lock.json diff --git a/.npmrc b/.npmrc new file mode 100644 index 0000000..43c97e7 --- /dev/null +++ b/.npmrc @@ -0,0 +1 @@ +package-lock=false From 139e41364837c311d1c012432d5788ce92eaee28 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Mon, 13 Apr 2020 06:33:50 -0500 Subject: [PATCH 02/15] updated packages to latest, except mocha. mocha > 6.0 require setup changes. deferring for now --- lib/htmlPlaner.js | 57 ++++++++++++++---- lib/planer.js | 145 ++++++++++++++++++++++++++++++++++++---------- lib/regexes.js | 4 +- package.json | 8 +-- test/mocha.opts | 2 +- 5 files changed, 167 insertions(+), 49 deletions(-) diff --git a/lib/htmlPlaner.js b/lib/htmlPlaner.js index 41b7425..5a59ff7 100644 --- a/lib/htmlPlaner.js +++ b/lib/htmlPlaner.js @@ -1,4 +1,4 @@ -// Generated by CoffeeScript 1.12.7 +// Generated by CoffeeScript 2.5.1 (function() { var BREAK_TAG_REGEX, CHECKPOINT_PREFIX, CHECKPOINT_SUFFIX, DOCUMENT_POSITION_FOLLOWING, DOCUMENT_POSITION_PRECEDING, OUTLOOK_SPLITTER_QUERY_SELECTORS, OUTLOOK_SPLITTER_QUOTE_IDS, OUTLOOK_XPATH_SPLITTER_QUERIES, QUOTE_IDS, compareByDomPosition, elementIsAllContent, ensureTextNodeBetweenChildElements, findMicrosoftSplitter, findOutlookSplitterWithQuerySelector, findOutlookSplitterWithQuoteId, findOutlookSplitterWithXpathQuery, findParentDiv, hasTagName, isTextNodeWrappedInSpan, removeNodes; @@ -6,36 +6,45 @@ CHECKPOINT_SUFFIX = '!%!#'; - exports.CHECKPOINT_PATTERN = new 
RegExp(CHECKPOINT_PREFIX + "\\d+" + CHECKPOINT_SUFFIX, 'g'); + exports.CHECKPOINT_PATTERN = new RegExp(`${CHECKPOINT_PREFIX}\\d+${CHECKPOINT_SUFFIX}`, 'g'); + // HTML quote indicators (tag ids) QUOTE_IDS = ['OLK_SRC_BODY_SECTION']; + // Create an instance of Document using the message html and the injected base document exports.createEmailDocument = function(msgBody, dom) { var emailBodyElement, emailDocument, head, htmlElement; emailDocument = dom.implementation.createHTMLDocument(); - htmlElement = emailDocument.getElementsByTagName('html')[0]; + // Write html of email to `html` element + [htmlElement] = emailDocument.getElementsByTagName('html'); htmlElement.innerHTML = msgBody.trim(); if (emailDocument.body == null) { - emailBodyElement = emailDocument.getElementsByTagName('body')[0]; + [emailBodyElement] = emailDocument.getElementsByTagName('body'); emailDocument.body = emailBodyElement; } - head = emailDocument.getElementsByTagName('head')[0]; + // Remove 'head' element from document + [head] = emailDocument.getElementsByTagName('head'); if (head) { emailDocument.documentElement.removeChild(head); } return emailDocument; }; + // Recursively adds checkpoints to html tree. 
exports.addCheckpoints = function(htmlNode, counter) { var childNode, i, len, ref; + // 3 is a text node if (htmlNode.nodeType === 3) { - htmlNode.nodeValue = "" + (htmlNode.nodeValue.trim()) + CHECKPOINT_PREFIX + counter + CHECKPOINT_SUFFIX + "\n"; + htmlNode.nodeValue = `${htmlNode.nodeValue.trim()}${CHECKPOINT_PREFIX}${counter}${CHECKPOINT_SUFFIX}\n`; counter++; } + // 1 is an element if (htmlNode.nodeType === 1) { if (!hasTagName(htmlNode, 'body')) { - htmlNode.innerHTML = " " + htmlNode.innerHTML + " "; + // Pad with spacing to ensure there are text nodes at the begining and end of non-body elements + htmlNode.innerHTML = ` ${htmlNode.innerHTML} `; } + // Ensure that there are text nodes between sibling elements ensureTextNodeBetweenChildElements(htmlNode); ref = htmlNode.childNodes; for (i = 0, len = ref.length; i < len; i++) { @@ -47,8 +56,9 @@ }; exports.deleteQuotationTags = function(htmlNode, counter, quotationCheckpoints) { - var childNode, childTagInQuotation, i, j, len, len1, quotationChildren, ref, ref1, tagInQuotation; + var childNode, childTagInQuotation, i, j, len, len1, quotationChildren, ref, tagInQuotation; tagInQuotation = true; + // 3 is a text node if (htmlNode.nodeType === 3) { if (!quotationCheckpoints[counter]) { tagInQuotation = false; @@ -56,26 +66,33 @@ counter++; return [counter, tagInQuotation]; } + // 1 is an element if (htmlNode.nodeType === 1) { + // Collect child nodes that are marked as in the quotation childTagInQuotation = false; quotationChildren = []; if (!hasTagName(htmlNode, 'body')) { - htmlNode.innerHTML = " " + htmlNode.innerHTML + " "; + // Pad with spacing to ensure there are text nodes at the begining and end of non-body elements + htmlNode.innerHTML = ` ${htmlNode.innerHTML} `; } + // Ensure that there are text nodes between sibling elements ensureTextNodeBetweenChildElements(htmlNode); ref = htmlNode.childNodes; for (i = 0, len = ref.length; i < len; i++) { childNode = ref[i]; - ref1 = 
exports.deleteQuotationTags(childNode, counter, quotationCheckpoints), counter = ref1[0], childTagInQuotation = ref1[1]; + [counter, childTagInQuotation] = exports.deleteQuotationTags(childNode, counter, quotationCheckpoints); + // Keep tracking if all children are in the quotation tagInQuotation = tagInQuotation && childTagInQuotation; if (childTagInQuotation) { quotationChildren.push(childNode); } } } + // If all of an element's children are part of a quotation, let parent delete whole element if (tagInQuotation) { return [counter, tagInQuotation]; } else { +// Otherwise, delete specific quotation children for (j = 0, len1 = quotationChildren.length; j < len1; j++) { childNode = quotationChildren[j]; htmlNode.removeChild(childNode); @@ -110,6 +127,7 @@ return true; }; + // Remove the last non-nested blockquote element exports.cutBlockQuote = function(emailDocument) { var blockquoteElement, div, parent, xpathQuery, xpathResult; xpathQuery = '(.//blockquote)[not(ancestor::blockquote)][last()]'; @@ -140,33 +158,42 @@ exports.cutFromBlock = function(emailDocument) { var afterSplitter, fromBlock, lastBlock, parentDiv, ref, splitterElement, textNode, xpathQuery, xpathResult; + // Handle case where From: block is enclosed in a tag xpathQuery = "//*[starts-with(normalize-space(.), 'From:')]|//*[starts-with(normalize-space(.), 'Date:')]"; xpathResult = emailDocument.evaluate(xpathQuery, emailDocument, null, 5, null); + // Find last element in iterator while (fromBlock = xpathResult.iterateNext()) { lastBlock = fromBlock; } if (lastBlock != null) { + // Find parent div and remove from document parentDiv = findParentDiv(lastBlock); if ((parentDiv != null) && !elementIsAllContent(parentDiv)) { parentDiv.parentElement.removeChild(parentDiv); return true; } } + // Handle the case when From: block goes right after e.g.
and is not enclosed in a tag itself xpathQuery = "//text()[starts-with(normalize-space(.), 'From:')]|//text()[starts-with(normalize-space(.), 'Date:')]"; xpathResult = emailDocument.evaluate(xpathQuery, emailDocument, null, 9, null); + // The text node that is the result textNode = xpathResult.singleNodeValue; if (textNode == null) { return false; } if (isTextNodeWrappedInSpan(textNode)) { + // The text node is wrapped in a span element. All sorts formatting could be happening here. + // Return false and hope plain text algorithm can figure it out. return false; } + // The previous sibling stopped the initial xpath query from working, so it is likely a splitter (like an hr) splitterElement = textNode.previousSibling; if (splitterElement != null) { if ((ref = splitterElement.parentElement) != null) { ref.removeChild(splitterElement); } } + // Remove all subsequent siblings of the textNode afterSplitter = textNode.nextSibling; while (afterSplitter != null) { afterSplitter.parentNode.removeChild(afterSplitter); @@ -207,17 +234,22 @@ return emailDocument.body.innerHTML = currentHtml.replace(BREAK_TAG_REGEX, "\n"); }; + // Queries to find a splitter that's the only child of a single parent div + // Usually represents the dividing line between messages in the Outlook html OUTLOOK_SPLITTER_QUERY_SELECTORS = { outlook2007: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']", outlookForAndroid: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']", windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" }; + // More complicated Xpath queries for versions of Outlook that don't use the dividing lines OUTLOOK_XPATH_SPLITTER_QUERIES = { outlook2003: "//div/div[@class='MsoNormal' and @align='center' and @style='text-align:center']/font/span/hr[@size='3' and @width='100%' and @align='center' and @tabindex='-1']" }; + // For more modern versions 
of Outlook that contain replies in quote block with an id OUTLOOK_SPLITTER_QUOTE_IDS = { + // There's potentially multiple elements with this id so we need to cut everything after this quote as well office365: '#divRplyFwdMsg' }; @@ -245,6 +277,7 @@ if (!possibleSplitterElements.length) { return null; } + // Find the earliest splitter in the DOM to remove everything after it return possibleSplitterElements.sort(compareByDomPosition)[0]; }; @@ -267,6 +300,7 @@ var splitterElement, xpathResult; xpathResult = emailDocument.evaluate(xpathQuery, emailDocument, null, 9, null); splitterElement = xpathResult.singleNodeValue; + // Go up the tree to find the enclosing div. if (splitterElement != null) { splitterElement = splitterElement.parentElement.parentElement; splitterElement = splitterElement.parentElement.parentElement; @@ -299,7 +333,7 @@ removeNodes = function(nodesArray) { var i, index, node, ref, ref1, results; results = []; - for (index = i = ref = nodesArray.length - 1; ref <= 0 ? i <= 0 : i >= 0; index = ref <= 0 ? ++i : --i) { + for (index = i = ref = nodesArray.length - 1; (ref <= 0 ? i <= 0 : i >= 0); index = ref <= 0 ? ++i : --i) { node = nodesArray[index]; results.push(node != null ? (ref1 = node.parentNode) != null ? 
ref1.removeChild(node) : void 0 : void 0); } @@ -317,6 +351,7 @@ } results = []; while (currentNode.nextSibling) { + // An element is followed by an element if (currentNode.nodeType === 1 && currentNode.nextSibling.nodeType === 1) { newTextNode = dom.createTextNode(' '); element.insertBefore(newTextNode, currentNode.nextSibling); diff --git a/lib/planer.js b/lib/planer.js index 7dd0272..724e72e 100644 --- a/lib/planer.js +++ b/lib/planer.js @@ -1,4 +1,4 @@ -// Generated by CoffeeScript 1.12.7 +// Generated by CoffeeScript 2.5.1 (function() { var CONTENT_CHUNK_SIZE, MAX_LINES_COUNT, MAX_LINE_LENGTH, REGEXES, SPLITTER_MAX_LINES, _CRLF_to_LF, _restore_CRLF, getDelimiter, htmlPlaner, isSplitter, postprocess, preprocess, setReturnFlags; @@ -12,13 +12,15 @@ MAX_LINE_LENGTH = 200000; - exports.extractFrom = function(msgBody, contentType, dom) { - if (contentType == null) { - contentType = 'text/plain'; - } - if (dom == null) { - dom = null; - } + // Extract actual message from email. + + // Will use provided `contentType` to decide which algorithm to use (plain text or html). + + // @param msgBody [String] the html content of the email + // @param contentType [String] the contentType of the email. Only `text/plain` and `text/html` are supported. + // @param dom [Document] the document object to use for html parsing. + // @return [String] the text/html of the actual message without quotations + exports.extractFrom = function(msgBody, contentType = 'text/plain', dom = null) { if (contentType === 'text/plain') { return exports.extractFromPlain(msgBody); } else if (contentType === 'text/html') { @@ -29,6 +31,17 @@ return msgBody; }; + // Extract actual message from provided textual email. 
+ + // Store delimiter used by the email (\n or \r\n), + // split the email into lines, + // use regexes to mark each line as either part of the message or quotation, + // remove lines that are part of the quotation, + // put message back together using the saved delimeter, + // remove changes made by algorithm. + + // @param msgBody [String] the html content of the email + // @return [String] the text of the message without quotations exports.extractFromPlain = function(msgBody) { var delimiter, lines, markers; delimiter = getDelimiter(msgBody); @@ -41,8 +54,29 @@ return msgBody; }; + // Extract actual message from provided html message body + // using tags and plain text algorithm. + + // Cut out the 'blockquote', 'gmail_quote' tags. + // Cut out Microsoft (Outlook, Windows mail) quotations. + + // Then use plain text algorithm to cut out splitter or + // leftover quotation. + // This works by adding checkpoint text to all html tags, + // then converting html to text, + // then extracting quotations from text, + // then checking deleted checkpoints, + // then deleting necessary tags. + + // Will use the document provided to create a new document using: + // Document.implementation.createHTMLDocument() + + // @param msgBody [String] the html content of the email + // @param dom [Document] a document object or equivalent implementation. + // Must respond to `DOMImplementation.createHTMLDocument()`. 
+ // @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument exports.extractFromHtml = function(msgBody, dom) { - var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, ref3, returnFlags; + var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; if (dom == null) { console.error("No dom provided to parse html."); return msgBody; @@ -50,14 +84,21 @@ if (msgBody.trim() === '') { return msgBody; } - ref = _CRLF_to_LF(msgBody), msgBody = ref[0], crlfReplaced = ref[1]; + [msgBody, crlfReplaced] = _CRLF_to_LF(msgBody); emailDocument = htmlPlaner.createEmailDocument(msgBody, dom); + // TODO: this check does not handle cases of emails between various email providers well because + // it will find whichever splitter comes first in this list, not necessarily the top-most and stop + // checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner + // to find the earliest splitter in the DOM. 
haveCutQuotations = htmlPlaner.cutGmailQuote(emailDocument) || htmlPlaner.cutBlockQuote(emailDocument) || htmlPlaner.cutMicrosoftQuote(emailDocument) || htmlPlaner.cutById(emailDocument) || htmlPlaner.cutFromBlock(emailDocument); + // Create unaltered copy of email document emailDocumentCopy = htmlPlaner.createEmailDocument(emailDocument.documentElement.outerHTML, dom); + // Add checkpoints to html document numberOfCheckpoints = htmlPlaner.addCheckpoints(emailDocument.body, 0); quotationCheckpoints = Array.apply(null, Array(numberOfCheckpoints)).map(function() { return false; }); + // Get plain text version to put through plain text algorithm htmlPlaner.replaceBreakTagsWithLineFeeds(emailDocument); plainTextMsg = emailDocument.body.textContent; plainTextMsg = preprocess(plainTextMsg, "\n", 'text/html'); @@ -65,6 +106,7 @@ if (lines.length > MAX_LINES_COUNT) { return msgBody; } + // Collect checkpoints for each line lineCheckpoints = new Array(lines.length); for (index = k = 0, len = lines.length; k < len; index = ++k) { line = lines[index]; @@ -73,6 +115,7 @@ return parseInt(match.slice(4, -4)); }); } + // Remove checkpoints from lines to pass through plain text algorithm lines = lines.map(function(line) { return line.replace(htmlPlaner.CHECKPOINT_PATTERN, ''); }); @@ -81,41 +124,58 @@ exports.processMarkedLines(lines, markers, returnFlags); if (!returnFlags.wereLinesDeleted) { if (haveCutQuotations) { + // If we cut a quotation element out of the html, return the html output of the copied document. return _restore_CRLF(emailDocumentCopy.documentElement.outerHTML, crlfReplaced); } else { + // There was nothing to remove, return original message. return msgBody; } } - for (i = l = ref1 = returnFlags.firstLine, ref2 = returnFlags.lastLine; ref1 <= ref2 ? l <= ref2 : l >= ref2; i = ref1 <= ref2 ? 
++l : --l) { +// Set quotationCheckpoints to true for checkpoints on lines that were removed + for (i = l = ref = returnFlags.firstLine, ref1 = returnFlags.lastLine; (ref <= ref1 ? l <= ref1 : l >= ref1); i = ref <= ref1 ? ++l : --l) { if (!lineCheckpoints[i]) { continue; } - ref3 = lineCheckpoints[i]; - for (m = 0, len1 = ref3.length; m < len1; m++) { - checkpoint = ref3[m]; + ref2 = lineCheckpoints[i]; + for (m = 0, len1 = ref2.length; m < len1; m++) { + checkpoint = ref2[m]; quotationCheckpoints[checkpoint] = true; } } + // Remove the element that have been identified as part of the quoted message htmlPlaner.deleteQuotationTags(emailDocumentCopy.body, 0, quotationCheckpoints); return emailDocumentCopy.documentElement.outerHTML; }; + // Mark message lines with markers to distinguish quotation lines. + + // Markers: + // * e - empty line + // * f - Forwarded message line, see REGEXES.FWD + // * m - line that starts with quotation marker '>' + // * s - splitter line + // * t - presumably lines from the last message in the conversation + + // $> markMessageLines(['answer', 'From: foo@bar.com', '', '> question']) + // 'tsem' + exports.markMessageLines = function(lines) { var i, j, k, markers, ref, splitter, splitterLines; markers = []; i = 0; while (i < lines.length) { if (lines[i].trim() === '') { - markers[i] = 'e'; + markers[i] = 'e'; // empty line } else if (REGEXES.QUOT_PATTERN.test(lines[i])) { - markers[i] = 'm'; + markers[i] = 'm'; // line with quotation marker } else if (REGEXES.FWD.test(lines[i])) { - markers[i] = 'f'; + markers[i] = 'f'; // ---- Forwarded message ---- } else { splitter = isSplitter(lines.slice(i, i + SPLITTER_MAX_LINES).join("\n")); if (splitter) { + // splitter[0] is the entire match splitterLines = splitter[0].split("\n"); - for (j = k = 0, ref = splitterLines.length; 0 <= ref ? k <= ref : k >= ref; j = 0 <= ref ? ++k : --k) { + for (j = k = 0, ref = splitterLines.length; (0 <= ref ? k <= ref : k >= ref); j = 0 <= ref ? 
++k : --k) { markers[i + j] = 's'; } i += splitterLines.length - 1; @@ -128,6 +188,7 @@ return markers.join(''); }; + // Check the line for each splitter regex. isSplitter = function(line) { var k, len, matchArray, pattern, ref; if (line.length > MAX_LINE_LENGTH) { @@ -144,18 +205,27 @@ return null; }; - exports.processMarkedLines = function(lines, markers, returnFlags) { + // Run regexes against message's marked lines to strip quotations. + + // Return only last message lines. + // $> processMarkedLines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem']) + // ['Hello'] + + // Will also modify the provided returnFlags object and set the following properties: + // returnFlags = { wereLinesDeleted: (true|false), firstLine: (Number), lastLine: (Number) } + // @see setReturnFlags + exports.processMarkedLines = function(lines, markers, returnFlags = {}) { var inlineMatchRegex, inlineReplyIndex, inlineReplyMatch, isInlineReplyLink, quotationEnd, quotationMatch; - if (returnFlags == null) { - returnFlags = {}; - } + // If there are no splitters there should be no markers if (markers.indexOf('s') < 0 && !/(me*){3}/.test(markers)) { markers = markers.replace(/m/g, 't'); } + // If the message is a forward do nothing. 
if (/^[te]*f/.test(markers)) { setReturnFlags(returnFlags, false, -1, -1); return lines; } + // Find inline replies (tm's following the first m in markers string) inlineMatchRegex = new RegExp('m(?=e*((?:t+e*)+)m)', 'g'); while (inlineReplyMatch = inlineMatchRegex.exec(lines)) { inlineReplyIndex = markers.indexOf(inlineReplyMatch[1], inlineReplyMatch.index); @@ -168,11 +238,13 @@ return lines; } } + // Cut out text lines coming after splitter if there are no markers there quotationMatch = new RegExp('(se*)+((t|f)+e*)+', 'g').exec(markers); if (quotationMatch) { setReturnFlags(returnFlags, true, quotationMatch.index, lines.length); return lines.slice(0, quotationMatch.index); } + // Handle the case with markers quotationMatch = REGEXES.QUOTATION.exec(markers) || REGEXES.EMPTY_QUOTATION.exec(markers); if (quotationMatch) { quotationEnd = quotationMatch.index + quotationMatch[1].length; @@ -189,23 +261,33 @@ return returnFlags.lastLine = lastLine; }; - preprocess = function(msgBody, delimiter, contentType) { - if (contentType == null) { - contentType = 'text/plain'; - } + // Prepares msgBody for being stripped. + + // Replaces link brackets so that they couldn't be taken for quotation marker. + // Splits line in two if splitter pattern preceded by some text on the same + // line (done only for 'On wrote:' pattern). + + preprocess = function(msgBody, delimiter, contentType = 'text/plain') { + // Normalize links i.e. replace '<', '>' wrapping the link with some symbols + // so that '>' closing the link couldn't be mistakenly taken for quotation + // marker. 
+ // REGEXES.LINK has 1 captured group msgBody = msgBody.replace(REGEXES.LINK, function(entireMatch, groupMatch1, matchIndex) { var newLineIndex; + // Look for closest newline character newLineIndex = msgBody.lastIndexOf("\n", matchIndex); + // If the new current line starts with a '>' quotation marker, don't mess with the link if (newLineIndex > 0 && msgBody[newLineIndex + 1] === '>') { return entireMatch; } else { - return "@@" + groupMatch1 + "@@"; + return `@@${groupMatch1}@@`; } }); if (contentType === 'text/plain' && msgBody.length < MAX_LINE_LENGTH) { + // ON_DATE_SMB_WROTE has 4 captured groups msgBody = msgBody.replace(REGEXES.ON_DATE_SMB_WROTE, function(entireMatch, groupMatch1, groupMatch2, groupMatch3, groupMatch4, matchIndex) { if (matchIndex && msgBody[matchIndex - 1] !== "\n") { - return "" + delimiter + entireMatch; + return `${delimiter}${entireMatch}`; } else { return entireMatch; } @@ -214,6 +296,8 @@ return msgBody; }; + // Make up for changes done at preprocessing message. + // Replace link brackets back to '<' and '>'. 
postprocess = function(msgBody) { return msgBody.replace(REGEXES.NORMALIZED_LINK, '<$1>').trim(); }; @@ -245,10 +329,7 @@ return [msgBody, false]; }; - _restore_CRLF = function(msgBody, replaced) { - if (replaced == null) { - replaced = true; - } + _restore_CRLF = function(msgBody, replaced = true) { if (replaced) { return msgBody.replace(new RegExp('\n', 'g'), '\r\n'); } diff --git a/lib/regexes.js b/lib/regexes.js index b42fc83..8a08565 100644 --- a/lib/regexes.js +++ b/lib/regexes.js @@ -1,11 +1,13 @@ -// Generated by CoffeeScript 1.12.7 +// Generated by CoffeeScript 2.5.1 (function() { exports.DELIMITER = new RegExp('\r?\n'); exports.FWD = new RegExp("^[-]+[ ]*Forwarded message[ ]*[-]+$", 'im'); + // On {date}, {somebody} wrote: exports.ON_DATE_SMB_WROTE = new RegExp("(-*[>]?[ ]?(On|Le|W dniu|Op|Am|P\xe5|Den)[ ].*(,|u\u017cytkownik)(.*\n){0,2}.*(wrote|sent|a \xe9crit|napisa\u0142|schreef|verzond|geschreven|schrieb|skrev):?-*)"); + // On {date} wrote {somebody}: exports.ON_DATE_WROTE_SMB = new RegExp('(-*[>]?[ ]?(Op|Am)[ ].*(.*\n){0,2}.*(schreef|verzond|geschreven|schrieb)[ ]*.*:)'); exports.QUOTATION = new RegExp('((?:s|(?:me*){2,}).*me*)[te]*$'); diff --git a/package.json b/package.json index 14142c3..025e79d 100644 --- a/package.json +++ b/package.json @@ -27,9 +27,9 @@ }, "homepage": "https://github.com/lever/planer#readme", "devDependencies": { - "chai": "^3.4.1", - "coffee-script": "^1.10.0", - "jsdom": "^11.6.0", - "mocha": "^2.3.4" + "chai": "^4.2.0", + "coffeescript": "^2.5.1", + "jsdom": "^16.2.2", + "mocha": "^5.2.0" } } diff --git a/test/mocha.opts b/test/mocha.opts index 8510727..90b511a 100644 --- a/test/mocha.opts +++ b/test/mocha.opts @@ -1,3 +1,3 @@ ---compilers coffee:coffee-script/register +--compilers coffee:coffeescript/register --recursive --reporter spec From 76934a76e9d335fec367abde0909ec3e4e9fce06 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 14:51:45 -0500 Subject: [PATCH 03/15] npm audit won't work is you disallow 
package lock files --- .npmrc | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .npmrc diff --git a/.npmrc b/.npmrc deleted file mode 100644 index 43c97e7..0000000 --- a/.npmrc +++ /dev/null @@ -1 +0,0 @@ -package-lock=false From 86fd077ac18921a8983f8b8b27be7dab17c0d9b7 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 15:03:45 -0500 Subject: [PATCH 04/15] remove mocha.opts since it's deprecated in newer versions of Mocha. add debug npm command to attach to chrome devtools for debugging --- package.json | 5 +++-- test/mocha.opts | 3 --- 2 files changed, 3 insertions(+), 5 deletions(-) delete mode 100644 test/mocha.opts diff --git a/package.json b/package.json index 025e79d..cfda604 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,8 @@ "registry": "http://registry.npmjs.org/" }, "scripts": { - "test": "mocha test/", + "test": "mocha --reporter spec --require coffeescript/register \"test/**/*.{js,coffee}\"", + "debug": "mocha --inspect-brk --reporter spec --require coffeescript/register \"test/**/*.{js,coffee}\"", "compile": "coffee -o lib -c src" }, "repository": { @@ -30,6 +31,6 @@ "chai": "^4.2.0", "coffeescript": "^2.5.1", "jsdom": "^16.2.2", - "mocha": "^5.2.0" + "mocha": "^6.2.3" } } diff --git a/test/mocha.opts b/test/mocha.opts deleted file mode 100644 index 90b511a..0000000 --- a/test/mocha.opts +++ /dev/null @@ -1,3 +0,0 @@ ---compilers coffee:coffeescript/register ---recursive ---reporter spec From 8a0e4b9663f2c17b7955d441c13092d09798f6ca Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 15:08:26 -0500 Subject: [PATCH 05/15] fixed zero-based array issue in findOutlookSplitterWithQuerySelector. code would fail detection unless at least two splitters were found and it would leave the first quoted message in the result.
--- src/htmlPlaner.coffee | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/htmlPlaner.coffee b/src/htmlPlaner.coffee index 0c1740d..f94aa49 100644 --- a/src/htmlPlaner.coffee +++ b/src/htmlPlaner.coffee @@ -258,10 +258,9 @@ findOutlookSplitterWithXpathQuery = (emailDocument, xpathQuery) -> findOutlookSplitterWithQuerySelector = (emailDocument, query) -> splitterResult = emailDocument.querySelectorAll(query) + return unless splitterResult.length > 0 - return unless splitterResult.length > 1 - - splitterElement = splitterResult[1] + splitterElement = splitterResult[0] if splitterElement.parentElement? && splitterElement == splitterElement.parentElement.children[0] splitterElement = splitterElement.parentElement From 8fe542b369469fcb73e2a58704efe0dc385e6a0d Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 15:09:41 -0500 Subject: [PATCH 06/15] added a reply message to the outlook mixed test and changed testspec to make sure it was handled correctly --- test/examples/html/outlook-mixed.html | 2 +- test/planerHtml.test.coffee | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/examples/html/outlook-mixed.html b/test/examples/html/outlook-mixed.html index 4ae3ac2..c4a2f3f 100644 --- a/test/examples/html/outlook-mixed.html +++ b/test/examples/html/outlook-mixed.html @@ -92,7 +92,7 @@
-

 

+

We can talk tomorrow. 

 

diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index 3cf24a7..431acc9 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -234,7 +234,7 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).not.to.contain(originalMsgSnippet) it 'handles emails from various Outlook versions', -> - replySnippet = 'This is how it looks on my emails' + replySnippet = 'We can talk tomorrow.' originalMsgSnippet = "We'd love to set up a quick phone call with you" msgBody = fs.readFileSync(absolutePath('examples/html/outlook-mixed.html'), 'utf8') From 8314ba1d220da5a3bd9e3f3cc873fd34771d87d0 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 15:23:58 -0500 Subject: [PATCH 07/15] added support for several Outlook 2010 American, Outlook 2013/16/19 American and International --- src/htmlPlaner.coffee | 7 +- test/examples/html/outlook-2010-american.html | 157 ++++++++++++++++++ .../html/outlook-2010-international.html | 157 ++++++++++++++++++ test/examples/html/outlook-2016-american.html | 111 +++++++++++++ .../html/outlook-2016-international.html | 111 +++++++++++++ test/planerHtml.test.coffee | 57 ++++++- 6 files changed, 596 insertions(+), 4 deletions(-) create mode 100644 test/examples/html/outlook-2010-american.html create mode 100644 test/examples/html/outlook-2010-international.html create mode 100644 test/examples/html/outlook-2016-american.html create mode 100644 test/examples/html/outlook-2016-international.html diff --git a/src/htmlPlaner.coffee b/src/htmlPlaner.coffee index f94aa49..bbb17f5 100644 --- a/src/htmlPlaner.coffee +++ b/src/htmlPlaner.coffee @@ -201,8 +201,10 @@ exports.replaceBreakTagsWithLineFeeds = (emailDocument) -> # Queries to find a splitter that's the only child of a single parent div # Usually represents the dividing line between messages in the Outlook html OUTLOOK_SPLITTER_QUERY_SELECTORS = - outlook2007: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']" 
- outlookForAndroid: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']" + outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']" + outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in']" + outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']" + outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in']" windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" # More complicated Xpath queries for versions of Outlook that don't use the dividing lines @@ -216,7 +218,6 @@ OUTLOOK_SPLITTER_QUOTE_IDS = findMicrosoftSplitter = (emailDocument) -> possibleSplitterElements = [] - for _, querySelector of OUTLOOK_SPLITTER_QUERY_SELECTORS if (splitterElement = findOutlookSplitterWithQuerySelector(emailDocument, querySelector)) possibleSplitterElements.push splitterElement diff --git a/test/examples/html/outlook-2010-american.html b/test/examples/html/outlook-2010-american.html new file mode 100644 index 0000000..4cb57dc --- /dev/null +++ b/test/examples/html/outlook-2010-american.html @@ -0,0 +1,157 @@ + + + + + + + + + + + + +
+

OK by me

+

+   +

+
+
+

From: John Wilson + [mailto:sdfds@sdfsd.com]
Sent: Wednesday, November 16, 2016 1:40 + PM
To: 'Jim Jones'
Cc: 'Susan Johns'; 'Dan Toms' +
Subject: + +

+
+
+

+   +

+

I further revised this – I can’t help myself – please quickly review and + REPLY ALL +

+

+   +

+

+   +

+

+   +

+

Thanks for your patience + during our system upgrades.

+

+   +

+
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2010-international.html b/test/examples/html/outlook-2010-international.html new file mode 100644 index 0000000..b70cc7a --- /dev/null +++ b/test/examples/html/outlook-2010-international.html @@ -0,0 +1,157 @@ + + + + + + + + + + + + +
+

OK by me

+

+   +

+
+
+

From: John Wilson + [mailto:sdfds@sdfsd.com]
Sent: Wednesday, November 16, 2016 1:40 + PM
To: 'Jim Jones'
Cc: 'Susan Johns'; 'Dan Toms' +
Subject: + +

+
+
+

+   +

+

I further revised this – I can’t help myself – please quickly review and + REPLY ALL +

+

+   +

+

+   +

+

+   +

+

Thanks for your patience + during our system upgrades.

+

+   +

+
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2016-american.html b/test/examples/html/outlook-2016-american.html new file mode 100644 index 0000000..31391fe --- /dev/null +++ b/test/examples/html/outlook-2016-american.html @@ -0,0 +1,111 @@ + + + + + + + + + + + + +
+

That time works for me. +

+

+   +

+
+
+

From: Bill <bill@example.com>
Sent: Wednesday, + April 8, 2020 7:51 PM
To: Tom <Tom@example.com>
Cc: Susan + <susan@example.com>
Subject: Re: Let's meet +

+
+
+

+   +

+
+

I can meet tomorrow. +

+
+
+

+   +

+
+ +
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2016-international.html b/test/examples/html/outlook-2016-international.html new file mode 100644 index 0000000..a62bed4 --- /dev/null +++ b/test/examples/html/outlook-2016-international.html @@ -0,0 +1,111 @@ + + + + + + + + + + + + +
+

That time works for me. +

+

+   +

+
+
+

From: Bill <bill@example.com>
Sent: Wednesday, + April 8, 2020 7:51 PM
To: Tom <Tom@example.com>
Cc: Susan + <susan@example.com>
Subject: Re: Let's meet +

+
+
+

+   +

+
+

I can meet tomorrow. +

+
+
+

+   +

+
+ +
+ + + \ No newline at end of file diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index 431acc9..4fa1356 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -212,7 +212,6 @@ describe 'planer#extractFromHtml', -> msgBody = fs.readFileSync(absolutePath('examples/html/microsoft-namespaces.html'), 'utf8') expect(msgBody).to.contain(replySnippet) expect(msgBody).to.contain(originalMsgSnippet) - extractedHtml = planer.extractFromHtml(msgBody, @dom) expect(extractedHtml).to.exist @@ -247,3 +246,59 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).to.contain(replySnippet) expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2010 American', -> + replySnippet = "OK by me" + originalMsgSnippet = 'further revised' + + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2010-american.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2010 International', -> + replySnippet = "OK by me" + originalMsgSnippet = 'further revised' + + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2010-international.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2013/2016/2019 American', -> + replySnippet = "That time works for me." + originalMsgSnippet = 'I can meet tomorrow.' 
+ + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2016-american.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2013/2016/2019 International', -> + replySnippet = "That time works for me." + originalMsgSnippet = 'I can meet tomorrow.' + + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2016-international.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) From 5b9bf479784c2b57f37063916bbd97c451ae116e Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 15:24:58 -0500 Subject: [PATCH 08/15] compiles coffeescript to js --- lib/htmlPlaner.js | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/htmlPlaner.js b/lib/htmlPlaner.js index 5a59ff7..6dd0cde 100644 --- a/lib/htmlPlaner.js +++ b/lib/htmlPlaner.js @@ -237,8 +237,10 @@ // Queries to find a splitter that's the only child of a single parent div // Usually represents the dividing line between messages in the Outlook html OUTLOOK_SPLITTER_QUERY_SELECTORS = { - outlook2007: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']", - outlookForAndroid: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']", + outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']", + outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in']", + 
outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']", + outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in']", windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" }; @@ -311,10 +313,10 @@ findOutlookSplitterWithQuerySelector = function(emailDocument, query) { var splitterElement, splitterResult; splitterResult = emailDocument.querySelectorAll(query); - if (!(splitterResult.length > 1)) { + if (!(splitterResult.length > 0)) { return; } - splitterElement = splitterResult[1]; + splitterElement = splitterResult[0]; if ((splitterElement.parentElement != null) && splitterElement === splitterElement.parentElement.children[0]) { splitterElement = splitterElement.parentElement; } From 119adeb784ae7a536d0b72c23f42d00cede202c2 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 16:08:50 -0500 Subject: [PATCH 09/15] added test for reply from iOS Mail Client --- test/examples/html/iosMail.html | 21 +++++++++++++++++++++ test/planerHtml.test.coffee | 14 ++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 test/examples/html/iosMail.html diff --git a/test/examples/html/iosMail.html b/test/examples/html/iosMail.html new file mode 100644 index 0000000..6056411 --- /dev/null +++ b/test/examples/html/iosMail.html @@ -0,0 +1,21 @@ + + + + + + + +
This is an html reply.
+

+

+

+
On Apr 14, 2020, at 3:41 PM, John Doe <john@example.com> wrote:

+
+
+
+
This is my original message from GMail iOS 
+
+
+ + + \ No newline at end of file diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index 4fa1356..e88f139 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -302,3 +302,17 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).to.exist expect(extractedHtml).to.contain(replySnippet) expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Apple iOS Mail', -> + replySnippet = "html reply" + originalMsgSnippet = 'original message from GMail' + + msgBody = fs.readFileSync(absolutePath('examples/html/iosMail.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) From 17529ea6c4ad8bce9b2407f0a75755211780f200 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Tue, 14 Apr 2020 16:31:03 -0500 Subject: [PATCH 10/15] updated test name --- test/planerHtml.test.coffee | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index e88f139..13ba3c4 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -247,7 +247,7 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).not.to.contain(originalMsgSnippet) - it 'handles emails from Office 2010 American', -> + it 'handles emails from Office 2007/2010 American', -> replySnippet = "OK by me" originalMsgSnippet = 'further revised' @@ -261,7 +261,7 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).to.contain(replySnippet) expect(extractedHtml).not.to.contain(originalMsgSnippet) - it 'handles emails from Office 2010 International', -> + it 'handles emails from Office 2007/2010 International', -> replySnippet = "OK by me" originalMsgSnippet = 'further revised' From 
eedcf47edb0bf5141521366289241f0bd6ba6af0 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Wed, 15 Apr 2020 05:40:47 -0500 Subject: [PATCH 11/15] bump version to 1.2.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index cfda604..200c22a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "planer", - "version": "1.1.1", + "version": "1.2.0", "description": "Remove reply quotations from emails", "main": "lib/planer.js", "publishConfig": { From e809a0cbee749d2e49858a4b01d87ae274d6dfa1 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Wed, 22 Apr 2020 06:23:29 -0500 Subject: [PATCH 12/15] handle reply chains among multiple clients by simply running all cut functions on each message instead of prioritizing order --- lib/planer.js | 15 +-- package.json | 2 +- src/planer.coffee | 22 +++-- .../html/mixedEmailClientReplyChain.html | 93 +++++++++++++++++++ test/planerHtml.test.coffee | 14 +++ 5 files changed, 130 insertions(+), 16 deletions(-) create mode 100644 test/examples/html/mixedEmailClientReplyChain.html diff --git a/lib/planer.js b/lib/planer.js index 724e72e..f79e79c 100644 --- a/lib/planer.js +++ b/lib/planer.js @@ -76,7 +76,7 @@ // Must respond to `DOMImplementation.createHTMLDocument()`. 
// @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument exports.extractFromHtml = function(msgBody, dom) { - var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; + var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, haveCutQuotationsBlock, haveCutQuotationsById, haveCutQuotationsFromBlock, haveCutQuotationsGMail, haveCutQuotationsMicrosoft, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; if (dom == null) { console.error("No dom provided to parse html."); return msgBody; @@ -86,11 +86,14 @@ } [msgBody, crlfReplaced] = _CRLF_to_LF(msgBody); emailDocument = htmlPlaner.createEmailDocument(msgBody, dom); - // TODO: this check does not handle cases of emails between various email providers well because - // it will find whichever splitter comes first in this list, not necessarily the top-most and stop - // checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner - // to find the earliest splitter in the DOM. 
- haveCutQuotations = htmlPlaner.cutGmailQuote(emailDocument) || htmlPlaner.cutBlockQuote(emailDocument) || htmlPlaner.cutMicrosoftQuote(emailDocument) || htmlPlaner.cutById(emailDocument) || htmlPlaner.cutFromBlock(emailDocument); + // handle cases of emails between various email providers by running all checks instead of + // stopping at whichever check returns positive first + haveCutQuotationsGMail = htmlPlaner.cutGmailQuote(emailDocument); + haveCutQuotationsBlock = htmlPlaner.cutBlockQuote(emailDocument); + haveCutQuotationsMicrosoft = htmlPlaner.cutMicrosoftQuote(emailDocument); + haveCutQuotationsById = htmlPlaner.cutById(emailDocument); + haveCutQuotationsFromBlock = htmlPlaner.cutFromBlock(emailDocument); + haveCutQuotations = haveCutQuotationsGMail || haveCutQuotationsBlock || haveCutQuotationsMicrosoft || haveCutQuotationsById || haveCutQuotationsFromBlock; // Create unaltered copy of email document emailDocumentCopy = htmlPlaner.createEmailDocument(emailDocument.documentElement.outerHTML, dom); // Add checkpoints to html document diff --git a/package.json b/package.json index 200c22a..7021fbf 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "planer", - "version": "1.2.0", + "version": "1.3.0", "description": "Remove reply quotations from emails", "main": "lib/planer.js", "publishConfig": { diff --git a/src/planer.coffee b/src/planer.coffee index 68a4ce0..cbbf9f7 100644 --- a/src/planer.coffee +++ b/src/planer.coffee @@ -78,16 +78,20 @@ exports.extractFromHtml = (msgBody, dom) -> [msgBody, crlfReplaced] = _CRLF_to_LF msgBody emailDocument = htmlPlaner.createEmailDocument msgBody, dom - # TODO: this check does not handle cases of emails between various email providers well because - # it will find whichever splitter comes first in this list, not necessarily the top-most and stop - # checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner - # to find the earliest splitter in the DOM. 
+ # handle cases of emails between various email providers by running all checks instead of + # stopping at whichever check returns positive first + haveCutQuotationsGMail = htmlPlaner.cutGmailQuote(emailDocument) + haveCutQuotationsBlock = htmlPlaner.cutBlockQuote(emailDocument) + haveCutQuotationsMicrosoft = htmlPlaner.cutMicrosoftQuote(emailDocument) + haveCutQuotationsById = htmlPlaner.cutById(emailDocument) + haveCutQuotationsFromBlock = htmlPlaner.cutFromBlock(emailDocument) + haveCutQuotations = ( - htmlPlaner.cutGmailQuote(emailDocument) || - htmlPlaner.cutBlockQuote(emailDocument) || - htmlPlaner.cutMicrosoftQuote(emailDocument) || - htmlPlaner.cutById(emailDocument) || - htmlPlaner.cutFromBlock(emailDocument) + haveCutQuotationsGMail || + haveCutQuotationsBlock || + haveCutQuotationsMicrosoft || + haveCutQuotationsById || + haveCutQuotationsFromBlock ) # Create unaltered copy of email document diff --git a/test/examples/html/mixedEmailClientReplyChain.html b/test/examples/html/mixedEmailClientReplyChain.html new file mode 100644 index 0000000..29d3944 --- /dev/null +++ b/test/examples/html/mixedEmailClientReplyChain.html @@ -0,0 +1,93 @@ + + + + + + + + + +
Here is the answer +
+
+

+
+
+
+

Thomas Smith

+

+

+
+
+
+
+
+
+
From: Bob Smith <bob@smith.com>
+Sent: Monday, April 13, 2020 9:12 AM
+To: My group <thegroup@list.com>
+Subject: Re: [group] Having Trouble
+
 
+
+
+ +
Tom I am having trouble pulling up that case as well. Could someone post or direct me?
+
+
+
On Sat, Apr 11, 2020 at 9:51 AM R. Smith <group@example.com> wrote:
+
+
+
+
+
Please share, I haven't seen it yet.
+
+

+
+
+
+

+
R. Smith
+
+
+
+
+
+
From: Thomas Smith <group@example.com>
+Sent: Saturday, April 11, 2020 9:24 AM
+To: My Group <thegroup@list.com>
+Subject: Re: [group] Having Trouble
+
 
+
+
+
+
I have gotten past this before. 
+

+
+ +
From: "Jim Johnson" <group@example.com>
+Reply-To: My Group <thegroup@list.com>
+Date: Friday, April 17, 2020 at 6:28 PM
+To: My Group <thegroup@list.com>
+Subject: [group] Having Trouble
+
+

+
+
+
+
+

Anyone had any success on getting past this big problem?

+

 

+

Jim

+
+
+
+
+
+
+
+
+
+
+ + + \ No newline at end of file diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index 13ba3c4..6a04f78 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -316,3 +316,17 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).to.exist expect(extractedHtml).to.contain(replySnippet) expect(extractedHtml).not.to.contain(originalMsgSnippet) + + + it 'handles emails reply chains involving multiple email clients', -> + replySnippet = "Here is the answer" + originalMsgSnippet = 'I am having trouble' + msgBody = fs.readFileSync(absolutePath('examples/html/mixedEmailClientReplyChain.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) From 6df227d9c8767960ab1f4ca5759370f4c7a43f40 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Wed, 22 Apr 2020 16:04:22 -0500 Subject: [PATCH 13/15] make querySelectorAll selectors case-insensitive since hex color colrs are sometimes lowercased by outlook --- src/htmlPlaner.coffee | 11 ++++++----- test/examples/html/outlook-2016-american.html | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/htmlPlaner.coffee b/src/htmlPlaner.coffee index bbb17f5..9cba64c 100644 --- a/src/htmlPlaner.coffee +++ b/src/htmlPlaner.coffee @@ -200,12 +200,13 @@ exports.replaceBreakTagsWithLineFeeds = (emailDocument) -> # Queries to find a splitter that's the only child of a single parent div # Usually represents the dividing line between messages in the Outlook html +# using case-insensitive modifier "i" at the end of each selector since the color hex color has been seen lowercased in some outlook emails OUTLOOK_SPLITTER_QUERY_SELECTORS = - outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 
1.0pt;padding:3.0pt 0cm 0cm 0cm']" - outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in']" - outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']" - outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in']" - windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" + outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]" + outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in' i]" + outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]" + outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in' i]" + windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;' i]" # More complicated Xpath queries for versions of Outlook that don't use the dividing lines OUTLOOK_XPATH_SPLITTER_QUERIES = diff --git a/test/examples/html/outlook-2016-american.html b/test/examples/html/outlook-2016-american.html index 31391fe..b000b6f 100644 --- a/test/examples/html/outlook-2016-american.html +++ b/test/examples/html/outlook-2016-american.html @@ -85,7 +85,7 @@  

-
+

From: Bill <bill@example.com>
Sent: Wednesday, April 8, 2020 7:51 PM
To: Tom <Tom@example.com>
Cc: Susan <susan@example.com>
Subject: Re: Let's meet From e372470569317314a25a09b34d53ce61f34aaa24 Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Wed, 22 Apr 2020 16:18:39 -0500 Subject: [PATCH 14/15] compiling coffeescript --- lib/htmlPlaner.js | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/htmlPlaner.js b/lib/htmlPlaner.js index 6dd0cde..4eb299e 100644 --- a/lib/htmlPlaner.js +++ b/lib/htmlPlaner.js @@ -236,12 +236,13 @@ // Queries to find a splitter that's the only child of a single parent div // Usually represents the dividing line between messages in the Outlook html + // using case-insensitive modifier "i" at the end of each selector since the color hex color has been seen lowercased in some outlook emails OUTLOOK_SPLITTER_QUERY_SELECTORS = { - outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']", - outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in']", - outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']", - outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in']", - windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" + outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]", + outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in' i]", + outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]", + outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in' i]", + windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;' i]" }; 
// More complicated Xpath queries for versions of Outlook that don't use the dividing lines From 42b6e1cadf492d1c455d41894130a16821ad19cd Mon Sep 17 00:00:00 2001 From: Aaron Terry Date: Thu, 23 Apr 2020 07:12:16 -0500 Subject: [PATCH 15/15] detect replies using the yahoo_quoted class --- lib/htmlPlaner.js | 10 + lib/planer.js | 5 +- src/htmlPlaner.coffee | 7 + src/planer.coffee | 2 + test/examples/html/yahooMail2020.html | 435 ++++++++++++++++++++++++++ test/planerHtml.test.coffee | 13 + 6 files changed, 470 insertions(+), 2 deletions(-) create mode 100644 test/examples/html/yahooMail2020.html diff --git a/lib/htmlPlaner.js b/lib/htmlPlaner.js index 4eb299e..818b578 100644 --- a/lib/htmlPlaner.js +++ b/lib/htmlPlaner.js @@ -111,6 +111,16 @@ return true; }; + exports.cutYahooQuote = function(emailDocument) { + var nodesArray; + nodesArray = emailDocument.getElementsByClassName('yahoo_quoted'); + if (!(nodesArray.length > 0)) { + return false; + } + removeNodes(nodesArray); + return true; + }; + exports.cutMicrosoftQuote = function(emailDocument) { var afterSplitter, parentElement, splitterElement; splitterElement = findMicrosoftSplitter(emailDocument); diff --git a/lib/planer.js b/lib/planer.js index f79e79c..82cbcdc 100644 --- a/lib/planer.js +++ b/lib/planer.js @@ -76,7 +76,7 @@ // Must respond to `DOMImplementation.createHTMLDocument()`. 
// @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument exports.extractFromHtml = function(msgBody, dom) { - var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, haveCutQuotationsBlock, haveCutQuotationsById, haveCutQuotationsFromBlock, haveCutQuotationsGMail, haveCutQuotationsMicrosoft, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; + var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, haveCutQuotationsBlock, haveCutQuotationsById, haveCutQuotationsFromBlock, haveCutQuotationsGMail, haveCutQuotationsMicrosoft, haveCutQuotationsYahoo, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; if (dom == null) { console.error("No dom provided to parse html."); return msgBody; @@ -89,11 +89,12 @@ // handle cases of emails between various email providers by running all checks instead of // stopping at whichever check returns positive first haveCutQuotationsGMail = htmlPlaner.cutGmailQuote(emailDocument); + haveCutQuotationsYahoo = htmlPlaner.cutYahooQuote(emailDocument); haveCutQuotationsBlock = htmlPlaner.cutBlockQuote(emailDocument); haveCutQuotationsMicrosoft = htmlPlaner.cutMicrosoftQuote(emailDocument); haveCutQuotationsById = htmlPlaner.cutById(emailDocument); haveCutQuotationsFromBlock = htmlPlaner.cutFromBlock(emailDocument); - haveCutQuotations = haveCutQuotationsGMail || haveCutQuotationsBlock || haveCutQuotationsMicrosoft || haveCutQuotationsById || haveCutQuotationsFromBlock; + haveCutQuotations = haveCutQuotationsGMail || haveCutQuotationsYahoo || haveCutQuotationsBlock || haveCutQuotationsMicrosoft || haveCutQuotationsById || haveCutQuotationsFromBlock; // Create unaltered copy of email document emailDocumentCopy = 
htmlPlaner.createEmailDocument(emailDocument.documentElement.outerHTML, dom); // Add checkpoints to html document diff --git a/src/htmlPlaner.coffee b/src/htmlPlaner.coffee index 9cba64c..c11c507 100644 --- a/src/htmlPlaner.coffee +++ b/src/htmlPlaner.coffee @@ -86,6 +86,13 @@ exports.cutGmailQuote = (emailDocument) -> removeNodes(nodesArray) return true +exports.cutYahooQuote = (emailDocument) -> + nodesArray = emailDocument.getElementsByClassName('yahoo_quoted') + return false unless nodesArray.length > 0 + + removeNodes(nodesArray) + return true + exports.cutMicrosoftQuote = (emailDocument) -> splitterElement = findMicrosoftSplitter(emailDocument) return false unless splitterElement? diff --git a/src/planer.coffee b/src/planer.coffee index cbbf9f7..bce3623 100644 --- a/src/planer.coffee +++ b/src/planer.coffee @@ -81,6 +81,7 @@ exports.extractFromHtml = (msgBody, dom) -> # handle cases of emails between various email providers by running all checks instead of # stopping at whichever check returns positive first haveCutQuotationsGMail = htmlPlaner.cutGmailQuote(emailDocument) + haveCutQuotationsYahoo = htmlPlaner.cutYahooQuote(emailDocument) haveCutQuotationsBlock = htmlPlaner.cutBlockQuote(emailDocument) haveCutQuotationsMicrosoft = htmlPlaner.cutMicrosoftQuote(emailDocument) haveCutQuotationsById = htmlPlaner.cutById(emailDocument) @@ -88,6 +89,7 @@ exports.extractFromHtml = (msgBody, dom) -> haveCutQuotations = ( haveCutQuotationsGMail || + haveCutQuotationsYahoo || haveCutQuotationsBlock || haveCutQuotationsMicrosoft || haveCutQuotationsById || diff --git a/test/examples/html/yahooMail2020.html b/test/examples/html/yahooMail2020.html new file mode 100644 index 0000000..1cf1f27 --- /dev/null +++ b/test/examples/html/yahooMail2020.html @@ -0,0 +1,435 @@ + + + + + + + + + + + + +

I didn't realize this was such a big problem.  
+

+
+
+

 

+

Tom Wilson

+

+
+

+
+

+
+
+
+
On Wednesday, April 22, 2020, 04:41:30 PM CDT, Sam Smith (MyGroup listserver) <sender@lists.example.com> wrote:
+

+
+

+
+
+
+
+
+

I saw this coming behind the scenes a few years back.

+

 

+

 

+

 

+
+

From: "John Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: MyGroup Listserv <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 4:26 PM
+To: MyGroup Listserv <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+
+

I get where they are coming from too, wondering if they are just trying to create a new process.

+
+
+
+
+
+

From: James Jones (MyGroup listserver) <sender@lists.example.com>
+Sent: Wednesday, April 22, 2020 3:39 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+

 

+
+
+
+

 

+
+

Some bad eggs out there, I see where they are coming from.

+

 

+ +
+

James Jones

+

Owner/Managing Partner

+
+

 

+

 

+
+

From: "Tom Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: My Group List Server <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 3:28 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+ + +
+

Sounds like a trust issue.

+
+
+

 

+
+
+

I would just remind them.

+
+
+

 

+
+
+

That's my two cents.....

+
+
+

 

+
+
+

 

+

Tom Wilson

+
+
+

 

+
+
+

 

+
+
+
+
+

On Wednesday, April 22, 2020, 03:18:33 PM CDT, John Wilson (MyGroup listserver) <sender@lists.example.com> wrote:

+
+
+

 

+
+
+

 

+
+
+
+
+

 

+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

 

+
+

 

+
+


+

+
+
+
+
+
+
+
+

I saw this coming behind the scenes a few years back.

+

 

+

 

+

 

+
+

From: "John Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: MyGroup Listserv <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 4:26 PM
+To: MyGroup Listserv <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+
+

I get where they are coming from too, wondering if they are just trying to create a new process.

+
+
+
+
+
+

From: James Jones (MyGroup listserver) <sender@lists.example.com>
+Sent: Wednesday, April 22, 2020 3:39 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+

 

+
+
+
+

 

+
+

Some bad eggs out there, I see where they are coming from.

+

 

+
+

James Jones

+

Owner/Managing Partner

+
+

 

+

 

+
+

From: "Tom Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: My Group List Server <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 3:28 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+
+

Sounds like a trust issue.

+
+
+

 

+
+
+

I would just remind them.

+
+
+

 

+
+
+

That's my two cents.....

+
+
+

 

+
+
+

 

+

Tom Wilson

+
+
+

 

+
+
+

 

+
+
+
+
+

On Wednesday, April 22, 2020, 03:18:33 PM CDT, John Wilson (MyGroup listserver) <sender@lists.example.com> wrote:

+
+
+

 

+
+
+

 

+
+
+
+
+

 

+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

 

+
+

 

+
+


+

+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index 6a04f78..dc2f1f0 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -330,3 +330,16 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).to.exist expect(extractedHtml).to.contain(replySnippet) expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails Yahoo replies using the yahooo_quoted class', -> + replySnippet = "such a big problem" + originalMsgSnippet = 'new process' + msgBody = fs.readFileSync(absolutePath('examples/html/yahooMail2020.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet)