From 893fdb7409ee554c03fe0ee3c2e7a24660587ad6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 5 Jun 2017 13:27:13 +0200 Subject: [PATCH] Limit pattern matching URLs embedded in CSS to match max. 8000 characters, add unit test, fixes commoncrawl/ia-web-commons#12 --- .../resource/html/ExtractingParseObserver.java | 2 +- .../html/ExtractingParseObserverTest.java | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java index 52989455..66aa36a7 100644 --- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java +++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java @@ -26,7 +26,7 @@ public class ExtractingParseObserver implements ParseObserver { boolean inTitle = false; protected static String cssUrlPatString = - "url\\s*\\(\\s*((?:\\\\?[\"'])?.+?(?:\\\\?[\"'])?)\\s*\\)"; + "url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)"; protected static String cssUrlTrimPatString = "^(?:\\\\?[\"'])+|(?:\\\\?[\"'])+$"; protected static String cssImportNoUrlPatString = diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java index 8f690a06..a7fa272f 100644 --- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java +++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java @@ -93,6 +93,24 @@ public void testHandleStyleNodeNoHangupTruncated() throws Exception { checkExtract(test); } + /** + * Test that the pattern matcher does not overflow the stack on an overlong + * sequence of quote characters around a CSS link. 
+ */ + public void testHandleStyleNodeNoStackOverflow() throws Exception { + StringBuilder sb = new StringBuilder(); + sb.append("url("); + for (int i = 0; i < 20000; i++) + sb.append('\''); + sb.append("foos.gif"); + for (int i = 0; i < 20000; i++) + sb.append('\''); + sb.append(");"); + String[] test = new String[1]; + test[0] = sb.toString(); + checkExtract(test); + } + private void checkExtract(String[] data) throws JSONException { // System.err.format("CSS(%s) want[0](%s)\n",css,want[0]); String css = data[0];