From bd66a42db0756409663adc3b94d9c74cb3186816 Mon Sep 17 00:00:00 2001 From: Dong Date: Mon, 30 Sep 2019 14:13:35 +0900 Subject: [PATCH 1/2] support of HTTP 30X redirect --- src/main/java/org/opengraph/OpenGraph.java | 672 +++++++++--------- .../java/org/opengraph/OpenGraphTest.java | 10 +- 2 files changed, 346 insertions(+), 336 deletions(-) diff --git a/src/main/java/org/opengraph/OpenGraph.java b/src/main/java/org/opengraph/OpenGraph.java index 62eda63..127beb9 100644 --- a/src/main/java/org/opengraph/OpenGraph.java +++ b/src/main/java/org/opengraph/OpenGraph.java @@ -1,113 +1,135 @@ package org.opengraph; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; - import java.io.BufferedReader; import java.io.InputStreamReader; +import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Hashtable; +import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; + /** - * A Java object representation of an Open Graph enabled webpage. - * A simplified layer over a Hastable. + * A Java object representation of an Open Graph enabled webpage. A simplified + * layer over a Hastable. * * @author Callum Jones */ -public class OpenGraph -{ - private String pageUrl; +public class OpenGraph { + private String pageUrl; private ArrayList pageNamespaces; - private Hashtable> metaAttributes; - private String baseType; - private boolean isImported; // determine if the object is a new incarnation or representation of a web page - private boolean hasChanged; // track if object has been changed - - public final static String[] REQUIRED_META = new String[]{"title", "type", "image", "url" }; - - public final static Hashtable BASE_TYPES = new Hashtable(); - static - { - BASE_TYPES.put("activity", new String[] {"activity", "sport"}); - BASE_TYPES.put("business", new String[] {"bar", "company", "cafe", "hotel", "restaurant"}); - BASE_TYPES.put("group", new String[] {"cause", "sports_league", "sports_team"}); - BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"}); - BASE_TYPES.put("person", new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"}); - BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"}); - BASE_TYPES.put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"}); - BASE_TYPES.put("website", new String[] {"blog", "website", "article"}); + private Hashtable> metaAttributes; + private String baseType; + private boolean isImported; // determine if the object is a new incarnation or representation of a web page + private boolean hasChanged; // track if object has been changed + private AtomicInteger redirectCounter; + + public final static String[] REQUIRED_META = new String[] { "title", "type", "image", "url" }; + + public final static Hashtable BASE_TYPES = new Hashtable(); + static { + BASE_TYPES.put("activity", new String[] { "activity", "sport" }); + BASE_TYPES.put("business", new String[] { "bar", "company", "cafe", "hotel", "restaurant" }); + BASE_TYPES.put("group", new String[] { "cause", "sports_league", "sports_team" }); + BASE_TYPES.put("organization", new String[] { "band", "government", "non_profit", "school", "university" }); + BASE_TYPES.put("person", new String[] { "actor", "athlete", "author", "director", "musician", "politician", + "profile", "public_figure" }); + BASE_TYPES.put("place", new String[] { "city", "country", "landmark", "state_province" }); + BASE_TYPES.put("product", + new String[] { "album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show" }); + BASE_TYPES.put("website", new String[] { "blog", "website", "article" }); } - /** - * Create an open graph representation for generating your own Open Graph object - */ - public OpenGraph() - { + /** + * Create an open graph representation for generating your own Open Graph object + */ + public OpenGraph() { pageNamespaces = new ArrayList(); - metaAttributes = new Hashtable>(); - hasChanged = false; - isImported = false; - } - - /** - * Fetch the open graph representation from a web site - * @param url The address to the web page to fetch Open Graph data - * @param ignoreSpecErrors Set this option to true if you don't wish to have an exception throw if the page does not conform to the basic 4 attributes - * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception - * @throws java.lang.Exception A generic exception is throw if the specific page fails to conform to the basic Open Graph standard as define by the constant REQUIRED_META - */ - public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception { - this(); - isImported = true; - - - // download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content - URL pageURL = new URL(url); - URLConnection siteConnection = pageURL.openConnection(); - Charset charset = getConnectionCharset(siteConnection); - BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); - String inputLine; - StringBuffer headContents = new StringBuffer(); - - // Loop through each line, looking for the closing head element - while ((inputLine = dis.readLine()) != null) - { - if (inputLine.contains("")) - { - inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); - inputLine = inputLine.concat(""); - headContents.append(inputLine + "\r\n"); - break; - } - headContents.append(inputLine + "\r\n"); - } - - String headContentsStr = headContents.toString(); - HtmlCleaner cleaner = new HtmlCleaner(); - // parse the string HTML - TagNode pageData = cleaner.clean(headContentsStr); + metaAttributes = new Hashtable>(); + hasChanged = false; + isImported = false; + redirectCounter = new AtomicInteger(3); + } + + /** + * Fetch the open graph representation from a web site + * + * @param url The address to the web page to fetch Open Graph data + * @param ignoreSpecErrors Set this option to true if you don't wish to have an + * exception throw if the page does not conform to the + * basic 4 attributes + * @throws java.io.IOException If a network error occurs, the HTML parser will + * throw an IO Exception + * @throws java.lang.Exception A generic exception is throw if the specific page + * fails to conform to the basic Open Graph standard + * as define by the constant REQUIRED_META + */ + public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception { + this(); + this.load(url, ignoreSpecErrors); + } + + private int load(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception { + isImported = true; + + // download the (X)HTML content, but only up to the closing head tag. We do not + // want to waste resources parsing irrelevant content + URL pageURL = new URL(url); + HttpURLConnection siteConnection = (HttpURLConnection)pageURL.openConnection(); + Charset charset = getConnectionCharset(siteConnection); + BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); + + int statusCode = siteConnection.getResponseCode(); + + if (statusCode == HttpURLConnection.HTTP_MOVED_PERM || statusCode == HttpURLConnection.HTTP_MOVED_TEMP) { + if (redirectCounter.decrementAndGet() < 0) { + throw new Exception("Maximum redirect count exceeded."); + } + String location = siteConnection.getHeaderField("Location"); + if (location == null) { + throw new Exception("The Site reponsed 30X without location header."); + } + return load(location, ignoreSpecErrors); + } + + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) { + if (inputLine.contains("")) { + inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); + inputLine = inputLine.concat(""); + headContents.append(inputLine + "\r\n"); + break; + } + headContents.append(inputLine + "\r\n"); + } + + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); // read in the declared namespaces boolean hasOGspec = false; TagNode headElement = pageData.findElementByName("head", true); - if (headElement.hasAttribute("prefix")) - { + if (headElement.hasAttribute("prefix")) { String namespaceData = headElement.getAttributeByName("prefix"); Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*"); Matcher matcher = pattern.matcher(namespaceData); - while (matcher.find()) - { - String prefix = matcher.group(2); + while (matcher.find()) { + String prefix = matcher.group(2); String documentURI = matcher.group(3); pageNamespaces.add(new OpenGraphNamespace(prefix, documentURI)); if (prefix.equals("og")) hasOGspec = true; - } + } } // some pages do not include the new OG spec @@ -115,251 +137,230 @@ public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOExceptio if (!hasOGspec) pageNamespaces.add(new OpenGraphNamespace("og", "http:// ogp.me/ns#")); - // open only the meta tags - TagNode[] metaData = pageData.getElementsByName("meta", true); - for (TagNode metaElement : metaData) - { - for (OpenGraphNamespace namespace : pageNamespaces) - { + // open only the meta tags + TagNode[] metaData = pageData.getElementsByName("meta", true); + for (TagNode metaElement : metaData) { + for (OpenGraphNamespace namespace : pageNamespaces) { String target = null; - if (metaElement.hasAttribute("property")) - target = "property"; - else if (metaElement.hasAttribute("name")) - target = "name"; - - if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) - { - setProperty(namespace, metaElement.getAttributeByName(target), metaElement.getAttributeByName("content")); + if (metaElement.hasAttribute("property")) + target = "property"; + else if (metaElement.hasAttribute("name")) + target = "name"; + + if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) { + setProperty(namespace, metaElement.getAttributeByName(target), + metaElement.getAttributeByName("content")); break; } } - } - - /** - * Check that page conforms to Open Graph protocol - */ - if (!ignoreSpecErrors) - { - for (String req : REQUIRED_META) - { - if (!metaAttributes.containsKey(req)) - throw new Exception("Does not conform to Open Graph protocol"); - } - } - - /** - * Has conformed, now determine basic sub type. - */ - baseType = null; + } + + /** + * Check that page conforms to Open Graph protocol + */ + if (!ignoreSpecErrors) { + for (String req : REQUIRED_META) { + if (!metaAttributes.containsKey(req)) + throw new Exception("Does not conform to Open Graph protocol"); + } + } + + /** + * Has conformed, now determine basic sub type. + */ + baseType = null; String currentType = getContent("type"); // some apps use their OG namespace as a prefix - if (currentType != null) - { - for (OpenGraphNamespace ns : pageNamespaces) - { - if (currentType.startsWith(ns.getPrefix() + ":")) - { - currentType = currentType.replaceFirst(ns.getPrefix() + ":",""); - break; // done here - } - } - } - for (String base : BASE_TYPES.keySet()) - { - String[] baseList = BASE_TYPES.get(base); - boolean finished = false; - for (String expandedType : baseList) - { - if (expandedType.equals(currentType)) - { - baseType = base; - finished = true; - break; - } - } - if (finished) break; - } - - // read the original page url - URL realURL = siteConnection.getURL(); - pageUrl = realURL.toExternalForm(); - } - - /** - * Gets the charset for specified connection. - * Content Type header is parsed to get the charset name. - * - * @param connection the connection. - * @return the Charset object for response charset name; - * if it's not found then the default charset. - */ - private static Charset getConnectionCharset(URLConnection connection) - { - String contentType = connection.getContentType(); - if (contentType != null && contentType.length() > 0) - { - contentType = contentType.toLowerCase(); - String charsetName = extractCharsetName(contentType); - if (charsetName != null && charsetName.length() > 0) - { - try - { - return Charset.forName(charsetName); - } - catch (Exception e) { - // specified charset is not found, - // skip it to return the default one - } - } - } - - // return the default charset - return Charset.defaultCharset(); - } - - /** - * Extract the charset name form the content type string. - * Content type string is received from Content-Type header. - * - * @param contentType the content type string, must be not null. - * @return the found charset name or null if not found. - */ - private static String extractCharsetName(String contentType) - { - // split onto media types - final String[] mediaTypes = contentType.split(":"); - if (mediaTypes.length > 0) - { - // use only the first one, and split it on parameters - final String[] params = mediaTypes[0].split(";"); - - // find the charset parameter and return it's value - for (String each : params) - { - each = each.trim(); - if (each.startsWith("charset=")) - { - // return the charset name - return each.substring(8).trim(); - } - } - } - - return null; - } - - /** - * Get the basic type of the Open graph page as per the specification - * @return Base type as defined by specification, null otherwise - */ - public String getBaseType() - { - return baseType; - } - - /** - * Get a value of a given Open Graph property - * @param property The Open graph property key - * @return Returns the value of the first property defined, null otherwise - */ - public String getContent(String property) - { - if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) + if (currentType != null) { + for (OpenGraphNamespace ns : pageNamespaces) { + if (currentType.startsWith(ns.getPrefix() + ":")) { + currentType = currentType.replaceFirst(ns.getPrefix() + ":", ""); + break; // done here + } + } + } + for (String base : BASE_TYPES.keySet()) { + String[] baseList = BASE_TYPES.get(base); + boolean finished = false; + for (String expandedType : baseList) { + if (expandedType.equals(currentType)) { + baseType = base; + finished = true; + break; + } + } + if (finished) + break; + } + + // read the original page url + URL realURL = siteConnection.getURL(); + pageUrl = realURL.toExternalForm(); + + return statusCode; + } + + /** + * Gets the charset for specified connection. Content Type header is parsed to + * get the charset name. + * + * @param connection the connection. + * @return the Charset object for response charset name; if it's not found then + * the default charset. + */ + private static Charset getConnectionCharset(URLConnection connection) { + String contentType = connection.getContentType(); + if (contentType != null && contentType.length() > 0) { + contentType = contentType.toLowerCase(); + String charsetName = extractCharsetName(contentType); + if (charsetName != null && charsetName.length() > 0) { + try { + return Charset.forName(charsetName); + } catch (Exception e) { + // specified charset is not found, + // skip it to return the default one + } + } + } + + // return the default charset + return Charset.defaultCharset(); + } + + /** + * Extract the charset name form the content type string. Content type string is + * received from Content-Type header. + * + * @param contentType the content type string, must be not null. + * @return the found charset name or null if not found. + */ + private static String extractCharsetName(String contentType) { + // split onto media types + final String[] mediaTypes = contentType.split(":"); + if (mediaTypes.length > 0) { + // use only the first one, and split it on parameters + final String[] params = mediaTypes[0].split(";"); + + // find the charset parameter and return it's value + for (String each : params) { + each = each.trim(); + if (each.startsWith("charset=")) { + // return the charset name + return each.substring(8).trim(); + } + } + } + + return null; + } + + /** + * Get the basic type of the Open graph page as per the specification + * + * @return Base type as defined by specification, null otherwise + */ + public String getBaseType() { + return baseType; + } + + /** + * Get a value of a given Open Graph property + * + * @param property The Open graph property key + * @return Returns the value of the first property defined, null otherwise + */ + public String getContent(String property) { + if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) return metaAttributes.get(property).get(0).getContent(); else return null; - } - - /** - * Get all the defined properties of the Open Graph object - * @return An array of all currently defined properties - */ - public MetaElement[] getProperties() - { + } + + /** + * Get all the defined properties of the Open Graph object + * + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties() { ArrayList allElements = new ArrayList(); - for (ArrayList collection : metaAttributes.values()) + for (ArrayList collection : metaAttributes.values()) allElements.addAll(collection); return (MetaElement[]) allElements.toArray(new MetaElement[allElements.size()]); - } + } - /** - * Get all the defined properties of the Open Graph object + /** + * Get all the defined properties of the Open Graph object + * * @param property The property to focus on - * @return An array of all currently defined properties - */ - public MetaElement[] getProperties(String property) - { - if (metaAttributes.containsKey(property)) - { + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties(String property) { + if (metaAttributes.containsKey(property)) { ArrayList target = metaAttributes.get(property); return (MetaElement[]) target.toArray(new MetaElement[target.size()]); - } - else + } else return null; - } - - /** - * Get the original URL the Open Graph page was obtained from - * @return The address to the Open Graph object page - */ - public String getOriginalUrl() - { - return pageUrl; - } - - - /** - * Get the HTML representation of the Open Graph data. - * @return An array of meta elements as Strings - */ - public String[] toHTML() - { - // allocate the array - ArrayList returnHTML = new ArrayList(); - - int index = 0; // keep track of the index to insert into - for (ArrayList elements : metaAttributes.values()) - { + } + + /** + * Get the original URL the Open Graph page was obtained from + * + * @return The address to the Open Graph object page + */ + public String getOriginalUrl() { + return pageUrl; + } + + /** + * Get the HTML representation of the Open Graph data. + * + * @return An array of meta elements as Strings + */ + public String[] toHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { for (MetaElement element : elements) - returnHTML.add(""); + returnHTML.add(""); } - // return the array - return (String[]) returnHTML.toArray(); - } - - /** - * Get the XHTML representation of the Open Graph data. - * @return An array of meta elements as Strings - */ - public String[] toXHTML() - { - // allocate the array - ArrayList returnHTML = new ArrayList(); - - int index = 0; // keep track of the index to insert into - for (ArrayList elements : metaAttributes.values()) - { + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Get the XHTML representation of the Open Graph data. + * + * @return An array of meta elements as Strings + */ + public String[] toXHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { for (MetaElement element : elements) - returnHTML.add(""); + returnHTML.add(""); } - // return the array - return (String[]) returnHTML.toArray(); - } + // return the array + return (String[]) returnHTML.toArray(); + } - /** - * Set the Open Graph property to a specific value + /** + * Set the Open Graph property to a specific value + * * @param namespace The OpenGraph namespace the content belongs to - * @param property The og:XXXX where XXXX is the property you wish to set - * @param content The value or contents of the property to be set - */ - public void setProperty(OpenGraphNamespace namespace, String property, String content) - { - if (!pageNamespaces.contains(namespace)) + * @param property The og:XXXX where XXXX is the property you wish to set + * @param content The value or contents of the property to be set + */ + public void setProperty(OpenGraphNamespace namespace, String property, String content) { + if (!pageNamespaces.contains(namespace)) pageNamespaces.add(namespace); property = property.replaceAll(namespace.getPrefix() + ":", ""); @@ -368,41 +369,42 @@ public void setProperty(OpenGraphNamespace namespace, String property, String co metaAttributes.put(property, new ArrayList()); metaAttributes.get(property).add(element); - } - - /** - * Removed a defined property - * @param property The og:XXXX where XXXX is the property you wish to remove - */ - public void removeProperty(String property) - { - metaAttributes.remove(property); - } - - /** - * Obtain the underlying HashTable - * @return The underlying structure as a Hashtable - */ - public Hashtable> exposeTable() { - return metaAttributes; - } - - /** - * Test if the Open Graph object was initially a representation of a web page - * @return True if the object is from a web page, false otherwise - */ - public boolean isFromWeb() - { - return isImported; - } - - /** - * Test if the object has been modified by setters/deleters. - * This is only relevant if this object initially represented a web page - * @return True True if the object has been modified, false otherwise - */ - public boolean hasChanged() - { - return hasChanged; - } + } + + /** + * Removed a defined property + * + * @param property The og:XXXX where XXXX is the property you wish to remove + */ + public void removeProperty(String property) { + metaAttributes.remove(property); + } + + /** + * Obtain the underlying HashTable + * + * @return The underlying structure as a Hashtable + */ + public Hashtable> exposeTable() { + return metaAttributes; + } + + /** + * Test if the Open Graph object was initially a representation of a web page + * + * @return True if the object is from a web page, false otherwise + */ + public boolean isFromWeb() { + return isImported; + } + + /** + * Test if the object has been modified by setters/deleters. This is only + * relevant if this object initially represented a web page + * + * @return True True if the object has been modified, false otherwise + */ + public boolean hasChanged() { + return hasChanged; + } } diff --git a/src/test/main/java/org/opengraph/OpenGraphTest.java b/src/test/main/java/org/opengraph/OpenGraphTest.java index e8d5a19..e57da15 100644 --- a/src/test/main/java/org/opengraph/OpenGraphTest.java +++ b/src/test/main/java/org/opengraph/OpenGraphTest.java @@ -9,7 +9,7 @@ public class OpenGraphTest { @Test public void shouldHandleMissingContentType() throws java.lang.Exception { OpenGraph site = new OpenGraph("http://www.bbc.com/future/story/20140428-the-myth-of-tech-revolutions", true); - assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); + assertEquals("Why it�s time to ditch the word �revolution� in tech", site.getContent("title")); assertEquals("624", site.getContent("image:width")); } @@ -18,4 +18,12 @@ public void shouldNoOGMarkup() throws java.lang.Exception { OpenGraph site = new OpenGraph("http://clang.llvm.org/docs/UsersManual.html", true); assertNull(site.getContent("title")); } + @Test + public void testRedirect() throws java.lang.Exception { + OpenGraph site = new OpenGraph("http://www.naver.com", true); // this url should be redirected to https scheme. + assertNotNull(site.getContent("title")); + + site = new OpenGraph("https://bit.ly/2maMYF0", true); + assertNotNull(site.getContent("title")); + } } \ No newline at end of file From 419a86f9985e7fb24b9af89f75f45fc7ac444a71 Mon Sep 17 00:00:00 2001 From: Dong Date: Mon, 30 Sep 2019 14:35:53 +0900 Subject: [PATCH 2/2] fix of erata --- src/test/main/java/org/opengraph/OpenGraphTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/main/java/org/opengraph/OpenGraphTest.java b/src/test/main/java/org/opengraph/OpenGraphTest.java index e57da15..5de1357 100644 --- a/src/test/main/java/org/opengraph/OpenGraphTest.java +++ b/src/test/main/java/org/opengraph/OpenGraphTest.java @@ -9,7 +9,7 @@ public class OpenGraphTest { @Test public void shouldHandleMissingContentType() throws java.lang.Exception { OpenGraph site = new OpenGraph("http://www.bbc.com/future/story/20140428-the-myth-of-tech-revolutions", true); - assertEquals("Why it�s time to ditch the word �revolution� in tech", site.getContent("title")); + assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); assertEquals("624", site.getContent("image:width")); }