diff --git a/.gitignore b/.gitignore index e9aa7e9..3ba916c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,14 @@ #Ignore -bin/* -testreport/* -examples/backtothefuture/build/* -target/* +bin/ +testreport/ +examples/backtothefuture/build/ + +## Maven +target/ + +## IntelliJ +*.iml +.idea + +## Mac OS X +.DS_Store diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..17d470f --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright (c) 2010-2021 John Deverall. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/README.md b/README.md index a4203ae..37ded66 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,11 @@ Due to the lack of useful native DOM parsers this class implements the HTMLClean ## Usage ## In this example we will fetch the og:title and og:type contents, while ignoring any errors if this page does not comply with the Open Graph protocol standard (set in the constructor via true) -> OpenGraph testPage = new OpenGraph("http://uk.rottentomatoes.com/m/1217700-kick_ass", true); +> `OpenGraph testPage = new OpenGraph("http://uk.rottentomatoes.com/m/1217700-kick_ass", true);` -> String title = testPage.getContent("title"); +> `String title = testPage.getContent("title");` -> String type = testPage.getContent("type"); +> `String type = testPage.getContent("type");` Another example (available in the examples/ folder) demonstrates the support for custom OpenGraph namespaces diff --git a/examples/backtothefuture/build.xml b/examples/backtothefuture/build.xml deleted file mode 100644 index e8d4a48..0000000 --- a/examples/backtothefuture/build.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/examples/backtothefuture/src/BackToTheFuture.java b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java similarity index 73% rename from examples/backtothefuture/src/BackToTheFuture.java rename to examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java index 6bd13e7..c822b26 100644 --- a/examples/backtothefuture/src/BackToTheFuture.java +++ b/examples/backtothefuture/src/main/java/org/opengraph/examples/BackToTheFuture.java @@ -1,19 +1,16 @@ package org.opengraph.examples; -import org.opengraph.OpenGraph; import org.opengraph.MetaElement; +import org.opengraph.OpenGraph; public class BackToTheFuture { - static String uri = "http://www.rottentomatoes.com/m/back_to_the_future/"; - - public static void main(String [] args) - { - try - { + static String uri = "https://web.archive.org/web/20110924151516/https://www.rottentomatoes.com/m/back_to_the_future"; + + public static void main(String[] args) { + try { OpenGraph movie = new OpenGraph(uri, true); System.out.println("Movie: " + movie.getContent("title")); - for (MetaElement director : movie.getProperties("director")) - { + for (MetaElement director : movie.getProperties("director")) { OpenGraph extendedInfo = director.getExtendedData(); System.out.println("Directed by: " + extendedInfo.getContent("title")); } diff --git a/examples/pom.xml b/examples/pom.xml new file mode 100644 index 0000000..53dc844 --- /dev/null +++ b/examples/pom.xml @@ -0,0 +1,55 @@ + + 4.0.0 + opengraph-examples + ${project.parent.version} + jar + + + opengraph + opengraph + 0.0.2-SNAPSHOT + + + + + opengraph + opengraph-plugin + 0.0.2-SNAPSHOT + + + + backtothefuture/src/main/java + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + package + + single + + + + + org.opengraph.examples.BackToTheFuture + + + + jar-with-dependencies + + + + + + + + + \ No newline at end of file diff --git a/plugin/pom.xml b/plugin/pom.xml new file mode 100644 index 0000000..b917484 --- /dev/null +++ b/plugin/pom.xml @@ -0,0 +1,24 @@ + + 4.0.0 + opengraph-plugin + ${project.parent.version} + jar + + + opengraph + opengraph + 0.0.2-SNAPSHOT + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + + + \ No newline at end of file diff --git a/plugin/src/main/java/org/opengraph/MetaElement.java b/plugin/src/main/java/org/opengraph/MetaElement.java new file mode 100644 index 0000000..d7e8916 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/MetaElement.java @@ -0,0 +1,65 @@ +package org.opengraph; + +import java.net.URL; + +/** + * Represents OpenGraph enabled meta data for a specific document + * + * @author Callum Jones + */ +public class MetaElement { + private OpenGraphNamespace namespace; //either "og" an NS specific + private String property; + private String content; + + /** + * Construct the representation of an element + * + * @param namespace The namespace the element belongs to + * @param property The property key + * @param content The content or value of this element + */ + public MetaElement(OpenGraphNamespace namespace, String property, String content) { + this.namespace = namespace; + this.property = property; + this.content = content; + } + + /** + * Fetch the content string of the element + */ + public String getContent() { + return content; + } + + /** + * Fetch the OpenGraph namespace + */ + public OpenGraphNamespace getNamespace() { + return namespace; + } + + /** + * Fetch the property of the element + */ + public String getProperty() { + return property; + } + + /** + * Fetch the OpenGraph data from the object + * + * @return If the content is a URL, then an attempted will be made to build OpenGraph data from the object + */ + public OpenGraph getExtendedData() { + //The Java language should know the best form of a URL + try { + URL url = new URL(getContent()); + + //success + return new OpenGraph(url.toString(), true); + } catch (Exception e) { + return null; //not a valid URL + } + } +} \ No newline at end of file diff --git a/plugin/src/main/java/org/opengraph/OpenGraph.java b/plugin/src/main/java/org/opengraph/OpenGraph.java new file mode 100644 index 0000000..766a932 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/OpenGraph.java @@ -0,0 +1,404 @@ +package org.opengraph; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; + +/** + * A Java object representation of an Open Graph enabled webpage. + * A simplified layer over a Hastable. + * + * @author Callum Jones + */ +public class OpenGraph { + private String pageUrl; + private ArrayList pageNamespaces; + private Hashtable> metaAttributes; + private String baseType; + private boolean isImported; // determine if the object is a new incarnation or representation of a web page + private boolean hasChanged; // track if object has been changed + + public final static String[] REQUIRED_META = new String[] {"title", "type", "image", "url"}; + + public final static Hashtable BASE_TYPES = new Hashtable(); + + static { + BASE_TYPES.put("activity", new String[] {"activity", "sport"}); + BASE_TYPES.put("business", new String[] {"bar", "company", "cafe", "hotel", "restaurant"}); + BASE_TYPES.put("group", new String[] {"cause", "sports_league", "sports_team"}); + BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"}); + BASE_TYPES.put("person", + new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"}); + BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"}); + BASE_TYPES + .put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"}); + BASE_TYPES.put("website", new String[] {"blog", "website", "article"}); + } + + /** + * Create an open graph representation for generating your own Open Graph object + */ + public OpenGraph() { + pageNamespaces = new ArrayList(); + metaAttributes = new Hashtable>(); + hasChanged = false; + isImported = false; + } + + /** + * Fetch the open graph representation from a web site + * + * @param url The address to the web page to fetch Open Graph data + * @param ignoreSpecErrors Set this option to true if you don't wish to have an exception throw if the page does not conform to the basic 4 attributes + * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception + * @throws java.lang.Exception A generic exception is throw if the specific page fails to conform to the basic Open Graph standard as define by the constant REQUIRED_META + */ + public OpenGraph(String url, boolean ignoreSpecErrors) throws java.io.IOException, Exception { + this(); + isImported = true; + + + // download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content + System.out.println(url); + URL pageURL = new URL(url); + HttpURLConnection siteConnection = (HttpURLConnection) pageURL.openConnection(); + siteConnection.connect(); + if(siteConnection.getHeaderField("Location") != null ) + { + String redirect = siteConnection.getHeaderField("Location"); + siteConnection.disconnect(); + pageURL = new URL(redirect); + siteConnection = (HttpURLConnection) pageURL.openConnection(); + siteConnection.connect(); + } + Charset charset = getConnectionCharset(siteConnection); + BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); + + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) { + if (inputLine.contains("")) { + inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); + inputLine = inputLine.concat(""); + headContents.append(inputLine + "\r\n"); + break; + } + headContents.append(inputLine + "\r\n"); + } + + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); + + // read in the declared namespaces + boolean hasOGspec = false; + TagNode headElement = pageData.findElementByName("head", true); + if (headElement.hasAttribute("prefix")) { + String namespaceData = headElement.getAttributeByName("prefix"); + Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*"); + Matcher matcher = pattern.matcher(namespaceData); + while (matcher.find()) { + String prefix = matcher.group(2); + String documentURI = matcher.group(3); + pageNamespaces.add(new OpenGraphNamespace(prefix, documentURI)); + if (prefix.equals("og")) { + hasOGspec = true; + } + } + } + + // some pages do not include the new OG spec + // this fixes compatibility + if (!hasOGspec) { + pageNamespaces.add(new OpenGraphNamespace("og", "http:// ogp.me/ns#")); + } + + // open only the meta tags + TagNode[] metaData = pageData.getElementsByName("meta", true); + for (TagNode metaElement : metaData) { + for (OpenGraphNamespace namespace : pageNamespaces) { + String target = null; + if (metaElement.hasAttribute("property")) { + target = "property"; + } else if (metaElement.hasAttribute("name")) { + target = "name"; + } + + if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) { + setProperty(namespace, metaElement.getAttributeByName(target), metaElement.getAttributeByName("content")); + break; + } + } + } + + /** + * Check that page conforms to Open Graph protocol + */ + if (!ignoreSpecErrors) { + for (String req : REQUIRED_META) { + if (!metaAttributes.containsKey(req)) { + throw new Exception("Does not conform to Open Graph protocol"); + } + } + } + + /** + * Has conformed, now determine basic sub type. + */ + baseType = null; + String currentType = getContent("type"); + // some apps use their OG namespace as a prefix + if (currentType != null) { + for (OpenGraphNamespace ns : pageNamespaces) { + if (currentType.startsWith(ns.getPrefix() + ":")) { + currentType = currentType.replaceFirst(ns.getPrefix() + ":", ""); + break; // done here + } + } + } + for (String base : BASE_TYPES.keySet()) { + String[] baseList = BASE_TYPES.get(base); + boolean finished = false; + for (String expandedType : baseList) { + if (expandedType.equals(currentType)) { + baseType = base; + finished = true; + break; + } + } + if (finished) { + break; + } + } + + // read the original page url + URL realURL = siteConnection.getURL(); + pageUrl = realURL.toExternalForm(); + } + + /** + * Gets the charset for specified connection. + * Content Type header is parsed to get the charset name. + * + * @param connection the connection. + * @return the Charset object for response charset name; + * if it's not found then the default charset. + */ + private static Charset getConnectionCharset(HttpURLConnection connection) { + String contentType = connection.getContentType(); + if (contentType != null && contentType.length() > 0) { + contentType = contentType.toLowerCase(); + String charsetName = extractCharsetName(contentType); + if (charsetName != null && charsetName.length() > 0) { + try { + return Charset.forName(charsetName); + } catch (Exception e) { + // specified charset is not found, + // skip it to return the default one + } + } + } + + // return the default charset + return Charset.defaultCharset(); + } + + /** + * Extract the charset name form the content type string. + * Content type string is received from Content-Type header. + * + * @param contentType the content type string, must be not null. + * @return the found charset name or null if not found. + */ + private static String extractCharsetName(String contentType) { + // split onto media types + final String[] mediaTypes = contentType.split(":"); + if (mediaTypes.length > 0) { + // use only the first one, and split it on parameters + final String[] params = mediaTypes[0].split(";"); + + // find the charset parameter and return it's value + for (String each : params) { + each = each.trim(); + if (each.startsWith("charset=")) { + // return the charset name + return each.substring(8).trim(); + } + } + } + + return null; + } + + /** + * Get the basic type of the Open graph page as per the specification + * + * @return Base type as defined by specification, null otherwise + */ + public String getBaseType() { + return baseType; + } + + /** + * Get a value of a given Open Graph property + * + * @param property The Open graph property key + * @return Returns the value of the first property defined, null otherwise + */ + public String getContent(String property) { + if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) { + return metaAttributes.get(property).get(0).getContent(); + } else { + return null; + } + } + + /** + * Get all the defined properties of the Open Graph object + * + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties() { + ArrayList allElements = new ArrayList(); + for (ArrayList collection : metaAttributes.values()) { + allElements.addAll(collection); + } + + return (MetaElement[]) allElements.toArray(new MetaElement[allElements.size()]); + } + + /** + * Get all the defined properties of the Open Graph object + * + * @param property The property to focus on + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties(String property) { + if (metaAttributes.containsKey(property)) { + ArrayList target = metaAttributes.get(property); + return (MetaElement[]) target.toArray(new MetaElement[target.size()]); + } else { + return null; + } + } + + /** + * Get the original URL the Open Graph page was obtained from + * + * @return The address to the Open Graph object page + */ + public String getOriginalUrl() { + return pageUrl; + } + + + /** + * Get the HTML representation of the Open Graph data. + * + * @return An array of meta elements as Strings + */ + public String[] toHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { + for (MetaElement element : elements) { + returnHTML.add(""); + } + } + + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Get the XHTML representation of the Open Graph data. + * + * @return An array of meta elements as Strings + */ + public String[] toXHTML() { + // allocate the array + ArrayList returnHTML = new ArrayList(); + + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { + for (MetaElement element : elements) { + returnHTML.add(""); + } + } + + // return the array + return (String[]) returnHTML.toArray(); + } + + /** + * Set the Open Graph property to a specific value + * + * @param namespace The OpenGraph namespace the content belongs to + * @param property The og:XXXX where XXXX is the property you wish to set + * @param content The value or contents of the property to be set + */ + public void setProperty(OpenGraphNamespace namespace, String property, String content) { + if (!pageNamespaces.contains(namespace)) { + pageNamespaces.add(namespace); + } + + property = property.replaceAll(namespace.getPrefix() + ":", ""); + MetaElement element = new MetaElement(namespace, property, content); + if (!metaAttributes.containsKey(property)) { + metaAttributes.put(property, new ArrayList()); + } + + metaAttributes.get(property).add(element); + } + + /** + * Removed a defined property + * + * @param property The og:XXXX where XXXX is the property you wish to remove + */ + public void removeProperty(String property) { + metaAttributes.remove(property); + } + + /** + * Obtain the underlying HashTable + * + * @return The underlying structure as a Hashtable + */ + public Hashtable> exposeTable() { + return metaAttributes; + } + + /** + * Test if the Open Graph object was initially a representation of a web page + * + * @return True if the object is from a web page, false otherwise + */ + public boolean isFromWeb() { + return isImported; + } + + /** + * Test if the object has been modified by setters/deleters. + * This is only relevant if this object initially represented a web page + * + * @return True True if the object has been modified, false otherwise + */ + public boolean hasChanged() { + return hasChanged; + } +} diff --git a/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java b/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java new file mode 100644 index 0000000..64643f4 --- /dev/null +++ b/plugin/src/main/java/org/opengraph/OpenGraphNamespace.java @@ -0,0 +1,36 @@ +package org.opengraph; + +/** + * Represents an OpenGraph namespace + * + * @author Callum Jones + */ +public class OpenGraphNamespace { + private String prefix; + private String schemaURI; + + /** + * Construct a namespace + * + * @param prefix The OpenGraph assigned namespace prefix such as og or og_appname + * @param schemaURI The URL for the OpenGraph schema + */ + public OpenGraphNamespace(String prefix, String schemaURI) { + this.prefix = prefix; + this.schemaURI = schemaURI; + } + + /* + * Fetch the prefix used for the namespace + */ + public String getPrefix() { + return prefix; + } + + /* + * Fetch the address for the schema reference + */ + public String getSchemaURI() { + return schemaURI; + } +} \ No newline at end of file diff --git a/src/test/main/java/org/opengraph/OpenGraphTest.java b/plugin/src/test/java/org/opengraph/OpenGraphTest.java similarity index 57% rename from src/test/main/java/org/opengraph/OpenGraphTest.java rename to plugin/src/test/java/org/opengraph/OpenGraphTest.java index e8d5a19..fd45453 100644 --- a/src/test/main/java/org/opengraph/OpenGraphTest.java +++ b/plugin/src/test/java/org/opengraph/OpenGraphTest.java @@ -1,15 +1,16 @@ package org.opengraph; -import org.junit.Test; -import org.opengraph.OpenGraph; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + -import static org.junit.Assert.*; +import org.junit.Test; public class OpenGraphTest { @Test public void shouldHandleMissingContentType() throws java.lang.Exception { - OpenGraph site = new OpenGraph("http://www.bbc.com/future/story/20140428-the-myth-of-tech-revolutions", true); - assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); + OpenGraph site = new OpenGraph("https://www.bbc.com/future/article/20140428-the-myth-of-tech-revolutions", true); + assertEquals("Why it’s time to ditch the word ‘revolution’ in tech", site.getContent("title")); assertEquals("624", site.getContent("image:width")); } diff --git a/pom.xml b/pom.xml index 3281a04..d1c6d97 100644 --- a/pom.xml +++ b/pom.xml @@ -1,41 +1,122 @@ - 4.0.0 - OpenGraph - OpenGraph - 0.0.1-SNAPSHOT - A Facebook OpenGraph implementation for Java - OpenGraph for Java + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 + opengraph + opengraph + 0.0.2-SNAPSHOT + pom + A Facebook OpenGraph implementation for Java + OpenGraph for Java - - - org.hamcrest - hamcrest-core - 1.3 - - - net.sourceforge.htmlcleaner - htmlcleaner - 2.16 - - - junit - junit - 4.12 - - + + 1.8 + 1.8 + UTF-8 + + + + plugin + examples + + + + org.hamcrest + hamcrest-core + 1.3 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.24 + + + junit + junit + 4.13.1 + + + + + + BSD-3-Clause + https://opensource.org/licenses/BSD-3-Clause + Best guess license from repo comments. Compatible with the HTMLCleaner BSD-3-Clause license. + + manual + + + + + + John Deverall + johndeverall@gmail.com + https://github.com/johndeverall + John Deverall + https://github.com/johndeverall + + developer + + Pacific/Auckland + + + + + + Callum Jones + https://github.com/callumj + Callum Jones + https://github.com/callumj/ + + contributor + + America/Los_Angeles + + + Ruslan Khmelyuk + ruslan@khmelyuk.xyz + http://www.khmelyuk.com/ + Ruslan Khmelyuk + http://www.khmelyuk.com/ + + contributor + + America/Los_Angeles + + + Niall Kennedy + niall@niallkennedy.com + https://www.niallkennedy.com/blog/ + Niall Kennedy + https://www.niallkennedy.com/blog/ + + contributor + + America/Los_Angeles + + @niall + + + + Timothy Stone + javafueled@gmail.com + https://github.com/timothystone + Timothy Stone + https://www.anothercaffeinatedday.com/ + + contributor + + America/New_York + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + - - src - - - maven-compiler-plugin - 3.3 - - - - - - - \ No newline at end of file