diff --git a/build.sbt b/build.sbt index 931e025..c8ff123 100644 --- a/build.sbt +++ b/build.sbt @@ -15,11 +15,12 @@ lazy val root = (project in file(".")) .settings(commons: _*) .settings( libraryDependencies ++= Seq( + "org.specs2" %% "specs2" % "3.7", "com.machinepublishers" % "jbrowserdriver" % "0.14.7", "org.jsoup" % "jsoup" % "1.9.2", "com.typesafe.scala-logging" % "scala-logging_2.11" % "3.4.0", "ch.qos.logback" % "logback-classic" % "1.1.7", "com.rockymadden.stringmetric" % "stringmetric-core_2.11" % "0.27.4", - "io.github.lukehutch" % "fast-classpath-scanner" % "1.9.21" + "io.github.lukehutch" % "fast-classpath-scanner" % "1.9.19" ) - ) +) diff --git a/src/main/scala/eu/unicredit/web/HtmlExtractor.scala b/src/main/scala/eu/unicredit/web/HtmlExtractor.scala index 4b96574..63b98d8 100644 --- a/src/main/scala/eu/unicredit/web/HtmlExtractor.scala +++ b/src/main/scala/eu/unicredit/web/HtmlExtractor.scala @@ -71,9 +71,11 @@ class VisualTagTreeBuilder(headless: Boolean = true, quickRender: Boolean = true .split(";") .filter(_.contains("::")) .map(_.split("::")) - .map { + .flatMap { case Array(prop, value) => - prop -> value + Some(prop -> value) + case Array(prop) => + Some(prop -> "") } .toMap .filterNot(_._1.startsWith("-webkit")) diff --git a/src/main/scala/eu/unicredit/web/Models.scala b/src/main/scala/eu/unicredit/web/Models.scala index f172c40..99f8c38 100644 --- a/src/main/scala/eu/unicredit/web/Models.scala +++ b/src/main/scala/eu/unicredit/web/Models.scala @@ -5,7 +5,8 @@ import org.jsoup.Jsoup import scala.annotation.tailrec import scala.collection.JavaConversions._ import scala.collection.mutable -import scala.util.{ Failure, Success, Try } +import scala.util.Try + /** * Created by fabiofumarola on 24/05/16. 
@@ -125,6 +126,7 @@ object Models { from: Seq[WebList] = Seq.empty) { lazy val urls = elements.flatMap(_.urls) lazy val bfs = elements.flatMap(_.bfs) + } } diff --git a/src/main/scala/eu/unicredit/web/hylien/Distances.scala b/src/main/scala/eu/unicredit/web/hylien/Distances.scala index 9e4c7f4..4d158ca 100644 --- a/src/main/scala/eu/unicredit/web/hylien/Distances.scala +++ b/src/main/scala/eu/unicredit/web/hylien/Distances.scala @@ -1,6 +1,7 @@ package eu.unicredit.web.hylien import com.rockymadden.stringmetric.similarity._ +import eu.unicredit.web.Models.DomNode import scala.collection.mutable import scala.util.Try
@@ -33,6 +34,64 @@ object Distances { Encoder.encode(b).toArray) }
+  /**
+   * Simple Tree Matching score between two DOM trees.
+   *
+   * Despite the name, the result is a similarity rather than a distance: it is the
+   * size of the largest matching between the two trees in which matched nodes share
+   * the same tag name, their parents are matched too, and sibling order is preserved.
+   *
+   * @param a root of the first tree
+   * @param b root of the second tree
+   * @return 0 when the root tags differ, otherwise 1 (the matched roots) plus the
+   *         best order-preserving matching between the children of a and of b
+   */
+  def treeEditDistance(a: DomNode, b: DomNode): Double =
+    if (a.tagName != b.tagName) 0D
+    else {
+      val rows = a.children.size
+      val columns = b.children.size
+      // best(i)(j): best score matching the first i children of a with the first j
+      // children of b. Array.ofDim zero-fills, so row 0 and column 0 need no set-up.
+      val best = Array.ofDim[Double](rows + 1, columns + 1)
+
+      for {
+        row <- 1 to rows
+        column <- 1 to columns
+      } {
+        val skipOne = math.max(best(row)(column - 1), best(row - 1)(column))
+        val pairLast = best(row - 1)(column - 1) +
+          treeEditDistance(a.children(row - 1), b.children(column - 1))
+        best(row)(column) = math.max(skipOne, pairLast)
+      }
+
+      1D + best(rows)(columns)
+    }
+
+  /**
+   * Distance in [0, 1] obtained by normalizing the [[treeEditDistance]] score.
+   *
+   * The score is divided by the average node count of the two trees and the ratio
+   * is subtracted from 1: 0 when every node of both trees is matched, 1 when the
+   * root tags already differ.
+   *
+   * @param a root of the first tree
+   * @param b root of the second tree
+   * @return 1 - score / ((size(a) + size(b)) / 2)
+   */
+  def normalizedTreeEditDistance(a: DomNode, b: DomNode): Double = {
+    // Counts the nodes of a tree with an explicit work list instead of the call stack.
+    @scala.annotation.tailrec
+    def countNodes(pending: List[DomNode], counted: Int): Int = pending match {
+      case Nil => counted
+      case node :: rest => countNodes(node.children.toList ++ rest, counted + 1)
+    }
+
+    val matched = treeEditDistance(a, b)
+    val averageSize = (countNodes(List(a), 0) + countNodes(List(b), 0)).toDouble / 2
+    1 - matched / averageSize
+  }
+
} object Encoder { diff --git a/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala b/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala index db53a4a..046a3ec 100644 --- a/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala +++ b/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala @@ -20,9 +20,11 @@ class VisualHyLiEn(headless: Boolean = true, quickRender: Boolean = true, logReqs = logReqs, browserSize = browserSize) - def extract(url: String, tagSimFactor: Float = 0.4F, maxRecordTags: Int = 30): Seq[WebList] = { + def extract(url: String, tagSimFactor: Float = 0.4F, maxRecordTags: Int = 60): Seq[WebList] = { + val startTime = System.currentTimeMillis() val root = webExtractor.parse(url) - logger.debug(s"parsed $url, start extracting lists") + val totalTime = System.currentTimeMillis() - startTime + logger.info(s"parsed $url in $totalTime millisec, start extracting lists") @tailrec def extract0(notAligned: List[DomNode], acc: List[WebList]): List[WebList] = diff --git a/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala b/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala index 8275fce..c8bb485 100644 --- a/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala +++ b/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala @@ -91,7 +91,8 @@ private[this] object VisualListFinder { //take the head and for the tail filter all the elements similar to the head case head :: tail => head :: tail.filter { n => - val dist = Distances.normalizedEditDistance(head.bfs, n.bfs) + //val dist = Distances.normalizedEditDistance(head.bfs, n.bfs) + val dist = Distances.normalizedTreeEditDistance(head, n) if (dist > minsim) nonSimilar = n :: nonSimilar dist <= minsim } diff --git
a/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala b/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala index 8688a83..dfe05fa 100644 --- a/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala +++ b/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala @@ -9,8 +9,6 @@ import scala.collection.mutable */ object VisualTagTreeBuilderTest extends App { - // val url = "https://ec.europa.eu/research/participants/portal/desktop/en/opportunities/h2020/#c,calls=level3/t/EU.1./0/1/1/default-group&level4/t/EU.1.1./0/1/1/default-group&level4/t/EU.1.2./0/1/1/default-group&level4/t/EU.1.3./0/1/1/default-group&level4/t/EU.1.4./0/1/1/default-group&level3/t/EU.2./0/1/1/default-group&level4/t/EU.2.1./0/1/1/default-group&level5/t/EU.2.1.1./0/1/1/default-group&level5/t/EU.2.1.2./0/1/1/default-group&level5/t/EU.2.1.3./0/1/1/default-group&level5/t/EU.2.1.4./0/1/1/default-group&level5/t/EU.2.1.5./0/1/1/default-group&level5/t/EU.2.1.6./0/1/1/default-group&level4/t/EU.2.2./0/1/1/default-group&level4/t/EU.2.3./0/1/1/default-group&level3/t/EU.3./0/1/1/default-group&level4/t/EU.3.1./0/1/1/default-group&level4/t/EU.3.2./0/1/1/default-group&level4/t/EU.3.3./0/1/1/default-group&level4/t/EU.3.4./0/1/1/default-group&level4/t/EU.3.5./0/1/1/default-group&level4/t/EU.3.6./0/1/1/default-group&level4/t/EU.3.7./0/1/1/default-group&level3/t/EU.4./0/1/1/default-group&level3/t/EU.5./0/1/1/default-group&level3/t/EU.7./0/1/1/default-group&level2/t/Euratom/0/1/1/default-group&hasForthcomingTopics/t/true/1/1/0/default-group&hasOpenTopics/t/true/1/1/0/default-group&allClosedTopics/t/true/0/1/0/default-group&+PublicationDateLong/asc" - // val url = "https://www.stanford.edu/" val url = "http://www.bsvillage.com/Piscine-Fuori-Terra/" val time2 = System.currentTimeMillis() @@ -19,35 +17,13 @@ object VisualTagTreeBuilderTest extends App { println(s"page parsed into ${System.currentTimeMillis() - time2}") parser1.close() - // var time1 = System.currentTimeMillis() - // val visual1 = new 
VisualWebExtractor(true, true) - // (1 to 10).foreach(_ => visual1.parse(url)) - // println(s"page parsed into ${(System.currentTimeMillis() - time1) / 10}") - - // val time3 = System.currentTimeMillis() - // val parser2 = new VisualWebExtractor(true, true) - // parser2.parse(url) - // println(s"page parsed into ${System.currentTimeMillis() - time3}") - // parser2.close() - println(root) } object TagTreeBuilderTest extends App { - // val url = "https://ec.europa.eu/research/participants/portal/desktop/en/opportunities/h2020/#c,calls=level3/t/EU.1./0/1/1/default-group&level4/t/EU.1.1./0/1/1/default-group&level4/t/EU.1.2./0/1/1/default-group&level4/t/EU.1.3./0/1/1/default-group&level4/t/EU.1.4./0/1/1/default-group&level3/t/EU.2./0/1/1/default-group&level4/t/EU.2.1./0/1/1/default-group&level5/t/EU.2.1.1./0/1/1/default-group&level5/t/EU.2.1.2./0/1/1/default-group&level5/t/EU.2.1.3./0/1/1/default-group&level5/t/EU.2.1.4./0/1/1/default-group&level5/t/EU.2.1.5./0/1/1/default-group&level5/t/EU.2.1.6./0/1/1/default-group&level4/t/EU.2.2./0/1/1/default-group&level4/t/EU.2.3./0/1/1/default-group&level3/t/EU.3./0/1/1/default-group&level4/t/EU.3.1./0/1/1/default-group&level4/t/EU.3.2./0/1/1/default-group&level4/t/EU.3.3./0/1/1/default-group&level4/t/EU.3.4./0/1/1/default-group&level4/t/EU.3.5./0/1/1/default-group&level4/t/EU.3.6./0/1/1/default-group&level4/t/EU.3.7./0/1/1/default-group&level3/t/EU.4./0/1/1/default-group&level3/t/EU.5./0/1/1/default-group&level3/t/EU.7./0/1/1/default-group&level2/t/Euratom/0/1/1/default-group&hasForthcomingTopics/t/true/1/1/0/default-group&hasOpenTopics/t/true/1/1/0/default-group&allClosedTopics/t/true/0/1/0/default-group&+PublicationDateLong/asc" - // val url = "https://www.stanford.edu/" val url = "http://www.cs.illinois.edu/directory/faculty" -// var time1 = System.currentTimeMillis() -// val visual1 = new TagTreeBuilder() -// (1 to 10).foreach(_ => visual1.parse(url)) -// println(s"page parsed into ${(System.currentTimeMillis() - time1) / 
10}") -// -// val time2 = System.currentTimeMillis() -// val root = new TagTreeBuilder().parse(url) -// println(s"page parsed into ${System.currentTimeMillis() - time2}") - val time3 = System.currentTimeMillis() val root = new TagTreeBuilder().parse(url) println(s"page parsed into ${System.currentTimeMillis() - time3}") diff --git a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala index d16287d..fd38a36 100644 --- a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala +++ b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala @@ -12,21 +12,7 @@ object HyLiEnTest extends App { headless = true, quickRender = true, logReqs = false, browserSize = BrowserSize(1920, 1080)) - val lists = hylien.extract("http://www.immobiliare.it/44602950-Vendita-Bilocale-via-Pola-2-Milano.html") - - - //("http://www.cs.illinois.edu/directory/faculty?quicktabs_faculty_tabs_new=1#quicktabs-faculty_tabs_new") - - //("http://www.cs.illinois.edu") - - //("https://it.wikipedia.org/wiki/Fiat_Chrysler_Automobiles") - //("http://www.bsvillage.com/Piscine-Fuori-Terra/") - - //("http://www.cs.illinois.edu/directory/faculty") - - //("http://www.cs.ox.ac.uk/") - - // //("http://www.harvard.edu/") //("http://cs.stanford.edu/") + val lists = hylien.extract("http://www.idealista.it/vendita-case/milano-milano/") println(s"Got ${lists.size} lists") lists.foreach { l => @@ -38,6 +24,7 @@ object HyLiEnTest extends App { def toString(l: WebList): String = { val buf = new StringBuilder + buf ++= s"Printing ${l.orientation} of by ${l.elements.size} elements obtained merging ${l.from.size} lists \n" buf ++= s"parent dom tag = ${l.parent.tagName}\n" buf ++= s"location = ${l.location} \n" @@ -65,4 +52,6 @@ object HyLiEnTest extends App { buf.toString() } + + } diff --git a/src/test/scala/hylien/TreeEditDistanceSpecs.scala b/src/test/scala/hylien/TreeEditDistanceSpecs.scala new file mode 100644 index 0000000..75e747c --- /dev/null +++ 
b/src/test/scala/hylien/TreeEditDistanceSpecs.scala
@@ -0,0 +1,88 @@
+package hylien
+
+import eu.unicredit.web.Models.{DomNode, Location, Size}
+import eu.unicredit.web.hylien.Distances
+import org.specs2.mutable.Specification
+import org.specs2.specification.Scope
+
+import scala.collection.mutable
+
+/**
+ * Specification of the tree matching score and of the distance derived from it.
+ *
+ * Created by fabiana on 7/4/16.
+ */
+class TreeEditDistanceSpecs extends Specification {
+
+  // Floating-point results are compared with a tolerance instead of exact equality.
+  private val Tolerance = 1e-9
+
+  /** Fixture with two sample trees, T6 (13 nodes) and T12 (12 nodes). */
+  class Context extends Scope {
+
+    private val ids = Iterator.from(1)
+
+    /** Builds a node; only the tag name and the children matter to the algorithm under test. */
+    def node(tag: String, label: String, kids: DomNode*): DomNode =
+      DomNode(
+        id = ids.next(),
+        parentId = 0,
+        tagName = tag,
+        cssClass = "",
+        cssProperties = Map(),
+        cssSelector = "",
+        location = Location(100, 100),
+        size = Size(100, 100),
+        text = label,
+        html = "",
+        children = mutable.Buffer(kids: _*))
+
+    // Leaves
+    val node_tagA = node("a", "node A")
+    val node_tagB = node("b", "node B")
+    val node_tagC = node("c", "node C")
+    val node_tagD = node("d", "node D")
+    val node_tagE = node("e", "node E")
+    val node_tagF = node("f", "node F")
+    val node_tagG = node("g", "node G")
+    val node_tagH = node("h", "node H")
+    val node_tagI = node("i", "node I")
+
+    // First tree: T6 = a(b(c, d), c(f), d(e), c(g(h, f), f))
+    val T1 = node("b", "T1", node_tagC, node_tagD)
+    val T2 = node("c", "T2", node_tagF)
+    val T3 = node("d", "T3", node_tagE)
+    val T4 = node("g", "T4", node_tagH, node_tagF)
+    val T5 = node("c", "T5", T4, node_tagF)
+    val T6 = node("a", "T6", T1, T2, T3, T5)
+
+    // Second tree: T12 = a(b(c, d), c(g(f, h, i)), d(e(f)))
+    val T7 = node("b", "T7", node_tagC, node_tagD)
+    val T8 = node("g", "T8", node_tagF, node_tagH, node_tagI)
+    val T9 = node("c", "T9", T8)
+    val T10 = node("e", "T10", node_tagF)
+    val T11 = node("d", "T11", T10)
+    val T12 = node("a", "T12", T7, T9, T11)
+  }
+
+  "treeEditDistance" should {
+    "return 7 for two trees with 7 matching nodes" in new Context {
+      Distances.treeEditDistance(T6, T12) must beCloseTo(7D, Tolerance)
+    }
+    "return 0 when the root tags differ" in new Context {
+      Distances.treeEditDistance(node_tagA, node_tagB) must beCloseTo(0D, Tolerance)
+    }
+  }
+
+  "normalizedTreeEditDistance" should {
+    "return 1 - 7 / 12.5 = 0.44 for trees of 13 and 12 nodes with 7 matching nodes" in new Context {
+      Distances.normalizedTreeEditDistance(T6, T12) must beCloseTo(0.44, Tolerance)
+    }
+    "return 0 when a tree is compared with itself" in new Context {
+      Distances.normalizedTreeEditDistance(T6, T6) must beCloseTo(0D, Tolerance)
+    }
+    "return 1 when the root tags differ" in new Context {
+      Distances.normalizedTreeEditDistance(node_tagA, node_tagB) must beCloseTo(1D, Tolerance)
+    }
+  }
+}