From a6558fdb2ada0135b9e3a21a6b41d8e4223f2551 Mon Sep 17 00:00:00 2001 From: fabiana Date: Tue, 31 May 2016 22:15:56 +0200 Subject: [PATCH 1/9] added html information of DomNode --- .../eu/unicredit/web/HtmlExtractor.scala | 6 ++-- src/main/scala/eu/unicredit/web/Models.scala | 30 +++++++++++++++++-- .../unicredit/web/hylien/PageListfinder.scala | 6 ++-- .../eu/unicredit/web/HtmlExtractorTest.scala | 5 ++-- .../scala/eu/unicredit/web/HyLiEnTest.scala | 10 +++++-- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/main/scala/eu/unicredit/web/HtmlExtractor.scala b/src/main/scala/eu/unicredit/web/HtmlExtractor.scala index fb9a6e2..4615fcd 100644 --- a/src/main/scala/eu/unicredit/web/HtmlExtractor.scala +++ b/src/main/scala/eu/unicredit/web/HtmlExtractor.scala @@ -63,7 +63,8 @@ class VisualTagTreeBuilder(headless: Boolean = true, quickRender: Boolean = true cssSelector = noCssSelector, location = Location(e.getLocation.x, e.getLocation.y), size = Size(e.getSize.width, e.getSize.height), - text = e.getText) + text = e.getText, + html = e.getAttribute("outerHTML")) def close() = driver.close() @@ -97,5 +98,6 @@ class TagTreeBuilder extends WebExtractor { cssSelector = e.cssSelector(), location = noLocation, size = noSize, - text = e.ownText()) + text = e.ownText(), + html = e.html()) } diff --git a/src/main/scala/eu/unicredit/web/Models.scala b/src/main/scala/eu/unicredit/web/Models.scala index e4193db..d5b36dd 100644 --- a/src/main/scala/eu/unicredit/web/Models.scala +++ b/src/main/scala/eu/unicredit/web/Models.scala @@ -1,7 +1,11 @@ package eu.unicredit.web +import org.jsoup.Jsoup + import scala.annotation.tailrec +import scala.collection.JavaConverters._ import scala.collection.mutable +import scala.util.{Failure, Success, Try} /** * Created by fabiofumarola on 24/05/16. @@ -26,8 +30,26 @@ object Models { location: Location, size: Size, text: String, - children: mutable.Buffer[DomNode] = mutable.Buffer.empty[DomNode]) { + children: mutable.Buffer[DomNode] = mutable.Buffer.empty[DomNode], + html: String) { lazy val bfs = DomNode.bfs(this) + + def getUrls(html: String): Seq[String] = { + val tryhtml = Try { + Jsoup.parse(html) + .select("a[href]") + .asScala + .map(link => link.attr("href")) + .toList + .filter(s => s.size > 0) + } + tryhtml match { + case Success(lists) => lists + case Failure(ex) => List() + } + } + + lazy val urls = getUrls(html) } object DomNode { @@ -56,7 +78,11 @@ object Models { case class WebList( parent: DomNode, orientation: Orientation, - elements: Seq[DomNode]) + location: Location, + size: Size, + elements: Seq[DomNode]){ + lazy val urls = elements.flatMap(n => n.urls) + } } diff --git a/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala b/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala index 7721739..c4cef66 100644 --- a/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala +++ b/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala @@ -27,14 +27,14 @@ private[this] object VisualListFinder { case (pos, list) => val (similar, nonSimilar) = VisualListFinder.structuralFilter(list, minsim) notAligned = notAligned ++ nonSimilar - pos -> WebList(domNode, Orientation.vertical, similar) + pos -> WebList(domNode, Orientation.vertical, domNode.location, domNode.size ,similar) }.filter(_._2.elements.size > 1).values val horizontalList = horizontalAligned.map { case (pos, list) => val (similar, nonSimilar) = VisualListFinder.structuralFilter(list, minsim) notAligned = notAligned ++ nonSimilar - pos -> WebList(domNode, Orientation.horizontal, similar) + pos -> WebList(domNode, Orientation.horizontal, domNode.location, domNode.size, similar) }.filter(_._2.elements.size > 1).values (verticalList ++ horizontalList toSeq, notAligned.toSeq) @@ -86,7 +86,7 @@ private[this] object VisualListFinder { } } - if (similar.size > 2) (similar, nonSimilar) + if (similar.size > 1) (similar, nonSimilar) else (Seq.empty, similar ++ nonSimilar) } diff --git a/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala b/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala index 4d23d3f..36e4b5e 100644 --- a/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala +++ b/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala @@ -64,7 +64,9 @@ object DomNodeTest extends App { cssSelector = "", location = noLocation, size = noSize, - text = "" + text = "", + html = "
\n MY.CS\n
" + ) val child1 = root.copy(id = 1, tagName = "b", children = mutable.Buffer.empty[DomNode]) @@ -76,7 +78,6 @@ object DomNodeTest extends App { child2.children.append(child3) val bfs = DomNode.bfs(root) - println(bfs) } \ No newline at end of file diff --git a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala index b18cd08..e7e7942 100644 --- a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala +++ b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala @@ -9,7 +9,7 @@ import eu.unicredit.web.hylien.VisualHyLiEn object HyLiEnTest extends App { val hylien = new VisualHyLiEn() - val result = hylien.extract("http://www.harvard.edu/") + val result = hylien.extract("http://www.cs.illinois.edu") result.foreach { l => val r = toString(l) @@ -23,12 +23,18 @@ object HyLiEnTest extends App { val buf = new StringBuilder buf ++= s"Printing ${l.orientation} of by ${l.elements.size} elements \n" buf ++= s"parent dom tag = ${l.parent.tagName} \n" + buf ++= s"location = ${l.location} \n" + buf ++= s"location = ${l.size} \n" l.elements.foreach { n => buf ++= s"\t tag = ${n.tagName} \n" buf ++= s"\t text = || ${n.text.replace("\n", " ")} || \n" - buf ++= s"\t bfs = ${n.bfs}" + //buf ++= s"\t html = || ${n.html} || \n" + buf ++= s"\t bfs = ${n.bfs}\n" + buf ++= s"\t urls = ${n.urls}\n" buf ++= "----------------------- \n" } buf.toString() } + + } From 8b5246bb92a1a1bd0b1859a590e8d2089c77722f Mon Sep 17 00:00:00 2001 From: fabiana Date: Thu, 2 Jun 2016 10:57:59 +0200 Subject: [PATCH 2/9] first commit --- build.sbt | 3 ++- src/main/scala/eu/unicredit/web/Models.scala | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/build.sbt b/build.sbt index 913331d..e544087 100644 --- a/build.sbt +++ b/build.sbt @@ -20,5 +20,6 @@ lazy val root = (project in file(".")) "com.typesafe.scala-logging" % "scala-logging_2.11" % "3.4.0", "ch.qos.logback" % "logback-classic" % "1.1.7", "com.rockymadden.stringmetric" % "stringmetric-core_2.11" % "0.27.4" - ) + +) ) diff --git a/src/main/scala/eu/unicredit/web/Models.scala b/src/main/scala/eu/unicredit/web/Models.scala index d5b36dd..8a02edb 100644 --- a/src/main/scala/eu/unicredit/web/Models.scala +++ b/src/main/scala/eu/unicredit/web/Models.scala @@ -37,11 +37,7 @@ object Models { def getUrls(html: String): Seq[String] = { val tryhtml = Try { Jsoup.parse(html) - .select("a[href]") - .asScala - .map(link => link.attr("href")) - .toList - .filter(s => s.size > 0) + } tryhtml match { case Success(lists) => lists From 976abe9c6e96d91be3e82ba60597a2b29627784f Mon Sep 17 00:00:00 2001 From: fabiana Date: Thu, 2 Jun 2016 11:00:56 +0200 Subject: [PATCH 3/9] first commit --- src/main/scala/eu/unicredit/web/Models.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/scala/eu/unicredit/web/Models.scala b/src/main/scala/eu/unicredit/web/Models.scala index 8a02edb..d5b36dd 100644 --- a/src/main/scala/eu/unicredit/web/Models.scala +++ b/src/main/scala/eu/unicredit/web/Models.scala @@ -37,7 +37,11 @@ object Models { def getUrls(html: String): Seq[String] = { val tryhtml = Try { Jsoup.parse(html) - + .select("a[href]") + .asScala + .map(link => link.attr("href")) + .toList + .filter(s => s.size > 0) } tryhtml match { case Success(lists) => lists From 67423432dbcd15e2aad34baf991edde2e8edc641 Mon Sep 17 00:00:00 2001 From: fabiana Date: Mon, 4 Jul 2016 12:29:48 +0200 Subject: [PATCH 4/9] added tree edit distance plus test --- build.sbt | 4 +- .../eu/unicredit/web/hylien/Distances.scala | 54 ++++ .../scala/hylien/TreeEditDistanceSpecs.scala | 287 ++++++++++++++++++ 3 files changed, 343 insertions(+), 2 deletions(-) create mode 100644 src/test/scala/hylien/TreeEditDistanceSpecs.scala diff --git a/build.sbt b/build.sbt index 6119e39..1bf8497 100644 --- a/build.sbt +++ b/build.sbt @@ -15,6 +15,7 @@ lazy val root = (project in file(".")) .settings(commons: _*) .settings( libraryDependencies ++= Seq( + "org.specs2" %% "specs2" % "3.7", "com.machinepublishers" % "jbrowserdriver" % "0.14.7", "org.jsoup" % "jsoup" % "1.9.2", "com.typesafe.scala-logging" % "scala-logging_2.11" % "3.4.0", @@ -22,5 +23,4 @@ lazy val root = (project in file(".")) "com.rockymadden.stringmetric" % "stringmetric-core_2.11" % "0.27.4", "io.github.lukehutch" % "fast-classpath-scanner" % "1.9.21" ) ->>>>>>> upstream/master - ) +) diff --git a/src/main/scala/eu/unicredit/web/hylien/Distances.scala b/src/main/scala/eu/unicredit/web/hylien/Distances.scala index 9e4c7f4..8f3b70e 100644 --- a/src/main/scala/eu/unicredit/web/hylien/Distances.scala +++ b/src/main/scala/eu/unicredit/web/hylien/Distances.scala @@ -1,6 +1,7 @@ package eu.unicredit.web.hylien import com.rockymadden.stringmetric.similarity._ +import eu.unicredit.web.Models.DomNode import scala.collection.mutable import scala.util.Try @@ -33,6 +34,59 @@ object Distances { Encoder.encode(b).toArray) } + /** + * Implement the simple tree matching algorithm + * + * @param a + * @param b + * @return + */ + def treeEditDistance (a: DomNode, b:DomNode): Double = { + + a.tagName.equals(b.tagName) match { + case false => 0D + case true => + val num_rows = a.children.size + 1 + val num_columns = b.children.size + 1 + val matchMatrix = Array.ofDim[Double](num_rows, num_columns) + + //Initialize 0th row and 0th column + matchMatrix.indices.foreach(row => matchMatrix(row)(0) = 0D) + matchMatrix(0).indices.foreach(column => matchMatrix(0)(column) = 0D) + + val pairs = for{ + row <- 1 until num_rows + column <- 1 until num_columns + } yield (row, column) + + pairs.foreach { + case (row, column) => + val left_distance = matchMatrix(row)(column - 1) + val up_distance = matchMatrix(row - 1)(column) + val diagonal_distance = matchMatrix(row - 1)(column - 1) + treeEditDistance(a.children(row - 1), b.children(column - 1)) + val bestDistance = List(left_distance, up_distance, diagonal_distance).max + matchMatrix(row)(column) = bestDistance + } + 1D + matchMatrix(matchMatrix.length - 1)(matchMatrix(0).length - 1) + } + } + + def normalizedTreeEditDistance (a: DomNode, b:DomNode) : Double = { + def getSize0(nodes: List[DomNode], acc:Int): Int = { + nodes match { + case List() => acc + case h::tail => getSize0(h.children.toList ++ tail, acc+1) + } + } + def getSize(tree: DomNode): Int = { + getSize0(List(tree), 0) + } + + val ted = treeEditDistance(a,b) + val avgNodes = (getSize(a) + getSize(b)).toDouble /2 + ted.toDouble / avgNodes + } + } object Encoder { diff --git a/src/test/scala/hylien/TreeEditDistanceSpecs.scala b/src/test/scala/hylien/TreeEditDistanceSpecs.scala new file mode 100644 index 0000000..3bbb838 --- /dev/null +++ b/src/test/scala/hylien/TreeEditDistanceSpecs.scala @@ -0,0 +1,287 @@ +package hylien + +import eu.unicredit.web.Models.{DomNode, Location, Size} + +import scala.collection.mutable + + +/** + * Created by fabiana on 7/4/16. + */ +class TreeEditDistanceSpecs extends Specification { + class Context extends Scope { + + val node_tagA = DomNode(id =1, + parentId=0, + tagName = "a", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node A", + html = "") + + val node_tagB = DomNode(id = 2, + parentId=0, + tagName = "b", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node B", + html = "") + + val node_tagC = DomNode(id = 3, + parentId=0, + tagName = "c", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node C", + html = "") + + val node_tagD = DomNode(id = 2, + parentId=0, + tagName = "d", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node D", + html = "") + + val node_tagE = DomNode(id = 2, + parentId=0, + tagName = "e", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node E", + html = "") + + val node_tagF = DomNode(id = 2, + parentId=0, + tagName = "f", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node F", + html = "") + + val node_tagG = DomNode(id = 2, + parentId=0, + tagName = "g", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node G", + html = "") + + val node_tagH = DomNode(id = 2, + parentId=0, + tagName = "h", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node H", + html = "") + + val node_tagI = DomNode(id = 2, + parentId=0, + tagName = "i", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "node I", + html = "") + + val T1 = DomNode(id = 2, + parentId=0, + tagName = "b", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T1", + html = "T1", + children = mutable.Buffer(node_tagC, node_tagD) + ) + val T2 = DomNode(id = 2, + parentId=0, + tagName = "c", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T2", + html = "", + children = mutable.Buffer(node_tagF) + ) + + val T3 = DomNode(id = 2, + parentId=0, + tagName = "d", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T3", + html = "", + children = mutable.Buffer(node_tagE) + ) + + val T4 = DomNode(id = 2, + parentId=0, + tagName = "g", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T4", + html = "", + children = mutable.Buffer(node_tagH, node_tagF) + ) + + val T5 = DomNode(id = 2, + parentId=0, + tagName = "c", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T5", + html = "", + children = mutable.Buffer(T4, node_tagF) + ) + + val T6 = DomNode(id = 2, + parentId=0, + tagName = "a", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T6", + html = "", + children = mutable.Buffer(T1, T2, T3, T5) + ) + + val T7 = DomNode(id = 2, + parentId=0, + tagName = "b", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T7", + html = "", + children = mutable.Buffer(node_tagC, node_tagD) + ) + + val T8 = DomNode(id = 2, + parentId=0, + tagName = "g", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T8", + html = "", + children = mutable.Buffer( node_tagF, node_tagH, node_tagI) + ) + + val T9 = DomNode(id = 2, + parentId=0, + tagName = "c", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T9", + html = "", + children = mutable.Buffer(T8) + ) + + val T10 = DomNode(id = 2, + parentId=0, + tagName = "e", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T10", + html = "", + children = mutable.Buffer( node_tagF) + ) + + val T11 = DomNode(id = 2, + parentId=0, + tagName = "d", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T11", + html = "", + children = mutable.Buffer(T10) + ) + + val T12 = DomNode(id = 2, + parentId=0, + tagName = "a", + cssClass ="", + cssProperties = Map(), + cssSelector = "", + location = Location(100, 100), + size = Size(100,100), + text= "T12", + html = "", + children = mutable.Buffer(T7, T9, T11) + ) + + } + + "TreeEditDistance" should { + "returns a score of 7.0 when you compare 2 trees having 7 nodes in common" in new Context{ + Distances.treeEditDistance(T6, T12) === 7D + } + "returns a score of 0.0 when you compare 2 trees having no common nodes" in new Context{ + Distances.treeEditDistance(node_tagA, node_tagB) === 0 + } + } + + "NormalizedTreeEditDistance" should { + "returns a score of 0.56 when compare 2 trees having 7 nodes in common and size 13 and 12 respectively" in new Context{ + Distances.normalizedTreeEditDistance(T6, T12) === 0.56 + } + "returns a score of 1 when a tree is compared with itself" in new Context { + Distances.normalizedTreeEditDistance(T6,T6) === 1 + } + } +} From c73ca459e11e4dd4960e3ea4071c241816c2341b Mon Sep 17 00:00:00 2001 From: fabiana Date: Mon, 4 Jul 2016 12:35:58 +0200 Subject: [PATCH 5/9] merged fabio fork --- src/main/scala/eu/unicredit/web/Models.scala | 13 +-- .../unicredit/web/hylien/PageListfinder.scala | 99 ------------------- 2 files changed, 2 insertions(+), 110 deletions(-) delete mode 100644 src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala diff --git a/src/main/scala/eu/unicredit/web/Models.scala b/src/main/scala/eu/unicredit/web/Models.scala index 2c80a1b..758fbcf 100644 --- a/src/main/scala/eu/unicredit/web/Models.scala +++ b/src/main/scala/eu/unicredit/web/Models.scala @@ -3,15 +3,10 @@ package eu.unicredit.web import org.jsoup.Jsoup import scala.annotation.tailrec -<<<<<<< HEAD -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.util.{Failure, Success, Try} -======= import scala.collection.JavaConversions._ import scala.collection.mutable import scala.util.{ Failure, Success, Try } ->>>>>>> upstream/master + /** * Created by fabiofumarola on 24/05/16. @@ -147,15 +142,11 @@ object Models { orientation: Orientation, location: Location, size: Size, -<<<<<<< HEAD - elements: Seq[DomNode]){ - lazy val urls = elements.flatMap(n => n.urls) -======= elements: Seq[DomNode], from: Seq[WebList] = Seq.empty) { lazy val urls = elements.flatMap(_.urls) lazy val bfs = elements.flatMap(_.bfs) ->>>>>>> upstream/master + } } diff --git a/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala b/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala deleted file mode 100644 index c4cef66..0000000 --- a/src/main/scala/eu/unicredit/web/hylien/PageListfinder.scala +++ /dev/null @@ -1,99 +0,0 @@ -package eu.unicredit.web.hylien - -import eu.unicredit.web.Models._ -import scala.language.postfixOps - -import scala.util.Random - -private[this] object VisualListFinder { - /** - * - * @param domNode - * @param minsim - * @param maxRecordTags - * @return return a tuple with lists and non aligned nodes - */ - def find(domNode: DomNode, minsim: Float, maxRecordTags: Int): (Seq[WebList], Seq[DomNode]) = { - - val verticalAligned: Map[Int, Seq[DomNode]] = - VisualListFinder.verticallyAligned(domNode, maxRecordTags) - - val horizontalAligned: Map[Int, Seq[DomNode]] = - VisualListFinder.horizontallyAligned(domNode, maxRecordTags) - - var notAligned = VisualListFinder.notAligned(domNode, verticalAligned, horizontalAligned) - - val verticalList = verticalAligned.map { - case (pos, list) => - val (similar, nonSimilar) = VisualListFinder.structuralFilter(list, minsim) - notAligned = notAligned ++ nonSimilar - pos -> WebList(domNode, Orientation.vertical, domNode.location, domNode.size ,similar) - }.filter(_._2.elements.size > 1).values - - val horizontalList = horizontalAligned.map { - case (pos, list) => - val (similar, nonSimilar) = VisualListFinder.structuralFilter(list, minsim) - notAligned = notAligned ++ nonSimilar - pos -> WebList(domNode, Orientation.horizontal, domNode.location, domNode.size, similar) - }.filter(_._2.elements.size > 1).values - - (verticalList ++ horizontalList toSeq, notAligned.toSeq) - } - - /** - * - * @param domNode - * @param maxRecordTags - * @param mapper - * @return meta function to get all the aligned elements, fold all the aligned element in a Map[Int, Seq[DomNode] ] - */ - private def aligned(domNode: DomNode, maxRecordTags: Int, mapper: DomNode => (Int, DomNode)): Map[Int, Seq[DomNode]] = - domNode.children - .filter(_.bfs.size <= maxRecordTags) - .map(mapper) - .foldLeft(Map.empty[Int, Seq[DomNode]]) { (map, posNode) => - val (pos, node) = posNode - map + (pos -> (map.getOrElse(pos, Seq.empty) :+ node)) - }.filter(_._2.size > 1) - - private def verticallyAligned(domNode: DomNode, maxRecordTags: Int) = - aligned(domNode, maxRecordTags, n => n.location.x -> n) - - private def horizontallyAligned(domNode: DomNode, maxRecordTags: Int) = - aligned(domNode, maxRecordTags, n => n.location.y -> n) - - private def notAligned(domNode: DomNode, vertical: Map[Int, Seq[DomNode]], horizontal: Map[Int, Seq[DomNode]]) = { - val aligned = (vertical.values ++ horizontal.values).flatten.toSet - domNode.children.toSet.diff(aligned) - } - - /** - * - * @param seq - * @param minsim - * @return a seq of structurally similar DomNode and a seq of non structurally similar DomNodes - */ - private def structuralFilter(seq: Seq[DomNode], minsim: Float): (Seq[DomNode], Seq[DomNode]) = { - var nonSimilar = List.empty[DomNode] - - val similar = Random.shuffle(seq) match { - //take the head and for the tail filter all the elements similar to the head - case head :: tail => - head :: tail.filter { n => - val dist = Distances.normalizedEditDistance(head.bfs, n.bfs) - if (dist > minsim) nonSimilar = n :: nonSimilar - dist <= minsim - } - } - - if (similar.size > 1) (similar, nonSimilar) - else (Seq.empty, similar ++ nonSimilar) - - } - -} - -object TiledListFinder - -object ListMerger - From 514b0510980015e19e573a8bb2b61eb857d6831a Mon Sep 17 00:00:00 2001 From: fabiana Date: Mon, 4 Jul 2016 12:44:48 +0200 Subject: [PATCH 6/9] merged conflicts with fabio project --- src/main/scala/eu/unicredit/web/Models.scala | 22 +------------------ .../scala/eu/unicredit/web/HyLiEnTest.scala | 16 +------------- .../scala/hylien/TreeEditDistanceSpecs.scala | 3 +++ 3 files changed, 5 insertions(+), 36 deletions(-) diff --git a/src/main/scala/eu/unicredit/web/Models.scala b/src/main/scala/eu/unicredit/web/Models.scala index 758fbcf..99f8c38 100644 --- a/src/main/scala/eu/unicredit/web/Models.scala +++ b/src/main/scala/eu/unicredit/web/Models.scala @@ -5,7 +5,7 @@ import org.jsoup.Jsoup import scala.annotation.tailrec import scala.collection.JavaConversions._ import scala.collection.mutable -import scala.util.{ Failure, Success, Try } +import scala.util.Try /** @@ -51,29 +51,9 @@ object Models { children: mutable.Buffer[DomNode] = mutable.Buffer.empty[DomNode], html: String) { lazy val bfs = DomNode.bfs(this) -<<<<<<< HEAD - - def getUrls(html: String): Seq[String] = { - val tryhtml = Try { - Jsoup.parse(html) - .select("a[href]") - .asScala - .map(link => link.attr("href")) - .toList - .filter(s => s.size > 0) - } - tryhtml match { - case Success(lists) => lists - case Failure(ex) => List() - } - } - - lazy val urls = getUrls(html) -======= lazy val urls = DomNode.getUrls(html) lazy val bfsCssClasses = DomNode.bfsCssClasses(this) lazy val visualFeatures = DomNode.visualFeatures(this) ->>>>>>> upstream/master } object DomNode { diff --git a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala index 87fa3be..68dcaa1 100644 --- a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala +++ b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala @@ -8,14 +8,9 @@ import eu.unicredit.web.hylien.VisualHyLiEn */ object HyLiEnTest extends App { -<<<<<<< HEAD - val hylien = new VisualHyLiEn() - val result = hylien.extract("http://www.cs.illinois.edu") -======= val hylien = new VisualHyLiEn( headless = true, quickRender = true, logReqs = false, browserSize = BrowserSize(1920, 1080)) ->>>>>>> upstream/master val lists = hylien.extract("http://www.immobiliare.it/44602950-Vendita-Bilocale-via-Pola-2-Milano.html") @@ -43,32 +38,23 @@ object HyLiEnTest extends App { def toString(l: WebList): String = { val buf = new StringBuilder -<<<<<<< HEAD - buf ++= s"Printing ${l.orientation} of by ${l.elements.size} elements \n" - buf ++= s"parent dom tag = ${l.parent.tagName} \n" - buf ++= s"location = ${l.location} \n" - buf ++= s"location = ${l.size} \n" -======= + buf ++= s"Printing ${l.orientation} of by ${l.elements.size} elements obtained merging ${l.from.size} lists \n" buf ++= s"parent dom tag = ${l.parent.tagName}\n" buf ++= s"location = ${l.location} \n" buf ++= s"size = ${l.size} \n" buf ++= s"parent Visual Features = ${l.parent.visualFeatures} \n" ->>>>>>> upstream/master l.elements.foreach { n => buf ++= s"\t tag = ${n.tagName} \n" buf ++= s"\t text = || ${n.text.replace("\n", " ")} || \n" //buf ++= s"\t html = || ${n.html} || \n" buf ++= s"\t bfs = ${n.bfs}\n" buf ++= s"\t urls = ${n.urls}\n" -<<<<<<< HEAD -======= buf ++= s"\t urls absolutes = ${DomNode.getUrls(n.html, l.pageUrl)} \n" buf ++= s"\t node class attribute = ${n.cssClass} \n" // buf ++= s"\t node MapCssProps = ${n.cssProperties} \n" buf ++= s"\t BFS nodes Styles = ${n.bfsCssClasses} \n" buf ++= s"\t Visual Features = ${n.visualFeatures} \n" ->>>>>>> upstream/master buf ++= "----------------------- \n" } diff --git a/src/test/scala/hylien/TreeEditDistanceSpecs.scala b/src/test/scala/hylien/TreeEditDistanceSpecs.scala index 3bbb838..75e747c 100644 --- a/src/test/scala/hylien/TreeEditDistanceSpecs.scala +++ b/src/test/scala/hylien/TreeEditDistanceSpecs.scala @@ -1,6 +1,9 @@ package hylien import eu.unicredit.web.Models.{DomNode, Location, Size} +import eu.unicredit.web.hylien.Distances +import org.specs2.mutable.Specification +import org.specs2.specification.Scope import scala.collection.mutable From 9165cc8cfb292d7fcc0a024d4e3a3c6365723440 Mon Sep 17 00:00:00 2001 From: fabiana Date: Wed, 6 Jul 2016 11:29:54 +0200 Subject: [PATCH 7/9] downgrade dependency --- build.sbt | 2 +- .../scala/eu/unicredit/web/hylien/Distances.scala | 11 +++++------ src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala | 6 ++++-- .../scala/eu/unicredit/web/hylien/ListsFinder.scala | 3 ++- src/test/scala/eu/unicredit/web/HyLiEnTest.scala | 2 +- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/build.sbt b/build.sbt index 1bf8497..c8ff123 100644 --- a/build.sbt +++ b/build.sbt @@ -21,6 +21,6 @@ lazy val root = (project in file(".")) "com.typesafe.scala-logging" % "scala-logging_2.11" % "3.4.0", "ch.qos.logback" % "logback-classic" % "1.1.7", "com.rockymadden.stringmetric" % "stringmetric-core_2.11" % "0.27.4", - "io.github.lukehutch" % "fast-classpath-scanner" % "1.9.21" + "io.github.lukehutch" % "fast-classpath-scanner" % "1.9.19" ) ) diff --git a/src/main/scala/eu/unicredit/web/hylien/Distances.scala b/src/main/scala/eu/unicredit/web/hylien/Distances.scala index 8f3b70e..4d158ca 100644 --- a/src/main/scala/eu/unicredit/web/hylien/Distances.scala +++ b/src/main/scala/eu/unicredit/web/hylien/Distances.scala @@ -72,19 +72,18 @@ object Distances { } def normalizedTreeEditDistance (a: DomNode, b:DomNode) : Double = { - def getSize0(nodes: List[DomNode], acc:Int): Int = { - nodes match { - case List() => acc - case h::tail => getSize0(h.children.toList ++ tail, acc+1) - } + def getSize0(nodes: List[DomNode], acc:Int): Int = nodes match { + case List() => acc + case h::tail => getSize0(h.children.toList ++ tail, acc+1) } + def getSize(tree: DomNode): Int = { getSize0(List(tree), 0) } val ted = treeEditDistance(a,b) val avgNodes = (getSize(a) + getSize(b)).toDouble /2 - ted.toDouble / avgNodes + 1- (ted.toDouble / avgNodes) } } diff --git a/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala b/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala index db53a4a..046a3ec 100644 --- a/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala +++ b/src/main/scala/eu/unicredit/web/hylien/HyLiEn.scala @@ -20,9 +20,11 @@ class VisualHyLiEn(headless: Boolean = true, quickRender: Boolean = true, logReqs = logReqs, browserSize = browserSize) - def extract(url: String, tagSimFactor: Float = 0.4F, maxRecordTags: Int = 30): Seq[WebList] = { + def extract(url: String, tagSimFactor: Float = 0.4F, maxRecordTags: Int = 60): Seq[WebList] = { + val startTime = System.currentTimeMillis() val root = webExtractor.parse(url) - logger.debug(s"parsed $url, start extracting lists") + val totalTime = System.currentTimeMillis() - startTime + logger.info(s"parsed $url in $totalTime millisec, start extracting lists") @tailrec def extract0(notAligned: List[DomNode], acc: List[WebList]): List[WebList] = diff --git a/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala b/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala index 8275fce..c8bb485 100644 --- a/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala +++ b/src/main/scala/eu/unicredit/web/hylien/ListsFinder.scala @@ -91,7 +91,8 @@ private[this] object VisualListFinder { //take the head and for the tail filter all the elements similar to the head case head :: tail => head :: tail.filter { n => - val dist = Distances.normalizedEditDistance(head.bfs, n.bfs) + //val dist = Distances.normalizedEditDistance(head.bfs, n.bfs) + val dist = Distances.normalizedTreeEditDistance(head, n) if (dist > minsim) nonSimilar = n :: nonSimilar dist <= minsim } diff --git a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala index 68dcaa1..2ae9ffb 100644 --- a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala +++ b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala @@ -12,7 +12,7 @@ object HyLiEnTest extends App { headless = true, quickRender = true, logReqs = false, browserSize = BrowserSize(1920, 1080)) - val lists = hylien.extract("http://www.immobiliare.it/44602950-Vendita-Bilocale-via-Pola-2-Milano.html") + val lists = hylien.extract("http://www.idealista.it/vendita-case/milano-milano/") //("http://www.cs.illinois.edu/directory/faculty?quicktabs_faculty_tabs_new=1#quicktabs-faculty_tabs_new") From 811f35bf3fec5e2ac1caa18a68f1ca299889f015 Mon Sep 17 00:00:00 2001 From: Fabiana Lanotte Date: Thu, 7 Feb 2019 12:19:47 +0100 Subject: [PATCH 8/9] fixed #1 --- src/main/scala/eu/unicredit/web/HtmlExtractor.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/scala/eu/unicredit/web/HtmlExtractor.scala b/src/main/scala/eu/unicredit/web/HtmlExtractor.scala index 4b96574..63b98d8 100644 --- a/src/main/scala/eu/unicredit/web/HtmlExtractor.scala +++ b/src/main/scala/eu/unicredit/web/HtmlExtractor.scala @@ -71,9 +71,11 @@ class VisualTagTreeBuilder(headless: Boolean = true, quickRender: Boolean = true .split(";") .filter(_.contains("::")) .map(_.split("::")) - .map { + .flatMap { case Array(prop, value) => - prop -> value + Some(prop -> value) + case Array(prop) => + Some(prop -> "") } .toMap .filterNot(_._1.startsWith("-webkit")) From 00612be42aad1b5377a61a3b55ecf8a5bcc2cbcf Mon Sep 17 00:00:00 2001 From: Fabiana Lanotte Date: Thu, 7 Feb 2019 12:22:31 +0100 Subject: [PATCH 9/9] removed few comments --- .../eu/unicredit/web/HtmlExtractorTest.scala | 24 ------------------- .../scala/eu/unicredit/web/HyLiEnTest.scala | 14 ----------- 2 files changed, 38 deletions(-) diff --git a/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala b/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala index 8688a83..dfe05fa 100644 --- a/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala +++ b/src/test/scala/eu/unicredit/web/HtmlExtractorTest.scala @@ -9,8 +9,6 @@ import scala.collection.mutable */ object VisualTagTreeBuilderTest extends App { - // val url = "https://ec.europa.eu/research/participants/portal/desktop/en/opportunities/h2020/#c,calls=level3/t/EU.1./0/1/1/default-group&level4/t/EU.1.1./0/1/1/default-group&level4/t/EU.1.2./0/1/1/default-group&level4/t/EU.1.3./0/1/1/default-group&level4/t/EU.1.4./0/1/1/default-group&level3/t/EU.2./0/1/1/default-group&level4/t/EU.2.1./0/1/1/default-group&level5/t/EU.2.1.1./0/1/1/default-group&level5/t/EU.2.1.2./0/1/1/default-group&level5/t/EU.2.1.3./0/1/1/default-group&level5/t/EU.2.1.4./0/1/1/default-group&level5/t/EU.2.1.5./0/1/1/default-group&level5/t/EU.2.1.6./0/1/1/default-group&level4/t/EU.2.2./0/1/1/default-group&level4/t/EU.2.3./0/1/1/default-group&level3/t/EU.3./0/1/1/default-group&level4/t/EU.3.1./0/1/1/default-group&level4/t/EU.3.2./0/1/1/default-group&level4/t/EU.3.3./0/1/1/default-group&level4/t/EU.3.4./0/1/1/default-group&level4/t/EU.3.5./0/1/1/default-group&level4/t/EU.3.6./0/1/1/default-group&level4/t/EU.3.7./0/1/1/default-group&level3/t/EU.4./0/1/1/default-group&level3/t/EU.5./0/1/1/default-group&level3/t/EU.7./0/1/1/default-group&level2/t/Euratom/0/1/1/default-group&hasForthcomingTopics/t/true/1/1/0/default-group&hasOpenTopics/t/true/1/1/0/default-group&allClosedTopics/t/true/0/1/0/default-group&+PublicationDateLong/asc" - // val url = "https://www.stanford.edu/" val url = "http://www.bsvillage.com/Piscine-Fuori-Terra/" val time2 = System.currentTimeMillis() @@ -19,35 +17,13 @@ object VisualTagTreeBuilderTest extends App { println(s"page parsed into ${System.currentTimeMillis() - time2}") parser1.close() - // var time1 = System.currentTimeMillis() - // val visual1 = new VisualWebExtractor(true, true) - // (1 to 10).foreach(_ => visual1.parse(url)) - // println(s"page parsed into ${(System.currentTimeMillis() - time1) / 10}") - - // val time3 = System.currentTimeMillis() - // val parser2 = new VisualWebExtractor(true, true) - // parser2.parse(url) - // println(s"page parsed into ${System.currentTimeMillis() - time3}") - // parser2.close() - println(root) } object TagTreeBuilderTest extends App { - // val url = "https://ec.europa.eu/research/participants/portal/desktop/en/opportunities/h2020/#c,calls=level3/t/EU.1./0/1/1/default-group&level4/t/EU.1.1./0/1/1/default-group&level4/t/EU.1.2./0/1/1/default-group&level4/t/EU.1.3./0/1/1/default-group&level4/t/EU.1.4./0/1/1/default-group&level3/t/EU.2./0/1/1/default-group&level4/t/EU.2.1./0/1/1/default-group&level5/t/EU.2.1.1./0/1/1/default-group&level5/t/EU.2.1.2./0/1/1/default-group&level5/t/EU.2.1.3./0/1/1/default-group&level5/t/EU.2.1.4./0/1/1/default-group&level5/t/EU.2.1.5./0/1/1/default-group&level5/t/EU.2.1.6./0/1/1/default-group&level4/t/EU.2.2./0/1/1/default-group&level4/t/EU.2.3./0/1/1/default-group&level3/t/EU.3./0/1/1/default-group&level4/t/EU.3.1./0/1/1/default-group&level4/t/EU.3.2./0/1/1/default-group&level4/t/EU.3.3./0/1/1/default-group&level4/t/EU.3.4./0/1/1/default-group&level4/t/EU.3.5./0/1/1/default-group&level4/t/EU.3.6./0/1/1/default-group&level4/t/EU.3.7./0/1/1/default-group&level3/t/EU.4./0/1/1/default-group&level3/t/EU.5./0/1/1/default-group&level3/t/EU.7./0/1/1/default-group&level2/t/Euratom/0/1/1/default-group&hasForthcomingTopics/t/true/1/1/0/default-group&hasOpenTopics/t/true/1/1/0/default-group&allClosedTopics/t/true/0/1/0/default-group&+PublicationDateLong/asc" - // val url = "https://www.stanford.edu/" val url = "http://www.cs.illinois.edu/directory/faculty" -// var time1 = System.currentTimeMillis() -// val visual1 = new TagTreeBuilder() -// (1 to 10).foreach(_ => visual1.parse(url)) -// println(s"page parsed into ${(System.currentTimeMillis() - time1) / 10}") -// -// val time2 = System.currentTimeMillis() -// val root = new TagTreeBuilder().parse(url) -// println(s"page parsed into ${System.currentTimeMillis() - time2}") - val time3 = System.currentTimeMillis() val root = new TagTreeBuilder().parse(url) println(s"page parsed into ${System.currentTimeMillis() - time3}") diff --git a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala index 2ae9ffb..fd38a36 100644 --- a/src/test/scala/eu/unicredit/web/HyLiEnTest.scala +++ b/src/test/scala/eu/unicredit/web/HyLiEnTest.scala @@ -14,20 +14,6 @@ object HyLiEnTest extends App { val lists = hylien.extract("http://www.idealista.it/vendita-case/milano-milano/") - - //("http://www.cs.illinois.edu/directory/faculty?quicktabs_faculty_tabs_new=1#quicktabs-faculty_tabs_new") - - //("http://www.cs.illinois.edu") - - //("https://it.wikipedia.org/wiki/Fiat_Chrysler_Automobiles") - //("http://www.bsvillage.com/Piscine-Fuori-Terra/") - - //("http://www.cs.illinois.edu/directory/faculty") - - //("http://www.cs.ox.ac.uk/") - - // //("http://www.harvard.edu/") //("http://cs.stanford.edu/") - println(s"Got ${lists.size} lists") lists.foreach { l => val r = toString(l)