From 76f4a5b97ad4569ae5442ee3865b68b504725056 Mon Sep 17 00:00:00 2001 From: Leo Born Date: Wed, 13 May 2015 20:18:05 +0900 Subject: [PATCH 1/3] added cky parser [cky] --- .../scala/jigg/nlp/ccg/lexicon/Category.scala | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala b/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala index 423899c6..c0e6234e 100644 --- a/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala +++ b/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala @@ -1,20 +1,4 @@ package jigg.nlp.ccg.lexicon - -/* - Copyright 2013-2015 Hiroshi Noji - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ import Slash._ sealed trait Category extends Numbered[Unit] { @@ -30,6 +14,11 @@ case class AtomicCategory(override val id:Int, base:String, feature:CategoryFeat } override def toStringNoFeature = base + + def hasFeatures = feature.toString match { + case "" => false + case _ => true + } } @SerialVersionUID(3754315949719248198L) case class ComplexCategory(override val id:Int, From 1e78b9b4064734e247ebc8a7ba841e8e59f6881b Mon Sep 17 00:00:00 2001 From: Leo Born Date: Wed, 13 May 2015 22:38:44 +0900 Subject: [PATCH 2/3] added cky parser [cky] --- .../scala/jigg/nlp/ccg/lexicon/Category.scala | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala b/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala index c0e6234e..ff4613a5 100644 --- a/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala +++ b/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala @@ -1,4 +1,20 @@ package jigg.nlp.ccg.lexicon + +/* + Copyright 2013-2015 Hiroshi Noji + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ import Slash._ sealed trait Category extends Numbered[Unit] { @@ -14,7 +30,7 @@ case class AtomicCategory(override val id:Int, base:String, feature:CategoryFeat } override def toStringNoFeature = base - + def hasFeatures = feature.toString match { case "" => false case _ => true From 3512ac07b07d5870f90b387b5b7e7c3e202ac629 Mon Sep 17 00:00:00 2001 From: Leo Born Date: Wed, 13 May 2015 22:42:39 +0900 Subject: [PATCH 3/3] Add CKY parser [cky] --- src/main/scala/jigg/nlp/ccg/CCGRules.scala | 346 ++++++++++++++++++ src/main/scala/jigg/nlp/ccg/CCGrammar.scala | 11 + .../scala/jigg/nlp/ccg/CKYEvaluator.scala | 233 ++++++++++++ src/main/scala/jigg/nlp/ccg/CKYParser.scala | 226 ++++++++++++ src/main/scala/jigg/nlp/ccg/CKYTest.scala | 141 +++++++ .../nlp/ccg/ChartCellWithBackpointers.scala | 87 +++++ .../scala/jigg/nlp/ccg/LexiconReader.scala | 276 ++++++++++++++ .../scala/jigg/nlp/ccg/SimplifyCCGBank.scala | 30 ++ 8 files changed, 1350 insertions(+) create mode 100644 src/main/scala/jigg/nlp/ccg/CCGRules.scala create mode 100644 src/main/scala/jigg/nlp/ccg/CCGrammar.scala create mode 100644 src/main/scala/jigg/nlp/ccg/CKYEvaluator.scala create mode 100644 src/main/scala/jigg/nlp/ccg/CKYParser.scala create mode 100644 src/main/scala/jigg/nlp/ccg/CKYTest.scala create mode 100644 src/main/scala/jigg/nlp/ccg/ChartCellWithBackpointers.scala create mode 100644 src/main/scala/jigg/nlp/ccg/LexiconReader.scala create mode 100644 src/main/scala/jigg/nlp/ccg/SimplifyCCGBank.scala diff --git a/src/main/scala/jigg/nlp/ccg/CCGRules.scala b/src/main/scala/jigg/nlp/ccg/CCGRules.scala new file mode 100644 index 00000000..2279bf02 --- /dev/null +++ b/src/main/scala/jigg/nlp/ccg/CCGRules.scala @@ -0,0 +1,346 @@ +package jigg.nlp.ccg + +import jigg.nlp.ccg.lexicon._ +import jigg.nlp.ccg.lexicon.Slash._ +import scala.collection.mutable.HashMap + +/** + * Implements CCG Rules (hard-coded). + */ +trait CCGRules + +/** + * Implements unary rules, i.e. type raising rules. 
+ * Uses the set of atomic categories of the grammar in order to determine possible features for
+ * a category.
+ */
+trait UnaryRule extends CCGRules{
+  /** True if this rule can rewrite `input`. */
+  def isApplicable(input: Category): Boolean
+  /** Categories this rule derives; `setAC` supplies the grammar's atomic categories. */
+  def apply(input: Category, setAC:Set[AtomicCategory]): Set[ComplexCategory]
+}
+
+/**
+ * Handles: S -> NP/NP (RelExt)
+ * S[mod=adn]
+ */
+object TypeChangingRule1 extends UnaryRule{
+  /** Only atomic categories whose string form matches `S[...mod=adn...]` qualify. */
+  override def isApplicable(current: Category) = current match {
+    case ac: AtomicCategory => ac.toString.matches("^S\\[.*(mod=adn).*\\]")
+    case cc: ComplexCategory => false
+  }
+
+  /** One X/X per atomic category X with base "NP"; `current` is not inspected. The id is `X.id*42`. */
+  override def apply(current: Category, setAC:Set[AtomicCategory]): Set[ComplexCategory] =
+    for(ac <- setAC if ac.base == "NP") yield ComplexCategory(ac.id*42, ac, ac, Slash.Right)
+}
+
+/**
+ * Handles: S\NP[1] -> NP[1]/NP[1] (RelIn)
+ */
+object TypeChangingRule2 extends UnaryRule{
+  /** Needs X\Y with atomic X printing with a leading "S" and atomic Y printing with a leading "NP". */
+  override def isApplicable(current: Category) = current match {
+    case ac:AtomicCategory => false
+    case cc:ComplexCategory => cc.slash.equals(Slash.Left) &&
+      cc.left.isInstanceOf[AtomicCategory] && cc.left.toString.startsWith("S") &&
+      cc.right.isInstanceOf[AtomicCategory] && cc.right.toString.startsWith("NP")
+  }
+
+  /** Y/Y (id 0) for the argument Y of `current`; an atomic `current` gives no category instead of a ClassCastException. */
+  override def apply(current: Category, setAC:Set[AtomicCategory]): Set[ComplexCategory] = current match {
+    case cc: ComplexCategory => Set(ComplexCategory(0, cc.right, cc.right, Slash.Right))
+    case ac: AtomicCategory => Set.empty[ComplexCategory]
+  }
+}
+
+/**
+ * Handles: S -> S/S (Con)
+ * S[mod=adv]
+ */
+object TypeChangingRule3 extends UnaryRule{
+  /** Only atomic categories whose string form matches `S[...mod=adv...]` qualify. */
+  override def isApplicable(current: Category) = current match {
+    case ac: AtomicCategory => ac.toString.matches("^S\\[.*(mod=adv).*\\]")
+    case cc: ComplexCategory => false
+  }
+
+  /** One X/X per atomic category X with base "S"; `current` is not inspected. The id is `X.id*42`. */
+  override def apply(current: Category, setAC:Set[AtomicCategory]): Set[ComplexCategory] =
+    for(ac <- setAC if ac.base == "S") yield ComplexCategory(ac.id*42, ac, ac, Slash.Right)
+}
+
+
+/**
+ * Implements binary
combinatory rules.
+ * At the moment only functional application and composition.
+ */
+trait BinaryRule extends CCGRules{
+  /** True if `candidate` (the left span) and `current` (the right span) can be combined. */
+  def isApplicable(candidate: Category, current: Category): Boolean
+  /** Category resulting from combining `candidate` with `current`. */
+  def apply(candidate: Category, current: Category): Category
+}
+
+/**
+ * Implements normal forward rule (>).
+ *
+ * Candidate: S/NP, Current: NP
+ * --> S
+ */
+object ForwardApplication extends BinaryRule{
+  /**
+   * True if `cand` is an underspecified modifier X/X: both sides are feature-less atomic
+   * categories sharing one base (e.g. NP/NP), so it may consume any feature variant of X.
+   */
+  private def isUnderspecifiedModifier(cand: ComplexCategory): Boolean = (cand.left, cand.right) match {
+    case (l: AtomicCategory, r: AtomicCategory) =>
+      !l.hasFeatures && !r.hasFeatures && r.toStringNoFeature == l.toStringNoFeature
+    case _ => false
+  }
+
+  /**
+   * Applicable when `candidate` is X/Y with atomic Y, `current` is atomic, and either
+   * Y equals `current` or `candidate` is an underspecified X/X with the same base as `current`.
+   */
+  override def isApplicable(candidate: Category, current: Category) = (candidate, current) match {
+    case (cand: ComplexCategory, _: AtomicCategory) =>
+      cand.slash.equals(Slash.Right) && cand.right.isInstanceOf[AtomicCategory] &&
+        (cand.right.equals(current) ||
+          (isUnderspecifiedModifier(cand) && cand.right.toStringNoFeature == current.toStringNoFeature))
+    case _ => false
+  }
+
+  /**
+   * Returns the result category; expects `isApplicable(candidate, current)` to hold.
+   * Only a true X/X modifier hands `current` back (NP/NP + NP[ac, nm] -> NP[ac, nm]); the
+   * shared-base check keeps a feature-less S/NP + NP from yielding NP instead of S.
+   */
+  override def apply(candidate: Category, current: Category): Category = candidate match {
+    case cand: ComplexCategory if isUnderspecifiedModifier(cand) => current
+    // S[base]/NP[ac,nm] + NP[ac,nm] -> S[base]
+    // (NP/NP)/NP + NP -> NP/NP
+    case cand: ComplexCategory => cand.left
+    case ac: AtomicCategory => throw new IllegalArgumentException("Forward Application needs a complex candidate: " + ac)
+  }
+
+  override def toString() = "Forward Application"
+}
+
+/**
+ * Implements forward composition rule (>B).
+ *
+ * Candidate: S/NP, current: NP/PP
+ * --> S/PP
+ */
+object ForwardComposition extends BinaryRule{
+  /**
+   * Holds when both categories are complex with a right slash and the atomic argument
+   * of `candidate` equals the atomic result of `current`.
+   */
+  override def isApplicable(candidate: Category, current: Category): Boolean = (candidate, current) match {
+    case (fn: ComplexCategory, arg: ComplexCategory) =>
+      val bothForward = fn.slash.equals(Slash.Right) && arg.slash.equals(Slash.Right)
+      val bothAtomic = fn.right.isInstanceOf[AtomicCategory] && arg.left.isInstanceOf[AtomicCategory]
+      bothForward && bothAtomic && fn.right.equals(arg.left)
+    case _ => false
+  }
+
+  /**
+   * Builds X/Z from X/Y and Y/Z, keeping the slash of `candidate`; the new id is the sum of both ids.
+   * Both arguments are cast to ComplexCategory without a check.
+   */
+  override def apply(candidate: Category, current: Category): Category = {
+    val fn = candidate.asInstanceOf[ComplexCategory]
+    val arg = current.asInstanceOf[ComplexCategory]
+    ComplexCategory(candidate.id + current.id, fn.left, arg.right, fn.slash)
+  }
+
+  override def toString() = "Forward Composition"
+}
+
+/**
+ * Implements simple forward crossed composition (>Bx).
+ *
+ * Candidate: X/Y Current: Y\Z
+ * --> X\Z
+ */
+object ForwardCrossedComposition extends BinaryRule{
+  /**
+   * Holds when `candidate` is X/Y and `current` is Y\Z (right slash, then left slash) and
+   * the atomic argument of `candidate` equals the atomic result of `current`.
+   */
+  override def isApplicable(candidate: Category, current: Category): Boolean = (candidate, current) match {
+    case (fn: ComplexCategory, arg: ComplexCategory) =>
+      val crossed = fn.slash.equals(Slash.Right) && arg.slash.equals(Slash.Left)
+      val bothAtomic = fn.right.isInstanceOf[AtomicCategory] && arg.left.isInstanceOf[AtomicCategory]
+      crossed && bothAtomic && fn.right.equals(arg.left)
+    case _ => false
+  }
+
+  /**
+   * Builds X\Z from X/Y and Y\Z, taking the slash from `current`; the new id is the sum of both ids.
+   * Both arguments are cast to ComplexCategory without a check.
+   */
+  override def apply(candidate: Category, current: Category): Category = {
+    val fn = candidate.asInstanceOf[ComplexCategory]
+    val arg = current.asInstanceOf[ComplexCategory]
+    ComplexCategory(candidate.id + current.id, fn.left, arg.right, arg.slash)
+  }
+
+  override def toString() = "Forward Crossed Composition"
+}
+
+/**
+ * Implements simple backward rule (<). Candidate must be atomic.
+ *
+ * Candidate: NP, Current: S\NP
+ * --> S
+ */
+object BackwardApplication extends BinaryRule{
+  /**
+   * True if `curr` is an underspecified modifier X\X: both sides are feature-less atomic
+   * categories sharing one base (e.g. NP\NP), so it may attach to any feature variant of X.
+   */
+  private def isUnderspecifiedModifier(curr: ComplexCategory): Boolean = (curr.left, curr.right) match {
+    case (l: AtomicCategory, r: AtomicCategory) =>
+      !l.hasFeatures && !r.hasFeatures && r.toStringNoFeature == l.toStringNoFeature
+    case _ => false
+  }
+
+  /**
+   * Applicable when `candidate` is atomic, `current` is X\Y with atomic Y, and either
+   * `candidate` equals Y, or `current` is an underspecified X\X whose base matches
+   * that of `candidate` (cand: NP[ac,nm] and curr: NP\NP).
+   */
+  override def isApplicable(candidate: Category, current: Category) = (candidate, current) match {
+    case (cand: AtomicCategory, curr: ComplexCategory) =>
+      curr.slash.equals(Slash.Left) && curr.right.isInstanceOf[AtomicCategory] &&
+        (cand.equals(curr.right) ||
+          (isUnderspecifiedModifier(curr) && curr.right.toStringNoFeature == candidate.toStringNoFeature))
+    case _ => false
+  }
+
+  /**
+   * Returns the result category; expects `isApplicable(candidate, current)` to hold.
+   * Only a true X\X modifier hands `candidate` back (NP[ac, nm] + NP\NP -> NP[ac, nm]).
+   * The shared-base check is what keeps a feature-less NP + S\NP from yielding NP
+   * instead of S.
+   */
+  override def apply(candidate: Category, current: Category): Category = current match {
+    case curr: ComplexCategory if isUnderspecifiedModifier(curr) => candidate
+    // NP[ac, nm] + S\NP[ac,nm] -> S
+    // NP[ac, nm] + (S\NP)\NP -> S\NP
+    case curr: ComplexCategory => curr.left
+    case ac: AtomicCategory => throw new IllegalArgumentException("Backward Application needs a complex current: " + ac)
+  }
+
+  override def toString() = "Backward Application"
+}
+
+/**
+ * Implements backward composition rule (<B).
+ *
+ * Candidate: Y\Z, Current: X\Y
+ * --> X\Z, e.g. S[cont]\NP[ga]
+ */
+private object BackwardCompositionSimple extends BinaryRule{
+  override def isApplicable(candidate: Category, current: Category): Boolean = {
+    var result: Boolean = false
+    if(candidate.isInstanceOf[ComplexCategory] && current.isInstanceOf[ComplexCategory]){
+      if(candidate.asInstanceOf[ComplexCategory].slash.equals(Slash.Left) && current.asInstanceOf[ComplexCategory].slash.equals(Slash.Left)){
+        if(candidate.asInstanceOf[ComplexCategory].left.isInstanceOf[AtomicCategory] && current.asInstanceOf[ComplexCategory].right.isInstanceOf[AtomicCategory]){
+          if(candidate.asInstanceOf[ComplexCategory].left.equals(current.asInstanceOf[ComplexCategory].right)){
+            result = true
+          }
+        }
+      }
+    }
+    result
+  }
+
+  override def apply(candidate: Category, current: Category): Category = {
+    val newId:Int = candidate.id + current.id
+    val newLeft: Category = current.asInstanceOf[ComplexCategory].left
+    val newRight: Category =
candidate.asInstanceOf[ComplexCategory].right + val newSlash: Slash = candidate.asInstanceOf[ComplexCategory].slash + ComplexCategory(newId, newLeft, newRight, newSlash) + } +} + +/** + * Implements nested backward composition rule ( (S[cont]\NP[ga])\NP[ni] + */ +object BackwardCompositionNested extends BinaryRule{ + override def isApplicable(candidate: Category, current: Category): Boolean = { + var result: Boolean = false + if(candidate.isInstanceOf[ComplexCategory] && current.isInstanceOf[ComplexCategory]){ + if(candidate.asInstanceOf[ComplexCategory].slash.equals(Slash.Left) && current.asInstanceOf[ComplexCategory].slash.equals(Slash.Left)){ + candidate.asInstanceOf[ComplexCategory].left match{ + case ac: AtomicCategory => result = BackwardCompositionSimple.isApplicable(candidate, current) + case cc: ComplexCategory => { + if(cc.slash.equals(Slash.Left)){ + result = BackwardCompositionNested.isApplicable(cc, current) + } + } + } + } + } + result + } + + override def apply(candidate: Category, current: Category): Category = { + candidate.asInstanceOf[ComplexCategory].left match{ + case ac: AtomicCategory => BackwardCompositionSimple.apply(candidate, current) + case cc: ComplexCategory => { + val newId:Int = candidate.id + current.id + val newLeft:Category = BackwardCompositionNested.apply(cc, current) + val newRight:Category = candidate.asInstanceOf[ComplexCategory].right + val newSlash:Slash = candidate.asInstanceOf[ComplexCategory].slash + ComplexCategory(newId, newLeft, newRight, newSlash) + } + } + } + + override def toString() = "Backward Composition" +} + + +/** + * Implements simple backward crossed composition ( X/Z + */ +object BackwardCrossedComposition extends BinaryRule{ + override def isApplicable(candidate: Category, current: Category): Boolean = { + var result: Boolean = false + if(candidate.isInstanceOf[ComplexCategory] && current.isInstanceOf[ComplexCategory]){ + if(candidate.asInstanceOf[ComplexCategory].slash.equals(Slash.Left) && 
current.asInstanceOf[ComplexCategory].slash.equals(Slash.Right)){ + if(candidate.asInstanceOf[ComplexCategory].right.isInstanceOf[AtomicCategory] && current.asInstanceOf[ComplexCategory].left.isInstanceOf[AtomicCategory]){ // Maybe irrelevant? + if(candidate.asInstanceOf[ComplexCategory].right.equals(current.asInstanceOf[ComplexCategory].left)){ + result = true + } + } + } + } + result + } + + override def apply(candidate: Category, current: Category): Category = { + val newId:Int = candidate.id + current.id + val newLeft:Category = candidate.asInstanceOf[ComplexCategory].left + val newRight:Category = current.asInstanceOf[ComplexCategory].right + val newSlash:Slash = current.asInstanceOf[ComplexCategory].slash + ComplexCategory(newId, newLeft, newRight, newSlash) + } + + override def toString() = "Backward Crossed Composition" +} \ No newline at end of file diff --git a/src/main/scala/jigg/nlp/ccg/CCGrammar.scala b/src/main/scala/jigg/nlp/ccg/CCGrammar.scala new file mode 100644 index 00000000..5fc56d21 --- /dev/null +++ b/src/main/scala/jigg/nlp/ccg/CCGrammar.scala @@ -0,0 +1,11 @@ +package jigg.nlp.ccg + +import jigg.nlp.ccg.lexicon._ +import scala.collection.mutable.HashMap +import scala.language.existentials + +/** + * Implements a simplififed CCG Grammar. Contains a Set of terminals (i.e. tokens), + * a Set of atomic Non-Terminals (both are retrieved from ccgbank-lexicon) and a Set of CCG Rules. + */ +case class CCGrammar(terminals: Set[String], atomicCategories: Set[AtomicCategory], rules: Set[_ <: CCGRules]) \ No newline at end of file diff --git a/src/main/scala/jigg/nlp/ccg/CKYEvaluator.scala b/src/main/scala/jigg/nlp/ccg/CKYEvaluator.scala new file mode 100644 index 00000000..a9f6b570 --- /dev/null +++ b/src/main/scala/jigg/nlp/ccg/CKYEvaluator.scala @@ -0,0 +1,233 @@ +package jigg.nlp.ccg + +import jigg.nlp.ccg.lexicon._ +import scala.collection.mutable.ArrayBuffer + +/** + * Implements an evalutor for the cky output parse trees. 
+ * Works only for unlabeled precision, recall and f-score.
+ */
+class CKYEvaluator {
+  type Tree = ParseTree[String]
+
+  /**
+   * Share of the constituents of `parserTree` that also occur in `gold`; 0.0 if `parserTree`
+   * has none (a bare leaf), where the unguarded division gave NaN.
+   */
+  private def calculatePrecision(parserTree: Tree, gold: Tree): Double = {
+    val constGold = getConstituents(gold, new ArrayBuffer[Tree]())
+    val constInput = getConstituents(parserTree, new ArrayBuffer[Tree]())
+    if(constInput.isEmpty){ 0.0 }
+    else{ corrConstitInInput(constInput, constGold) / constInput.size }
+  }
+
+  /** Precision of every tree in `parser` against `gold`, in input order. */
+  def calculatePrecisions(parser: List[ParseTree[String]], gold: Tree) = for(el <- parser) yield calculatePrecision(el, gold)
+
+  /**
+   * Share of the constituents of `gold` found in `parserTree`; 0.0 if `gold` has none (was NaN).
+   */
+  private def calculateRecall(gold: Tree, parserTree: Tree): Double = {
+    val constGold = getConstituents(gold, new ArrayBuffer[Tree]())
+    val constInput = getConstituents(parserTree, new ArrayBuffer[Tree]())
+    if(constGold.isEmpty){ 0.0 }
+    else{ corrConstitOfGold(constGold, constInput) / constGold.size }
+  }
+
+  /** Recall of every tree in `parser` against `gold`, in input order. */
+  def calculateRecalls(gold: Tree, parser: List[ParseTree[String]]) = for(el <- parser) yield calculateRecall(gold, el)
+
+  /** Harmonic mean of precision and recall; 0.0 when both are zero. */
+  private def calculateFScore(parserTree: Tree, gold: Tree): Double = {
+    val precision = calculatePrecision(parserTree, gold)
+    val recall = calculateRecall(gold, parserTree)
+    if(precision + recall == 0.0 ){ 0.0 }
+    else{ (2 * precision * recall) / (precision + recall) }
+  }
+
+  /** F-score of every tree in `parser` against `gold`, in input order. */
+  def calculateFScores(parser: List[ParseTree[String]], gold: Tree) = for(el <- parser) yield calculateFScore(el, gold)
+
+  /**
+   * Gets the constituents of the tree.
+ * (S((NP((DT the) (NN man)) (VP eats))) has 2 constituents: S -> NP VP and NP -> DT NN + */ + private def getConstituents(input: Tree, constit: ArrayBuffer[Tree]): ArrayBuffer[Tree] = { + if(input.children.size == 0){ + constit + } + else if(input.children.size == 1){ + constit += new UnaryTree(new LeafTree(input.children(0).label), input.label) + constit ++= getConstituents(input.children(0), new ArrayBuffer[Tree]()) + constit + } + else{ + constit += new BinaryTree(new LeafTree(input.children(0).label), new LeafTree(input.children(1).label), input.label) + constit ++= getConstituents(input.children(0), new ArrayBuffer[Tree]()) + constit ++= getConstituents(input.children(1), new ArrayBuffer[Tree]()) + constit + } + } + + /** + * Checks how many constituents of the parser tree are in the gold tree. + */ + private def corrConstitInInput(parser: ArrayBuffer[Tree], gold: ArrayBuffer[Tree]): Double = { + var result = 0.0 + var alreadyMatched = Set[Tree]() + for(el <- parser){ + for(el2 <- gold){ + if(el.getClass() == el2.getClass() && !alreadyMatched.contains(el)){ + val cleanParserLabel = cleanLabel(el.label) + val parserLabel = changeFeatureOrdering(cleanParserLabel) + el match{ + case l: LeafTree[String] => { + if(parserLabel == cleanLabel(el2.label)){ + result += 1.0 + alreadyMatched += el + } + } + case u: UnaryTree[String] => { + val parserChild = u.child + val goldChild = el2.asInstanceOf[UnaryTree[String]].child + + val cleanParserChildLabel = cleanLabel(parserChild.label) + val parserChildLabel = changeFeatureOrdering(cleanParserChildLabel) + + if(parserLabel == cleanLabel(el2.label) && parserChildLabel == cleanLabel(goldChild.label)){ + result += 1.0 + alreadyMatched += el + } + } + case b: BinaryTree[String] => { + val parserChildLeft = b.left + val parserChildRight = b.right + val goldChildLeft = el2.asInstanceOf[BinaryTree[String]].left + val goldChildRight = el2.asInstanceOf[BinaryTree[String]].right + + val cleanParserChildLeftLabel = 
cleanLabel(parserChildLeft.label) + val cleanParserChildRightLabel = cleanLabel(parserChildRight.label) + val parserChildLeftLabel = changeFeatureOrdering(cleanParserChildLeftLabel) + val parserChildRightLabel = changeFeatureOrdering(cleanParserChildRightLabel) + + if(parserLabel == cleanLabel(el2.label) && parserChildLeftLabel == cleanLabel(goldChildLeft.label) && + parserChildRightLabel == cleanLabel(goldChildRight.label)){ + result += 1.0 + alreadyMatched += el + } + } + } + } + } + } + result + } + + /** + * Checks how many constituents of the gold tree are in the parser tree. + * Very similar to @corrConstitInInput, only first two for-loops are reversed. Not very nice. + */ + private def corrConstitOfGold(gold: ArrayBuffer[Tree], parser: ArrayBuffer[Tree]): Double = { + var result = 0.0 + var alreadyFound = Set[Tree]() + for(el <- gold){ + for(el2 <- parser){ + if(el.getClass() == el2.getClass() && !alreadyFound.contains(el)){ + val goldLabel = cleanLabel(el.label) + val cleanParserLabel = cleanLabel(el2.label) + val parserLabel = changeFeatureOrdering(cleanParserLabel) + + el match{ + case l: LeafTree[String] => { + if(goldLabel == parserLabel){ + result += 1.0 + alreadyFound += el + } + } + case u: UnaryTree[String] => { + val goldChild = u.child + val parserChild = el2. 
asInstanceOf[UnaryTree[String]].child + + val cleanParserChildLabel = cleanLabel(parserChild.label) + val parserChildLabel = changeFeatureOrdering(cleanParserChildLabel) + + if(goldLabel == parserLabel && cleanLabel(goldChild.label) == parserChildLabel){ + result += 1.0 + alreadyFound += el + } + } + case b: BinaryTree[String] => { + val goldChildLeft = b.left + val goldChildRight = b.right + val parserChildLeft = el2.asInstanceOf[BinaryTree[String]].left + val parserChildRight = el2.asInstanceOf[BinaryTree[String]].right + + val cleanParserChildLeftLabel = cleanLabel(parserChildLeft.label) + val cleanParserChildRightLabel = cleanLabel(parserChildRight.label) + val parserChildLeftLabel = changeFeatureOrdering(cleanParserChildLeftLabel) + val parserChildRightLabel = changeFeatureOrdering(cleanParserChildRightLabel) + + if(goldLabel == parserLabel && cleanLabel(goldChildLeft.label) == parserChildLeftLabel && + cleanLabel(goldChildRight.label) == parserChildRightLabel){ + result += 1.0 + alreadyFound += el + } + } + } + } + } + } + result + } + + /** + * Changes the feature ordering of NP-categories because the category construction in 'LexiconReader.scala' + * results in switched NP-categories. So, in order to compare with CCGBank gold annotation on a String level, + * NP[mod=nm,case=nc] will be changed to NP[case=nc,mod=nm]. + */ + private def changeFeatureOrdering(input: String): String = { + if(input.startsWith("NP") && input.contains(',')){ + val feature1 = input.substring(input.indexOf("[") + 1, input.indexOf(",")) + val feature2 = input.substring(input.indexOf(",") + 1, input.indexOf("]")) + val nString = new StringBuilder("NP") + nString.append("[").append(feature2).append(",").append(feature1).append("]") + nString.toString + } + else{ input } + } + + /** + * Cleans the CCGBank gold categories so that they are comparable with CKY categories. + * In detail, it gets rid of any rule symbols at the beginning, {I1}, {I2}, or '_none' kind + * of attachments. 
+ */ + private def cleanLabel(input: String): String = { + val nString = { + if(input.contains(" ")){ + val x = input.split(" ") + // Guarantees that it's a sensible category + if(input.matches("^(\\<|\\>)(\\w(\\d)?)?.*") || input.startsWith("ADNext") || + input.startsWith("ADV0") || input.startsWith("ADNint") || input.startsWith("SSEQ")){ + x(1) + } + else{ x(0) } + } + else{ input } + } + if(nString.contains("(")){ + val nnString = nString.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse + val result = { + if(nnString.contains("_")){ nnString.replaceAll("_\\w+", "") } + else{ nnString } + } + val nresult = result.replaceAll("\\[\\w+=(X1|X2),\\w+=(X1|X2)\\]", "") + val nnresult = nresult.replaceAll("\\w+=(X1|X2)", "") + nnresult + } + else{ + val result = nString.replaceAll("\\[\\w+=(X1|X2),\\w+=(X1|X2)\\]", "") + val nresult = result.replaceAll("\\w+=(X1|X2)", "") + nresult + } + } +} \ No newline at end of file diff --git a/src/main/scala/jigg/nlp/ccg/CKYParser.scala b/src/main/scala/jigg/nlp/ccg/CKYParser.scala new file mode 100644 index 00000000..9e253ae0 --- /dev/null +++ b/src/main/scala/jigg/nlp/ccg/CKYParser.scala @@ -0,0 +1,226 @@ +package jigg.nlp.ccg + +import jigg.nlp.ccg.lexicon._ +import jigg.nlp.ccg.lexicon.Slash._ +import scala.collection.mutable.HashMap +import scala.collection.mutable.ArrayBuffer + +/** + * Implements a CKY-based parser that has backpointers for tracing trees. + * Takes a @CCGrammar as input. + */ +class CKYParser(grammar: CCGrammar){ + private[this] val rules:Set[_ <: CCGRules] = grammar.rules + private[this] val features:Set[AtomicCategory] = grammar.atomicCategories + + /** + * Parses a super-tagged sentence according to the CKY algorithm. 
+ */ + def parseSentence(sentence: TaggedSentence, sentenceInfo: Array[Array[(Category, Float)]]): Array[Array[ChartCellWithBackpointers]] = { + val startTime = System.nanoTime() + val chart = Array.ofDim[ChartCellWithBackpointers](sentence.size, sentence.size + 1) // initiate Chart with n x n+1 dimensions + + /* Fill the chart with Cells that contain no associated tokens and no candidate categories.*/ + for(i <- 0 to sentence.size - 1){ + for(j <- 1 to sentence.size){ + chart(i)(j) = new ChartCellWithBackpointers("", i, j, HashMap[(Category, Double), Set[(BackPointer, BackPointer)]]()) + } + } + + /* Add to each 'token Cell' its corresponding token and candidate categories.*/ + for(i <- 0 to sentence.size - 1){ + val j = i + 1 + val currToken = sentence.word(i).toString + chart(i)(j).setToken(currToken) + + val possCats = sentenceInfo(i) + for(cat <- possCats){ + chart(i)(j).addOnlyCandidate(cat._1, cat._2) + } + } + + /* + * Commence filling rest. Moves columns (from second one) to the right. + * Moves all the Cells in each column bottom-up. + */ + for(index <- 2 to sentence.size){ + for(j <- index - 1 to 0 by - 1){ + val currs = chart(j)(index).candidateCats.keySet.toArray + + var currID = 0 + while(currID < currs.length){ // Using 'while' instead of 'for' for speed reasons. + val currentCatPair = currs(currID) + val currentCat = currentCatPair._1 + val currentProb = currentCatPair._2 + + var isBinaryApplicable:Boolean = false + /* Applies to everything that is not [0, x], i.e. 
which can potentially have a left side of a binary rule.*/ + if(j != 0){ + /* Goes through all (relevant) cells in lines above current one.*/ + for(j2 <- j - 1 to 0 by -1){ + val cands = chart(j2)(j).candidateCats.keySet.toArray + + var candID = 0 + while(candID < cands.length){ + val candidCatPair = cands(candID) + val candidCat = candidCatPair._1 + val candidProb = candidCatPair._2 + + /* Go through every binary rule.*/ + for(brule <- rules.filter(_.isInstanceOf[BinaryRule])){ + if(brule.asInstanceOf[BinaryRule].isApplicable(candidCat, currentCat)){ + val resultCat = brule.asInstanceOf[BinaryRule].apply(candidCat, currentCat) + val resultProb = currentProb + candidProb + chart(j2)(index).addCandidate(resultCat, resultProb, new BackPointer(Tuple2(j2, j), candidCat), new BackPointer(Tuple2(j, index), currentCat)) + isBinaryApplicable = true + } + } + candID += 1 + } + } + } + /* + * If current category can potentially be type raised (i.e. it starts with "S"), + * none of the binary rules could be applied AND current cell is not the upper right one + * (i.e. spanning the whole sentence), apply unary rules. + */ + if(currentCat.toString.startsWith("S") && !isBinaryApplicable && !(j == 0 && index == sentence.size)){ + for(urule <- rules.filter(_.isInstanceOf[UnaryRule])){ + if(urule.asInstanceOf[UnaryRule].isApplicable(currentCat)){ + val res:Set[ComplexCategory] = urule.asInstanceOf[UnaryRule].apply(currentCat, features) + for(cc <- res){ + /* + * Because type-raising means that category was derived from only one Cell instead of two, assign an impossible backpointer + * as second backpointer. Class BackPointer's "isEmpty" checks for this. + * When tracing Parse Trees, if second bp is empty, Unary Tree is constructed. 
+                   */
+                  chart(j)(index).addCandidate(cc, currentProb, new BackPointer(Tuple2(j, index), currentCat), new BackPointer(Tuple2(-1, -1), currentCat))
+                }
+              }
+            }
+          }
+          currID += 1
+        }
+      }
+    }
+    val endTime = System.nanoTime()
+    val resultTime: Double = (endTime - startTime)/1000000000d
+    println("\nParsing took "+ resultTime + " s")
+    chart
+  }
+
+  /**
+   * Prints the filled chart: every cell (i, j) with 0 <= i < n and 1 <= j <= n,
+   * row by row, where n is the number of rows of `chart`.
+   */
+  def printChart(chart: Array[Array[ChartCellWithBackpointers]]): Unit = {
+    for(i <- 0 to chart.size - 1){
+      for(j <- 1 to chart.size){ println(chart(i)(j).toString) }
+    }
+  }
+
+  /** Shorthand for parse trees parameterised with String. */
+  type Tree = ParseTree[String]
+
+  // Disabled: relies on a `traceParse` method that this class does not define.
+  /*
+  def printParseTrees(chart: Array[Array[ChartCellWithBackpointers]]): Unit = {
+    if(chart(0)(chart.size).isEmpty()){ Console.err.println("No parse for the whole sentence!") }
+    else{
+      for(rt <- chart(0)(chart.size).candidateCats.keySet if rt.isInstanceOf[AtomicCategory] && (rt.toString.startsWith("S") || rt.toString.startsWith("NP"))){
+        val result:ArrayBuffer[Tree] = traceParse(chart, rt, 0, chart.size)
+        for(tree <- result.toList){
+          println(tree.toString())
+        }
+      }
+    }
+  }
+  *
+  */
+
+  /**
+   * Returns the most probable parse of the whole sentence as a one-element list.
+   * If the cell spanning the whole sentence is empty, a message is written to stderr
+   * and an empty list is returned. The root is the candidate category of that cell
+   * with the highest stored probability.
+   */
+  def getMostProbableParseTrees(chart: Array[Array[ChartCellWithBackpointers]]): List[Tree] = {
+    // Cell (0, n) spans the whole sentence.
+    val rootCell = chart(0)(chart.size)
+    if(rootCell.isEmpty()){
+      Console.err.println("No parse for the whole sentence!")
+      List[Tree]()
+    }
+    else{
+      // The best root does not depend on any loop variable, so it is selected and traced
+      // exactly once. The former loop repeated the identical trace for every root
+      // candidate and then picked one of the identical results.
+      val mostProbableRoot = rootCell.candidateCats.keySet.reduceLeft((x,y) => if(x._2 > y._2) x else y)
+      val traced = traceMostProbableParse(chart, mostProbableRoot._1, 0, chart.size)
+      val mostProbableTreePair = traced.reduceLeft((x,y) => if(x._2 > y._2) x else y)
+      List(mostProbableTreePair._1)
+    }
+  }
+
+  def traceMostProbableParse(chart:
Array[Array[ChartCellWithBackpointers]], start:Category, spansFrom: Int, spansTo: Int): ArrayBuffer[Tuple2[Tree, Double]] = { + val ptList = ArrayBuffer[Tuple2[Tree, Double]]() + if(spansTo == spansFrom + 1){ + val nTree = new LeafTree(start.toString + ' ' + chart(spansFrom)(spansTo).getToken) + val cats = chart(spansFrom)(spansTo).candidateCats.keySet + val tempSet = cats.filter(_.toString == start.toString) + if(!tempSet.isEmpty){ + val temp = tempSet.head + /* This is the probability of the category of the token. */ + val nProb = temp._2 + + ptList += (Tuple2(nTree, nProb)) + ptList + } + else{ + ptList += (Tuple2(nTree, 0.0)) + ptList + } + } + else{ + val catsInChartCell = chart(spansFrom)(spansTo).candidateCats.keySet + val temp = catsInChartCell.filter(_._1.toString == start.toString).head // THIS SHOULD ONLY CONTAIN ONE! + val thisCat = temp._1 + val thisProb = temp._2 + + val bp = chart(spansFrom)(spansTo).candidateCats(temp) + val bps = bp.head + //for(bps <- bp){ + val leftSpansFrom = bps._1.getSpansFrom() + val leftSpansTo = bps._1.getSpansTo() + val leftAssocCat = bps._1.associatedCat + if(bps._2.isEmpty){ + val unarySide = traceMostProbableParse(chart, leftAssocCat, leftSpansFrom, leftSpansTo) + + // gets most probable tree + val x = unarySide.reduceLeft((x,y) => if(x._2 > y._2) x else y) + val nTree = new UnaryTree(x._1, start.toString) + ptList += (Tuple2(nTree, x._2)) + } + else{ + val rightSpansFrom = bps._2.getSpansFrom() + val rightSpansTo = bps._2.getSpansTo() + val rightAssocCat = bps._2.associatedCat + + val oneSide:ArrayBuffer[Tuple2[Tree, Double]] = traceMostProbableParse(chart, leftAssocCat, leftSpansFrom, leftSpansTo) + val otherSide:ArrayBuffer[Tuple2[Tree, Double]] = traceMostProbableParse(chart, rightAssocCat, rightSpansFrom, rightSpansTo) + + val label = start.toString + + // gets most probable trees + val mptLeft = oneSide.reduceLeft((x,y) => if(x._2 > y._2) x else y) + val mptRight = otherSide.reduceLeft((x,y) => if(x._2 > y._2) x 
else y) + val nTree = new BinaryTree(mptLeft._1, mptRight._1, label) + val nProb = mptLeft._2 * mptRight._2 + ptList += (Tuple2(nTree, nProb)) + } + //} + val bestParse = ptList.reduceLeft((x,y) => if(x._2 > y._2) x else y) + ArrayBuffer(bestParse) + } + } +} \ No newline at end of file diff --git a/src/main/scala/jigg/nlp/ccg/CKYTest.scala b/src/main/scala/jigg/nlp/ccg/CKYTest.scala new file mode 100644 index 00000000..bda0ec5a --- /dev/null +++ b/src/main/scala/jigg/nlp/ccg/CKYTest.scala @@ -0,0 +1,141 @@ +package jigg.nlp.ccg + +import jigg.nlp.ccg.lexicon._ +import jigg.nlp.ccg.tagger._ +import jigg.ml._ +import jigg.nlp.ccg.parser._ +import java.io._ +import scala.collection.mutable.HashMap +import scala.collection.mutable.ArrayBuffer + +/** + * Small demonstration of CKY Parser on ccgbank. + * + * NOTE: It needs a pre-trained tagging-model which is loaded in the beginning. + * For successful loading, the serialized data have to be pointed at in "Options.scala" + * (e.g. @Option(gloss = "Path to trained model") var loadModelPath = "jar/ccg-models/parser/beam=64.ser.gz") + */ +object CKYTest{ + var tagging: SuperTagging = _ + def instantiateSuperTagging = new JapaneseSuperTagging + def loadSuperTagging = { + tagging = instantiateSuperTagging + tagging.load + } + + def main( args: Array[String] ): Unit = { + val startTime = System.nanoTime() + + loadSuperTagging + + val bankreader = tagging.newCCGBankReader + val simplifier = new SimplifyCCGBank + + val x = tagging.featureExtractors + /* + * Latest ccgbank must be simplified in order to work with JapaneseParseTreeConverter. + * Otherwise: java.util.NoSuchElementException: key not found: f + */ + val pathToSimpleBank = simplifier.simplifyAndGetNewPath("res/ccgbank-20150216/test.ccgbank") // Get the ccgbank from where ever it is. + val res = bankreader.readParseTrees(pathToSimpleBank, 50, false) + val converter = tagging.parseTreeConverter + + /* Map each sentence to its gold annotation. 
*/ + type Tree = ParseTree[String] + val sentenceToAnnotation = new HashMap[String, Tree]() + val goldSentences = new ArrayBuffer[TaggedSentence] + + val c = res.toList + for(el <- c){ + val x = converter.toSentenceFromStringTree(el) + sentenceToAnnotation += (x.wordSeq.mkString("") -> el) + goldSentences += x + } + + val filesHere = (new java.io.File("res/lexicon/")).listFiles() + + /* Since lexicon has to be obtained only once, make this check. */ + if(!filesHere.exists(_.getName().startsWith("lexicon_all"))){ + println("Save lexicon...") + val reader = new LexiconReader + + // TAKES ABOUT FIVE MINUTES! + val res = reader.getLexicon("res/ccgbank-20150216/Japanese.lexicon") // Get the lexicon from where ever it is. + + // Save the resulting lexicon in a sensible directory! + val fout = new FileOutputStream("res/lexicon/lexicon_all.ser") + val out = new ObjectOutputStream(fout) + out.writeObject(res) + out.close + fout.close + println("Lexicon saved!") + } + else{ + /* Get the serialized lexicon. */ + println("Load lexicon...") + + // Load the resulting lexicon from a sensible directory! + val fin = new FileInputStream("res/lexicon/lexicon_all.ser") + val oin = new ObjectInputStream(fin) + val lex = oin.readObject.asInstanceOf[Tuple3[Set[String], Set[AtomicCategory], HashMap[String, HashMap[String, Set[Category]]]]] + oin.close + fin.close + println("Lexicon loaded!") + + val terminalsLoad = lex._1 + val categoriesLoad = lex._2 + //val mappingLoad = lex._3 + + /* Create set of ccg rules. */ + val rules = Set(TypeChangingRule1, TypeChangingRule2, TypeChangingRule3, + BackwardApplication, BackwardCompositionNested, BackwardCrossedComposition, ForwardApplication, ForwardComposition, ForwardCrossedComposition) + + /* Create grammar. */ + val jg = new CCGrammar(terminalsLoad, categoriesLoad, rules) + + val jcky = new CKYParser(jg) + + + val tagger = tagging.getTagger + + /* Supertag the sentences. 
*/ + val anno:Seq[CandAssignedSentence] = tagging.superTagToSentences(goldSentences.toArray) + + for(goldSentence <- anno){ + val sentenceInfo = tagger.unigramCategoryDistributions(goldSentence) + + val goldAnno = sentenceToAnnotation(goldSentence.wordSeq.mkString("")) + + println("Start parsing sentence: " + goldSentence.wordSeq.mkString("")) + + val jchart: Array[Array[ChartCellWithBackpointers]] = jcky.parseSentence(goldSentence, sentenceInfo) + //println("Print contents of chart:") + //jcky.printChart(jchart) + + println("\nGetting parses...") + val xy = jcky.getMostProbableParseTrees(jchart) + println("Got " + xy.size + " parse trees!") + + /* If there are valid output parses, evaluate them and only print the best value of each precision, recall and f-score. */ + if(!xy.isEmpty){ + println("\nEvaluate " + xy.size + " output parses:") + val evaluator = new CKYEvaluator + val listOfFScores = evaluator.calculateFScores(xy, goldAnno) + val listOfPrecisions = evaluator.calculatePrecisions(xy, goldAnno) + val listOfRecalls = evaluator.calculateRecalls(goldAnno, xy) + + println("Evaluation complete!\n") + + println("Best precision: " + listOfPrecisions.max) + println("Best recall: " + listOfRecalls.max) + println("Best f-score: " + listOfFScores.max) + } + + } + + val endTime = System.nanoTime() + val resultTime: Double = (endTime - startTime)/1000000000d + println("\nTook "+ resultTime + " s") + } + } +} \ No newline at end of file diff --git a/src/main/scala/jigg/nlp/ccg/ChartCellWithBackpointers.scala b/src/main/scala/jigg/nlp/ccg/ChartCellWithBackpointers.scala new file mode 100644 index 00000000..550ac6b1 --- /dev/null +++ b/src/main/scala/jigg/nlp/ccg/ChartCellWithBackpointers.scala @@ -0,0 +1,87 @@ +package jigg.nlp.ccg + +import jigg.nlp.ccg.lexicon._ +import scala.collection.mutable.HashMap + +/** + * Implements a backpointer for a ChartCell for tracing trees (i.e. derivation process). 
+ * Consists of a Tuple2[Int, Int], indicating the range of the ChartCell, and the Category
+ * associated with the pointed ChartCell.
+ */
+class BackPointer(tup: Tuple2[Int, Int], symbol: Category){
+  /* Span (from, to) of the referenced cell; isEmpty() treats (-1, -1) as "no cell". */
+  val sFromTo: Tuple2[Int, Int] = tup
+  /* Category that was used from the referenced cell. */
+  val associatedCat: Category = symbol
+
+  /** Start index of the span of the referenced cell. */
+  def getSpansFrom(): Int = sFromTo._1
+
+  /** End index of the span of the referenced cell. */
+  def getSpansTo(): Int = sFromTo._2
+
+  /** Renders the pointer as "[from, to] is (category)". */
+  override def toString: String =
+    "[" + sFromTo._1 + ", " + sFromTo._2 + "] is (" + associatedCat.toString + ")"
+
+  /** True exactly when both span indices are -1. */
+  def isEmpty() = tup._1 == -1 && tup._2 == -1
+}
+
+/**
+ * A chart cell that remembers how each of its categories was derived.
+ * The 4th constructor argument maps every (category, probability) candidate of the cell
+ * to the set of BackPointer pairs (one pointer per contributing cell) that produce it.
+ */
+class ChartCellWithBackpointers(token:String, i: Int, j: Int, possibleCat: HashMap[(Category, Double), Set[(BackPointer, BackPointer)]]){
+  private var associatedToken: String = token
+  val spansFrom:Int = i
+  val spansTo:Int = j
+  val candidateCats:HashMap[(Category, Double), Set[(BackPointer, BackPointer)]] = possibleCat
+
+  /** A cell counts as empty when it holds no candidate category. */
+  def isEmpty(): Boolean = candidateCats.isEmpty
+
+  def setToken(t:String): Unit = { associatedToken = t }
+  def getToken() = associatedToken
+
+  /**
+   * Registers a category with an empty back-pointer set, i.e. a category of a lowermost cell
+   * (it belongs to the token itself and is derived from no other cell).
+   */
+  def addOnlyCandidate(cand:Category, prob: Double): Unit = {
+    candidateCats((cand, prob)) = Set[(BackPointer, BackPointer)]()
+  }
+
+  /**
+   * Standard addCandidate method.
+   * Adds to the Cell a candidate category with two BackPointers associated to the Cells
+   * that result in this candidate category.
+   *
+   * A pair is only added if no stored pair for (cand, prob) already has the same spans and the
+   * same category strings for both pointers.
+   */
+  def addCandidate(cand:Category, prob: Double, bp1: BackPointer, bp2: BackPointer): Unit = {
+    /* Build the map key explicitly instead of relying on argument auto-tupling. */
+    val key = (cand, prob)
+    def samePointer(a: BackPointer, b: BackPointer): Boolean =
+      a.sFromTo.equals(b.sFromTo) && a.associatedCat.toString == b.associatedCat.toString
+
+    candidateCats.get(key) match {
+      case None => candidateCats(key) = Set((bp1, bp2))
+      case Some(tupleSet) =>
+        val alreadyPresent = tupleSet.exists(tupel => samePointer(bp1, tupel._1) && samePointer(bp2, tupel._2))
+        if(!alreadyPresent){
+          candidateCats(key) = tupleSet + Tuple2(bp1, bp2)
+        }
+    }
+  }
+
+  /**
+   * Taking a category (and its probability) as input, gets the set of BackPointer pairs that
+   * indicate where this category has been derived from; empty if the candidate is unknown.
+   */
+  def getCellsForCandidate(cand:Category, prob: Double):Set[Tuple2[BackPointer, BackPointer]] =
+    candidateCats.getOrElse((cand, prob), Set[Tuple2[BackPointer, BackPointer]]())
+
+  override def toString: String =
+    "Cell [" + spansFrom + ", " + spansTo + "] (" + this.associatedToken + ") contains " + candidateCats.toString
+}
diff --git a/src/main/scala/jigg/nlp/ccg/LexiconReader.scala b/src/main/scala/jigg/nlp/ccg/LexiconReader.scala
new file mode 100644
index 00000000..9fd1bcce
--- /dev/null
+++ b/src/main/scala/jigg/nlp/ccg/LexiconReader.scala
@@ -0,0 +1,276 @@
+package jigg.nlp.ccg
+
+import jigg.nlp.ccg.lexicon._
+import jigg.nlp.ccg.lexicon.Slash._
+import scala.io.Source
+import scala.collection.mutable.HashMap
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * Implements a lexicon reader for the latest ccgbank (ccgbank-20150216).
+ * Works with 'CCGrammar.scala' and retrieves the lexicon in a format
+ * usable with CKY parser.
+ */
+class LexiconReader{
+  /* All state is accumulated in these buffers; they are never cleared, so repeated calls add up. */
+  private[this] val terminalsBuffer = ArrayBuffer[String]()
+  private[this] val acBuffer = ArrayBuffer[AtomicCategory]()
+  private[this] val ccBuffer = ArrayBuffer[ComplexCategory]()
+  private[this] val mapping = new HashMap[String, HashMap[String, Set[Category]]]
+
+  /**
+   * In addition to getting a set of all terminals (i.e. tokens) and atomic categories in the lexicon,
+   * it also outputs a mapping from token to pos to a Set of associated categories.
+   * The mapping is not used in current version of cky parser.
+   *
+   * Each non-empty line is split on single spaces: the first field is read as "token/pos",
+   * every further field as a category string that is cut off at its first "_".
+   * The token "@UNK@" is kept in the mapping but not in the set of terminals.
+   *
+   * @param path path of the lexicon file
+   * @return (terminals, atomic categories, token -> pos -> categories)
+   */
+  def getLexicon(path: String): Tuple3[Set[String], Set[AtomicCategory], HashMap[String, HashMap[String, Set[Category]]]] = {
+    /* Main part. Pre-processes the lines and creates categories for each category string in line. */
+    val source = Source.fromFile(path)
+    try{
+      for(line <- source.getLines() if !line.isEmpty){
+        val contents = line.replaceAll("\\{I\\d\\}", "").replaceAll("_.*\\>", "").replaceAll("(,)?fin=(t|f)", "").split(" ")
+        val categoryStrings = contents.drop(1).map(c => c.substring(0, c.indexOf("_")))
+        val head = contents(0)
+        val token = head.substring(0, head.indexOf("/"))
+        val pos = head.substring(head.indexOf("/") + 1, head.length)
+
+        if(token != "@UNK@"){ terminalsBuffer += token }
+
+        val posToCats = mapping.getOrElseUpdate(token, HashMap[String, Set[Category]]())
+        if(!posToCats.contains(pos)){
+          posToCats += (pos -> Set[Category]())
+        }
+        categoryStrings.foreach { x: String =>
+          val cat = decomposeCategory(x)
+          posToCats(pos) = posToCats(pos) + cat
+        }
+      }
+    }
+    /* Release the file handle even if a malformed line makes the parsing above throw. */
+    finally{ source.close() }
+
+    (terminalsBuffer.toSet, acBuffer.toSet, mapping)
+  }
+
+  /** Checks whether an atomic category in question is already present or not. 
*/
+  private def atomicCategoryExistent(acTemp: AtomicCategory) =
+    acBuffer.exists(known => known.base == acTemp.base && known.toString == acTemp.toString)
+
+
+  /**
+   * Builds the atomic category described by a string of the form "base[name=value,...]".
+   * Only the part after "=" of every feature is kept. An already registered category with the
+   * same string form is reused; otherwise a new one is registered under the next free id.
+   * Throws a StringIndexOutOfBoundsException if the string contains no "[".
+   */
+  private def getAtomicCategory(cat: String): AtomicCategory = {
+    val base = cat.substring(0, cat.indexOf("["))
+    val featureValues = cat.substring(cat.indexOf("[") + 1, cat.indexOf("]")).split(",").map(f => f.substring(f.indexOf("=") + 1))
+    def build(id: Int) = AtomicCategory(id, base, JPCategoryFeature.createFromValues(featureValues.toSeq))
+
+    /* Temporary category; it doubles as the very first registered one (index = 0). */
+    val candidate = build(0)
+    if(acBuffer.isEmpty){
+      acBuffer += candidate
+      candidate
+    }
+    else if(atomicCategoryExistent(candidate)){
+      /* The category is known: hand out the registered instance (the fallback is NEVER REACHED). */
+      acBuffer.find(_.toString == candidate.toString).getOrElse(build(-1))
+    }
+    else{
+      /* Unknown category: register it with a new maximum ID. */
+      val registered = build(acBuffer.map(_.id).max + 1)
+      acBuffer += registered
+      registered
+    }
+  }
+
+
+  /**
+   * Checks whether a complex category in question is already present or not.
+   * If so, returns the already existing one, if not, creates a new category
+   * based on the input temporary category.
+   */
+  private def checkAndGetComplexCategory(ccTemp: ComplexCategory): ComplexCategory = {
+    val key = ccTemp.toString
+    /* Categories are identified by their string form; one scan finds the registered instance. */
+    ccBuffer.find(_.toString == key).getOrElse {
+      /* The first registered category gets id 0, so an empty buffer never reaches max. */
+      val nextId = if(ccBuffer.isEmpty) 0 else ccBuffer.map(_.id).max + 1
+      val nCat = ComplexCategory(nextId, ccTemp.left, ccTemp.right, ccTemp.slash)
+      ccBuffer += nCat
+      nCat
+    }
+  }
+
+
+  /**
+   * Breaks up a simple complex category X/Y where X and Y are atomic categories
+   * and creates a complex category.
+   * A string containing "/" yields Slash.Right, anything else Slash.Left.
+   */
+  private def getComplexCategory(cat: String): ComplexCategory = {
+    val compositeCat = cat.split("(/|\\\\)")
+    val newLeft = getAtomicCategory(compositeCat(0))
+    val newRight= getAtomicCategory(compositeCat(1))
+    val slash = if(cat.matches(".+/.+")) Slash.Right else Slash.Left
+    checkAndGetComplexCategory(ComplexCategory(0, newLeft, newRight, slash))
+  }
+
+
+  /**
+   * Decomposes a symmetrical complex category. This means that both sides are complex categories.
+   *
+   * ((NP[case=X1,mod=X2]/NP[case=X1,mod=X2]))\((NP[case=X1,mod=X2]/NP[case=X1,mod=X2]))
+   * (((((S[mod=nm,form=stem]\NP[case=ga,mod=nm]))\NP[case=o,mod=nm]))\(NP[case=ni,mod=nm]/NP[case=ni,mod=nm]))
+   *
+   * The left side is the prefix matched by PAT, the right side the suffix matched by PAT2; both
+   * lose one outer pair of parentheses and are decomposed recursively. If decomposing the left
+   * side throws a StringIndexOutOfBoundsException, both sides are matched again with the wider
+   * *_ALT patterns.
+   * NOTE(review): the slash position is always taken from the PAT match, also in the rare case
+   * where the left side came from PAT_ALT - confirm that this is intended.
+   */
+  private def decomposeComplexSymmetrical(cat: String): ComplexCategory = {
+    val PAT = "^(\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+".r
+    var rareCase = false
+    val leftSide = {
+      PAT.findFirstIn(cat) match {
+        case None => AtomicCategory(-1, "", JPCategoryFeature.createFromValues(Seq(""))) // NEVER REACHED
+        case Some(x) => {
+          val subCategory = x.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse
+          try{
+            decomposeCategory(subCategory)
+          }
+          /*
+           * This handles an extremely rare case (2 in whole training corpus).
+           */
+          catch{
+            case e: java.lang.StringIndexOutOfBoundsException => {
+              rareCase = true
+              val PAT_ALT = "^((\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+)+((\\/|\\\\)((\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+)+)?".r.unanchored
+              PAT_ALT.findFirstIn(cat) match{
+                case None => AtomicCategory(-1, "", JPCategoryFeature.createFromValues(Seq(""))) // NEVER REACHED
+                case Some(x) => {
+                  val subCategory = x.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse
+                  decomposeCategory(subCategory)
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    val PAT2 = "(\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+$".r
+    val rightSide = {
+      if(!rareCase){
+        PAT2.findFirstIn(cat) match {
+          case None => AtomicCategory(-1, "", JPCategoryFeature.createFromValues(Seq(""))) // NEVER REACHED
+          case Some(x) => {
+            val subCategory = x.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse
+            decomposeCategory(subCategory)
+          }
+        }
+      }
+      else{
+        println("We have a rare case!")
+        val PAT2_ALT = "((\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+)+((\\/|\\\\)((\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+)+)?$".r.unanchored
+        PAT2_ALT.findFirstIn(cat) match {
+          case None => AtomicCategory(-1, "", JPCategoryFeature.createFromValues(Seq(""))) // NEVER REACHED
+          case Some(x) => {
+            val subCategory = x.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse
+            decomposeCategory(subCategory)
+          }
+        }
+      }
+    }
+    /* Get length of substring that matches left side of category. */
+    val index = {
+      PAT.findFirstIn(cat) match{
+        case None => 0 // NEVER REACHED
+        case Some(x) => x.length()
+      }
+    }
+    /* Since above index points to the character after the last of the left side, it is the Slash. */
+    val nCatTemp = cat.charAt(index) match {
+      case '/' => ComplexCategory(0, leftSide, rightSide, Slash.Right)
+      case _ => ComplexCategory(0, leftSide, rightSide, Slash.Left)
+    }
+    checkAndGetComplexCategory(nCatTemp)
+  }
+
+
+  /**
+   * Decomposes asymmetrical complex categories. This means that the left side is a complex,
+   * and the right side is an atomic category.
+   *
+   * ((S[mod=nm,form=stem]\NP[case=ga,mod=nm]))\NP[case=ni,mod=nm]
+   * ((((S[mod=nm,form=stem]\NP[case=ga,mod=nm]))\NP[case=o,mod=nm]))\NP[case=ni,mod=nm]
+   * ((((S[mod=X1,form=X2]/S[mod=X1,form=X2]))/((S[mod=X1,form=X2]/S[mod=X1,form=X2]))))\NP[case=nc,mod=nm]
+   *
+   * The left side is the parenthesised prefix matched by PAT (decomposed recursively after
+   * dropping one outer pair of parentheses), the right side the trailing atomic category
+   * matched by PAT2. The slash is the character that follows the last ")" of the string.
+   */
+  private def decomposeComplexAsymmetrical(cat: String): ComplexCategory = {
+    val PAT = "^((\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+)+((\\/|\\\\)((\\()+\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]((\\/|\\\\)\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\](\\))+)+)+)?".r.unanchored
+    val leftSide = PAT.findFirstIn(cat) match{
+      case None => AtomicCategory(-1, "", JPCategoryFeature.createFromValues(Seq(""))) // NEVER REACHED
+      case Some(x) => {
+        val subCategory = x.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse
+        decomposeCategory(subCategory)
+      }
+    }
+    val PAT2 = "\\w+\\[(\\w+=(\\w+|X1|X2)(,)?)+\\]$".r.unanchored
+    val rightSide = PAT2.findFirstIn(cat) match{
+      case None => AtomicCategory(-1, "", JPCategoryFeature.createFromValues(Seq(""))) // NEVER REACHED
+      case Some(x) => getAtomicCategory(x)
+    }
+    /* Checks direction of Slash. */
+    val nCatTemp = {
+      cat.reverse.charAt(cat.reverse.indexOf(")") - 1) match {
+        case '/' => ComplexCategory(0, leftSide, rightSide, Slash.Right)
+        case _ => ComplexCategory(0, leftSide, rightSide, Slash.Left)
+      }
+    }
+    checkAndGetComplexCategory(nCatTemp)
+  }
+
+
+  /**
+   * Decomposes a complex category and determines what kind it is
+   * (symmetrical, asymmetrical, normal).
+   */
+  private def decomposeComplex(cat: String): ComplexCategory = {
+    /* Applies to normal complex categories: X/X */
+    if(!cat.startsWith("(")){
+      getComplexCategory(cat)
+    }
+    /* Checks if it's a symmetrical category: (X/X)/(X/X) */
+    else if(cat.endsWith(")")){
+      decomposeComplexSymmetrical(cat)
+    }
+    /* Otherwise it's an asymmetrical complex category: (X/X)/X */
+    else{
+      decomposeComplexAsymmetrical(cat)
+    }
+  }
+
+
+  /**
+   * If category starts with "(" it is complex -> invoke @decomposeComplex
+   * (after removing the first "(" and the last ")").
+   * Else, it is atomic -> invoke @getAtomicCategory.
+   */
+  def decomposeCategory(el: String): Category = {
+    if(el.startsWith("(")){
+      val category = el.replaceFirst("\\(", "").reverse.replaceFirst("\\)", "").reverse
+      decomposeComplex(category)
+    }
+    else{ getAtomicCategory(el) }
+  }
+}
\ No newline at end of file
diff --git a/src/main/scala/jigg/nlp/ccg/SimplifyCCGBank.scala b/src/main/scala/jigg/nlp/ccg/SimplifyCCGBank.scala
new file mode 100644
index 00000000..7cd90318
--- /dev/null
+++ b/src/main/scala/jigg/nlp/ccg/SimplifyCCGBank.scala
@@ -0,0 +1,30 @@
+package jigg.nlp.ccg
+
+import scala.io.Source
+import java.io._
+
+/**
+ * This class is used so that latest ccgbank (ccgbank-20150216)
+ * is compatible with CCGBankReader.
+ */
+class SimplifyCCGBank {
+
+  /**
+   * Writes a simplified copy of the ccgbank file at `path` and returns the path of the copy.
+   *
+   * Empty lines are dropped. From every other line the "{I<digit>}" markers, the
+   * "fin=t|f|X<digit>" features (with a preceding comma, if any) and all "_word" suffixes are
+   * removed. The copy is stored next to the input, with everything from the last "." of the
+   * path replaced by ".simplified.ccgbank" (the suffix is appended if the path has no ".").
+   *
+   * @param path path of the original ccgbank file
+   * @return path of the simplified file
+   */
+  def simplifyAndGetNewPath(path: String): String = {
+    val source = Source.fromFile(path)
+    try{
+      val dot = path.lastIndexOf(".")
+      val fname = if(dot >= 0) path.substring(0, dot) else path
+      val npath = fname + ".simplified.ccgbank"
+      val bw = new BufferedWriter(new FileWriter(new File(npath)))
+      try{
+        for(line <- source.getLines() if !line.isEmpty){
+          val nLine = line.replaceAll("\\{I\\d\\}", "").replaceAll("(,)?fin=(t|f|X\\d)", "").replaceAll("(_\\w+)+", "")
+          bw.write(nLine)
+          bw.newLine()
+        }
+      }
+      /* close() flushes the buffer; doing it in finally releases the file on failures as well. */
+      finally{ bw.close() }
+      npath
+    }
+    finally{ source.close() }
+  }
+}
\ No newline at end of file