diff --git a/src/gaknn/core/Instance.java b/src/gaknn/core/Instance.java index 253377c..1d81fec 100644 --- a/src/gaknn/core/Instance.java +++ b/src/gaknn/core/Instance.java @@ -1,6 +1,10 @@ package gaknn.core; + + + + /** * Reprsents an instance of the data set * @@ -69,11 +73,138 @@ public Instances dataset() { public void AddElement(double value, int index){ m_AttValues[index] = value; } - + /** Return number of values in instance. + * + * + */ + //@author thimal + public int numValues(){ + return m_AttValues.length; + } + /** + * Returns the index of the attribute stored at the given position. + * Just returns the given value. + * + * @param position the position + * @return the index of the attribute stored at the given position + */ + //@author thimal + public /*@pure@*/ int index(int position) { + + return position; + } + /** + * Returns an instance's attribute value in internal format. + * Does exactly the same thing as value() if applied to an Instance. + * + * @param indexOfIndex the index of the attribute's index + * @return the specified value as a double (If the corresponding + * attribute is nominal (or a string) then it returns the value's index as a + * double). + */ + //@author thimal + public /*@pure@*/ double valueSparse(int indexOfIndex) { + + return m_AttValues[indexOfIndex]; + } + /** + * Tests if a specific value is "missing". + * + * @param attIndex the attribute's index + * @return true if the value is "missing" + */ + //@author thimal + public /*@pure@*/ boolean isMissing(int attIndex) { + + if (Double.isNaN(m_AttValues[attIndex])) { + return true; + } + return false; + } + /** + * Tests if the given value codes "missing". + * + * @param val the value to be tested + * @return true if val codes "missing" + */ + //@author thimal + public static /*@pure@*/ boolean isMissingValue(double val) { + + return Double.isNaN(val); + } /** Returns the element at the given position * * @param index of the elrment. 
*/ + /** + * Returns the values of each attribute as an array of doubles. + * + * @return an array containing all the instance attribute values + */ + //@author thimal + public double[] toDoubleArray() { + + double[] newValues = new double[m_AttValues.length]; + System.arraycopy(m_AttValues, 0, newValues, 0, + m_AttValues.length); + return newValues; + } + /** + * Clones the attribute vector of the instance and + * overwrites it with the clone. + */ + //@author thimal + private void freshAttributeVector() { + + m_AttValues = toDoubleArray(); + } + /** + * Sets a specific value in the instance to the given value + * (internal floating-point format). Performs a deep copy + * of the vector of attribute values before the value is set. + * + * @param attIndex the attribute's index + * @param value the new attribute value (If the corresponding + * attribute is nominal (or a string) then this is the new value's + * index as a double). + */ + //@author thimal + public void setValue(int attIndex, double value) { + + freshAttributeVector(); + m_AttValues[attIndex] = value; + } + /** + * Sets the reference to the dataset. Does not check if the instance + * is compatible with the dataset. Note: the dataset does not know + * about this instance. If the structure of the dataset's header + * gets changed, this instance will not be adjusted automatically. + * + * @param instances the reference to the dataset + */ + //@author thimal + public final void setDataset(Instances instances) { + + m_Dataset = instances; + } + /** + * Produces a shallow copy of this instance. The copy has + * access to the same dataset. 
(if you want to make a copy + * that doesn't have access to the dataset, use + * new Instance(instance) + * + * @return the shallow copy + */ + //@ also ensures \result != null; + //@ also ensures \result instanceof Instance; + //@ also ensures ((Instance)\result).m_Dataset == m_Dataset; + public /*@pure@*/ Object copy() { + + Instance result = new Instance(this); + result.m_Dataset = m_Dataset; + return result; + } + public double GetElementAt(int index){ return m_AttValues[index]; } diff --git a/src/gaknn/core/Instances.java b/src/gaknn/core/Instances.java index 83f4752..60f3152 100644 --- a/src/gaknn/core/Instances.java +++ b/src/gaknn/core/Instances.java @@ -1,5 +1,11 @@ package gaknn.core; + + + + +//import Instance; + /** * Instances class represents the data. * @@ -39,6 +45,9 @@ public Instances(String name, m_RelationName = name; m_ClassIndex = -1; m_Attributes = attInfo; + + //initialize the m_instances + m_Instances=new FastVector(); for (int i=0; i 0) m_DataSet[RecNo] = values; m_ClassIdList[RecNo] = classIndex; m_Lines++; + + // set the m_instances + Instance ins=new Instance(values); + add(ins); } @@ -225,6 +288,20 @@ private static Object ResizeArray (Object oldArray, int newSize) { System.arraycopy (oldArray,0,newArray,0,preserveLength); return newArray; } + + /** + * Returns the instance at the given position. 
+ * + * @param index the instance's index (index starts with 0) + * @return the instance at the given position + */ + // @ requires 0 <= index; + // @ requires index < numInstances(); + // @ author thimal + public/* @non_null pure@ */Instance instance(int index) { + + return (Instance) m_Instances.elementAt(index); + } } diff --git a/src/gaknn/core/kdtree/AdditionalMeasureProducer.java b/src/gaknn/core/kdtree/AdditionalMeasureProducer.java new file mode 100644 index 0000000..aaf9c47 --- /dev/null +++ b/src/gaknn/core/kdtree/AdditionalMeasureProducer.java @@ -0,0 +1,51 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * AdditionalMeasureProducer.java + * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand + * + */ + +package gaknn.core.kdtree; + +import java.util.*; + +/** + * Interface to something that can produce measures other than those + * calculated by evaluation modules. + * + * @author Mark Hall (mhall@cs.waikato.ac.nz) + * @version $Revision: 1.8 $ + */ +public interface AdditionalMeasureProducer { + + /** + * Returns an enumeration of the measure names. Additional measures + * must follow the naming convention of starting with "measure", eg. 
+ * double measureBlah() + * @return an enumeration of the measure names + */ + Enumeration enumerateMeasures(); + + /** + * Returns the value of the named measure + * @param measureName the name of the measure to query for its value + * @return the value of the named measure + * @exception IllegalArgumentException if the named measure is not supported + */ + double getMeasure(String measureName); +} diff --git a/src/gaknn/core/kdtree/DistanceFunction.java b/src/gaknn/core/kdtree/DistanceFunction.java new file mode 100644 index 0000000..a8f50af --- /dev/null +++ b/src/gaknn/core/kdtree/DistanceFunction.java @@ -0,0 +1,162 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * DistanceFunction.java + * Copyright (C) 1999-2005 University of Waikato, Hamilton, New Zealand + * + */ + +package gaknn.core.kdtree; + + +//import weka.core.neighboursearch.PerformanceStats; +import gaknn.core.Instances; +import gaknn.core.Instance; + +/** + * Interface for any class that can compute and return distances between two + * instances. + * + * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz) + * @version $Revision: 1.7 $ + */ +public interface DistanceFunction { + + /** + * Sets the instances. 
+ * + * @param insts the instances to use + */ + public void setInstances(Instances insts); + + /** + * returns the instances currently set. + * + * @return the current instances + */ + public Instances getInstances(); + + /** + * Sets the range of attributes to use in the calculation of the distance. + * The indices start from 1, 'first' and 'last' are valid as well. + * E.g.: first-3,5,6-last + * + * @param value the new attribute index range + */ + public void setAttributeIndices(String value); + + /** + * Gets the range of attributes used in the calculation of the distance. + * + * @return the attribute index range + */ + public String getAttributeIndices(); + + /** + * Sets whether the matching sense of attribute indices is inverted or not. + * + * @param value if true the matching sense is inverted + */ + public void setInvertSelection(boolean value); + + /** + * Gets whether the matching sense of attribute indices is inverted or not. + * + * @return true if the matching sense is inverted + */ + public boolean getInvertSelection(); + + /** + * Calculates the distance between two instances. + * + * @param first the first instance + * @param second the second instance + * @return the distance between the two given instances + */ + public double distance(Instance first, Instance second); + + /** + * Calculates the distance between two instances. + * + * @param first the first instance + * @param second the second instance + * @param stats the performance stats object + * @return the distance between the two given instances + * @throws Exception if calculation fails + */ + public double distance(Instance first, Instance second, PerformanceStats stats) + throws Exception; + + /** + * Calculates the distance between two instances. Offers speed up (if the + * distance function class in use supports it) in nearest neighbour search by + * taking into account the cutOff or maximum distance. 
Depending on the + * distance function class, post processing of the distances by + * postProcessDistances(double []) may be required if this function is used. + * + * @param first the first instance + * @param second the second instance + * @param cutOffValue If the distance being calculated becomes larger than + * cutOffValue then the rest of the calculation is + * discarded. + * @return the distance between the two given instances or + * Double.POSITIVE_INFINITY if the distance being + * calculated becomes larger than cutOffValue. + */ + public double distance(Instance first, Instance second, double cutOffValue); + + /** + * Calculates the distance between two instances. Offers speed up (if the + * distance function class in use supports it) in nearest neighbour search by + * taking into account the cutOff or maximum distance. Depending on the + * distance function class, post processing of the distances by + * postProcessDistances(double []) may be required if this function is used. + * + * @param first the first instance + * @param second the second instance + * @param cutOffValue If the distance being calculated becomes larger than + * cutOffValue then the rest of the calculation is + * discarded. + * @param stats the performance stats object + * @return the distance between the two given instances or + * Double.POSITIVE_INFINITY if the distance being + * calculated becomes larger than cutOffValue. + */ + public double distance(Instance first, Instance second, + double cutOffValue, PerformanceStats stats); + + /** + * Does post processing of the distances (if necessary) returned by + * distance(distance(Instance first, Instance second, double cutOffValue). It + * may be necessary, depending on the distance function, to do post processing + * to set the distances on the correct scale. 
Some distance function classes + * may not return correct distances using the cutOffValue distance function to + * minimize the inaccuracies resulting from floating point comparison and + * manipulation. + * + * @param distances the distances to post-process + */ + public void postProcessDistances(double distances[]); + + /** + * Update the distance function (if necessary) for the newly added instance. + * + * @param ins the instance to add + */ + public void update(Instance ins); + +} diff --git a/src/gaknn/core/kdtree/EuclideanDistance.java b/src/gaknn/core/kdtree/EuclideanDistance.java new file mode 100644 index 0000000..be92260 --- /dev/null +++ b/src/gaknn/core/kdtree/EuclideanDistance.java @@ -0,0 +1,283 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * EuclideanDistance.java + * Copyright (C) 1999-2007 University of Waikato, Hamilton, New Zealand + * + */ + +package gaknn.core.kdtree; + +//import weka.core.TechnicalInformation.Field; +//import weka.core.TechnicalInformation.Type; +import gaknn.core.Instance; +import gaknn.core.Instances; +import gaknn.core.kdtree.PerformanceStats; + +/** + + * Implementing Euclidean distance (or similarity) function.
+ *
+ * One object defines not one distance but the data model in which the distances between objects of that data model can be computed.
+ *
+ * Attention: For efficiency reasons, only minimal consistency checks (e.g. whether the data models of the two instances are exactly the same) are performed.
+ *
+ * For more information, see:
+ *
+ * Wikipedia. Euclidean distance. URL http://en.wikipedia.org/wiki/Euclidean_distance. + *

+ + * + + * BibTeX: + *

+ * @misc{missing_id,
+ *    author = {Wikipedia},
+ *    title = {Euclidean distance},
+ *    URL = {http://en.wikipedia.org/wiki/Euclidean_distance}
+ * }
+ * 
+ *

+ + * + + * Valid options are:

+ * + *

 -D
+ *  Turns off the normalization of attribute 
+ *  values in distance calculation.
+ * + *
 -R <col1,col2-col4,...>
+ *  Specifies list of columns to be used in the calculation of the 
+ *  distance. 'first' and 'last' are valid indices.
+ *  (default: first-last)
+ * + *
 -V
+ *  Invert matching sense of column indices.
+ * + + * + * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) + * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz) + * @author FracPete (fracpete at waikato dot ac dot nz) + * @version $Revision: 1.13 $ + */ +public class EuclideanDistance + extends NormalizableDistance + implements Cloneable { + + /** for serialization. */ + private static final long serialVersionUID = 1068606253458807903L; + + /** + * Constructs an Euclidean Distance object, Instances must be still set. + */ + public EuclideanDistance() { + super(); + } + + /** + * Constructs an Euclidean Distance object and automatically initializes the + * ranges. + * + * @param data the instances the distance function should work on + */ + public EuclideanDistance(Instances data,double[] weights) { + super(data); + m_Weights=weights; + } + + /** + * Returns a string describing this object. + * + * @return a description of the evaluator suitable for + * displaying in the explorer/experimenter gui + */ +// public String globalInfo() { +// return +// "Implementing Euclidean distance (or similarity) function.\n\n" +// + "One object defines not one distance but the data model in which " +// + "the distances between objects of that data model can be computed.\n\n" +// + "Attention: For efficiency reasons the use of consistency checks " +// + "(like are the data models of the two instances exactly the same), " +// + "is low.\n\n" +// + "For more information, see:\n\n" +// + getTechnicalInformation().toString(); +// } +// +// /** +// * Returns an instance of a TechnicalInformation object, containing +// * detailed information about the technical background of this class, +// * e.g., paper reference or book this class is based on. 
+// * +// * @return the technical information about this class +// */ +// public TechnicalInformation getTechnicalInformation() { +// TechnicalInformation result; +// +// result = new TechnicalInformation(Type.MISC); +// result.setValue(Field.AUTHOR, "Wikipedia"); +// result.setValue(Field.TITLE, "Euclidean distance"); +// result.setValue(Field.URL, "http://en.wikipedia.org/wiki/Euclidean_distance"); +// +// return result; +// } + + /** + * Calculates the distance between two instances. + * + * @param first the first instance + * @param second the second instance + * @return the distance between the two given instances + */ + public double distance(Instance first, Instance second) { + return Math.sqrt(distance(first, second, Double.POSITIVE_INFINITY)); + } + + /** + * Calculates the distance (or similarity) between two instances. Need to + * pass this returned distance later on to postprocess method to set it on + * correct scale.
+ * P.S.: Please don't mix the use of this function with + * distance(Instance first, Instance second), as that already does post + * processing. Please consider passing Double.POSITIVE_INFINITY as the cutOffValue to + * this function and then later on do the post processing on all the + * distances. + * + * @param first the first instance + * @param second the second instance + * @param stats the structure for storing performance statistics. + * @return the distance between the two given instances or + * Double.POSITIVE_INFINITY. + */ + public double distance(Instance first, Instance second, PerformanceStats stats) { //debug method pls remove after use + return Math.sqrt(distance(first, second, Double.POSITIVE_INFINITY, stats)); + } + + /** + * Updates the current distance calculated so far with the new difference + * between two attributes. The difference between the attributes was + * calculated with the difference(int,double,double) method. + * + * @param currDist the current distance calculated so far + * @param diff the difference between two new attributes + * @return the update distance + * @see #difference(int, double, double) + */ + protected double updateDistance(double currDist, double diff) { + double result; + + result = currDist; + result += diff * diff; + + return result; + } + + /** + * Does post processing of the distances (if necessary) returned by + * distance(distance(Instance first, Instance second, double cutOffValue). It + * is necessary to do so to get the correct distances if + * distance(distance(Instance first, Instance second, double cutOffValue) is + * used. This is because that function actually returns the squared distance + * to avoid inaccuracies arising from floating point comparison. 
+ * + * @param distances the distances to post-process + */ + public void postProcessDistances(double distances[]) { + for(int i = 0; i < distances.length; i++) { + distances[i] = Math.sqrt(distances[i]); + } + } + + /** + * Returns the squared difference of two values of an attribute. + * + * @param index the attribute index + * @param val1 the first value + * @param val2 the second value + * @return the squared difference + */ + public double sqDifference(int index, double val1, double val2) { + double val = difference(index, val1, val2); + return val*val; + } + + /** + * Returns value in the middle of the two parameter values. + * + * @param ranges the ranges to this dimension + * @return the middle value + */ + public double getMiddle(double[] ranges) { + + double middle = ranges[R_MIN] + ranges[R_WIDTH] * 0.5; + return middle; + } + + /** + * Returns the index of the closest point to the current instance. + * Index is index in Instances object that is the second parameter. + * + * @param instance the instance to assign a cluster to + * @param allPoints all points + * @param pointList the list of points + * @return the index of the closest point + * @throws Exception if something goes wrong + */ + public int closestPoint(Instance instance, Instances allPoints, + int[] pointList) throws Exception { + double minDist = Integer.MAX_VALUE; + int bestPoint = 0; + for (int i = 0; i < pointList.length; i++) { + double dist = distance(instance, allPoints.instance(pointList[i]), Double.POSITIVE_INFINITY); + if (dist < minDist) { + minDist = dist; + bestPoint = i; + } + } + return pointList[bestPoint]; + } + + /** + * Returns true if the value of the given dimension is smaller or equal the + * value to be compared with. 
+ * + * @param instance the instance where the value should be taken of + * @param dim the dimension of the value + * @param value the value to compare with + * @return true if value of instance is smaller or equal value + */ + public boolean valueIsSmallerEqual(Instance instance, int dim, + double value) { //This stays + return instance.GetElementAt(dim) <= value; + } + +@Override +public String globalInfo() { + // TODO Auto-generated method stub + return null; +} + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.13 $"); +// } +} diff --git a/src/gaknn/core/kdtree/KDTree.java b/src/gaknn/core/kdtree/KDTree.java new file mode 100644 index 0000000..bf48831 --- /dev/null +++ b/src/gaknn/core/kdtree/KDTree.java @@ -0,0 +1,1310 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* + * KDTree.java + * Copyright (C) 2000-2007 University of Waikato + * + */ + +package gaknn.core.kdtree; + +//import gaknn.core.kdtree.DistanceFunction; +import gaknn.core.kdtree.EuclideanDistance; +import gaknn.core.Instance; +import gaknn.core.Instances; +//import weka.core.Option; +//import weka.core.RevisionUtils; +//import weka.core.TechnicalInformation; +//import weka.core.TechnicalInformationHandler; +//import weka.core.Utils; +//import weka.core.TechnicalInformation.Field; +//import weka.core.TechnicalInformation.Type; +import gaknn.core.kdtree.KDTreeNode; +import gaknn.core.kdtree.KDTreeNodeSplitter; +import gaknn.core.kdtree.SlidingMidPointOfWidestSide; + +import java.util.Enumeration; +import java.util.Vector; + +/** + + * Class implementing the KDTree search algorithm for nearest neighbour search.
+ * The connection to dataset is only a reference. For the tree structure the indexes are stored in an array.
+ * Building the tree:
+ * If a node has <maximal-inst-number> (option -L) or fewer instances, no further splitting is done. Also, if the split would leave one side empty, the branch is not split any further, even if the resulting node contains more than <maximal-inst-number> instances.
+ * **PLEASE NOTE:** The algorithm can not handle missing values, so it is advisable to run ReplaceMissingValues filter if there are any missing values in the dataset.
+ *
+ * For more information see:
+ *
+ * Jerome H. Friedman, Jon Luis Bentley, Raphael Ari Finkel (1977). An Algorithm for Finding Best Matches in Logarithmic Expected Time. ACM Transactions on Mathematical Software. 3(3):209-226.
+ *
+ * Andrew Moore (1991). A tutorial on kd-trees. + *

+ + * + + * BibTeX: + *

+ * @article{Friedman1977,
+ *    author = {Jerome H. Friedman and Jon Luis Bentley and Raphael Ari Finkel},
+ *    journal = {ACM Transactions on Mathematical Software},
+ *    month = {September},
+ *    number = {3},
+ *    pages = {209-226},
+ *    title = {An Algorithm for Finding Best Matches in Logarithmic Expected Time},
+ *    volume = {3},
+ *    year = {1977}
+ * }
+ * 
+ * @techreport{Moore1991,
+ *    author = {Andrew Moore},
+ *    booktitle = {University of Cambridge Computer Laboratory Technical Report No. 209},
+ *    howpublished = {Extract from PhD Thesis},
+ *    title = {A tutorial on kd-trees},
+ *    year = {1991},
+ *    HTTP = {Available from http://www.autonlab.org/autonweb/14665.html}
+ * }
+ * 
+ *

+ + * + + * Valid options are:

+ * + *

 -S <classname and options>
+ *  Node splitting method to use.
+ *  (default: gaknn.core.kdtree.SlidingMidPointOfWidestSide)
+ * + *
 -W <value>
+ *  Set minimal width of a box
+ *  (default: 1.0E-2).
+ * + *
 -L
+ *  Maximal number of instances in a leaf
+ *  (default: 40).
+ * + *
 -N
+ *  Normalizing will be done
+ *  (Select dimension for split, with normalising to universe).
+ * + + * + * @author Gabi Schmidberger (gabi[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @author Malcolm Ware (mfw4[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.3 $ + */ +public class KDTree + extends NearestNeighbourSearch + { + + /** For serialization. */ + private static final long serialVersionUID = 1505717283763272533L; + + /** + * Array holding the distances of the nearest neighbours. It is filled up both + * by nearestNeighbour() and kNearestNeighbours(). + */ + protected double[] m_DistanceList; + + /** + * Indexlist of the instances of this kdtree. Instances get sorted according + * to the splits. the nodes of the KDTree just hold their start and end + * indices + */ + protected int[] m_InstList; + + /** The root node of the tree. */ + protected KDTreeNode m_Root; + + /** The node splitter. */ + protected KDTreeNodeSplitter m_Splitter = new SlidingMidPointOfWidestSide(); + + /** Tree stats. */ + protected int m_NumNodes, m_NumLeaves, m_MaxDepth; + + /** Tree Stats variables. */ + protected TreePerformanceStats m_TreeStats = null; + + // Constants + /** The index of MIN value in attributes' range array. */ + public static final int MIN = EuclideanDistance.R_MIN; + + /** The index of MAX value in attributes' range array. */ + public static final int MAX = EuclideanDistance.R_MAX; + + /** The index of WIDTH (MAX-MIN) value in attributes' range array. */ + public static final int WIDTH = EuclideanDistance.R_WIDTH; + + /** keep the weights. */ + //@author thimal + protected static double[] m_Weights; + + + /** + * Creates a new instance of KDTree. + */ + public KDTree() { + super(); + if (getMeasurePerformance()) + m_Stats = m_TreeStats = new TreePerformanceStats(); + } + + /** + * Creates a new instance of KDTree. + * It also builds the tree on supplied set of Instances. + * @param insts The instances/points on which the BallTree + * should be built on. 
+ */ + public KDTree(Instances insts) { + super(insts); + if (getMeasurePerformance()) + m_Stats = m_TreeStats = new TreePerformanceStats(); + } + + /** + * Builds the KDTree on the supplied set of instances/points. It + * is adviseable to run the replace missing attributes filter + * on the passed instances first. + * NOTE: This method should not be called from outside this + * class. Outside classes should call setInstances(Instances) + * instead. + * + * @param instances The instances to build the tree on + * @throws Exception if something goes wrong + */ + protected void buildKDTree(Instances instances) throws Exception { + + checkMissing(instances); + if (m_EuclideanDistance == null) + m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance( + instances,m_Weights); + else + m_EuclideanDistance.setInstances(instances); + + m_Instances = instances; + int numInst = m_Instances.Size(); + + // Make the global index list + m_InstList = new int[numInst]; + + for (int i = 0; i < numInst; i++) { + m_InstList[i] = i; + } + + double[][] universe = m_EuclideanDistance.getRanges(); + + // initializing internal fields of KDTreeSplitter + m_Splitter.setInstances(m_Instances); + m_Splitter.setInstanceList(m_InstList); + m_Splitter.setEuclideanDistanceFunction(m_EuclideanDistance); + m_Splitter.setNodeWidthNormalization(m_NormalizeNodeWidth); + + // building tree + m_NumNodes = m_NumLeaves = 1; + m_MaxDepth = 0; + m_Root = new KDTreeNode(m_NumNodes, 0, m_Instances.Size() - 1, + universe); + + splitNodes(m_Root, universe, m_MaxDepth + 1); + } + + /** + * Recursively splits nodes of a tree starting from the supplied node. + * The splitting stops for any node for which the number of instances/points + * falls below a given threshold (given by m_MaxInstInLeaf), or if the + * maximum relative width/range of the instances/points + * (i.e. max_i(max(att_i) - min(att_i)) ) falls below a given threshold + * (given by m_MinBoxRelWidth). 
+ * + * @param node The node to start splitting from. + * @param universe The attribute ranges of the whole dataset. + * @param depth The depth of the supplied node. + * @throws Exception If there is some problem + * splitting. + */ + protected void splitNodes(KDTreeNode node, double[][] universe, + int depth) throws Exception { + double[][] nodeRanges = m_EuclideanDistance.initializeRanges(m_InstList, + node.m_Start, node.m_End); + if (node.numInstances() <= m_MaxInstInLeaf + || getMaxRelativeNodeWidth(nodeRanges, universe) <= m_MinBoxRelWidth) + return; + + // splitting a node so it is no longer a leaf + m_NumLeaves--; + + if (depth > m_MaxDepth) + m_MaxDepth = depth; + + m_Splitter.splitNode(node, m_NumNodes, nodeRanges, universe); + m_NumNodes += 2; + m_NumLeaves += 2; + + splitNodes(node.m_Left, universe, depth + 1); + splitNodes(node.m_Right, universe, depth + 1); + } + + /** + * Returns (in the supplied heap object) the k nearest + * neighbours of the given instance starting from the give + * tree node. >k neighbours are returned if there are more than + * one neighbours at the kth boundary. NOTE: This method should + * not be used from outside this class. Outside classes should + * call kNearestNeighbours(Instance, int). + * + * @param target The instance to find the nearest neighbours for. + * @param node The KDTreeNode to start the search from. + * @param k The number of neighbours to find. + * @param heap The MyHeap object to store/update the kNNs found + * during the search. + * @param distanceToParents The distance of the supplied target + * to the parents of the supplied tree node. + * @throws Exception if the nearest neighbour could not be found. 
+ */ + protected void findNearestNeighbours(Instance target, KDTreeNode node, int k, + MyHeap heap, double distanceToParents) throws Exception { + if (node.isALeaf()) { + if (m_TreeStats != null) { + m_TreeStats.updatePointCount(node.numInstances()); + m_TreeStats.incrLeafCount(); + } + double distance; + // look at all the instances in this leaf + for (int idx = node.m_Start; idx <= node.m_End; idx++) { + if (target == m_Instances.instance(m_InstList[idx])) // for + // hold-one-out + // cross-validation + continue; + if (heap.size() < k) { + distance = m_EuclideanDistance.distance(target, m_Instances + .instance(m_InstList[idx]), Double.POSITIVE_INFINITY, m_Stats); + heap.put(m_InstList[idx], distance); + } else { + MyHeapElement temp = heap.peek(); + distance = m_EuclideanDistance.distance(target, m_Instances + .instance(m_InstList[idx]), temp.distance, m_Stats); + if (distance < temp.distance) { + heap.putBySubstitute(m_InstList[idx], distance); + } else if (distance == temp.distance) { + heap.putKthNearest(m_InstList[idx], distance); + } + }// end else heap.size==k + }// end for + + } else { + if (m_TreeStats != null) { + m_TreeStats.incrIntNodeCount(); + } + KDTreeNode nearer, further; + boolean targetInLeft = m_EuclideanDistance.valueIsSmallerEqual(target, + node.m_SplitDim, node.m_SplitValue); + + if (targetInLeft) { + nearer = node.m_Left; + further = node.m_Right; + } else { + nearer = node.m_Right; + further = node.m_Left; + } + findNearestNeighbours(target, nearer, k, heap, distanceToParents); + + // ... now look in further half if maxDist reaches into it + if (heap.size() < k) { // if haven't found the first k + double distanceToSplitPlane = distanceToParents + + m_EuclideanDistance.sqDifference(node.m_SplitDim, target + .GetElementAt(node.m_SplitDim), node.m_SplitValue); + findNearestNeighbours(target, further, k, heap, distanceToSplitPlane); + return; + } else { // else see if ball centered at query intersects with the other + // side. 
+ double distanceToSplitPlane = distanceToParents + + m_EuclideanDistance.sqDifference(node.m_SplitDim, target + .GetElementAt(node.m_SplitDim), node.m_SplitValue); + if (heap.peek().distance >= distanceToSplitPlane) { + findNearestNeighbours(target, further, k, heap, distanceToSplitPlane); + } + }// end else + }// end else_if an internal node + } + + /** + * Returns the k nearest neighbours of the supplied instance. + * >k neighbours are returned if there are more than one + * neighbours at the kth boundary. + * + * @param target The instance to find the nearest neighbours for. + * @param k The number of neighbours to find. + * @return The k nearest neighbours (or >k if more there are than + * one neighbours at the kth boundary). + * @throws Exception if the nearest neighbour could not be found. + */ + public Instances kNearestNeighbours(Instance target, int k) throws Exception { + checkMissing(target); + + if (m_Stats != null) + m_Stats.searchStart(); + + MyHeap heap = new MyHeap(k); + findNearestNeighbours(target, m_Root, k, heap, 0.0); + + if (m_Stats != null) + m_Stats.searchFinish(); + + Instances neighbours = new Instances(m_Instances, (heap.size() + heap.noOfKthNearest())); + m_DistanceList = new double[heap.size() + heap.noOfKthNearest()]; + int[] indices = new int[heap.size() + heap.noOfKthNearest()]; + int i = indices.length - 1; + MyHeapElement h; + while (heap.noOfKthNearest() > 0) { + h = heap.getKthNearest(); + indices[i] = h.index; + m_DistanceList[i] = h.distance; + i--; + } + while (heap.size() > 0) { + h = heap.get(); + indices[i] = h.index; + m_DistanceList[i] = h.distance; + i--; + } + m_DistanceFunction.postProcessDistances(m_DistanceList); + for (int idx = 0; idx < indices.length; idx++) { + neighbours.add(m_Instances.instance(indices[idx])); + } + + return neighbours; + } + + + /** + * Returns the nearest neighbour of the supplied target + * instance. + * + * @param target The instance to find the nearest neighbour for. 
+ * @return The nearest neighbour from among the previously + * supplied training instances. + * @throws Exception if the neighbours could not be found. + */ + public Instance nearestNeighbour(Instance target) throws Exception { + return (kNearestNeighbours(target, 1)).instance(0); + } + + /** + * Returns the distances to the kNearest or 1 nearest neighbour currently + * found with either the kNearestNeighbours or the nearestNeighbour method. + * + * @return array containing the distances of the + * nearestNeighbours. The length and ordering of the array + * is the same as that of the instances returned by + * nearestNeighbour functions. + * @throws Exception if called before calling kNearestNeighbours or + * nearestNeighbours. + */ + public double[] getDistances() throws Exception { + if (m_Instances == null || m_DistanceList == null) + throw new Exception("The tree has not been supplied with a set of " + + "instances or getDistances() has been called " + + "before calling kNearestNeighbours()."); + return m_DistanceList; + } + + + /** + * Builds the KDTree on the given set of instances. + * @param instances The insts on which the KDTree is to be + * built. + * @throws Exception If some error occurs while + * building the KDTree + */ + public void setInstances(Instances instances) throws Exception { + super.setInstances(instances); + buildKDTree(instances); + } + + + /** + * Adds one instance to the KDTree. This updates the KDTree structure to take + * into account the newly added training instance. + * + * @param instance the instance to be added. Usually the newly added instance in the + * training set. + * @throws Exception If the instance cannot be added. + */ + public void update(Instance instance) throws Exception { // better to change + // to addInstance + if (m_Instances == null) + throw new Exception("No instances supplied yet. 
Have to call " + + "setInstances(instances) with a set of Instances " + "first."); + + addInstanceInfo(instance); + addInstanceToTree(instance, m_Root); + } + + /** + * Recursively adds an instance to the tree starting from + * the supplied KDTreeNode. + * NOTE: This should not be called by outside classes, + * outside classes should instead call update(Instance) + * method. + * + * @param inst The instance to add to the tree + * @param node The node to start the recursive search + * from, for the leaf node where the supplied instance + * would go. + * @throws Exception If some error occurs while adding + * the instance. + */ + protected void addInstanceToTree(Instance inst, KDTreeNode node) + throws Exception { + if (node.isALeaf()) { + int instList[] = new int[m_Instances.Size()]; + try { + System.arraycopy(m_InstList, 0, instList, 0, node.m_End + 1); // m_InstList.squeezeIn(m_End, + // index); + if (node.m_End < m_InstList.length - 1) + System.arraycopy(m_InstList, node.m_End + 1, instList, + node.m_End + 2, m_InstList.length - node.m_End - 1); + instList[node.m_End + 1] = m_Instances.Size() - 1; + } catch (ArrayIndexOutOfBoundsException ex) { + System.err.println("m_InstList.length: " + m_InstList.length + + " instList.length: " + instList.length + "node.m_End+1: " + + (node.m_End + 1) + "m_InstList.length-node.m_End+1: " + + (m_InstList.length - node.m_End - 1)); + throw ex; + } + m_InstList = instList; + + node.m_End++; + node.m_NodeRanges = m_EuclideanDistance.updateRanges(inst, + node.m_NodeRanges); + + m_Splitter.setInstanceList(m_InstList); + + // split this leaf node if necessary + double[][] universe = m_EuclideanDistance.getRanges(); + if (node.numInstances() > m_MaxInstInLeaf + && getMaxRelativeNodeWidth(node.m_NodeRanges, universe) > m_MinBoxRelWidth) { + m_Splitter.splitNode(node, m_NumNodes, node.m_NodeRanges, universe); + m_NumNodes += 2; + } + }// end if node is a leaf + else { + if (m_EuclideanDistance.valueIsSmallerEqual(inst, node.m_SplitDim, 
+ node.m_SplitValue)) { + addInstanceToTree(inst, node.m_Left); + afterAddInstance(node.m_Right); + } else + addInstanceToTree(inst, node.m_Right); + + node.m_End++; + node.m_NodeRanges = m_EuclideanDistance.updateRanges(inst, + node.m_NodeRanges); + } + } + + /** + * Corrects the start and end indices of a + * KDTreeNode after an instance is added to + * the tree. The start and end indices for + * the master index array (m_InstList) + * stored in the nodes need to be updated + * for all nodes in the subtree on the + * right of a node where the instance + * was added. + * NOTE: No outside class should call this + * method. + * + * @param node KDTreeNode whose start and end indices + * need to be updated. + */ + protected void afterAddInstance(KDTreeNode node) { + node.m_Start++; + node.m_End++; + if (!node.isALeaf()) { + afterAddInstance(node.m_Left); + afterAddInstance(node.m_Right); + } + } + + /** + * Adds one instance to KDTree loosly. It only changes the ranges in + * EuclideanDistance, and does not affect the structure of the KDTree. + * + * @param instance the new instance. Usually this is the test instance + * supplied to update the range of attributes in the distance function. + */ + public void addInstanceInfo(Instance instance) { + m_EuclideanDistance.updateRanges(instance); + } + + /** + * Checks if there is any instance with missing values. Throws an exception if + * there is, as KDTree does not handle missing values. + * + * @param instances the instances to check + * @throws Exception if missing values are encountered + */ + protected void checkMissing(Instances instances) throws Exception { + for (int i = 0; i < instances.Size(); i++) { + Instance ins = instances.instance(i); + for (int j = 0; j < ins.numValues(); j++) { + if (ins.index(j) != ins.GetClassIndex()) + if (ins.isMissing(j)) { + throw new Exception("ERROR: KDTree can not deal with missing " + + "values. 
Please run ReplaceMissingValues filter " + + "on the dataset before passing it on to the KDTree."); + } + } + } + } + + /** + * Checks if there is any missing value in the given + * instance. + * @param ins The instance to check missing values in. + * @throws Exception If there is a missing value in the + * instance. + */ + protected void checkMissing(Instance ins) throws Exception { + for (int j = 0; j < ins.numValues(); j++) { + if (ins.index(j) != ins.GetClassIndex()) + if (ins.isMissing(j)) { + throw new Exception("ERROR: KDTree can not deal with missing " + + "values. Please run ReplaceMissingValues filter " + + "on the dataset before passing it on to the KDTree."); + } + } + } + + /** + * Returns the maximum attribute width of instances/points + * in a KDTreeNode relative to the whole dataset. + * + * @param nodeRanges The attribute ranges of the + * KDTreeNode whose maximum relative width is to be + * determined. + * @param universe The attribute ranges of the whole + * dataset (training instances + test instances so + * far encountered). + * @return The maximum relative width + */ + protected double getMaxRelativeNodeWidth(double[][] nodeRanges, + double[][] universe) { + int widest = widestDim(nodeRanges, universe); + if(widest < 0) + return 0.0; + else + return nodeRanges[widest][WIDTH] / universe[widest][WIDTH]; + } + + /** + * Returns the widest dimension/attribute in a + * KDTreeNode (widest after normalizing). + * @param nodeRanges The attribute ranges of + * the KDTreeNode. + * @param universe The attribute ranges of the + * whole dataset (training instances + test + * instances so far encountered). + * @return The index of the widest + * dimension/attribute. 
+ */ + protected int widestDim(double[][] nodeRanges, double[][] universe) { + final int classIdx = m_Instances.GetClassIndex(); + double widest = 0.0; + int w = -1; + if (m_NormalizeNodeWidth) { + for (int i = 0; i < nodeRanges.length; i++) { + double newWidest = nodeRanges[i][WIDTH] / universe[i][WIDTH]; + if (newWidest > widest) { + if (i == classIdx) + continue; + widest = newWidest; + w = i; + } + } + } else { + for (int i = 0; i < nodeRanges.length; i++) { + if (nodeRanges[i][WIDTH] > widest) { + if (i == classIdx) + continue; + widest = nodeRanges[i][WIDTH]; + w = i; + } + } + } + return w; + } + + /** + * Returns the size of the tree. + * + * @return the size of the tree + */ + public double measureTreeSize() { + return m_NumNodes; + } + + /** + * Returns the number of leaves. + * + * @return the number of leaves + */ + public double measureNumLeaves() { + return m_NumLeaves; + } + + /** + * Returns the depth of the tree. + * + * @return The depth of the tree + */ + public double measureMaxDepth() { + return m_MaxDepth; + } + + /** + * Returns an enumeration of the additional measure names. + * + * @return an enumeration of the measure names + */ + public Enumeration enumerateMeasures() { + Vector newVector = new Vector(); + newVector.addElement("measureTreeSize"); + newVector.addElement("measureNumLeaves"); + newVector.addElement("measureMaxDepth"); + if (m_Stats != null) { + for (Enumeration e = m_Stats.enumerateMeasures(); e.hasMoreElements();) { + newVector.addElement(e.nextElement()); + } + } + return newVector.elements(); + } + + /** + * Returns the value of the named measure. + * + * @param additionalMeasureName the name of + * the measure to query for its value. + * @return The value of the named measure + * @throws IllegalArgumentException If the named measure + * is not supported. 
+ */ + public double getMeasure(String additionalMeasureName) { + if (additionalMeasureName.compareToIgnoreCase("measureMaxDepth") == 0) { + return measureMaxDepth(); + } else if (additionalMeasureName.compareToIgnoreCase("measureTreeSize") == 0) { + return measureTreeSize(); + } else if (additionalMeasureName.compareToIgnoreCase("measureNumLeaves") == 0) { + return measureNumLeaves(); + } else if (m_Stats != null) { + return m_Stats.getMeasure(additionalMeasureName); + } else { + throw new IllegalArgumentException(additionalMeasureName + + " not supported (KDTree)"); + } + } + + /** + * Sets whether to calculate the performance statistics or not. + * @param measurePerformance Should be true if performance + * statistics are to be measured. + */ + public void setMeasurePerformance(boolean measurePerformance) { + m_MeasurePerformance = measurePerformance; + if (m_MeasurePerformance) { + if (m_Stats == null) + m_Stats = m_TreeStats = new TreePerformanceStats(); + } else + m_Stats = m_TreeStats = null; + } + + /** + * Assigns instances to centers using KDTree. + * + * @param centers the current centers + * @param assignments the centerindex for each instance + * @param pc the threshold value for pruning. + * @throws Exception If there is some problem + * assigning instances to centers. + */ + public void centerInstances(Instances centers, int[] assignments, double pc) + throws Exception { + + int[] centList = new int[centers.Size()]; + for (int i = 0; i < centers.Size(); i++) + centList[i] = i; + + determineAssignments(m_Root, centers, centList, assignments, pc); + } + + /** + * Assigns instances to the current centers called candidates. + * + * @param node The node to start assigning the instances from. + * @param centers all the current centers. + * @param candidates the current centers the method works on. + * @param assignments the center index for each instance. + * @param pc the threshold value for pruning. 
+ * @throws Exception If there is some problem assigning + * instances to centers. + */ + protected void determineAssignments(KDTreeNode node, Instances centers, + int[] candidates, int[] assignments, double pc) throws Exception { + + // reduce number of owners for current hyper rectangle + int[] owners = refineOwners(node, centers, candidates); + + // only one owner + if (owners.length == 1) { + // all instances of this node are owned by one center + for (int i = node.m_Start; i <= node.m_End; i++) { + assignments[m_InstList[i]] // the assignment of this instance + = owners[0]; // is the current owner + } + } else if (!node.isALeaf()) { + // more than one owner and it is not a leaf + determineAssignments(node.m_Left, centers, owners, assignments, pc); + determineAssignments(node.m_Right, centers, owners, assignments, pc); + } else { + // this is a leaf and there are more than 1 owner + // XMeans. + assignSubToCenters(node, centers, owners, assignments); + } + } + + /** + * Refines the ownerlist. + * + * @param node The current tree node. + * @param centers all centers + * @param candidates the indexes of those centers that are candidates. + * @return list of owners + * @throws Exception If some problem occurs in refining. 
+ */ + protected int[] refineOwners(KDTreeNode node, Instances centers, + int[] candidates) throws Exception { + + int[] owners = new int[candidates.length]; + double minDistance = Double.POSITIVE_INFINITY; + int ownerIndex = -1; + Instance owner; + int numCand = candidates.length; + double[] distance = new double[numCand]; + boolean[] inside = new boolean[numCand]; + for (int i = 0; i < numCand; i++) { + distance[i] = distanceToHrect(node, centers.instance(candidates[i])); + inside[i] = (distance[i] == 0.0); + if (distance[i] < minDistance) { + minDistance = distance[i]; + ownerIndex = i; + } + } + owner = new Instance(centers.instance(candidates[ownerIndex])); + + // are there other owners + // loop also goes over already found owner, keeps order + // in owner list + int index = 0; + for (int i = 0; i < numCand; i++) { + // 1. all centers that are points within rectangle are owners + if ((inside[i]) + + // 2. take all points with same distance to the rect. as the owner + || (distance[i] == distance[ownerIndex])) { + + // add competitor to owners list + owners[index++] = candidates[i]; + } else { + + Instance competitor = new Instance(centers.instance(candidates[i])); + if + + // 3. point has larger distance to rectangle but still can compete + // with owner for some points in the rectangle + (!candidateIsFullOwner(node, owner, competitor)) + + { + // also add competitor to owners list + owners[index++] = candidates[i]; + } + } + } + int[] result = new int[index]; + for (int i = 0; i < index; i++) + result[i] = owners[i]; + return result; + } + + /** + * Returns the distance between a point and an hyperrectangle. + * + * @param node The current node from whose hyperrectangle + * the distance is to be measured. + * @param x the point + * @return the distance + * @throws Exception If some problem occurs in determining + * the distance to the hyperrectangle. 
+ */ + protected double distanceToHrect(KDTreeNode node, Instance x) throws Exception { + double distance = 0.0; + + Instance closestPoint = new Instance(x); + boolean inside; + inside = clipToInsideHrect(node, closestPoint); + if (!inside) + distance = m_EuclideanDistance.distance(closestPoint, x); + return distance; + } + + /** + * Finds the closest point in the hyper rectangle to a given point. Change the + * given point to this closest point by clipping of at all the dimensions to + * be clipped of. If the point is inside the rectangle it stays unchanged. The + * return value is true if the point was not changed, so the the return value + * is true if the point was inside the rectangle. + * + * @param node The current KDTreeNode in whose hyperrectangle the closest + * point is to be found. + * @param x a point + * @return true if the input point stayed unchanged. + */ + protected boolean clipToInsideHrect(KDTreeNode node, Instance x) { + boolean inside = true; + for (int i = 0; i < m_Instances.NumAttributes(); i++) { + // TODO treat nominals differently!?? + if (x.GetElementAt(i) < node.m_NodeRanges[i][MIN]) { + x.setValue(i, node.m_NodeRanges[i][MIN]); + inside = false; + } else if (x.GetElementAt(i) > node.m_NodeRanges[i][MAX]) { + x.setValue(i, node.m_NodeRanges[i][MAX]); + inside = false; + } + } + return inside; + } + + /** + * Returns true if candidate is a full owner in respect to a competitor. + *

+   *
+   * The candidate has been the closer point to the current rectangle or even
+   * has been a point within the rectangle. The competitor is competing with the
+   * candidate for a few points out of the rectangle although it is a point
+   * further away from the rectangle than the candidate. The extreme point is the
+   * corner of the rectangle that is furthest away from the candidate towards
+   * the direction of the competitor.
+   *
+   * If the distance from the candidate to this extreme point is smaller than the
+   * distance from the competitor to this extreme point, then it is proven that
+   * none of the points in the rectangle can be owned by the competitor and the
+   * candidate is full owner of the rectangle in respect to this competitor. See
+   * also D. Pelleg and A. Moore's paper 'Accelerating exact k-means Algorithms
+   * with Geometric Reasoning'.
+   *

+ * + * @param node The current KDTreeNode / hyperrectangle. + * @param candidate instance that is candidate to be owner + * @param competitor instance that competes against the candidate + * @return true if candidate is full owner + * @throws Exception If some problem occurs. + */ + protected boolean candidateIsFullOwner(KDTreeNode node, Instance candidate, + Instance competitor) throws Exception { + // get extreme point + Instance extreme = new Instance(candidate); + for (int i = 0; i < m_Instances.NumAttributes(); i++) { + if ((competitor.GetElementAt(i) - candidate.GetElementAt(i)) > 0) { + extreme.setValue(i, node.m_NodeRanges[i][MAX]); + } else { + extreme.setValue(i, node.m_NodeRanges[i][MIN]); + } + } + boolean isFullOwner = m_EuclideanDistance.distance(extreme, candidate) < m_EuclideanDistance + .distance(extreme, competitor); + + return isFullOwner; + } + + /** + * Assigns instances of this node to center. Center to be assign to is decided + * by the distance function. + * + * @param node The KDTreeNode whose instances are to be assigned. + * @param centers all the input centers + * @param centList the list of centers to work with + * @param assignments index list of last assignments + * @throws Exception If there is error assigning the instances. 
+ */ + public void assignSubToCenters(KDTreeNode node, Instances centers, + int[] centList, int[] assignments) throws Exception { + // todo: undecided situations + int numCent = centList.length; + + // WARNING: assignments is "input/output-parameter" + // should not be null and the following should not happen + if (assignments == null) { + assignments = new int[m_Instances.Size()]; + for (int i = 0; i < assignments.length; i++) { + assignments[i] = -1; + } + } + + // set assignments for all instances of this node + for (int i = node.m_Start; i <= node.m_End; i++) { + int instIndex = m_InstList[i]; + Instance inst = m_Instances.instance(instIndex); + // if (instList[i] == 664) System.out.println("664***"); + int newC = m_EuclideanDistance.closestPoint(inst, centers, centList); + // int newC = clusterProcessedInstance(inst, centers); + assignments[instIndex] = newC; + } + } + + /** + * Properties' variables ===================================================== + */ + + /** flag for normalizing. */ + boolean m_NormalizeNodeWidth = true; + + /** The euclidean distance function to use. */ + protected EuclideanDistance m_EuclideanDistance; + { // to make sure we have only one object of EuclideanDistance + if (m_DistanceFunction instanceof EuclideanDistance) + m_EuclideanDistance = (EuclideanDistance) m_DistanceFunction; + else + m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance(); + } + + /** minimal relative width of a KDTree rectangle. */ + protected double m_MinBoxRelWidth = 1.0E-2; + + /** maximal number of instances in a leaf. */ + protected int m_MaxInstInLeaf = 40; + + /** + * the GET and SET - functions =============================================== + */ + + /** + * Tip text for this property. + * + * @return the tip text for this property + */ + public String minBoxRelWidthTipText() { + return "The minimum relative width of the box. 
A node is only made a leaf " + + "if the width of the split dimension of the instances in a node " + + "normalized over the width of the split dimension of all the " + + "instances is less than or equal to this minimum relative width."; + } + + /** + * Sets the minimum relative box width. + * + * @param i the minimum relative box width + */ + public void setMinBoxRelWidth(double i) { + m_MinBoxRelWidth = i; + } + + /** + * Gets the minimum relative box width. + * + * @return the minimum relative box width + */ + public double getMinBoxRelWidth() { + return m_MinBoxRelWidth; + } + + /** + * Tip text for this property. + * + * @return the tip text for this property + */ + public String maxInstInLeafTipText() { + return "The max number of instances in a leaf."; + } + + /** + * Sets the maximum number of instances in a leaf. + * + * @param i the maximum number of instances in a leaf + */ + public void setMaxInstInLeaf(int i) { + m_MaxInstInLeaf = i; + } + + /** + * Get the maximum number of instances in a leaf. + * + * @return the maximum number of instances in a leaf + */ + public int getMaxInstInLeaf() { + return m_MaxInstInLeaf; + } + + /** + * Tip text for this property. + * + * @return the tip text for this property + */ + public String normalizeNodeWidthTipText() { + return "Whether if the widths of the KDTree node should be normalized " + + "by the width of the universe or not. " + + "Where, width of the node is the range of the split attribute " + + "based on the instances in that node, and width of the " + + "universe is the range of the split attribute based on all the " + + "instances (default: false)."; + } + + /** + * Sets the flag for normalizing the widths of a KDTree Node by the width of + * the dimension in the universe. + * + * @param n true to use normalizing. + */ + public void setNormalizeNodeWidth(boolean n) { + m_NormalizeNodeWidth = n; + } + + /** + * Gets the normalize flag. 
+   *
+   * @return True if normalizing
+   */
+  public boolean getNormalizeNodeWidth() {
+    return m_NormalizeNodeWidth;
+  }
+
+  /**
+   * Returns the distance function currently in use. This is always the
+   * EuclideanDistance object held by the tree.
+   *
+   * @return the distance function
+   */
+  public DistanceFunction getDistanceFunction() {
+    return (DistanceFunction) m_EuclideanDistance;
+  }
+
+  /**
+   * Sets the distance function to use for nearest neighbour search.
+   * Only EuclideanDistance objects are accepted; on success both the
+   * general and the euclidean distance reference point to the supplied object.
+   *
+   * @param df the distance function to use
+   * @throws Exception if df is not an EuclideanDistance
+   */
+  public void setDistanceFunction(DistanceFunction df) throws Exception {
+    if (!(df instanceof EuclideanDistance))
+      throw new Exception("KDTree currently only works with "
+          + "EuclideanDistanceFunction.");
+    m_DistanceFunction = m_EuclideanDistance = (EuclideanDistance) df;
+  }
+
+  /**
+   * Returns the tip text for this property.
+   *
+   * @return tip text for this property suitable for
+   * displaying in the explorer/experimenter gui
+   */
+  public String nodeSplitterTipText() {
+    return "The the splitting method to split the nodes of the KDTree.";
+  }
+
+  /**
+   * Returns the splitting method currently in use to split the nodes of the
+   * KDTree.
+   *
+   * @return The KDTreeNodeSplitter currently in use.
+   */
+  public KDTreeNodeSplitter getNodeSplitter() {
+    return m_Splitter;
+  }
+
+  /**
+   * Sets the splitting method to use to split the nodes of the KDTree.
+   *
+   * @param splitter The KDTreeNodeSplitter to use.
+   */
+  public void setNodeSplitter(KDTreeNodeSplitter splitter) {
+    m_Splitter = splitter;
+  }
+
+  /**
+   * Sets the weights. The array is stored in this object and is also
+   * handed on to the euclidean distance function.
+   *
+   * @param weights double array of weights.
+   */
+  public void SetWeights(double[] weights){
+    m_Weights = weights;
+    m_EuclideanDistance.SetWeights(weights);
+  }
+
+  /**
+   * Returns a string describing this nearest neighbour search algorithm.
+ * + * @return a description of the algorithm for displaying in the + * explorer/experimenter gui + */ +// public String globalInfo() { +// return +// "Class implementing the KDTree search algorithm for nearest " +// + "neighbour search.\n" +// + "The connection to dataset is only a reference. For the tree " +// + "structure the indexes are stored in an array. \n" +// + "Building the tree:\n" +// + "If a node has (option -L) instances no " +// + "further splitting is done. Also if the split would leave one " +// + "side empty, the branch is not split any further even if the " +// + "instances in the resulting node are more than " +// + " instances.\n" +// + "**PLEASE NOTE:** The algorithm can not handle missing values, so it " +// + "is advisable to run ReplaceMissingValues filter if there are any " +// + "missing values in the dataset.\n\n" +// + "For more information see:\n\n" +// + getTechnicalInformation().toString(); +// } +// +// /** +// * Returns an enumeration describing the available options. +// * +// * @return an enumeration of all the available options. +// */ +// public Enumeration listOptions() { +// Vector newVector = new Vector(); +// +// newVector.add(new Option( +// "\tNode splitting method to use.\n" +// + "\t(default: weka.core.neighboursearch.kdtrees.SlidingMidPointOfWidestSide)", +// "S", 1, "-S ")); +// +// newVector.addElement(new Option( +// "\tSet minimal width of a box\n" +// + "\t(default: 1.0E-2).", +// "W", 0, "-W ")); +// +// newVector.addElement(new Option( +// "\tMaximal number of instances in a leaf\n" +// + "\t(default: 40).", +// "L", 0, "-L")); +// +// newVector.addElement(new Option( +// "\tNormalizing will be done\n" +// + "\t(Select dimension for split, with normalising to universe).", +// "N", 0, "-N")); +// +// return newVector.elements(); +// } + + /** + * Parses a given list of options.

+ * + + * Valid options are:

+ * + *

 -S <classname and options>
+   *  Node splitting method to use.
+   *  (default: weka.core.neighboursearch.kdtrees.SlidingMidPointOfWidestSide)
+ * + *
 -W <value>
+   *  Set minimal width of a box
+   *  (default: 1.0E-2).
+ * + *
 -L
+   *  Maximal number of instances in a leaf
+   *  (default: 40).
+ * + *
 -N
+   *  Normalizing will be done
+   *  (Select dimension for split, with normalising to universe).
+ * + + * + * @param options the list of options as an array of strings + * @throws Exception if an option is not supported + */ +// public void setOptions(String[] options) throws Exception { +// super.setOptions(options); +// +// String optionString = Utils.getOption('S', options); +// if (optionString.length() != 0) { +// String splitMethodSpec[] = Utils.splitOptions(optionString); +// if (splitMethodSpec.length == 0) { +// throw new Exception("Invalid DistanceFunction specification string."); +// } +// String className = splitMethodSpec[0]; +// splitMethodSpec[0] = ""; +// +// setNodeSplitter((KDTreeNodeSplitter) Utils.forName( +// KDTreeNodeSplitter.class, className, splitMethodSpec)); +// } +// else { +// setNodeSplitter(new SlidingMidPointOfWidestSide()); +// } +// +// optionString = Utils.getOption('W', options); +// if (optionString.length() != 0) +// setMinBoxRelWidth(Double.parseDouble(optionString)); +// else +// setMinBoxRelWidth(1.0E-2); +// +// optionString = Utils.getOption('L', options); +// if (optionString.length() != 0) +// setMaxInstInLeaf(Integer.parseInt(optionString)); +// else +// setMaxInstInLeaf(40); +// +// setNormalizeNodeWidth(Utils.getFlag('N', options)); +// } + + /** + * Gets the current settings of KDtree. 
+ * + * @return an array of strings suitable for passing to setOptions + */ +// public String[] getOptions() { +// Vector result; +// String[] options; +// int i; +// +// result = new Vector(); +// +// options = super.getOptions(); +// for (i = 0; i < options.length; i++) +// result.add(options[i]); +// +// result.add("-S"); +// result.add( +// (m_Splitter.getClass().getName() + " " + +// Utils.joinOptions(m_Splitter.getOptions())).trim()); +// +// result.add("-W"); +// result.add("" + getMinBoxRelWidth()); +// +// result.add("-L"); +// result.add("" + getMaxInstInLeaf()); +// +// if (getNormalizeNodeWidth()) +// result.add("-N"); +// +// return result.toArray(new String[result.size()]); +// } +// +// /** +// * Returns the revision string. +// * +// * @return the revision +// */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.3 $"); +// } +} diff --git a/src/gaknn/core/kdtree/KDTreeNode.java b/src/gaknn/core/kdtree/KDTreeNode.java new file mode 100644 index 0000000..d52b11a --- /dev/null +++ b/src/gaknn/core/kdtree/KDTreeNode.java @@ -0,0 +1,182 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* + * KDTreeNode.java + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + */ + +package gaknn.core.kdtree; + +//import weka.core.RevisionHandler; +//import weka.core.RevisionUtils; + +import java.io.Serializable; + +/** + * A class representing a KDTree node. A node does not explicitly + * store the instances that it contains. Instead, it only stores + * the start and end index of a portion in a master index array. Each + * node is assigned a portion in the master index array that stores + * the indices of the instances that the node contains. Every time a + * node is split by the KDTree's contruction method, the instances of + * its left child are moved to the left and the instances of its + * right child are moved to the right, in the portion of the master + * index array belonging to the node. The start and end index in each + * of its children are then set accordingly within that portion so + * that each have their own portion which contains their instances. + * P.S.: The master index array is only stored in KDTree class. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ +public class KDTreeNode + implements Serializable { + + /** for serialization. */ + private static final long serialVersionUID = -3660396067582792648L; + + /** node number (only for debug). */ + public int m_NodeNumber; + + /** left subtree; contains instances with smaller or equal to split value. */ + public KDTreeNode m_Left = null; + + /** right subtree; contains instances with larger than split value. */ + public KDTreeNode m_Right = null; + + /** value to split on. */ + public double m_SplitValue; + + /** attribute to split on. */ + public int m_SplitDim; + + /** + * lowest and highest value and width (= high - low) for each + * dimension. + */ + public double[][] m_NodeRanges; + + /** + * The lo and high bounds of the hyper rectangle described by the + * node. 
+ */ + public double[][] m_NodesRectBounds; + + /** + * The start index of the portion of the master index array, + * which stores the indices of the instances/points the node + * contains. + */ + public int m_Start = 0; + + /** + * The end index of the portion of the master index array, + * which stores indices of the instances/points the node + * contains. + */ + public int m_End = 0; + + /** + * Constructor. + */ + public KDTreeNode() {} + + /** + * Constructor. + * + * @param nodeNum The node number/id. + * @param startidx The start index of node's portion + * in master index array. + * @param endidx The end index (inclusive) of node's portion + * in master index array. + * @param nodeRanges The attribute ranges of the + * Instances/points contained in this node. + */ + public KDTreeNode(int nodeNum, int startidx, int endidx, double[][] nodeRanges) { + m_NodeNumber = nodeNum; + m_Start = startidx; m_End = endidx; + m_NodeRanges = nodeRanges; + } + + /** + * Constructor that also stores the bounds of the node's hyper rectangle. + * @param nodeNum The node number/id. + * @param startidx The start index of node's portion + * in master index array. + * @param endidx The end index (inclusive) of node's portion + * in master index array. + * @param nodeRanges The attribute ranges of the + * Instances/points contained in this node. + * @param rectBounds The range of the rectangular + * region in the point space that this node + * represents (points inside this rectangular + * region can have different range). + */ + public KDTreeNode(int nodeNum, int startidx, int endidx, double[][] nodeRanges, double[][] rectBounds) { + m_NodeNumber = nodeNum; + m_Start = startidx; m_End = endidx; + m_NodeRanges = nodeRanges; + m_NodesRectBounds = rectBounds; + } + + /** + * Gets the splitting dimension. + * + * @return splitting dimension + */ + public int getSplitDim() { + return m_SplitDim; + } + + /** + * Gets the splitting value. + * + * @return splitting value + */ + public double getSplitValue() { + return m_SplitValue; + } + + /** + * Checks if node is a leaf. 
+ * + * @return true if it is a leaf + */ + public boolean isALeaf() { + return (m_Left == null); + } + + /** + * Returns the number of Instances + * in the rectangular region defined + * by this node. + * @return The number of instances in + * this KDTreeNode. + */ + public int numInstances() { + return (m_End-m_Start+1); + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } +} diff --git a/src/gaknn/core/kdtree/KDTreeNodeSplitter.java b/src/gaknn/core/kdtree/KDTreeNodeSplitter.java new file mode 100644 index 0000000..60bc37f --- /dev/null +++ b/src/gaknn/core/kdtree/KDTreeNodeSplitter.java @@ -0,0 +1,253 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * KDTreeNodeSplitter.java + * Copyright (C) 1999-2007 University of Waikato + */ + +package gaknn.core.kdtree; + +import gaknn.core.kdtree.EuclideanDistance; +import gaknn.core.Instances; +//import weka.core.OptionHandler; +//import weka.core.RevisionHandler; +//import weka.core.RevisionUtils; + +import java.io.Serializable; +import java.util.Enumeration; +import java.util.Vector; + +/** + * Class that splits up a KDTreeNode. + * + * @author Ashraf M. 
Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ +public abstract class KDTreeNodeSplitter + implements Serializable { + + /** The instances that'll be used for tree construction. */ + protected Instances m_Instances; + + /** The distance function used for building the tree. */ + protected EuclideanDistance m_EuclideanDistance; + + /** + * The master index array that'll be reshuffled as nodes + * are split and the tree is constructed. + */ + protected int[] m_InstList; + + /** + * Stores whether the width of a KDTree + * node is normalized or not. + */ + protected boolean m_NormalizeNodeWidth; + + // Constants + /** Index of min value in an array of attributes' range. */ + public static final int MIN = EuclideanDistance.R_MIN; + + /** Index of max value in an array of attributes' range. */ + public static final int MAX = EuclideanDistance.R_MAX; + + /** Index of width value (max-min) in an array of attributes' range. */ + public static final int WIDTH = EuclideanDistance.R_WIDTH; + + /** + * Default constructor. + */ + public KDTreeNodeSplitter() { + } + + /** + * Creates a new instance of KDTreeNodeSplitter. + * @param instList Reference of the master index array. + * @param insts The set of training instances on which + * the tree is built. + * @param e The EuclideanDistance object that is used + * in tree construction. + */ + public KDTreeNodeSplitter(int[] instList, Instances insts, EuclideanDistance e) { + m_InstList = instList; + m_Instances = insts; + m_EuclideanDistance = e; + } + + /** + * Returns an enumeration describing the available options. + * + * @return an enumeration of all the available options. + */ + public Enumeration listOptions() { + return new Vector().elements(); + } + + /** + * Parses a given list of options. 
+ * + * @param options the list of options as an array of strings + * @throws Exception if an option is not supported + */ + public void setOptions(String[] options) throws Exception { + } + + /** + * Gets the current settings of the object. + * + * @return an array of strings suitable for passing to setOptions + */ + public String[] getOptions() { + return new String[0]; + } + + /** + * Checks whether an object of this class has been correctly + * initialized. Performs checks to see if all the necessary + * things (master index array, training instances, distance + * function) have been supplied or not. + * @throws Exception If the object has not been correctly + * initialized. + */ + protected void correctlyInitialized() throws Exception { + if(m_Instances==null) + throw new Exception("No instances supplied."); + else if(m_InstList==null) + throw new Exception("No instance list supplied."); + else if(m_EuclideanDistance==null) + throw new Exception("No Euclidean distance function supplied."); + else if(m_Instances.Size() != m_InstList.length) + throw new Exception("The supplied instance list doesn't seem to match " + + "the supplied instances"); + } + + /** + * Splits a node into two. After splitting two new nodes are created + * and correctly initialised. And, node.left and node.right are + * set appropriately. + * @param node The node to split. + * @param numNodesCreated The number of nodes that so far have been + * created for the tree, so that the newly created nodes are + * assigned correct/meaningful node numbers/ids. + * @param nodeRanges The attributes' range for the points inside + * the node that is to be split. + * @param universe The attributes' range for the whole + * point-space. + * @throws Exception If there is some problem in splitting the + * given node. 
+ */ + public abstract void splitNode(KDTreeNode node, int numNodesCreated, + double[][] nodeRanges, double[][] universe) + throws Exception; + + /** + * Sets the training instances on which the tree is (or is + * to be) built. + * @param inst The training instances. + */ + public void setInstances(Instances inst) { + m_Instances = inst; + } + + /** + * Sets the master index array containing indices of the + * training instances. This array will be rearranged as + * the tree is built, so that each node is assigned a + * portion in this array which contains the instances + * inside the node's region. + * @param instList The master index array. + */ + public void setInstanceList(int[] instList) { + m_InstList = instList; + } + + /** + * Sets the EuclideanDistance object to use for + * splitting nodes. + * @param func The EuclideanDistance object. + */ + public void setEuclideanDistanceFunction(EuclideanDistance func) { + m_EuclideanDistance = func; + } + + /** + * Sets whether a node's region is normalized + * or not. If set to true then, when selecting + * the widest attribute/dimension for splitting, + * the width of each attribute/dimension, + * of the points inside the node's region, is + * divided by the width of that + * attribute/dimension for the whole point-space. + * Thus, each attribute/dimension of that node + * is normalized. + * + * @param normalize Should be true if + * normalization is required. + */ + public void setNodeWidthNormalization(boolean normalize) { + m_NormalizeNodeWidth = normalize; + } + + /** + * Returns the widest dimension. The width of each + * dimension (for the points inside the node) is + * normalized, if m_NormalizeNodeWidth is set to + * true. + * @param nodeRanges The attributes' range of the + * points inside the node that is to be split. + * @param universe The attributes' range for the + * whole point-space. + * @return The index of the attribute/dimension + * in which the points of the node have widest + * spread, or -1 if no non-class dimension has a width above zero. 
+ */ + protected int widestDim(double[][] nodeRanges, double[][] universe) { + final int classIdx = m_Instances.GetClassIndex(); + double widest = 0.0; + int w = -1; + if (m_NormalizeNodeWidth) { + for (int i = 0; i < nodeRanges.length; i++) { + double newWidest = nodeRanges[i][WIDTH] / universe[i][WIDTH]; + if (newWidest > widest) { + if (i == classIdx) + continue; + widest = newWidest; + w = i; + } + } + } else { + for (int i = 0; i < nodeRanges.length; i++) { + if (nodeRanges[i][WIDTH] > widest) { + if (i == classIdx) + continue; + widest = nodeRanges[i][WIDTH]; + w = i; + } + } + } + return w; + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } +} diff --git a/src/gaknn/core/kdtree/KMeansInpiredMethod.java b/src/gaknn/core/kdtree/KMeansInpiredMethod.java new file mode 100644 index 0000000..e4995fc --- /dev/null +++ b/src/gaknn/core/kdtree/KMeansInpiredMethod.java @@ -0,0 +1,382 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* + * KMeansInpiredMethod.java + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + */ + +package gaknn.core.kdtree; + +import gaknn.core.Instance; +import gaknn.core.Instances; +//import gaknn.core.Instances; +//import weka.core.RevisionUtils; +//import weka.core.TechnicalInformation; +//import weka.core.TechnicalInformationHandler; +//import weka.core.TechnicalInformation.Field; +//import weka.core.TechnicalInformation.Type; + +/** + + * The class that splits a node into two such that the overall sum of squared distances of points to their centres on both sides of the (axis-parallel) splitting plane is minimum.
+ *
+ * For more information see also:
+ *
+ * Ashraf Masood Kibriya (2007). Fast Algorithms for Nearest Neighbour Search. Hamilton, New Zealand. + *

+ + * + + * BibTeX: + *

+ * @mastersthesis{Kibriya2007,
+ *    address = {Hamilton, New Zealand},
+ *    author = {Ashraf Masood Kibriya},
+ *    school = {Department of Computer Science, School of Computing and Mathematical Sciences, University of Waikato},
+ *    title = {Fast Algorithms for Nearest Neighbour Search},
+ *    year = {2007}
+ * }
+ * 
+ *

+ + * + + + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ +public class KMeansInpiredMethod + extends KDTreeNodeSplitter + { + + /** for serialization. */ + private static final long serialVersionUID = -866783749124714304L; + + /** + * Returns a string describing this nearest neighbour search algorithm. + * + * @return a description of the algorithm for displaying in the + * explorer/experimenter gui + */ +// public String globalInfo() { +// return +// "The class that splits a node into two such that the overall sum " +// + "of squared distances of points to their centres on both sides " +// + "of the (axis-parallel) splitting plane is minimum.\n\n" +// + "For more information see also:\n\n" +// + getTechnicalInformation().toString(); +// } + + /** + * Returns an instance of a TechnicalInformation object, containing detailed + * information about the technical background of this class, e.g., paper + * reference or book this class is based on. + * + * @return the technical information about this class + */ +// public TechnicalInformation getTechnicalInformation() { +// TechnicalInformation result; +// +// result = new TechnicalInformation(Type.MASTERSTHESIS); +// result.setValue(Field.AUTHOR, "Ashraf Masood Kibriya"); +// result.setValue(Field.TITLE, "Fast Algorithms for Nearest Neighbour Search"); +// result.setValue(Field.YEAR, "2007"); +// result.setValue(Field.SCHOOL, "Department of Computer Science, School of Computing and Mathematical Sciences, University of Waikato"); +// result.setValue(Field.ADDRESS, "Hamilton, New Zealand"); +// +// return result; +// } + + /** + * Splits a node into two such that the overall sum of squared distances + * of points to their centres on both sides of the (axis-parallel) + * splitting plane is minimum. The two nodes created after the whole + * splitting are correctly initialised. And, node.left and node.right + * are set appropriately. 
+ * @param node The node to split. + * @param numNodesCreated The number of nodes that so far have been + * created for the tree, so that the newly created nodes are + * assigned correct/meaningful node numbers/ids. + * @param nodeRanges The attributes' range for the points inside + * the node that is to be split. + * @param universe The attributes' range for the whole + * point-space. + * @throws Exception If there is some problem in splitting the + * given node. + */ + public void splitNode(KDTreeNode node, int numNodesCreated, + double[][] nodeRanges, double[][] universe) throws Exception { + + correctlyInitialized(); + + int splitDim = -1; + double splitVal = Double.NEGATIVE_INFINITY; + + double leftAttSum[] = new double[m_Instances.NumAttributes()], + rightAttSum[] = new double[m_Instances.NumAttributes()], + leftAttSqSum[] = new double[m_Instances.NumAttributes()], + rightAttSqSum[] = new double[m_Instances.NumAttributes()], + rightSqMean, leftSqMean, leftSqSum, rightSqSum, + minSum = Double.POSITIVE_INFINITY, val; + + for (int dim = 0; dim < m_Instances.NumAttributes(); dim++) { + // m_MaxRelativeWidth in KDTree ensure there'll be atleast one dim with + // width > 0.0 + if (node.m_NodeRanges[dim][WIDTH] == 0.0 + || dim == m_Instances.GetClassIndex()) + continue; + + quickSort(m_Instances, m_InstList, dim, node.m_Start, node.m_End); + + for (int i = node.m_Start; i <= node.m_End; i++) { + for (int j = 0; j < m_Instances.NumAttributes(); j++) { + if (j == m_Instances.GetClassIndex()) + continue; + val = m_Instances.instance(m_InstList[i]).valueSparse(j); + if (m_NormalizeNodeWidth) { + if (Double.isNaN(universe[j][MIN]) + || universe[j][MIN] == universe[j][MAX]) + val = 0.0; + else + val = ((val - universe[j][MIN]) / universe[j][WIDTH]); // normalizing + // value + } + if (i == node.m_Start) { + leftAttSum[j] = rightAttSum[j] = leftAttSqSum[j] = rightAttSqSum[j] = 0.0; + } + rightAttSum[j] += val; + rightAttSqSum[j] += val * val; + } + } + + for (int i = 
node.m_Start; i <= node.m_End - 1; i++) { + Instance inst = m_Instances.instance(m_InstList[i]); + leftSqSum = rightSqSum = 0.0; + for (int j = 0; j < m_Instances.NumAttributes(); j++) { + if (j == m_Instances.GetClassIndex()) + continue; + val = inst.GetElementAt(j); + + if (m_NormalizeNodeWidth) { + if (Double.isNaN(universe[j][MIN]) + || universe[j][MIN] == universe[j][MAX]) + val = 0.0; + else + val = ((val - universe[j][MIN]) / universe[j][WIDTH]); // normalizing + // value + } + + leftAttSum[j] += val; + rightAttSum[j] -= val; + leftAttSqSum[j] += val * val; + rightAttSqSum[j] -= val * val; + leftSqMean = leftAttSum[j] / (i - node.m_Start + 1); + leftSqMean *= leftSqMean; + rightSqMean = rightAttSum[j] / (node.m_End - i); + rightSqMean *= rightSqMean; + + leftSqSum += leftAttSqSum[j] - (i - node.m_Start + 1) * leftSqMean; + rightSqSum += rightAttSqSum[j] - (node.m_End - i) * rightSqMean; + } + + if (minSum > (leftSqSum + rightSqSum)) { + minSum = leftSqSum + rightSqSum; + + if (i < node.m_End) + splitVal = (m_Instances.instance(m_InstList[i]).valueSparse(dim) + m_Instances + .instance(m_InstList[i + 1]).valueSparse(dim)) / 2; + else + splitVal = m_Instances.instance(m_InstList[i]).valueSparse(dim); + + splitDim = dim; + } + }// end for instance i + }// end for attribute dim + + int rightStart = rearrangePoints(m_InstList, node.m_Start, node.m_End, + splitDim, splitVal); + + if (rightStart == node.m_Start || rightStart > node.m_End) { + System.out.println("node.m_Start: " + node.m_Start + " node.m_End: " + + node.m_End + " splitDim: " + splitDim + " splitVal: " + splitVal + + " node.min: " + node.m_NodeRanges[splitDim][MIN] + " node.max: " + + node.m_NodeRanges[splitDim][MAX] + " node.numInstances: " + + node.numInstances()); + + if (rightStart == node.m_Start) + throw new Exception("Left child is empty in node " + node.m_NodeNumber + + ". Not possible with " + + "KMeanInspiredMethod splitting method. 
Please " + "check code."); + else + throw new Exception("Right child is empty in node " + node.m_NodeNumber + + ". Not possible with " + + "KMeansInspiredMethod splitting method. Please " + "check code."); + } + + node.m_SplitDim = splitDim; + node.m_SplitValue = splitVal; + node.m_Left = new KDTreeNode(numNodesCreated + 1, node.m_Start, + rightStart - 1, m_EuclideanDistance.initializeRanges(m_InstList, + node.m_Start, rightStart - 1)); + node.m_Right = new KDTreeNode(numNodesCreated + 2, rightStart, node.m_End, + m_EuclideanDistance + .initializeRanges(m_InstList, rightStart, node.m_End)); + } + + /** + * Partitions the instances around a pivot. Used by quicksort and + * kthSmallestValue. + * + * @param insts The instances on which the tree is (or is + * to be) built. + * @param index The master index array containing indices + * of the instances. + * @param attidx The attribution/dimension based on which + * the instances should be partitioned. + * @param l The begining index of the portion of master index + * array that should be partitioned. + * @param r The end index of the portion of master index array + * that should be partitioned. + * @return the index of the middle element + */ + protected static int partition(Instances insts, int[] index, int attidx, int l, int r) { + + double pivot = insts.instance(index[(l + r) / 2]).valueSparse(attidx); + int help; + + while (l < r) { + while ((insts.instance(index[l]).valueSparse(attidx) < pivot) && (l < r)) { + l++; + } + while ((insts.instance(index[r]).valueSparse(attidx) > pivot) && (l < r)) { + r--; + } + if (l < r) { + help = index[l]; + index[l] = index[r]; + index[r] = help; + l++; + r--; + } + } + if ((l == r) && (insts.instance(index[r]).valueSparse(attidx) > pivot)) { + r--; + } + + return r; + } + + /** + * Sorts the instances according to the given attribute/dimension. + * The sorting is done on the master index array and not on the + * actual instances object. 
+ * + * @param insts The instances on which the tree is (or is + * to be) built. + * @param indices The master index array containing indices + * of the instances. + * @param attidx The dimension/attribute based on which + * the instances should be sorted. + * @param left The begining index of the portion of the master + * index array that needs to be sorted. + * @param right The end index of the portion of the master index + * array that needs to be sorted. + */ + protected static void quickSort(Instances insts, int[] indices, int attidx, int left, int right) { + + if (left < right) { + int middle = partition(insts, indices, attidx, left, right); + quickSort(insts, indices, attidx, left, middle); + quickSort(insts, indices, attidx, middle + 1, right); + } + } + + /** + * Method to validate the sorting done by quickSort(). + * + * @param insts The instances on which the tree is (or is + * to be) built. + * @param indices The master index array containing indices + * of the instances. + * @param attidx The dimension/attribute based on which + * the instances should be sorted. + * @param start The start of the portion in master index + * array that needs to be sorted. + * @param end The end of the portion in master index + * array that needs to be sorted. + * @throws Exception If the indices of the instances + * are not in sorted order. 
+ */ + private static void checkSort(Instances insts, int[] indices, int attidx, + int start, int end) throws Exception { + for(int i=start+1; i<=end; i++) { + if( insts.instance(indices[i-1]).valueSparse(attidx) > + insts.instance(indices[i]).valueSparse(attidx) ) { + System.out.println("value[i-1]: "+insts.instance(indices[i-1]).valueSparse(attidx)); + System.out.println("value[i]: "+insts.instance(indices[i]).valueSparse(attidx)); + System.out.println("indices[i-1]: "+indices[i-1]); + System.out.println("indices[i]: "+indices[i]); + System.out.println("i: "+i); + if(insts.instance(indices[i-1]).valueSparse(attidx) > insts.instance(indices[i]).valueSparse(attidx)) + System.out.println("value[i-1] > value[i]"); + + throw new Exception("Indices not sorted correctly."); + }//end if + } + } + + /** + * Re-arranges the indices array so that in the portion of the array + * belonging to the node to be split, the points <= to the splitVal + * are on the left of the portion and those > the splitVal are on the right. + * + * @param indices The master index array. + * @param startidx The begining index of portion of indices that needs + * re-arranging. + * @param endidx The end index of portion of indices that needs + * re-arranging. + * @param splitDim The split dimension/attribute. + * @param splitVal The split value. + * @return The startIdx of the points > the splitVal (the points + * belonging to the right child of the node). + */ + protected int rearrangePoints(int[] indices, final int startidx, final int endidx, + final int splitDim, final double splitVal) { + + int tmp, left = startidx - 1; + for (int i = startidx; i <= endidx; i++) { + if (m_EuclideanDistance.valueIsSmallerEqual(m_Instances + .instance(indices[i]), splitDim, splitVal)) { + left++; + tmp = indices[left]; + indices[left] = indices[i]; + indices[i] = tmp; + }// end valueIsSmallerEqual + }// endfor + return left + 1; + } + + /** + * Returns the revision string. 
+ * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } +} diff --git a/src/gaknn/core/kdtree/NearestNeighbourSearch.java b/src/gaknn/core/kdtree/NearestNeighbourSearch.java new file mode 100644 index 0000000..efd989c --- /dev/null +++ b/src/gaknn/core/kdtree/NearestNeighbourSearch.java @@ -0,0 +1,922 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * NearestNeighbourSearch.java + * Copyright (C) 1999-2007 University of Waikato + */ + +package gaknn.core.kdtree; + +//import weka.core.AdditionalMeasureProducer; +//import weka.core.DistanceFunction; +//import weka.core.EuclideanDistance; +import gaknn.core.Instance; +import gaknn.core.Instances; +//import weka.core.Option; +//import weka.core.OptionHandler; +//import weka.core.RevisionHandler; +//import weka.core.RevisionUtils; +//import weka.core.Utils; + +import java.io.Serializable; +import java.util.Enumeration; +import java.util.Vector; + +/** + * Abstract class for nearest neighbour search. All algorithms (classes) that + * do nearest neighbour search should extend this class. + * + * @author Ashraf M. 
Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ +public abstract class NearestNeighbourSearch + implements Serializable,AdditionalMeasureProducer + { + + /** + * A class for a heap to store the nearest k neighbours to an instance. + * The heap also takes care of cases where multiple neighbours are the same + * distance away. + * i.e. the minimum size of the heap is k. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ + protected class MyHeap + { + + /** the heap. */ + MyHeapElement m_heap[] = null; + + /** + * constructor. + * + * @param maxSize the maximum size of the heap + */ + public MyHeap(int maxSize) { + if((maxSize%2)==0) + maxSize++; + + m_heap = new MyHeapElement[maxSize+1]; + m_heap[0] = new MyHeapElement(0, 0); + } + + /** + * returns the size of the heap. + * + * @return the size + */ + public int size() { + return m_heap[0].index; + } + + /** + * peeks at the first element. + * + * @return the first element + */ + public MyHeapElement peek() { + return m_heap[1]; + } + + /** + * returns the first element and removes it from the heap. + * + * @return the first element + * @throws Exception if no elements in heap + */ + public MyHeapElement get() throws Exception { + if(m_heap[0].index==0) + throw new Exception("No elements present in the heap"); + MyHeapElement r = m_heap[1]; + m_heap[1] = m_heap[m_heap[0].index]; + m_heap[0].index--; + downheap(); + return r; + } + + /** + * adds the value to the heap. 
+ * + * @param i the index + * @param d the distance + * @throws Exception if the heap gets too large + */ + public void put(int i, double d) throws Exception { + if((m_heap[0].index+1)>(m_heap.length-1)) + throw new Exception("the number of elements cannot exceed the "+ + "initially set maximum limit"); + m_heap[0].index++; + m_heap[m_heap[0].index] = new MyHeapElement(i, d); + upheap(); + } + + /** + * Puts an element by substituting it in place of + * the top most element. + * + * @param i the index + * @param d the distance + * @throws Exception if distance is smaller than that of the head + * element + */ + public void putBySubstitute(int i, double d) throws Exception { + MyHeapElement head = get(); + put(i, d); + // System.out.println("previous: "+head.distance+" current: "+m_heap[1].distance); + if(head.distance == m_heap[1].distance) { //Utils.eq(head.distance, m_heap[1].distance)) { + putKthNearest(head.index, head.distance); + } + else if(head.distance > m_heap[1].distance) { //Utils.gr(head.distance, m_heap[1].distance)) { + m_KthNearest = null; + m_KthNearestSize = 0; + initSize = 10; + } + else if(head.distance < m_heap[1].distance) { + throw new Exception("The substituted element is smaller than the "+ + "head element. put() should have been called "+ + "in place of putBySubstitute()"); + } + } + + /** the kth nearest ones. */ + MyHeapElement m_KthNearest[] = null; + + /** The number of kth nearest elements. */ + int m_KthNearestSize = 0; + + /** the initial size of the heap. */ + int initSize=10; + + /** + * returns the number of k nearest. + * + * @return the number of k nearest + * @see #m_KthNearestSize + */ + public int noOfKthNearest() { + return m_KthNearestSize; + } + + /** + * Stores kth nearest elements (if there are + * more than one). 
+ * @param i the index + * @param d the distance + */ + public void putKthNearest(int i, double d) { + if(m_KthNearest==null) { + m_KthNearest = new MyHeapElement[initSize]; + } + if(m_KthNearestSize>=m_KthNearest.length) { + initSize += initSize; + MyHeapElement temp[] = new MyHeapElement[initSize]; + System.arraycopy(m_KthNearest, 0, temp, 0, m_KthNearest.length); + m_KthNearest = temp; + } + m_KthNearest[m_KthNearestSize++] = new MyHeapElement(i, d); + } + + /** + * Removes and returns the most recently stored kth nearest element, or null if none there. + * + * @return the kth nearest element + */ + public MyHeapElement getKthNearest() { + if(m_KthNearestSize==0) + return null; + m_KthNearestSize--; + return m_KthNearest[m_KthNearestSize]; + } + + /** + * Performs the upheap operation: moves the last element up while its + * distance exceeds its parent's, to maintain the heap's properties. + */ + protected void upheap() { + int i = m_heap[0].index; + MyHeapElement temp; + while( i > 1 && m_heap[i].distance>m_heap[i/2].distance) { + temp = m_heap[i]; + m_heap[i] = m_heap[i/2]; + i = i/2; + m_heap[i] = temp; //this is i/2 done here to avoid another division. + } + } + + /** + * Performs the downheap operation: sinks the root below its larger child + * while a child has a greater distance, to maintain the heap's properties. + */ + protected void downheap() { + int i = 1; + MyHeapElement temp; + while( ( (2*i) <= m_heap[0].index && + m_heap[i].distance < m_heap[2*i].distance ) + || + ( (2*i+1) <= m_heap[0].index && + m_heap[i].distance < m_heap[2*i+1].distance) ) { + if((2*i+1)<=m_heap[0].index) { + if(m_heap[2*i].distance>m_heap[2*i+1].distance) { + temp = m_heap[i]; + m_heap[i] = m_heap[2*i]; + i = 2*i; + m_heap[i] = temp; + } + else { + temp = m_heap[i]; + m_heap[i] = m_heap[2*i+1]; + i = 2*i+1; + m_heap[i] = temp; + } + } + else { + temp = m_heap[i]; + m_heap[i] = m_heap[2*i]; + i = 2*i; + m_heap[i] = temp; + } + } + } + + /** + * Returns the total size: heap elements plus stored kth nearest elements. + * + * @return the total size + */ + public int totalSize() { + return size()+noOfKthNearest(); + } + + /** + * Returns the revision string. 
+ * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } + } + + /** + * A class for storing data about a neighboring instance. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ + protected class MyHeapElement + { + + /** the index of this element. */ + public int index; + + /** the distance of this element. */ + public double distance; + + /** + * constructor. + * + * @param i the index + * @param d the distance + */ + public MyHeapElement(int i, double d) { + distance = d; + index = i; + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } + } + + /** + * A class for storing data about a neighboring instance. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ //better to change this into a heap element + protected class NeighborNode + { + + /** The neighbor instance. */ + public Instance m_Instance; + + /** The distance from the current instance to this neighbor. */ + public double m_Distance; + + /** A link to the next neighbor instance. */ + public NeighborNode m_Next; + + /** + * Create a new neighbor node. + * + * @param distance the distance to the neighbor + * @param instance the neighbor instance + * @param next the next neighbor node + */ + public NeighborNode(double distance, Instance instance, NeighborNode next) { + m_Distance = distance; + m_Instance = instance; + m_Next = next; + } + + /** + * Create a new neighbor node that doesn't link to any other nodes. + * + * @param distance the distance to the neighbor + * @param instance the neighbor instance + */ + public NeighborNode(double distance, Instance instance) { + + this(distance, instance, null); + } + + /** + * Returns the revision string. 
+ * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } + } + + /** + * A class for a linked list to store the nearest k neighbours + * to an instance. We use a list so that we can take care of + * cases where multiple neighbours are the same distance away. + * i.e. the minimum length of the list is k. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ //better to change this into a heap + protected class NeighborList + { + + /** The first node in the list. */ + protected NeighborNode m_First; + + /** The last node in the list. */ + protected NeighborNode m_Last; + + /** The number of nodes to attempt to maintain in the list. */ + protected int m_Length = 1; + + /** + * Creates the neighborlist with a desired length. + * + * @param length the length of list to attempt to maintain + */ + public NeighborList(int length) { + m_Length = length; + } + + /** + * Gets whether the list is empty. + * + * @return true if list is empty + */ + public boolean isEmpty() { + return (m_First == null); + } + + /** + * Gets the current length of the list. + * + * @return the current length of the list + */ + public int currentLength() { + int i = 0; + NeighborNode current = m_First; + while (current != null) { + i++; + current = current.m_Next; + } + return i; + } + + /** + * Inserts an instance neighbor into the list, maintaining the list + * sorted by distance. 
+ * + * @param distance the distance to the instance + * @param instance the neighboring instance + */ + public void insertSorted(double distance, Instance instance) { + + if (isEmpty()) { + m_First = m_Last = new NeighborNode(distance, instance); + } else { + NeighborNode current = m_First; + if (distance < m_First.m_Distance) {// Insert at head + m_First = new NeighborNode(distance, instance, m_First); + } else { // Insert further down the list + for( ;(current.m_Next != null) && + (current.m_Next.m_Distance < distance); + current = current.m_Next); + current.m_Next = new NeighborNode(distance, instance, + current.m_Next); + if (current.equals(m_Last)) { + m_Last = current.m_Next; + } + } + + // Trip down the list until we've got k list elements (or more if the + // distance to the last elements is the same). + int valcount = 0; + for(current = m_First; current.m_Next != null; + current = current.m_Next) { + valcount++; + if ((valcount >= m_Length) && (current.m_Distance != + current.m_Next.m_Distance)) { + m_Last = current; + current.m_Next = null; + break; + } + } + } + } + + /** + * Prunes the list to contain the k nearest neighbors. If there are + * multiple neighbors at the k'th distance, all will be kept. + * + * @param k the number of neighbors to keep in the list. + */ + public void pruneToK(int k) { + + if (isEmpty()) { + return; + } + if (k < 1) { + k = 1; + } + int currentK = 0; + double currentDist = m_First.m_Distance; + NeighborNode current = m_First; + for(; current.m_Next != null; current = current.m_Next) { + currentK++; + currentDist = current.m_Distance; + if ((currentK >= k) && (currentDist != current.m_Next.m_Distance)) { + m_Last = current; + current.m_Next = null; + break; + } + } + } + + /** + * Prints out the contents of the neighborlist. 
+ */ + public void printList() { + + if (isEmpty()) { + System.out.println("Empty list"); + } else { + NeighborNode current = m_First; + while (current != null) { + System.out.println("Node: instance " + current.m_Instance + + ", distance " + current.m_Distance); + current = current.m_Next; + } + System.out.println(); + } + } + + /** + * returns the first element in the list. + * + * @return the first element + */ + public NeighborNode getFirst() { + return m_First; + } + + /** + * returns the last element in the list. + * + * @return the last element + */ + public NeighborNode getLast() { + return m_Last; + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } + } + + /** The neighbourhood of instances to find neighbours in. */ + protected Instances m_Instances; + + /** The number of neighbours to find. */ + protected int m_kNN; + + /** the distance function used. */ + protected DistanceFunction m_DistanceFunction = new EuclideanDistance(); + + /** Performance statistics. */ + protected PerformanceStats m_Stats = null; + + /** Should we measure Performance. */ + protected boolean m_MeasurePerformance = false; + + /** + * Constructor. + */ + public NearestNeighbourSearch() { + if(m_MeasurePerformance) + m_Stats = new PerformanceStats(); + } + + /** + * Constructor. + * + * @param insts The set of instances that constitute the neighbourhood. + */ + public NearestNeighbourSearch(Instances insts) { + this(); + m_Instances = insts; + } + + /** + * Returns a string describing this nearest neighbour search algorithm. + * + * @return a description of the algorithm for displaying in the + * explorer/experimenter gui + */ + public String globalInfo() { + return + "Abstract class for nearest neighbour search. 
All algorithms (classes) that " + + "do nearest neighbour search should extend this class."; + } + + /** + * Returns an enumeration describing the available options. + * + * @return an enumeration of all the available options. + */ +// public Enumeration listOptions() { +// Vector newVector = new Vector(); +// +// newVector.add(new Option( +// "\tDistance function to use.\n" +// + "\t(default: weka.core.EuclideanDistance)", +// "A", 1,"-A ")); +// +// newVector.add(new Option( +// "\tCalculate performance statistics.", +// "P", 0,"-P")); +// +// return newVector.elements(); +// } + + /** + * Parses a given list of options. Valid options are: + * + + + * + * @param options the list of options as an array of strings + * @throws Exception if an option is not supported + */ +// public void setOptions(String[] options) throws Exception { +// String nnSearchClass = Utils.getOption('A', options); +// if(nnSearchClass.length() != 0) { +// String nnSearchClassSpec[] = Utils.splitOptions(nnSearchClass); +// if(nnSearchClassSpec.length == 0) { +// throw new Exception("Invalid DistanceFunction specification string."); +// } +// String className = nnSearchClassSpec[0]; +// nnSearchClassSpec[0] = ""; +// +// setDistanceFunction( (DistanceFunction) +// Utils.forName( DistanceFunction.class, +// className, nnSearchClassSpec) ); +// } +// else { +// setDistanceFunction(new EuclideanDistance()); +// } +// +// setMeasurePerformance(Utils.getFlag('P',options)); +// } +// +// /** +// * Gets the current settings. 
+// * +// * @return an array of strings suitable for passing to setOptions() +// */ +// public String [] getOptions() { +// Vector result; +// +// result = new Vector(); +// +// result.add("-A"); +// result.add((m_DistanceFunction.getClass().getName() + " " + +// Utils.joinOptions(m_DistanceFunction.getOptions())).trim()); +// +// if(getMeasurePerformance()) +// result.add("-P"); +// +// return result.toArray(new String[result.size()]); +// } + + /** + * Returns the tip text for this property. + * + * @return tip text for this property suitable for + * displaying in the explorer/experimenter gui + */ + public String distanceFunctionTipText() { + return "The distance function to use for finding neighbours " + + "(default: weka.core.EuclideanDistance). "; + } + + /** + * returns the distance function currently in use. + * + * @return the distance function + */ + public DistanceFunction getDistanceFunction() { + return m_DistanceFunction; + } + + /** + * sets the distance function to use for nearest neighbour search. + * + * @param df the new distance function to use + * @throws Exception if instances cannot be processed + */ + public void setDistanceFunction(DistanceFunction df) throws Exception { + m_DistanceFunction = df; + } + + /** + * Returns the tip text for this property. + * + * @return tip text for this property suitable for + * displaying in the explorer/experimenter gui + */ + public String measurePerformanceTipText() { + return "Whether to calculate performance statistics " + + "for the NN search or not"; + } + + /** + * Gets whether performance statistics are being calculated or not. + * + * @return true if the measure performance is calculated + */ + public boolean getMeasurePerformance() { + return m_MeasurePerformance; + } + + /** + * Sets whether to calculate the performance statistics or not. 
+ * + * @param measurePerformance if true then the performance is calculated + */ + public void setMeasurePerformance(boolean measurePerformance) { + m_MeasurePerformance = measurePerformance; + if(m_MeasurePerformance) { + if(m_Stats==null) + m_Stats = new PerformanceStats(); + } + else + m_Stats = null; + } + + /** + * Returns the nearest instance in the current neighbourhood to the supplied + * instance. + * + * @param target The instance to find the nearest neighbour for. + * @return the nearest neighbor + * @throws Exception if the nearest neighbour could not be found. + */ + public abstract Instance nearestNeighbour(Instance target) throws Exception; + + /** + * Returns k nearest instances in the current neighbourhood to the supplied + * instance. + * + * @param target The instance to find the k nearest neighbours for. + * @param k The number of nearest neighbours to find. + * @return the k nearest neighbors + * @throws Exception if the neighbours could not be found. + */ + public abstract Instances kNearestNeighbours(Instance target, int k) throws Exception; + + /** + * Returns the distances of the k nearest neighbours. The kNearestNeighbours + * or nearestNeighbour needs to be called first for this to work. + * + * @return the distances + * @throws Exception if called before calling kNearestNeighbours + * or nearestNeighbours. + */ + public abstract double[] getDistances() throws Exception; + + /** + * Updates the NearNeighbourSearch algorithm for the new added instance. + * P.S.: The method assumes the instance has already been added to the + * m_Instances object by the caller. + * + * @param ins the instance to add + * @throws Exception if updating fails + */ + public abstract void update(Instance ins) throws Exception; + + /** + * Adds information from the given instance without modifying the + * datastructure a lot. 
+ * + * @param ins the instance to add the information from + */ + public void addInstanceInfo(Instance ins) { + } + + /** + * Sets the instances. + * + * @param insts the instances to use + * @throws Exception if setting fails + */ + public void setInstances(Instances insts) throws Exception { + m_Instances = insts; + } + + /** + * returns the instances currently set. + * + * @return the current instances + */ + public Instances getInstances() { + return m_Instances; + } + + /** + * Gets the class object that contains the performance statistics of + * the search method. + * + * @return the performance statistics + */ + public PerformanceStats getPerformanceStats() { + return m_Stats; + } + + /** + * Returns an enumeration of the additional measure names. + * + * @return an enumeration of the measure names + */ + public Enumeration enumerateMeasures() { + Vector newVector; + if(m_Stats == null) { + newVector = new Vector(0); + } + else { + newVector = new Vector(); + Enumeration en = m_Stats.enumerateMeasures(); + while(en.hasMoreElements()) + newVector.add(en.nextElement()); + } + return newVector.elements(); + } + + /** + * Returns the value of the named measure. + * + * @param additionalMeasureName the name of the measure to query for + * its value + * @return the value of the named measure + * @throws IllegalArgumentException if the named measure is not supported + */ + public double getMeasure(String additionalMeasureName) { + if(m_Stats==null) + throw new IllegalArgumentException(additionalMeasureName + + " not supported (NearestNeighbourSearch)"); + else + return m_Stats.getMeasure(additionalMeasureName); + } + + /** + * sorts the two given arrays. + * + * @param arrayToSort The array sorting should be based on. + * @param linkedArray The array that should have the same ordering as + * arrayToSort. 
+ */ + public static void combSort11(double arrayToSort[], int linkedArray[]) { + int switches, j, top, gap; + double hold1; int hold2; + gap = arrayToSort.length; + do { + gap=(int)(gap/1.3); + switch(gap) { + case 0: + gap = 1; + break; + case 9: + case 10: + gap=11; + break; + default: + break; + } + switches=0; + top = arrayToSort.length-gap; + for(int i=0; i arrayToSort[j]) { + hold1=arrayToSort[i]; + hold2=linkedArray[i]; + arrayToSort[i]=arrayToSort[j]; + linkedArray[i]=linkedArray[j]; + arrayToSort[j]=hold1; + linkedArray[j]=hold2; + switches++; + }//endif + }//endfor + } while(switches>0 || gap>1); + } + + /** + * Partitions the instances around a pivot. Used by quicksort and + * kthSmallestValue. + * + * @param arrayToSort the array of doubles to be sorted + * @param linkedArray the linked array + * @param l the first index of the subset + * @param r the last index of the subset + * @return the index of the middle element + */ + protected static int partition(double[] arrayToSort, double[] linkedArray, int l, int r) { + double pivot = arrayToSort[(l + r) / 2]; + double help; + + while (l < r) { + while ((arrayToSort[l] < pivot) && (l < r)) { + l++; + } + while ((arrayToSort[r] > pivot) && (l < r)) { + r--; + } + if (l < r) { + help = arrayToSort[l]; + arrayToSort[l] = arrayToSort[r]; + arrayToSort[r] = help; + help = linkedArray[l]; + linkedArray[l] = linkedArray[r]; + linkedArray[r] = help; + l++; + r--; + } + } + if ((l == r) && (arrayToSort[r] > pivot)) { + r--; + } + + return r; + } + + /** + * performs quicksort. 
+ * + * @param arrayToSort the array to sort + * @param linkedArray the linked array + * @param left the first index of the subset + * @param right the last index of the subset + */ + public static void quickSort(double[] arrayToSort, double[] linkedArray, int left, int right) { + if (left < right) { + int middle = partition(arrayToSort, linkedArray, left, right); + quickSort(arrayToSort, linkedArray, left, middle); + quickSort(arrayToSort, linkedArray, middle + 1, right); + } + } +} diff --git a/src/gaknn/core/kdtree/NormalizableDistance.java b/src/gaknn/core/kdtree/NormalizableDistance.java new file mode 100644 index 0000000..47854e9 --- /dev/null +++ b/src/gaknn/core/kdtree/NormalizableDistance.java @@ -0,0 +1,829 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * NormalizableDistance.java + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * + */ + +package gaknn.core.kdtree; + +//import weka.core.neighboursearch.PerformanceStats; +import gaknn.core.Attribute; +import gaknn.core.Instance; +import gaknn.core.Instances; +import java.io.Serializable; +import java.util.Enumeration; +import java.util.Vector; + +/** + * Represents the abstract ancestor for normalizable distance functions, like + * Euclidean or Manhattan distance. 
+ * + * @author Fracpete (fracpete at waikato dot ac dot nz) + * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- original code from weka.core.EuclideanDistance + * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz) -- original code from weka.core.EuclideanDistance + * @version $Revision: 1.2 $ + */ +public abstract class NormalizableDistance + implements DistanceFunction, Serializable{ + + /** Index in ranges for MIN. */ + public static final int R_MIN = 0; + + /** Index in ranges for MAX. */ + + public static final int R_MAX = 1; + + /** Index in ranges for WIDTH. */ + public static final int R_WIDTH = 2; + + /** the instances used internally. */ + protected Instances m_Data = null; + + /** True if normalization is turned off (default false).*/ + protected boolean m_DontNormalize = false; + + /** The range of the attributes. */ + protected double[][] m_Ranges; + + /** The range of attributes to use for calculating the distance. */ + protected Range m_AttributeIndices = new Range("first-last"); + + /** The boolean flags, whether an attribute will be used or not. */ + protected boolean[] m_ActiveIndices; + + /** Whether all the necessary preparations have been done. */ + protected boolean m_Validated; + + /** + * Invalidates the distance function, Instances must be still set. + */ + /** Keep the weight elements. */ + //@author thimal + static double[] m_Weights; + + public NormalizableDistance() { + invalidate(); + } + + /** + * Initializes the distance function and automatically initializes the + * ranges. + * + * @param data the instances the distance function should work on + */ + public NormalizableDistance(Instances data) { + setInstances(data); + } + + /** + * Returns a string describing this object. + * + * @return a description of the evaluator suitable for + * displaying in the explorer/experimenter gui + */ + public abstract String globalInfo(); + + /** + * Returns an enumeration describing the available options. 
+ * + * @return an enumeration of all the available options. + */ +// public Enumeration listOptions() { +// Vector result = new Vector(); +// +// result.add(new Option( +// "\tTurns off the normalization of attribute \n" +// + "\tvalues in distance calculation.", +// "D", 0, "-D")); +// +// result.addElement(new Option( +// "\tSpecifies list of columns to used in the calculation of the \n" +// + "\tdistance. 'first' and 'last' are valid indices.\n" +// + "\t(default: first-last)", +// "R", 1, "-R ")); +// +// result.addElement(new Option( +// "\tInvert matching sense of column indices.", +// "V", 0, "-V")); +// +// return result.elements(); +// } + + /** + * Gets the current settings. Returns empty array. + * + * @return an array of strings suitable for passing to setOptions() + */ + public String[] getOptions() { + Vector result; + + result = new Vector(); + + if (getDontNormalize()) + result.add("-D"); + + result.add("-R"); + result.add(getAttributeIndices()); + + if (getInvertSelection()) + result.add("-V"); + + return result.toArray(new String[result.size()]); + } + + /** + * Parses a given list of options. + * + * @param options the list of options as an array of strings + * @throws Exception if an option is not supported + */ +// public void setOptions(String[] options) throws Exception { +// String tmpStr; +// +// setDontNormalize(Utils.getFlag('D', options)); +// +// tmpStr = Utils.getOption('R', options); +// if (tmpStr.length() != 0) +// setAttributeIndices(tmpStr); +// else +// setAttributeIndices("first-last"); +// +// setInvertSelection(Utils.getFlag('V', options)); +// } + + /** + * Returns the tip text for this property. + * + * @return tip text for this property suitable for + * displaying in the explorer/experimenter gui + */ + public String dontNormalizeTipText() { + return "Whether if the normalization of attributes should be turned off " + + "for distance calculation (Default: false i.e. attribute values " + + "are normalized). 
"; + } + + /** + * Sets whether if the attribute values are to be normalized in distance + * calculation. + * + * @param dontNormalize if true the values are not normalized + */ + public void setDontNormalize(boolean dontNormalize) { + m_DontNormalize = dontNormalize; + invalidate(); + } + + /** + * Gets whether if the attribute values are to be normazlied in distance + * calculation. (default false i.e. attribute values are normalized.) + * + * @return false if values get normalized + */ + public boolean getDontNormalize() { + return m_DontNormalize; + } + + /** + * Returns the tip text for this property. + * + * @return tip text for this property suitable for + * displaying in the explorer/experimenter gui + */ + public String attributeIndicesTipText() { + return + "Specify range of attributes to act on. " + + "This is a comma separated list of attribute indices, with " + + "\"first\" and \"last\" valid values. Specify an inclusive " + + "range with \"-\". E.g: \"first-3,5,6-10,last\"."; + } + + /** + * Sets the range of attributes to use in the calculation of the distance. + * The indices start from 1, 'first' and 'last' are valid as well. + * E.g.: first-3,5,6-last + * + * @param value the new attribute index range + */ + public void setAttributeIndices(String value) { + m_AttributeIndices.setRanges(value); + invalidate(); + } + /** + * set the weights + * + * @param double array of weights + */ + //@author thimal + public void SetWeights(double[] weights){ + m_Weights = weights; + } + + /** + * Gets the range of attributes used in the calculation of the distance. + * + * @return the attribute index range + */ + public String getAttributeIndices() { + return m_AttributeIndices.getRanges(); + } + + /** + * Returns the tip text for this property. + * + * @return tip text for this property suitable for + * displaying in the explorer/experimenter gui + */ + public String invertSelectionTipText() { + return + "Set attribute selection mode. 
If false, only selected " + + "attributes in the range will be used in the distance calculation; if " + + "true, only non-selected attributes will be used for the calculation."; + } + + /** + * Sets whether the matching sense of attribute indices is inverted or not. + * + * @param value if true the matching sense is inverted + */ + public void setInvertSelection(boolean value) { + m_AttributeIndices.setInvert(value); + invalidate(); + } + + /** + * Gets whether the matching sense of attribute indices is inverted or not. + * + * @return true if the matching sense is inverted + */ + public boolean getInvertSelection() { + return m_AttributeIndices.getInvert(); + } + + /** + * invalidates all initializations. + */ + protected void invalidate() { + m_Validated = false; + } + + /** + * performs the initializations if necessary. + */ + protected void validate() { + if (!m_Validated) { + initialize(); + m_Validated = true; + } + } + + /** + * initializes the ranges and the attributes being used. + */ + protected void initialize() { + initializeAttributeIndices(); + initializeRanges(); + } + + /** + * initializes the attribute indices. + */ + protected void initializeAttributeIndices() { + m_AttributeIndices.setUpper(m_Data.NumAttributes() - 1); + m_ActiveIndices = new boolean[m_Data.NumAttributes()]; + for (int i = 0; i < m_ActiveIndices.length; i++) + m_ActiveIndices[i] = m_AttributeIndices.isInRange(i); + } + + /** + * Sets the instances. + * + * @param insts the instances to use + */ + public void setInstances(Instances insts) { + m_Data = insts; + invalidate(); + } + + /** + * returns the instances currently set. + * + * @return the current instances + */ + public Instances getInstances() { + return m_Data; + } + + /** + * Does nothing, derived classes may override it though. 
+ * + * @param distances the distances to post-process + */ + public void postProcessDistances(double[] distances) { + } + + /** + * Update the distance function (if necessary) for the newly added instance. + * + * @param ins the instance to add + */ + public void update(Instance ins) { + validate(); + + m_Ranges = updateRanges(ins, m_Ranges); + } + + /** + * Calculates the distance between two instances. + * + * @param first the first instance + * @param second the second instance + * @return the distance between the two given instances + */ + public double distance(Instance first, Instance second) { + return distance(first, second, null); + } + + /** + * Calculates the distance between two instances. + * + * @param first the first instance + * @param second the second instance + * @param stats the performance stats object + * @return the distance between the two given instances + */ + public double distance(Instance first, Instance second, PerformanceStats stats) { + return distance(first, second, Double.POSITIVE_INFINITY, stats); + } + + /** + * Calculates the distance between two instances. Offers speed up (if the + * distance function class in use supports it) in nearest neighbour search by + * taking into account the cutOff or maximum distance. Depending on the + * distance function class, post processing of the distances by + * postProcessDistances(double []) may be required if this function is used. + * + * @param first the first instance + * @param second the second instance + * @param cutOffValue If the distance being calculated becomes larger than + * cutOffValue then the rest of the calculation is + * discarded. + * @return the distance between the two given instances or + * Double.POSITIVE_INFINITY if the distance being + * calculated becomes larger than cutOffValue. 
+ */ + public double distance(Instance first, Instance second, double cutOffValue) { + return distance(first, second, cutOffValue, null); + } + + /** + * Calculates the distance between two instances. Offers speed up (if the + * distance function class in use supports it) in nearest neighbour search by + * taking into account the cutOff or maximum distance. Depending on the + * distance function class, post processing of the distances by + * postProcessDistances(double []) may be required if this function is used. + * + * @param first the first instance + * @param second the second instance + * @param cutOffValue If the distance being calculated becomes larger than + * cutOffValue then the rest of the calculation is + * discarded. + * @param stats the performance stats object + * @return the distance between the two given instances or + * Double.POSITIVE_INFINITY if the distance being + * calculated becomes larger than cutOffValue. + */ + public double distance(Instance first, Instance second, double cutOffValue, PerformanceStats stats) { + double distance = 0; + int firstI, secondI; + int firstNumValues = first.numValues(); + int secondNumValues = second.numValues(); + int numAttributes = m_Data.NumAttributes(); + int classIndex = m_Data.GetClassIndex(); + + validate(); + + for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues; ) { + if (p1 >= firstNumValues) + firstI = numAttributes; + else + firstI = first.index(p1); + + if (p2 >= secondNumValues) + secondI = numAttributes; + else + secondI = second.index(p2); + + if (firstI == classIndex) { + p1++; + continue; + } + if ((firstI < numAttributes) && !m_ActiveIndices[firstI]) { + p1++; + continue; + } + + if (secondI == classIndex) { + p2++; + continue; + } + if ((secondI < numAttributes) && !m_ActiveIndices[secondI]) { + p2++; + continue; + } + + double diff; + + if (firstI == secondI) { + diff = difference(firstI, + first.valueSparse(p1), + second.valueSparse(p2)); + p1++; + p2++; + } + else if 
(firstI > secondI) { + diff = difference(secondI, + 0, second.valueSparse(p2)); + p2++; + } + else { + diff = difference(firstI, + first.valueSparse(p1), 0); + p1++; + } + if (stats != null) + stats.incrCoordCount(); + distance = updateDistance(distance, m_Weights[firstI]*diff); + if (distance > cutOffValue) + return Double.POSITIVE_INFINITY; + } + + return distance; + } + + /** + * Updates the current distance calculated so far with the new difference + * between two attributes. The difference between the attributes was + * calculated with the difference(int,double,double) method. + * + * @param currDist the current distance calculated so far + * @param diff the difference between two new attributes + * @return the update distance + * @see #difference(int, double, double) + */ + protected abstract double updateDistance(double currDist, double diff); + + /** + * Normalizes a given value of a numeric attribute. + * + * @param x the value to be normalized + * @param i the attribute's index + * @return the normalized value + */ + protected double norm(double x, int i) { + if (Double.isNaN(m_Ranges[i][R_MIN]) || (m_Ranges[i][R_MAX] == m_Ranges[i][R_MIN])) + return 0; + else + return (x - m_Ranges[i][R_MIN]) / (m_Ranges[i][R_WIDTH]); + } + + /** + * Computes the difference between two given attribute + * values. 
+ * + * @param index the attribute index + * @param val1 the first value + * @param val2 the second value + * @return the difference + */ + protected double difference(int index, double val1, double val2) { + switch (m_Data.Attribute(index).Type()) { + case Attribute.NOMINAL: + if (Instance.isMissingValue(val1) || + Instance.isMissingValue(val2) || + ((int) val1 != (int) val2)) { + return 1; + } + else { + return 0; + } + + case Attribute.NUMERIC: + if (Instance.isMissingValue(val1) || + Instance.isMissingValue(val2)) { + if (Instance.isMissingValue(val1) && + Instance.isMissingValue(val2)) { + if (!m_DontNormalize) //We are doing normalization + return 1; + else + return (m_Ranges[index][R_MAX] - m_Ranges[index][R_MIN]); + } + else { + double diff; + if (Instance.isMissingValue(val2)) { + diff = (!m_DontNormalize) ? norm(val1, index) : val1; + } + else { + diff = (!m_DontNormalize) ? norm(val2, index) : val2; + } + if (!m_DontNormalize && diff < 0.5) { + diff = 1.0 - diff; + } + else if (m_DontNormalize) { + if ((m_Ranges[index][R_MAX]-diff) > (diff-m_Ranges[index][R_MIN])) + return m_Ranges[index][R_MAX]-diff; + else + return diff-m_Ranges[index][R_MIN]; + } + return diff; + } + } + else { + return (!m_DontNormalize) ? + (norm(val1, index) - norm(val2, index)) : + (val1 - val2); + } + + default: + return 0; + } + } + + /** + * Initializes the ranges using all instances of the dataset. + * Sets m_Ranges. 
+ * + * @return the ranges + */ + public double[][] initializeRanges() { + if (m_Data == null) { + m_Ranges = null; + return m_Ranges; + } + + int numAtt = m_Data.NumAttributes(); + double[][] ranges = new double [numAtt][3]; + if (m_Data.Size() <= 0) { + initializeRangesEmpty(numAtt, ranges); + m_Ranges = ranges; + return m_Ranges; + } + else { + // initialize ranges using the first instance + updateRangesFirst(m_Data.instance(0), numAtt, ranges); + } + + // update ranges, starting from the second + for (int i = 1; i < m_Data.Size(); i++) + updateRanges(m_Data.instance(i), numAtt, ranges); + + m_Ranges = ranges; + + return m_Ranges; + } + + /** + * Used to initialize the ranges. For this the values of the first + * instance is used to save time. + * Sets low and high to the values of the first instance and + * width to zero. + * + * @param instance the new instance + * @param numAtt number of attributes in the model + * @param ranges low, high and width values for all attributes + */ + public void updateRangesFirst(Instance instance, int numAtt, double[][] ranges) { + for (int j = 0; j < numAtt; j++) { + if (!instance.isMissing(j)) { + ranges[j][R_MIN] = instance.GetElementAt(j); + ranges[j][R_MAX] = instance.GetElementAt(j); + ranges[j][R_WIDTH] = 0.0; + } + else { // if value was missing + ranges[j][R_MIN] = Double.POSITIVE_INFINITY; + ranges[j][R_MAX] = -Double.POSITIVE_INFINITY; + ranges[j][R_WIDTH] = Double.POSITIVE_INFINITY; + } + } + } + + /** + * Updates the minimum and maximum and width values for all the attributes + * based on a new instance. 
+ * + * @param instance the new instance + * @param numAtt number of attributes in the model + * @param ranges low, high and width values for all attributes + */ + public void updateRanges(Instance instance, int numAtt, double[][] ranges) { + // updateRangesFirst must have been called on ranges + for (int j = 0; j < numAtt; j++) { + double value = instance.GetElementAt(j); + if (!instance.isMissing(j)) { + if (value < ranges[j][R_MIN]) { + ranges[j][R_MIN] = value; + ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; + if (value > ranges[j][R_MAX]) { //if this is the first value that is + ranges[j][R_MAX] = value; //not missing. The,0 + ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; + } + } + else { + if (value > ranges[j][R_MAX]) { + ranges[j][R_MAX] = value; + ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; + } + } + } + } + } + + /** + * Used to initialize the ranges. + * + * @param numAtt number of attributes in the model + * @param ranges low, high and width values for all attributes + */ + public void initializeRangesEmpty(int numAtt, double[][] ranges) { + for (int j = 0; j < numAtt; j++) { + ranges[j][R_MIN] = Double.POSITIVE_INFINITY; + ranges[j][R_MAX] = -Double.POSITIVE_INFINITY; + ranges[j][R_WIDTH] = Double.POSITIVE_INFINITY; + } + } + + /** + * Updates the ranges given a new instance. 
+ * + * @param instance the new instance + * @param ranges low, high and width values for all attributes + * @return the updated ranges + */ + public double[][] updateRanges(Instance instance, double[][] ranges) { + // updateRangesFirst must have been called on ranges + for (int j = 0; j < ranges.length; j++) { + double value = instance.GetElementAt(j); + if (!instance.isMissing(j)) { + if (value < ranges[j][R_MIN]) { + ranges[j][R_MIN] = value; + ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; + } else { + if (instance.GetElementAt(j) > ranges[j][R_MAX]) { + ranges[j][R_MAX] = value; + ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; + } + } + } + } + + return ranges; + } + + /** + * Initializes the ranges of a subset of the instances of this dataset. + * Therefore m_Ranges is not set. + * + * @param instList list of indexes of the subset + * @return the ranges + * @throws Exception if something goes wrong + */ + public double[][] initializeRanges(int[] instList) throws Exception { + if (m_Data == null) + throw new Exception("No instances supplied."); + + int numAtt = m_Data.NumAttributes(); + double[][] ranges = new double [numAtt][3]; + + if (m_Data.Size() <= 0) { + initializeRangesEmpty(numAtt, ranges); + return ranges; + } + else { + // initialize ranges using the first instance + updateRangesFirst(m_Data.instance(instList[0]), numAtt, ranges); + // update ranges, starting from the second + for (int i = 1; i < instList.length; i++) { + updateRanges(m_Data.instance(instList[i]), numAtt, ranges); + } + } + return ranges; + } + + /** + * Initializes the ranges of a subset of the instances of this dataset. + * Therefore m_Ranges is not set. + * The caller of this method should ensure that the supplied start and end + * indices are valid (start <= end, end<instList.length etc) and + * correct. 
+ * + * @param instList list of indexes of the instances + * @param startIdx start index of the subset of instances in the indices array + * @param endIdx end index of the subset of instances in the indices array + * @return the ranges + * @throws Exception if something goes wrong + */ + public double[][] initializeRanges(int[] instList, int startIdx, int endIdx) throws Exception { + if (m_Data == null) + throw new Exception("No instances supplied."); + + int numAtt = m_Data.NumAttributes(); + double[][] ranges = new double [numAtt][3]; + + if (m_Data.Size() <= 0) { + initializeRangesEmpty(numAtt, ranges); + return ranges; + } + else { + // initialize ranges using the first instance + updateRangesFirst(m_Data.instance(instList[startIdx]), numAtt, ranges); + // update ranges, starting from the second + for (int i = startIdx+1; i <= endIdx; i++) { + updateRanges(m_Data.instance(instList[i]), numAtt, ranges); + } + } + + return ranges; + } + + /** + * Update the ranges if a new instance comes. + * + * @param instance the new instance + */ + public void updateRanges(Instance instance) { + validate(); + + m_Ranges = updateRanges(instance, m_Ranges); + } + + /** + * Test if an instance is within the given ranges. + * + * @param instance the instance + * @param ranges the ranges the instance is tested to be in + * @return true if instance is within the ranges + */ + public boolean inRanges(Instance instance, double[][] ranges) { + boolean isIn = true; + + // updateRangesFirst must have been called on ranges + for (int j = 0; isIn && (j < ranges.length); j++) { + if (!instance.isMissing(j)) { + double value = instance.GetElementAt(j); + isIn = value <= ranges[j][R_MAX]; + if (isIn) isIn = value >= ranges[j][R_MIN]; + } + } + + return isIn; + } + + /** + * Check if ranges are set. + * + * @return true if ranges are set + */ + public boolean rangesSet() { + return (m_Ranges != null); + } + + /** + * Method to get the ranges. 
+ * + * @return the ranges + * @throws Exception if no randes are set yet + */ + public double[][] getRanges() throws Exception { + validate(); + + if (m_Ranges == null) + throw new Exception("Ranges not yet set."); + + return m_Ranges; + } + + /** + * Returns an empty string. + * + * @return an empty string + */ + public String toString() { + return ""; + } +} diff --git a/src/gaknn/core/kdtree/PerformanceStats.java b/src/gaknn/core/kdtree/PerformanceStats.java new file mode 100644 index 0000000..bb24b29 --- /dev/null +++ b/src/gaknn/core/kdtree/PerformanceStats.java @@ -0,0 +1,344 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * PerformanceStats.java + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + */ + +package gaknn.core.kdtree; + +//import weka.core.AdditionalMeasureProducer; +//import weka.core.RevisionHandler; +//import weka.core.RevisionUtils; + +import java.io.Serializable; +import java.util.Enumeration; +import java.util.Vector; + +/** + * The class that measures the performance of a nearest + * neighbour search (NNS) algorithm. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ +public class PerformanceStats + implements AdditionalMeasureProducer, Serializable{ + + /** for serialization. 
*/ + private static final long serialVersionUID = -7215110351388368092L; + + /** The total number of queries looked at. */ + protected int m_NumQueries; + + //Point-stats variables + /** The min and max data points looked for a query by + * the NNS algorithm. */ + public double m_MinP, m_MaxP; + + /** The sum of data points looked + * at for all the queries. + */ + public double m_SumP; + /** The squared sum of data points looked + * at for all the queries. + */ + public double m_SumSqP; + /** The number of data points looked at + * for the current/last query. + */ + public double m_PointCount; + + //Coord-stats variables + /** The min and max coordinates(attributes) looked + * at per query. + */ + public double m_MinC, m_MaxC; + /** The sum of coordinates/attributes looked at + * for all the queries. + */ + public double m_SumC; + /** The squared sum of coordinates/attributes looked at + * for all the queries. + */ + public double m_SumSqC; + /** + * The number of coordinates looked at for + * the current/last query. + */ + public double m_CoordCount; + + /** + * default constructor. + */ + public PerformanceStats() { + reset(); + } + + /** + * Resets all internal fields/counters. + */ + public void reset() { + m_NumQueries = 0; + //point stats + m_SumP = m_SumSqP = m_PointCount = 0; + m_MinP = Integer.MAX_VALUE; + m_MaxP = Integer.MIN_VALUE; + //coord stats + m_SumC = m_SumSqC = m_CoordCount = 0; + m_MinC = Integer.MAX_VALUE; + m_MaxC = Integer.MIN_VALUE; + } + + /** + * Signals start of the nearest neighbour search. + * Initializes the stats object. + */ + public void searchStart() { + m_PointCount = 0; + m_CoordCount = 0; + } + + /** + * Signals end of the nearest neighbour search. + * Calculates the statistics for the search. 
+ */ + public void searchFinish() { + m_NumQueries++; m_SumP += m_PointCount; m_SumSqP += m_PointCount*m_PointCount; + if (m_PointCount < m_MinP) m_MinP = m_PointCount; + if (m_PointCount > m_MaxP) m_MaxP = m_PointCount; + //coord stats + double coordsPerPt = m_CoordCount / m_PointCount;; + m_SumC += coordsPerPt; m_SumSqC += coordsPerPt*coordsPerPt; + if(coordsPerPt < m_MinC) m_MinC = coordsPerPt; + if(coordsPerPt > m_MaxC) m_MaxC = coordsPerPt; + } + + /** + * Increments the point count + * (number of datapoints looked at). + */ + public void incrPointCount() { + m_PointCount++; + } + + /** + * Increments the coordinate count + * (number of coordinates/attributes + * looked at). + */ + public void incrCoordCount() { + m_CoordCount++; + } + + /** + * adds the given number to the point count. + * + * @param n The number to add to the point count. + */ + public void updatePointCount(int n) { + m_PointCount += n; + } + + /** + * Returns the number of queries. + * + * @return The number of queries. + */ + public int getNumQueries() { + return m_NumQueries; + } + + /** + * Returns the total number of points visited. + * + * @return The total number. + */ + public double getTotalPointsVisited() { + return m_SumP; + } + + /** + * Returns the mean of points visited. + * + * @return The mean points visited. + */ + public double getMeanPointsVisited() { + return m_SumP/(double)m_NumQueries; + } + + /** + * Returns the standard deviation of points visited. + * + * @return The standard deviation. + */ + public double getStdDevPointsVisited() { + return Math.sqrt((m_SumSqP - (m_SumP*m_SumP)/(double)m_NumQueries)/(m_NumQueries-1)); + } + + /** + * Returns the minimum of points visited. + * + * @return The minimum. + */ + public double getMinPointsVisited() { + return m_MinP; + } + + /** + * Returns the maximum of points visited. + * + * @return The maximum. 
+ */ + public double getMaxPointsVisited() { + return m_MaxP; + } + + /*************----------Coord Stat functions---------**************/ + + /** + * Returns the total sum of coords per point. + * + * @return The total per point. + */ + public double getTotalCoordsPerPoint() { + return m_SumC; + } + + /** + * Returns the mean of coords per point. + * + * @return The mean. + */ + public double getMeanCoordsPerPoint() { + return m_SumC/(double)m_NumQueries; + } + + /** + * Returns the standard deviation of coords per point. + * + * @return The standard deviation. + */ + public double getStdDevCoordsPerPoint() { + return Math.sqrt((m_SumSqC - (m_SumC*m_SumC)/(double)m_NumQueries)/(m_NumQueries-1)); + } + + /** + * Returns the minimum of coords per point. + * + * @return The minimum. + */ + public double getMinCoordsPerPoint() { + return m_MinC; + } + + /** + * Returns the maximum of coords per point. + * + * @return The maximum. + */ + public double getMaxCoordsPerPoint() { + return m_MaxC; + } + + /*****----MiscFunctions----****/ + + /** + * Returns an enumeration of the additional measure names. + * + * @return An enumeration of the measure names. + */ + public Enumeration enumerateMeasures() { + Vector newVector = new Vector(); + + newVector.addElement("measureTotal_points_visited"); + newVector.addElement("measureMean_points_visited"); + newVector.addElement("measureStdDev_points_visited"); + newVector.addElement("measureMin_points_visited"); + newVector.addElement("measureMax_points_visited"); + //coord stats + newVector.addElement("measureTotalCoordsPerPoint"); + newVector.addElement("measureMeanCoordsPerPoint"); + newVector.addElement("measureStdDevCoordsPerPoint"); + newVector.addElement("measureMinCoordsPerPoint"); + newVector.addElement("measureMaxCoordsPerPoint"); + + return newVector.elements(); + } + + /** + * Returns the value of the named measure. + * + * @param additionalMeasureName The name of the measure to query for + * its value. 
+ * @return The value of the named measure. + * @throws IllegalArgumentException If the named measure is not + * supported. + */ + public double getMeasure(String additionalMeasureName) throws IllegalArgumentException { + if (additionalMeasureName.compareToIgnoreCase("measureTotal_points_visited") == 0) { + return (double) getTotalPointsVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMean_points_visited") == 0) { + return (double) getMeanPointsVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureStdDev_points_visited") == 0) { + return (double) getStdDevPointsVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMin_points_visited") == 0) { + return (double) getMinPointsVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMax_points_visited") == 0) { + return (double) getMaxPointsVisited(); + } + //coord stats + else if (additionalMeasureName.compareToIgnoreCase("measureTotalCoordsPerPoint") == 0) { + return (double) getTotalCoordsPerPoint(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMeanCoordsPerPoint") == 0) { + return (double) getMeanCoordsPerPoint(); + } else if (additionalMeasureName.compareToIgnoreCase("measureStdDevCoordsPerPoint") == 0) { + return (double) getStdDevCoordsPerPoint(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMinCoordsPerPoint") == 0) { + return (double) getMinCoordsPerPoint(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMaxCoordsPerPoint") == 0) { + return (double) getMaxCoordsPerPoint(); + } else { + throw new IllegalArgumentException(additionalMeasureName + + " not supported by PerformanceStats."); + } + } + + /** + * Returns a string representation of the statistics. + * + * @return The statistics as string. 
+ */ + public String getStats() { + StringBuffer buf = new StringBuffer(); + + buf.append(" min, max, total, mean, stddev\n"); + buf.append("Points: "+getMinPointsVisited()+", "+getMaxPointsVisited()+","+getTotalPointsVisited()+ + ","+getMeanPointsVisited()+", "+getStdDevPointsVisited()+"\n"); + + return buf.toString(); + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } +} diff --git a/src/gaknn/core/kdtree/Range.java b/src/gaknn/core/kdtree/Range.java new file mode 100644 index 0000000..4548136 --- /dev/null +++ b/src/gaknn/core/kdtree/Range.java @@ -0,0 +1,458 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Range.java + * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand + * + */ + +package gaknn.core.kdtree; + +import java.io.Serializable; +import java.util.Enumeration; +import java.util.Vector; + +/** + * Class representing a range of cardinal numbers. The range is set by a + * string representation such as:

+ * + * + * first-last + * 1,2,3,4 + *

+ * or combinations thereof. The range is internally converted from + * 1-based to 0-based (so methods that set or get numbers not in string + * format should use 0-based numbers). + * + * @author Len Trigg (trigg@cs.waikato.ac.nz) + * @version $Revision: 1.18 $ + */ +public class Range + implements Serializable { + + /** for serialization */ + static final long serialVersionUID = 3667337062176835900L; + + /** Record the string representations of the columns to delete */ + /*@non_null spec_public@*/Vector m_RangeStrings = new Vector(); + + /** Whether matching should be inverted */ + /*@spec_public@*/ boolean m_Invert; + + /** The array of flags for whether an column is selected */ + /*@spec_public@*/boolean [] m_SelectFlags; + + /** Store the maximum value permitted in the range. -1 indicates that + no upper value has been set */ + /*@spec_public@*/ int m_Upper = -1; + + /** Default constructor. */ + //@assignable this.*; + public Range() { + } + + /** + * Constructor to set initial range. + * + * @param rangeList the initial range + * @throws IllegalArgumentException if the range list is invalid + */ + public Range(/*@non_null@*/ String rangeList) { + + setRanges(rangeList); + } + + /** + * Sets the value of "last". + * + * @param newUpper the value of "last" + */ + public void setUpper(int newUpper) { + + if (newUpper >= 0) { + m_Upper = newUpper; + setFlags(); + } + } + + /** + * Gets whether the range sense is inverted, i.e. all except + * the values included by the range string are selected. + * + * @return whether the matching sense is inverted + */ + //@ensures \result <==> m_Invert; + public /*@pure@*/boolean getInvert() { + + return m_Invert; + } + + /** + * Sets whether the range sense is inverted, i.e. all except + * the values included by the range string are selected. 
+ * + * @param newSetting true if the matching sense is inverted + */ + public void setInvert(boolean newSetting) { + + m_Invert = newSetting; + } + + /** + * Gets the string representing the selected range of values + * + * @return the range selection string + */ + public /*@non_null pure@*/String getRanges() { + + StringBuffer result = new StringBuffer(m_RangeStrings.size()*4); + boolean first = true; + char sep = ','; + for (int i = 0; i < m_RangeStrings.size(); i++) { + if (first) { + result.append((String)m_RangeStrings.elementAt(i)); + first = false; + } else { + result.append(sep + (String)m_RangeStrings.elementAt(i)); + } + } + return result.toString(); + } + + /** + * Sets the ranges from a string representation. Note that setUpper() + * must be called afterwards for ranges to be actually set internally. + * + * @param rangeList the comma separated list of ranges. The empty + * string sets the range to empty. + * @throws IllegalArgumentException if the rangeList was not well formed + */ + //@requires rangeList != null; + //@assignable m_RangeStrings,m_SelectFlags; + public void setRanges(String rangeList) { + + Vector ranges = new Vector (10); + + // Split the rangeList up into the vector + while (!rangeList.equals("")) { + String range = rangeList.trim(); + int commaLoc = rangeList.indexOf(','); + if (commaLoc != -1) { + range = rangeList.substring(0, commaLoc).trim(); + rangeList = rangeList.substring(commaLoc + 1).trim(); + } else { + rangeList = ""; + } + if (!range.equals("")) { + ranges.addElement(range); + } + } + m_RangeStrings = ranges; + m_SelectFlags = null; + } + + /** + * Gets whether the supplied cardinal number is included in the current + * range. 
+ * + * @param index the number of interest + * @return true if index is in the current range + * @throws RuntimeException if the upper limit of the range hasn't been defined + */ + //@requires m_Upper >= 0; + //@requires 0 <= index && index < m_SelectFlags.length; + public /*@pure@*/ boolean isInRange(int index) { + + if (m_Upper == -1) { + throw new RuntimeException("No upper limit has been specified for range"); + } + if (m_Invert) { + return !m_SelectFlags[index]; + } else { + return m_SelectFlags[index]; + } + } + + /** + * Constructs a representation of the current range. Being a string + * representation, the numbers are based from 1. + * + * @return the string representation of the current range + */ + public /*@non_null pure@*/ String toString() { + + if (m_RangeStrings.size() == 0) { + return "Empty"; + } + String result ="Strings: "; + Enumeration enu = m_RangeStrings.elements(); + while (enu.hasMoreElements()) { + result += (String)enu.nextElement() + " "; + } + result += "\n"; + + result += "Invert: " + m_Invert + "\n"; + + try { + if (m_Upper == -1) { + throw new RuntimeException("Upper limit has not been specified"); + } + String cols = null; + for (int i = 0; i < m_SelectFlags.length; i++) { + if (isInRange(i)) { + if (cols == null) { + cols = "Cols: " + (i + 1); + } else { + cols += "," + (i + 1); + } + } + } + if (cols != null) { + result += cols + "\n"; + } + } catch (Exception ex) { + result += ex.getMessage(); + } + return result; + } + + /** + * Gets an array containing all the selected values, in the order + * that they were selected (or ascending order if range inversion is on) + * + * @return the array of selected values + * @throws RuntimeException if the upper limit of the range hasn't been defined + */ + //@requires m_Upper >= 0; + public /*@non_null@*/ int [] getSelection() { + + if (m_Upper == -1) { + throw new RuntimeException("No upper limit has been specified for range"); + } + int [] selectIndices = new int [m_Upper + 1]; + int 
numSelected = 0; + if (m_Invert) + { + for (int i = 0; i <= m_Upper; i++) { + if (!m_SelectFlags[i]) { + selectIndices[numSelected++] = i; + } + } + } + else + { + Enumeration enu = m_RangeStrings.elements(); + while (enu.hasMoreElements()) { + String currentRange = (String)enu.nextElement(); + int start = rangeLower(currentRange); + int end = rangeUpper(currentRange); + for (int i = start; (i <= m_Upper) && (i <= end); i++) { + if (m_SelectFlags[i]) { + selectIndices[numSelected++] = i; + } + } + } + } + int [] result = new int [numSelected]; + System.arraycopy(selectIndices, 0, result, 0, numSelected); + return result; + } + + /** + * Creates a string representation of the indices in the supplied array. + * + * @param indices an array containing indices to select. + * Since the array will typically come from a program, indices are assumed + * from 0, and thus will have 1 added in the String representation. + * @return the string representation of the indices + */ + public static /*@non_null pure@*/String indicesToRangeList(/*@non_null@*/ int []indices) { + + StringBuffer rl = new StringBuffer(); + int last = -2; + boolean range = false; + for(int i = 0; i < indices.length; i++) { + if (i == 0) { + rl.append(indices[i] + 1); + } else if (indices[i] == last) { + range = true; + } else { + if (range) { + rl.append('-').append(last); + range = false; + } + rl.append(',').append(indices[i] + 1); + } + last = indices[i] + 1; + } + if (range) { + rl.append('-').append(last); + } + return rl.toString(); + } + + /** Sets the flags array. 
*/ + protected void setFlags() { + + m_SelectFlags = new boolean [m_Upper + 1]; + Enumeration enu = m_RangeStrings.elements(); + while (enu.hasMoreElements()) { + String currentRange = (String)enu.nextElement(); + if (!isValidRange(currentRange)) { + throw new IllegalArgumentException("Invalid range list at " + currentRange); + } + int start = rangeLower(currentRange); + int end = rangeUpper(currentRange); + for (int i = start; (i <= m_Upper) && (i <= end); i++) { + m_SelectFlags[i] = true; + } + } + } + + + /** + * Translates a single string selection into it's internal 0-based equivalent + * + * @param single the string representing the selection (eg: 1 first last) + * @return the number corresponding to the selected value + */ + protected /*@pure@*/ int rangeSingle(/*@non_null@*/ String single) { + + if (single.toLowerCase().equals("first")) { + return 0; + } + if (single.toLowerCase().equals("last")) { + return m_Upper; + } + int index = Integer.parseInt(single) - 1; + if (index < 0) { + index = 0; + } + if (index > m_Upper) { + index = m_Upper; + } + return index; + } + + /** + * Translates a range into it's lower index. + * + * @param range the string representation of the range + * @return the lower index of the range + */ + protected int rangeLower(/*@non_null@*/ String range) { + + int hyphenIndex; + if ((hyphenIndex = range.indexOf('-')) >= 0) { + return Math.min(rangeLower(range.substring(0, hyphenIndex)), + rangeLower(range.substring(hyphenIndex + 1))); + } + return rangeSingle(range); + } + + /** + * Translates a range into it's upper index. Must only be called once + * setUpper has been called. 
+ * + * @param range the string representation of the range + * @return the upper index of the range + */ + protected int rangeUpper(/*@non_null@*/ String range) { + + int hyphenIndex; + if ((hyphenIndex = range.indexOf('-')) >= 0) { + return Math.max(rangeUpper(range.substring(0, hyphenIndex)), + rangeUpper(range.substring(hyphenIndex + 1))); + } + return rangeSingle(range); + } + + /** + * Determines if a string represents a valid index or simple range. + * Examples: first last 2 first-last first-4 4-last + * Doesn't check that a < b for a-b + * + * @param range the string to check + * @return true if the range is valid + */ + protected boolean isValidRange(String range) { + + if (range == null) { + return false; + } + int hyphenIndex; + if ((hyphenIndex = range.indexOf('-')) >= 0) { + if (isValidRange(range.substring(0, hyphenIndex)) && + isValidRange(range.substring(hyphenIndex + 1))) { + return true; + } + return false; + } + if (range.toLowerCase().equals("first")) { + return true; + } + if (range.toLowerCase().equals("last")) { + return true; + } + try { + int index = Integer.parseInt(range); + if ((index > 0) && (index <= m_Upper + 1)){ + return true; + } + return false; + } catch (NumberFormatException ex) { + return false; + } + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.18 $"); +// } + + /** + * Main method for testing this class. 
+ * + * @param argv one parameter: a test range specification + */ + public static void main(String [] argv) { + + try { + if (argv.length == 0) { + throw new Exception("Usage: Range "); + } + Range range = new Range(); + range.setRanges(argv[0]); + range.setUpper(9); + range.setInvert(false); + System.out.println("Input: " + argv[0] + "\n" + + range.toString()); + int [] rangeIndices = range.getSelection(); + for (int i = 0; i < rangeIndices.length; i++) + System.out.print(" " + (rangeIndices[i] + 1)); + System.out.println(""); + } catch (Exception ex) { + System.out.println(ex.getMessage()); + } + } +} + + diff --git a/src/gaknn/core/kdtree/SlidingMidPointOfWidestSide.java b/src/gaknn/core/kdtree/SlidingMidPointOfWidestSide.java new file mode 100644 index 0000000..911e35a --- /dev/null +++ b/src/gaknn/core/kdtree/SlidingMidPointOfWidestSide.java @@ -0,0 +1,267 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* + * SlidingMidPointOfWidestSide.java + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + */ + +package gaknn.core.kdtree; + +//import weka.core.RevisionUtils; +//import weka.core.TechnicalInformation; +//import weka.core.TechnicalInformationHandler; +//import weka.core.TechnicalInformation.Field; +//import weka.core.TechnicalInformation.Type; + +/** + + * The class that splits a node into two based on the midpoint value of the dimension in which the node's rectangle is widest. If after splitting one side is empty then it is slided towards the non-empty side until there is at least one point on the empty side.
+ *
+ * For more information see also:
+ *
+ * David M. Mount (2006). ANN Programming Manual. College Park, MD, USA. + *

+ + * + + * BibTeX: + *

+ * @manual{Mount2006,
+ *    address = {College Park, MD, USA},
+ *    author = {David M. Mount},
+ *    organization = {Department of Computer Science, University of Maryland},
+ *    title = {ANN Programming Manual},
+ *    year = {2006},
+ *    HTTP = {Available from http://www.cs.umd.edu/\~mount/ANN/}
+ * }
+ * 
+ *

+ + * + + + * + * @author Ashraf M. Kibriya (amk14@waikato.ac.nz) + * @version $Revision: 1.3 $ + */ +public class SlidingMidPointOfWidestSide + extends KDTreeNodeSplitter + { + + /** for serialization. */ + private static final long serialVersionUID = 852857628205680562L; + + /** The floating point error to tolerate in finding the widest + * rectangular side. */ + protected static double ERR = 0.001; + + /** + * Returns a string describing this nearest neighbour search algorithm. + * + * @return a description of the algorithm for displaying in the + * explorer/experimenter gui + */ +// public String globalInfo() { +// return +// "The class that splits a node into two based on the midpoint value of " +// + "the dimension in which the node's rectangle is widest. If after " +// + "splitting one side is empty then it is slided towards the non-empty " +// + "side until there is at least one point on the empty side.\n\n" +// + "For more information see also:\n\n" +// + getTechnicalInformation().toString(); +// } + + /** + * Returns an instance of a TechnicalInformation object, containing detailed + * information about the technical background of this class, e.g., paper + * reference or book this class is based on. + * + * @return the technical information about this class + */ +// public TechnicalInformation getTechnicalInformation() { +// TechnicalInformation result; +// +// result = new TechnicalInformation(Type.MANUAL); +// result.setValue(Field.AUTHOR, "David M. 
Mount"); +// result.setValue(Field.YEAR, "2006"); +// result.setValue(Field.TITLE, "ANN Programming Manual"); +// result.setValue(Field.ORGANIZATION, "Department of Computer Science, University of Maryland"); +// result.setValue(Field.ADDRESS, +// "College Park, MD, USA"); +// result.setValue(Field.HTTP, +// "Available from http://www.cs.umd.edu/~mount/ANN/"); +// +// return result; +// } + + /** + * Splits a node into two based on the midpoint value of the dimension + * in which the node's rectangle is widest. If after splitting one side + * is empty then it is slided towards the non-empty side until there is + * at least one point on the empty side. The two nodes created after the + * whole splitting are correctly initialised. And, node.left and + * node.right are set appropriately. + * @param node The node to split. + * @param numNodesCreated The number of nodes that so far have been + * created for the tree, so that the newly created nodes are + * assigned correct/meaningful node numbers/ids. + * @param nodeRanges The attributes' range for the points inside + * the node that is to be split. + * @param universe The attributes' range for the whole + * point-space. + * @throws Exception If there is some problem in splitting the + * given node. 
+ */ + public void splitNode(KDTreeNode node, int numNodesCreated, + double[][] nodeRanges, double[][] universe) throws Exception { + + correctlyInitialized(); + + if (node.m_NodesRectBounds == null) { + node.m_NodesRectBounds = new double[2][node.m_NodeRanges.length]; + for (int i = 0; i < node.m_NodeRanges.length; i++) { + node.m_NodesRectBounds[MIN][i] = node.m_NodeRanges[i][MIN]; + node.m_NodesRectBounds[MAX][i] = node.m_NodeRanges[i][MAX]; + } + } + + // finding widest side of the hyper rectangle + double maxRectWidth = Double.NEGATIVE_INFINITY, maxPtWidth = Double.NEGATIVE_INFINITY, tempval; + int splitDim = -1, classIdx = m_Instances.GetClassIndex(); + + for (int i = 0; i < node.m_NodesRectBounds[0].length; i++) { + if (i == classIdx) + continue; + tempval = node.m_NodesRectBounds[MAX][i] - node.m_NodesRectBounds[MIN][i]; + if (m_NormalizeNodeWidth) { + tempval = tempval / universe[i][WIDTH]; + } + if (tempval > maxRectWidth && node.m_NodeRanges[i][WIDTH] > 0.0) + maxRectWidth = tempval; + } + + for (int i = 0; i < node.m_NodesRectBounds[0].length; i++) { + if (i == classIdx) + continue; + tempval = node.m_NodesRectBounds[MAX][i] - node.m_NodesRectBounds[MIN][i]; + if (m_NormalizeNodeWidth) { + tempval = tempval / universe[i][WIDTH]; + } + if (tempval >= maxRectWidth * (1 - ERR) + && node.m_NodeRanges[i][WIDTH] > 0.0) { + if (node.m_NodeRanges[i][WIDTH] > maxPtWidth) { + maxPtWidth = node.m_NodeRanges[i][WIDTH]; + if (m_NormalizeNodeWidth) + maxPtWidth = maxPtWidth / universe[i][WIDTH]; + splitDim = i; + } + } + } + + double splitVal = node.m_NodesRectBounds[MIN][splitDim] + + (node.m_NodesRectBounds[MAX][splitDim] - node.m_NodesRectBounds[MIN][splitDim]) + * 0.5; + // might want to try to slide it further to contain more than one point on + // the + // side that is resulting empty + if (splitVal < node.m_NodeRanges[splitDim][MIN]) + splitVal = node.m_NodeRanges[splitDim][MIN]; + else if (splitVal >= node.m_NodeRanges[splitDim][MAX]) + splitVal = 
node.m_NodeRanges[splitDim][MAX] + - node.m_NodeRanges[splitDim][WIDTH] * 0.001; + + int rightStart = rearrangePoints(m_InstList, node.m_Start, node.m_End, + splitDim, splitVal); + + if (rightStart == node.m_Start || rightStart > node.m_End) { + if (rightStart == node.m_Start) + throw new Exception("Left child is empty in node " + node.m_NodeNumber + + ". Not possible with " + + "SlidingMidPointofWidestSide splitting method. Please " + + "check code."); + else + throw new Exception("Right child is empty in node " + node.m_NodeNumber + + ". Not possible with " + + "SlidingMidPointofWidestSide splitting method. Please " + + "check code."); + } + + node.m_SplitDim = splitDim; + node.m_SplitValue = splitVal; + + double[][] widths = new double[2][node.m_NodesRectBounds[0].length]; + + System.arraycopy(node.m_NodesRectBounds[MIN], 0, widths[MIN], 0, + node.m_NodesRectBounds[MIN].length); + System.arraycopy(node.m_NodesRectBounds[MAX], 0, widths[MAX], 0, + node.m_NodesRectBounds[MAX].length); + widths[MAX][splitDim] = splitVal; + + node.m_Left = new KDTreeNode(numNodesCreated + 1, node.m_Start, + rightStart - 1, m_EuclideanDistance.initializeRanges(m_InstList, + node.m_Start, rightStart - 1), widths); + + widths = new double[2][node.m_NodesRectBounds[0].length]; + System.arraycopy(node.m_NodesRectBounds[MIN], 0, widths[MIN], 0, + node.m_NodesRectBounds[MIN].length); + System.arraycopy(node.m_NodesRectBounds[MAX], 0, widths[MAX], 0, + node.m_NodesRectBounds[MAX].length); + widths[MIN][splitDim] = splitVal; + + node.m_Right = new KDTreeNode(numNodesCreated + 2, rightStart, node.m_End, + m_EuclideanDistance.initializeRanges(m_InstList, rightStart, node.m_End), widths); + } + + /** + * Re-arranges the indices array such that the points <= to the splitVal + * are on the left of the array and those > the splitVal are on the right. + * + * @param indices The master index array. + * @param startidx The begining index of portion of indices that needs + * re-arranging. 
+ * @param endidx The end index of portion of indices that needs + * re-arranging. + * @param splitDim The split dimension/attribute. + * @param splitVal The split value. + * @return The startIdx of the points > the splitVal (the points + * belonging to the right child of the node). + */ + protected int rearrangePoints(int[] indices, final int startidx, + final int endidx, final int splitDim, final double splitVal) { + + int tmp, left = startidx - 1; + for (int i = startidx; i <= endidx; i++) { + if (m_EuclideanDistance.valueIsSmallerEqual(m_Instances + .instance(indices[i]), splitDim, splitVal)) { + left++; + tmp = indices[left]; + indices[left] = indices[i]; + indices[i] = tmp; + }// end valueIsSmallerEqual + }// endfor + return left + 1; + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.3 $"); +// } +} diff --git a/src/gaknn/core/kdtree/TreePerformanceStats.java b/src/gaknn/core/kdtree/TreePerformanceStats.java new file mode 100644 index 0000000..9091964 --- /dev/null +++ b/src/gaknn/core/kdtree/TreePerformanceStats.java @@ -0,0 +1,322 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* + * TreePerformanceStats.java + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + */ + +package gaknn.core.kdtree; + +//import weka.core.RevisionUtils; + +import java.util.Enumeration; +import java.util.Vector; + +/** + * The class that measures the performance of a tree based + * nearest neighbour search algorithm. + * + * @author Ashraf M. Kibriya (amk14[at-the-rate]cs[dot]waikato[dot]ac[dot]nz) + * @version $Revision: 1.2 $ + */ +public class TreePerformanceStats + extends PerformanceStats { + + /** for serialization. */ + private static final long serialVersionUID = -6637636693340810373L; + + // Variables for leaves + /** The min and max number leaf nodes looked + * for a query by the tree based NNS algorithm. */ + protected int m_MinLeaves, m_MaxLeaves; + + /** The sum of leaf nodes looked + * at for all the queries. + */ + protected int m_SumLeaves; + /** The squared sum of leaf nodes looked + * at for all the queries. + */ + protected int m_SumSqLeaves; + /** The number of leaf nodes looked at + * for the current/last query. + */ + protected int m_LeafCount; + + // Variables for internal nodes + /** The min and max number internal nodes looked + * for a query by the tree based NNS algorithm. */ + protected int m_MinIntNodes, m_MaxIntNodes; + /** The sum of internal nodes looked + * at for all the queries. + */ + protected int m_SumIntNodes; + /** The squared sum of internal nodes looked + * at for all the queries. + */ + protected int m_SumSqIntNodes; + /** The number of internal nodes looked at + * for the current/last query. + */ + protected int m_IntNodeCount; + + /** + * Default constructor. + */ + public TreePerformanceStats() { + reset(); + } + + /** + * Resets all internal fields/counters. 
+ */ + public void reset() { + super.reset(); + //initializing leaf variables + m_SumLeaves = m_SumSqLeaves = m_LeafCount = 0; + m_MinLeaves = Integer.MAX_VALUE; + m_MaxLeaves = Integer.MIN_VALUE; + //initializing internal variables + m_SumIntNodes = m_SumSqIntNodes = m_IntNodeCount = 0; + m_MinIntNodes = Integer.MAX_VALUE; + m_MaxIntNodes = Integer.MIN_VALUE; + } + + /** + * Signals start of the nearest neighbour search. + * Initializes the stats object. + */ + public void searchStart() { + super.searchStart(); + m_LeafCount = 0; + m_IntNodeCount = 0; + } + + /** + * Signals end of the nearest neighbour search. + * Calculates the statistics for the search. + */ + public void searchFinish() { + super.searchFinish(); + //updating stats for leaf nodes + m_SumLeaves += m_LeafCount; m_SumSqLeaves += m_LeafCount*m_LeafCount; + if (m_LeafCount < m_MinLeaves) m_MinLeaves = m_LeafCount; + if (m_LeafCount > m_MaxLeaves) m_MaxLeaves = m_LeafCount; + //updating stats for internal nodes + m_SumIntNodes += m_IntNodeCount; m_SumSqIntNodes += m_IntNodeCount*m_IntNodeCount; + if (m_IntNodeCount < m_MinIntNodes) m_MinIntNodes = m_IntNodeCount; + if (m_IntNodeCount > m_MaxIntNodes) m_MaxIntNodes = m_IntNodeCount; + } + + /** + * Increments the leaf count. + */ + public void incrLeafCount() { + m_LeafCount++; + } + + /** + * Increments the internal node count. + */ + public void incrIntNodeCount() { + m_IntNodeCount++; + } + + // Getter functions for leaves + + /** + * Returns the total number of leaves visited. + * + * @return The total number. + */ + public int getTotalLeavesVisited() { + return m_SumLeaves; + } + + /** + * Returns the mean of number of leaves visited. + * + * @return The mean number of leaves visited. + */ + public double getMeanLeavesVisited() { + return m_SumLeaves/(double)m_NumQueries; + } + + /** + * Returns the standard deviation of leaves visited. + * + * @return The standard deviation of leaves visited. 
+ */ + public double getStdDevLeavesVisited() { + return Math.sqrt((m_SumSqLeaves - (m_SumLeaves*m_SumLeaves)/(double)m_NumQueries)/(m_NumQueries-1)); + } + + /** + * Returns the minimum number of leaves visited. + * + * @return The minimum number of leaves visited. + */ + public int getMinLeavesVisited() { + return m_MinLeaves; + } + + /** + * Returns the maximum number of leaves visited. + * + * @return The maximum number of leaves visited. + */ + public int getMaxLeavesVisited() { + return m_MaxLeaves; + } + + // Getter functions for internal nodes + + /** + * Returns the total number of internal nodes visited. + * + * @return The total number of internal nodes visited. + */ + public int getTotalIntNodesVisited() { + return m_SumIntNodes; + } + + /** + * Returns the mean of internal nodes visited. + * + * @return The mean number of internal nodes + * visited. + */ + public double getMeanIntNodesVisited() { + return m_SumIntNodes/(double)m_NumQueries; + } + + /** + * Returns the standard deviation of internal nodes visited. + * + * @return The standard deviation of internal nodes visited. + */ + public double getStdDevIntNodesVisited() { + return Math.sqrt((m_SumSqIntNodes - (m_SumIntNodes*m_SumIntNodes)/(double)m_NumQueries)/(m_NumQueries-1)); + } + + /** + * Returns the minimum of internal nodes visited. + * + * @return The minimum of internal nodes visited. + */ + public int getMinIntNodesVisited() { + return m_MinIntNodes; + } + + /** + * returns the maximum of internal nodes visited. + * + * @return The maximum of internal nodes visited. + */ + public int getMaxIntNodesVisited() { + return m_MaxIntNodes; + } + + /** + * Returns an enumeration of the additional measure names. + * + * @return An enumeration of the measure names. 
+ */ + public Enumeration enumerateMeasures() { + Vector newVector = new Vector(); + + Enumeration en = super.enumerateMeasures(); + while(en.hasMoreElements()) + newVector.addElement(en.nextElement()); + + newVector.addElement("measureTotal_nodes_visited"); + newVector.addElement("measureMean_nodes_visited"); + newVector.addElement("measureStdDev_nodes_visited"); + newVector.addElement("measureMin_nodes_visited"); + newVector.addElement("measureMax_nodes_visited"); + //coord stats + newVector.addElement("measureTotal_leaves_visited"); + newVector.addElement("measureMean_leaves_visited"); + newVector.addElement("measureStdDev_leaves_visited"); + newVector.addElement("measureMin_leaves_visited"); + newVector.addElement("measureMax_leaves_visited"); + + return newVector.elements(); + } + + /** + * Returns the value of the named measure. + * + * @param additionalMeasureName The name of the measure to query for + * its value. + * @return The value of the named measure. + * @throws IllegalArgumentException If the named measure is not + * supported. 
+ */ + public double getMeasure(String additionalMeasureName) throws IllegalArgumentException { + if (additionalMeasureName.compareToIgnoreCase("measureTotal_nodes_visited") == 0) { + return (double) getTotalIntNodesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMean_nodes_visited") == 0) { + return (double) getMeanIntNodesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureStdDev_nodes_visited") == 0) { + return (double) getStdDevIntNodesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMin_nodes_visited") == 0) { + return (double) getMinIntNodesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMax_nodes_visited") == 0) { + return (double) getMaxIntNodesVisited(); + } + //coord stats + else if (additionalMeasureName.compareToIgnoreCase("measureTotal_leaves_visited") == 0) { + return (double) getTotalLeavesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMean_leaves_visited") == 0) { + return (double) getMeanLeavesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureStdDev_leaves_visited") == 0) { + return (double) getStdDevLeavesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMin_leaves_visited") == 0) { + return (double) getMinLeavesVisited(); + } else if (additionalMeasureName.compareToIgnoreCase("measureMax_leaves_visited") == 0) { + return (double) getMaxLeavesVisited(); + } else { + return super.getMeasure(additionalMeasureName); + } + } + + /** + * Returns a string representation of the statistics. + * + * @return The statistics as string. 
+ */ + public String getStats() { + StringBuffer buf = new StringBuffer(super.getStats()); + + buf.append("leaves: "+getMinLeavesVisited()+", "+getMaxLeavesVisited()+ + ","+getTotalLeavesVisited()+","+getMeanLeavesVisited()+", "+getStdDevLeavesVisited()+"\n"); + buf.append("Int nodes: "+getMinIntNodesVisited()+", "+getMaxIntNodesVisited()+ + ","+getTotalIntNodesVisited()+","+getMeanIntNodesVisited()+", "+getStdDevIntNodesVisited()+"\n"); + + return buf.toString(); + } + + /** + * Returns the revision string. + * + * @return the revision + */ +// public String getRevision() { +// return RevisionUtils.extract("$Revision: 1.2 $"); +// } +} diff --git a/src/gaknn/predictor/PredictorKdtree.java b/src/gaknn/predictor/PredictorKdtree.java new file mode 100644 index 0000000..a164d70 --- /dev/null +++ b/src/gaknn/predictor/PredictorKdtree.java @@ -0,0 +1,129 @@ +package gaknn.predictor; + +import gaknn.core.Instance; +import gaknn.core.Instances; +import gaknn.core.Pair; +import gaknn.core.kdtree.KDTree; +import gaknn.similarity.AbstractSimilarity; + +public class PredictorKdtree extends Predictor { + KDTree kdTree; + + public PredictorKdtree(AbstractSimilarity sim, Instance[] trSet, Instances inst,double[] weights) { + super(sim, trSet); + kdTree=new KDTree(inst); + kdTree.SetWeights(weights); + try { + kdTree.setInstances(inst); + + } catch (Exception e) { + // TODO Auto-generated catch block + System.out.println("building error kd tree"); + e.printStackTrace(); + } + // TODO Auto-generated constructor stub + } + /** get attribute values and find the majority class value confidence by finding k nearest neighbors form kd tree + + * find the k nearest neighbors from kd tree and find the vote for each class value and return the majority class value confidence. 
*/ + @Override + public double Predict(Instance instance) { + Instances kNeighbours=new Instances(null, m_K); + // TODO Auto-generated method stub + double[] vote = new double[m_ClassList.length]; + int ClassIndex = 0; + try { + // get the k nearest neighbors form kd tree in a form of instances + kNeighbours=kdTree.kNearestNeighbours(instance, m_K); + for (int i=0; i Double.MAX_VALUE) + val = Double.MAX_VALUE; + else if (Double.isNaN(val)) + val = 0.0; + + return val; + } + + /** get attribute values and find the majority class value by finding k nearest neighbors form kd tree + * first create instance which have the attribute values + * then find the k nearest neighbors from kd tree and find the vote for each class value and return the majority class value and its confidence. */ + @Override + public Pair Predict(double[] instance) { + // TODO Auto-generated method stub + Instance inst=new Instance(instance); + double[] vote = new double[m_ClassList.length]; + int ClassIndex = 0; + //Instances kNeighbours=new Instances(); + // TODO Auto-generated method stub + try { + // get the k nearest neighbors form kd tree in a form of instances + Instances kNeighbours=kdTree.kNearestNeighbours(inst, m_K); + for (int i=0; i Double.MAX_VALUE) + conf = 1.0; + else + conf = (vote[clsId]/totconf); + + return conf; + } + +}