00001 //---------------------------------------------------------------------- 00002 // File: ANN.h 00003 // Programmer: Sunil Arya and David Mount 00004 // Last modified: 05/03/05 (Release 1.1) 00005 // Description: Basic include file for approximate nearest 00006 // neighbor searching. 00007 //---------------------------------------------------------------------- 00008 // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 00009 // David Mount. All Rights Reserved. 00010 // 00011 // This software and related documentation is part of the 00012 // Approximate Nearest Neighbor Library (ANN). 00013 // 00014 // Permission to use, copy, and distribute this software and its 00015 // documentation is hereby granted free of charge, provided that 00016 // (1) it is not a component of a commercial product, and 00017 // (2) this notice appears in all copies of the software and 00018 // related documentation. 00019 // 00020 // The University of Maryland (U.M.) and the authors make no representations 00021 // about the suitability or fitness of this software for any purpose. It is 00022 // provided "as is" without express or implied warranty. 00023 //---------------------------------------------------------------------- 00024 // History: 00025 // Revision 0.1 03/04/98 00026 // Initial release 00027 // Revision 1.0 04/01/05 00028 // Added copyright and revision information 00029 // Added ANNcoordPrec for coordinate precision. 00030 // Added methods theDim, nPoints, maxPoints, thePoints to ANNpointSet. 00031 // Cleaned up C++ structure for modern compilers 00032 // Revision 1.1 05/03/05 00033 // Added fixed-radius k-NN searching 00034 //---------------------------------------------------------------------- 00035 00036 //---------------------------------------------------------------------- 00037 // ANN - approximate nearest neighbor searching 00038 // ANN is a library for approximate nearest neighbor searching, 00039 // based on the use of standard and priority search in kd-trees 00040 // and balanced box-decomposition (bbd) trees. Here are some 00041 // references to the main algorithmic techniques used here: 00042 // 00043 // kd-trees: 00044 // Friedman, Bentley, and Finkel, ``An algorithm for finding 00045 // best matches in logarithmic expected time,'' ACM 00046 // Transactions on Mathematical Software, 3(3):209-226, 1977. 00047 // 00048 // Priority search in kd-trees: 00049 // Arya and Mount, ``Algorithms for fast vector quantization,'' 00050 // Proc. of DCC '93: Data Compression Conference, eds. J. A. 00051 // Storer and M. Cohn, IEEE Press, 1993, 381-390. 00052 // 00053 // Approximate nearest neighbor search and bbd-trees: 00054 // Arya, Mount, Netanyahu, Silverman, and Wu, ``An optimal 00055 // algorithm for approximate nearest neighbor searching,'' 00056 // 5th Ann. ACM-SIAM Symposium on Discrete Algorithms, 00057 // 1994, 573-582. 00058 //---------------------------------------------------------------------- 00059 00060 #ifndef ANN_H 00061 #define ANN_H 00062 00063 #ifdef WIN32 00064 //---------------------------------------------------------------------- 00065 // For Microsoft Visual C++, externally accessible symbols must be 00066 // explicitly indicated with DLL_API, which is somewhat like "extern." 00067 // 00068 // The following ifdef block is the standard way of creating macros 00069 // which make exporting from a DLL simpler. All files within this DLL 00070 // are compiled with the DLL_EXPORTS preprocessor symbol defined on the 00071 // command line. In contrast, projects that use (or import) the DLL 00072 // objects do not define the DLL_EXPORTS symbol. This way any other 00073 // project whose source files include this file see DLL_API functions as 00074 // being imported from a DLL, wheras this DLL sees symbols defined with 00075 // this macro as being exported. 00076 // CHANGE: allow build as static lib when ANN_STATIC_LIB is defined 00077 // change DLL_EXPORTS to ann_EXPORTS 00078 //---------------------------------------------------------------------- 00079 #ifdef ANN_STATIC_LIB 00080 #define DLL_API 00081 #else 00082 #ifdef ann_EXPORTS 00083 #define DLL_API __declspec(dllexport) 00084 #else 00085 #define DLL_API 00086 //__declspec(dllimport) 00087 #endif 00088 #endif 00089 //---------------------------------------------------------------------- 00090 // DLL_API is ignored for all other systems 00091 //---------------------------------------------------------------------- 00092 #else 00093 #define DLL_API 00094 #endif 00095 00096 //---------------------------------------------------------------------- 00097 // basic includes 00098 //---------------------------------------------------------------------- 00099 00100 #include <cmath> // math includes 00101 #include <iostream> // I/O streams 00102 00103 //---------------------------------------------------------------------- 00104 // Limits 00105 // There are a number of places where we use the maximum double value as 00106 // default initializers (and others may be used, depending on the 00107 // data/distance representation). These can usually be found in limits.h 00108 // (as LONG_MAX, INT_MAX) or in float.h (as DBL_MAX, FLT_MAX). 00109 // 00110 // Not all systems have these files. If you are using such a system, 00111 // you should set the preprocessor symbol ANN_NO_LIMITS_H when 00112 // compiling, and modify the statements below to generate the 00113 // appropriate value. For practical purposes, this does not need to be 00114 // the maximum double value. It is sufficient that it be at least as 00115 // large than the maximum squared distance between between any two 00116 // points. 00117 //---------------------------------------------------------------------- 00118 #ifdef ANN_NO_LIMITS_H // limits.h unavailable 00119 #include <cvalues> // replacement for limits.h 00120 const double ANN_DBL_MAX = MAXDOUBLE; // insert maximum double 00121 #else 00122 #include <climits> 00123 #include <cfloat> 00124 const double ANN_DBL_MAX = DBL_MAX; 00125 #endif 00126 00127 #define ANNversion "1.0" // ANN version and information 00128 #define ANNversionCmt "" 00129 #define ANNcopyright "David M. Mount and Sunil Arya" 00130 #define ANNlatestRev "Mar 1, 2005" 00131 00132 //---------------------------------------------------------------------- 00133 // ANNbool 00134 // This is a simple boolean type. Although ANSI C++ is supposed 00135 // to support the type bool, some compilers do not have it. 00136 //---------------------------------------------------------------------- 00137 00138 enum ANNbool {ANNfalse = 0, ANNtrue = 1}; // ANN boolean type (non ANSI C++) 00139 00140 //---------------------------------------------------------------------- 00141 // ANNcoord, ANNdist 00142 // ANNcoord and ANNdist are the types used for representing 00143 // point coordinates and distances. They can be modified by the 00144 // user, with some care. It is assumed that they are both numeric 00145 // types, and that ANNdist is generally of an equal or higher type 00146 // from ANNcoord. A variable of type ANNdist should be large 00147 // enough to store the sum of squared components of a variable 00148 // of type ANNcoord for the number of dimensions needed in the 00149 // application. For example, the following combinations are 00150 // legal: 00151 // 00152 // ANNcoord ANNdist 00153 // --------- ------------------------------- 00154 // short short, int, long, float, double 00155 // int int, long, float, double 00156 // long long, float, double 00157 // float float, double 00158 // double double 00159 // 00160 // It is the user's responsibility to make sure that overflow does 00161 // not occur in distance calculation. 00162 //---------------------------------------------------------------------- 00163 00164 typedef float ANNcoord; // coordinate data type 00165 typedef float ANNdist; // distance data type 00166 00167 //---------------------------------------------------------------------- 00168 // ANNidx 00169 // ANNidx is a point index. When the data structure is built, the 00170 // points are given as an array. Nearest neighbor results are 00171 // returned as an integer index into this array. To make it 00172 // clearer when this is happening, we define the integer type 00173 // ANNidx. Indexing starts from 0. 00174 // 00175 // For fixed-radius near neighbor searching, it is possible that 00176 // there are not k nearest neighbors within the search radius. To 00177 // indicate this, the algorithm returns ANN_NULL_IDX as its result. 00178 // It should be distinguishable from any valid array index. 00179 //---------------------------------------------------------------------- 00180 00181 typedef int ANNidx; // point index 00182 const ANNidx ANN_NULL_IDX = -1; // a NULL point index 00183 00184 //---------------------------------------------------------------------- 00185 // Infinite distance: 00186 // The code assumes that there is an "infinite distance" which it 00187 // uses to initialize distances before performing nearest neighbor 00188 // searches. It should be as larger or larger than any legitimate 00189 // nearest neighbor distance. 00190 // 00191 // On most systems, these should be found in the standard include 00192 // file <limits.h> or possibly <float.h>. If you do not have these 00193 // file, some suggested values are listed below, assuming 64-bit 00194 // long, 32-bit int and 16-bit short. 00195 // 00196 // ANNdist ANN_DIST_INF Values (see <limits.h> or <float.h>) 00197 // ------- ------------ ------------------------------------ 00198 // double DBL_MAX 1.79769313486231570e+308 00199 // float FLT_MAX 3.40282346638528860e+38 00200 // long LONG_MAX 0x7fffffffffffffff 00201 // int INT_MAX 0x7fffffff 00202 // short SHRT_MAX 0x7fff 00203 //---------------------------------------------------------------------- 00204 00205 const ANNdist ANN_DIST_INF = FLT_MAX; 00206 00207 //---------------------------------------------------------------------- 00208 // Significant digits for tree dumps: 00209 // When floating point coordinates are used, the routine that dumps 00210 // a tree needs to know roughly how many significant digits there 00211 // are in a ANNcoord, so it can output points to full precision. 00212 // This is defined to be ANNcoordPrec. On most systems these 00213 // values can be found in the standard include files <limits.h> or 00214 // <float.h>. For integer types, the value is essentially ignored. 00215 // 00216 // ANNcoord ANNcoordPrec Values (see <limits.h> or <float.h>) 00217 // -------- ------------ ------------------------------------ 00218 // double DBL_DIG 15 00219 // float FLT_DIG 6 00220 // long doesn't matter 19 00221 // int doesn't matter 10 00222 // short doesn't matter 5 00223 //---------------------------------------------------------------------- 00224 00225 #ifdef DBL_DIG // number of sig. bits in ANNcoord 00226 const int ANNcoordPrec = DBL_DIG; 00227 #else 00228 const int ANNcoordPrec = 15; // default precision 00229 #endif 00230 00231 //---------------------------------------------------------------------- 00232 // Self match? 00233 // In some applications, the nearest neighbor of a point is not 00234 // allowed to be the point itself. This occurs, for example, when 00235 // computing all nearest neighbors in a set. By setting the 00236 // parameter ANN_ALLOW_SELF_MATCH to ANNfalse, the nearest neighbor 00237 // is the closest point whose distance from the query point is 00238 // strictly positive. 00239 //---------------------------------------------------------------------- 00240 00241 const ANNbool ANN_ALLOW_SELF_MATCH = ANNtrue; 00242 00243 //---------------------------------------------------------------------- 00244 // Norms and metrics: 00245 // ANN supports any Minkowski norm for defining distance. In 00246 // particular, for any p >= 1, the L_p Minkowski norm defines the 00247 // length of a d-vector (v0, v1, ..., v(d-1)) to be 00248 // 00249 // (|v0|^p + |v1|^p + ... + |v(d-1)|^p)^(1/p), 00250 // 00251 // (where ^ denotes exponentiation, and |.| denotes absolute 00252 // value). The distance between two points is defined to be the 00253 // norm of the vector joining them. Some common distance metrics 00254 // include 00255 // 00256 // Euclidean metric p = 2 00257 // Manhattan metric p = 1 00258 // Max metric p = infinity 00259 // 00260 // In the case of the max metric, the norm is computed by taking 00261 // the maxima of the absolute values of the components. ANN is 00262 // highly "coordinate-based" and does not support general distances 00263 // functions (e.g. those obeying just the triangle inequality). It 00264 // also does not support distance functions based on 00265 // inner-products. 00266 // 00267 // For the purpose of computing nearest neighbors, it is not 00268 // necessary to compute the final power (1/p). Thus the only 00269 // component that is used by the program is |v(i)|^p. 00270 // 00271 // ANN parameterizes the distance computation through the following 00272 // macros. (Macros are used rather than procedures for 00273 // efficiency.) Recall that the distance between two points is 00274 // given by the length of the vector joining them, and the length 00275 // or norm of a vector v is given by formula: 00276 // 00277 // |v| = ROOT(POW(v0) # POW(v1) # ... # POW(v(d-1))) 00278 // 00279 // where ROOT, POW are unary functions and # is an associative and 00280 // commutative binary operator mapping the following types: 00281 // 00282 // ** POW: ANNcoord --> ANNdist 00283 // ** #: ANNdist x ANNdist --> ANNdist 00284 // ** ROOT: ANNdist (>0) --> double 00285 // 00286 // For early termination in distance calculation (partial distance 00287 // calculation) we assume that POW and # together are monotonically 00288 // increasing on sequences of arguments, meaning that for all 00289 // v0..vk and y: 00290 // 00291 // POW(v0) #...# POW(vk) <= (POW(v0) #...# POW(vk)) # POW(y). 00292 // 00293 // Incremental Distance Calculation: 00294 // The program uses an optimized method of computing distances for 00295 // kd-trees and bd-trees, called incremental distance calculation. 00296 // It is used when distances are to be updated when only a single 00297 // coordinate of a point has been changed. In order to use this, 00298 // we assume that there is an incremental update function DIFF(x,y) 00299 // for #, such that if: 00300 // 00301 // s = x0 # ... # xi # ... # xk 00302 // 00303 // then if s' is equal to s but with xi replaced by y, that is, 00304 // 00305 // s' = x0 # ... # y # ... # xk 00306 // 00307 // then the length of s' can be computed by: 00308 // 00309 // |s'| = |s| # DIFF(xi,y). 00310 // 00311 // Thus, if # is + then DIFF(xi,y) is (yi-x). For the L_infinity 00312 // norm we make use of the fact that in the program this function 00313 // is only invoked when y > xi, and hence DIFF(xi,y)=y. 00314 // 00315 // Finally, for approximate nearest neighbor queries we assume 00316 // that POW and ROOT are related such that 00317 // 00318 // v*ROOT(x) = ROOT(POW(v)*x) 00319 // 00320 // Here are the values for the various Minkowski norms: 00321 // 00322 // L_p: p even: p odd: 00323 // ------------------------- ------------------------ 00324 // POW(v) = v^p POW(v) = |v|^p 00325 // ROOT(x) = x^(1/p) ROOT(x) = x^(1/p) 00326 // # = + # = + 00327 // DIFF(x,y) = y - x DIFF(x,y) = y - x 00328 // 00329 // L_inf: 00330 // POW(v) = |v| 00331 // ROOT(x) = x 00332 // # = max 00333 // DIFF(x,y) = y 00334 // 00335 // By default the Euclidean norm is assumed. To change the norm, 00336 // uncomment the appropriate set of macros below. 00337 //---------------------------------------------------------------------- 00338 00339 //---------------------------------------------------------------------- 00340 // Use the following for the Euclidean norm 00341 //---------------------------------------------------------------------- 00342 #define ANN_POW(v) ((v)*(v)) 00343 #define ANN_ROOT(x) sqrt(x) 00344 #define ANN_SUM(x,y) ((x) + (y)) 00345 #define ANN_DIFF(x,y) ((y) - (x)) 00346 00347 //---------------------------------------------------------------------- 00348 // Use the following for the L_1 (Manhattan) norm 00349 //---------------------------------------------------------------------- 00350 // #define ANN_POW(v) fabs(v) 00351 // #define ANN_ROOT(x) (x) 00352 // #define ANN_SUM(x,y) ((x) + (y)) 00353 // #define ANN_DIFF(x,y) ((y) - (x)) 00354 00355 //---------------------------------------------------------------------- 00356 // Use the following for a general L_p norm 00357 //---------------------------------------------------------------------- 00358 // #define ANN_POW(v) pow(fabs(v),p) 00359 // #define ANN_ROOT(x) pow(fabs(x),1/p) 00360 // #define ANN_SUM(x,y) ((x) + (y)) 00361 // #define ANN_DIFF(x,y) ((y) - (x)) 00362 00363 //---------------------------------------------------------------------- 00364 // Use the following for the L_infinity (Max) norm 00365 //---------------------------------------------------------------------- 00366 // #define ANN_POW(v) fabs(v) 00367 // #define ANN_ROOT(x) (x) 00368 // #define ANN_SUM(x,y) ((x) > (y) ? (x) : (y)) 00369 // #define ANN_DIFF(x,y) (y) 00370 00371 //---------------------------------------------------------------------- 00372 // Array types 00373 // The following array types are of basic interest. A point is 00374 // just a dimensionless array of coordinates, a point array is a 00375 // dimensionless array of points. A distance array is a 00376 // dimensionless array of distances and an index array is a 00377 // dimensionless array of point indices. The latter two are used 00378 // when returning the results of k-nearest neighbor queries. 00379 //---------------------------------------------------------------------- 00380 00381 typedef ANNcoord* ANNpoint; // a point 00382 typedef ANNpoint* ANNpointArray; // an array of points 00383 typedef ANNdist* ANNdistArray; // an array of distances 00384 typedef ANNidx* ANNidxArray; // an array of point indices 00385 00386 //---------------------------------------------------------------------- 00387 // Basic point and array utilities: 00388 // The following procedures are useful supplements to ANN's nearest 00389 // neighbor capabilities. 00390 // 00391 // annDist(): 00392 // Computes the (squared) distance between a pair of points. 00393 // Note that this routine is not used internally by ANN for 00394 // computing distance calculations. For reasons of efficiency 00395 // this is done using incremental distance calculation. Thus, 00396 // this routine cannot be modified as a method of changing the 00397 // metric. 00398 // 00399 // Because points (somewhat like strings in C) are stored as 00400 // pointers. Consequently, creating and destroying copies of 00401 // points may require storage allocation. These procedures do 00402 // this. 00403 // 00404 // annAllocPt() and annDeallocPt(): 00405 // Allocate a deallocate storage for a single point, and 00406 // return a pointer to it. The argument to AllocPt() is 00407 // used to initialize all components. 00408 // 00409 // annAllocPts() and annDeallocPts(): 00410 // Allocate and deallocate an array of points as well a 00411 // place to store their coordinates, and initializes the 00412 // points to point to their respective coordinates. It 00413 // allocates point storage in a contiguous block large 00414 // enough to store all the points. It performs no 00415 // initialization. 00416 // 00417 // annCopyPt(): 00418 // Creates a copy of a given point, allocating space for 00419 // the new point. It returns a pointer to the newly 00420 // allocated copy. 00421 //---------------------------------------------------------------------- 00422 00423 DLL_API ANNdist annDist( 00424 int dim, // dimension of space 00425 ANNpoint p, // points 00426 ANNpoint q); 00427 00428 DLL_API ANNpoint annAllocPt( 00429 int dim, // dimension 00430 ANNcoord c = 0); // coordinate value (all equal) 00431 00432 DLL_API ANNpointArray annAllocPts( 00433 int n, // number of points 00434 int dim); // dimension 00435 00436 DLL_API void annDeallocPt( 00437 ANNpoint &p); // deallocate 1 point 00438 00439 DLL_API void annDeallocPts( 00440 ANNpointArray &pa); // point array 00441 00442 DLL_API ANNpoint annCopyPt( 00443 int dim, // dimension 00444 ANNpoint source); // point to copy 00445 00446 //---------------------------------------------------------------------- 00447 //Overall structure: ANN supports a number of different data structures 00448 //for approximate and exact nearest neighbor searching. These are: 00449 // 00450 // ANNbruteForce A simple brute-force search structure. 00451 // ANNkd_tree A kd-tree tree search structure. ANNbd_tree 00452 // A bd-tree tree search structure (a kd-tree with shrink 00453 // capabilities). 00454 // 00455 // At a minimum, each of these data structures support k-nearest 00456 // neighbor queries. The nearest neighbor query, annkSearch, 00457 // returns an integer identifier and the distance to the nearest 00458 // neighbor(s) and annRangeSearch returns the nearest points that 00459 // lie within a given query ball. 00460 // 00461 // Each structure is built by invoking the appropriate constructor 00462 // and passing it (at a minimum) the array of points, the total 00463 // number of points and the dimension of the space. Each structure 00464 // is also assumed to support a destructor and member functions 00465 // that return basic information about the point set. 00466 // 00467 // Note that the array of points is not copied by the data 00468 // structure (for reasons of space efficiency), and it is assumed 00469 // to be constant throughout the lifetime of the search structure. 00470 // 00471 // The search algorithm, annkSearch, is given the query point (q), 00472 // and the desired number of nearest neighbors to report (k), and 00473 // the error bound (eps) (whose default value is 0, implying exact 00474 // nearest neighbors). It returns two arrays which are assumed to 00475 // contain at least k elements: one (nn_idx) contains the indices 00476 // (within the point array) of the nearest neighbors and the other 00477 // (dd) contains the squared distances to these nearest neighbors. 00478 // 00479 // The search algorithm, annkFRSearch, is a fixed-radius kNN 00480 // search. In addition to a query point, it is given a (squared) 00481 // radius bound. (This is done for consistency, because the search 00482 // returns distances as squared quantities.) It does two things. 00483 // First, it computes the k nearest neighbors within the radius 00484 // bound, and second, it returns the total number of points lying 00485 // within the radius bound. It is permitted to set k = 0, in which 00486 // case it effectively answers a range counting query. If the 00487 // error bound epsilon is positive, then the search is approximate 00488 // in the sense that it is free to ignore any point that lies 00489 // outside a ball of radius r/(1+epsilon), where r is the given 00490 // (unsquared) radius bound. 00491 // 00492 // The generic object from which all the search structures are 00493 // dervied is given below. It is a virtual object, and is useless 00494 // by itself. 00495 //---------------------------------------------------------------------- 00496 00497 class DLL_API ANNpointSet { 00498 public: 00499 virtual ~ANNpointSet() {} // virtual distructor 00500 00501 virtual void annkSearch( // approx k near neighbor search 00502 ANNpoint q, // query point 00503 int k, // number of near neighbors to return 00504 ANNidxArray nn_idx, // nearest neighbor array (modified) 00505 ANNdistArray dd, // dist to near neighbors (modified) 00506 double eps=0.0 // error bound 00507 ) = 0; // pure virtual (defined elsewhere) 00508 00509 virtual int annkFRSearch( // approx fixed-radius kNN search 00510 ANNpoint q, // query point 00511 ANNdist sqRad, // squared radius 00512 int k = 0, // number of near neighbors to return 00513 ANNidxArray nn_idx = NULL, // nearest neighbor array (modified) 00514 ANNdistArray dd = NULL, // dist to near neighbors (modified) 00515 double eps=0.0 // error bound 00516 ) = 0; // pure virtual (defined elsewhere) 00517 00518 virtual int theDim() = 0; // return dimension of space 00519 virtual int nPoints() = 0; // return number of points 00520 // return pointer to points 00521 virtual ANNpointArray thePoints() = 0; 00522 }; 00523 00524 //---------------------------------------------------------------------- 00525 // Brute-force nearest neighbor search: 00526 // The brute-force search structure is very simple but inefficient. 00527 // It has been provided primarily for the sake of comparison with 00528 // and validation of the more complex search structures. 00529 // 00530 // Query processing is the same as described above, but the value 00531 // of epsilon is ignored, since all distance calculations are 00532 // performed exactly. 00533 // 00534 // WARNING: This data structure is very slow, and should not be 00535 // used unless the number of points is very small. 00536 // 00537 // Internal information: 00538 // --------------------- 00539 // This data structure bascially consists of the array of points 00540 // (each a pointer to an array of coordinates). The search is 00541 // performed by a simple linear scan of all the points. 00542 //---------------------------------------------------------------------- 00543 00544 class DLL_API ANNbruteForce: public ANNpointSet { 00545 int dim; // dimension 00546 int n_pts; // number of points 00547 ANNpointArray pts; // point array 00548 public: 00549 ANNbruteForce( // constructor from point array 00550 ANNpointArray pa, // point array 00551 int n, // number of points 00552 int dd); // dimension 00553 00554 ~ANNbruteForce(); // destructor 00555 00556 void annkSearch( // approx k near neighbor search 00557 ANNpoint q, // query point 00558 int k, // number of near neighbors to return 00559 ANNidxArray nn_idx, // nearest neighbor array (modified) 00560 ANNdistArray dd, // dist to near neighbors (modified) 00561 double eps=0.0); // error bound 00562 00563 int annkFRSearch( // approx fixed-radius kNN search 00564 ANNpoint q, // query point 00565 ANNdist sqRad, // squared radius 00566 int k = 0, // number of near neighbors to return 00567 ANNidxArray nn_idx = NULL, // nearest neighbor array (modified) 00568 ANNdistArray dd = NULL, // dist to near neighbors (modified) 00569 double eps=0.0); // error bound 00570 00571 int theDim() // return dimension of space 00572 { return dim; } 00573 00574 int nPoints() // return number of points 00575 { return n_pts; } 00576 00577 ANNpointArray thePoints() // return pointer to points 00578 { return pts; } 00579 }; 00580 00581 //---------------------------------------------------------------------- 00582 // kd- and bd-tree splitting and shrinking rules 00583 // kd-trees supports a collection of different splitting rules. 00584 // In addition to the standard kd-tree splitting rule proposed 00585 // by Friedman, Bentley, and Finkel, we have introduced a 00586 // number of other splitting rules, which seem to perform 00587 // as well or better (for the distributions we have tested). 00588 // 00589 // The splitting methods given below allow the user to tailor 00590 // the data structure to the particular data set. They are 00591 // are described in greater details in the kd_split.cc source 00592 // file. The method ANN_KD_SUGGEST is the method chosen (rather 00593 // subjectively) by the implementors as the one giving the 00594 // fastest performance, and is the default splitting method. 00595 // 00596 // As with splitting rules, there are a number of different 00597 // shrinking rules. The shrinking rule ANN_BD_NONE does no 00598 // shrinking (and hence produces a kd-tree tree). The rule 00599 // ANN_BD_SUGGEST uses the implementors favorite rule. 00600 //---------------------------------------------------------------------- 00601 00602 enum ANNsplitRule { 00603 ANN_KD_STD = 0, // the optimized kd-splitting rule 00604 ANN_KD_MIDPT = 1, // midpoint split 00605 ANN_KD_FAIR = 2, // fair split 00606 ANN_KD_SL_MIDPT = 3, // sliding midpoint splitting method 00607 ANN_KD_SL_FAIR = 4, // sliding fair split method 00608 ANN_KD_SUGGEST = 5}; // the authors' suggestion for best 00609 const int ANN_N_SPLIT_RULES = 6; // number of split rules 00610 00611 enum ANNshrinkRule { 00612 ANN_BD_NONE = 0, // no shrinking at all (just kd-tree) 00613 ANN_BD_SIMPLE = 1, // simple splitting 00614 ANN_BD_CENTROID = 2, // centroid splitting 00615 ANN_BD_SUGGEST = 3}; // the authors' suggested choice 00616 const int ANN_N_SHRINK_RULES = 4; // number of shrink rules 00617 00618 //---------------------------------------------------------------------- 00619 // kd-tree: 00620 // The main search data structure supported by ANN is a kd-tree. 00621 // The main constructor is given a set of points and a choice of 00622 // splitting method to use in building the tree. 00623 // 00624 // Construction: 00625 // ------------- 00626 // The constructor is given the point array, number of points, 00627 // dimension, bucket size (default = 1), and the splitting rule 00628 // (default = ANN_KD_SUGGEST). The point array is not copied, and 00629 // is assumed to be kept constant throughout the lifetime of the 00630 // search structure. There is also a "load" constructor that 00631 // builds a tree from a file description that was created by the 00632 // Dump operation. 00633 // 00634 // Search: 00635 // ------- 00636 // There are two search methods: 00637 // 00638 // Standard search (annkSearch()): 00639 // Searches nodes in tree-traversal order, always visiting 00640 // the closer child first. 00641 // Priority search (annkPriSearch()): 00642 // Searches nodes in order of increasing distance of the 00643 // associated cell from the query point. For many 00644 // distributions the standard search seems to work just 00645 // fine, but priority search is safer for worst-case 00646 // performance. 00647 // 00648 // Printing: 00649 // --------- 00650 // There are two methods provided for printing the tree. Print() 00651 // is used to produce a "human-readable" display of the tree, with 00652 // indenation, which is handy for debugging. Dump() produces a 00653 // format that is suitable reading by another program. There is a 00654 // "load" constructor, which constructs a tree which is assumed to 00655 // have been saved by the Dump() procedure. 00656 // 00657 // Performance and Structure Statistics: 00658 // ------------------------------------- 00659 // The procedure getStats() collects statistics information on the 00660 // tree (its size, height, etc.) See ANNperf.h for information on 00661 // the stats structure it returns. 00662 // 00663 // Internal information: 00664 // --------------------- 00665 // The data structure consists of three major chunks of storage. 00666 // The first (implicit) storage are the points themselves (pts), 00667 // which have been provided by the users as an argument to the 00668 // constructor, or are allocated dynamically if the tree is built 00669 // using the load constructor). These should not be changed during 00670 // the lifetime of the search structure. It is the user's 00671 // responsibility to delete these after the tree is destroyed. 00672 // 00673 // The second is the tree itself (which is dynamically allocated in 00674 // the constructor) and is given as a pointer to its root node 00675 // (root). These nodes are automatically deallocated when the tree 00676 // is deleted. See the file src/kd_tree.h for further information 00677 // on the structure of the tree nodes. 00678 // 00679 // Each leaf of the tree does not contain a pointer directly to a 00680 // point, but rather contains a pointer to a "bucket", which is an 00681 // array consisting of point indices. The third major chunk of 00682 // storage is an array (pidx), which is a large array in which all 00683 // these bucket subarrays reside. (The reason for storing them 00684 // separately is the buckets are typically small, but of varying 00685 // sizes. This was done to avoid fragmentation.) This array is 00686 // also deallocated when the tree is deleted. 00687 // 00688 // In addition to this, the tree consists of a number of other 00689 // pieces of information which are used in searching and for 00690 // subsequent tree operations. These consist of the following: 00691 // 00692 // dim Dimension of space 00693 // n_pts Number of points currently in the tree 00694 // n_max Maximum number of points that are allowed 00695 // in the tree 00696 // bkt_size Maximum bucket size (no. of points per leaf) 00697 // bnd_box_lo Bounding box low point 00698 // bnd_box_hi Bounding box high point 00699 // splitRule Splitting method used 00700 // 00701 //---------------------------------------------------------------------- 00702 00703 //---------------------------------------------------------------------- 00704 // Some types and objects used by kd-tree functions 00705 // See src/kd_tree.h and src/kd_tree.cpp for definitions 00706 //---------------------------------------------------------------------- 00707 class ANNkdStats; // stats on kd-tree 00708 class ANNkd_node; // generic node in a kd-tree 00709 typedef ANNkd_node* ANNkd_ptr; // pointer to a kd-tree node 00710 00711 class DLL_API ANNkd_tree: public ANNpointSet { 00712 protected: 00713 int dim; // dimension of space 00714 int n_pts; // number of points in tree 00715 int bkt_size; // bucket size 00716 ANNpointArray pts; // the points 00717 ANNidxArray pidx; // point indices (to pts array) 00718 ANNkd_ptr root; // root of kd-tree 00719 ANNpoint bnd_box_lo; // bounding box low point 00720 ANNpoint bnd_box_hi; // bounding box high point 00721 00722 void SkeletonTree( // construct skeleton tree 00723 int n, // number of points 00724 int dd, // dimension 00725 int bs, // bucket size 00726 ANNpointArray pa = NULL, // point array (optional) 00727 ANNidxArray pi = NULL); // point indices (optional) 00728 00729 public: 00730 ANNkd_tree( // build skeleton tree 00731 int n = 0, // number of points 00732 int dd = 0, // dimension 00733 int bs = 1); // bucket size 00734 00735 ANNkd_tree( // build from point array 00736 ANNpointArray pa, // point array 00737 int n, // number of points 00738 int dd, // dimension 00739 int bs = 1, // bucket size 00740 ANNsplitRule split = ANN_KD_SUGGEST); // splitting method 00741 00742 ANNkd_tree( // build from dump file 00743 std::istream& in); // input stream for dump file 00744 00745 ~ANNkd_tree(); // tree destructor 00746 00747 void annkSearch( // approx k near neighbor search 00748 ANNpoint q, // query point 00749 int k, // number of near neighbors to return 00750 ANNidxArray nn_idx, // nearest neighbor array (modified) 00751 ANNdistArray dd, // dist to near neighbors (modified) 00752 double eps=0.0); // error bound 00753 00754 void annkPriSearch( // priority k near neighbor search 00755 ANNpoint q, // query point 00756 int k, // number of near neighbors to return 00757 ANNidxArray nn_idx, // nearest neighbor array (modified) 00758 ANNdistArray dd, // dist to near neighbors (modified) 00759 double eps=0.0); // error bound 00760 00761 int annkFRSearch( // approx fixed-radius kNN search 00762 ANNpoint q, // the query point 00763 ANNdist sqRad, // squared radius of query ball 00764 int k, // number of neighbors to return 00765 ANNidxArray nn_idx = NULL, // nearest neighbor array (modified) 00766 ANNdistArray dd = NULL, // dist to near neighbors (modified) 00767 double eps=0.0); // error bound 00768 00769 int theDim() // return dimension of space 00770 { return dim; } 00771 00772 int nPoints() // return number of points 00773 { return n_pts; } 00774 00775 ANNpointArray thePoints() // return pointer to points 00776 { return pts; } 00777 00778 virtual void Print( // print the tree (for debugging) 00779 ANNbool with_pts, // print points as well? 00780 std::ostream& out); // output stream 00781 00782 virtual void Dump( // dump entire tree 00783 ANNbool with_pts, // print points as well? 00784 std::ostream& out); // output stream 00785 00786 virtual void getStats( // compute tree statistics 00787 ANNkdStats& st); // the statistics (modified) 00788 }; 00789 00790 //---------------------------------------------------------------------- 00791 // Box decomposition tree (bd-tree) 00792 // The bd-tree is inherited from a kd-tree. The main difference 00793 // in the bd-tree and the kd-tree is a new type of internal node 00794 // called a shrinking node (in the kd-tree there is only one type 00795 // of internal node, a splitting node). The shrinking node 00796 // makes it possible to generate balanced trees in which the 00797 // cells have bounded aspect ratio, by allowing the decomposition 00798 // to zoom in on regions of dense point concentration. Although 00799 // this is a nice idea in theory, few point distributions are so 00800 // densely clustered that this is really needed. 00801 //---------------------------------------------------------------------- 00802 00803 class DLL_API ANNbd_tree: public ANNkd_tree { 00804 public: 00805 ANNbd_tree( // build skeleton tree 00806 int n, // number of points 00807 int dd, // dimension 00808 int bs = 1) // bucket size 00809 : ANNkd_tree(n, dd, bs) {} // build base kd-tree 00810 00811 ANNbd_tree( // build from point array 00812 ANNpointArray pa, // point array 00813 int n, // number of points 00814 int dd, // dimension 00815 int bs = 1, // bucket size 00816 ANNsplitRule split = ANN_KD_SUGGEST, // splitting rule 00817 ANNshrinkRule shrink = ANN_BD_SUGGEST); // shrinking rule 00818 00819 ANNbd_tree( // build from dump file 00820 std::istream& in); // input stream for dump file 00821 }; 00822 00823 //---------------------------------------------------------------------- 00824 // Other functions 00825 // annMaxPtsVisit Sets a limit on the maximum number of points 00826 // to visit in the search. 00827 // annClose Can be called when all use of ANN is finished. 00828 // It clears up a minor memory leak. 00829 //---------------------------------------------------------------------- 00830 00831 DLL_API void annMaxPtsVisit( // max. pts to visit in search 00832 int maxPts); // the limit 00833 00834 DLL_API void annClose(); // called to end use of ANN 00835 00836 #endif