Boost logo

Boost-Commit :

From: franklin.jonathan_at_[hidden]
Date: 2008-05-08 16:36:50


Author: jfranklin
Date: 2008-05-08 16:36:49 EDT (Thu, 08 May 2008)
New Revision: 45228
URL: http://svn.boost.org/trac/boost/changeset/45228

Log:
Created detail subdir.
Moved naive query function into its own header file in detail subdir.
Added header for Yuan Ming Chen's k_means algorithm implementation.

Added:
   sandbox/boost/algorithm/cluster/detail/
   sandbox/boost/algorithm/cluster/detail/naive_query.hpp (contents, props changed)
   sandbox/boost/algorithm/cluster/k_means.hpp (contents, props changed)
Text files modified:
   sandbox/boost/algorithm/cluster/dbscan.hpp | 28 +++-------------------------
   1 files changed, 3 insertions(+), 25 deletions(-)

Modified: sandbox/boost/algorithm/cluster/dbscan.hpp
==============================================================================
--- sandbox/boost/algorithm/cluster/dbscan.hpp (original)
+++ sandbox/boost/algorithm/cluster/dbscan.hpp 2008-05-08 16:36:49 EDT (Thu, 08 May 2008)
@@ -8,6 +8,7 @@
 
 #include <boost/algorithm/cluster/cluster_data.hpp>
 #include <boost/algorithm/cluster/concept.hpp>
+#include <boost/algorithm/cluster/detail/naive_query.hpp>
 #include <vector>
 
 namespace boost
@@ -23,29 +24,6 @@
 int const UNCLASSIFIED = -1;
 int const NOISE = 0;
 
-// TODO: Replace this naive query function w/ R*-tree or fractional cascading.
-// This query mechanism makes the runtime quadratic.
-template<typename NTupleIterT, typename DistFunT>
-static void query(
- NTupleIterT const & query_pt,
- NTupleIterT const & begin,
- NTupleIterT const & end,
- typename NTupleIterT::difference_type eps,
- DistFunT const & d,
- std::vector<NTupleIterT> & v)
-{
- for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt)
- {
- if (query_pt == cur_pt)
- continue;
-
- if (d(*query_pt->tuple, *cur_pt->tuple) > eps)
- continue;
-
- v.push_back(cur_pt);
- }
-}
-
 // TODO: Replace this so we don't have to store the cluster info for each tuple?
 template<typename NTupleIterT>
 struct node
@@ -107,7 +85,7 @@
     // Expand cluster.
 
     std::vector<ntuple_nodes::iterator> seeds;
- detail::query(it, tuples.begin(), tuples.end(), eps, d, seeds);
+ detail::naive_query(it, tuples.begin(), tuples.end(), eps, d, seeds);
     // If the neighborhood of this tuple is too small, then mark it as noise.
     if (seeds.size() < min_points)
     {
@@ -137,7 +115,7 @@
       seeds.pop_back();
 
       std::vector<ntuple_nodes::iterator> results;
- detail::query(cur, tuples.begin(), tuples.end(), eps, d, results);
+ detail::naive_query(cur, tuples.begin(), tuples.end(), eps, d, results);
 
       if (results.size() >= min_points)
       {

Added: sandbox/boost/algorithm/cluster/detail/naive_query.hpp
==============================================================================
--- (empty file)
+++ sandbox/boost/algorithm/cluster/detail/naive_query.hpp 2008-05-08 16:36:49 EDT (Thu, 08 May 2008)
@@ -0,0 +1,50 @@
+// (C) Copyright Jonathan Franklin 2008.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#if ! defined BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
+#define BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP
+
+#include <boost/algorithm/cluster/cluster_data.hpp>
+#include <boost/algorithm/cluster/concept.hpp>
+#include <vector>
+
+namespace boost
+{
+namespace algorithm
+{
+namespace cluster
+{
+namespace detail
+{
+
+// TODO: Replace this naive query function w/ R*-tree or fractional cascading.
+// This query mechanism makes the runtime quadratic.
+template<typename NTupleIterT, typename DistFunT>
+static void naive_query(
+ NTupleIterT const & query_pt,
+ NTupleIterT const & begin,
+ NTupleIterT const & end,
+ typename NTupleIterT::difference_type eps,
+ DistFunT const & d,
+ std::vector<NTupleIterT> & v)
+{
+ for(NTupleIterT cur_pt = begin; cur_pt != end; ++cur_pt)
+ {
+ if (query_pt == cur_pt)
+ continue;
+
+ if (d(*query_pt->tuple, *cur_pt->tuple) > eps)
+ continue;
+
+ v.push_back(cur_pt);
+ }
+}
+
+} // End of namespace detail.
+} // End of namespace cluster
+} // End of namespace algorithm
+} // End of namespace boost
+
+#endif // BOOST_ALGORITHM_CLUSTER_DETAIL_NAIVE_QUERY_HPP

Added: sandbox/boost/algorithm/cluster/k_means.hpp
==============================================================================
--- (empty file)
+++ sandbox/boost/algorithm/cluster/k_means.hpp 2008-05-08 16:36:49 EDT (Thu, 08 May 2008)
@@ -0,0 +1,195 @@
+/*****
+** References
+** - J. MacQueen, "Some methods for classification and analysis
+** of multivariate observations", Fifth Berkeley Symposium on
+** Math Statistics and Probability, 281-297, 1967.
+** - I.S. Dhillon and D.S. Modha, "A data-clustering algorithm
+** on distributed memory multiprocessors",
+** Large-Scale Parallel Data Mining, 245-260, 1999.
+** Yuanming Chen, 2008-05-08
+*/
+
+#ifndef BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP
+#define BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP
+
+#include <cmath>
+#include <float.h>
+//#include "common.hpp"
+#include <vector>
+#include <list>
+#include <cassert>
+
+namespace boost {
+ namespace algorithm {
+ namespace cluster {
+ namespace detail {
+ template<typename AttributeType, typename differenceType>
+ //The original C function
+ int *k_means(AttributeType **data, int n, int m, int k, differenceType eps, AttributeType **centroids)
+ {
+ /* output cluster label for each data point */
+ int *labels = (int*)calloc(n, sizeof(int));
+
+ int h, i, j; /* loop counters, of course :) */
+ int *counts = (int*)calloc(k, sizeof(int)); /* size of each cluster */
+ AttributeType old_error, error = FLT_MAX; /* sum of squared euclidean distance */
+ AttributeType **c = centroids ? centroids : (AttributeType**)calloc(k, sizeof(AttributeType*));
+ AttributeType **c1 = (AttributeType**)calloc(k, sizeof(AttributeType*)); /* temp centroids */
+
+ //assert(data && k > 0 && k <= n && m > 0 && t >= 0); /* for debugging */
+
+ /****
+ ** initialization */
+
+ for (h = i = 0; i < k; h += n / k, i++) {
+ c1[i] = (AttributeType*)calloc(m, sizeof(AttributeType));
+ if (!centroids) {
+ c[i] = (AttributeType*)calloc(m, sizeof(AttributeType));
+ }
+ /* pick k points as initial centroids */
+ for (j = m; j-- > 0; c[i][j] = data[h][j]);
+ }
+
+ /****
+ ** main loop */
+
+ do {
+ /* save error from last step */
+ old_error = error, error = 0;
+
+ /* clear old counts and temp centroids */
+ for (i = 0; i < k; counts[i++] = 0) {
+ for (j = 0; j < m; c1[i][j++] = 0);
+ }
+
+ for (h = 0; h < n; h++) {
+ /* identify the closest cluster */
+ AttributeType min_distance = FLT_MAX;
+ for (i = 0; i < k; i++) {
+ AttributeType distance = 0;
+ for (j = m; j-- > 0; distance += pow(data[h][j] - c[i][j], 2));
+ if (distance < min_distance) {
+ labels[h] = i;
+ min_distance = distance;
+ }
+ }
+ /* update size and temp centroid of the destination cluster */
+ for (j = m; j-- > 0; c1[labels[h]][j] += data[h][j]);
+ counts[labels[h]]++;
+ /* update standard error */
+ error += min_distance;
+ }
+
+ for (i = 0; i < k; i++) { /* update all centroids */
+ for (j = 0; j < m; j++) {
+ c[i][j] = counts[i] ? c1[i][j] / counts[i] : c1[i][j];
+ }
+ }
+
+ } while (fabs(error - old_error) > eps);
+
+ /****
+ ** housekeeping */
+
+ for (i = 0; i < k; i++) {
+ if (!centroids) {
+ free(c[i]);
+ }
+ free(c1[i]);
+ }
+
+ if (!centroids) {
+ free(c);
+ }
+ free(c1);
+
+ free(counts);
+
+ return labels;
+ }
+ } //End of details namespace
+
+ template<typename PointType>
+ struct KMeansCluster {
+ PointType centroid;
+ std::vector<int> points; //The indice of points are stored here
+ };
+
+ template <typename KMeansCluster>
+ struct KMeansClustering {
+ typedef std::vector< KMeansCluster > type;
+ type clusters;
+ };
+
+ /**
+ * @param first: the first data point's iterator
+ * @param last: the last data point's iterator
+ * @param k: the k value for the k-mean algorithm
+ * @return collections of clusters
+ */
+ template <typename NTupleIter>
+ typename KMeansClustering< typename KMeansCluster<typename NTupleIter::value_type> >
+ k_means(NTupleIter first, NTupleIter last, unsigned k,
+ typename NTupleIter::difference_type const & eps)
+ {
+ typedef NTupleIter::difference_type DistanceType;
+ typedef NTupleIter::value_type PointType;
+ typedef PointType::value_type AttributeType; //For the c funtion test, it will be a double type
+ const DistanceType knumOfPoints = last - first; //The n variable in the C function
+ const size_t knDimension = PointType::size(); //The m variable in the C function
+
+ AttributeType** ppData = new AttributeType* [knumOfPoints];
+ AttributeType** centroids = new AttributeType* [k];
+ //Pre-allocate the result array
+ for(size_t nCentroid = 0; nCentroid < k; nCentroid++)
+ {
+ centroids[nCentroid] = new AttributeType[knDimension];
+ }
+
+ int nIndex = 0;
+ for(NTupleIter iter = first; iter != last; iter++, nIndex++)
+ {
+ PointType& pt= *iter; //A point
+ ppData[nIndex] = new AttributeType[knDimension];
+ for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++)
+ {
+ ppData[nIndex][nAttribute] = pt[nAttribute];
+ }
+ }
+
+ int* labels = detail::k_means(ppData, (int) knumOfPoints, (int) knDimension, k, eps, centroids);
+
+ typedef KMeansCluster<PointType> KMeansClusterType;
+ KMeansClustering< KMeansClusterType > clustering;
+ for(size_t nCentroid = 0; nCentroid < k; nCentroid++)
+ {
+
+ KMeansClusterType cluster;
+ PointType centroid;
+ for(unsigned int nAttribute = 0; nAttribute < knDimension; nAttribute++)
+ {
+ centroid[nAttribute] = centroids[nCentroid][nAttribute];
+ }
+ cluster.centroid = centroid;
+ clustering.clusters.push_back(cluster);
+ delete[] centroids[nCentroid];
+ }
+
+ for(int nPoint = 0; nPoint < knumOfPoints; nPoint++)
+ {
+ int nCentroidIndex = labels[nPoint];
+ clustering.clusters[nCentroidIndex].points.push_back(nPoint);
+ delete[] ppData[nPoint];
+ }
+
+ delete[] centroids;
+ delete[] ppData;
+ delete[] labels;
+
+ return clustering;
+ }
+ } //End of cluster namespace
+ } //End of algorithm namespace
+} //End of boost namespace
+
+#endif // BOOST_ALGORITHM_CLUSTER_K_MEANS_HPP


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk