/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math3.stat.clustering;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.math3.exception.NotPositiveException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.util.MathUtils;
DBSCAN (density-based spatial clustering of applications with noise) algorithm.
The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
a point p is density connected to another point q, if there exists a chain of
points pi, with i = 1 .. n and p1 = p and pn = q,
such that each pair <pi, pi+1> is directly density-reachable.
A point q is directly density-reachable from point p if it is in the ε-neighborhood
of this point.
Any point that is not density-reachable from a formed cluster is treated as noise, and
will thus not be present in the result.
The algorithm requires two parameters:
- eps: the distance that defines the ε-neighborhood of a point
- minPoints: the minimum number of density-connected points required to form a cluster
Note: as DBSCAN is not a centroid-based clustering algorithm, the resulting Cluster
objects will have no defined center, i.e. Cluster.getCenter()
will return null
.
Type parameters: - <T> – type of the points to cluster
See Also: Since: 3.1 Deprecated: As of 3.2 (to be removed in 4.0), use DBSCANClusterer
instead
/**
* DBSCAN (density-based spatial clustering of applications with noise) algorithm.
* <p>
* The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
* a point p is density connected to another point q, if there exists a chain of
* points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
* such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable.
* A point q is directly density-reachable from point p if it is in the ε-neighborhood
* of this point.
* <p>
* Any point that is not density-reachable from a formed cluster is treated as noise, and
* will thus not be present in the result.
* <p>
* The algorithm requires two parameters:
* <ul>
* <li>eps: the distance that defines the ε-neighborhood of a point
* <li>minPoints: the minimum number of density-connected points required to form a cluster
* </ul>
* <p>
* <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
* {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
* return {@code null}.
*
* @param <T> type of the points to cluster
* @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
* @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
* A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
* @since 3.1
* @deprecated As of 3.2 (to be removed in 4.0),
* use {@link org.apache.commons.math3.ml.clustering.DBSCANClusterer} instead
*/
@Deprecated
public class DBSCANClusterer<T extends Clusterable<T>> {
Maximum radius of the neighborhood to be considered. /** Maximum radius of the neighborhood to be considered. */
private final double eps;
Minimum number of points needed for a cluster. /** Minimum number of points needed for a cluster. */
private final int minPts;
Status of a point during the clustering process. /** Status of a point during the clustering process. */
private enum PointStatus {
The point has is considered to be noise. /** The point has is considered to be noise. */
NOISE,
The point is already part of a cluster. /** The point is already part of a cluster. */
PART_OF_CLUSTER
}
Creates a new instance of a DBSCANClusterer.
Params: - eps – maximum radius of the neighborhood to be considered
- minPts – minimum number of points needed for a cluster
Throws: - NotPositiveException – if
eps < 0.0
or minPts < 0
/**
* Creates a new instance of a DBSCANClusterer.
*
* @param eps maximum radius of the neighborhood to be considered
* @param minPts minimum number of points needed for a cluster
* @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
*/
public DBSCANClusterer(final double eps, final int minPts)
throws NotPositiveException {
if (eps < 0.0d) {
throw new NotPositiveException(eps);
}
if (minPts < 0) {
throw new NotPositiveException(minPts);
}
this.eps = eps;
this.minPts = minPts;
}
Returns the maximum radius of the neighborhood to be considered.
Returns: maximum radius of the neighborhood
/**
* Returns the maximum radius of the neighborhood to be considered.
*
* @return maximum radius of the neighborhood
*/
public double getEps() {
return eps;
}
Returns the minimum number of points needed for a cluster.
Returns: minimum number of points needed for a cluster
/**
* Returns the minimum number of points needed for a cluster.
*
* @return minimum number of points needed for a cluster
*/
public int getMinPts() {
return minPts;
}
Performs DBSCAN cluster analysis.
Note: as DBSCAN is not a centroid-based clustering algorithm, the resulting Cluster
objects will have no defined center, i.e. Cluster.getCenter()
will return null
.
Params: - points – the points to cluster
Throws: - NullArgumentException – if the data points are null
Returns: the list of clusters
/**
* Performs DBSCAN cluster analysis.
* <p>
* <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
* {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
* return {@code null}.
*
* @param points the points to cluster
* @return the list of clusters
* @throws NullArgumentException if the data points are null
*/
public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
// sanity checks
MathUtils.checkNotNull(points);
final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>();
for (final T point : points) {
if (visited.get(point) != null) {
continue;
}
final List<T> neighbors = getNeighbors(point, points);
if (neighbors.size() >= minPts) {
// DBSCAN does not care about center points
final Cluster<T> cluster = new Cluster<T>(null);
clusters.add(expandCluster(cluster, point, neighbors, points, visited));
} else {
visited.put(point, PointStatus.NOISE);
}
}
return clusters;
}
Expands the cluster to include density-reachable items.
Params: - cluster – Cluster to expand
- point – Point to add to cluster
- neighbors – List of neighbors
- points – the data set
- visited – the set of already visited points
Returns: the expanded cluster
/**
* Expands the cluster to include density-reachable items.
*
* @param cluster Cluster to expand
* @param point Point to add to cluster
* @param neighbors List of neighbors
* @param points the data set
* @param visited the set of already visited points
* @return the expanded cluster
*/
private Cluster<T> expandCluster(final Cluster<T> cluster,
final T point,
final List<T> neighbors,
final Collection<T> points,
final Map<Clusterable<T>, PointStatus> visited) {
cluster.addPoint(point);
visited.put(point, PointStatus.PART_OF_CLUSTER);
List<T> seeds = new ArrayList<T>(neighbors);
int index = 0;
while (index < seeds.size()) {
final T current = seeds.get(index);
PointStatus pStatus = visited.get(current);
// only check non-visited points
if (pStatus == null) {
final List<T> currentNeighbors = getNeighbors(current, points);
if (currentNeighbors.size() >= minPts) {
seeds = merge(seeds, currentNeighbors);
}
}
if (pStatus != PointStatus.PART_OF_CLUSTER) {
visited.put(current, PointStatus.PART_OF_CLUSTER);
cluster.addPoint(current);
}
index++;
}
return cluster;
}
Returns a list of density-reachable neighbors of a point
. Params: - point – the point to look for
- points – possible neighbors
Returns: the List of neighbors
/**
* Returns a list of density-reachable neighbors of a {@code point}.
*
* @param point the point to look for
* @param points possible neighbors
* @return the List of neighbors
*/
private List<T> getNeighbors(final T point, final Collection<T> points) {
final List<T> neighbors = new ArrayList<T>();
for (final T neighbor : points) {
if (point != neighbor && neighbor.distanceFrom(point) <= eps) {
neighbors.add(neighbor);
}
}
return neighbors;
}
Merges two lists together.
Params: - one – first list
- two – second list
Returns: merged lists
/**
* Merges two lists together.
*
* @param one first list
* @param two second list
* @return merged lists
*/
private List<T> merge(final List<T> one, final List<T> two) {
final Set<T> oneSet = new HashSet<T>(one);
for (T item : two) {
if (!oneSet.contains(item)) {
one.add(item);
}
}
return one;
}
}