I've recently just finished my implementation of a DBSCAN in C++ for a machine learning framework. I've tried to follow the pseudocode implementation on Wikipedia as best I could. I also found some example implementations on Github to help as well. So far, this is what my code looks like. I was wondering if my implementation looks similar to the pseudocode and if their was anything I could add in terms of correctness or even speed.
std::vector<uint32_t> DBSCAN::regionQuery(uint32_t p)
{
Metrics metrics;
MetricCoordinates _mc;
uint32_t coordinates_size = coordinates[0]->lat_pts.size();
if (cluster_weights.dist_metric == "haversine"){
for (uint32_t i = 0; i < coordinates_size; i++){
_mc.lat_1 = coordinates[0]->lat_pts[i];
_mc.lat_2 = coordinates[0]->lat_pts[p];
_mc.long_1 = coordinates[0]->long_pts[i];
_mc.long_2 = coordinates[0]->long_pts[p];
if (metrics.haversineDistanceMetric(_mc) <= cluster_weights.eps){
rq_pts.push_back(i);
}
}
} else if (cluster_weights.dist_metric == "euclidean"){
for (uint32_t i = 0; i < coordinates_size; i++){
_mc.lat_1 = coordinates[0]->lat_pts[i];
_mc.lat_2 = coordinates[0]->lat_pts[p];
_mc.long_1 = coordinates[0]->long_pts[i];
_mc.long_2 = coordinates[0]->long_pts[p];
if (metrics.euclideanDistanceMetric(_mc) <= cluster_weights.eps){
rq_pts.push_back(i);
}
}
}
// all points within the eps neighborhood
return rq_pts;
}
void DBSCAN::expandCluster(uint32_t p, std::vector<uint32_t>* ec_neighbor_pts, int32_t* n_clusters)
{
cluster_pts.push_back(std::vector<int32_t>());
cluster_pts[*n_clusters].push_back(p);
uint32_t ec_neighbors_size = ec_neighbor_pts->size();
assert(ec_neighbors_size != 0);
for (uint32_t i = 0; i < ec_neighbors_size; i++){
if (!visited_pts[ec_neighbor_pts->at(i)]){
// mark point p as visited
visited_pts[ec_neighbor_pts->at(i)] = true;
std::vector<uint32_t> ec_neighbor_pts_ = regionQuery(ec_neighbor_pts->at(i));
if (ec_neighbor_pts_.size() >= cluster_weights.min_pts){
ec_neighbor_pts->insert(ec_neighbor_pts->end(), ec_neighbor_pts_.begin(), ec_neighbor_pts_.end());
}
// mark point p as clustered
clustered_pts[ec_neighbor_pts->at(i)] = true;
// add any other points that haven't been clustered
if (clustered_pts[ec_neighbor_pts->at(i)]){
cluster_pts[*n_clusters].push_back(ec_neighbor_pts->at(i));
}
}
}
}
void DBSCAN::performClusterSearch()
{
uint32_t coordinates_size = coordinates[0]->lat_pts.size();
for (uint32_t i = 0; i < coordinates_size; i++){
if (visited_pts[i]) {
continue;
} else {
// mark point p as visited
visited_pts[i] = true;
std::vector<uint32_t> rq_neighbor_pts = regionQuery(i);
if (rq_neighbor_pts.size() < cluster_weights.min_pts){
noise_pts_.push_back(rq_neighbor_pts[i]);
} else {
n_clusters_++;
// mark point p as clustered
clustered_pts[i] = true;
expandCluster(i, &rq_neighbor_pts, &n_clusters_);
}
}
}
}