I'm working on a spatial clustering algorithm using pandas and scipy's kdtree. I profiled the code, and the .loc
part takes the most time for bigger datasets. I wonder if it's possible to speed up the points.loc[idx, 'cluster'] = clusterNr
assignment somehow.
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
def getRandomCoordinates(samples=1000, offsetX=52.2, offsetY=13.1, width=0.5):
    """Return a DataFrame of random coordinates inside a square window.

    Args:
        samples: Number of rows (points) to generate.
        offsetX: Value added to every entry of the ``lat`` column.
        offsetY: Value added to every entry of the ``lng`` column.
        width: Side length of the window; the raw ``np.random.rand`` values
            in ``[0, 1)`` are scaled by this factor before the offsets are
            applied.

    Returns:
        A ``pandas.DataFrame`` with the columns ``lat`` and ``lng`` (in that
        order) and ``samples`` rows. The frame additionally carries the plain
        attributes ``columnX = 'lat'`` and ``columnY = 'lng'``.
    """
    # Draw from NumPy's global random state so that np.random.seed() still
    # makes the output reproducible.
    points = np.random.rand(samples, 2) * width
    data = pd.DataFrame(points, columns=['lat', 'lng'])

    # Bracket access is used for the column updates so they can never be
    # confused with setting an attribute on the frame.
    data['lat'] += offsetX
    data['lng'] += offsetY

    # Record which columns hold the spatial axes.
    # NOTE(review): these are ordinary Python attributes on this one object;
    # as far as I can tell pandas does not carry them over to frames derived
    # from it (e.g. the result of ``.sample()``) - confirm before relying on
    # them downstream.
    data.columnX = 'lat'
    data.columnY = 'lng'
    return data
# Search radius (in coordinate units) used when linking points to a sample.
radius = 0.01
points = getRandomCoordinates(25)
# Randomly chosen rows of `points` that act as cluster seeds; the seed's
# position in `samples` (0..9) becomes the cluster number.
samples = points.sample(10)
tree = NearestNeighbors(
    n_neighbors=2, radius=0.1, leaf_size=30, algorithm="ball_tree", n_jobs=1
).fit(points)

# Sparse connectivity graph with one row per sample and one column per point;
# an entry is set where the point lies within `radius` of the sample.
graph = tree.radius_neighbors_graph(samples, radius, mode='connectivity')

# Dense boolean copy, kept so the `nngrf` name stays available (it is printed
# further down). The `np.bool` alias was removed in NumPy 1.24, so the builtin
# `bool` is used. For large inputs this dense copy is the memory-hungry part
# and can be dropped, since the assignment below only needs the sparse graph.
nngrf = graph.toarray().astype(bool)

# Vectorized replacement for
#     for clusterNr, idx in enumerate(nngrf):
#         points.loc[idx, 'cluster'] = clusterNr
# In that loop a later sample overwrites an earlier one, so every point ends
# up with the HIGHEST sample row index that reaches it. `np.maximum.at`
# computes exactly that in one pass and, unlike plain fancy-index assignment,
# gives a well-defined result when a point index occurs more than once.
sample_idx, point_idx = graph.nonzero()
cluster = np.full(len(points), -1, dtype=np.int64)  # -1 = not in any cluster
np.maximum.at(cluster, point_idx, sample_idx)
points['cluster'] = cluster
Input data:
lng lat
0 12.988426 52.343361
1 13.055824 52.396462
2 13.353571 52.347457
3 12.980915 52.339021
4 13.232137 52.339155
5 12.877804 52.385926
6 13.220915 52.378951
7 13.479688 52.424455
8 13.324399 52.637530
9 13.052958 52.398084
10 13.087653 52.413064
11 13.330557 52.637883
12 13.354927 52.380040
13 13.163061 52.514445
14 13.371755 52.520665
15 13.698472 52.389397
16 13.405825 52.507757
17 13.239793 52.391341
18 13.369102 52.525122
19 13.322234 52.511453
20 13.326276 52.515045
21 13.318642 52.296283
22 13.411129 52.478509
23 13.207719 52.283844
24 13.222899 52.381747
and the result:
lng lat cluster
0 12.988426 52.343361 9
1 13.055824 52.396462 6
2 13.353571 52.347457 -1
3 12.980915 52.339021 9
4 13.232137 52.339155 4
5 12.877804 52.385926 -1
6 13.220915 52.378951 7
7 13.479688 52.424455 -1
8 13.324399 52.637530 -1
9 13.052958 52.398084 6
10 13.087653 52.413064 5
11 13.330557 52.637883 -1
12 13.354927 52.380040 0
13 13.163061 52.514445 -1
14 13.371755 52.520665 2
15 13.698472 52.389397 -1
16 13.405825 52.507757 1
17 13.239793 52.391341 -1
18 13.369102 52.525122 2
19 13.322234 52.511453 8
20 13.326276 52.515045 8
21 13.318642 52.296283 -1
22 13.411129 52.478509 -1
23 13.207719 52.283844 3
24 13.222899 52.381747 7
and the nearest neighbors graph:
[[False False False False False False False False False False False False
False False True False False False True False False False False False
False]
[False False True False False False False False False False False False
False False False False False False False False False False False False
False]
[False False False False False False False False False False False False
False False False False False False False False False False True False
False]
[False False False False False False False False False False False False
True False False False False False False False False False False False
False]
[False False False False False False True False False False False False
False False False False False False False False False False False False
True]
[False False False False False False False False False False False False
False False False False False False False True True False False False
False]
[False False False False False True False False False False False False
False False False False False False False False False False False False
False]
[False False False False False False False False False False False False
False False False True False False False False False False False False
False]
[False False False False False False False False False False False False
False False False False False False False False False False False True
False]
[False False False False False False False False False False False False
False False False False False False False True True False False False
False]]