Python Calculate Distance Closest Xy Points

python calculate distance closest xy points

A possibility is to use a breadth-first search to scan all elements, and find the closest point for each element popped off the queue:

import re, collections
import math

s = ["9.5 7.5", "10.2 19.1", "9.7 10.2", "2.5 3.6", "5.5 6.5", "7.8 9.8"]
def cast_data(f):
   def wrapper(*args, **kwargs):
     data, [start] = args
     return list(map(lambda x:' '.join(map(str, x)), f(list(map(lambda x:list(map(float, re.findall('[\d\.]+', x))), data)), list(map(float, re.findall('[\d\.]+', start))))))
   return wrapper

@cast_data
def bfs(data, start, results=[]):
   queue = collections.deque([start])
   while queue and data:
     result = queue.popleft()
     possible = min(data, key=lambda x:math.hypot(*[c-d for c, d in zip(result, x)]))
     if possible not in results:
       results.append(possible)
       queue.append(possible)
       data = list(filter(lambda x:x != possible, data))
   return results

print(bfs(s, ["2.2 4.6"]))

Output:

['2.5 3.6', '5.5 6.5', '7.8 9.8', '9.7 10.2', '9.5 7.5', '10.2 19.1']

The result is the listing of closest points, as determined by using math.hypot.

How to find the closest coordinate from a list of points?

Numpy has a useful function : norm.

import numpy as np
A = [(26, 63), (25, 63), (24, 63), (23, 63), (22, 63), (21, 63), (20, 63), (22, 62), (27, 63)]
A = np.array(A)
leftbottom = np.array((0,238))
distances = np.linalg.norm(A-leftbottom, axis=1)
min_index = np.argmin(distances)
print(f"the closest point is {A[min_index]}, at a distance of {distances[min_index]}")

Result:

the closest point is [20 63], at a distance of 176.13914953808538

Find the nearest point in distance for all the points in the dataset - Python

The brute force method of finding the nearest of N points to a given point is O(N) -- you'd have to check each point.
In contrast, if the N points are stored in a KD-tree, then finding the nearest point is on average O(log(N)).
There is also the additional one-time cost of building the KD-tree, which requires O(N) time.

If you need to repeat this process N times, then the brute force method is O(N**2) and the kd-tree method is O(N*log(N)).
Thus, for large enough N, the KD-tree will beat the brute force method.

See here for more on nearest neighbor algorithms (including KD-tree).

Below (in the function using_kdtree) is a way to compute the great circle arclengths of nearest neighbors using scipy.spatial.kdtree.

scipy.spatial.kdtree uses the Euclidean distance between points, but there is a formula for converting Euclidean chord distances between points on a sphere to great circle arclength (given the radius of the sphere).
So the idea is to convert the latitude/longitude data into cartesian coordinates, use a KDTree to find the nearest neighbors, and then apply the great circle distance formula to obtain the desired result.

Here are some benchmarks. Using N = 100, using_kdtree is 39x faster than the orig (brute force) method.

In [180]: %timeit using_kdtree(data)
100 loops, best of 3: 18.6 ms per loop

In [181]: %timeit using_sklearn(data)
1 loop, best of 3: 214 ms per loop

In [179]: %timeit orig(data)
1 loop, best of 3: 728 ms per loop

For N = 10000:

In [5]: %timeit using_kdtree(data)
1 loop, best of 3: 2.78 s per loop

In [6]: %timeit using_sklearn(data)
1 loop, best of 3: 1min 15s per loop

In [7]: %timeit orig(data)
# untested; too slow

Since using_kdtree is O(N log(N)) and orig is O(N**2), the factor by
which using_kdtree is faster than orig will grow as N, the length of
data, grows.

import numpy as np
import scipy.spatial as spatial
import pandas as pd
import sklearn.neighbors as neighbors
from math import radians, cos, sin, asin, sqrt

R = 6367

def using_kdtree(data):
    "Based on https://stackoverflow.com/q/43020919/190597"
    def dist_to_arclength(chord_length):
        """
        https://en.wikipedia.org/wiki/Great-circle_distance
        Convert Euclidean chord length to great circle arc length
        """
        central_angle = 2*np.arcsin(chord_length/(2.0*R)) 
        arclength = R*central_angle
        return arclength

    phi = np.deg2rad(data['Latitude'])
    theta = np.deg2rad(data['Longitude'])
    data['x'] = R * np.cos(phi) * np.cos(theta)
    data['y'] = R * np.cos(phi) * np.sin(theta)
    data['z'] = R * np.sin(phi)
    tree = spatial.KDTree(data[['x', 'y','z']])
    distance, index = tree.query(data[['x', 'y','z']], k=2)
    return dist_to_arclength(distance[:, 1])

def orig(data):
    def distance(lon1, lat1, lon2, lat2):
        """
        Calculate the great circle distance between two points 
        on the earth (specified in decimal degrees)
        """
        # convert decimal degrees to radians 
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        # haversine formula 
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2.0)**2 + cos(lat1) * cos(lat2) * sin(dlon/2.0)**2
        c = 2 * asin(sqrt(a)) 
        km = R * c
        return km

    shortest_distance = []
    for i in range(len(data)):
        distance1 = []
        for j in range(len(data)):
            if i == j: continue
            distance1.append(distance(data['Longitude'][i], data['Latitude'][i], 
                                      data['Longitude'][j], data['Latitude'][j]))
        shortest_distance.append(min(distance1))
    return shortest_distance


def using_sklearn(data):
    """
    Based on https://stackoverflow.com/a/45127250/190597 (Jonas Adler)
    """
    def distance(p1, p2):
        """
        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees)
        """
        lon1, lat1 = p1
        lon2, lat2 = p2
        # convert decimal degrees to radians
        lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
        # haversine formula
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        km = R * c
        return km
    points = data[['Longitude', 'Latitude']]
    nbrs = neighbors.NearestNeighbors(n_neighbors=2, metric=distance).fit(points)
    distances, indices = nbrs.kneighbors(points)
    result = distances[:, 1]
    return result

np.random.seed(2017)
N = 1000
data = pd.DataFrame({'Latitude':np.random.uniform(-90,90,size=N), 
                     'Longitude':np.random.uniform(0,360,size=N)})

expected = orig(data)
for func in [using_kdtree, using_sklearn]:
    result = func(data)
    assert np.allclose(expected, result)

measure distance to nearest group of points - python

Here is a way to use scipy.spatial.KDTree, which is very useful when you intend to do many distance and neighbor searches.

import numpy as np
import pandas as pd
from scipy.spatial import KDTree

def within_radius_dist(z, radius, closed=False):
    center = z[['x_ref', 'y_ref']].mean()  # they should all be same
    z = z[['x', 'y']]
    dist_ubound = radius * 1.0001 if closed else radius
    dist, idx = KDTree(z).query(
        center, k=None, distance_upper_bound=dist_ubound)
    if closed:
        idx = [i for d, i in zip(dist, idx) if d <= radius]
    if idx:
        within = z.iloc[idx]
        dist, _ = KDTree(within).query(z)
    else:
        dist = np.nan
    return pd.Series(dist, index=z.index)

Application (here using your df as example):

>>> df.assign(distance=df.groupby('Time', group_keys=False).apply(
...     within_radius_dist, radius=3, closed=True))
    Time Item  x  y  x_ref  y_ref  distance
0      1    A  5 -2      3     -2  0.000000
1      1    B  5  0      3     -2  0.000000
2      1    C  8 -2      3     -2  3.000000
3      1    D  3  0      3     -2  0.000000
4      1    E  6  0      3     -2  1.000000
5      1    F  2  4      3     -2  4.123106
6      2    A  6 -1      4     -2  0.000000
7      2    B  7  2      4     -2  3.162278
8      2    C  4 -3      4     -2  0.000000
9      2    D  2  4      4     -2  6.403124
10     2    E  7 -4      4     -2  3.162278
11     2    F  6  2      4     -2  3.000000

Explanation:

The groupby('Time') makes sure we apply the function within_radius_dist() to each group by time.
Inside the function, the first KDTree query finds the points inside the sphere (circle, here, since this problem is 2D, but this can be generalized to nD) of given radius centered on (x_ref, y_ref).
Since the distance_upper_bound argument is exclusive (i.e. KDTree query returns only distances strictly smaller than this), in the case where we want to include points at the radius (when closed=True), then we need to do a bit of extra processing: add a small fraction to the radius, then clip.
Note also that, by default, the p=2 norm (Euclidean norm) is used, but you could use other norms as well.
within are these points inside the sphere.
(note: if there are no such point, we return NaN for all distances).
The second KDTree query looks for the nearest distance of all our points (within the group) to those within points. This conveniently returns 0 for points that are within the sphere (because that's the distance to themselves) and the distance to the nearest point within for the other points. So that's our result right there.
We return the result as a Series, so pandas knowns to put that in shape properly, and finally assign that to a column called 'distance'.

Last observation: the desired result provided in the original question seems to ignore x_ref, y_ref and use a single center=(4, -2). In the first group (Time == 1), the correct distance for C is 3.0 (distance to A) and E is not within the circle.

Supplement

If you are interested in also capturing which nearest neighbor is found for each point:

def within_radius_dist(z, radius, closed=False):
    center = z[['x_ref', 'y_ref']].mean()  # they should all be same
    z = z[['x', 'y']]
    dist_ubound = radius * 1.0001 if closed else radius
    dist, idx = KDTree(z).query(
        center, k=None, distance_upper_bound=dist_ubound)
    if closed:
        idx = [i for d, i in zip(dist, idx) if d <= radius]
    if idx:
        within = z.iloc[idx]
        dist, idx = KDTree(within).query(z)
        neigh_idx = within.index[idx]
    else:
        dist = np.nan
        neigh_idx = None
    return pd.DataFrame({'distance': dist, 'neighbor': neigh_idx}, index=z.index)

and then:

out = pd.concat([df, df.groupby('Time', group_keys=False).apply(
    within_radius_dist, radius=3, closed=True)], axis=1)
out.assign(neigh_item=out.loc[out.neighbor, 'Item'].values)

Output:

    Time Item  x  y  x_ref  y_ref  distance  neighbor neigh_item
0      1    A  5 -2      3     -2  0.000000         0          A
1      1    B  5  0      3     -2  0.000000         1          B
2      1    C  8 -2      3     -2  3.000000         0          A
3      1    D  3  0      3     -2  0.000000         3          D
4      1    E  6  0      3     -2  1.000000         1          B
5      1    F  2  4      3     -2  4.123106         3          D
6      2    A  6 -1      4     -2  0.000000         6          A
7      2    B  7  2      4     -2  3.162278         6          A
8      2    C  4 -3      4     -2  0.000000         8          C
9      2    D  2  4      4     -2  6.403124         6          A
10     2    E  7 -4      4     -2  3.162278         8          C
11     2    F  6  2      4     -2  3.000000         6          A

Calculate nearest distance to certain points in python

Its a lot easier when there is sample data, make sure to include that next time.

I generate random data

import numpy as np
import pandas as pd
import sklearn


x = np.linspace(1,50)
y = np.linspace(1,50)

GRID = np.meshgrid(x,y)
grid_colors = 1* ( np.random.random(GRID[0].size) > .8 )

sample_data = pd.DataFrame( {'X': GRID[0].flatten(), 'Y':GRID[1].flatten(), 'grid_color' : grid_colors})

sample_data.plot.scatter(x="X",y='Y', c='grid_color', colormap='bwr', figsize=(10,10))

grid

BallTree (or KDTree) can create a tree to query with

from sklearn.neighbors import BallTree 

red_points = sample_data[sample_data.grid_color == 1]
blue_points = sample_data[sample_data.grid_color != 1]

tree = BallTree(red_points[['X','Y']], leaf_size=15, metric='minkowski')

and use it with

distance, index = tree.query(sample_data[['X','Y']], k=1)

now add it to the DataFrame

sample_data['nearest_point_distance'] = distance
sample_data['nearest_point_X'] = red_points.X.values[index]
sample_data['nearest_point_Y'] = red_points.Y.values[index]

which gives

     X    Y  grid_color  nearest_point_distance  nearest_point_X  \
0  1.0  1.0           0                     2.0              3.0   
1  2.0  1.0           0                     1.0              3.0   
2  3.0  1.0           1                     0.0              3.0   
3  4.0  1.0           0                     1.0              3.0   
4  5.0  1.0           1                     0.0              5.0   

   nearest_point_Y  
0              1.0  
1              1.0  
2              1.0  
3              1.0  
4              1.0

Modification to have red point not find themself;

Find the nearest k=2 instead of k=1;

distance, index = tree.query(sample_data[['X','Y']], k=2)

And, with help of numpy indexing, make red points use the second instead of the first found;

sample_size = GRID[0].size

sample_data['nearest_point_distance'] = distance[np.arange(sample_size),sample_data.grid_color]
sample_data['nearest_point_X'] = red_points.X.values[index[np.arange(sample_size),sample_data.grid_color]]
sample_data['nearest_point_Y'] = red_points.Y.values[index[np.arange(sample_size),sample_data.grid_color]]

The output type is the same, but due to randomness it won't agree with earlier made picture.

Get the distance to each nearest element in a 1D/2D without for loop

There seems to be a theme with O(N^2) solutions here. For 1D, it's quite simple to get O(N log N):

x = np.array([2, 9, 5, 6, 55, 8])
i = np.argsort(x)
dist = np.diff(x[i])
min_dist = np.r_[dist[0], np.minimum(dist[1:], dist[:-1]), dist[-1]])
min_dist = min_dist[np.argsort(i)]

This clearly won't scale well to multiple dimensions, so use scipy.special.KDTree instead. Assuming your data is N-dimensional and has shape (M, N), you can do

k = KDTree(data)
dist = k.query(data, k=2)[0][:, -1]

Scipy has a Cython implementation of KDTree, cKDTree. Sklearn has a sklearn.neighbors.KDTree with a similar interface as well.

Return distance to nearest point by group - python

I could not find a nice straightforward way, as it seems from your example you don't want to include the point itself. I ended up making groups and calculate distances within them.

Edit: Added variation which (should) include the ID of the nearest point.

from sklearn.neighbors import BallTree
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

df = pd.DataFrame({              
    'Time' : [1,1,1,1,1,1,2,2,2,2,2,2],             
    'ID' : ['A','B','C','X','U','V','A','B','C','X','U','V'],      
    'Group' : ['Red','Red','Red','Grn','Grn','Grn','Red','Red','Red','Grn','Grn','Grn'],           
    'X' : [2.0,3.0,4.0,2.0,2.0,1.0,1.0,6.0,4.0,2.0,5.0,3.0],
    'Y' : [3.0,1.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0],           
    })


def subgroup_distance(df, group_column='Group', include_point_itself=True):    
    groups = df[group_column].unique()

    all_points = df[['X','Y']].values
    
    for group in groups:
        group_points = df[df[group_column] == group][['X','Y']]
        tree = BallTree(group_points, leaf_size=15, metric='minkowski')
        
        if include_point_itself:
            distance, index = tree.query(all_points, k=1)
            distances = distance[:,0]
            distance_column_name = "distance_{}".format( group )
            df[ distance_column_name ] = distances
                
        else:
            indici = (df[group_column] == group).values * 1
            distance, index = tree.query(all_points, k=2)
            distances = distance[ np.arange(distance.shape[0]), indici]

            distance_column_name = "distance_{}".format( group )
            df[ distance_column_name ] = distances

    return df

def calculate_distances(df):
    return subgroup_distance(df, include_point_itself=False)

df_distances = df.groupby(['Time']).apply(calculate_distances).reset_index()

this will output

    index  Time ID Group    X    Y  distance_Red  distance_Grn
0       0     1  A   Red  2.0  3.0      2.236068      1.000000
1       1     1  B   Red  3.0  1.0      1.414214      1.414214
2       2     1  C   Red  4.0  0.0      1.414214      2.000000
3       3     1  X   Grn  2.0  0.0      1.414214      1.414214
4       4     1  U   Grn  2.0  2.0      1.000000      1.414214
5       5     1  V   Grn  1.0  1.0      2.000000      1.414214
6       6     2  A   Red  1.0  2.0      3.162278      1.414214
7       7     2  B   Red  6.0  0.0      2.236068      1.000000
8       8     2  C   Red  4.0  1.0      2.236068      1.414214
9       9     2  X   Grn  2.0  1.0      1.414214      1.414214
10     10     2  U   Grn  5.0  0.0      1.000000      2.000000
11     11     2  V   Grn  3.0  0.0      1.414214      1.414214

Variation which will output the ID of the nearest point in the subgroup

def subgroup_distance_with_nearest(df, group_column='Group', include_point_itself=True):    
    groups = df[group_column].unique()

    all_points = df[['X','Y']].values
    
    for group in groups:
        group_points = df[df[group_column] == group][['X','Y']]
        tree = BallTree(group_points, leaf_size=15, metric='minkowski')
        
        distances = None
        nearest_id = None
        
        if include_point_itself:
            distance, index = tree.query(all_points, k=1)
            distances = distance[:,0]
            nearest_id = group_points.index[index[:,0]]
                        
        else:
            indici = (df[group_column] == group).values * 1
            distance, index = tree.query(all_points, k=2)
            distances = distance[ np.arange(distance.shape[0]), indici]
            index_indici =  index[ np.arange(distance.shape[0]), indici]
            nearest_id = group_points.index[index_indici]

        distance_column_nearest_name = "nearest_index_{}".format( group )
        distance_column_name = "distance_{}".format( group )
        df[ distance_column_name ] = distances
        df[ distance_column_nearest_name] = nearest_id


    return df

def subgroup_distance_with_nearest(df):
    return subgroup_distance(df, include_point_itself=False)

df_distances = df.groupby(['Time']).apply(calculate_distances).reset_index()

and it will output

    index  Time ID Group    X    Y  distance_Red  nearest_index_Red  \
0       0     1  A   Red  2.0  3.0      2.236068                  1   
1       1     1  B   Red  3.0  1.0      1.414214                  2   
2       2     1  C   Red  4.0  0.0      1.414214                  1   
3       3     1  X   Grn  2.0  0.0      1.414214                  1   
4       4     1  U   Grn  2.0  2.0      1.000000                  0   
5       5     1  V   Grn  1.0  1.0      2.000000                  1   
6       6     2  A   Red  1.0  2.0      3.162278                  8   
7       7     2  B   Red  6.0  0.0      2.236068                  8   
8       8     2  C   Red  4.0  1.0      2.236068                  7   
9       9     2  X   Grn  2.0  1.0      1.414214                  6   
10     10     2  U   Grn  5.0  0.0      1.000000                  7   
11     11     2  V   Grn  3.0  0.0      1.414214                  8   

    distance_Grn  nearest_index_Grn  
0       1.000000                  4  
1       1.414214                  4  
2       2.000000                  3  
3       1.414214                  5  
4       1.414214                  5  
5       1.414214                  3  
6       1.414214                  9  
7       1.000000                 10  
8       1.414214                 11  
9       1.414214                 11  
10      2.000000                 11  
11      1.414214                  9

I didn't recalculate and test the ID's, but seems to be at least correct that it indeed return a ID from the subgroup.

Python Calculate Distance Closest Xy Points