Skip to content

Commit

Permalink
Merge pull request #816 from PrimozGodec/concave-hull
Browse files Browse the repository at this point in the history
[ENH] Functionalities for computing concave hull around clusters of points
  • Loading branch information
VesnaT authored Apr 19, 2022
2 parents c8a60d2 + 40abacc commit 2895274
Show file tree
Hide file tree
Showing 3 changed files with 317 additions and 0 deletions.
261 changes: 261 additions & 0 deletions orangecontrib/text/concave_hull.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
from bisect import bisect_left, bisect_right
from math import sqrt
from typing import Optional, Dict, Iterable, Tuple, List

import numpy as np
import pyclipper
from Orange.data import ContinuousVariable, Domain, Table
from scipy.spatial import Delaunay


def _angle(
v1: Tuple[np.ndarray, np.ndarray], v2: Tuple[np.ndarray, np.ndarray]
) -> float:
"""
Compute clockwise angles between v1 and v2. Both vectors are
given as a tuple with two points ([x1, y1], [x2, y2]) such that
[x2, y2] of v1 == [x1, y1] of v2.
The angle is given in degrees between 0 and 2 * pi
"""
v1, v2 = np.array(v1), np.array(v2)
x1, y1 = v1[0] - v1[1]
x2, y2 = v2[1] - v2[0]
dot = np.sum(x1 * x2 + y1 * y2) # dot product
det = x1 * y2 - y1 * x1 # determinant
angle = np.arctan2(det, dot) # atan2(y, x) or atan2(sin, cos)
return -angle if angle <= 0 else np.pi + np.pi - angle


def _find_hull(
edges_list: List, points_list: np.ndarray, starting_edge: int
) -> Tuple[np.ndarray, List]:
"""
This function return a single hull which starts and ends in starting_edge.
Parameters
----------
edges_list
List of edges. Each edge is presented as a tuple of two indices which
tell the starting and ending note. The index correspond to location
of point in points_list. This list must be sorted in the ascending order.
points_list
Points location. Each point has x and y location.
starting_edge
The index of the list where hull starts.
Returns
-------
polygon
The array with the hull/polygon points
used_points
List of booleans that indicates whether each point was used in polygon
"""
firsts = [x[0] for x in edges_list] # first elements for bisection
used_edges = [False] * len(edges_list)
used_edges[starting_edge] = True

# remember start and next point
start, next_point = edges_list[starting_edge]

# remember current polygon around points
poly = [points_list[start], points_list[next_point]]

# we count number of steps to stop iteration in case it is locked
# in some dead cycle. It can be a result of some unexpected cases.
count = 0
while start != next_point and count < len(edges_list):
# find the index where the first value equal to next_point appear
ind_left = bisect_left(firsts, next_point)
# find the index next to the last value
ind_right = bisect_right(firsts, next_point)

# check if there are more edges available from the same point
if ind_right - ind_left > 1:
# select the most distant one in clockwise direction. It is probably
# the point on the outer hull - we prevent a hull to discover cycles
# inside a polygon
ang = -1
for i in range(ind_left, ind_right):
cur_ang = _angle(
(poly[-2], poly[-1]), (poly[-1], points_list[edges_list[i][1]])
)
if cur_ang > ang:
ang = cur_ang
ind_left = i
# save a next point of the polygon
used_edges[ind_left] = True
next_point = edges_list[ind_left][1]
poly.append(points_list[next_point])
count += 1
return np.array(poly), used_edges


def _edges_to_polygon(edges: Iterable, points: np.ndarray) -> np.ndarray:
"""
This function connects edges in polygons. It computes all possible hulls -
some clusters have more of them when they have a hole in the middle, and
selects outer hull.
Parameters
----------
edges
Iterable of edges. Each edge is presented as a tuple of two indices which
tell the starting and ending note. The index correspond to location
of point in points_list
points
Points location. Each point has x and y location.
Returns
-------
The array with the hull/polygon points.
"""
# sort based on first element of tuple to enable bisection search
edges = sorted(edges, key=lambda x: x[0])
# need to use all edges
used = [False] * len(edges)

# it is possible that we will find more separate hulls -
# it happens in cases when a polygon has inner cycles
polygons = []
while not all(used):
i = used.index(False)
poly, new_used = _find_hull(edges, points, i)
polygons.append(poly)
used = [u1 or u2 for u1, u2 in zip(used, new_used)]

# select polygon that is outside - the widest and the highest
height_width = [np.sum(p.max(axis=0) - p.min(axis=0)) for p in polygons]
i = height_width.index(max(height_width))
return polygons[i]


def _get_shape_around_points(pts: np.ndarray, eps: Optional[float]) -> np.ndarray:
"""
Compute the shape (concave hull) of a set of a cluster.
"""

def add_edge(edges_list, i, j):
"""
Add a line between the i-th and j-th points,
if not in the list already. Remove the lines that are not outer
edges - when (j, i) already in list means that two triangles
has same edge what means it is not an outer edge
"""
if (j, i) in edges_list:
# both neighboring triangles are in shape - it's not a boundary edge
edges_list.remove((j, i))
else:
edges_list.add((i, j))

if len(pts) < 4:
rng = list(range(3))
return _edges_to_polygon(zip(rng, rng[1:] + rng[:1]), pts)

eps = eps * 2 if eps else float("inf")
tri = Delaunay(pts)
edges = set()
# loop over triangles: ia, ib, ic = indices of corner points of the triangle
for ia, ib, ic in tri.vertices:
pa = pts[ia]
pb = pts[ib]
pc = pts[ic]

# Lengths of sides of triangle
a = sqrt((pa[0] - pb[0]) ** 2 + (pa[1] - pb[1]) ** 2)
b = sqrt((pb[0] - pc[0]) ** 2 + (pb[1] - pc[1]) ** 2)
c = sqrt((pc[0] - pa[0]) ** 2 + (pc[1] - pa[1]) ** 2)

# filter - the longest edge of triangle must be smaller than epsilon
if max(a, b, c) <= eps:
add_edge(edges, ia, ib)
add_edge(edges, ib, ic)
add_edge(edges, ic, ia)

return _edges_to_polygon(edges, pts)


def _smoothen_hull(hull: np.ndarray) -> np.ndarray:
"""
Expand hull a bit away from the cluster and smoothen it
"""
# scale hull between 0 and 1 in both directions - help to produce the hull
# with equal distance from the cluster when axes have different scaling
x, y = hull[:, 0:1], hull[:, 1:2]
x_min, x_max = np.min(x), np.max(x)
y_min, y_max = np.min(y), np.max(y)
x_diff, y_diff = (x_max - x_min), (y_max - y_min)
hull = np.hstack(((x - x_min) / x_diff, (y - y_min) / y_diff))

# buffer the line for 3 * dist_from_cluster and move it back for
# 2 * dist_from_cluster. It will make a hull smother.

# pyclipper work with integer so points need to be scaled first
scaling_factor = 1000
dist_from_cluster = 0.06
scaled_hull = pyclipper.scale_to_clipper(hull, scaling_factor)

# buffer the hull for dist_from_cluster * 3
pco1 = pyclipper.PyclipperOffset()
pco1.AddPath(scaled_hull, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
im_solution = pco1.Execute(dist_from_cluster * scaling_factor * 3)
# buffer the hull for dist_from_cluster * -2
pco2 = pyclipper.PyclipperOffset()
pco2.AddPath(im_solution[0], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
solution = pyclipper.scale_from_clipper(
pco2.Execute(dist_from_cluster * scaling_factor * (-2)), scaling_factor
)
solution = np.array(solution).reshape(-1, 2)

# scale the hue back for its original space
return np.hstack(
(solution[:, 0:1] * x_diff + x_min, solution[:, 1:2] * y_diff + y_min)
)


def compute_concave_hulls(
embedding: np.ndarray, clusters: np.ndarray, epsilon: Optional[float] = None
) -> Dict[int, np.ndarray]:
"""
Function computes the points of the concave hull around points.
Parameters
----------
embedding
Visualisation coordinates - embeddings
clusters
Cluster indices for each item.
epsilon
Points with distance > 2*epsilon are not connected resulting in concave
hull. The parameter is optional, setting it to None result in convex hull.
Returns
-------
The points of the concave hull. Dictionary with cluster index
as a key and array of points as a value - [[x1, y1], [x2, y2], [x3, y3], ...]
"""
hulls = {}
for cl in set(clusters) - {-1}:
points = embedding[clusters == cl]

# subsample when more than 1000 points - to speed up the algorithm
if points.shape[0] > 1000:
points = points[np.random.randint(points.shape[0], size=1000), :]

# compute hull around the cluster
hull = _get_shape_around_points(points, epsilon)
# smoothen and extend the hull
hulls[cl] = _smoothen_hull(hull)
return hulls


if __name__ == "__main__":
from matplotlib import pyplot as plt

table = Table("iris")
clusters = np.array([1] * 50 + [2] * 50 + [3] * 50)

hulls = compute_concave_hulls(table.X[:, :2], clusters)
plt.plot(table.X[:, 0], table.X[:, 1], "o")
for k, h in hulls.items():
plt.plot(h[:, 0], h[:, 1])
plt.show()
55 changes: 55 additions & 0 deletions orangecontrib/text/tests/test_concave_hull.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest

import numpy as np
from Orange.data import Table

from orangecontrib.text.concave_hull import compute_concave_hulls


class TestConcaveHull(unittest.TestCase):
def test_compute_concave_hulls(self):
data = Table.from_file("iris")[:, 2:4]
clusters = np.array([0] * 50 + [1] * 50 + [2] * 50)

hulls = compute_concave_hulls(data.X, clusters, epsilon=0.5)
self.assertEqual(3, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
self.assertEqual(2, hulls[2].shape[1]) # hull have x and y

hulls = compute_concave_hulls(data.X, clusters)
self.assertEqual(3, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
self.assertEqual(2, hulls[2].shape[1]) # hull have x and y

def test_compute_concave_hulls_subsampling(self):
"""
When more than 1000 points passed they are sub-sampled in order to
compute a concave hull
"""
iris = Table.from_file("iris")
data = np.repeat(iris.X[:, 2:4], 10, axis=0) # more than 1000 points
clusters = np.array([0] * 50 * 10 + [1] * 50 * 10 + [2] * 50 * 10)

hulls = compute_concave_hulls(data, clusters, epsilon=0.5)

self.assertEqual(3, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y
self.assertEqual(2, hulls[1].shape[1]) # hull have x and y
self.assertEqual(2, hulls[2].shape[1]) # hull have x and y

def test_compute_concave_hulls_triangle(self):
"""
Concave hull must also work for tree points - it is a special case
"""
data = np.array([[1, 1], [1, 2], [2, 1]])
clusters = np.array([0] * 3)
hulls = compute_concave_hulls(data, clusters, epsilon=0.5)

self.assertEqual(1, len(hulls))
self.assertEqual(2, hulls[0].shape[1]) # hull have x and y


if __name__ == "__main__":
unittest.main()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Orange3 >=3.31.1
orange-widget-base >=4.14.0
pandas
pdfminer3k>=1.3.1
pyclipper>=1.2.0
pyqtgraph
pyyaml
requests
Expand Down

0 comments on commit 2895274

Please sign in to comment.