Source code for icarus.scenarios.algorithms

"""Various algorithms used for optimal cache placement."""
import random

import numpy as np
import networkx as nx

import fnss

from icarus.util import path_links


__all__ = [
    'pam',
    'extract_cluster_level_topology',
    'deploy_clusters',
    'compute_clusters',
    'compute_p_median',
           ]


[docs]def pam(distances, k, n_iter=10):
    """Compute k-medoids using the PAM algorithm

    Parameters
    ----------
    distances : 2-d NumPy array
        Array of distances between points
    k : int
        Number of clusters
    n_iter : int
        Number of iterations to repeat. Each repetition is executed using a
        different initial random assignment. Repetiting the experiment allow
        to reach different local optima, possibly achieving a best solution.

    Return
    ------
    clusters : 1-d NumPy array
        Array mapping point to medoid, e.g. if point i is mapped to medoid j
        then clusters[i] = j
    medoids : 1-d NumPy array
        Array listing (in no particular order) all medoids
    cost : float
        Cost of the solution

    Notes
    -----
    Implementation based on:
    https://github.com/salspaugh/machine_learning/blob/master/clustering/kmedoids.py
    """
    def assign_points_to_clusters(medoids, distances):
        """Return a 1-d array having for at each index the medoid the element
        belongs to.

        E.g. if point i is mapped to medoid j, clusters[i] = j
        """
        distances_to_medoids = distances[:, medoids]
        clusters = medoids[np.argmin(distances_to_medoids, axis=1)]
        clusters[medoids] = medoids
        return clusters

    def compute_new_medoid(cluster, distances):
        mask = np.ones(distances.shape)
        mask[np.ix_(cluster, cluster)] = 0.
        cluster_distances = np.ma.masked_array(data=distances, mask=mask, fill_value=10e9)
        costs = cluster_distances.sum(axis=1)
        return costs.argmin(axis=0, fill_value=np.inf)

    def clusters(distances, k):
        m = distances.shape[0]  # number of points
        if k > m:
            raise ValueError("k is greater than the number of points")

        if hasattr(np.random, 'choice'):
            curr_medoids = np.random.choice(np.arange(m, dtype=int), k, replace=False)
        else:
            # This is only if I use NumPy < 1.7
            curr_medoids = np.asarray(random.sample(np.arange(m, dtype=int), k))

        old_medoids = np.empty(k)
        new_medoids = np.empty(k)

        # Set a negative value to ensure execution of while loop
        old_medoids[0] = -1

        # Until the medoids stop updating, do the following:
        while not np.all(old_medoids == curr_medoids):
            # Assign each point to cluster with closest medoid
            clusters = assign_points_to_clusters(curr_medoids, distances)

            # Update cluster medoids to be lowest cost point.
            for curr_medoid in curr_medoids:
                cluster = np.where(clusters == curr_medoid)[0]
                new_medoids[curr_medoids == curr_medoid] = compute_new_medoid(cluster, distances)

            old_medoids[:] = curr_medoids[:]
            curr_medoids[:] = new_medoids[:]
            cost = np.sum(distances[np.arange(m), clusters])
        return clusters, curr_medoids, cost

    min_cost = np.inf
    opt_clusters = None
    opt_medoids = None

    for _ in range(n_iter):
        curr_clusters, curr_medoids, curr_cost = clusters(distances, k)
        if curr_cost < min_cost:
            min_cost = curr_cost
            opt_clusters = curr_clusters
            opt_medoids = curr_medoids
    return opt_clusters, opt_medoids, min_cost


[docs]def extract_cluster_level_topology(topology):
    """Build a cluster-level topology.

    Each node in the topology must be have the 'cluster' attribute

    Parameters
    ----------
    topology : Topology
        The router-level topology

    Returns
    -------
    topology : Topology
        The cluster-level topology

    Notes
    -----
     * Each router must have a cache deployed
     * All sources and receiver must have one single attachment point with a
       cache
     * Each node must be labelled with cluster
    """
    cluster_map = nx.get_node_attributes(topology, 'cluster')
    if len(cluster_map) < topology.number_of_nodes():
        raise ValueError('There are nodes not labelled with cluster information')
    if nx.number_connected_components(topology) > 1:
        raise ValueError('There is more than one connected component')
    cluster_topology = fnss.Topology()
    cluster_set = set(cluster_map.values())
    if len(cluster_set) == 1:
        # There is only one huge cluster
        cluster_topology.add_node(cluster_set.pop())
        return cluster_topology
    for u, v in topology.edges_iter():
        cluster_u = cluster_map[u]
        cluster_v = cluster_map[v]
        if cluster_u != cluster_v:
            cluster_topology.add_edge(cluster_u, cluster_v)
    return cluster_topology


[docs]def deploy_clusters(topology, clusters, assign_src_rcv=True):
    """Annotate topology with cluster informations

    This function checks that all ICR candidate nodes are assigned exactly
    to one cluster.

    If assign_src_rcv is True, then it also labels source and receiver nodes
    to the closest cluster.

    This function assumes that:
     * each node of the topology is either an icr_candidate, a source or a receiver
     * each source and receiver must have degree equal to 1

    Parameters
    ----------
    topology : Topology
        The topology onto which deploy clusters
    clusters : list of sets
        Router-cluster assignment. Each element of a list is a set of node
        identifiers. Nodes in the same set belong to the same cluster.
        The length of the list therefore corresponds to the number of clusters.
    assign_src_rcv : bool, optional
        If *True*, the function labels source and receiver nodes with the
        cluster label of the router they are attached to.
    """
    clustered_nodes = set()
    n_clustered_nodes = 0
    for c in clusters:
        clustered_nodes = clustered_nodes.union(c)
        n_clustered_nodes += len(c)
    if n_clustered_nodes != len(clustered_nodes):
        raise ValueError('At least one node is listed in more than one cluster')
    if clustered_nodes != topology.graph['icr_candidates']:
        raise ValueError('Set of nodes in the cluster do not match ICR candidates')
    topology.graph['clusters'] = clusters
    for i in range(len(clusters)):
        for v in clusters[i]:
            topology.node[v]['cluster'] = i
    if not assign_src_rcv:
        return
    src_rcv = topology.sources().union(topology.receivers())
    deg = nx.degree(topology)
    if any(deg[v] > 1 for v in src_rcv):
        raise ValueError("There are at least one source or receiver with degree >= 1")
    for v in src_rcv:
        next_node = list(topology.edge[v].keys())[0]
        topology.node[v]['cluster'] = topology.node[next_node]['cluster']


[docs]def compute_clusters(topology, k, distance='delay', nbunch=None, n_iter=10):
    """Cluster nodes of a topologies as to minimize the intra-cluster latency.

    This function assumes that every link is labelled with latencies and
    performs clustering using the k-medoids method with the PAM algorithm.

    Parameters
    ----------
    topology : Topology
        The topology
    k : int
        The number of clusters
    distance : str, optional
        The link metric used to represent distance between nodes.
        If None, hop count is used instead
    n_iter : int, optional
        The number of iterations

    Return
    ------
    clusters: list of sets
        List of clusters (each cluster being a set of nodes)
    """
    topology = topology.to_undirected()
    if nx.number_connected_components(topology) > 1:
        raise ValueError('The topology has more than one connected component')
    if nbunch is not None:
        topology = topology.subgraph(nbunch)
    topology = nx.convert_node_labels_to_integers(topology, label_attribute='label')

    if distance is not None:
        for u, v in topology.edges_iter():
            if distance not in topology.edge[u][v]:
                raise ValueError('Edge (%s, %s) does not have a %s attribute'
                                 % (str(topology.node[u]['label']),
                                    str(topology.node[v]['label']),
                                    distance))

    n = topology.number_of_nodes()
    path = nx.all_pairs_shortest_path(topology)
    distances = np.zeros((n, n))

    for u in path:
        for v in path[u]:
            # Note: need to do something about weights and asymmetric paths!
            if u == v or distances[u][v] != 0:
                continue
            # Extract all edges of a path
            edges = path_links(path[u][v])
            if distance is not None:
                distances[u][v] = distances[v][u] = sum(topology.edge[u][v][distance]
                                                        for u, v in edges)
            else:
                distances[u][v] = distances[v][u] = len(edges)
    clusters = [set() for _ in range(k)]
    medoid_assignment = pam(distances, k=k, n_iter=n_iter)[0]
    if any(medoid_assignment >= n):
        raise ValueError('Something is wrong with k-medoids algorithm. '
                         'I got an assignment to a medoid that does not exist')
    medoids = list(set(medoid_assignment))
    medoid_cluster_map = {medoids[i]: i for i in range(len(medoids))}
    # Concert assignments from medoid ID to cluster ID
    for v in range(n):
        clusters[medoid_cluster_map[medoid_assignment[v]]].add(topology.node[v]['label'])
    return clusters


[docs]def compute_p_median(distances, p, n_iter=20):
    """Compute p-median solution using the Adjusted Vertex Substitution (AVS)
    algorithm.

    Parameters
    ----------
    distances : dict of dicts
        Distance between nodes
    p : int
        Number of facilities

    Return
    ------
    allocation : dict
        Dict mapping each node to the allocated facility
    facilities : set
        Set of the p facilities identified
    """
    if p > len(distances):
        raise ValueError("p value is greater than the number of points")
    distances_matrix = np.zeros((len(distances), len(distances)))
    nodes = list(sorted(distances.keys()))
    nodes_map = dict(list(enumerate(nodes)))
    for i, v in enumerate(nodes):
        for j, u in enumerate(nodes):
            distances_matrix[i][j] = distances[u][v]
    mappings, medians, cost = pam(distances_matrix, p, n_iter=n_iter)
    facilities = set(nodes_map[v] for v in medians)
    allocation = {}
    for i, j in enumerate(mappings):
        allocation[nodes_map[i]] = nodes_map[j]
    return allocation, facilities, cost