Source code for cluster

import itertools
import json
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

try:
    from .network_destination import NetworkDestination
except:
    try:
        from network_destination import NetworkDestination
    except Exception as e:
        raise ValueError(e)

################################################################################
#         Cluster object for clustering flows per network destination          #
################################################################################

[docs]class Cluster(object): """Cluster object for clustering flows by network destination Attributes ---------- samples : np.array of shape=(n_samples,) Samples used to fit Cluster counter : int Counter for total number of NetworkDestinations generated dict_destination : dict Dicationary of (dst IP, dst port) -> NetworkDestination dict_certificate : dict Dicationary of TLS certificate -> NetworkDestination """
[docs] def __init__(self, load=None): """Cluster flows by network destinations Parameters ---------- load : string, default=None If given, load cluster from json file from 'load' path. """ # Set samples self.samples = np.zeros((0)) # Initialise counter self.counter = 0 # Dictionaries of destination identifiers -> cluster self.dict_destination = dict() self.dict_certificate = dict() # Load cluster if necessary if load is not None: self.load(load)
######################################################################## # Fit & prediction methods # ########################################################################
[docs] def fit(self, X, y=None): """Fit the clustering algorithm with flow samples X. Parameters ---------- X : array-like of shape=(n_samples, n_features) Flow samples to fit cluster object. y : array-like of shape=(n_samples,), optional If given, add labels to each cluster. Returns ------- result : self Returns self """ # Add X to samples self.samples = np.concatenate((self.samples, X)) # Set y to empty if None y = np.zeros(len(X)) if y is None else y # Loop over all samples in X for sample, label in zip(X, y): # Extract values certificate = sample.certificate destination = sample.destination # Get the number of matching clusters clusters = [self.dict_certificate.get(certificate), self.dict_destination.get(destination)] # Case 1: Multiple matches # Check for multiple matching slices if all(c is not None for c in clusters): # Case 1a: Destination and certificate -> same cluster if all(c == clusters[0] for c in clusters): # Set the cluster to that one cluster cluster = clusters[0] # Case 1b: Destination and certificate -> different clusters else: # Create new cluster cluster = self.new_cluster() # For each match for c in clusters: # Add samples from old cluster to new cluster cluster.merge(c) # Reset dictionaries to point to new cluster for value in c.certificates: if value is not None: self.dict_certificate[value] = cluster for value in c.destinations: if value is not None: self.dict_destination[value] = cluster # Case 2: Single or no matches else: # Check for matching cluster or create a new one cluster = [c for c in clusters if c] or [self.new_cluster()] # Get matching cluster cluster = cluster[0] # Add datapoint to cluster cluster.add(sample, label) # Point dictionaries to new cluster if certificate is not None: self.dict_certificate[certificate] = cluster if destination is not None: self.dict_destination[destination] = cluster # Return result return self
[docs] def predict(self, X): """Predict cluster labels of X. Parameters ---------- X : array-like of shape=(n_samples, n_features) Samples for which to predict NetworkDestination cluster. Returns ------- result : array-like of shape=(n_samples,) Labels of NetworkDestination cluster corresponding to cluster of fitted samples. Has a value of -1 if no cluster could be matched """ # Predict each item and return return np.asarray([self.predict_single(x) for x in X])
def predict_single(self, X): """Predict single flow X Parameters ---------- X : Flow Flow sample for which to retrieve NetworkDestination cluster. Returns ------- result : int Label of NetworkDestination cluster corresponding to flow or -1 if no cluster could be matched. """ # Get matching cluster or -1 return self.dict_destination.get(X.destination, self.dict_certificate.get(X.certificate, NetworkDestination(-1))).identifier
[docs] def fit_predict(self, X): """Fit and predict cluster with given samples. Parameters ---------- X : array-like of shape=(n_samples, n_features) Samples to fit cluster object. Returns ------- result : array-like of shape=(n_samples,) Labels of cluster corresponding to cluster of fitted samples. Has a value of -1 if no cluster could be matched. """ return self.fit(X).predict(X)
######################################################################## # Auxiliarry methods # ######################################################################## def new_cluster(self): """Creates and returns new NetworkDestination cluster. Returns ------- result : NetworkDestination New unique NetworkDestination for given cluster. """ # Increment number of clusters self.counter += 1 # Create cluster and return return NetworkDestination(self.counter - 1)
[docs] def clusters(self): """Return a set of NetworkDestinations in the current cluster object. Returns ------- result : set Set of NetworkDestinations in cluster. """ clusters = set(self.dict_certificate.values()) clusters |= set(self.dict_destination.values()) return clusters
[docs] def cluster_dict(self): """Return a dictionary of id -> NetworkDestination. Returns ------- result : dict Dict of NetworkDestination.identifier -> NetworkDestination """ return {c.identifier: c for c in self.clusters()}
######################################################################## # Import/export methods # ########################################################################
[docs] def save(self, outfile): """Saves cluster object to json file. Parameters ---------- outfile : string Path to json file in which to store the cluster object. """ # Create output output = {"samples": self.samples.tolist()} # Write to json file with open(outfile, 'w') as outfile: json.dump(output, outfile)
[docs] def load(self, infile): """Loads cluster object from json file. Parameters ---------- infile : string Path to json file from which to load the cluster object. """ with open(infile, 'r') as infile: result = json.load(infile) samples = np.asarray(result.get("samples")) self.fit(samples)
[docs] def copy(self): """Returns a (semi-deep) copy of self. The resulting cluster is a deep copy apart from the samples X. Has a tremendous speedup compared to copy.deepcopy(self) Returns ------- result : Cluster Copy of self """ # Initialise result result = Cluster() # Fit result with given samples result.fit(self.samples) # Return result return result
######################################################################## # Plot methods # ########################################################################
[docs] def plot(self, annotate=False): """Plot cluster NetworkDestinations. Parameters ---------- annotate : boolean, default=False If True, annotate each cluster """ # Get clusters clusters = [c.get_description() for c in self.clusters()] sizes = [20*len(c.samples)**0.7 for c in self.clusters()] # Create complete graph from clusters graph = nx.Graph() graph.add_nodes_from(clusters) graph.add_edges_from(itertools.combinations(clusters, 2)) # Draw graph nx.draw_spring(graph, alpha=0.7, # see through nodes edgelist = list(), # don't show edges node_size = sizes, # set node sizes with_labels = annotate, # don't show labels ) # Plot graph plt.show()
######################################################################## # String method # ######################################################################## def __str__(self): """Returns string representation of self.""" # Get all clusters as a set clusters = self.clusters() # Get predictions preds = self.predict(self.samples) # Return string return """Cluster --------------------------------- Flow samples : {:>} Unique Network Destinations : {:>} Unique labels : {:>} ------------------------------- Unique certificates : {} Unique ip destinations : {}""".format( self.samples.shape[0], len(clusters), np.unique(preds[preds != -1]).shape[0], len(self.dict_certificate), len(self.dict_destination))