Source code for fingerprints

from itertools               import combinations
import numpy as np

try:
    from .cluster import Cluster
    from .cross_correlation_graph import CrossCorrelationGraph
    from .fingerprint import Fingerprint
except:
    try:
        from cluster import Cluster
        from cross_correlation_graph import CrossCorrelationGraph
        from fingerprint import Fingerprint
    except Exception as e:
        raise ValueError(e)

[docs]class FingerprintGenerator(object):
    """Generator of FlowPrint Fingerprint objects from flows

        Attributes
        ----------
        batch : float
            Threshold for the batch size in seconds

        window : float
            Threshold for the window size in seconds

        correlation : float
            Threshold for the minimum required correlation

        similarity : float
            Threshold for the minimum required similarity
    """

[docs]    def __init__(self, batch=300, window=30, correlation=0.1, similarity=0.9):
        """Generate FlowPrint Fingerprint objects from flows

            Parameters
            ----------
            batch : float, default=300
                Threshold for the batch size in seconds

            window : float, default=30
                Threshold for the window size in seconds

            correlation : float, default=0.1
                Threshold for the minimum required correlation

            similarity : float, default=0.9
                Threshold for the minimum required similarity
            """
        # Set FlowPrint parameters
        self.batch       = batch
        self.window      = window
        self.correlation = correlation
        self.similarity  = similarity

    ########################################################################
    #                          Fit-Predict method                          #
    ########################################################################

[docs]    def fit_predict(self, X, y=None):
        """Create fingerprints from given samples in X.

            Parameters
            ----------
            X : array-like of shape=(n_samples,)
                Samples (Flow objects) from which to generate fingerprints.

            y : array-like of shape=(n_samples,), optional
                Labels corresponding to X. If given, they will be encorporated
                into each fingerprint.

            Returns
            -------
            result : np.array of shape=(n_samples,)
                Resulting fingerprints.
            """
        ####################################################################
        #                        Setup fingerprints                        #
        ####################################################################

        # Transform to arrays
        X = np.asarray(X)
        y = np.asarray(y) if y is not None else np.zeros(X.shape[0], dtype=int)

        # Initialise result
        result = (np.zeros(X.shape[0]) - 1).astype(object)

        ####################################################################
        #                       Divide into batches                        #
        ####################################################################

        # Sort X and y by time
        sort_time = np.argsort(X)
        sort_orig = np.argsort(sort_time)
        X = X[sort_time]
        y = y[sort_time]

        # Divide X and y in batches of size batch
        sort_batch = np.array([x.time_start for x in X])
        # Compute number of required batches
        if sort_batch.shape[0]:
            batches = int(
                np.ceil((max(sort_batch) - min(sort_batch)) / self.batch)
            )
        else:
            batches = 0
        # Get edges of batches
        _, edges = np.histogram(sort_batch, bins=max(1, batches))
        # Sort indices into batches
        batches = np.digitize(sort_batch, edges[:-1])

        ####################################################################
        #                Create fingerprints for each batch                #
        ####################################################################

        # Loop over each batch
        for batch in np.unique(batches):

            # Get data for given batch
            X_batch = X[batches == batch]
            y_batch = y[batches == batch]

            # Create fingerprints for single batch
            prediction = self._fit_single_batch_(X_batch, y_batch)
            result[batches == batch] = prediction

        ####################################################################
        #                    Merge similar fingerprints                    #
        ####################################################################

        # Merge similar fingerprints on all timeslots
        result = self.merge_fingerprints(result, self.similarity)
        # For unknown results assign nearest neighbour
        result = self.assign_nearest(X, result)

        # Return result
        return result[sort_orig]


    def _fit_single_batch_(self, X, y=None):
        """Create fingerprints for a given batch of flows.

            Parameters
            ----------
            X : array-like of shape=(n_samples_batch,)
                Samples (Flow objects) from which to generate fingerprints.

            y : array-like of shape=(n_samples_batch,), optional
                Labels corresponding to X. If given, they will be encorporated
                into each fingerprint.

            Returns
            -------
            np.array of shape=(n_samples,)
                Resulting fingerprints corresponding to each flow.
            """
        ####################################################################
        #                       Create fingerprints                        #
        ####################################################################

        # Create clustering instance
        cluster = Cluster()
        # Cluster flows into network destinations
        cluster.fit(X, y)

        # Find cliques in clusters
        cliques = CrossCorrelationGraph(
                    window      = self.window,     # Set window size
                    correlation = self.correlation # Set correlation threshold
                  ).fit_predict(cluster)           # Get cliques

        # Transform cliques to fingerprints
        fingerprints = list(
            Fingerprint(c)                 # Cast to fingerprint
            for c in cliques if len(c) > 1 # Only select cliques > 1
        )

        ####################################################################
        #                   Assign fingerprints per flow                   #
        ####################################################################

        # Get network destination per flow
        destinations = cluster.predict(X)     # Get destination id per flow
        translation  = cluster.cluster_dict() # Get destinations for each id
        destinations = [translation.get(d) for d in destinations]

        # Get fingerprint per network destination
        mapping_fingerprints = dict()
        # Map destination to largest fingerprint by (#destinations, #flows)
        for fingerprint in sorted(fingerprints):
            for destination in fingerprint:
                mapping_fingerprints[destination] = fingerprint

        # Apply mapping
        prediction = np.array([
            mapping_fingerprints.get(x.destination,
            mapping_fingerprints.get(x.certificate,
            Fingerprint())) for x in X
        ])

        ####################################################################
        #             Handle unknown and similar fingerprints              #
        ####################################################################

        # For unknown results assign nearest neighbour
        prediction = self.assign_nearest(X, prediction)
        # Merge similar fingerprints
        prediction = self.merge_fingerprints(prediction, self.similarity)

        # Return prediction
        return prediction

    ########################################################################
    #                          Auxiliary methods                           #
    ########################################################################

    def assign_nearest(self, X, y):
        """Set unassigned labels to that of nearest neighbours.

            Parameters
            ----------
            X : np.array of shape=(n_flows,)
                Array of original flows.

            y : np.array of shape=(n_flows,) and dtype=int
                Array of fingerprints.

            Returns
            -------
            result : np.array of shape=(n_flows,) and dtype=int
                Array of Fingerprints. Without any -1 labels.
            """
        ####################################################################
        #             Sort flows and fingerprints by timestamp             #
        ####################################################################

        # Sort flows by time
        sort_time = np.argsort(X)
        sort_orig = np.argsort(sort_time)

        # Sort by time
        X = X[sort_time]
        y = y[sort_time]
        # Get timestamps
        timestamps = np.asarray([x.time_start for x in X])

        ####################################################################
        #               Assign closest fingerprints in time                #
        ####################################################################

        # Get blocks of unassigned fingerprint indices
        blocks = list()
        block  = list()
        for i, fingerprint in enumerate(y):
            if fingerprint and block:
                blocks.append(np.asarray(block))
                block = list()
            elif not fingerprint:
                block.append(i)
        if block:
            blocks.append(np.asarray(block))

        # For each block of unassigned fingerprints compute new labels
        for block in blocks:
            # Get indices before and after block
            before = min(block) - 1
            after  = max(block) + 1
            # Get timestamps before and after block
            ts_before = X[before].time_start if before >= 0          else float('inf')
            ts_after  = X[after ].time_start if after  <  X.shape[0] else float('inf')
            # Get fingerprints before and after block
            fp_before = y[before] if before >= 0          else Fingerprint()
            fp_after  = y[after ] if after  <  X.shape[0] else Fingerprint()

            # Assign new fingerprints per block
            block_before = abs(timestamps[block] - ts_before) <\
                           abs(timestamps[block] - ts_after )
            y[block[ block_before]] = fp_before
            y[block[~block_before]] = fp_after

        # Return fingerprints in original order
        return y[sort_orig]


    def merge_fingerprints(self, fingerprints, threshold=1):
        """Merge fingerprints based on similarity.

            Parameters
            ----------
            fingerprints : list
                List of fingerprints to merge.

            Returns
            -------
            result : list
                Merged fingerprints
            """
        ####################################################################
        #           Case default: all fingerprints are different           #
        ####################################################################
        result = np.asarray(fingerprints)

        # Retrieve unique fingerprints
        unique = sorted(set(fingerprints))

        ####################################################################
        #                Case 1: all fingerprints are equal                #
        ####################################################################
        if threshold <= 0:
            # Create one big merged fingerprint out of all unique fingerprints
            result[:] = Fingerprint(set().union(*unique))

        ####################################################################
        #         Case 2: Merge fingerprints by 0 < threshold < 1          #
        ####################################################################
        elif threshold < 1:
            # Initialise fingerprinting pairs to merge
            pairs = set([
                # Define pairs
                (fp1, fp2)
                # For each combination of pairs
                for fp1, fp2 in self.score_combinations(unique, threshold)
                # Where similarity >= threshold
                if fp1.compare(fp2) >= threshold
            ])

            # Create mapping of original fingerprint -> merged fingerprint
            mapping = dict()
            # Loop over all fingerprints to be merged
            for fp1, fp2 in pairs:
                # Create merged fingerprint
                fp_merged = mapping.get(fp1, fp1).merge(
                            mapping.get(fp2, fp2))
                # Set mappings
                mapping[fp1] = fp_merged
                mapping[fp2] = fp_merged

            # Apply mapping
            result = np.array([mapping.get(fp, fp) for fp in fingerprints])

        ####################################################################
        #                    Return merged fingerprints                    #
        ####################################################################
        return result


    def score_combinations(self, fingerprints, threshold):
        """Generator for combinations of fingerprints where can be > threshold.

            IMPORTANT: This method is purely for efficiency purposes. It is
            based on the observation that fingerprints of size N require at
            least M equal destinations to reach the threshold. Alternatively
            `itertools.combinations(fingerprints, 2)`` may be used.

            Parameters
            ----------
            fingerprints : iterable
                Iterable of fingerprints

            threshold : float
                Threshold that specifies whether fingerprints should match.

            Yields
            ------
            fingerprint1, fingerprint2 : Fingerprint
                Fingerprints which may have a similarity score >= threshold
            """
        # Get unique fingerprints
        fingerprints = sorted(set(fingerprints))

        # Only compare fingerprints of similar lengths
        lengths = dict()
        # Loop over all fingerprints
        for fp in fingerprints:
            # Get fingerprint length
            length = len(fp)
            # Set fingerprint lengths
            lengths[length] = lengths.get(length, set()) | set([fp])

        # Get possible combinations of lengths
        for length, fingerprints in sorted(lengths.items()):
            # Skip length 0, they cannot be equal
            if length == 0: continue
            # Initialise set of length combinations to explore
            length_comb = set()
            # Get length of possible next combination
            length2 = length
            # Check if score is possible
            while (length2-length) / max(length2, 1) <= (1-threshold):
                # Check if length is possible
                if length2 in lengths:
                    # If both are possible, add length as combination
                    length_comb.add(length2)
                # Increment next combination to check
                length2 += 1
            # Loop over all combinations of lengths
            for length2 in length_comb:
                # If lengths are equal, return combinations in self
                if length == length2:
                    yield from combinations(lengths.get(length), 2)
                # If lengths are not equal, return combinations between sets
                else:
                    a = lengths.get(length )
                    b = lengths.get(length2)
                    yield from ((x,y) for x in a for y in b)





    def map(self, fingerprints_test, fingerprints_train, verbose=False):
        """Map training fingerprints to testing fingerprints.

            Parameters
            ----------
            fingerprints_test : list
                List of fingerprints from which to map (keys).

            fingerprints_train : list
                List of fingerprints to which to map (values).

            verbose : boolean, default=False
                If set, prints progress

            Returns
            -------
            result : dict
                Dictionary of fingerprint_test -> closest_match
            """
        # Get unique fingerprints
        fingerprints_train = np.unique(fingerprints_train)
        fingerprints_test  = np.unique(fingerprints_test)

        # Create mappings
        mapping_train = dict()

        # Loop over fingerprints
        for fp in fingerprints_train:
            # Extract keys
            for key in fp:
                # Add fingerprint to each key
                mapping_train[key] = mapping_train.get(key, set()) | set([fp])

        # Refine mapping to fp_test -> set([fps_train labels])
        mapping = dict()
        # Loop over all testing fingerprints
        for i, fp in enumerate(fingerprints_test):

            # Print progress if verbose
            if verbose:
                print("{}/{}".format(i+1, fingerprints_test.shape[0]), end='\r')

            # Initialise set
            matches = set()

            # Loop over all keys of fingerprint
            for key in fp:
                # Get possible fingerprints
                matches |= mapping_train.get(key, set())

            # Initialise highest score
            highest_score = 0

            # Loop over all matches
            for match in matches:
                # Get score
                score = fp.compare(match)
                # If larger than highest score, replace match
                if score > highest_score:
                    mapping[fp] = match
                    highest_score = score

        # Return result
        return mapping

    def isin(self, fingerprints_test, fingerprints_train, similarity, verbose=False):
        """Check if testing fingerprints are in training fingerprints.

            Parameters
            ----------
            fingerprints_test : list
                List of fingerprints to check.

            fingerprints_train : list
                List of fingerprints to check against.

            similarity : float
                Minimum required similarity for mapping.

            verbose : boolean, default=False
                If set, prints progress

            Returns
            -------
            result : dict
                Dictionary of fingerprint_test -> True/False
            """
        # Get unique fingerprints
        fingerprints_train = np.unique(fingerprints_train)
        fingerprints_test  = np.unique(fingerprints_test)

        # Create mappings
        mapping_train = dict()

        # Loop over fingerprints
        for fp in fingerprints_train:
            # Extract keys
            for key in fp:
                # Add fingerprint to each key
                mapping_train[key] = mapping_train.get(key, set()) | set([fp])

        # Refine mapping to fp_test -> True/False
        mapping = {fp: False for fp in fingerprints_test}
        # Loop over all testing fingerprints
        for i, fp in enumerate(fingerprints_test):

            # Print progress if verbose
            if verbose:
                print("{}/{}".format(i+1, fingerprints_test.shape[0]), end='\r')

            # Initialise set
            matches = set()

            # Loop over all keys of fingerprint
            for key in fp:
                # Get possible fingerprints
                matches |= mapping_train.get(key, set())

            # Loop over all matches
            for match in matches:
                # Get score
                score = fp.compare(match)
                # If >= than similarity, set match to True
                if score >= similarity:
                    mapping[fp] = True
                    break

        # Return result
        return mapping