Source code for fingerprints

from itertools               import combinations
import numpy as np

try:
    from .cluster import Cluster
    from .cross_correlation_graph import CrossCorrelationGraph
    from .fingerprint import Fingerprint
except:
    try:
        from cluster import Cluster
        from cross_correlation_graph import CrossCorrelationGraph
        from fingerprint import Fingerprint
    except Exception as e:
        raise ValueError(e)

[docs]class FingerprintGenerator(object): """Generator of FlowPrint Fingerprint objects from flows Attributes ---------- batch : float Threshold for the batch size in seconds window : float Threshold for the window size in seconds correlation : float Threshold for the minimum required correlation similarity : float Threshold for the minimum required similarity """
[docs] def __init__(self, batch=300, window=30, correlation=0.1, similarity=0.9): """Generate FlowPrint Fingerprint objects from flows Parameters ---------- batch : float, default=300 Threshold for the batch size in seconds window : float, default=30 Threshold for the window size in seconds correlation : float, default=0.1 Threshold for the minimum required correlation similarity : float, default=0.9 Threshold for the minimum required similarity """ # Set FlowPrint parameters self.batch = batch self.window = window self.correlation = correlation self.similarity = similarity
######################################################################## # Fit-Predict method # ########################################################################
[docs] def fit_predict(self, X, y=None): """Create fingerprints from given samples in X. Parameters ---------- X : array-like of shape=(n_samples,) Samples (Flow objects) from which to generate fingerprints. y : array-like of shape=(n_samples,), optional Labels corresponding to X. If given, they will be encorporated into each fingerprint. Returns ------- result : np.array of shape=(n_samples,) Resulting fingerprints. """ #################################################################### # Setup fingerprints # #################################################################### # Transform to arrays X = np.asarray(X) y = np.asarray(y) if y is not None else np.zeros(X.shape[0], dtype=int) # Initialise result result = (np.zeros(X.shape[0]) - 1).astype(object) #################################################################### # Divide into batches # #################################################################### # Sort X and y by time sort_time = np.argsort(X) sort_orig = np.argsort(sort_time) X = X[sort_time] y = y[sort_time] # Divide X and y in batches of size batch sort_batch = np.array([x.time_start for x in X]) # Compute number of required batches if sort_batch.shape[0]: batches = int( np.ceil((max(sort_batch) - min(sort_batch)) / self.batch) ) else: batches = 0 # Get edges of batches _, edges = np.histogram(sort_batch, bins=max(1, batches)) # Sort indices into batches batches = np.digitize(sort_batch, edges[:-1]) #################################################################### # Create fingerprints for each batch # #################################################################### # Loop over each batch for batch in np.unique(batches): # Get data for given batch X_batch = X[batches == batch] y_batch = y[batches == batch] # Create fingerprints for single batch prediction = self._fit_single_batch_(X_batch, y_batch) result[batches == batch] = prediction #################################################################### # Merge similar fingerprints # #################################################################### # Merge similar fingerprints on all timeslots result = self.merge_fingerprints(result, self.similarity) # For unknown results assign nearest neighbour result = self.assign_nearest(X, result) # Return result return result[sort_orig]
def _fit_single_batch_(self, X, y=None): """Create fingerprints for a given batch of flows. Parameters ---------- X : array-like of shape=(n_samples_batch,) Samples (Flow objects) from which to generate fingerprints. y : array-like of shape=(n_samples_batch,), optional Labels corresponding to X. If given, they will be encorporated into each fingerprint. Returns ------- np.array of shape=(n_samples,) Resulting fingerprints corresponding to each flow. """ #################################################################### # Create fingerprints # #################################################################### # Create clustering instance cluster = Cluster() # Cluster flows into network destinations cluster.fit(X, y) # Find cliques in clusters cliques = CrossCorrelationGraph( window = self.window, # Set window size correlation = self.correlation # Set correlation threshold ).fit_predict(cluster) # Get cliques # Transform cliques to fingerprints fingerprints = list( Fingerprint(c) # Cast to fingerprint for c in cliques if len(c) > 1 # Only select cliques > 1 ) #################################################################### # Assign fingerprints per flow # #################################################################### # Get network destination per flow destinations = cluster.predict(X) # Get destination id per flow translation = cluster.cluster_dict() # Get destinations for each id destinations = [translation.get(d) for d in destinations] # Get fingerprint per network destination mapping_fingerprints = dict() # Map destination to largest fingerprint by (#destinations, #flows) for fingerprint in sorted(fingerprints): for destination in fingerprint: mapping_fingerprints[destination] = fingerprint # Apply mapping prediction = np.array([ mapping_fingerprints.get(x.destination, mapping_fingerprints.get(x.certificate, Fingerprint())) for x in X ]) #################################################################### # Handle unknown and similar fingerprints # #################################################################### # For unknown results assign nearest neighbour prediction = self.assign_nearest(X, prediction) # Merge similar fingerprints prediction = self.merge_fingerprints(prediction, self.similarity) # Return prediction return prediction ######################################################################## # Auxiliary methods # ######################################################################## def assign_nearest(self, X, y): """Set unassigned labels to that of nearest neighbours. Parameters ---------- X : np.array of shape=(n_flows,) Array of original flows. y : np.array of shape=(n_flows,) and dtype=int Array of fingerprints. Returns ------- result : np.array of shape=(n_flows,) and dtype=int Array of Fingerprints. Without any -1 labels. """ #################################################################### # Sort flows and fingerprints by timestamp # #################################################################### # Sort flows by time sort_time = np.argsort(X) sort_orig = np.argsort(sort_time) # Sort by time X = X[sort_time] y = y[sort_time] # Get timestamps timestamps = np.asarray([x.time_start for x in X]) #################################################################### # Assign closest fingerprints in time # #################################################################### # Get blocks of unassigned fingerprint indices blocks = list() block = list() for i, fingerprint in enumerate(y): if fingerprint and block: blocks.append(np.asarray(block)) block = list() elif not fingerprint: block.append(i) if block: blocks.append(np.asarray(block)) # For each block of unassigned fingerprints compute new labels for block in blocks: # Get indices before and after block before = min(block) - 1 after = max(block) + 1 # Get timestamps before and after block ts_before = X[before].time_start if before >= 0 else float('inf') ts_after = X[after ].time_start if after < X.shape[0] else float('inf') # Get fingerprints before and after block fp_before = y[before] if before >= 0 else Fingerprint() fp_after = y[after ] if after < X.shape[0] else Fingerprint() # Assign new fingerprints per block block_before = abs(timestamps[block] - ts_before) <\ abs(timestamps[block] - ts_after ) y[block[ block_before]] = fp_before y[block[~block_before]] = fp_after # Return fingerprints in original order return y[sort_orig] def merge_fingerprints(self, fingerprints, threshold=1): """Merge fingerprints based on similarity. Parameters ---------- fingerprints : list List of fingerprints to merge. Returns ------- result : list Merged fingerprints """ #################################################################### # Case default: all fingerprints are different # #################################################################### result = np.asarray(fingerprints) # Retrieve unique fingerprints unique = sorted(set(fingerprints)) #################################################################### # Case 1: all fingerprints are equal # #################################################################### if threshold <= 0: # Create one big merged fingerprint out of all unique fingerprints result[:] = Fingerprint(set().union(*unique)) #################################################################### # Case 2: Merge fingerprints by 0 < threshold < 1 # #################################################################### elif threshold < 1: # Initialise fingerprinting pairs to merge pairs = set([ # Define pairs (fp1, fp2) # For each combination of pairs for fp1, fp2 in self.score_combinations(unique, threshold) # Where similarity >= threshold if fp1.compare(fp2) >= threshold ]) # Create mapping of original fingerprint -> merged fingerprint mapping = dict() # Loop over all fingerprints to be merged for fp1, fp2 in pairs: # Create merged fingerprint fp_merged = mapping.get(fp1, fp1).merge( mapping.get(fp2, fp2)) # Set mappings mapping[fp1] = fp_merged mapping[fp2] = fp_merged # Apply mapping result = np.array([mapping.get(fp, fp) for fp in fingerprints]) #################################################################### # Return merged fingerprints # #################################################################### return result def score_combinations(self, fingerprints, threshold): """Generator for combinations of fingerprints where can be > threshold. IMPORTANT: This method is purely for efficiency purposes. It is based on the observation that fingerprints of size N require at least M equal destinations to reach the threshold. Alternatively `itertools.combinations(fingerprints, 2)`` may be used. Parameters ---------- fingerprints : iterable Iterable of fingerprints threshold : float Threshold that specifies whether fingerprints should match. Yields ------ fingerprint1, fingerprint2 : Fingerprint Fingerprints which may have a similarity score >= threshold """ # Get unique fingerprints fingerprints = sorted(set(fingerprints)) # Only compare fingerprints of similar lengths lengths = dict() # Loop over all fingerprints for fp in fingerprints: # Get fingerprint length length = len(fp) # Set fingerprint lengths lengths[length] = lengths.get(length, set()) | set([fp]) # Get possible combinations of lengths for length, fingerprints in sorted(lengths.items()): # Skip length 0, they cannot be equal if length == 0: continue # Initialise set of length combinations to explore length_comb = set() # Get length of possible next combination length2 = length # Check if score is possible while (length2-length) / max(length2, 1) <= (1-threshold): # Check if length is possible if length2 in lengths: # If both are possible, add length as combination length_comb.add(length2) # Increment next combination to check length2 += 1 # Loop over all combinations of lengths for length2 in length_comb: # If lengths are equal, return combinations in self if length == length2: yield from combinations(lengths.get(length), 2) # If lengths are not equal, return combinations between sets else: a = lengths.get(length ) b = lengths.get(length2) yield from ((x,y) for x in a for y in b) def map(self, fingerprints_test, fingerprints_train, verbose=False): """Map training fingerprints to testing fingerprints. Parameters ---------- fingerprints_test : list List of fingerprints from which to map (keys). fingerprints_train : list List of fingerprints to which to map (values). verbose : boolean, default=False If set, prints progress Returns ------- result : dict Dictionary of fingerprint_test -> closest_match """ # Get unique fingerprints fingerprints_train = np.unique(fingerprints_train) fingerprints_test = np.unique(fingerprints_test) # Create mappings mapping_train = dict() # Loop over fingerprints for fp in fingerprints_train: # Extract keys for key in fp: # Add fingerprint to each key mapping_train[key] = mapping_train.get(key, set()) | set([fp]) # Refine mapping to fp_test -> set([fps_train labels]) mapping = dict() # Loop over all testing fingerprints for i, fp in enumerate(fingerprints_test): # Print progress if verbose if verbose: print("{}/{}".format(i+1, fingerprints_test.shape[0]), end='\r') # Initialise set matches = set() # Loop over all keys of fingerprint for key in fp: # Get possible fingerprints matches |= mapping_train.get(key, set()) # Initialise highest score highest_score = 0 # Loop over all matches for match in matches: # Get score score = fp.compare(match) # If larger than highest score, replace match if score > highest_score: mapping[fp] = match highest_score = score # Return result return mapping def isin(self, fingerprints_test, fingerprints_train, similarity, verbose=False): """Check if testing fingerprints are in training fingerprints. Parameters ---------- fingerprints_test : list List of fingerprints to check. fingerprints_train : list List of fingerprints to check against. similarity : float Minimum required similarity for mapping. verbose : boolean, default=False If set, prints progress Returns ------- result : dict Dictionary of fingerprint_test -> True/False """ # Get unique fingerprints fingerprints_train = np.unique(fingerprints_train) fingerprints_test = np.unique(fingerprints_test) # Create mappings mapping_train = dict() # Loop over fingerprints for fp in fingerprints_train: # Extract keys for key in fp: # Add fingerprint to each key mapping_train[key] = mapping_train.get(key, set()) | set([fp]) # Refine mapping to fp_test -> True/False mapping = {fp: False for fp in fingerprints_test} # Loop over all testing fingerprints for i, fp in enumerate(fingerprints_test): # Print progress if verbose if verbose: print("{}/{}".format(i+1, fingerprints_test.shape[0]), end='\r') # Initialise set matches = set() # Loop over all keys of fingerprint for key in fp: # Get possible fingerprints matches |= mapping_train.get(key, set()) # Loop over all matches for match in matches: # Get score score = fp.compare(match) # If >= than similarity, set match to True if score >= similarity: mapping[fp] = True break # Return result return mapping