# Source code for browser_detector

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

# Import Cluster relative to the enclosing package when there is one, and fall
# back to a top-level import so the module also works when run standalone.
try:
    from .cluster import Cluster
# Only handle the errors a failed relative import can raise: ImportError
# (Python >= 3.6), SystemError (Python 3.3-3.5) or ValueError (Python 2).
# Anything else (including KeyboardInterrupt) is a real problem and propagates.
except (ImportError, SystemError, ValueError):
    try:
        from cluster import Cluster
    except Exception as e:
        # Kept as ValueError for backward compatibility with existing callers
        raise ValueError(e)


class BrowserDetector(object):
    """Detector for browser application traffic.

    Attributes
    ----------
    classifier : sklearn.ensemble.RandomForestClassifier
        Random forest classifier used for classifying individual datapoints

    before : float
        Time frame in seconds to consider before a detected browser flow

    after : float
        Time frame in seconds to consider after a detected browser flow
    """

    def __init__(self, before=10, after=10, random_state=42):
        """Detector for browser application traffic.

        Parameters
        ----------
        before : float, default = 10
            Time frame in seconds to consider before a detected browser flow

        after : float, default = 10
            Time frame in seconds to consider after a detected browser flow

        random_state : int, RandomState instance or None, default = 42
            Passed unchanged to the RandomForestClassifier.
            If int, random_state is the seed used by the random number
            generator; If RandomState instance, random_state is the random
            number generator; If None, the random number generator is the
            RandomState instance used by `np.random`
        """
        # Initialise classifier
        self.classifier = RandomForestClassifier(
            n_estimators=10,
            random_state=random_state,
        )

        # Initialise before and after seconds
        self.before = before
        self.after = after

    ########################################################################
    #                         Fit/Predict methods                          #
    ########################################################################

    def fit(self, X, y):
        """Fit the classifier with browser and non-browser traffic.

        Parameters
        ----------
        X : array-like of shape=(n_samples,)
            Flows to fit the classifier with. Each flow must expose the
            attributes `time_start` and `lengths` (see `features`).

        y : array-like of shape=(n_samples,)
            Array of labels, -1 for non-browser, 1 for browser

        Returns
        -------
        result : self
            Returns self for fit_predict method
        """
        # np.asarray lets plain lists/tuples through, as the "array-like"
        # contract promises; ndarray input behaves exactly as before.
        labels = np.asarray(y).astype(int)

        # Fit classifier
        self.classifier.fit(self.features(X), labels)

        # Return self
        return self

    def predict(self, X, y=None):
        """Predict whether samples from X are browser: 1 or non_browser: -1.

        The random forest labels every flow individually. Afterwards, for
        every flow labelled 1, the neighbouring flows (by position in X) whose
        `time_start` lies within [ts - before, ts + after] are visited and any
        label below 0 is tagged as 0. Finally every 0 is mapped to -1.

        NOTE(review): because tagged flows end up as -1, `before`/`after` do
        not change the outcome when the classifier only emits -1 and 1. The
        section title suggests temporally close flows were meant to become 1;
        the final mapping is kept as-is since the original comment states -1
        explicitly. Confirm the intended behaviour.

        The neighbour scan walks X by index and stops at the first flow
        outside the window, so it presumably expects X ordered by
        `time_start` - verify against callers.

        Parameters
        ----------
        X : array-like of shape=(n_samples,)
            Flows to predict with the classifier

        y : ignored

        Returns
        -------
        result : np.array of shape=(n_samples,)
            -1 if sample from X is not from browser,
             1 if sample from X is from browser
        """
        # Get prediction from Random Forest
        predictions = self.classifier.predict(self.features(X))

        ################################################################
        #        Label temporally close flows as browser as well       #
        ################################################################

        # Get timestamps from flows
        timestamps = np.asarray([x.time_start for x in X])
        n_flows = timestamps.shape[0]

        # Tagging never creates or removes a label of 1, so the set of
        # detected browser flows can be determined once up front.
        for i in np.flatnonzero(predictions == 1):
            ts = timestamps[i]

            # Walk back to (and including) the first flow; the previous
            # range(i, 0, -1) silently skipped index 0.
            for j in range(i, -1, -1):
                # Test the window before tagging so that a flow outside the
                # window is never touched.
                if timestamps[j] < ts - self.before:
                    break
                predictions[j] = max(predictions[j], 0)

            # Walk forward to the last flow
            for j in range(i, n_flows):
                if timestamps[j] > ts + self.after:
                    break
                predictions[j] = max(predictions[j], 0)

        # Set detected by timeframe to -1
        predictions[predictions == 0] = -1

        # Return result
        return predictions

    def fit_predict(self, X, y):
        """Fit and predict the samples with the classifier as browser or
        non-browser traffic.

        Parameters
        ----------
        X : array-like of shape=(n_samples,)
            Flows to fit the classifier with

        y : array-like of shape=(n_samples,)
            Array of labels, -1 for non-browser, 1 for browser

        Returns
        -------
        result : np.array of shape=(n_samples,)
            -1 if sample from X is not from browser,
             1 if sample from X is from browser
        """
        return self.fit(X, y).predict(X)

    ########################################################################
    #                          Feature extraction                          #
    ########################################################################

    def features(self, X):
        """Returns flow features for determining whether flows are browser.

        Row i of the result describes X[i], whatever order the flows are
        passed in.

        Parameters
        ----------
        X : array-like of shape=(n_samples,)
            Flows from which to extract features. Each flow must expose
            `time_start` (seconds) and `lengths` (iterable of numbers where
            positive values are summed as outgoing and negative values as
            incoming).

        Returns
        -------
        result : np.array of shape=(n_samples, 4)
            Features for determining browser flows.
            Currently the features are [clusters', length incoming',
            length outgoing', ratio incoming/outgoing'] where the '
            indicates the derivative (difference between consecutive flows
            in time order, 0 for the earliest flow). NaN and infinite values
            are replaced by 0.
        """
        print("Creating features")

        # Nothing to describe; without this guard the leading 0 of the
        # derivative would fabricate a single row for empty input.
        if len(X) == 0:
            return np.zeros((0, 4))

        # Time-based rolling windows need a monotonic index, so the flows are
        # processed in time order. A stable sort keeps ties in input order and
        # `order` lets us map the rows back afterwards.
        start = [x.time_start for x in X]
        order = np.argsort(np.asarray(start), kind='mergesort')

        # Compute dataframe of flow features
        df = pd.DataFrame({
            'outgoing': [ sum(y for y in x.lengths if y > 0) for x in X],
            'incoming': [-sum(y for y in x.lengths if y < 0) for x in X],
            'cluster' : Cluster().fit_predict(X),
        }, index=pd.to_datetime(start, unit='s')).iloc[order]

        # Compute features over a trailing window of 5 seconds
        window = '5s'
        cluster = df['cluster'].rolling(window).apply(
            lambda x: np.unique(x).shape[0], raw=True).values
        incoming = df['incoming'].rolling(window).apply(
            np.mean, raw=True).values
        outgoing = df['outgoing'].rolling(window).apply(
            np.mean, raw=True).values

        # Compute derivatives, the earliest flow has no predecessor -> 0
        cluster  = np.concatenate(([0], np.diff(cluster)))
        incoming = np.concatenate(([0], np.diff(incoming)))
        outgoing = np.concatenate(([0], np.diff(outgoing)))

        # Division by zero is expected here and imputed below
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = incoming / outgoing

        # To numpy array of shape=(n_samples, 4), still in time order
        by_time = np.column_stack((cluster, incoming, outgoing, ratio))

        # Impute NaN, inf and -inf
        by_time[~np.isfinite(by_time)] = 0

        # Scatter the rows back so that result[i] belongs to X[i]; callers
        # pair the rows with labels and timestamps given in input order.
        result = np.empty_like(by_time)
        result[order] = by_time

        # Return result
        return result