Source code for browser_detector

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

# Import Cluster relative to the package when possible, falling back to a
# plain module import so the file also works when run outside the package.
try:
    from .cluster import Cluster
# Not a bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
# `Exception` (rather than only ImportError) is caught because a relative
# import without a parent package raises different types across Python
# versions.
except Exception:
    try:
        from cluster import Cluster
    except Exception as e:
        # Kept as ValueError for backwards compatibility with existing callers
        raise ValueError(e)


class BrowserDetector(object):
    """Detector for browser application

        Wraps a random forest that labels individual flows as browser (1) or
        non-browser (-1), using the rolling features computed by `features`.

        Attributes
        ----------
        classifier : sklearn.ensemble.RandomForestClassifier
            Random forest classifier used for classifying individual datapoints

        before : float
            Time frame in seconds to remove before detected browser

        after : float
            Time frame in seconds to remove after detected browser
    """

    def __init__(self, before=10, after=10, random_state=42):
        """Detector for browser application

            Parameters
            ----------
            before : float, default = 10
                Time frame in seconds to remove before detected browser

            after : float, default = 10
                Time frame in seconds to remove after detected browser

            random_state : int, RandomState instance or None, default = 42
                Passed on unchanged to the RandomForestClassifier.
                If int, random_state is the seed used by the random number
                generator; If RandomState instance, random_state is the random
                number generator; If None, the random number generator is the
                RandomState instance used by `np.random`
            """
        # Initialise classifier
        self.classifier = RandomForestClassifier(
            n_estimators=10, random_state=random_state
        )
        # Initialise before and after seconds
        self.before = before
        self.after = after

    ########################################################################
    #                         Fit/Predict methods                          #
    ########################################################################

    def fit(self, X, y):
        """Fit the classifier with browser and non-browser traffic

            Parameters
            ----------
            X : array-like of shape=(n_samples,)
                Flows to fit the classifier with. Each flow must expose the
                `time_start` and `lengths` attributes read by `features`.

            y : array-like of shape=(n_samples,)
                Array of labels, -1 for non-browser, 1 for browser

            Returns
            -------
            result : self
                Returns self for fit_predict method
            """
        # np.asarray lets plain lists be passed as labels as well
        labels = np.asarray(y).astype(int)
        # Fit classifier
        self.classifier.fit(self.features(X), labels)
        # Return self
        return self

    def predict(self, X, y=None):
        """Predict whether samples from X are browser: 1 or non_browser: -1

            Parameters
            ----------
            X : array-like of shape=(n_samples,)
                Flows to predict with the classifier. Each flow must expose
                the `time_start` and `lengths` attributes.

            y : ignored

            Returns
            -------
            result : np.array of shape=(n_samples,)
                -1 if sample from X is not from browser, 1 if sample from X is
                from browser. Entries are in the same order as X.
            """
        features = self.features(X)
        # Nothing to classify; avoid handing an empty matrix to the classifier
        if features.shape[0] == 0:
            return np.zeros(0, dtype=int)

        # Get prediction from Random Forest
        predictions = self.classifier.predict(features)

        ################################################################
        #       Label temporally close flows as browser as well        #
        ################################################################

        # Get timestamps from flows
        timestamps = np.asarray([x.time_start for x in X])

        # Work on a time-sorted view so the window lookup does not depend on
        # the order in which the flows were passed (mergesort is stable).
        order = np.argsort(timestamps, kind='mergesort')
        ordered_ts = timestamps[order]
        browser_ts = ordered_ts[predictions[order] == 1]

        # Inclusive window [ts - before, ts + after] around every browser flow,
        # expressed as slices of the time-sorted flows. Using searchsorted also
        # covers the very first flow, which a `range(i, 0, -1)` walk skips.
        starts = np.searchsorted(ordered_ts, browser_ts - self.before,
                                 side='left')
        stops = np.searchsorted(ordered_ts, browser_ts + self.after,
                                side='right')

        for start, stop in zip(starts, stops):
            nearby = order[start:stop]
            # Raise non-browser flows (-1) in the window to the marker 0;
            # flows already predicted as browser (1) are left untouched.
            predictions[nearby] = np.maximum(predictions[nearby], 0)

        # Set detected by timeframe to -1
        # NOTE(review): this resets every flow that was only flagged through
        # the time window back to non-browser, so for labels in {-1, 1} the
        # `before`/`after` window does not change the returned predictions.
        # Kept as is to preserve existing results; confirm whether these flows
        # were meant to be reported as browser (1) instead.
        predictions[predictions == 0] = -1

        # Return result
        return predictions

    def fit_predict(self, X, y):
        """Fit and predict the samples with the classifier as browser
            or non-browser traffic

            Parameters
            ----------
            X : array-like of shape=(n_samples,)
                Flows to fit the classifier with

            y : array-like of shape=(n_samples,)
                Array of labels, -1 for non-browser, 1 for browser

            Returns
            -------
            result : np.array of shape=(n_samples,)
                -1 if sample from X is not from browser, 1 if sample from X is
                from browser
            """
        return self.fit(X, y).predict(X)

    ########################################################################
    #                          Feature extraction                          #
    ########################################################################

    def features(self, X):
        """Returns flow features for determining whether flows are browser

            Parameters
            ----------
            X : array-like of shape=(n_samples,)
                Flows from which to extract features. Each flow must expose
                `time_start` (seconds, used as timestamp) and `lengths`
                (signed values: positive entries are summed as outgoing,
                negative entries as incoming).

            Returns
            -------
            result : np.array of shape=(n_samples, 4)
                Features for determining browser flows, one row per flow in
                the same order as X. The columns are
                [clusters', length incoming', length outgoing',
                incoming'/outgoing'] where the ' indicates the difference
                between consecutive flows (in time order) of the value
                computed over a rolling 5 second window. Non-finite values
                are replaced by 0.
            """
        print("Creating features")

        n_flows = len(X)
        # Without this guard the prepended 0 of the derivative would yield a
        # single spurious row for empty input.
        if n_flows == 0:
            return np.zeros((0, 4))

        # Compute dataframe of flow features
        df = pd.DataFrame({
            'outgoing': [sum(y for y in x.lengths if y > 0) for x in X],
            'incoming': [-sum(y for y in x.lengths if y < 0) for x in X],
            'cluster': Cluster().fit_predict(X),
            # Remember where each flow came from so that the rows can be put
            # back in the order of X after the time sort below.
            'position': np.arange(n_flows),
        }, index=pd.to_datetime([x.time_start for x in X], unit='s')
        ).sort_index(kind='mergesort')

        # Compute features as rolling changes
        # (time-based rolling windows require the sorted index)
        cluster = df['cluster'].rolling('5s').apply(
            lambda x: np.unique(x).shape[0], raw=True).values
        incoming = df['incoming'].rolling('5s').apply(np.mean, raw=True).values
        outgoing = df['outgoing'].rolling('5s').apply(np.mean, raw=True).values

        # Compute derivatives; the first flow has no predecessor, hence 0
        cluster = np.concatenate(([0], np.diff(cluster)))
        incoming = np.concatenate(([0], np.diff(incoming)))
        outgoing = np.concatenate(([0], np.diff(outgoing)))
        # Division by zero is expected here and imputed right below
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = incoming / outgoing

        # To numpy array (rows still in time order)
        ordered = np.asarray([cluster, incoming, outgoing, ratio]).T
        # Impute NaN and +/- infinity
        ordered[~np.isfinite(ordered)] = 0

        # Scatter rows back to the order of X so that they line up with the
        # labels given to fit and the timestamps used in predict.
        result = np.empty_like(ordered)
        result[df['position'].values] = ordered

        # Return result
        return result