AI Machine Learning Code to Identify Malicious Traffic

Project Date January 13th, 2022
Code Available at lakshminkmeda.github
Tools Python, Google Collab

The following code is a part of a project 'Artificial Intelligence based Malicious Traffic Detection', carried out as a part of university dissertation. The idea of the project is to gather regular traffic from a sample network and use machine learning to derive meaningful patterns from the data, which in turn can be used to distinguish unidentified or unauthorized traffic in the network. A firewall does a similar job but is ideally used to monitor the traffic entering the network. This particular project can stay within a network and analyze the local traffic to detect any intrusions.

                # AI Code to Detect Malicious Traffic
                import numpy as np
                import pandas as pd
                import tensorflow as tf
                import glob
                from tensorflow import feature_column
                from tensorflow.keras import layers
                from sklearn.model_selection import train_test_split
                from pandas.api.types import CategoricalDtype
                from tensorflow.keras.callbacks import TensorBoard
                from google.colab import drive
                drive.mount('/content/drive')
                
                %load_ext tensorboard
                %tensorboard --logdir=logs
                
                #Use Pandas to create a dataframe
                path = "/content/drive/My Drive/AI/Dataset/"
                files = glob.glob(path + "/*.csv")
                print('File names:', files)
                dataframe = pd.DataFrame()
                content = []
                
                # To check all the csv files in the path
                for filename in files:
                    df = pd.read_csv(filename, index_col=None)
                    content.append(df)
                
                dataframe = pd.concat(content)
                
                dataframe["time"] = dataframe["time"].astype('category')
                dataframe["source_address"] = dataframe["source_address"].astype('category')
                dataframe["destination_address"] = dataframe["destination_address"].astype('category')
                dataframe["protocol"] = dataframe["protocol"].astype('category')
                dataframe["label"] = dataframe["label"].astype('category')
                dataframe["label_detail"] = dataframe["label_detail"].astype('category')
                dataframe["threat"] = dataframe["threat"].astype('category')
                dataframe["source_port"] = dataframe["source_port"].astype('category')
                dataframe["destination_port"] = dataframe["destination_port"].astype('category')
                dataframe["orig_packets"] = dataframe["orig_packets"].astype('category')
                
                dataframe["time_cat"] = dataframe["time"].cat.codes
                dataframe["source_address_cat"] = dataframe["source_address"].cat.codes
                dataframe["destination_address_cat"] = dataframe["destination_address"].cat.codes
                dataframe["protocol_cat"] = dataframe["protocol"].cat.codes
                dataframe["label_cat"] = dataframe["label"].cat.codes
                dataframe["label_detail_cat"] = dataframe["label_detail"].cat.codes
                dataframe["threat_cat"] = dataframe["threat"].cat.codes
                dataframe["source_port_cat"] = dataframe["source_port"].cat.codes
                dataframe["destination_port_cat"] = dataframe["destination_port"].cat.codes
                dataframe["orig_packets_cat"] = dataframe["orig_packets"].cat.codes

                #save dataframe with new columns for future datmapping
                dataframe.to_csv('dataframe-export-allcolumns.csv')

                #remove old columns
                del dataframe["time"]
                del dataframe["source_address"]
                del dataframe["destination_address"]
                del dataframe["protocol"]
                del dataframe["label"]
                del dataframe["label_detail"]
                del dataframe["source_port"]
                del dataframe["destination_port"]
                del dataframe["threat"]
                del dataframe["orig_packets"]

                #restore original names of columns
                dataframe.rename(columns={"source_address_cat": "source_address",
                                          "destination_address_cat": "destination_address",
                                          "time_cat": "time", "protocol_cat": "protocol",
                                          "label_detail_cat": "label_detail", "label_cat":
                                          "label", "threat_cat": "threat", "source_port_cat":
                                          "source_port", "destination_port_cat":
                                          "destination_port", "orig_packets_cat": "orig_packets"
                                          }, inplace=True)
                print(dataframe.head())
                print(dataframe.info())

                #save dataframe cleaned up
                dataframe.to_csv('dataframe-export-int-cleaned.csv')
                
                #dataframe = np.asarray(dataframe).astype(np.float32)
                #dataframe = dataframe.astype(np.float32)
                #tf.convert_to_tensor(dataframe, dtype=tf.int32)
                #Split the dataframe into train, validation, and test
                train, test = train_test_split(dataframe, test_size=0.2)
                train, val = train_test_split(train, test_size=0.2)
                print(len(train), 'train examples')
                print(len(val), 'validation examples')
                print(len(test), 'test examples')

                #Create an input pipeline using tf.data
                # A utility method to create a tf.data dataset from a Pandas Dataframe
                def df_to_dataset(dataframe, shuffle=True, batch_size=32):
                  dataframe = dataframe.copy()
                  labels = dataframe.pop('threat')
                  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
                  if shuffle:
                    ds = ds.shuffle(buffer_size=len(dataframe))
                  ds = ds.batch(batch_size)
                  return ds

                #choose columns needed for calculations (features)
                feature_columns = []
                for header in ["source_address", "source_port", "protocol", "label", "label_detail"]:
                    feature_columns.append(feature_column.numeric_column(header))

                #create feature layer
                feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

                #set batch size pipeline
                batch_size = 32
                train_ds = df_to_dataset(train, batch_size=batch_size)
                val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
                test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
                
                #create tensorboard callback
                tensorboard_callback = [TensorBoard(
                    log_dir="logs",
                    histogram_freq=1,
                    write_graph=True,
                    write_images=False,
                    update_freq="epoch",
                )]
                
                #create compile and train model
                model = tf.keras.Sequential([
                  feature_layer,
                  layers.Dense(128, activation='relu'),
                  layers.Dense(128, activation='relu'),
                  layers.Dense(1)
                ])
                model.compile(optimizer='adam',
                              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                              metrics=['accuracy'])
                model.fit(train_ds,
                          validation_data=val_ds,
                          epochs=10, callbacks=[tensorboard_callback])
                loss, accuracy = model.evaluate(test_ds)
                print("Accuracy", accuracy)
                model.summary()
            
The project required extreme computing power due to multiple and large datasets used, hence was run using Google Collab and TensorBoard.