The following code is part of a project, 'Artificial Intelligence based Malicious Traffic Detection', carried out as part of a university dissertation. The idea of the project is to gather regular traffic from a sample network and use machine learning to derive meaningful patterns from the data, which in turn can be used to distinguish unidentified or unauthorized traffic on the network. A firewall does a similar job but typically monitors traffic entering the network; this project instead sits inside the network and analyzes local traffic to detect intrusions.
```python
# AI Code to Detect Malicious Traffic
import glob

import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount('/content/drive')

# Colab/Jupyter magics: load TensorBoard and point it at the log directory
%load_ext tensorboard
%tensorboard --logdir=logs

# Use pandas to create a dataframe from every CSV file in the dataset folder
path = "/content/drive/My Drive/AI/Dataset/"
files = glob.glob(path + "*.csv")
print('File names:', files)

content = []
for filename in files:
    df = pd.read_csv(filename, index_col=None)
    content.append(df)
dataframe = pd.concat(content, ignore_index=True)

# Convert the raw columns to pandas categoricals and add an integer-coded
# copy of each one (suffix "_cat") that the model can consume
categorical_columns = ["time", "source_address", "destination_address",
                       "protocol", "label", "label_detail", "threat",
                       "source_port", "destination_port", "orig_packets"]
for col in categorical_columns:
    dataframe[col] = dataframe[col].astype('category')
    dataframe[col + "_cat"] = dataframe[col].cat.codes

# Save the dataframe with both raw and encoded columns for future data mapping
dataframe.to_csv('dataframe-export-allcolumns.csv')

# Drop the raw columns and restore the original names on the encoded ones
dataframe = dataframe.drop(columns=categorical_columns)
dataframe = dataframe.rename(
    columns={col + "_cat": col for col in categorical_columns})

print(dataframe.head())
print(dataframe.info())

# Save the cleaned-up, integer-only dataframe
dataframe.to_csv('dataframe-export-int-cleaned.csv')

# Split the dataframe into train, validation, and test sets
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')
```
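One subtlety worth calling out: `cat.codes` assigns integer codes from the categories present in this particular dataframe, so the same raw value must map to the same code when new traffic is scored later. The sketch below shows one way those mappings could be persisted; the helper name `export_category_mappings` and the JSON output path are illustrative rather than part of the original project, and it would have to be called while the dataframe still holds its raw categorical columns (i.e. before they are dropped above).

```python
import json

# Hypothetical helper: save each categorical column's value-to-code mapping
# so that raw traffic fields can be encoded consistently at inference time.
# pandas assigns codes in the order of .cat.categories, which is exactly
# what .cat.codes uses internally.
def export_category_mappings(df, columns, out_path="category-mappings.json"):
    mappings = {
        col: {str(value): code
              for code, value in enumerate(df[col].cat.categories)}
        for col in columns
    }
    with open(out_path, "w") as f:
        json.dump(mappings, f, indent=2)
```

Values never seen during training have no entry in such a mapping; pandas itself encodes unseen or missing values as -1, so an inference pipeline should decide explicitly how to handle that case.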
```python
# Create an input pipeline using tf.data

# A utility method to create a tf.data dataset from a pandas dataframe,
# using the "threat" column as the label
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('threat')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

# Choose the columns needed for calculations (features)
feature_columns = []
for header in ["source_address", "source_port", "protocol", "label", "label_detail"]:
    feature_columns.append(feature_column.numeric_column(header))

# Create the feature layer that feeds the selected features into the model
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Set up the batched input pipelines
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

# Create the TensorBoard callback (a single callback object; it is
# wrapped in a list when passed to model.fit)
tensorboard_callback = TensorBoard(
    log_dir="logs",
    histogram_freq=1,
    write_graph=True,
    write_images=False,
    update_freq="epoch",
)

# Create, compile, and train the model
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10,
          callbacks=[tensorboard_callback])

loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)
model.summary()
```

The project required substantial computing power because of the number and size of the datasets involved, so it was run on Google Colab, with TensorBoard used to monitor training.
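Because the final Dense layer has no activation and the loss is configured with `from_logits=True`, the trained model outputs raw logits rather than probabilities. The sketch below, built around a hypothetical `predict_threat` helper with placeholder feature values, shows one way a single integer-encoded flow could be scored:

```python
import tensorflow as tf

# Hypothetical helper: score one integer-encoded flow with the trained model.
def predict_threat(model, flow, threshold=0.5):
    # DenseFeatures expects a dict of batched tensors keyed by feature name
    batch = {name: tf.convert_to_tensor([value], dtype=tf.float32)
             for name, value in flow.items()}
    logit = model.predict(batch)                 # raw logit, shape (1, 1)
    probability = tf.sigmoid(logit).numpy()[0][0]
    return probability, probability >= threshold

# Illustrative placeholder values, not real codes from the dataset
example_flow = {"source_address": 12, "source_port": 443,
                "protocol": 1, "label": 0, "label_detail": 3}
probability, is_threat = predict_threat(model, example_flow)
print(f"P(threat) = {probability:.3f}, flagged: {is_threat}")
```

The 0.5 threshold is the conventional default for a sigmoid output; in an intrusion-detection setting it could be lowered to trade precision for recall.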