Featured image of post Spam Detection and Analysis (Part 2 - Analysis with R)

Spam Detection and Analysis (Part 2 - Analysis with R)

Spam is still a common attack method. Most of the email services have spam filters that can help us block and filter out most of the emails with commercial, fraudulent and malicious content. The purpose of the project is to explore and analyze the email header to identify the features that can tell us which emails are malicious.

Spam is still a common attack method. Most of the email services have spam filters that can help us block and filter out most of the emails with commercial, fraudulent and malicious content. The purpose of the project is to explore and analyze the email header to identify the features that can tell us which emails are malicious.

This is Part 2 of the series. For how to parse emails, please check Part 1.

Data Exploration

Import the data to R

For this part of the project, I used the R language to explore the data and build the model. Therefore, we first need to import the CSV into R.

1
# Import the cleaned email-header dataset; keep text columns as character
# strings. Use the literal FALSE rather than the reassignable alias F.
em <- read.csv("gamilspamedited.csv", stringsAsFactors = FALSE)
'data.frame':	272 obs. of  18 variables:
 $ Subject            : chr  
 $ Return.Path.Address: chr  
 $ Date               : chr  
 $ Reply.To.Address   : chr
 $ Content.Type       : chr  
 $ From.Address       : chr
 $ IP                 : chr  
 $ SPF                : chr  
 $ DMARC              : chr  
 $ DKIM               : chr
 $ Country            : chr  
 $ Regin              : chr  
 $ City               : chr  
 $ IPv6.Indicator     : int  
 $ CAT                : chr  
 $ Reputation         : chr  

After cleaning the data, there are 272 entries and 18 columns left.

Preprocess the date

Create Variables

Create hashtml and hasattach columns to label whether the email uses HTML for its main content and whether the email has an attachment.

1
2
3
4
5
6
7
8
9
library(dplyr)

# Flag whether an email's MIME Content-Type implies an HTML-capable body
# ("1") or plain text only ("0"). Unlisted types fall through to NA.
# x is one row of em (a character vector, since apply() coerces the data
# frame to a character matrix); c1 names the column to inspect.
hashtml.func <- function(x, c1){
    case_when(x[c1] == "multipart/alternative" ~ "1",
              x[c1] == "multipart/mixed"       ~ "1",
              x[c1] == "multipart/related"     ~ "1",
              x[c1] == "text/html"             ~ "1",
              x[c1] == "text/plain"            ~ "0")
}
em$hashtml <- apply(em, 1, hashtml.func, c1 = 'Content.Type')

# Flag whether an email carries an attachment: among the observed types,
# only multipart/mixed implies one.
hasattach.func <- function(x, c1){
    case_when(x[c1] == "multipart/mixed"       ~ "1",
              x[c1] == "multipart/alternative" ~ "0",
              x[c1] == "multipart/related"     ~ "0",
              x[c1] == "text/html"             ~ "0",
              x[c1] == "text/plain"            ~ "0")
}
em$hasattach <- apply(em, 1, hasattach.func, c1 = 'Content.Type')

Compare the email’s From address against the Return-Path and Reply-To addresses, and label whether they share the same registered domain.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
library(stringr)

# Return 1 if the From address and the Return-Path address share the same
# registered domain (the last two dot-separated labels of the host part
# after '@'), otherwise 0. A mismatch is a common spam/spoofing signal.
# x is one row of em as a character vector (from apply()).
rpavsfrom.func <- function(x){
    fa <- str_split(x['From.Address'], '@')[[1]][2]
    fasplit <- str_split(fa, '[.]')[[1]]
    # Reuse the already-computed split instead of re-splitting the host
    # three more times, as the original did.
    fareal <- paste(fasplit[length(fasplit) - 1], '.', fasplit[length(fasplit)], sep = "")
    rp <- str_split(x['Return.Path.Address'], '@')[[1]][2]
    rpsplit <- str_split(rp, '[.]')[[1]]
    rpreal <- paste(rpsplit[length(rpsplit) - 1], '.', rpsplit[length(rpsplit)], sep = "")
    if (fareal == rpreal) {
        return(1)
    }
    return(0)
}
em$rpavsfrom <- apply(em, 1, rpavsfrom.func)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
library(stringr)

# Return 1 if the From address and the Reply-To address share the same
# registered domain (last two dot-separated labels of the host part after
# '@'), otherwise 0. Mirrors rpavsfrom.func but for the Reply-To header.
replyvsfrom.func <- function(x){
    fa <- str_split(x['From.Address'], '@')[[1]][2]
    fasplit <- str_split(fa, '[.]')[[1]]
    # Reuse the already-computed split instead of re-splitting the host
    # three more times, as the original did.
    fareal <- paste(fasplit[length(fasplit) - 1], '.', fasplit[length(fasplit)], sep = "")
    ra <- str_split(x['Reply.To.Address'], '@')[[1]][2]
    rasplit <- str_split(ra, '[.]')[[1]]
    rareal <- paste(rasplit[length(rasplit) - 1], '.', rasplit[length(rasplit)], sep = "")
    if (fareal == rareal) {
        return(1)
    }
    return(0)
}
em$replyvsfrom <- apply(em, 1, replyvsfrom.func)

Bin and Categorize the Time and the Day.

1
2
3
4
5
6
7
8
library(lubridate)

# Extract the day of week from the "m/d/Y H:M" Date string of one row.
weekday.func <- function(x, c1){
    weekday <- wday(as.Date(str_split(x[c1], ' ')[[1]][1], format = '%m/%d/%Y'))
    return(weekday)
}
em$Weekday <- apply(em, 1, weekday.func, c1 = 'Date')
# Bug fix: lubridate::wday() numbers days 1 = Sunday ... 7 = Saturday by
# default, so the label sequence must start at "Sun" (the original started
# at "Mon", shifting every day by one). Using factor(levels = 1:7, ...)
# also keeps labels aligned even when some weekday never occurs in the data.
em$Weekday <- factor(em$Weekday, levels = 1:7,
                     labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
1
2
3
4
5
6
# Pull the clock portion ("HH:MM...") of the Date string and strip the
# colons, yielding e.g. "1437" for 14:37.
time.func <- function(x, c1){
    clock <- str_split(x[c1], ' ')[[1]][2]
    gsub(":", "", clock)
}
em$Time <- apply(em, 1, time.func, c1 = 'Date')
# Bin to the nearest hour: 1437 / 100 = 14.37, rounded to 14.
em$Time <- factor(round(as.numeric(em$Time) / 100))

Convert the data to proper formats.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Authentication results get explicit level orderings so "pass" is the
# first (reference) level in each.
em$SPF <- factor(em$SPF, levels = c("pass", "neutral", "fail", "temperror"))
em$DMARC <- factor(em$DMARC, levels = c("pass", "fail", "None"))
em$DKIM <- factor(em$DKIM, levels = c("pass", "None"))
# Remaining categorical columns take their levels from the data.
for (col in c("Country", "IPv6.Indicator", "CAT", "Reputation",
              "hashtml", "hasattach", "rpavsfrom", "replyvsfrom")) {
    em[[col]] <- factor(em[[col]])
}

Set base level.

1
2
# Make "US" and "Unsuspicious" the reference levels so regression
# coefficients are interpreted relative to these baseline categories.
em$Country <- relevel(em$Country,ref = "US")
em$Reputation <- relevel(em$Reputation,ref = "Unsuspicious")

Re-code the target to numeric.

1
# Binary target: 1 = Spam, 0 = anything else.
em$CAT <- as.numeric(em$CAT == "Spam")

Explore the Relationship.

1
2
3
# Fix the RNG state for the reproducible train/validation split below.
set.seed(11)
# Keep only the modeling columns. NOTE(review): selecting by position is
# fragile — these indices presumably map to the engineered features plus
# the target; verify they still line up if columns are added or reordered.
selected.var <- c(7, 9, 10, 11, 12, 15, 16,17,18,19,20,21,22,23,24)
selected.df <- em[, selected.var]
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Build a fully numeric copy of the selected data for the correlation
# charts: each factor is relabeled "1".."k" and replaced by its integer
# level code. The relabeling of selected.df itself is kept (as in the
# original) so downstream model coefficients carry the numeric labels.
emnum <- selected.df
numeric.coded <- c("SPF", "DMARC", "DKIM", "Country", "IPv6.Indicator",
                   "Reputation", "hashtml", "hasattach", "rpavsfrom",
                   "replyvsfrom", "Weekday", "Time")
for (col in numeric.coded) {
    levels(selected.df[[col]]) <- as.character(seq_along(levels(selected.df[[col]])))
    emnum[[col]] <- as.numeric(selected.df[[col]])
}

Create correlation matrix.

1
2
3
library("PerformanceAnalytics")
# Spearman correlations with per-variable histograms on the diagonal.
# Fix: pch must be the numeric plotting symbol 16 (solid circle); the
# original passed the string "16", which plots a literal character instead.
chart.Correlation(emnum, method="spearman",histogram=TRUE,pch=16)

png

1
2
3
library(corrr)

# Network view of the Spearman correlation matrix; edges with absolute
# correlation below 0.1 are hidden to reduce clutter.
emnum %>% cor(method="spearman") %>% network_plot(min_cor=0.1)

png

Build Model

Partition Data

Create Training and Validation Sets

1
2
3
4
# Partition the data: a random 60% of rows for training, the remaining
# 40% for validation (reproducible thanks to set.seed above).
n.obs <- nrow(em)
train.index <- sample(seq_len(n.obs), n.obs * 0.6)
train.df <- selected.df[train.index, ]
valid.df <- selected.df[-train.index, ]

Fit a Logistic Regression Model

1
2
# Logistic regression of the spam indicator on authentication results,
# geography, sender reputation, address-mismatch flags and weekday.
# NOTE(review): Message is one of the columns truncated from the str()
# output above — presumably a message-size feature; confirm against Part 1.
logit.reg <- glm(CAT ~ Message+DMARC+Country+DKIM+Reputation+replyvsfrom+rpavsfrom+Weekday, data = train.df, family = "binomial")
summary(logit.reg)
Call:
glm(formula = CAT ~ Message + DMARC + Country + DKIM + Reputation +
    replyvsfrom + rpavsfrom + Weekday, family = "binomial", data = train.df)

Deviance Residuals:
     Min        1Q    Median        3Q       Max  
-3.09465  -0.18311  -0.04540  -0.00298   2.73259  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -4.880e+00  1.891e+00  -2.581 0.009848 **
Message      -2.864e-05  1.803e-05  -1.589 0.112145    
DMARC2       -1.101e+01  2.400e+03  -0.005 0.996340    
DMARC3        6.306e+00  1.445e+00   4.364 1.28e-05 ***
Country2      4.332e+00  1.767e+00   2.451 0.014247 *  
DKIM2        -3.987e+00  2.255e+00  -1.768 0.077077 .  
Reputation2   9.137e+00  2.607e+00   3.505 0.000457 ***
replyvsfrom2 -5.052e+00  1.214e+00  -4.162 3.16e-05 ***
rpavsfrom2   -2.141e+00  1.044e+00  -2.052 0.040209 *  
Weekday2      3.313e+00  1.764e+00   1.878 0.060359 .  
Weekday3      3.392e+00  1.544e+00   2.197 0.028011 *  
Weekday4      6.549e-01  1.684e+00   0.389 0.697268    
Weekday5      5.326e+00  1.607e+00   3.315 0.000918 ***
Weekday6      1.686e+00  1.216e+00   1.387 0.165409    
Weekday7      2.898e+00  1.634e+00   1.773 0.076224 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 172.126  on 162  degrees of freedom
Residual deviance:  56.569  on 148  degrees of freedom
AIC: 86.569

Number of Fisher Scoring iterations: 15

Predict on the validation dataset and find the best classification threshold.

1
2
3
4
5
6
# Predicted spam probabilities for the held-out validation rows.
logit.reg.pred <- predict(logit.reg, valid.df,  type = "response")
library(pROC)
# ROC curve of predicted probability vs. the true label.
r <- roc(valid.df$CAT, logit.reg.pred)
plot.roc(r)
# Classify using the Youden-optimal threshold reported by coords();
# [[1]] extracts the threshold value from its result.
pred <- ifelse(logit.reg.pred > coords(r, x = "best")[[1]], 1, 0)
print(coords(r, x = "best"))
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var



  threshold specificity sensitivity
  0.3222076   0.9625000   0.9655172

1
2
library(caret)
# Confusion matrix on the validation set, treating spam (1) as the
# positive class so sensitivity measures spam detection rate.
confusionMatrix(factor(pred), factor(valid.df$CAT), positive = "1")
Loading required package: lattice
Loading required package: ggplot2



Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 77  1
         1  3 28

               Accuracy : 0.9633          
                 95% CI : (0.9087, 0.9899)
    No Information Rate : 0.7339          
    P-Value [Acc > NIR] : 2.433e-10       

                  Kappa : 0.9081          
 Mcnemar's Test P-Value : 0.6171          

            Sensitivity : 0.9655          
            Specificity : 0.9625          
         Pos Pred Value : 0.9032          
         Neg Pred Value : 0.9872          
             Prevalence : 0.2661          
         Detection Rate : 0.2569          
   Detection Prevalence : 0.2844          
      Balanced Accuracy : 0.9640          

       'Positive' Class : 1               
comments powered by Disqus
All contents licensed under CC BY-NC-SA 4.0 本站所有内容基于CC BY-NC-SA 4.0协议发布,转载需要署名
Built with Hugo
Theme Stack designed by Jimmy