R/R 머신러닝

R26_Regression Tree와 Model Tree

Codezoy 2019. 11. 7. 11:57

whitewines.csv


# Regression Tree와 Model Tree
# Remove every object that ls() reports so the script starts from an
# empty workspace.
rm(list = ls())

# 1. Load the data
wine <- read.csv("mlwr/whitewines.csv")

# 2. Inspect the data / preprocessing
str(object = wine)
# 4,898 obs.(예시), 12 variables(특징 11개 + 종속 변수 quality) - white wine 데이터

> summary(wine)
fixed.acidity    volatile.acidity  citric.acid     residual.sugar     chlorides       free.sulfur.dioxide
Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600   Min.   :0.00900   Min.   :  2.00     
1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700   1st Qu.:0.03600   1st Qu.: 23.00     
Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200   Median :0.04300   Median : 34.00     
Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391   Mean   :0.04577   Mean   : 35.31     
3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900   3rd Qu.:0.05000   3rd Qu.: 46.00     
Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800   Max.   :0.34600   Max.   :289.00     
total.sulfur.dioxide    density             pH          sulphates         alcohol         quality     
Min.   :  9.0        Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
1st Qu.:108.0        1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   1st Qu.:5.000  
Median :134.0        Median :0.9937   Median :3.180   Median :0.4700   Median :10.40   Median :6.000  
Mean   :138.4        Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51   Mean   :5.878  
3rd Qu.:167.0        3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   3rd Qu.:6.000  
Max.   :440.0        Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20   Max.   :9.000  

# Distribution of the dependent variable (quality), drawn as a histogram
hist(wine$quality)

# Package used to fit regression trees.
# rpart: recursive partitioning
# Install only when the package is missing, so that re-running the script
# does not download and reinstall it every time.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)

# 3. Train the model
# Training data set (75%) / test data set (25%)
# Print the first rows of the data before splitting it.
head(wine)

  fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
1           6.7             0.62        0.24           1.10     0.039                   6
2           5.7             0.22        0.20          16.00     0.044                  41
3           5.9             0.19        0.26           7.40     0.034                  33
4           5.3             0.47        0.10           1.30     0.036                  11
5           6.4             0.29        0.21           9.65     0.041                  36
6           7.0             0.14        0.41           0.90     0.037                  22
  total.sulfur.dioxide density   pH sulphates  alcohol quality
1                   62 0.99340 3.41      0.32 10.40000       5
2                  113 0.99862 3.22      0.46  8.90000       6
3                  123 0.99500 3.49      0.42 10.10000       6
4                   74 0.99082 3.48      0.54 11.20000       4
5                  119 0.99334 2.99      0.34 10.93333       6
6                   95 0.99140 3.25      0.43 10.90000       6

# Split into a training set (first 75% of the rows) and a test set (the rest).
# The cut-off is derived from nrow(wine) instead of being hard-coded; for the
# 4,898-row file it is ceiling(3673.5) = 3674, i.e. rows 1-3674 and 3675-4898.
# NOTE(review): the split is positional, so it assumes the rows are already
# in random order - confirm.
train_size <- ceiling(nrow(wine) * 0.75)
is_train <- seq_len(nrow(wine)) <= train_size
wine_train <- wine[is_train, ]
wine_test <- wine[!is_train, ]


# Fit a regression tree on the training data with rpart, modelling quality
# from all the other columns, then print the fitted tree.
wine_rpart <- rpart(quality ~ ., data = wine_train)
print(wine_rpart)
n= 3674

node), split, n, deviance, yval
      * denotes terminal node

1) root 3674 2884.10500 5.871257  
   2) alcohol< 10.85 2318 1383.39300 5.606126  
     4) volatile.acidity>=0.2425 1332  672.77700 5.389640 *
     5) volatile.acidity< 0.2425 986  563.85800 5.898580  
      10) volatile.acidity>=0.2075 445  205.00220 5.723596 *
      11) volatile.acidity< 0.2075 541  334.02220 6.042514  
        22) density< 0.99788 455  246.88790 5.931868 *
        23) density>=0.99788 86   52.09302 6.627907 *
   3) alcohol>=10.85 1356 1059.22700 6.324484  
     6) free.sulfur.dioxide< 11.5 90  103.38890 5.388889 *
     7) free.sulfur.dioxide>=11.5 1266  871.45730 6.390995  
      14) alcohol< 11.76667 618  421.06310 6.179612 *
      15) alcohol>=11.76667 648  396.44440 6.592593 *



> summary(wine_rpart)
Call:
rpart(formula = quality ~ ., data = wine_train)
  n= 3674

          CP nsplit rel error    xerror       xstd
1 0.15307510      0 1.0000000 1.0014660 0.02477931
2 0.05088510      1 0.8469249 0.8484126 0.02361649
3 0.02925723      2 0.7960398 0.8099456 0.02310113
4 0.01870591      3 0.7667826 0.7825864 0.02184668
5 0.01038014      4 0.7480767 0.7630470 0.02104674
6 0.01000000      6 0.7273164 0.7458542 0.02069839

Variable importance
             alcohol              density     volatile.acidity            chlorides total.sulfur.dioxide
                  34                   23                   12                   11                    7
free.sulfur.dioxide       residual.sugar            sulphates
                   6                    5                    1

Node number 1: 3674 observations,    complexity param=0.1530751
  mean=5.871257, MSE=0.785004
  left son=2 (2318 obs) right son=3 (1356 obs)
  Primary splits:
      alcohol              < 10.85    to the left,  improve=0.15307510, (0 missing)
      density              < 0.992035 to the right, improve=0.10785780, (0 missing)
      chlorides            < 0.0395   to the right, improve=0.07512851, (0 missing)
      total.sulfur.dioxide < 158.5    to the right, improve=0.03991005, (0 missing)
      citric.acid          < 0.235    to the left,  improve=0.03608134, (0 missing)
  Surrogate splits:
      density              < 0.991995 to the right, agree=0.869, adj=0.645, (0 split)
      chlorides            < 0.0375   to the right, agree=0.755, adj=0.337, (0 split)
      total.sulfur.dioxide < 103.5    to the right, agree=0.688, adj=0.154, (0 split)
      residual.sugar       < 5.375    to the right, agree=0.667, adj=0.099, (0 split)
      sulphates            < 0.345    to the right, agree=0.645, adj=0.038, (0 split)

Node number 2: 2318 observations,    complexity param=0.0508851
  mean=5.606126, MSE=0.5968046
  left son=4 (1332 obs) right son=5 (986 obs)
  Primary splits:
      volatile.acidity    < 0.2425   to the right, improve=0.10608550, (0 missing)
      free.sulfur.dioxide < 13.5     to the left,  improve=0.03385955, (0 missing)
      citric.acid         < 0.235    to the left,  improve=0.03255713, (0 missing)
      alcohol             < 10.11667 to the left,  improve=0.03155448, (0 missing)
      chlorides           < 0.0585   to the right, improve=0.01654644, (0 missing)
  Surrogate splits:
      total.sulfur.dioxide < 136.5    to the right, agree=0.618, adj=0.101, (0 split)
      pH                   < 3.295    to the left,  agree=0.595, adj=0.049, (0 split)
      sulphates            < 0.675    to the left,  agree=0.592, adj=0.042, (0 split)
      alcohol              < 10.11667 to the left,  agree=0.587, adj=0.029, (0 split)
      density              < 0.999435 to the left,  agree=0.583, adj=0.019, (0 split)

Node number 3: 1356 observations,    complexity param=0.02925723
  mean=6.324484, MSE=0.781141
  left son=6 (90 obs) right son=7 (1266 obs)
  Primary splits:
      free.sulfur.dioxide  < 11.5     to the left,  improve=0.07966271, (0 missing)
      alcohol              < 11.76667 to the left,  improve=0.06087662, (0 missing)
      total.sulfur.dioxide < 67.5     to the left,  improve=0.04486822, (0 missing)
      residual.sugar       < 1.375    to the left,  improve=0.02973624, (0 missing)
      fixed.acidity        < 7.35     to the right, improve=0.02671823, (0 missing)
  Surrogate splits:
      total.sulfur.dioxide < 53.5     to the left,  agree=0.948, adj=0.211, (0 split)
      volatile.acidity     < 0.875    to the right, agree=0.935, adj=0.022, (0 split)

Node number 4: 1332 observations
  mean=5.38964, MSE=0.5050879

Node number 5: 986 observations,    complexity param=0.01038014
  mean=5.89858, MSE=0.5718641
  left son=10 (445 obs) right son=11 (541 obs)
  Primary splits:
      volatile.acidity     < 0.2075   to the right, improve=0.04404226, (0 missing)
      free.sulfur.dioxide  < 24.5     to the left,  improve=0.03245207, (0 missing)
      fixed.acidity        < 8.45     to the right, improve=0.02382907, (0 missing)
      sulphates            < 0.355    to the left,  improve=0.01716880, (0 missing)
      total.sulfur.dioxide < 64       to the left,  improve=0.01698462, (0 missing)
  Surrogate splits:
      residual.sugar       < 7.425    to the right, agree=0.609, adj=0.133, (0 split)
      density              < 0.99809  to the right, agree=0.598, adj=0.110, (0 split)
      total.sulfur.dioxide < 168.5    to the right, agree=0.589, adj=0.090, (0 split)
      free.sulfur.dioxide  < 54.5     to the right, agree=0.580, adj=0.070, (0 split)
      alcohol              < 9.05     to the left,  agree=0.573, adj=0.054, (0 split)

Node number 6: 90 observations
  mean=5.388889, MSE=1.148765

Node number 7: 1266 observations,    complexity param=0.01870591
  mean=6.390995, MSE=0.6883549
  left son=14 (618 obs) right son=15 (648 obs)
  Primary splits:
      alcohol              < 11.76667 to the left,  improve=0.06190756, (0 missing)
      chlorides            < 0.0395   to the right, improve=0.02805413, (0 missing)
      fixed.acidity        < 7.35     to the right, improve=0.02717655, (0 missing)
      pH                   < 3.055    to the left,  improve=0.02432458, (0 missing)
      total.sulfur.dioxide < 191.5    to the right, improve=0.02239407, (0 missing)
  Surrogate splits:
      density              < 0.990885 to the right, agree=0.719, adj=0.424, (0 split)
      volatile.acidity     < 0.2775   to the left,  agree=0.641, adj=0.264, (0 split)
      chlorides            < 0.0365   to the right, agree=0.630, adj=0.243, (0 split)
      residual.sugar       < 1.475    to the left,  agree=0.575, adj=0.129, (0 split)
      total.sulfur.dioxide < 128.5    to the right, agree=0.572, adj=0.123, (0 split)

Node number 10: 445 observations
  mean=5.723596, MSE=0.4606792

Node number 11: 541 observations,    complexity param=0.01038014
  mean=6.042514, MSE=0.6174162
  left son=22 (455 obs) right son=23 (86 obs)
  Primary splits:
      density             < 0.99788  to the left,  improve=0.10490690, (0 missing)
      residual.sugar      < 12.575   to the left,  improve=0.09123744, (0 missing)
      alcohol             < 9.05     to the right, improve=0.08059059, (0 missing)
      fixed.acidity       < 8.45     to the right, improve=0.05552344, (0 missing)
      free.sulfur.dioxide < 26.5     to the left,  improve=0.05274526, (0 missing)
  Surrogate splits:
      residual.sugar       < 13.55    to the left,  agree=0.935, adj=0.593, (0 split)
      alcohol              < 9.15     to the right, agree=0.900, adj=0.372, (0 split)
      total.sulfur.dioxide < 189.5    to the left,  agree=0.847, adj=0.035, (0 split)
      pH                   < 2.965    to the right, agree=0.843, adj=0.012, (0 split)

Node number 14: 618 observations
  mean=6.179612, MSE=0.6813319

Node number 15: 648 observations
  mean=6.592593, MSE=0.611797

Node number 22: 455 observations
  mean=5.931868, MSE=0.5426108

Node number 23: 86 observations
  mean=6.627907, MSE=0.6057328


# Package that visualises rpart (regression tree) results.
# Install only when the package is missing, so that re-running the script
# does not download and reinstall it every time.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
library(rpart.plot)
# Draw the fitted tree, passing digits = 3 to rpart.plot.
rpart.plot(x = wine_rpart, digits = 3)


> # 4. 모델 평가 - regression tree가 테스트 데이터를 얼마나 잘 설명?
> wine_predict <-predict(wine_rpart, wine_test)
> summary(wine_predict)                                                  # 예측 quality의 기술 통계량 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  5.389   5.390   5.932   5.883   6.180   6.628

> summary(wine_test$quality)                                                  # 실제 quality의 기술 통계량 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  3.000   5.000   6.000   5.898   6.000   9.000

# 모델 성능 평가 1)
# 상관 계수(correlation coefficient):  -1 <= cor <= 1
> cor(wine_predict, wine_test$quality)
[1] 0.5354775

# 모델 성능 평가 2)
# MAE (Mean Absolute Error): the mean of the absolute values of the errors,
# where each error is (actual value - predicted value).
#
# actual:  observed values
# predict: predicted values
# Returns mean(abs(actual - predict)).
MAE <- function(actual, predict) {
  abs_error <- abs(actual - predict)
  mean(abs_error)
}

# Try the function on the regression tree's test-set predictions
MAE(wine_test$quality, wine_predict)           # the smaller the value, the more accurate the predictions
[1] 0.6037949

#5 모델 성능 향상
# 모델 트리(Model Tree):
# Regression Tree(분류) + Regression Modeling(회귀 모델 적용)
# 교재(R을 활용한 머신 러닝 2/e) : RWeka 패키지의 M5P 함수 사용
# Cubist 패키지: 규칙 학습 기반 분류 + M5P 알고리즘 회귀 모델 적용
# Install only when the package is missing, so that re-running the script
# does not download and reinstall it every time.
if (!requireNamespace("Cubist", quietly = TRUE)) {
  install.packages("Cubist")
}
library(Cubist)
# cubist(x = training predictors, y = training outcome)
# x must be the data frame without the dependent variable, so the 12th
# column (quality) is dropped; y is the vector of training labels.
wine_cubist <- cubist(x = wine_train[-12], y = wine_train$quality)

> wine_cubist

Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)

Number of samples: 3674
Number of predictors: 11

Number of committees: 1
Number of rules: 12

> summary(wine_cubist)

Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)


Cubist [Release 2.07 GPL Edition]  Tue Oct 15 11:55:22 2019
---------------------------------

    Target attribute `outcome'

Read 3674 cases (12 attributes) from undefined.data

Model:

  Rule 1: [75 cases, mean 5.1, range 3 to 6, est err 0.3]

    if
    volatile.acidity > 0.205
    citric.acid <= 0.27
    residual.sugar <= 17.85
    total.sulfur.dioxide > 133
    alcohol <= 9.1
    then    # 기존(회귀 트리)에는 노드의 평균값을 예측값으로 사용했던 것과 달리, 회귀식으로 값을 계산함
    outcome = 80.7 + 0.055 residual.sugar - 77 density + 1.33 sulphates
              + 0.0085 free.sulfur.dioxide - 0.0032 total.sulfur.dioxide
              + 0.016 alcohol - 0.19 volatile.acidity

  Rule 2: [61 cases, mean 5.3, range 5 to 6, est err 0.3]

    if
    volatile.acidity > 0.205
    residual.sugar > 17.85
    then
    outcome = -13.8 + 0.223 alcohol - 6 chlorides - 0.81 citric.acid
              - 0.58 volatile.acidity + 18 density - 0.11 pH

  Rule 3: [78 cases, mean 5.4, range 4 to 7, est err 0.4]

    if
    volatile.acidity > 0.205
    residual.sugar <= 17.85
    total.sulfur.dioxide <= 133
    alcohol <= 9.1
    then
    outcome = 295.9 - 294 density + 0.135 residual.sugar
              + 0.0201 free.sulfur.dioxide + 12.1 chlorides
              - 1.76 volatile.acidity + 0.061 alcohol

  Rule 4: [23 cases, mean 5.4, range 3 to 8, est err 0.9]

    if
    free.sulfur.dioxide > 93
    then
    outcome = -2.3 + 76.9 chlorides - 11.39 volatile.acidity + 0.898 alcohol
              - 0.0176 free.sulfur.dioxide

  Rule 5: [240 cases, mean 5.5, range 3 to 7, est err 0.5]

    if
    volatile.acidity > 0.205
    citric.acid > 0.27
    residual.sugar <= 17.85
    alcohol <= 9.1
    then
    outcome = -70.8 - 2.81 volatile.acidity + 77 density + 0.161 alcohol
              - 1.09 citric.acid - 0.14 pH - 0.02 fixed.acidity
              - 0.0007 free.sulfur.dioxide

  Rule 6: [1019 cases, mean 5.8, range 3 to 8, est err 0.5]

    if
    volatile.acidity > 0.155
    free.sulfur.dioxide > 30
    free.sulfur.dioxide <= 93
    total.sulfur.dioxide > 130
    sulphates <= 0.64
    alcohol > 9.1
    then
    outcome = 187.2 - 189 density + 0.08 residual.sugar + 0.164 alcohol
              + 1.08 pH + 0.14 fixed.acidity - 1.15 volatile.acidity
              + 0.77 sulphates - 0.001 total.sulfur.dioxide

  Rule 7: [1449 cases, mean 5.9, range 3 to 9, est err 0.6]

    if
    free.sulfur.dioxide <= 30
    alcohol > 9.1
    then
    outcome = 145.2 + 0.0278 free.sulfur.dioxide - 146 density
              + 0.084 residual.sugar + 0.197 alcohol - 1.46 volatile.acidity
              + 0.82 pH + 0.41 citric.acid + 0.4 sulphates

  Rule 8: [165 cases, mean 5.9, range 3 to 8, est err 0.6]

    if
    volatile.acidity > 0.155
    free.sulfur.dioxide > 30
    free.sulfur.dioxide <= 93
    sulphates > 0.64
    then
    outcome = 225.9 - 229 density + 0.077 residual.sugar + 2.57 sulphates
              + 0.31 fixed.acidity - 0.0043 total.sulfur.dioxide + 1.18 pH
              - 0.26 volatile.acidity + 0.014 alcohol
              + 0.0005 free.sulfur.dioxide

  Rule 9: [315 cases, mean 6.2, range 4 to 8, est err 0.6]

    if
    volatile.acidity > 0.155
    free.sulfur.dioxide > 30
    total.sulfur.dioxide <= 130
    pH <= 3.26
    sulphates <= 0.64
    alcohol > 9.1
    then
    outcome = 84 + 0.222 alcohol + 2.24 sulphates - 84 density
              + 0.043 residual.sugar + 0.002 total.sulfur.dioxide + 0.36 pH
              + 0.06 fixed.acidity

  Rule 10: [105 cases, mean 6.3, range 4 to 8, est err 0.5]

    if
    volatile.acidity <= 0.155
    free.sulfur.dioxide > 30
    then
    outcome = 155.2 - 154 density + 0.057 residual.sugar + 0.68 pH
              + 0.9 sulphates + 0.1 fixed.acidity - 0.68 volatile.acidity
              + 0.044 alcohol - 0.001 total.sulfur.dioxide
              + 0.001 free.sulfur.dioxide

  Rule 11: [90 cases, mean 6.5, range 4 to 8, est err 0.6]

    if
    volatile.acidity <= 0.205
    alcohol <= 9.1
    then
    outcome = -219.2 + 229 density + 25.3 chlorides - 2.73 citric.acid
              - 1.02 pH

  Rule 12: [135 cases, mean 6.5, range 4 to 9, est err 0.6]

    if
    volatile.acidity > 0.155
    free.sulfur.dioxide > 30
    total.sulfur.dioxide <= 130
    pH > 3.26
    sulphates <= 0.64
    then
    outcome = 562.5 - 564 density + 0.199 residual.sugar
              + 0.0173 total.sulfur.dioxide + 0.56 fixed.acidity
              - 0.315 alcohol + 1.72 volatile.acidity + 1.21 sulphates


Evaluation on training data (3674 cases):

    Average  |error|                0.5
    Relative |error|               0.69
    Correlation coefficient        0.63


    Attribute usage:
      Conds  Model

       87%    98%    alcohol
       86%    57%    free.sulfur.dioxide
       61%    89%    volatile.acidity
       44%    87%    sulphates
       43%    48%    total.sulfur.dioxide
       12%    89%    residual.sugar
       12%    92%    pH
        8%    49%    citric.acid
              99%    density
              53%    fixed.acidity
               7%    chlorides


Time: 0.2 secs

> # 모델 트리의 성능 테스트 !!
> wine_predict2 <- predict(wine_cubist, wine_test)
> head(wine_predict2)
[1] 4.938196 5.731961 5.425939 5.261845 5.373295 5.962343

> summary(wine_test$quality)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  3.000   5.000   6.000   5.898   6.000   9.000

> summary(wine_predict2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  4.239   5.462   5.832   5.850   6.228   7.416

> summary(wine_predict)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  5.389   5.390   5.932   5.883   6.180   6.628

> # 상관 계수                                           # 상관계수 wine_predict 보다 증가
> cor(wine_predict2, wine_test$quality)
[1] 0.640193

# MAE: mean absolute error - lower than the MAE obtained for wine_predict.
# Arguments are passed as (actual, predict) to match the MAE signature.
> MAE(wine_test$quality, wine_predict2)
[1] 0.5379868
--> 성능 증가