# 1. Data preparation
# The workspace is deliberately not cleared with rm(list = ls()): doing so
# silently deletes whatever objects the caller already has loaded.
# Packages whose functions this script calls:
library(rpart)      # rpart(): recursive partitioning (regression tree)
library(rpart.plot) # rpart.plot(): plots a fitted rpart tree
library(Cubist)     # cubist(): rule-based model tree

# Read the white wine data set; the path is relative to the working directory.
wine <- read.csv(file = "mlwr/whitewines.csv")
#2 데이터 확인 전처리
# 4,898 obs.(예시), 12variables(특징) - white wine 데이터
# Descriptive statistics of every column (console output kept as comments).
summary(wine)
# fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
# Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600 Min. :0.00900 Min. : 2.00
# 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700 1st Qu.:0.03600 1st Qu.: 23.00
# Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200 Median :0.04300 Median : 34.00
# Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391 Mean :0.04577 Mean : 35.31
# 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900 3rd Qu.:0.05000 3rd Qu.: 46.00
# Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800 Max. :0.34600 Max. :289.00
# total.sulfur.dioxide density pH sulphates alcohol quality
# Min. : 9.0 Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
# 1st Qu.:108.0 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 1st Qu.:5.000
# Median :134.0 Median :0.9937 Median :3.180 Median :0.4700 Median :10.40 Median :6.000
# Mean :138.4 Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51 Mean :5.878
# 3rd Qu.:167.0 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 3rd Qu.:6.000
# Max. :440.0 Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20 Max. :9.000
# 종속 변수 (quality)의 분포
#regression tree를 사용하기 위한 패키지
#rpart : recursive partitioning
#3 모델 학습
# 학습 데이터 세트(75%)/테스트 데이터 세트(25%)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide
1 6.7 0.62 0.24 1.10 0.039 6
2 5.7 0.22 0.20 16.00 0.044 41
3 5.9 0.19 0.26 7.40 0.034 33
4 5.3 0.47 0.10 1.30 0.036 11
5 6.4 0.29 0.21 9.65 0.041 36
6 7.0 0.14 0.41 0.90 0.037 22
total.sulfur.dioxide density pH sulphates alcohol quality
1 62 0.99340 3.41 0.32 10.40000 5
2 113 0.99862 3.22 0.46 8.90000 6
3 123 0.99500 3.49 0.42 10.10000 6
4 74 0.99082 3.48 0.54 11.20000 4
5 119 0.99334 2.99 0.34 10.93333 6
6 95 0.99140 3.25 0.43 10.90000 6
# Split the rows 75% / 25% into training and test sets.
# With 4898 rows, ceiling(4898 * 0.75) = 3674 training rows, which matches the
# n = 3674 reported by the fitted tree.
# NOTE(review): the original split statements are missing from this file; a
# positional split (first 75% train, remainder test) is assumed, which in turn
# presumes the CSV rows are already in random order -- confirm.
n_train <- ceiling(nrow(wine) * 0.75)
wine_train <- wine[seq_len(n_train), ]
wine_test <- wine[-seq_len(n_train), ]

# Fit a regression tree on the training set with the rpart package:
# quality is the outcome, every other column is a predictor.
wine_rpart <- rpart(formula = quality ~ ., data = wine_train)
# Print the fitted tree (node, split, n, deviance, yval).
wine_rpart
n= 3674
node), split, n, deviance, yval
* denotes terminal node
1) root 3674 2884.10500 5.871257
2) alcohol< 10.85 2318 1383.39300 5.606126
4) volatile.acidity>=0.2425 1332 672.77700 5.389640 *
5) volatile.acidity< 0.2425 986 563.85800 5.898580
10) volatile.acidity>=0.2075 445 205.00220 5.723596 *
11) volatile.acidity< 0.2075 541 334.02220 6.042514
22) density< 0.99788 455 246.88790 5.931868 *
23) density>=0.99788 86 52.09302 6.627907 *
3) alcohol>=10.85 1356 1059.22700 6.324484
6) free.sulfur.dioxide< 11.5 90 103.38890 5.388889 *
7) free.sulfur.dioxide>=11.5 1266 871.45730 6.390995
14) alcohol< 11.76667 618 421.06310 6.179612 *
15) alcohol>=11.76667 648 396.44440 6.592593 *
> summary(wine_rpart)
rpart(formula = quality ~ ., data = wine_train)
n= 3674
CP nsplit rel error xerror xstd
1 0.15307510 0 1.0000000 1.0014660 0.02477931
2 0.05088510 1 0.8469249 0.8484126 0.02361649
3 0.02925723 2 0.7960398 0.8099456 0.02310113
4 0.01870591 3 0.7667826 0.7825864 0.02184668
5 0.01038014 4 0.7480767 0.7630470 0.02104674
6 0.01000000 6 0.7273164 0.7458542 0.02069839
Variable importance
alcohol density volatile.acidity chlorides total.sulfur.dioxide
34 23 12 11 7
free.sulfur.dioxide residual.sugar sulphates
6 5 1
Node number 1: 3674 observations, complexity param=0.1530751
mean=5.871257, MSE=0.785004
left son=2 (2318 obs) right son=3 (1356 obs)
Primary splits:
alcohol < 10.85 to the left, improve=0.15307510, (0 missing)
density < 0.992035 to the right, improve=0.10785780, (0 missing)
chlorides < 0.0395 to the right, improve=0.07512851, (0 missing)
total.sulfur.dioxide < 158.5 to the right, improve=0.03991005, (0 missing)
citric.acid < 0.235 to the left, improve=0.03608134, (0 missing)
Surrogate splits:
density < 0.991995 to the right, agree=0.869, adj=0.645, (0 split)
chlorides < 0.0375 to the right, agree=0.755, adj=0.337, (0 split)
total.sulfur.dioxide < 103.5 to the right, agree=0.688, adj=0.154, (0 split)
residual.sugar < 5.375 to the right, agree=0.667, adj=0.099, (0 split)
sulphates < 0.345 to the right, agree=0.645, adj=0.038, (0 split)
Node number 2: 2318 observations, complexity param=0.0508851
mean=5.606126, MSE=0.5968046
left son=4 (1332 obs) right son=5 (986 obs)
Primary splits:
volatile.acidity < 0.2425 to the right, improve=0.10608550, (0 missing)
free.sulfur.dioxide < 13.5 to the left, improve=0.03385955, (0 missing)
citric.acid < 0.235 to the left, improve=0.03255713, (0 missing)
alcohol < 10.11667 to the left, improve=0.03155448, (0 missing)
chlorides < 0.0585 to the right, improve=0.01654644, (0 missing)
Surrogate splits:
total.sulfur.dioxide < 136.5 to the right, agree=0.618, adj=0.101, (0 split)
pH < 3.295 to the left, agree=0.595, adj=0.049, (0 split)
sulphates < 0.675 to the left, agree=0.592, adj=0.042, (0 split)
alcohol < 10.11667 to the left, agree=0.587, adj=0.029, (0 split)
density < 0.999435 to the left, agree=0.583, adj=0.019, (0 split)
Node number 3: 1356 observations, complexity param=0.02925723
mean=6.324484, MSE=0.781141
left son=6 (90 obs) right son=7 (1266 obs)
Primary splits:
free.sulfur.dioxide < 11.5 to the left, improve=0.07966271, (0 missing)
alcohol < 11.76667 to the left, improve=0.06087662, (0 missing)
total.sulfur.dioxide < 67.5 to the left, improve=0.04486822, (0 missing)
residual.sugar < 1.375 to the left, improve=0.02973624, (0 missing)
fixed.acidity < 7.35 to the right, improve=0.02671823, (0 missing)
Surrogate splits:
total.sulfur.dioxide < 53.5 to the left, agree=0.948, adj=0.211, (0 split)
volatile.acidity < 0.875 to the right, agree=0.935, adj=0.022, (0 split)
Node number 4: 1332 observations
mean=5.38964, MSE=0.5050879
Node number 5: 986 observations, complexity param=0.01038014
mean=5.89858, MSE=0.5718641
left son=10 (445 obs) right son=11 (541 obs)
Primary splits:
volatile.acidity < 0.2075 to the right, improve=0.04404226, (0 missing)
free.sulfur.dioxide < 24.5 to the left, improve=0.03245207, (0 missing)
fixed.acidity < 8.45 to the right, improve=0.02382907, (0 missing)
sulphates < 0.355 to the left, improve=0.01716880, (0 missing)
total.sulfur.dioxide < 64 to the left, improve=0.01698462, (0 missing)
Surrogate splits:
residual.sugar < 7.425 to the right, agree=0.609, adj=0.133, (0 split)
density < 0.99809 to the right, agree=0.598, adj=0.110, (0 split)
total.sulfur.dioxide < 168.5 to the right, agree=0.589, adj=0.090, (0 split)
free.sulfur.dioxide < 54.5 to the right, agree=0.580, adj=0.070, (0 split)
alcohol < 9.05 to the left, agree=0.573, adj=0.054, (0 split)
Node number 6: 90 observations
mean=5.388889, MSE=1.148765
Node number 7: 1266 observations, complexity param=0.01870591
mean=6.390995, MSE=0.6883549
left son=14 (618 obs) right son=15 (648 obs)
Primary splits:
alcohol < 11.76667 to the left, improve=0.06190756, (0 missing)
chlorides < 0.0395 to the right, improve=0.02805413, (0 missing)
fixed.acidity < 7.35 to the right, improve=0.02717655, (0 missing)
pH < 3.055 to the left, improve=0.02432458, (0 missing)
total.sulfur.dioxide < 191.5 to the right, improve=0.02239407, (0 missing)
Surrogate splits:
density < 0.990885 to the right, agree=0.719, adj=0.424, (0 split)
volatile.acidity < 0.2775 to the left, agree=0.641, adj=0.264, (0 split)
chlorides < 0.0365 to the right, agree=0.630, adj=0.243, (0 split)
residual.sugar < 1.475 to the left, agree=0.575, adj=0.129, (0 split)
total.sulfur.dioxide < 128.5 to the right, agree=0.572, adj=0.123, (0 split)
Node number 10: 445 observations
mean=5.723596, MSE=0.4606792
Node number 11: 541 observations, complexity param=0.01038014
mean=6.042514, MSE=0.6174162
left son=22 (455 obs) right son=23 (86 obs)
Primary splits:
density < 0.99788 to the left, improve=0.10490690, (0 missing)
residual.sugar < 12.575 to the left, improve=0.09123744, (0 missing)
alcohol < 9.05 to the right, improve=0.08059059, (0 missing)
fixed.acidity < 8.45 to the right, improve=0.05552344, (0 missing)
free.sulfur.dioxide < 26.5 to the left, improve=0.05274526, (0 missing)
Surrogate splits:
residual.sugar < 13.55 to the left, agree=0.935, adj=0.593, (0 split)
alcohol < 9.15 to the right, agree=0.900, adj=0.372, (0 split)
total.sulfur.dioxide < 189.5 to the left, agree=0.847, adj=0.035, (0 split)
pH < 2.965 to the right, agree=0.843, adj=0.012, (0 split)
Node number 14: 618 observations
mean=6.179612, MSE=0.6813319
Node number 15: 648 observations
mean=6.592593, MSE=0.611797
Node number 22: 455 observations
mean=5.931868, MSE=0.5426108
Node number 23: 86 observations
mean=6.627907, MSE=0.6057328
# rpart.plot package: draws the fitted rpart (regression tree) result.
# Numbers in the plot are shown with 3 significant digits.
rpart.plot(wine_rpart, digits = 3)
# 4. Model evaluation: how well does the regression tree explain the test data?
# (Console prompts removed; console output kept as comments.)
wine_predict <- predict(wine_rpart, wine_test)
summary(wine_predict) # descriptive statistics of the predicted quality
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 5.389 5.390 5.932 5.883 6.180 6.628
summary(wine_test$quality) # descriptive statistics of the actual quality
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 3.000 5.000 6.000 5.898 6.000 9.000

# Performance measure 1)
# Correlation coefficient: -1 <= cor <= 1
cor(wine_predict, wine_test$quality)
# [1] 0.5354775
# 모델 성능 평가 2)
# MAE(Mean Absolute Error): 평균 절대 오차
# 오차(실제값 - 예측값)들의 절대값의 평균
# Mean absolute error: the mean of the absolute differences between the
# actual values and the predicted values.
#   actual  - observed values
#   predict - predicted values
# Returns mean(abs(actual - predict)); smaller means more accurate.
MAE <- function(actual, predict) {
  abs_errors <- abs(actual - predict)
  mean(abs_errors)
}
# Try the function on the regression-tree predictions;
# the smaller the value, the more accurate the predictions.
MAE(wine_test$quality, wine_predict)
# [1] 0.6037949
#5 모델 성능 향상
# 모델 트리(Model Tree):
# Regression Tree(분류) + Regression Modeling(회귀 모델 적용)
# 교재(R을 활용한 머신 러닝 2/e) : RWeka 패키지의 M5P 함수 사용
# Cubist 패키지: 규칙 학습 기반 분류 + M5P 알고리즘 회귀 모델 적용
# cubist(x = training predictors, y = training outcome)
# x must be a data frame without the dependent variable. The "quality" column
# is dropped by name rather than by position (it was column 12), so the call
# stays correct even if the column order of the CSV changes.
wine_predictors <- setdiff(names(wine_train), "quality")
wine_cubist <- cubist(x = wine_train[wine_predictors], y = wine_train$quality)
> wine_cubist
cubist.default(x = wine_train[-12], y = wine_train$quality)
Number of samples: 3674
Number of predictors: 11
Number of committees: 1
Number of rules: 12
> summary(wine_cubist)
cubist.default(x = wine_train[-12], y = wine_train$quality)
Cubist [Release 2.07 GPL Edition] Tue Oct 15 11:55:22 2019
Target attribute `outcome'
Read 3674 cases (12 attributes) from undefined.data
Rule 1: [75 cases, mean 5.1, range 3 to 6, est err 0.3]
volatile.acidity > 0.205
citric.acid <= 0.27
residual.sugar <= 17.85
total.sulfur.dioxide > 133
alcohol <= 9.1
then # 기존에 평균 값을 사용했던 것과 달리 회귀분석을 이용해 값을 대입함
outcome = 80.7 + 0.055 residual.sugar - 77 density + 1.33 sulphates
+ 0.0085 free.sulfur.dioxide - 0.0032 total.sulfur.dioxide
+ 0.016 alcohol - 0.19 volatile.acidity
Rule 2: [61 cases, mean 5.3, range 5 to 6, est err 0.3]
volatile.acidity > 0.205
residual.sugar > 17.85
outcome = -13.8 + 0.223 alcohol - 6 chlorides - 0.81 citric.acid
- 0.58 volatile.acidity + 18 density - 0.11 pH
Rule 3: [78 cases, mean 5.4, range 4 to 7, est err 0.4]
volatile.acidity > 0.205
residual.sugar <= 17.85
total.sulfur.dioxide <= 133
alcohol <= 9.1
outcome = 295.9 - 294 density + 0.135 residual.sugar
+ 0.0201 free.sulfur.dioxide + 12.1 chlorides
- 1.76 volatile.acidity + 0.061 alcohol
Rule 4: [23 cases, mean 5.4, range 3 to 8, est err 0.9]
free.sulfur.dioxide > 93
outcome = -2.3 + 76.9 chlorides - 11.39 volatile.acidity + 0.898 alcohol
- 0.0176 free.sulfur.dioxide
Rule 5: [240 cases, mean 5.5, range 3 to 7, est err 0.5]
volatile.acidity > 0.205
citric.acid > 0.27
residual.sugar <= 17.85
alcohol <= 9.1
outcome = -70.8 - 2.81 volatile.acidity + 77 density + 0.161 alcohol
- 1.09 citric.acid - 0.14 pH - 0.02 fixed.acidity
- 0.0007 free.sulfur.dioxide
Rule 6: [1019 cases, mean 5.8, range 3 to 8, est err 0.5]
volatile.acidity > 0.155
free.sulfur.dioxide > 30
free.sulfur.dioxide <= 93
total.sulfur.dioxide > 130
sulphates <= 0.64
alcohol > 9.1
outcome = 187.2 - 189 density + 0.08 residual.sugar + 0.164 alcohol
+ 1.08 pH + 0.14 fixed.acidity - 1.15 volatile.acidity
+ 0.77 sulphates - 0.001 total.sulfur.dioxide
Rule 7: [1449 cases, mean 5.9, range 3 to 9, est err 0.6]
free.sulfur.dioxide <= 30
alcohol > 9.1
outcome = 145.2 + 0.0278 free.sulfur.dioxide - 146 density
+ 0.084 residual.sugar + 0.197 alcohol - 1.46 volatile.acidity
+ 0.82 pH + 0.41 citric.acid + 0.4 sulphates
Rule 8: [165 cases, mean 5.9, range 3 to 8, est err 0.6]
volatile.acidity > 0.155
free.sulfur.dioxide > 30
free.sulfur.dioxide <= 93
sulphates > 0.64
outcome = 225.9 - 229 density + 0.077 residual.sugar + 2.57 sulphates
+ 0.31 fixed.acidity - 0.0043 total.sulfur.dioxide + 1.18 pH
- 0.26 volatile.acidity + 0.014 alcohol
+ 0.0005 free.sulfur.dioxide
Rule 9: [315 cases, mean 6.2, range 4 to 8, est err 0.6]
volatile.acidity > 0.155
free.sulfur.dioxide > 30
total.sulfur.dioxide <= 130
pH <= 3.26
sulphates <= 0.64
alcohol > 9.1
outcome = 84 + 0.222 alcohol + 2.24 sulphates - 84 density
+ 0.043 residual.sugar + 0.002 total.sulfur.dioxide + 0.36 pH
+ 0.06 fixed.acidity
Rule 10: [105 cases, mean 6.3, range 4 to 8, est err 0.5]
volatile.acidity <= 0.155
free.sulfur.dioxide > 30
outcome = 155.2 - 154 density + 0.057 residual.sugar + 0.68 pH
+ 0.9 sulphates + 0.1 fixed.acidity - 0.68 volatile.acidity
+ 0.044 alcohol - 0.001 total.sulfur.dioxide
+ 0.001 free.sulfur.dioxide
Rule 11: [90 cases, mean 6.5, range 4 to 8, est err 0.6]
volatile.acidity <= 0.205
alcohol <= 9.1
outcome = -219.2 + 229 density + 25.3 chlorides - 2.73 citric.acid
- 1.02 pH
Rule 12: [135 cases, mean 6.5, range 4 to 9, est err 0.6]
volatile.acidity > 0.155
free.sulfur.dioxide > 30
total.sulfur.dioxide <= 130
pH > 3.26
sulphates <= 0.64
outcome = 562.5 - 564 density + 0.199 residual.sugar
+ 0.0173 total.sulfur.dioxide + 0.56 fixed.acidity
- 0.315 alcohol + 1.72 volatile.acidity + 1.21 sulphates
Evaluation on training data (3674 cases):
Average |error| 0.5
Relative |error| 0.69
Correlation coefficient 0.63
Attribute usage:
Conds Model
87% 98% alcohol
86% 57% free.sulfur.dioxide
61% 89% volatile.acidity
44% 87% sulphates
43% 48% total.sulfur.dioxide
12% 89% residual.sugar
12% 92% pH
8% 49% citric.acid
99% density
53% fixed.acidity
7% chlorides
Time: 0.2 secs
# Performance test of the model tree.
# (Console prompts removed; console output kept as comments.)
wine_predict2 <- predict(wine_cubist, wine_test)
head(wine_predict2)
# [1] 4.938196 5.731961 5.425939 5.261845 5.373295 5.962343
summary(wine_test$quality)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 3.000 5.000 6.000 5.898 6.000 9.000
summary(wine_predict2)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 4.239 5.462 5.832 5.850 6.228 7.416
summary(wine_predict)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 5.389 5.390 5.932 5.883 6.180 6.628

# Correlation coefficient: higher than for wine_predict
cor(wine_predict2, wine_test$quality)
# [1] 0.640193

# MAE (mean absolute error): lower than for wine_predict.
# Arguments follow the MAE(actual, predict) order; the result is the same
# either way because the absolute difference is symmetric.
MAE(wine_test$quality, wine_predict2)
# [1] 0.5379868
# --> performance improved
'R > R 머신러닝' 카테고리의 다른 글
R27-2_블랙박스 방법: 신경망2 (시멘트 강도 학습) (0) | 2019.11.11 |
R27_블랙박스 방법: 신경망 (0) | 2019.11.08 |
R25_수치 데이터 예측 : : 회귀방법 3 - 미국 의료비 데이터 (0) | 2019.11.06 |
R24_수치 데이터 예측: 회귀 방법2 - 챌린저호의 사고 조사 데이터 (0) | 2019.11.05 |
R23_수치 데이터 예측: 회귀 방법 (0) | 2019.11.04 |