# Read the CSV file into a data frame.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 (where the
# read.csv default became FALSE), matching the str() output shown below.
> credit <- read.csv(file = 'mlwr/credit.csv',
+ encoding = 'UTF-8',
+ stringsAsFactors = TRUE)
> str(credit)
'data.frame': 1000 obs. of 17 variables:
$ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
$ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
$ credit_history : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
$ purpose : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
$ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
$ savings_balance : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
$ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
$ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
$ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
$ age : int 67 22 49 45 53 35 53 35 61 28 ...
$ other_credit : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
$ housing : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
$ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
$ job : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
$ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
$ phone : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
$ default : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
> head(credit)
checking_balance months_loan_duration credit_history purpose amount savings_balance
1 < 0 DM 6 critical furniture/appliances 1169 unknown
2 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM
3 unknown 12 critical education 2096 < 100 DM
4 < 0 DM 42 good furniture/appliances 7882 < 100 DM
5 < 0 DM 24 poor car 4870 < 100 DM
6 unknown 36 good education 9055 unknown
employment_duration percent_of_income years_at_residence age other_credit housing existing_loans_count
1 > 7 years 4 4 67 none own 2
2 1 - 4 years 2 2 22 none own 1
3 4 - 7 years 2 3 49 none own 1
4 4 - 7 years 2 4 45 none other 1
5 1 - 4 years 3 4 53 none other 2
6 1 - 4 years 2 4 35 none other 1
job dependents phone default
1 skilled 1 yes no
2 skilled 1 no yes
3 unskilled 2 no no
4 skilled 2 no no
5 skilled 2 no yes
6 unskilled 2 yes no
> # 대출 상환 능력과 관계가 있을 것 같은 변수들?
> # 범주형 변수(특징) -> table(도수분포표)
> # 수치형 변수(특징) -> summary(기술 통계량)
> table(credit$checking_balance)
< 0 DM > 200 DM 1 - 200 DM unknown
274 63 269 394
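> table(credit$saving_balance)
< table of extent 0 >
> # 주의: 위 명령은 컬럼 이름 오타(saving_balance)로 NULL 이 전달되어 빈 테이블이 출력된 것.
> # 올바른 컬럼 이름은 savings_balance -> table(credit$savings_balance)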
> summary(credit$months_loan_duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.0 12.0 18.0 20.9 24.0 72.0
> summary(credit$amount)
Min. 1st Qu. Median Mean 3rd Qu. Max.
250 1366 2320 3271 3972 18424
> # 채무 불이행(default) 빈도수
> table(credit$default)
no yes
700 300
> head(credit$amount, n = 10)
[1] 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234
> tail(credit$amount, n = 10)
[1] 3565 1569 1936 3959 2390 1736 3857 804 1845 4576
> tail(credit$default, n = 10)
[1] no no no no no no no no yes no
Levels: no yes
# random sampling : 정렬된 데이터를 무작위 추출
> sample(1000,10)
[1] 412 776 671 90 190 792 293 314 339 596 # 데이터의 행번호로 사용
> # 학습 데이터 세트와 테스트 데이터 세트(9:1)를 준비
> rows <- sample(1000, 900)
> train_set <- credit[rows,]
> test_set <- credit[-rows,]
> # 학습과 테스트 데이터 세트에서 default 비율을 확인
> prop.table(table(train_set$default))
no yes
0.7011111 0.2988889
> prop.table(table(test_set$default))
no yes
0.69 0.31
# Package used for decision trees (C5.0 algorithm).
# Install only when the package is missing, so re-running the script does not
# reinstall it every time.
if (!requireNamespace("C50", quietly = TRUE)) {
  install.packages("C50")
}
library(C50)
# C50::C5.0(train, class, trials = 1, costs = NULL)
# train : 학습 데이터 세트
# class : 학습 데이터의 레이블 (분류에 대한 정답 - 현 예제에서는 default값)
# trials, costs : 옵션 -> 성능 개선
# 학습 데이터 세트에서는 데이터의 레이블(클래스)를 제거해야 함!
>credit_model <- C5.0(train_set[-17], train_set$default)
>credit_model
Call:
C5.0.default(x = train_set[-17], y = train_set$default)
Classification Tree
Number of samples: 900
Number of predictors: 16
Tree size: 57
Non-standard options: attempt to group attributes
> summary(credit_model)
Call:
C5.0.default(x = train_set[-17], y = train_set$default)
C5.0 [Release 2.07 GPL Edition] Fri Oct 11 11:51:30 2019
-------------------------------
Class specified by attribute `outcome'
Read 900 cases (17 attributes) from undefined.data
Decision tree: (해당 가지의 사람 수/ 오류 명수). default = no ( 채무불이행 x), yes(채무불이행 o)
checking_balance = unknown: no (362/42)
checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
:...amount > 8648: yes (32/7)
amount <= 8648:
:...credit_history in {perfect,very good}:
:...housing in {other,rent}: yes (24/2)
: housing = own:
: :...purpose in {car0,renovations}: yes (2)
: purpose = education: no (2)
: purpose = business:
: :...job in {management,skilled}: no (6/1)
: : job in {unemployed,unskilled}: yes (2)
: purpose = car:
: :...months_loan_duration <= 18: yes (6)
: : months_loan_duration > 18:
: : :...job in {management,skilled,unemployed}: no (3)
: : job = unskilled: yes (1)
: purpose = furniture/appliances:
: :...other_credit = none: yes (2)
: other_credit = store: no (4)
: other_credit = bank:
: :...checking_balance = < 0 DM: yes (3)
: checking_balance in {> 200 DM,1 - 200 DM}: no (3)
credit_history in {critical,good,poor}:
:...months_loan_duration <= 15:
:...purpose in {business,car0}: no (10)
: purpose = furniture/appliances:
: :...savings_balance in {< 100 DM,> 1000 DM,500 - 1000 DM,
: : : unknown}: no (95/17)
: : savings_balance = 100 - 500 DM: yes (4)
: purpose = renovations:
: :...other_credit in {bank,none}: no (6/1)
: : other_credit = store: yes (1)
: purpose = education:
: :...savings_balance = > 1000 DM: yes (1)
: : savings_balance in {100 - 500 DM,500 - 1000 DM,
: : : unknown}: no (3)
: : savings_balance = < 100 DM:
: : :...phone = no: yes (4)
: : phone = yes:
: : :...credit_history = critical: yes (1)
: : credit_history in {good,poor}: no (3)
: purpose = car:
: :...savings_balance in {> 1000 DM,500 - 1000 DM,
: : unknown}: no (16/1)
: savings_balance = 100 - 500 DM:
: :...credit_history in {critical,poor}: no (2)
: : credit_history = good: yes (4/1)
: savings_balance = < 100 DM:
: :...credit_history in {critical,poor}: no (26/4)
: credit_history = good:
: :...percent_of_income > 2: yes (14/4)
: percent_of_income <= 2:
: :...amount <= 1386: yes (3)
: amount > 1386: no (7)
months_loan_duration > 15:
:...savings_balance = > 1000 DM: no (8)
savings_balance = unknown:
:...checking_balance in {> 200 DM,1 - 200 DM}: no (28/3)
: checking_balance = < 0 DM:
: :...job in {management,unemployed}: yes (2)
: job = unskilled: no (2)
: job = skilled:
: :...phone = no: yes (7/1)
: phone = yes: no (5/1)
savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM}:
:...checking_balance = > 200 DM:
:...dependents <= 1: no (13/2)
: dependents > 1: yes (2)
checking_balance = < 0 DM:
:...job in {skilled,unemployed,unskilled}: yes (79/24)
: job = management:
: :...phone = no: yes (3/1)
: phone = yes: no (15/2)
checking_balance = 1 - 200 DM:
:...years_at_residence <= 1:
:...housing in {other,own}: no (11/1)
: housing = rent: yes (1)
years_at_residence > 1:
:...percent_of_income <= 1: no (6)
percent_of_income > 1:
:...purpose in {car0,education,
: renovations}: no (5)
purpose = business: [S1]
purpose = car:
:...other_credit = store: yes (1)
: other_credit = bank: [S2]
: other_credit = none:
: :...age <= 29: yes (9/2)
: age > 29: no (9/1)
purpose = furniture/appliances: [S3]
SubTree [S1]
employment_duration in {> 7 years,1 - 4 years}: yes (7/2)
employment_duration in {< 1 year,4 - 7 years,unemployed}: no (3)
SubTree [S2]
savings_balance in {< 100 DM,500 - 1000 DM}: yes (3)
savings_balance = 100 - 500 DM: no (1)
SubTree [S3]
employment_duration in {< 1 year,1 - 4 years}: yes (14/1)
employment_duration in {> 7 years,unemployed}: no (8/2)
employment_duration = 4 - 7 years:
:...housing in {other,own}: yes (4)
housing = rent: no (2)
Evaluation on training data (900 cases):
Decision Tree
----------------
Size Errors
57 123(13.7%) <<
(a) (b) <-classified as # 모델 추측값
---- ----
586 45 (a): class no # default 실제로 no
78 191 (b): class yes # default 실제로 yes
Attribute usage:
100.00% checking_balance
59.78% amount
56.22% credit_history
50.89% months_loan_duration
47.89% savings_balance
33.33% purpose
13.89% job
10.67% percent_of_income
9.33% years_at_residence
8.44% housing
4.67% other_credit
4.22% employment_duration
4.22% phone
2.00% age
1.67% dependents
Time: 0.0 secs
--> 중요성 순서대로 특성(column) 기준으로 분류
# Evaluate the model using the test data.
# stats::predict(model, test data set)
# The label column (17) is dropped so this call matches the other predict()
# calls in this file.
credit_pred <- predict(credit_model, test_set[-17])
table(credit_pred)
credit_pred
no yes
66 34
> # 테스트 평가 결과 - 이원 교차표
> library(gmodels)
> CrossTable(x = test_set$default, y = credit_pred)
Total Observations in Table: 100
○ 적응형 부스팅(Adaptive boosting: AdaBoost): 여러개의 의사결정 트리를 만들어서 각 예시에 대해
최고의 클래스를 투표하게 만드는 과정
○ 비용 행렬(Cost matrix): 오류 유형에 페널티를 줌
# 1) Change the number of decision trees --> the decision is made by a
#    for/against vote among the trees.
credit_boost <- C5.0(train_set[-17], train_set$default,
                     trials = 10)
credit_boost
# Evaluate the performance of the model with AdaBoost applied.
credit_boost_predict <- predict(credit_boost, test_set[-17])
table(credit_boost_predict)
# Rows = actual labels, columns = predicted labels; FALSE is spelled out
# because the shorthand F is an ordinary, reassignable variable.
CrossTable(x = test_set$default,
           y = credit_boost_predict,
           prop.chisq = FALSE)
# 2) 비용 행렬(cost matrix) 사용
# 발생할 수 있는 오류에 페널티를 추가.
# 비용 행렬: 페널티 값들로 이루어진 행렬
> # 비용 행렬의 행/열의 이름
> matrix_dimname <- list(predict = c('no', 'yes'),
+ actual = c('no', 'yes'))
> # 비용 행렬
> cost_matrix <- matrix(data = c(0, 1, 4, 0), # 컬럼 1행1열 ->2행 1열-> 1행2열 ->2행2열 : 위에서 아래방향
+ nrow = 2,
+ dimnames = matrix_dimname) # byrow = T 로 설정하면 1행1열-> 1행2열 : 왼쪽에서 오른쪽 방향으로 입력
> cost_matrix
actual
predict no yes
no 0 4 # 패널티 점수 (임의 값) . 디폴트 할 사람에게 돈을 빌려주는 것이 은행에게는
yes 1 0 # 돈 갚을 사람에게 돈을 안 빌려주는 것 보다 4배 치명적
# Apply the cost matrix when training the model.
credit_cost <- C5.0(train_set[-17], train_set$default,
                    costs = cost_matrix)
# Cross-tabulate actual labels (rows) against predicted labels (columns).
# y must be the vector of predicted classes for the test set, not the fitted
# model object itself, so the predictions are computed here.
CrossTable(x = test_set$default,
           y = predict(credit_cost, test_set[-17]),
           prop.chisq = FALSE)
credit_cost
summary(credit_cost)
Call:
C5.0.default(x = train_set[-17], y = train_set$default, costs = cost_matrix)
C5.0 [Release 2.07 GPL Edition] Fri Oct 11 14:45:09 2019
-------------------------------
Class specified by attribute `outcome'
Read 900 cases (17 attributes) from undefined.data
Read misclassification costs from undefined.costs
Decision tree:
checking_balance = unknown:
:...other_credit in {bank,store}:
: :...purpose in {business,car,education,renovations}: yes (31/17)
: : purpose in {car0,furniture/appliances}:
: : :...years_at_residence > 2: no (16)
: : years_at_residence <= 2:
: : :...housing = other: no (0)
: : housing = rent: yes (1)
: : housing = own:
: : :...age <= 33: no (8)
: : age > 33: yes (1)
: other_credit = none:
: :...credit_history in {perfect,very good}: no (5)
: credit_history = critical:
: :...amount <= 6967: no (115/1)
: : amount > 6967: yes (8/6)
: credit_history = poor:
: :...years_at_residence <= 1: yes (2)
: : years_at_residence > 1:
: : :...percent_of_income <= 3: no (13)
: : percent_of_income > 3: yes (13/8)
: credit_history = good:
: :...existing_loans_count <= 1: no (132/10)
: existing_loans_count > 1:
: :...percent_of_income <= 2: yes (6/1)
: percent_of_income > 2:
: :...amount <= 9055: no (10)
: amount > 9055: yes (1)
checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
:...savings_balance in {> 1000 DM,500 - 1000 DM,unknown}:
:...purpose = business: no (10)
: purpose in {car,car0,education,furniture/appliances,renovations}:
: :...housing = other: yes (12/6)
: housing = own:
: :...employment_duration = < 1 year: no (10)
: : employment_duration in {> 7 years,1 - 4 years,4 - 7 years,
: : unemployed}: yes (63/44)
: housing = rent:
: :...existing_loans_count > 1: no (8)
: existing_loans_count <= 1:
: :...percent_of_income <= 2: no (7)
: percent_of_income > 2: yes (5/1)
savings_balance in {< 100 DM,100 - 500 DM}:
:...credit_history in {perfect,very good}: yes (52/12)
credit_history in {critical,good,poor}:
:...months_loan_duration > 15: yes (210/97)
months_loan_duration <= 15:
:...purpose in {business,car0}: no (9)
purpose = education: yes (8/3)
purpose = renovations:
:...savings_balance = < 100 DM: yes (3/2)
: savings_balance = 100 - 500 DM: no (3)
purpose = car:
:...credit_history in {good,poor}: yes (33/14)
: credit_history = critical:
: :...dependents <= 1: yes (14/11)
: dependents > 1: no (11)
purpose = furniture/appliances:
:...savings_balance = 100 - 500 DM: yes (4)
savings_balance = < 100 DM:
:...employment_duration in {4 - 7 years,
: unemployed}: no (12)
employment_duration in {< 1 year,> 7 years,1 - 4 years}:
:...existing_loans_count > 1:
:...age <= 54: no (10)
: age > 54: yes (1)
existing_loans_count <= 1:
:...credit_history = critical: yes (2/1)
credit_history = poor: no (1)
credit_history = good:
:...months_loan_duration <= 7: no (11)
months_loan_duration > 7: [S1]
SubTree [S1]
checking_balance in {> 200 DM,1 - 200 DM}: yes (24/15)
checking_balance = < 0 DM:
:...years_at_residence <= 3: no (11)
years_at_residence > 3: yes (4/2)
Evaluation on training data (900 cases):
Decision Tree
-----------------------
Size Errors Cost
41 251(27.9%) 0.32 <<
(a) (b) <-classified as
---- ----
391 240 (a): class no # 종전값 45
11 258 (b): class yes # 종전값 78
Attribute usage:
100.00% checking_balance
80.89% credit_history
59.78% savings_balance
41.22% months_loan_duration
40.22% other_credit
37.00% purpose
25.89% existing_loans_count
16.56% employment_duration
14.89% amount
12.78% housing
7.67% years_at_residence
6.11% percent_of_income
2.78% dependents
2.22% age
Time: 0.0 secs
# Predict on the test set with the cost-matrix model, then cross-tabulate
# actual labels (rows) against predictions (columns) with the chi-square,
# column and row proportions switched off.
credit_cost_predict <- predict(credit_cost, test_set[-17])
CrossTable(x = test_set$default,
           y = credit_cost_predict,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE)
(2,1) : 10 -> 2 로 감소
# Naive Bayes classification of poisonous mushrooms
# stringsAsFactors = TRUE: on R >= 4.0 text columns are otherwise read as
# character; as factors, the train and test subsets share the same levels.
mushroom <- read.csv(file = 'mlwr/mushrooms.csv',
                     encoding = 'UTF-8',
                     stringsAsFactors = TRUE)
library(e1071)    # naiveBayes()
library(gmodels)  # CrossTable()
str(mushroom)
head(mushroom)
# Split the data: column 1 is used as the label, columns 2-23 as features.
# NOTE(review): this is a sequential (non-random) split that trains on only the
# first 1114 rows -- confirm this is intended; the credit example earlier in
# this file samples rows randomly.
train_rows <- 1:1114
test_rows <- 1115:nrow(mushroom)
mushroom_train <- mushroom[train_rows, 2:23]
mushroom_test <- mushroom[test_rows, 2:23]
# Labels (answer key) for each subset
mushroom_train_label <- mushroom[train_rows, 1]
mushroom_test_label <- mushroom[test_rows, 1]
# Fit the Naive Bayes classifier and predict the test set
mush_classifier <- naiveBayes(mushroom_train, mushroom_train_label)
mush_pred <- predict(mush_classifier, mushroom_test)
# Classification result
CrossTable(x = mushroom_test_label, # x = rows, y = columns
           y = mush_pred,
           prop.chisq = FALSE)