# Load the insurance data set.
# stringsAsFactors = TRUE keeps sex, smoker and region as factors on R >= 4.0,
# where read.csv() no longer converts strings to factors by default.
insurance <- read.csv('mlwr/insurance.csv', stringsAsFactors = TRUE)
# Inspect the structure of the data
str(insurance)
'data.frame': 1338 obs. of 7 variables:
$ age : int 19 18 28 33 32 31 46 37 37 60 ...
$ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
$ bmi : num 27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
$ children: int 0 1 3 0 0 0 1 3 2 0 ...
$ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
$ expenses: num 16885 1726 4449 21984 3867 ...
# Per-column summary statistics of the data frame
summary(insurance)
age sex bmi children smoker region expenses
Min. :18.00 female:662 Min. :16.00 Min. :0.000 no :1064 northeast:324 Min. : 1122
1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274 northwest:325 1st Qu.: 4740
Median :39.00 Median :30.40 Median :1.000 southeast:364 Median : 9382
Mean :39.21 Mean :30.67 Mean :1.095 southwest:325 Mean :13270
3rd Qu.:51.00 3rd Qu.:34.70 3rd Qu.:2.000 3rd Qu.:16640
Max. :64.00 Max. :53.10 Max. :5.000 Max. :63770
# Dependent variable: expenses (medical expenses)
boxplot(insurance$expenses)
hist(insurance$expenses) # distribution with a long right tail
# Most people's medical expenses fall between 0 and 15,000
> # Correlation coefficient: cor(x, y)
> cor(insurance$bmi, insurance$expenses)
[1] 0.1985763
> # Correlation matrix: a matrix built from the pairwise correlation coefficients
> cor(insurance[c('age', 'bmi', 'children', 'expenses')])
age bmi children expenses
age 1.0000000 0.10934101 0.04246900 0.29900819
bmi 0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
# Scatterplot matrix: eyeball the plots to shortlist the correlated variables.
pairs(insurance[c('age', 'bmi', 'children', 'expenses')])
# Install psych only when it is missing, so that re-running these lines does
# not reinstall the package every time.
if (!requireNamespace('psych', quietly = TRUE)) {
  install.packages('psych')
}
library(psych)
pairs.panels(insurance[c('age', 'bmi', 'children', 'expenses')])
선 : loess 곡선(국소 회귀로 평활한 추세선). pairs.panels의 기본 설정에서는 단순 선형 회귀 직선이 아니며, 회귀 직선을 그리려면 lm = TRUE 옵션을 준다.
타원 : 상관 타원. 길쭉할수록 상관관계가 높다.
> # Same panel plot, this time over every column of insurance
>pairs.panels(insurance)
# Multiple linear regression:
# model expenses on every other column of insurance (the `.` shorthand).
ins_model <- lm(expenses ~ ., data = insurance)
print(ins_model)
Call:
lm(formula = expenses ~ ., data = insurance)
Coefficients:
(Intercept) age sexmale bmi children smokeryes
-11941.6 256.8 -131.4 339.3 475.7 23847.5
regionnorthwest regionsoutheast regionsouthwest
-352.8 -1035.6 -959.3
sexmale : sex 변수를 male = 1, female = 0 으로 놓고 계산한 계수. female은 기준(reference) 범주라서 별도의 계수가 표기되지 않는다.
--> 더미 코딩 기법(dummy coding)을 이용
명목 특징을 수치형 변수로 취급하기 위해 사용하는 기법.
범주에 속하면 1, 그렇지 않으면 0으로 설정. sex의 경우 male은 1, female은 0으로 설정한다. 더미 변수는 '변수명 + 범주명' 형태로
sexmale, sexfemale처럼 명명하며, 기준 범주인 sexfemale은 모델 출력에 나타나지 않는다.
여러개의 level이 있는 경우, 예를 들어
지역1, 지역2, 지역3, 지역4가 있을 때
기준을 지역1로 하고 dummy 변수를 만든다면
지역 2에 대한 dummy : 지역2는 1, 나머지 지역은 0
지역 3에 대한 dummy : 지역3은 1, 나머지 지역은 0
지역 4에 대한 dummy : 지역4는 1, 나머지 지역은 0
이런 식으로 전체 범주 개수 -1개의 더미변수를 만든다.
지역 2에 대한 dummy : (+) <== 기준 level인 지역 1의 effect에 비해 지역2의 effect가 (+), 즉 크다는 의미입니다.
지역 3에 대한 dummy : (-) <== 기준 level인 지역 1의 effect에 비해 지역3의 effect가 (-), 즉 작다는 의미입니다.
지역 4에 대한 dummy : (+) <== 기준 level인 지역 1의 effect에 비해 지역4의 effect가 (+), 즉 크다는 의미입니다.
만일, 지역4의 beta가 지역2보다 크다면 effect의 순서는 다음과 같습니다.
지역3 < 지역1 < 지역2 < 지역4
라는 결론이 나온다.
> # Squared age term: age2 has to be created before it can be displayed here
> # or used as a predictor in a model.
> insurance$age2 <- insurance$age^2
> head(insurance[c('age', 'age2')])
age age2
1 19 361
2 18 324
3 28 784
4 33 1089
5 32 1024
6 31 961
# Binarize a numeric variable:
# bmi30 is 1 when bmi is 30 or above, 0 otherwise.
# The comparison is >= so that a bmi of exactly 30 is flagged as well.
> insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
> head(insurance$bmi30)
[1] 0 1 1 0 0 0
> head(insurance[c('bmi','bmi30')])
bmi bmi30
1 27.9 0
2 33.8 1
3 33.0 1
4 22.7 0
5 28.9 0
6 25.7 0
> # Add an interaction between two or more variables to the linear regression model
> # Smoking + obesity
> # NOTE(review): structure() with no extra arguments returns its input as is,
> # so this prints the whole data frame (see the output that follows);
> # str(insurance) may have been intended -- confirm.
> structure(insurance)
age sex bmi children smoker region expenses age2 bmi30
1 19 female 27.9 0 yes southwest 16884.92 361 0
2 18 male 33.8 1 no southeast 1725.55 324 1
3 28 male 33.0 3 no southeast 4449.46 784 1
4 33 male 22.7 0 no northwest 21984.47 1089 0
5 32 male 28.9 0 no northwest 3866.86 1024 0
6 31 female 25.7 0 no southeast 3756.62 961 0
7 46 female 33.4 1 no southeast 8240.59 2116 1
8 37 female 27.7 3 no northwest 7281.51 1369 0
9 37 male 29.8 2 no northeast 6406.41 1369 0
10 60 female 25.8 0 no northwest 28923.14 3600 0
11 25 male 26.2 0 no northeast 2721.32 625 0
12 62 female 26.3 0 yes southeast 27808.73 3844 0
13 23 male 34.4 0 no southwest 1826.84 529 1
14 56 female 39.8 0 no southeast 11090.72 3136 1
15 27 male 42.1 0 yes southeast 39611.76 729 1
16 19 male 24.6 1 no southwest 1837.24 361 0
17 52 female 30.8 1 no northeast 10797.34 2704 1
18 23 male 23.8 0 no northeast 2395.17 529 0
19 56 male 40.3 0 no southwest 10602.39 3136 1
20 30 male 35.3 0 yes southwest 36837.47 900 1
21 60 female 36.0 0 no northeast 13228.85 3600 1
22 30 female 32.4 1 no southwest 4149.74 900 1
23 18 male 34.1 0 no southeast 1137.01 324 1
24 34 female 31.9 1 yes northeast 37701.88 1156 1
25 37 male 28.0 2 no northwest 6203.90 1369 0
26 59 female 27.7 3 no southeast 14001.13 3481 0
27 63 female 23.1 0 no northeast 14451.84 3969 0
28 55 female 32.8 2 no northwest 12268.63 3025 1
29 23 male 17.4 1 no northwest 2775.19 529 0
30 31 male 36.3 2 yes southwest 38711.00 961 1
31 22 male 35.6 0 yes southwest 35585.58 484 1
32 18 female 26.3 0 no northeast 2198.19 324 0
33 19 female 28.6 5 no southwest 4687.80 361 0
34 63 male 28.3 0 no northwest 13770.10 3969 0
35 28 male 36.4 1 yes southwest 51194.56 784 1
36 19 male 20.4 0 no northwest 1625.43 361 0
37 62 female 33.0 3 no northwest 15612.19 3844 1
38 26 male 20.8 0 no southwest 2302.30 676 0
39 35 male 36.7 1 yes northeast 39774.28 1225 1
40 60 male 39.9 0 yes southwest 48173.36 3600 1
41 24 female 26.6 0 no northeast 3046.06 576 0
42 31 female 36.6 2 no southeast 4949.76 961 1
43 41 male 21.8 1 no southeast 6272.48 1681 0
44 37 female 30.8 2 no southeast 6313.76 1369 1
45 38 male 37.1 1 no northeast 6079.67 1444 1
46 55 male 37.3 0 no southwest 20630.28 3025 1
47 18 female 38.7 2 no northeast 3393.36 324 1
48 28 female 34.8 0 no northwest 3556.92 784 1
49 60 female 24.5 0 no southeast 12629.90 3600 0
50 36 male 35.2 1 yes southeast 38709.18 1296 1
51 18 female 35.6 0 no northeast 2211.13 324 1
52 21 female 33.6 2 no northwest 3579.83 441 1
53 48 male 28.0 1 yes southwest 23568.27 2304 0
54 36 male 34.4 0 yes southeast 37742.58 1296 1
55 40 female 28.7 3 no northwest 8059.68 1600 0
56 58 male 37.0 2 yes northwest 47496.49 3364 1
57 58 female 31.8 2 no northeast 13607.37 3364 1
58 18 male 31.7 2 yes southeast 34303.17 324 1
59 53 female 22.9 1 yes southeast 23244.79 2809 0
60 34 female 37.3 2 no northwest 5989.52 1156 1
61 43 male 27.4 3 no northeast 8606.22 1849 0
62 25 male 33.7 4 no southeast 4504.66 625 1
63 64 male 24.7 1 no northwest 30166.62 4096 0
64 28 female 25.9 1 no northwest 4133.64 784 0
65 20 female 22.4 0 yes northwest 14711.74 400 0
66 19 female 28.9 0 no southwest 1743.21 361 0
67 61 female 39.1 2 no southwest 14235.07 3721 1
68 40 male 26.3 1 no northwest 6389.38 1600 0
69 40 female 36.2 0 no southeast 5920.10 1600 1
70 28 male 24.0 3 yes southeast 17663.14 784 0
71 27 female 24.8 0 yes southeast 16577.78 729 0
72 31 male 28.5 5 no northeast 6799.46 961 0
73 53 female 28.1 3 no southwest 11741.73 2809 0
74 58 male 32.0 1 no southeast 11946.63 3364 1
75 44 male 27.4 2 no southwest 7726.85 1936 0
76 57 male 34.0 0 no northwest 11356.66 3249 1
77 29 female 29.6 1 no southeast 3947.41 841 0
78 21 male 35.5 0 no southeast 1532.47 441 1
79 22 female 39.8 0 no northeast 2755.02 484 1
80 41 female 33.0 0 no northwest 6571.02 1681 1
81 31 male 26.9 1 no northeast 4441.21 961 0
82 45 female 38.3 0 no northeast 7935.29 2025 1
83 22 male 37.6 1 yes southeast 37165.16 484 1
84 48 female 41.2 4 no northwest 11033.66 2304 1
85 37 female 34.8 2 yes southwest 39836.52 1369 1
86 45 male 22.9 2 yes northwest 21098.55 2025 0
87 57 female 31.2 0 yes northwest 43578.94 3249 1
88 56 female 27.2 0 no southwest 11073.18 3136 0
89 46 female 27.7 0 no northwest 8026.67 2116 0
90 55 female 27.0 0 no northwest 11082.58 3025 0
91 21 female 39.5 0 no southeast 2026.97 441 1
92 53 female 24.8 1 no northwest 10942.13 2809 0
93 59 male 29.8 3 yes northeast 30184.94 3481 0
94 35 male 34.8 2 no northwest 5729.01 1225 1
95 64 female 31.3 2 yes southwest 47291.06 4096 1
96 28 female 37.6 1 no southeast 3766.88 784 1
97 54 female 30.8 3 no southwest 12105.32 2916 1
98 55 male 38.3 0 no southeast 10226.28 3025 1
99 56 male 20.0 0 yes northeast 22412.65 3136 0
100 38 male 19.3 0 yes southwest 15820.70 1444 0
101 41 female 31.6 0 no southwest 6186.13 1681 1
102 30 male 25.5 0 no northeast 3645.09 900 0
103 18 female 30.1 0 no northeast 21344.85 324 1
104 61 female 29.9 3 yes southeast 30942.19 3721 0
105 34 female 27.5 1 no southwest 5003.85 1156 0
106 20 male 28.0 1 yes northwest 17560.38 400 0
107 19 female 28.4 1 no southwest 2331.52 361 0
108 26 male 30.9 2 no northwest 3877.30 676 1
109 29 male 27.9 0 no southeast 2867.12 841 0
110 63 male 35.1 0 yes southeast 47055.53 3969 1
111 54 male 33.6 1 no northwest 10825.25 2916 1
[ reached 'max' / getOption("max.print") -- omitted 1227 rows ]
# Extended model: all original predictors plus the squared age term, the
# bmi30 indicator and the smoker x bmi30 interaction.
# In an R formula `smoker * bmi30` expands to smoker + bmi30 + smoker:bmi30;
# the repeated main effects are collapsed, so each term enters once.
ins_model2 <- lm(
  expenses ~ age + sex + bmi + children + smoker + region + age2 + bmi30 +
    smoker * bmi30,
  data = insurance
)