#Clustering (군집화)
# 데이터 준비
teens <- read.csv(file='mlwr/snsdata.csv')
# 데이터 확인
str(teens)
head(teens,n = 20)
# 몇가지 변수(특징)에서 결측치(NA)가 보임!
summary(teens)
> table(teens$gender, useNA = 'ifany')
F M <NA>
22054 5222 2724
# female 변수를 데이터프레임에 추가
# 성별이 'F'이고, NA가 아니면 1, 그렇지 않으면 0을 입력 - 더미코딩
teens$female <- ifelse( teens$gender == 'F' & !is.na(teens$gender), 1, 0 )
> table(teens$female)
0 1
7946 22054
# nogender 변수를 데이터프레임에 추가
# gender 변수가 NA이면 1, 그렇지 않으면 0을 입력
> teens$nogender <- ifelse(is.na(teens$gender),1,0)
> table(teens$nogender)
0 1
27276 2724
# age의 정상 범위는 13 ~ 19라고 가정 -> 이외의 값들은 NA
teens$age <- ifelse(teens$age >= 13 & teens$age <= 19, teens$age, NA)
> summary(teens$age)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
13.03 16.24 17.16 17.12 18.05 19.00 7000
> # age의 NA 값들을 gradeyear별 age의 평균값으로 대체
> # dplyr 패키지 이용
> library(dplyr)
> teens %>%
+ group_by(gradyear) %>%
+ summarise( mean(age), na.rm = T )
# A tibble: 4 x 2
gradyear `mean(age)`
<int> <dbl>
1 2006 18.5
2 2007 17.7
3 2008 16.8
4 2009 15.8
> ave(df$score, df$class, FUN = mean)
[1] NA NA NA 8.5 8.5
> ave(df$score, df$class, FUN = my_mean)
[1] 9.5 9.5 9.5 8.5 8.5
ave_age <- ave(teens$age, teens$gradyear, FUN = my_mean)
teens$age <- ifelse(is.na(teens$age),ave_age,teens$age)
> summary(teens$age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.03 16.28 17.23 17.19 18.20 19.00
# k- 평균 군집화 알고리즘 모델을 생성
str(teens)
# 개인 식별 정보 (gradyear, gender, age, friends)를 제외하고,
# 오로지 관심사들로만 clustering을 시도
interests <- teens[5:40]
> str(interests)
'data.frame': 30000 obs. of 36 variables:
$ basketball : int 0 0 0 0 0 0 0 0 0 0 ...
$ football : int 0 1 1 0 0 0 0 0 0 0 ...
$ soccer : int 0 0 0 0 0 0 0 0 0 0 ...
$ softball : int 0 0 0 0 0 0 0 1 0 0 ...
$ volleyball : int 0 0 0 0 0 0 0 0 0 0 ...
$ swimming : int 0 0 0 0 0 0 0 0 0 0 ...
$ cheerleading: int 0 0 0 0 0 0 0 0 0 0 ...
$ baseball : int 0 0 0 0 0 0 0 0 0 0 ...
$ tennis : int 0 0 0 0 0 0 0 0 0 0 ...
$ sports : int 0 0 0 0 0 0 0 0 0 0 ...
$ cute : int 0 1 0 1 0 0 0 0 0 1 ...
$ sex : int 0 0 0 0 1 1 0 2 0 0 ...
$ sexy : int 0 0 0 0 0 0 0 1 0 0 ...
$ hot : int 0 0 0 0 0 0 0 0 0 1 ...
$ kissed : int 0 0 0 0 5 0 0 0 0 0 ...
$ dance : int 1 0 0 0 1 0 0 0 0 0 ...
$ band : int 0 0 2 0 1 0 1 0 0 0 ...
$ marching : int 0 0 0 0 0 1 1 0 0 0 ...
$ music : int 0 2 1 0 3 2 0 1 0 1 ...
$ rock : int 0 2 0 1 0 0 0 1 0 1 ...
$ god : int 0 1 0 0 1 0 0 0 0 6 ...
$ church : int 0 0 0 0 0 0 0 0 0 0 ...
$ jesus : int 0 0 0 0 0 0 0 0 0 2 ...
$ bible : int 0 0 0 0 0 0 0 0 0 0 ...
$ hair : int 0 6 0 0 1 0 0 0 0 1 ...
$ dress : int 0 4 0 0 0 1 0 0 0 0 ...
$ blonde : int 0 0 0 0 0 0 0 0 0 0 ...
$ mall : int 0 1 0 0 0 0 2 0 0 0 ...
$ shopping : int 0 0 0 0 2 1 0 0 0 1 ...
$ clothes : int 0 0 0 0 0 0 0 0 0 0 ...
$ hollister : int 0 0 0 0 0 0 2 0 0 0 ...
$ abercrombie : int 0 0 0 0 0 0 0 0 0 0 ...
$ die : int 0 0 0 0 0 0 0 0 0 0 ...
$ death : int 0 0 1 0 0 0 0 0 0 0 ...
$ drunk : int 0 0 0 0 1 1 0 0 0 0 ...
$ drugs : int 0 0 0 0 1 0 0 0 0 0 ...
set.seed(2345)
teen_clusters <- kmeans(interests, 5)
> str(teen_clusters$cluster)
int [1:30000] 5 1 5 5 1 5 5 5 5 4 ...
> table(teen_clusters$cluster)
1 2 3 4 5
2128 1200 731 638 25303
# 모델이 분류한 클러스터가 어떤 특징들을 갖고 있을까?
>teens$cluster <- teen_clusters$cluster
> teens[1:10, c('cluster', 'gender', 'age', 'friends')]
cluster gender age friends
1 5 M 18.98200 7
2 1 F 18.80100 0
3 5 M 18.33500 69
4 5 F 18.87500 0
5 1 <NA> 18.99500 10
6 5 F 18.48625 142
7 5 F 18.93000 72
8 5 M 18.32200 17
9 5 F 18.48625 52
10 4 F 18.70800 39
> teen_clusters$centers
basketball football soccer softball volleyball swimming cheerleading baseball tennis
1 0.5808271 0.5305451 0.23637218 0.2946429 0.2617481 0.2659774 0.21005639 0.25845865 0.16682331
2 0.3591667 0.3725000 0.14000000 0.1875000 0.1716667 0.1625000 0.18750000 0.08416667 0.09916667
3 0.4705882 0.4295486 4.86730506 0.1915185 0.2079343 0.2106703 0.09439124 0.12859097 0.15321477
4 0.2993730 0.2680251 0.14733542 0.1865204 0.1410658 0.1489028 0.12695925 0.10501567 0.08934169
5 0.2299332 0.2176817 0.09326957 0.1472157 0.1299846 0.1194325 0.09394143 0.09232107 0.07813303
sports cute sex sexy hot kissed dance band marching music
1 0.3825188 0.8599624 1.5775376 0.3580827 0.2908835 0.75704887 0.6621241 1.4459586 0.17575188 2.5958647
2 0.1208333 0.6791667 0.1900000 0.2316667 0.2258333 0.10416667 4.6333333 0.3741667 0.04416667 0.9641667
3 0.3160055 0.3652531 0.1641587 0.1340629 0.1737346 0.09986320 0.3707250 0.2544460 0.02599179 0.7592339
4 0.1802508 0.3777429 0.2037618 0.1630094 0.1536050 0.09717868 0.3652038 0.4075235 0.03918495 1.0094044
5 0.1143738 0.2581907 0.0967079 0.1183259 0.1060349 0.04841323 0.2087500 0.1982374 0.02952219 0.5633719
rock god church jesus bible hair dress blonde mall shopping
1 0.9520677 0.7109962 0.4360902 0.14238722 0.03383459 2.7156955 0.28900376 0.64520677 0.7039474 0.6442669
2 0.3383333 0.5183333 0.4500000 0.11166667 0.02750000 0.6283333 0.30083333 0.10000000 0.4483333 0.6150000
3 0.3091655 0.4145007 0.3406293 0.11764706 0.03009576 0.4268126 0.12175103 0.15731874 0.3023256 0.5444596
4 0.3009404 6.2789969 1.3667712 1.24451411 0.27115987 0.4310345 0.15047022 0.11912226 0.2492163 0.3824451
5 0.1758685 0.2970004 0.1919140 0.08082046 0.01343714 0.2196182 0.08568154 0.05074497 0.2096589 0.3098052
clothes hollister abercrombie die death drunk drugs
1 0.5639098 0.18233083 0.15413534 0.6546053 0.28947368 0.34633459 0.39003759
2 0.2325000 0.09833333 0.08750000 0.2508333 0.15000000 0.11500000 0.06083333
3 0.1504788 0.10943912 0.06019152 0.1983584 0.10670315 0.07934337 0.05882353
4 0.2319749 0.06426332 0.04388715 0.4169279 0.31661442 0.13949843 0.07680251
5 0.1074181 0.05805636 0.04070664 0.1350828 0.09291388 0.06390547 0.03232818
>> 성능의 개선 --> 정규화
#정규화
normalize<- function(x){
return((x-min(x))/(max(x)-min(x)))
}
interests_n<-as.data.frame(lapply(interests,normalize)) # z-점수 표준화를 하려면 scale 함수를 이용하면 된다.
str(interests_n)
set.seed(2345)
teen_clusters_n <- kmeans(interests_n, 5)
str(teen_clusters_n)
str(teen_clusters_n$cluster)
table(teen_clusters_n$cluster)
# 모델이 분류한 클러스터가 어떤 특징들을 갖고 있을까?
teens$cluster_n <- teen_clusters_n$cluster
teens[1:10, c('cluster_n', 'gender', 'age', 'friends')]
teen_clusters_n$centers
basketball football soccer softball volleyball swimming cheerleading baseball
1 0.015178911 0.01890623 0.009826728 0.010748113 0.009570939 0.006790525 0.009516559 0.008065284
2 0.015669807 0.02097501 0.012244526 0.010747765 0.015069936 0.008748398 0.088215212 0.006119418
3 0.011635354 0.01138628 0.008339395 0.007149914 0.007441747 0.004940176 0.006009425 0.004999288
4 0.037772911 0.10949798 0.015945858 0.055019755 0.057393170 0.005665684 0.010012516 0.033296829
5 0.006551624 0.00532366 0.006534344 0.003837662 0.004261982 0.003150851 0.003679950 0.003325097
tennis sports cute sex sexy hot kissed dance band
1 0.007460982 0.016051263 0.03138350 0.004911880 0.012963668 0.022116483 0.014889754 0.02296663 0.006678894
2 0.008166052 0.012529018 0.03611908 0.002429260 0.009536165 0.025440393 0.005940188 0.02423870 0.005033953
3 0.005408483 0.009678338 0.02708986 0.002177376 0.009773223 0.019328210 0.004499770 0.01745896 0.004532947
4 0.009372827 0.050688360 0.02236592 0.002587298 0.010058870 0.018397997 0.004877892 0.01821722 0.005398162
5 0.004924123 0.006396103 0.01149838 0.001178691 0.006285120 0.007759436 0.001995665 0.01054919 0.004073433
marching music rock god church jesus bible hair
1 0.005156245 0.018527550 0.019413780 0.008369753 0.008512994 0.004682147 0.003287538 0.031687569
2 0.003277345 0.014607231 0.013519050 0.006813974 0.009617891 0.004383449 0.001862128 0.019154755
3 0.003027715 0.012093474 0.013324658 0.005959780 0.007329917 0.003937755 0.001475041 0.017217901
4 0.003678841 0.013115353 0.019369450 0.007324557 0.007850722 0.004839383 0.002427277 0.013688282
5 0.003665612 0.009862758 0.008957047 0.005235111 0.004142333 0.003345249 0.001787404 0.006272964
dress blonde mall shopping clothes hollister abercrombie die death
1 0.024573870 0.0009231353 0.02810557 0.04041942 0.170631900 0.017129806 0.013085268 0.015157283 0.013295992
2 0.030452001 0.0005336961 0.03287587 0.18241406 0.026423597 0.029359552 0.025553052 0.007951287 0.009949084
3 0.017047791 0.0003595209 0.13079989 0.02963020 0.007401082 0.011323022 0.008859949 0.012046166 0.009800333
4 0.013396375 0.0005524235 0.01244611 0.02400728 0.007561535 0.006721365 0.006101377 0.008211021 0.008492759
5 0.007307328 0.0001451723 0.00000000 0.01312356 0.000000000 0.003171766 0.002595679 0.006824634 0.006875498
drunk drugs
1 0.022696993 0.010444423
2 0.013006964 0.003968660
3 0.017470823 0.007258753
4 0.016166041 0.004250104
5 0.007274472 0.002129114