Data Report

data from Prof. Kim Soo Yong

Created by

Rischan Mafrur

Chonnam National University of South Korea

May 25, 2014

User clustering based on broadcast log data. Load the data set:

# Read the broadcast log data; the first row of the CSV supplies the
# column names.
broadcast <- read.csv(file = "broadcast.csv", header = TRUE)
# List the column names of the loaded data frame.
colnames(broadcast)
##  [1] "sumcallin"     "caountcallin"  "sumcallout"    "countcallout" 
##  [5] "sumreceived"   "countreceived" "sumsent"       "countsent"    
##  [9] "user"          "personality"

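Load the data set again and drop the label and identifier columns (personality, user), so that only the feature columns are used for clustering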

# Build the clustering input: a fresh copy of the CSV without the
# `personality` and `user` columns.
broadcast_kmean <- read.csv(file = "broadcast.csv", header = TRUE)
broadcast_kmean <- broadcast_kmean[
  setdiff(names(broadcast_kmean), c("personality", "user"))
]
# Show the columns that remain.
names(broadcast_kmean)
## [1] "sumcallin"     "caountcallin"  "sumcallout"    "countcallout" 
## [5] "sumreceived"   "countreceived" "sumsent"       "countsent"

Create the cluster model (k-means with 2 clusters) and show its information

# Fit a 2-cluster k-means model on the feature columns and print its summary.
# kmeans() starts from randomly chosen centers, so the seed is fixed to make
# the cluster assignment (and the 1/2 labelling) reproducible between runs;
# nstart keeps the best of several random starts instead of trusting one.
# NOTE(review): the features are clustered unscaled, so the large sum*
# columns dominate the distance over the small count* columns -- consider
# scale(broadcast_kmean) if that is not intended.
set.seed(2014)
kc <- kmeans(broadcast_kmean, centers = 2, nstart = 25)
kc
## K-means clustering with 2 clusters of sizes 753, 104
## 
## Cluster means:
##   sumcallin caountcallin sumcallout countcallout sumreceived countreceived
## 1     363.8        3.400        297        3.378       198.4         5.007
## 2    1111.9        5.394       2386        7.683       229.8         6.452
##   sumsent countsent
## 1   52.28     3.615
## 2  105.36     7.212
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 2 1 1 1 1 2 2 2 1 1 1 1 2 1 1 1 1 1 1 2
##  [36] 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 2 2
## [141] 2 1 1 1 2 2 2 1 1 1 1 2 2 1 1 1 1 2 2 2 2 2 1 2 2 2 2 1 1 2 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 2 2
## [316] 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [351] 1 2 1 1 1 1 1 1 2 1 2 2 1 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [456] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## [491] 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1
## [561] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [596] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [631] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## [666] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1
## [701] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 2 2 1 1 1 2 1 2 2 1
## [736] 2 2 1 1 1 1 2 1 2 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2
## [771] 1 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [806] 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [841] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 460769514 443697588
##  (between_SS / total_SS =  33.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Confusion matrix (true personality label vs. assigned cluster)

# Cross-tabulate the recorded personality label (rows) against the k-means
# cluster assigned to each observation (columns).
table(broadcast[["personality"]], kc[["cluster"]])
##        
##           1   2
##   extro 411  68
##   intro 342  36