# Haste makes waste

``````
> getwd()
[1] "C:/Users/utane/Documents"
> setwd("C:/Users/utane/OneDrive/udacity/23-R")
> getwd()
[1] "C:/Users/utane/OneDrive/udacity/23-R"

# 列出当前目录下的文件名
> list.files()

# 导入数据后，显示有99003行数据，15个变量

# 该数据集中的变量名
> names(pf)
[1] "userid"                "age"                   "dob_day"               "dob_year"
[5] "dob_month"             "gender"                "tenure"                "friend_count"
``````

# 4. 练习:用户生日直方图

``````
# Install the plotting library, then attach it for this session.
install.packages("ggplot2")
library(ggplot2)

# Histogram of dob_day with one bin per day and a tick for each day 1..31.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq_len(31))

``````

# 8. 练习:分面

``````
# Histogram of dob_day, drawn as one panel per dob_month in a 3-column grid.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
``````

`facet_wrap` 和 `facet_grid` 都用于分面。

# 11. 练习:好友数量

``````
> pf <- read.csv("pseudo_facebook.tsv",sep='\t')
> names(pf)
[1] "userid"                "age"                   "dob_day"               "dob_year"
[5] "dob_month"             "gender"                "tenure"                "friend_count"
> ggplot(aes(x = friend_count), data = pf) +
+   geom_histogram(binwidth = 1)
``````

# 12. 限制轴

``````
> ggplot(aes(x = friend_count), data = pf) +
+   geom_histogram() +
+   scale_x_continuous(limits = c(0, 1000))
``````

# 14. 调整组距

``````
# 添加组距，即0到1000，以50为步长
> ggplot(aes(x = friend_count), data = pf) +
+    geom_histogram() +
+    scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

# 以性别gender进行分面
> ggplot(aes(x = friend_count), data = pf) +
+    geom_histogram() +
+    scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +  facet_wrap(~gender)
``````

# 15. 忽略NA观测值

``````
> ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
+    geom_histogram() +
+    scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +  facet_wrap(~gender)
``````

# 16. 按性别划分的统计学(by())

• 变量
• 类别变量，用于划分子集的指标列表
• 函数
``````
> table(pf$gender)
female   male
40254  58574

> by(pf$friend_count,pf$gender,summary)
pf$gender: female
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0      37      96     242     244    4923
-----------------------------------------------------------------------------------
pf$gender: male
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0      27      74     165     182    4917
>

``````

# 17. 使用时长

``````
>  ggplot(aes(x = tenure), data = pf) +
+    geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')

> ggplot(aes(x = tenure/365), data = pf) +
+     geom_histogram(binwidth = .25, color = 'black', fill = '#F79420')

``````

``````
> ggplot(aes(x = tenure/365), data = pf) +
+     geom_histogram(binwidth = .25, color = 'black', fill = '#F79420') +
+ scale_x_continuous(breaks = seq(1,7,1),limits=c(0,7))
``````

# 19. 练习：用户年龄

``````
> ggplot(aes(x = age),data = pf) +  geom_histogram(binwidth = 1, color = 'black', fill = '#F79420') +
scale_x_continuous(breaks = seq(1,150,5),limits=c(0,150))
``````

# 22. 转换数据

``````
> setwd("C:/Users/utane/OneDrive/udacity/23-R")
> names(pf)
[1] "userid"                "age"                   "dob_day"
[4] "dob_year"              "dob_month"             "gender"
[7] "tenure"                "friend_count"          "friendships_initiated"

>
> library(ggplot2)
> qplot(x=friend_count,data=pf)
>
``````

``````
# Install and attach gridExtra; grid.arrange() is used afterwards to stack
# several plots on one page.
install.packages("gridExtra")
library(gridExtra)
``````
``````
> summary(pf$friend_count)
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0.0    31.0    82.0   196.4   206.0  4923.0
> summary(log10(pf$friend_count))
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
-Inf   1.491   1.914    -Inf   2.314   3.692
> summary(log10(pf$friend_count+1))
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0.000   1.505   1.919   1.868   2.316   3.692
> summary(sqrt(pf$friend_count))
Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0.000   5.568   9.055  11.088  14.353  70.164
> library(gridExtra)
>
> q1 = qplot(x=friend_count,data=pf)
> q2 = qplot(x=log10(pf$friend_count+1),data=pf)
> q3 = qplot(x=sqrt(pf$friend_count),data=pf)
>
> grid.arrange(q1, q2, q3,ncol=1)
``````
• 另一种方式也可以画出相同的图形：
``````
> p1 <- ggplot(aes(x = friend_count),data=pf) + geom_histogram()
> p2 <- p1 + scale_x_log10()
> p3 <- p1 + scale_x_sqrt()

> grid.arrange(p1, p2, p3,ncol=1)
``````

# 23. 添加定标层

``````
> logScale <- qplot(x = log10(friend_count),data = pf)
> countScale <- ggplot(aes(x=friend_count),data = pf) +
+ geom_histogram() + scale_x_log10()

> library(ggplot2)
> library(gridExtra)

> grid.arrange(logScale,countScale,ncol = 2)

``````

# 24. 频率多边形

``````
> qplot(x = friend_count,data = subset(pf,!is.na(gender)),
+ binwidth = 10) +
+ scale_x_continuous(lim = c(0,1000),breaks = seq(0,1000,50)) +
+ facet_wrap(~gender)
``````

``````
# Frequency polygons of friend_count, one coloured line per gender.
# Rows with a missing gender are dropped before plotting.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  binwidth = 10,
  geom = "freqpoly",
  color = gender
) +
  # Spell out `limits` instead of relying on partial matching of `lim`.
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
``````

• 使用频数多边形确定哪个性别在万维网 (www_likes) 上获得的点赞数量更多
``````
# Frequency polygons of www_likes by gender on a log10 x axis.
# Only one x scale is added: a second x scale would simply replace the first
# (ggplot2 reports "Scale for x is already present"), so the bare
# scale_x_continuous() call was redundant.
qplot(
  x = www_likes,
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly",
  color = gender
) +
  scale_x_log10()
``````

# 25. 练习:网页端上的“点赞”数

``````
# Total www_likes within each gender group.
by(pf$www_likes, pf$gender, sum)
``````

``````
pf$gender: female
[1] 3507665

pf$gender: male
[1] 1430175
``````

``````
# Summary statistics of www_likes within each gender group.
by(pf$www_likes, pf$gender, summary)
``````

``````
pf$gender: female
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
    0.00     0.00     0.00    87.14    25.00 14865.00

pf$gender: male
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
    0.00     0.00     0.00    24.42     2.00 12903.00
``````

# 26. 箱线图

• 方法1
``````
# Box plot of friend_count by gender, rows with missing gender removed.
# NOTE: limits set on the scale discard observations outside 0-1000 before
# the box statistics are computed, so the boxes describe the trimmed data.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  # Spell out `limits` instead of relying on partial matching of `lim`.
  scale_y_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

``````

• 方法2
``````
# Same box plot as method 1, with the y range passed through qplot's ylim.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot",
  ylim = c(0, 1000)
)
``````

# 27. 练习:箱线图、四分位数和友谊

``````
# Box plot of friend_count by gender, zoomed to 0-250 on the y axis.
# coord_cartesian() only changes the visible window; every observation still
# contributes to the box statistics.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))

# Numeric summary of friend_count per gender, to compare with the boxes.
by(pf$friend_count, pf$gender, summary)
``````

``````
pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
      0      37      96     242     244    4923

pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
      0      27      74     165     182    4917
``````

• 问题`谁发起了更多的交友请求，男士还是女士？`
``````
# Box plot of friendships_initiated by gender, zoomed to 0-150 on the y axis
# without dropping any observations.
qplot(
  x = gender,
  y = friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 150))

# Numeric summary of friendships_initiated per gender.
by(pf$friendships_initiated, pf$gender, summary)
``````

``````
pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    0.0    19.0    49.0   113.9   124.8  3654.0

pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    0.0    15.0    44.0   103.1   111.0  4144.0
``````

# 28. 练习:符合逻辑

``````
# Summary statistics of the mobile_likes column.
summary(pf$mobile_likes)

# Add a mobile_check_in column to pf: 1 when mobile_likes > 0, otherwise 0.
# The column is initialised on the data frame itself (the original assigned NA
# to a stray global variable of the same name, which was never used).
pf$mobile_check_in <- NA
pf$mobile_check_in <- ifelse(pf$mobile_likes > 0, 1, 0)

# Store the flag as a factor so summary() reports counts per level.
pf$mobile_check_in <- factor(pf$mobile_check_in)
summary(pf$mobile_check_in)

``````

``````
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    0.0     0.0     4.0   106.1    46.0 25111.0

    0     1
35056 63947
``````

• 求用手机登录人数的百分比
``````
# Proportion of rows whose mobile_check_in flag equals 1; the mean of a
# logical vector is the share of TRUE values (multiply by 100 for a percent).
mean(pf$mobile_check_in == 1)
``````