# 3. 练习: 第三个定性变量

``````
# Histogram of friend_count, one panel per gender. Rows with a missing
# gender are dropped first. The x axis is restricted to [0, 1000] with a
# tick every 50. Written with ggplot() because qplot() is deprecated, and
# with the full argument name `limits` instead of the partial match `lim`.
ggplot(subset(pf, !is.na(gender)), aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
``````

``````
# Box plot of age for each gender (rows with missing gender removed), with
# the group mean overlaid as a point of shape 4.
# `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 4)
``````

• `观察各个年龄层上，男女用户的好友数量`，取的是好友数目的中位数：
``````
# Median friend_count at each age, one coloured line per gender.
# The summary function must be passed as `fun`: with ggplot2 >= 3.3.0 a
# `fun.y` given through geom_line(stat = "summary") is ignored as an unknown
# parameter and the layer falls back to mean_se(), i.e. it would plot means.
ggplot(subset(pf, !is.na(gender)), aes(x = age, y = friend_count)) +
  geom_line(aes(color = gender), stat = "summary", fun = median)
``````

• 根据年龄和性别同时分组：
``````
library(dplyr)

# Group the users with a known gender by (age, gender) and compute, for each
# group, the mean and median friend count and the number of users.
age_gender_groups <- group_by(subset(pf, !is.na(gender)), age, gender)
pf.fc_by_age_gender <- summarise(
  age_gender_groups,
  mean_friend_count = mean(friend_count),
  median_friend_count = median(friend_count),
  n = n()
)
# summarise() only peels off the last grouping level, so the result would
# still be grouped by age; drop the leftover grouping so later steps operate
# on a plain table.
pf.fc_by_age_gender <- ungroup(pf.fc_by_age_gender)
``````

# 4. 练习: 绘制条件小结

``````
# Median friend count against age, one coloured line per gender, drawn from
# the pre-aggregated pf.fc_by_age_gender table.
ggplot(
  data = pf.fc_by_age_gender,
  mapping = aes(x = age, y = median_friend_count, color = gender)
) +
  geom_line()
``````

# 6. 宽格式和长格式

• 使用`tidyr`包重组数据：
``````
# install.packages("tidyr")
library(tidyr)

# Long -> wide: keep gender, age and median_friend_count, then turn each
# gender value into its own column holding the median friend count.
# pivot_wider() is the current replacement for the superseded spread().
pivot_wider(
  subset(pf.fc_by_age_gender,
         select = c("gender", "age", "median_friend_count")),
  names_from = gender,
  values_from = median_friend_count
)
``````

# 7. 重塑数据

``````
# install.packages("reshape2")
library(reshape2)

# Cast the long table to wide form: one row per age, one column per gender,
# cells filled with median_friend_count.
pf.fc_by_age_gender.wide <- dcast(
  pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "median_friend_count"
)

``````

# 8. 练习：比率图

Plot the ratio of the female to male median friend counts using the data frame pf.fc_by_age_gender.wide.

Think about what geom you should use. Add a horizontal line to the plot with a y intercept of 1, which will be the base line. Look up the documentation for geom_hline to do that. Use the parameter linetype in geom_hline to make the line dashed.

``````
# Ratio of female to male median friend counts by age. The faint dashed
# horizontal line (linetype 2) at y = 1 is the baseline where both medians
# are equal.
ggplot(
  data = pf.fc_by_age_gender.wide,
  mapping = aes(x = age, y = female / male)
) +
  geom_hline(yintercept = 1, linetype = 2, alpha = 0.3) +
  geom_line(mapping = aes(color = age))
``````

# 9. 练习: 第三个定量变量

``````
# Derive the year each user joined: tenure is divided by 365, rounded up to
# whole years, and subtracted from 2014.
# (`$` must not be backslash-escaped; `pf\$...` is not valid R.)
pf$year_joined <- 2014 - ceiling(pf$tenure / 365)
``````

# 10. 练习: 切割一个变量

``````
# Five-number summary (plus mean) of the derived year_joined column.
summary(pf$year_joined)
``````

``````
# Number of users per join year.
table(pf$year_joined)
``````

• 练习：需要将上述数据，按照 `(2004, 2009]` / `(2009, 2011]` / `(2011, 2012]` / `(2012, 2014]`进行分割汇总：
``````
# Bin year_joined into the right-closed intervals (2004, 2009],
# (2009, 2011], (2011, 2012] and (2012, 2014]; cut() uses right-closed
# intervals by default. Then count the users in each bucket.
pf$year_joined.bucket <- cut(pf$year_joined,
                             breaks = c(2004, 2009, 2011, 2012, 2014))
table(pf$year_joined.bucket)
``````

• 注意 `$` 和 `.` 的差别：
``````
# A dot is just part of a name; `$` accesses a column of a data frame.
pf.test1.test12 <- "test1.test12"  # creates a standalone variable `pf.test1.test12` in the workspace
pf$test1 <- "test1"  # creates a column `test1` inside the pf data frame
pf$test1.test12 <- "test1.test12"  # creates a column `test1.test12` inside the pf data frame
``````

# 11. 练习: 绘制在一起

``````
# Median friend_count at each age, one line per year_joined.bucket (users
# without a bucket are dropped).
# The summary function is passed as `fun`; with ggplot2 >= 3.3.0 `fun.y` is
# ignored here and the layer would silently plot means instead of medians.
# Alternative grouping kept for reference:
#   geom_line(aes(color = gender), stat = "summary", fun = median)
ggplot(subset(pf, !is.na(year_joined.bucket)),
       aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)

``````

# 12. 练习: 绘制总均值

``````
# Mean friend_count at each age, one coloured line per year_joined.bucket,
# plus the overall mean across all buckets as a dashed line (linetype 2).
# `fun` replaces `fun.y`, which ggplot2 >= 3.3.0 no longer recognises when
# it is passed through geom_line(stat = "summary").
ggplot(subset(pf, !is.na(year_joined.bucket)),
       aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)
``````

# 13. 练习: 好友率

• 我自己的方式，很繁琐，还创建了一个新的数据集，浪费内存：
``````
# Friends gained per day of tenure, computed on a copy of pf restricted to
# users with positive tenure (avoids dividing by zero).
# (`$` must not be backslash-escaped; `pf\$...` is not valid R.)
tenureover0 <- subset(pf, tenure > 0)
tenureover0$tenure_count <- tenureover0$friend_count / tenureover0$tenure
summary(tenureover0$tenure_count)
``````
• 视频中的方式：
``````
# Same friend-rate summary without storing an intermediate data frame:
# evaluate the ratio inside the subset of users with at least one day of
# tenure.
with(
  subset(pf, tenure >= 1),
  summary(friend_count / tenure)
)
``````

# 14. 练习: 申请好友数

``````
# Mean friendships initiated per day of tenure, plotted against tenure with
# one line per year_joined.bucket; users with tenure < 1 are excluded.
# `fun` replaces `fun.y`, which ggplot2 >= 3.3.0 ignores in this position.
ggplot(subset(pf, tenure >= 1),
       aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
``````

# 15. 练习: 偏差方差折衷

``````
# Bias/variance trade-off: the same mean-friendships-initiated-per-day curve
# drawn at four x resolutions. p0 uses raw tenure; p1, p2 and p3 round
# tenure to the nearest multiple of 7, 30 and 90 respectively, so each point
# averages over more users (smoother curve, coarser detail).
# `fun` replaces `fun.y`, which ggplot2 >= 3.3.0 ignores in this position.
p0 <- ggplot(subset(pf, tenure >= 1),
             aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

p1 <- ggplot(subset(pf, tenure >= 1),
             aes(x = 7 * round(tenure / 7),
                 y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

p2 <- ggplot(subset(pf, tenure >= 1),
             aes(x = 30 * round(tenure / 30),
                 y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

p3 <- ggplot(subset(pf, tenure >= 1),
             aes(x = 90 * round(tenure / 90),
                 y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)

library(gridExtra)

# Stack the four plots in a single column for comparison.
grid.arrange(p0, p1, p2, p3, ncol = 1)
``````

``````
# Per-bucket mean lines of friendships initiated per day, with a single red
# smoother fitted over all users (method chosen automatically by ggplot2).
# `fun` replaces `fun.y`, which ggplot2 >= 3.3.0 ignores in this position.
ggplot(subset(pf, tenure >= 1),
       aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  geom_smooth(method = "auto", color = "red")
``````

• 视频中的方式，使用`geom_smooth`函数：
``````
# One smoothed curve of friendships initiated per day of tenure for each
# year_joined.bucket, restricted to users with at least one day of tenure.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(
    x = tenure,
    y = friendships_initiated / tenure,
    color = year_joined.bucket
  )
) +
  geom_smooth()
``````

# 17. 酸奶数据集简介

``````
# Load the yogurt data from the working directory and show its column
# types.
yo <- read.csv(file = "yogurt.csv")
str(yo)
``````

• 将id转换为factor：
``````
# Treat the id column as a categorical identifier rather than a number,
# then re-check the structure.
# (`$` must not be backslash-escaped; `yo\$id` is not valid R.)
yo$id <- factor(yo$id)
str(yo)
``````

# 18. 练习: 重访直方图

• 构建一个价格的直方图：
``````
# Two histograms of price that differ only in bin width (1 vs 10), stacked
# in one column to compare how binning changes the picture.
p1 <- ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1)
p2 <- ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10)

library(gridExtra)
grid.arrange(p1, p2, ncol = 1)
``````

# 19. 练习: 购买数量

• 继续探索数据-汇总数据
``````
# Print summary statistics for every column of the yogurt data frame.
summary(yo)
``````

• 查看价格的种类
``````
# List the distinct price values and count how many there are.
# (`$` must not be backslash-escaped; `yo\$price` is not valid R.)
unique(yo$price)
length(unique(yo$price))
``````

• 将数据集按照price汇总：
``````
# Number of observations at each distinct price.
# (`$` must not be backslash-escaped; `yo\$price` is not valid R.)
table(yo$price)
``````

• 练习：

``````
# Add all.purchases: the sum of the five flavour count columns in each row.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)
``````

``````
# Equivalent way to build all.purchases without transform(): assign the sum
# of the five flavour columns directly. The first operator must be the
# assignment `<-`; with `+` the line only printed all.purchases plus the
# flavour sum (twice the total) and stored nothing.
yo$all.purchases <- yo$strawberry + yo$blueberry + yo$pina.colada +
  yo$plain + yo$mixed.berry

``````

# 20. 练习: 随时间变化的价格

• 随时间变化的价格图：
``````
# Price against time as a jittered scatter plot; points are semi-transparent
# filled circles (shape 21) so overlapping observations stay visible.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(
    alpha = 0.25,
    shape = 21,
    fill = I("#F79420")
  )
``````

# 22. 练习: 查看家庭样本

• 首先创建随机数种子，随机选取16个家庭，注意语法 `%in%`
``````
# Fix the RNG seed so the same 16 households are drawn on every run.
# (`$` must not be backslash-escaped; `yo\$id` is not valid R.)
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)

# Price over time for the sampled households, one panel per household;
# `%in%` keeps the rows whose id is among the sampled ids. Point size encodes
# the number of items bought on each occasion (pch = 1 is an open circle).
ggplot(subset(yo, id %in% sample.ids), aes(x = time, y = price)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1)
``````

# 25. 练习: 散点图矩阵

• 创建变量与变量之间的散点图，最终形成一个散点图矩阵
``````
# install.packages("GGally")
library(GGally)
theme_set(theme_minimal(20))

# Scatter plot matrix of columns 2 to 5 of pf, drawn on a reproducible
# random sample of 1000 rows to keep plotting fast.
set.seed(1836)
pf_subset <- pf[, 2:5]
names(pf_subset)
ggpairs(
  pf_subset[sample.int(nrow(pf_subset), 1000), ]
)
``````

# 26. 更多变量

``````
# Read the nci table from the working directory and label its columns with
# the integers 1 to 64.
nci <- read.table(file = "nci.tsv")
colnames(nci) <- 1:64
``````

# 27. 热图

``````library(reshape2)

nci.long.samp <- melt(as.matrix(nci[1:200,]))

names(nci.long.samp) <- c("gene","case","value")

ggplot(aes(y=gene,x=case,fill=value),
data=nci.long.samp) +
geom_tile() +