# 3. 练习：散点图

• 准备工作
```r
# Point the session at the course folder (machine-specific path) and echo it.
setwd("C:/Users/utane/OneDrive/udacity/24-R")
getwd()
library(ggplot2)

# Every snippet below reads the data frame `pf`, so load it here.
# NOTE(review): file name assumed from the Udacity course material - confirm.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
```
• 绘图
```r
# Quick scatter plot: age on x, friend_count on y.
qplot(age, friend_count, data = pf)
```

# 4. ggplot 语法

• 查看age的范围
```r
# Five-number summary (plus mean) of age; used to choose the x-axis limits.
summary(pf$age)
```
• 使用ggplot画图
```r
# Same scatter plot in ggplot syntax, restricted to ages 13-113.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 113)
```

1. 需要自己指定图形类型，如 geom_point()
2. x、y 轴的映射需要包在 aes() 这个 wrapper 里
3. 使用 layer（图层）的概念：每用 + 添加一个 layer，图形都会随之改变

# 5. 过度绘制

```r
# Transparency: 20 overlapping points are needed for a fully opaque dot.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 113)
```

```r
# Same transparency, with random jitter added to the points.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 113)
```

# 6. 练习：coord_trans()

```r
# Square-root transform of the y axis applied at the coordinate level.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 113) +
  coord_trans(y = "sqrt")
```

# 7. Alpha 和 Jitter

• 自己的实现：使用 facet_wrap 进行分面，区分了男性和女性的情况
```r
# Jittered friendships_initiated vs. age, one panel per gender value.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 113) +
  facet_wrap(~gender, ncol = 2)
```

• 课程示例：第 6 节说明了直接使用 geom_jitter 时不能配合 sqrt 变换（抖动可能把 y 变成负值，而负数无法开平方）；改用下面的写法，只做水平方向的抖动（垂直抖动设为 0），就可以在 sqrt 变换下使用抖动：
```r
# Jitter horizontally only: with height = 0 the y values are left untouched,
# so none of them is pushed below zero before the sqrt transform is applied.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 10, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt")
```

# 9. 条件均值

```r
# Install dplyr only when it is missing, so re-running the notes does not
# re-download the package every time; then attach it.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library("dplyr")
```
• 有下面两种方式对数据重新组织，最终都得到相同的结果集：
```r
# Variant 1: explicit intermediate object.
# Group the users by age, then collapse each group to its mean, median and size.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
```

```r
# Variant 2: the same summary written as a pipeline, sorted by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
```

• 练习：
```r
# Line plot of the mean friend count for each age.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
```

# 10. 练习: 将摘要与原始数据叠加

```r
# Overlay conditional summaries of friend_count on the jittered raw points.
# The x zoom is passed to coord_trans() itself: a plot holds a single
# coordinate system, so a separate coord_cartesian() would be replaced by
# coord_trans() and its xlim dropped.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt", xlim = c(13, 90)) +
  # `fun` is the current name of the deprecated `fun.y` argument.
  # Mean per age (default line style).
  geom_line(stat = "summary", fun = mean) +
  # 10th percentile per age.
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "red"
  ) +
  # Median per age.
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "green"
  ) +
  # 90th percentile per age.
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  )
```

# 12. 相关性

```r
# Correlation coefficient between age and friend_count.
cor(pf$age, pf$friend_count)
```

```r
# Open the help page for the correlation test.
?cor.test
```
```r
cor.test(x, ...)

## Default S3 method:
cor.test(x, y,
         alternative = c("two.sided", "less", "greater"),
         method = c("pearson", "kendall", "spearman"),
         exact = NULL, conf.level = 0.95, continuity = FALSE, ...)
```
```r
# Pearson correlation test, referring to the columns through `pf$`.
cor.test(pf$age, pf$friend_count, method = "pearson")
```

```r
# The same test, using with() so the column names can be written bare.
with(pf, cor.test(age, friend_count, method = "pearson"))
```

```
-0.02740737
```

# 13. 子集相关性

```r
# Pearson correlation test restricted to users aged 70 or younger.
# subset() evaluates its condition inside `pf`, so the bare column name is enough.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
```

```
-0.1712144
```

# 15. 创建散点图

```r
# Scatter plot of likes_received against www_likes_received.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point()
```

# 16. 强相关

```r
# Same scatter plot with a fitted linear-model line drawn in red.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red")
```

```r
# Pearson correlation test between the two variables plotted above.
# NOTE(review): the original call was cut off after `with(pf,`; completed in
# the same form as the other cor.test calls in these notes - confirm.
with(pf, cor.test(www_likes_received, likes_received, method = "pearson"))
```

# 18. 相关系数的更多注意事项

• 先安装对应的包，并查看示例数据的帮助
```r
# Install alr3 only when it is missing, so re-running the notes does not
# re-download the package every time; then attach it.
if (!requireNamespace("alr3", quietly = TRUE)) {
  install.packages("alr3")
}
library(alr3)
```

```r
# Load the Mitchell example data set and open its help page.
data(Mitchell)
?Mitchell
```

```r
# Scatter plot of Temp against Month.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()
```

# 19. 噪声散点图

```r
# Pearson correlation test between Month and Temp.
with(Mitchell, cor.test(Month, Temp, method = "pearson"))
```

# 20. 理解数据

```r
# Put an x-axis break every 12 months so each tick marks one year.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
```

# 21. 新的视角

```r
# Fold the month index into month-of-year (0-11) so all years overlap.
# Month %% 12 only takes the values 0..11, so the breaks are one per month;
# the previous seq(0, 203, 12) left a single tick (0) on this axis.
ggplot(Mitchell, aes(x = Month %% 12, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 11, 1))
```

# 22. 练习: 了解噪声：年龄到月龄

```r
# Age in years plus a fractional part derived from the birth month:
# the fraction added is 1 - dob_month / 12.
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
```

# 23. 练习: 带有月均值的年龄

```r
# List the columns of pf; age_with_months should now be among them.
names(pf)
```

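```
 [1] "userid"  "age"  "dob_day"  "dob_year"  "dob_month"
 [6] "gender"  "tenure"  "friend_count"  "friendships_initiated"  "likes"
[11] "likes_received"  "mobile_likes"  "mobile_likes_received"  "www_likes"  "www_likes_received"
[16] "age_with_months"
```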

• 练习：产生新的dataframe
```r
# Group by the fractional age, summarise friend_count per group
# (mean, median, group size), and sort the result by age_with_months.
age_month_groups <- group_by(pf, age_with_months)

pf.fc_by_age_months <- age_month_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
```

# 24. 练习: 条件均值中的噪声

```r
# Mean friend count by fractional age, for ages up to 70.
# The data must be the by-month summary: pf.fc_by_age is grouped by whole
# years and has no age_with_months column.
ggplot(
  subset(pf.fc_by_age_months, age_with_months <= 70),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
```

# 25. 平滑化条件均值

```r
# grid.arrange() comes from gridExtra; attach it before building the plots.
library(gridExtra)

# p1: mean friend count per year of age (ages below 71), plus a smoother.
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: the same at monthly resolution, plus a smoother.
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months <= 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: ages rounded to the nearest multiple of 5, then the yearly means
# averaged within each bin. `fun` is the current name of the deprecated
# `fun.y` argument.
p3 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count_mean)
) +
  geom_line(stat = "summary", fun = mean)

# Stack the three views in one column: monthly, yearly, 5-year bins.
grid.arrange(p2, p1, p3, ncol = 1)
```