2 つ以上の変数の組み合わせごとに,別の変数の平均値を求めるという例題
> # Load the data set from the file "aq"; its first line holds the column names.
> aq <- read.csv("aq", header = TRUE)
> # Within each Month, flag the rows whose Temp is above that month's mean Temp:
> # HighTemp is 1 for such rows and 0 otherwise.
> aq <- ddply(
+   aq, .(Month), transform,
+   HighTemp = ifelse(Temp - mean(Temp) > 0, 1, 0)
+ )
> # f1: plyr approach. Split aq by every (Month, HighTemp) combination and
> # return a data frame with one row per combination holding the mean Wind
> # in the column AveWind.
> # Fix: the original read from `aq1`, which is never created in the visible
> # session; the data prepared above, and used by f2..f7, is `aq`.
> f1 <- function() {
+   ddply(aq, .(Month, HighTemp), summarize, AveWind = mean(Wind))
+ }
> (ans <- f1())
Month HighTemp AveWind
1 5 0 12.800000
2 5 1 10.518750
3 6 0 10.383333
4 6 1 10.091667
5 7 0 9.820000
6 7 1 8.118750
7 8 0 10.093750
8 8 1 7.406667
9 9 0 11.393750
10 9 1 8.792857
>
> # f2: base R approach using by(). The group means come back as a
> # HighTemp x Month table; c() flattens it column by column, which lines up
> # with the hard-coded Month (5..9) and HighTemp (0, 1) label columns.
> f2 <- function() {
+   wind_means <- by(aq$Wind, list(aq$HighTemp, aq$Month), mean)
+   data.frame(
+     Month = rep(5:9, each = 2),
+     HighTemp = rep(0:1, 5),
+     AveWind = c(wind_means)
+   )
+ }
> # identical(ans, f2())
> all.equal(ans, f2())
[1] TRUE
>
> # f3: split() the Wind values by (HighTemp, Month), take the mean of each
> # piece with sapply(), and drop the group names before building the frame.
> f3 <- function() {
+   wind_groups <- split(aq$Wind, list(aq$HighTemp, aq$Month))
+   wind_means <- sapply(wind_groups, mean)
+   data.frame(
+     Month = rep(5:9, each = 2),
+     HighTemp = rep(0:1, 5),
+     AveWind = unname(wind_means)
+   )
+ }
> # identical(ans, f3())
> all.equal(ans, f3())
[1] TRUE
>
> # f4: aggregate() approach. aggregate() returns its columns in a different
> # order, so they are rearranged with [c(2, 1, 3)] to match the other
> # results (not needed otherwise).
> f4 <- function() {
+   wind_means <- aggregate(aq$Wind, list(aq$HighTemp, aq$Month), mean)
+   wind_means <- wind_means[c(2, 1, 3)]
+   names(wind_means) <- c("Month", "HighTemp", "AveWind")
+   wind_means
+ }
> identical(ans, f4())
[1] TRUE
>
> # f5: f4 is slow because of the list() grouping, so avoid it with two
> # nested sapply() calls: split by Month first, then by HighTemp inside
> # each month.
> f5 <- function() {
+   by_month <- split(aq, aq$Month)
+   mean_by_temp <- function(d) sapply(split(d$Wind, d$HighTemp), mean)
+   wind_means <- c(sapply(by_month, mean_by_temp))
+   data.frame(
+     Month = rep(5:9, each = 2),
+     HighTemp = rep(0:1, 5),
+     AveWind = wind_means,
+     row.names = NULL
+   )
+ }
> # identical(ans, f5())
> all.equal(ans, f5())
[1] TRUE
>
> # f6: another workaround -- use sapply() in place of for loops over the
> # hard-coded Month (5..9) and HighTemp (0, 1) values.
> f6 <- function() {
+   wind_means <- c(sapply(5:9, function(month) {
+     sapply(0:1, function(high) {
+       in_cell <- aq$Month == month & aq$HighTemp == high
+       mean(aq[in_cell, "Wind"])
+     })
+   }))
+   data.frame(
+     Month = rep(5:9, each = 2),
+     HighTemp = rep(0:1, 5),
+     AveWind = wind_means,
+     row.names = NULL
+   )
+ }
> # identical(ans, f6())
> all.equal(ans, f6())
[1] TRUE
>
> # f7: yet another workaround -- build the value combinations with
> # expand.grid() and compute one mean per row of that table.
> f7 <- function() {
+   combos <- expand.grid(0:1, 5:9)  # column 1: HighTemp, column 2: Month
+   wind_means <- c(apply(combos, 1, function(pair) {
+     in_cell <- aq$Month == pair[2] & aq$HighTemp == pair[1]
+     mean(aq[in_cell, "Wind"])
+   }))
+   data.frame(
+     Month = combos[, 2],
+     HighTemp = combos[, 1],
+     AveWind = wind_means,
+     row.names = NULL
+   )
+ }
> # identical(ans, f7())
> all.equal(ans, f7())
[1] TRUE
>
> # ddply is not particularly fast; using sapply() in place of for is about
> # 1.5 times faster.
> # aggregate is slow.
> benchmark(
+   f1(), f2(), f3(), f4(), f5(), f6(), f7(),
+   columns = c("test", "replications", "elapsed", "relative", "user.self", "sys.self"),
+   replications = 1000,
+   order = NULL
+ )
test replications elapsed relative user.self sys.self
1 f1() 1000 9.691 1.533 9.367 0.331
2 f2() 1000 15.140 2.395 14.905 0.256
3 f3() 1000 13.162 2.082 13.076 0.167
4 f4() 1000 43.494 6.881 41.944 0.580
5 f5() 1000 16.176 2.559 15.947 0.302
6 f6() 1000 6.321 1.000 5.998 0.362
7 f7() 1000 6.586 1.042 6.273 0.354