#==========
Julia の修行をするときに,いろいろなプログラムを書き換えるのは有効な方法だ。
以下のプログラムを Julia に翻訳してみる。
重回帰分析(ステップワイズ変数選択)
http://aoki2.si.gunma-u.ac.jp/R/sreg.html
ファイル名: sreg.jl 関数名: sreg
翻訳するときに書いたメモ
翻訳のしがいのあるプログラムだった。
==========#
using DataFrames, Rmath, Statistics, Printf, Plots
function sreg(data; vname=[], stepwise = true, Pin = 0.05, Pout = 0.05,
predict = false, verbose = false)
formatpval(p) = p >= 0.0001 ? @sprintf("%.5f", p) : "< 0.0001"
function stepout(isw)
step += 1
syy = ss[q1]
lxi = lx[1:ni]
b1 = r[q1, lxi];
b = b1 .* sd[q1] ./ sd[lxi];
b0 = means[q1] - sum(b .* means[lxi])
sr = sum(b .* ss[lxi])
se = syy - sr
if se < 0 && abs(se / syy) < 1e-12
se = 0
end
idf1 = ni
idf2 = nc - ni - 1
vr = sr / idf1
ve = se / idf2
if ve == 0
f = fp9 = NaN
stde = tv = pp = rep(NaN, ni)
seb0 = tv0 = pp0 = NaN
error("従属変数の予測値と観測値が完全に一致します。分析の指定に間違いはないですか?")
else
f = vr / ve
fp9 = pf(f, idf1, idf2, false)
end
rhoy = se / syy
if ve != 0
rdiag = [r[lxi0, lxi0] for lxi0 in lxi];
seb = rdiag .* rhoy ./ idf2 .* var[q1] ./ var[lxi];
stde = sqrt.(seb);
tv = abs.(b ./ stde);
pp = pt.(tv, idf2, false) * 2;
temp = means[lxi] ./ sd[lxi];
seb0 = sqrt((1 / nc + transpose(temp) * r[lxi, lxi] * temp / (nc - 1)) * ve)
tv0 = abs(b0 / seb0)
pp0 = pt(tv0, idf2, false) * 2
end
append!(stde, seb0);
append!(tv, tv0);
append!(pp, pp0);
adjR2 = 1 - ve / syy * (nc - 1)
if rhoy != 0 && isw != 0
r2change = oldrhoy - rhoy
fchange = r2change * idf2 / rhoy
pchange = fchange < 0 ? NaN : pf(fchange, 1, idf2, false)
else
pchange = NaN
end
if isw != 0 && verbose
print("\n ***** ステップ $step ***** ")
println(" $(["編入", "除去"][isw])変数: $(vname[ip])")
end
multico = [r[lxi0, lxi0] for lxi0 in lxi]
R2 = 1 - rhoy
R = sqrt(R2)
loglik = 0.5 * (sum( - nc * (log(2 * pi) + 1 - log(nc) + log(se))))
AIC = 2 * ni + 4 - 2 * loglik
results = merge(results, Dict(:lxi => lxi,
:variables => vname[lxi],
:coefficient => vcat(b, b0),
:standarderror => stde,
:tvalue => tv,
:tPvalue => pp,
:standardizedcoefficient => b1,
:tolerance => 1 ./ multico,
:VIF => multico,
:SS => vcat(sr, se, syy),
:df => vcat(ni, idf2, nc - 1),
:MS => vcat(vr, ve),
:Fvalue => f,
:FPvalue => fp9,
:R2 => R2,
:adjR2 => adjR2,
:R => R,
:loglik => loglik,
:AIC => AIC,
:ve => ve))
if stepwise && !isnan(pchange)
results = merge(results, Dict(:r2change => r2change,
:fchange => fchange,
:df1 => 1,
:df2 => idf2,
:pchange => pchange))
end
oldrhoy = rhoy
end
function fmax()
kouho = 1:q
if ni > 0
suf = trues(q)
[suf[i] = false for i in lx[1:ni]]
kouho = (1:q)[suf]
end
rdiag = [r[i, i] for i in kouho]
temp = 1 ./ (rdiag .* r[q1, q1] ./ (r[q1, kouho] .^ 2) .- 1)
ip = argmax(temp)
return temp[ip], kouho[ip]
end
function fmin()
kouho = lx[1:ni]
rdiag = [r[i, i] for i in kouho]
temp = r[q1, kouho] .^ 2 ./ (rdiag .* r[q1, q1])
ip = argmin(temp)
return temp[ip], lx[ip]
end
function sweepsreg!(r, ip, q1)
ap = r[ip, ip]
abs(ap) > EPSINV || error("正規方程式の係数行列が特異行列です")
for i in 1:q1
if i != ip
temp = r[ip, i] / ap
for j in 1:q1
if j != ip
r[j, i] -= r[j, ip] * temp
end
end
end
end
r[:, ip] /= ap
r[ip, :] /= -ap
r[ip, ip] = 1 / ap
end
function printsreg()
variable = results[:variables]
@printf("\n%-11s %10s %10s %9s %9s %10s %9s\n",
"", "Estimate", "Std. Error", "t value", "Pr(>|t|)", "Std. coef.", "Tolerance")
for i in 1:length(variable)
@printf("%-11s %10.6f %10.6f %9.6f %9.6f %10.6f %9.6f\n",
variable[i], results[:coefficient][i], results[:standarderror][i],
results[:tvalue][i], results[:tPvalue][i],
results[:standardizedcoefficient][i],
results[:tolerance][i])
end
@printf("%-11s %10.6f %10.6f %9.6f %9.6f\n\n",
"(Intercept)", results[:coefficient][end], results[:standarderror][end],
results[:tvalue][end], results[:tPvalue][end])
end
function additional()
@printf("Residual standard error: %.5f on %d degrees of freedom\n",
sqrt(results[:ve]), results[:df][2])
@printf("Multiple correlation coefficient: %.5f\n",
results[:R])
@printf("Multiple R-squared: %.5f, Adjusted R-squared: %.5f\n",
results[:R2], results[:adjR2])
@printf("log likelifood: %.5f, AIC: %.5f\n",
results[:loglik], results[:AIC])
end
function printanova()
variable = results[:variables]
@printf("\nANOVA Table\n%11s %10s %4s %9s %10s %8s\n",
"", "SS", "df", "MS", "F value", "p value")
@printf("%-11s %10.6f %4d %9.6f %10.6f %8.6f\n",
"Regression", results[:SS][1], results[:df][1], results[:MS][1], results[:Fvalue], results[:FPvalue])
@printf("%-11s %10.6f %4d %9.6f\n",
"Residual", results[:SS][2], results[:df][2], results[:MS][2])
@printf("%-11s %10.6f %4d\n",
"Total", results[:SS][3], results[:df][3])
end
function procpredict(predict)
lxi = results[:lxi]
ve = results[:ve]
results[:coefficient]
constant = results[:coefficient][end]
b = results[:coefficient][1:end-1]
y = data[:, size(data, 2)]
data = data[:, lxi]
est = data * b .+ constant
res = y .- est
r = r[lxi, lxi]
means = results[:mean][lxi]
stds = results[:sd][lxi]
nc, q = size(data)
factor = sqrt(nc / (nc - 1))
for i in 1:q
data[:, i] = ((data[:, i] .- means[i]) ./ stds[i]) .* factor
end
stres = [transpose(data[i, :]) * r * data[i, :] for i = 1:nc]
stres = res ./ sqrt.(ve .* (1 .- (stres .+ 1) ./ nc))
if predict
@printf("\npredict\n%5s %12s %12s %12s %12s\n",
"i", "observed", "predicted", "residuals", "std. resid.")
for i = 1:nc
@printf("%5d:%12.6f %12.6f %12.6f %12.6f\n",
i, y[i], est[i], res[i], stres[i])
end
end
results = merge(results, Dict(:observed => y, :predicted => est,
:residuals => res, :standardizedresiduals => stres))
end
EPSINV = 1e-6
MULTICO = 10
Pout >= Pin || (Pout = Pin)
step = 0
oldrhoy = 1
ve = 0
ip = 0
lxi = []
nc, q1 = size(data)
q = q1 - 1
(stepwise == true || nc > q1) || error("独立変数の個数よりデータ数が多くなければなりません")
q > 1 || (stepwise = false)
length(vname) != 0 || (vname = vcat(["x" * string(i) for i = 1:q], "y"))
means = vec(mean(data, dims=1));
sd = vec(std(data, dims=1));
var = sd .^ 2;
ss = (cov(data) .* (nc - 1))[:, q1];
r = cor(data);
results = Dict(:samplesize => nc, :dependentvariable => vname[q1],
:independentvariable => vname[1:q],
:mean => means, :var => var, :sd => sd, :r => r)
stde = similar(means);
tv = similar(means);
pp = similar(means);
ni = 0
if stepwise == false
for ip in 1:q
sweepsreg!(r, ip, q1)
end
lx = 1:q
ni = q
stepout(0)
else
if verbose
println("\n変数編入基準 Pin: $Pin")
println("変数除去基準 Pout: $Pout\n")
end
lx = zeros(Int, q)
ni = 0
while ni < min(q, nc - 2)
ansmax, ip = fmax()
P = (nc - ni - 2) * ansmax # [1]
P = pf(P, 1, nc - ni - 2, false)
verbose && print("編入候補変数: $(vname[ip]) P : $(formatpval(P))")
if P > Pin
verbose && println(" 編入されませんでした")
break
end
verbose && println(" 編入されました")
ni = ni + 1
lx[ni] = ip
sweepsreg!(r, ip, q1)
stepout(1)
printsreg()
while true
ansmin, ip = fmin()
P = (nc - ni - 1) * ansmin # [1]
P = pf(P, 1, nc - ni - 1, false)
verbose && print("除去候補変数: $(vname[ip]) P :$(formatpval(P))")
if P <= Pout
verbose && println(" 除去されませんでした")
break
else
verbose && println(" 除去されました")
lx = lx[ - which(lx == ip)]
ni = ni - 1
sweepsreg!(r, ip, q1)
stepout(2)
printsreg()
end
end
end
end
ni == 0 && error("条件( Pin < $Pin)を満たす独立変数がありません")
println("\n***** 最終結果 *****")
printsreg()
additional()
printanova()
procpredict(predict)
end
using RDatasets
iris = dataset("datasets", "iris");
a = iris[:, 1:4];
result = sreg(Array(a), stepwise=true,
vname=["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"],
predict=true, verbose=true)
#==========
変数編入基準 Pin: 0.05
変数除去基準 Pout: 0.05
編入候補変数: PetalLength P : < 0.0001 編入されました
1 lxi = [3]
***** ステップ 1 ***** 編入変数: PetalLength
Estimate Std. Error t value Pr(>|t|) Std. coef. Tolerance
PetalLength 0.415755 0.009582 43.387237 0.000000 0.962865 1.000000
(Intercept) -0.363076 0.039762 9.131221 0.000000
除去候補変数: PetalLength P :< 0.0001 除去されませんでした
編入候補変数: SepalWidth P : 0.02014 編入されました
1 lxi = [3, 2]
***** ステップ 2 ***** 編入変数: SepalWidth
Estimate Std. Error t value Pr(>|t|) Std. coef. Tolerance
PetalLength 0.426270 0.010447 40.803902 0.000000 0.987217 0.816439
SepalWidth 0.099396 0.042310 2.349216 0.020144 0.056837 0.816439
(Intercept) -0.706478 0.151334 4.668332 0.000007
除去候補変数: SepalWidth P :0.02014 除去されませんでした
編入候補変数: SepalLength P : < 0.0001 編入されました
1 lxi = [3, 2, 1]
***** ステップ 3 ***** 編入変数: SepalLength
Estimate Std. Error t value Pr(>|t|) Std. coef. Tolerance
PetalLength 0.524083 0.024491 21.398708 0.000000 1.213746 0.132314
SepalWidth 0.222829 0.048938 4.553279 0.000011 0.127419 0.543585
SepalLength -0.207266 0.047506 4.362929 0.000024 -0.225166 0.159822
(Intercept) -0.240307 0.178370 1.347243 0.179989
除去候補変数: SepalLength P :< 0.0001 除去されませんでした
***** 最終結果 *****
Estimate Std. Error t value Pr(>|t|) Std. coef. Tolerance
PetalLength 0.524083 0.024491 21.398708 0.000000 1.213746 0.132314
SepalWidth 0.222829 0.048938 4.553279 0.000011 0.127419 0.543585
SepalLength -0.207266 0.047506 4.362929 0.000024 -0.225166 0.159822
(Intercept) -0.240307 0.178370 1.347243 0.179989
Residual standard error: 0.19197 on 146 degrees of freedom
Multiple correlation coefficient: 0.96843
Multiple R-squared: 0.93785, Adjusted R-squared: 0.93657
log likelifood: 36.75109, AIC: -63.50218
ANOVA Table
SS df MS F value p value
Regression 81.189636 3 27.063212 734.388537 0.000000
Residual 5.380298 146 0.036851
Total 86.569933 149
predict
i observed predicted residuals std. resid.
1: 0.200000 0.216252 -0.016252 -0.085561
2: 0.200000 0.146291 0.053709 0.283384
3: 0.200000 0.179901 0.020099 0.105791
4: 0.200000 0.283162 -0.083162 -0.438018
5: 0.200000 0.259261 -0.059261 -0.312245
6: 0.400000 0.400428 -0.000428 -0.002269
7: 0.300000 0.297602 0.002398 0.012656
8: 0.200000 0.267104 -0.067104 -0.352773
9: 0.200000 0.227641 -0.027641 -0.146141
10: 0.100000 0.220982 -0.120982 -0.636584
:
==========#
※コメント投稿者のブログIDはブログ作成者のみに通知されます