裏 RjpWiki

Julia ときどき R, Python によるコンピュータプログラム,コンピュータ・サイエンス,統計学

Julia に翻訳--146 重回帰分析,ステップワイズ変数選択

2021年03月30日 | ブログラミング

#==========
Julia の修行をするときに,いろいろなプログラムを書き換えるのは有効な方法だ。
以下のプログラムを Julia に翻訳してみる。

重回帰分析(ステップワイズ変数選択)
http://aoki2.si.gunma-u.ac.jp/R/sreg.html

ファイル名: sreg.jl  関数名: sreg

翻訳するときに書いたメモ

翻訳のしがいのあるプログラムだった。

==========#

using DataFrames, Rmath, Statistics, Printf, Plots

function sreg(data; vname=[], stepwise = true, Pin = 0.05, Pout = 0.05,
              predict = false, verbose = false)

    formatpval(p) = p >= 0.0001 ? @sprintf("%.5f", p) : "< 0.0001"

    function stepout(isw)
        step += 1
        syy = ss[q1]
        lxi = lx[1:ni]
        b1 = r[q1, lxi];
        b = b1 .* sd[q1] ./ sd[lxi];
        b0 = means[q1] - sum(b .* means[lxi])
        sr = sum(b .* ss[lxi])
        se = syy - sr
        if se < 0 && abs(se / syy) < 1e-12
            se = 0
        end
        idf1 = ni
        idf2 = nc - ni - 1
        vr = sr / idf1
        ve = se / idf2
        if ve == 0
            f = fp9 = NaN
            stde = tv = pp = rep(NaN, ni)
            seb0 = tv0 = pp0 = NaN
            error("従属変数の予測値と観測値が完全に一致します。分析の指定に間違いはないですか?")
        else
            f = vr / ve
            fp9 = pf(f, idf1, idf2, false)
        end
        rhoy = se / syy
        if ve != 0
            rdiag = [r[lxi0, lxi0] for lxi0 in lxi];
            seb = rdiag .* rhoy ./ idf2 .* var[q1] ./ var[lxi];
            stde = sqrt.(seb);
            tv = abs.(b ./ stde);
            pp = pt.(tv, idf2, false) * 2;
            temp = means[lxi] ./ sd[lxi];
            seb0 = sqrt((1 / nc + transpose(temp) * r[lxi, lxi] * temp / (nc - 1)) * ve)
            tv0 = abs(b0 / seb0)
            pp0 = pt(tv0, idf2, false) * 2
        end
        append!(stde, seb0);
        append!(tv, tv0);
        append!(pp, pp0);
        adjR2 = 1 - ve / syy * (nc - 1)
        if rhoy != 0 && isw != 0
            r2change = oldrhoy - rhoy
            fchange = r2change * idf2 / rhoy
            pchange = fchange < 0 ? NaN : pf(fchange, 1, idf2, false)
        else
            pchange = NaN
        end
        if isw != 0 && verbose
            print("\n ***** ステップ $step *****   ")
            println("  $(["編入", "除去"][isw])変数: $(vname[ip])")
        end
        multico = [r[lxi0, lxi0] for lxi0 in lxi]
        R2 = 1 - rhoy
        R = sqrt(R2)
        loglik = 0.5 * (sum( - nc * (log(2 * pi) + 1 - log(nc) + log(se))))
        AIC = 2 * ni + 4 - 2 * loglik
        results = merge(results, Dict(:lxi => lxi,
                    :variables => vname[lxi],
                    :coefficient => vcat(b, b0),
                    :standarderror => stde,
                    :tvalue => tv,
                    :tPvalue => pp,
                    :standardizedcoefficient => b1,
                    :tolerance => 1 ./ multico,
                    :VIF => multico,
                    :SS => vcat(sr, se, syy),
                    :df => vcat(ni, idf2, nc - 1),
                    :MS => vcat(vr, ve),
                    :Fvalue => f,
                    :FPvalue => fp9,
                    :R2 => R2,
                    :adjR2 => adjR2,
                    :R => R,
                    :loglik => loglik,
                    :AIC => AIC,
                    :ve => ve))
        if stepwise && !isnan(pchange)
            results = merge(results, Dict(:r2change => r2change,
                    :fchange => fchange,
                    :df1 => 1,
                    :df2 => idf2,
                    :pchange => pchange))
        end
        oldrhoy = rhoy
    end

    function fmax()
        kouho = 1:q
        if ni > 0
            suf = trues(q)
            [suf[i] = false for i in lx[1:ni]]
            kouho = (1:q)[suf]
        end
        rdiag = [r[i, i] for i in kouho]
        temp = 1 ./ (rdiag .* r[q1, q1] ./ (r[q1, kouho] .^ 2) .- 1)
        ip = argmax(temp)
        return temp[ip], kouho[ip]
    end

    function fmin()
        kouho = lx[1:ni]
        rdiag = [r[i, i] for i in kouho]
        temp = r[q1, kouho] .^ 2 ./ (rdiag .* r[q1, q1])
        ip = argmin(temp)
        return temp[ip], lx[ip]
    end

    function sweepsreg!(r, ip, q1)
        ap = r[ip, ip]
        abs(ap) > EPSINV || error("正規方程式の係数行列が特異行列です")
        for i in 1:q1
            if i != ip
                temp = r[ip, i] / ap
                for j in 1:q1
                    if j != ip
                        r[j, i] -= r[j, ip] * temp
                    end
                end
            end
        end
        r[:, ip] /= ap
        r[ip, :] /= -ap
        r[ip, ip] = 1 / ap
    end

    function printsreg()
        variable = results[:variables]
        @printf("\n%-11s  %10s  %10s  %9s  %9s  %10s  %9s\n",
                 "", "Estimate", "Std. Error", "t value", "Pr(>|t|)", "Std. coef.", "Tolerance")
        for i in 1:length(variable)
            @printf("%-11s  %10.6f  %10.6f  %9.6f  %9.6f  %10.6f  %9.6f\n",
                    variable[i], results[:coefficient][i], results[:standarderror][i],
                    results[:tvalue][i], results[:tPvalue][i],
                    results[:standardizedcoefficient][i],
                    results[:tolerance][i])
        end
        @printf("%-11s  %10.6f  %10.6f  %9.6f  %9.6f\n\n",
                "(Intercept)", results[:coefficient][end], results[:standarderror][end],
                results[:tvalue][end], results[:tPvalue][end])
    end

    function additional()
        @printf("Residual standard error: %.5f on %d degrees of freedom\n",
                sqrt(results[:ve]), results[:df][2])
        @printf("Multiple correlation coefficient: %.5f\n",
                results[:R])
        @printf("Multiple R-squared: %.5f, Adjusted R-squared: %.5f\n",
                results[:R2], results[:adjR2])
        @printf("log likelifood: %.5f,  AIC: %.5f\n",
                results[:loglik], results[:AIC])
    end

    function printanova()
        variable = results[:variables]
        @printf("\nANOVA Table\n%11s  %10s %4s  %9s  %10s  %8s\n",
                "", "SS", "df", "MS", "F value", "p value")
        @printf("%-11s  %10.6f %4d  %9.6f  %10.6f  %8.6f\n",
                "Regression", results[:SS][1], results[:df][1], results[:MS][1], results[:Fvalue], results[:FPvalue])
        @printf("%-11s  %10.6f %4d  %9.6f\n",
                "Residual", results[:SS][2], results[:df][2], results[:MS][2])
        @printf("%-11s  %10.6f %4d\n",
                "Total", results[:SS][3], results[:df][3])
    end

    function procpredict(predict)
        lxi = results[:lxi]
        ve = results[:ve]
        results[:coefficient]
        constant = results[:coefficient][end]
        b = results[:coefficient][1:end-1]
        y = data[:, size(data, 2)]
        data = data[:, lxi]
        est = data * b .+ constant
        res = y .- est
        r = r[lxi, lxi]
        means = results[:mean][lxi]
        stds = results[:sd][lxi]
        nc, q = size(data)
        factor = sqrt(nc / (nc - 1))
        for i in 1:q
            data[:, i] = ((data[:, i] .- means[i]) ./ stds[i]) .* factor
        end
        stres = [transpose(data[i, :]) * r * data[i, :] for i = 1:nc]
        stres = res ./ sqrt.(ve .* (1 .- (stres .+ 1) ./ nc))
        if predict
            @printf("\npredict\n%5s %12s  %12s  %12s  %12s\n",
                    "i", "observed", "predicted", "residuals", "std. resid.")
            for i = 1:nc
                @printf("%5d:%12.6f  %12.6f  %12.6f  %12.6f\n",
                        i, y[i], est[i], res[i], stres[i])
            end
        end
        results = merge(results, Dict(:observed => y, :predicted => est,
                    :residuals => res, :standardizedresiduals => stres))
    end

    EPSINV = 1e-6
    MULTICO = 10
    Pout >= Pin || (Pout = Pin)
    step = 0
    oldrhoy = 1
    ve = 0
    ip = 0
    lxi = []
    nc, q1 = size(data)
    q = q1 - 1
    (stepwise == true || nc > q1) || error("独立変数の個数よりデータ数が多くなければなりません")
    q > 1 || (stepwise = false)
    length(vname) != 0 || (vname = vcat(["x" * string(i) for i = 1:q], "y"))
    means = vec(mean(data, dims=1));
    sd = vec(std(data, dims=1));
    var = sd .^ 2;
    ss = (cov(data) .* (nc - 1))[:, q1];
    r = cor(data);
    results = Dict(:samplesize => nc, :dependentvariable => vname[q1],
                   :independentvariable => vname[1:q],
                   :mean => means, :var => var, :sd => sd, :r => r)
    stde = similar(means);
    tv = similar(means);
    pp = similar(means);
    ni = 0
    if stepwise == false
        for ip in 1:q
            sweepsreg!(r, ip, q1)
        end
        lx = 1:q
        ni = q
        stepout(0)
    else
        if verbose
            println("\n変数編入基準    Pin:  $Pin")
            println("変数除去基準    Pout: $Pout\n")
        end
        lx = zeros(Int, q)
        ni = 0
        while ni < min(q, nc - 2)
            ansmax, ip = fmax()
            P = (nc - ni - 2) * ansmax # [1]
            P = pf(P, 1, nc - ni - 2, false)
            verbose && print("編入候補変数: $(vname[ip])   P : $(formatpval(P))")
            if P > Pin
                verbose && println("  編入されませんでした")
                break
            end
            verbose && println("  編入されました")
            ni = ni + 1
            lx[ni] = ip
            sweepsreg!(r, ip, q1)
            stepout(1)
            printsreg()
            while true
                ansmin, ip = fmin()
                P = (nc - ni - 1) * ansmin # [1]
                P = pf(P, 1, nc - ni - 1, false)
                verbose && print("除去候補変数: $(vname[ip])   P :$(formatpval(P))")
                if P <= Pout
                    verbose && println("  除去されませんでした")
                    break
                else
                    verbose && println("  除去されました")
                    lx = lx[ - which(lx == ip)]
                    ni = ni - 1
                    sweepsreg!(r, ip, q1)
                    stepout(2)
                    printsreg()
                end
            end
        end
    end

    ni == 0 && error("条件( Pin < $Pin)を満たす独立変数がありません")
    println("\n***** 最終結果 *****")
    printsreg()
    additional()
    printanova()
    procpredict(predict)
end

using RDatasets
iris = dataset("datasets", "iris");
a = iris[:, 1:4];
result = sreg(Array(a), stepwise=true,
    vname=["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"],
    predict=true, verbose=true)

#==========
    変数編入基準    Pin:  0.05
    変数除去基準    Pout: 0.05

    編入候補変数: PetalLength   P : < 0.0001  編入されました
    1 lxi = [3]

     ***** ステップ 1 *****     編入変数: PetalLength

                   Estimate  Std. Error    t value   Pr(>|t|)  Std. coef.  Tolerance
    PetalLength    0.415755    0.009582  43.387237   0.000000    0.962865   1.000000
    (Intercept)   -0.363076    0.039762   9.131221   0.000000

    除去候補変数: PetalLength   P :< 0.0001  除去されませんでした
    編入候補変数: SepalWidth   P : 0.02014  編入されました
    1 lxi = [3, 2]

     ***** ステップ 2 *****     編入変数: SepalWidth

                   Estimate  Std. Error    t value   Pr(>|t|)  Std. coef.  Tolerance
    PetalLength    0.426270    0.010447  40.803902   0.000000    0.987217   0.816439
    SepalWidth     0.099396    0.042310   2.349216   0.020144    0.056837   0.816439
    (Intercept)   -0.706478    0.151334   4.668332   0.000007

    除去候補変数: SepalWidth   P :0.02014  除去されませんでした
    編入候補変数: SepalLength   P : < 0.0001  編入されました
    1 lxi = [3, 2, 1]

     ***** ステップ 3 *****     編入変数: SepalLength

                   Estimate  Std. Error    t value   Pr(>|t|)  Std. coef.  Tolerance
    PetalLength    0.524083    0.024491  21.398708   0.000000    1.213746   0.132314
    SepalWidth     0.222829    0.048938   4.553279   0.000011    0.127419   0.543585
    SepalLength   -0.207266    0.047506   4.362929   0.000024   -0.225166   0.159822
    (Intercept)   -0.240307    0.178370   1.347243   0.179989

    除去候補変数: SepalLength   P :< 0.0001  除去されませんでした

    ***** 最終結果 *****

                   Estimate  Std. Error    t value   Pr(>|t|)  Std. coef.  Tolerance
    PetalLength    0.524083    0.024491  21.398708   0.000000    1.213746   0.132314
    SepalWidth     0.222829    0.048938   4.553279   0.000011    0.127419   0.543585
    SepalLength   -0.207266    0.047506   4.362929   0.000024   -0.225166   0.159822
    (Intercept)   -0.240307    0.178370   1.347243   0.179989

    Residual standard error: 0.19197 on 146 degrees of freedom
    Multiple correlation coefficient: 0.96843
    Multiple R-squared: 0.93785, Adjusted R-squared: 0.93657
    log likelifood: 36.75109,  AIC: -63.50218

    ANOVA Table
                         SS   df         MS     F value   p value
    Regression    81.189636    3  27.063212  734.388537  0.000000
    Residual       5.380298  146   0.036851
    Total         86.569933  149

    predict
        i     observed     predicted     residuals   std. resid.
        1:    0.200000      0.216252     -0.016252     -0.085561
        2:    0.200000      0.146291      0.053709      0.283384
        3:    0.200000      0.179901      0.020099      0.105791
        4:    0.200000      0.283162     -0.083162     -0.438018
        5:    0.200000      0.259261     -0.059261     -0.312245
        6:    0.400000      0.400428     -0.000428     -0.002269
        7:    0.300000      0.297602      0.002398      0.012656
        8:    0.200000      0.267104     -0.067104     -0.352773
        9:    0.200000      0.227641     -0.027641     -0.146141
       10:    0.100000      0.220982     -0.120982     -0.636584
                :
==========#
コメント
  • X
  • Facebookでシェアする
  • はてなブックマークに追加する
  • LINEでシェアする

PVアクセスランキング にほんブログ村

PVアクセスランキング にほんブログ村