dak ブログ

python、rubyなどのプログラミング、MySQL、サーバーの設定などの備忘録。レゴの写真も。

BigQuery で主成分分析

2024-03-31 23:56:24 | BigQuery
BigQuery で主成分分析を行う方法のメモ。 ここでは、5次元のデータを2次元に次元圧縮します。
■テーブル作成・データ登録
drop table if exists dataset.test_data;

create table dataset.test_data (
  id      string,
  values  array
);

insert into dataset.test_data values ('id_1', [1.0, 0.0, 1.0, 3.0, 0.0]);
insert into dataset.test_data values ('id_2', [1.0, 2.0, 1.0, 1.0, 1.0]);
insert into dataset.test_data values ('id_3', [0.0, 2.0, 0.0, 1.0, 1.0]);
insert into dataset.test_data values ('id_4', [1.0, 0.0, 3.0, 2.0, 3.0]);
insert into dataset.test_data values ('id_5', [0.0, 0.0, 0.0, 2.0, 0.0]);
insert into dataset.test_data values ('id_6', [0.0, 0.0, 2.0, 2.0, 4.0]);
insert into dataset.test_data values ('id_7', [1.0, 2.0, 1.0, 1.0, 0.0]);
insert into dataset.test_data values ('id_8', [0.0, 2.0, 3.0, 2.0, 2.0]);
insert into dataset.test_data values ('id_9', [1.0, 2.0, 0.0, 2.0, 0.0]);
insert into dataset.test_data values ('id_10', [1.0, 0.0, 3.0, 1.0, 0.0]);
■モデル作成
create or replace model dataset.test_model
options (
  model_type = 'pca'
  , num_principal_components = 2
  , scale_features = false
  , pca_solver = 'full'
)
as (
  select
    values[0] as f0
    , values[1] as f1
    , values[2] as f2
    , values[3] as f3
    , values[4] as f4
    , values[5] as f5
  from
    dataset.test_data
);
■適用
select
  *
from
  ml.predict(model dataset.test_model,
  (
    select
      id
      , values[0] as f0
      , values[1] as f1
      , values[2] as f2
      , values[3] as f3
      , values[4] as f4
      , values[5] as f5
      , values[6] as f6
    from
      dataset.test_data
  ),
  struct(true as keep_original_columns)
);

[{
  "principal_component_1": "-0.69271374229229665",
  "principal_component_2": "1.6535767369786081",
  "id": "id_1",
  "f0": "1.0",
  "f1": "0.0",
  "f2": "1.0",
  "f3": "3.0",
  "f4": "0.0"
}, {
  "principal_component_1": "0.31854554694562942",
  "principal_component_2": "1.5448567976194651",
  "id": "id_10",
  "f0": "1.0",
  "f1": "0.0",
  "f2": "3.0",
  "f3": "1.0",
  "f4": "0.0"
}, {
  "principal_component_1": "-0.67889365782004729",
  "principal_component_2": "-0.94631539186280522",
  "id": "id_2",
  "f0": "1.0",
  "f1": "2.0",
  "f2": "1.0",
  "f3": "1.0",
  "f4": "1.0"
}, {
  "principal_component_1": "-1.2041817682899503",
  "principal_component_2": "-1.3374039692570403",
  "id": "id_3",
  "f0": "0.0",
  "f1": "2.0",
  "f2": "0.0",
  "f3": "1.0",
  "f4": "1.0"
}, {
  "principal_component_1": "2.6487614707519027",
  "principal_component_2": "0.36207867152786921",
  "id": "id_4",
  "f0": "1.0",
  "f1": "0.0",
  "f2": "3.0",
  "f3": "2.0",
  "f4": "3.0"
}, {
  "principal_component_1": "-1.3025753320545592",
  "principal_component_2": "0.98632696171180989",
  "id": "id_5",
  "f0": "0.0",
  "f1": "0.0",
  "f2": "0.0",
  "f3": "2.0",
  "f4": "0.0"
}, {
  "principal_component_1": "2.8720208417866377",
  "principal_component_2": "-0.5153230138544187",
  "id": "id_6",
  "f0": "0.0",
  "f1": "0.0",
  "f2": "2.0",
  "f3": "2.0",
  "f4": "4.0"
}, {
  "principal_component_1": "-1.4274411393246851",
  "principal_component_2": "-0.46000228387475217",
  "id": "id_7",
  "f0": "1.0",
  "f1": "2.0",
  "f2": "1.0",
  "f3": "1.0",
  "f4": "0.0"
}, {
  "principal_component_1": "1.3995485642410159",
  "principal_component_2": "-0.88215219479355511",
  "id": "id_8",
  "f0": "0.0",
  "f1": "2.0",
  "f2": "3.0",
  "f3": "2.0",
  "f4": "2.0"
}, {
  "principal_component_1": "-1.9330707839436485",
  "principal_component_2": "-0.40564231419518071",
  "id": "id_9",
  "f0": "1.0",
  "f1": "2.0",
  "f2": "0.0",
  "f3": "2.0",
  "f4": "0.0"
}]
■固有値など
select
  *
from
  ml.principal_component_info(model dataset.test_model)
;

[{
  "principal_component_id": "0",
  "eigenvalue": "3.020722723318388",
  "explained_variance_ratio": "0.545913745178022",
  "cumulative_explained_variance_ratio": "0.545913745178022"
}, {
  "principal_component_id": "1",
  "eigenvalue": "1.147655631054062",
  "explained_variance_ratio": "0.20740764416639679",
  "cumulative_explained_variance_ratio": "0.75332138934441883"
}]