BigQuery で主成分分析を行う方法のメモ。 ここでは、5次元のデータを2次元に次元圧縮します。
■テーブル作成・データ登録
drop table if exists dataset.test_data; create table dataset.test_data ( id string, values array ); insert into dataset.test_data values ('id_1', [1.0, 0.0, 1.0, 3.0, 0.0]); insert into dataset.test_data values ('id_2', [1.0, 2.0, 1.0, 1.0, 1.0]); insert into dataset.test_data values ('id_3', [0.0, 2.0, 0.0, 1.0, 1.0]); insert into dataset.test_data values ('id_4', [1.0, 0.0, 3.0, 2.0, 3.0]); insert into dataset.test_data values ('id_5', [0.0, 0.0, 0.0, 2.0, 0.0]); insert into dataset.test_data values ('id_6', [0.0, 0.0, 2.0, 2.0, 4.0]); insert into dataset.test_data values ('id_7', [1.0, 2.0, 1.0, 1.0, 0.0]); insert into dataset.test_data values ('id_8', [0.0, 2.0, 3.0, 2.0, 2.0]); insert into dataset.test_data values ('id_9', [1.0, 2.0, 0.0, 2.0, 0.0]); insert into dataset.test_data values ('id_10', [1.0, 0.0, 3.0, 1.0, 0.0]);
■モデル作成
create or replace model dataset.test_model options ( model_type = 'pca' , num_principal_components = 2 , scale_features = false , pca_solver = 'full' ) as ( select values[0] as f0 , values[1] as f1 , values[2] as f2 , values[3] as f3 , values[4] as f4 , values[5] as f5 from dataset.test_data );
■適用
select * from ml.predict(model dataset.test_model, ( select id , values[0] as f0 , values[1] as f1 , values[2] as f2 , values[3] as f3 , values[4] as f4 , values[5] as f5 , values[6] as f6 from dataset.test_data ), struct(true as keep_original_columns) ); [{ "principal_component_1": "-0.69271374229229665", "principal_component_2": "1.6535767369786081", "id": "id_1", "f0": "1.0", "f1": "0.0", "f2": "1.0", "f3": "3.0", "f4": "0.0" }, { "principal_component_1": "0.31854554694562942", "principal_component_2": "1.5448567976194651", "id": "id_10", "f0": "1.0", "f1": "0.0", "f2": "3.0", "f3": "1.0", "f4": "0.0" }, { "principal_component_1": "-0.67889365782004729", "principal_component_2": "-0.94631539186280522", "id": "id_2", "f0": "1.0", "f1": "2.0", "f2": "1.0", "f3": "1.0", "f4": "1.0" }, { "principal_component_1": "-1.2041817682899503", "principal_component_2": "-1.3374039692570403", "id": "id_3", "f0": "0.0", "f1": "2.0", "f2": "0.0", "f3": "1.0", "f4": "1.0" }, { "principal_component_1": "2.6487614707519027", "principal_component_2": "0.36207867152786921", "id": "id_4", "f0": "1.0", "f1": "0.0", "f2": "3.0", "f3": "2.0", "f4": "3.0" }, { "principal_component_1": "-1.3025753320545592", "principal_component_2": "0.98632696171180989", "id": "id_5", "f0": "0.0", "f1": "0.0", "f2": "0.0", "f3": "2.0", "f4": "0.0" }, { "principal_component_1": "2.8720208417866377", "principal_component_2": "-0.5153230138544187", "id": "id_6", "f0": "0.0", "f1": "0.0", "f2": "2.0", "f3": "2.0", "f4": "4.0" }, { "principal_component_1": "-1.4274411393246851", "principal_component_2": "-0.46000228387475217", "id": "id_7", "f0": "1.0", "f1": "2.0", "f2": "1.0", "f3": "1.0", "f4": "0.0" }, { "principal_component_1": "1.3995485642410159", "principal_component_2": "-0.88215219479355511", "id": "id_8", "f0": "0.0", "f1": "2.0", "f2": "3.0", "f3": "2.0", "f4": "2.0" }, { "principal_component_1": "-1.9330707839436485", "principal_component_2": "-0.40564231419518071", "id": "id_9", "f0": "1.0", "f1": "2.0", "f2": "0.0", "f3": "2.0", "f4": "0.0" }]
■固有値など
select * from ml.principal_component_info(model dataset.test_model) ; [{ "principal_component_id": "0", "eigenvalue": "3.020722723318388", "explained_variance_ratio": "0.545913745178022", "cumulative_explained_variance_ratio": "0.545913745178022" }, { "principal_component_id": "1", "eigenvalue": "1.147655631054062", "explained_variance_ratio": "0.20740764416639679", "cumulative_explained_variance_ratio": "0.75332138934441883" }]