調べたことやコード変更した内容について下記のようにコメントを入れることにしています。
#【コメント】コメントです
# https://ohke.hateblo.jp/entry/2017/08/11/230000 を参考に利用しています。
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
以下では、Googleドライブのマイドライブ直下にstudy_ai_ml_googleフォルダを置き、その中のdataフォルダにcancer.csvがあることを仮定しています。必要に応じて、パスを変更してください。
# Load the dataset from the mounted Drive and report its (rows, columns) shape.
csv_path = '/content/drive/My Drive/study_ai_ml_google/data/cancer.csv'
cancer_df = pd.read_csv(csv_path)
print('cancer df shape: {}'.format(cancer_df.shape))
cancer df shape: (569, 33)
# Display the loaded table (notebook cell output).
cancer_df
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | 1.1760 | 1.2560 | 7.673 | 158.70 | 0.010300 | 0.02891 | 0.05198 | 0.02454 | 0.01114 | 0.004239 | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | NaN |
565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | 0.7655 | 2.4630 | 5.203 | 99.04 | 0.005769 | 0.02423 | 0.03950 | 0.01678 | 0.01898 | 0.002498 | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | NaN |
566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | NaN |
567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | 0.7260 | 1.5950 | 5.772 | 86.22 | 0.006522 | 0.06158 | 0.07117 | 0.01664 | 0.02324 | 0.006185 | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | NaN |
568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | 0.3857 | 1.4280 | 2.548 | 19.15 | 0.007189 | 0.00466 | 0.00000 | 0.00000 | 0.02676 | 0.002783 | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | NaN |
569 rows × 33 columns
# Remove the 'Unnamed: 32' column in place (it shows only NaN in the rows
# displayed above), then display the resulting table.
cancer_df.drop(columns=['Unnamed: 32'], inplace=True)
cancer_df
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | 1.1760 | 1.2560 | 7.673 | 158.70 | 0.010300 | 0.02891 | 0.05198 | 0.02454 | 0.01114 | 0.004239 | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | 0.7655 | 2.4630 | 5.203 | 99.04 | 0.005769 | 0.02423 | 0.03950 | 0.01678 | 0.01898 | 0.002498 | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | 0.7260 | 1.5950 | 5.772 | 86.22 | 0.006522 | 0.06158 | 0.07117 | 0.01664 | 0.02324 | 0.006185 | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | 0.3857 | 1.4280 | 2.548 | 19.15 | 0.007189 | 0.00466 | 0.00000 | 0.00000 | 0.02676 | 0.002783 | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 32 columns
・diagnosis: 診断結果 (良性がB / 悪性がM)
・説明変数は3列目以降、目的変数は2列目とし、ロジスティック回帰で分類
# Build the target variable: 1 where the diagnosis is 'M' (malignant),
# 0 for every other value.
y = cancer_df['diagnosis'].eq('M').astype('int64')
y
0 1 1 1 2 1 3 1 4 1 .. 564 1 565 1 566 1 567 1 568 0 Name: diagnosis, Length: 569, dtype: int64
# Build the feature matrix: every column from 'radius_mean' through the last one.
first_feature = cancer_df.columns.get_loc('radius_mean')
X = cancer_df.iloc[:, first_feature:]
X
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | 1.1760 | 1.2560 | 7.673 | 158.70 | 0.010300 | 0.02891 | 0.05198 | 0.02454 | 0.01114 | 0.004239 | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | 0.7655 | 2.4630 | 5.203 | 99.04 | 0.005769 | 0.02423 | 0.03950 | 0.01678 | 0.01898 | 0.002498 | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | 0.7260 | 1.5950 | 5.772 | 86.22 | 0.006522 | 0.06158 | 0.07117 | 0.01664 | 0.02324 | 0.006185 | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | 0.3857 | 1.4280 | 2.548 | 19.15 | 0.007189 | 0.00466 | 0.00000 | 0.00000 | 0.02676 | 0.002783 | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 30 columns
# Split the data into training and test sets (default split ratio, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize: the scaler is fitted on the training data only, and the same
# transformation is then applied to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression with built-in cross-validation (cv=10).
# [Comment] A warning asked for more iterations, so max_iter is set to 1000.
logistic = LogisticRegressionCV(cv=10, max_iter=1000, random_state=0)
logistic.fit(X_train_scaled, y_train)

# Evaluate: score on both splits and the confusion matrix on the test set.
train_score = logistic.score(X_train_scaled, y_train)
test_score = logistic.score(X_test_scaled, y_test)
y_test_pred = logistic.predict(X_test_scaled)
print('Train score: {:.3f}'.format(train_score))
print('Test score: {:.3f}'.format(test_score))
print('Confustion matrix:\n{}'.format(confusion_matrix(y_true=y_test, y_pred=y_test_pred)))
Train score: 0.988 Test score: 0.972 Confustion matrix: [[89 1] [ 3 50]]
・検証スコア97%で分類できることを確認
# Fit PCA with all 30 components to see how much variance each one explains.
pca = PCA(n_components=30)
pca.fit(X_train_scaled)
# [Comment] Visualize the explained variance ratio of each component.
# [Comment] A common rule of thumb is reportedly to keep enough principal
# components for a cumulative explained variance of about 80%
# (roughly 4 to 5 dimensions for this data?).
variance_ratios = pca.explained_variance_ratio_
component_numbers = list(range(1, len(variance_ratios) + 1))
plt.bar(component_numbers, variance_ratios)
<BarContainer object of 30 artists>
# PCA: compress the standardized training data down to 2 dimensions.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
shape_message = 'X_train_pca shape: {}'.format(X_train_pca.shape)
print(shape_message)  # prints: X_train_pca shape: (426, 2)
# [Comment] The data now has 2 dimensions (see the array displayed below).
X_train_pca
X_train_pca shape: (426, 2)
array([[-2.85924814e+00, -2.86144799e-01], [-3.26308367e+00, 1.07251955e+00], [ 3.75551268e+00, -3.40968876e+00], [-3.49263242e+00, -2.68784664e+00], [-7.43517447e-01, -2.48214924e+00], [-3.36556228e+00, -6.04103786e-01], [-3.18672558e+00, -1.88203437e+00], [-9.75800523e-01, 9.14900548e-01], [ 3.61845617e+00, 4.16797546e+00], [ 4.25896553e+00, 9.52260594e+00], [-8.00920047e-01, 7.76348214e-02], [-3.44711508e+00, 1.26773112e+00], [ 1.85939039e+00, 7.94595702e-01], [-2.12832960e+00, -1.18572536e+00], [-6.04598633e-01, -9.14935763e-01], [-3.93494805e+00, -2.07089658e+00], [ 4.18868101e+00, 9.28984343e-01], [ 4.93713709e+00, -1.15947338e+00], [-3.81508251e+00, -9.42143756e-01], [-2.49836390e-01, 2.30399084e+00], [ 2.51338427e+00, -1.32090995e+00], [-3.22757570e+00, -4.56772534e-01], [ 3.35229082e+00, -1.26381683e-02], [ 2.18140921e+00, 2.44624599e+00], [ 2.81827206e+00, -9.52361878e-01], [-5.91056742e-01, 6.88095808e-01], [-2.49043698e+00, -1.21171505e+00], [-3.55145924e+00, 1.81948146e+00], [-2.19027757e+00, 1.26719173e+00], [-3.55927961e+00, 8.76653657e-01], [-2.40998861e+00, -1.60887845e-02], [ 4.13426028e-01, -3.55079290e+00], [-4.56615956e+00, 3.43931793e+00], [-3.90330920e+00, 1.17670830e+00], [-1.53918414e+00, 5.40453240e-01], [-3.73795472e+00, 2.12903714e-01], [-6.36675531e-01, -2.09581728e+00], [ 6.07583482e+00, -5.80579153e-01], [-1.73251287e+00, 7.70136727e-01], [-1.55345820e+00, -1.19780365e+00], [-4.15147857e-01, -1.35338178e+00], [-2.36272865e+00, 9.84354139e-01], [-3.40978942e+00, 7.55235499e-01], [-1.18216531e+00, -1.30756011e+00], [-1.39403308e+00, 1.52685920e+00], [ 4.65296344e+00, -2.14408451e+00], [-4.33879458e+00, -9.38248915e-01], [-2.45968692e+00, 2.70035719e+00], [ 2.27508313e+00, -2.60695474e+00], [ 3.24923498e+00, 3.47431138e+00], [-1.91967965e+00, 1.88106716e+00], [-1.27651922e+00, 3.55469907e+00], [ 7.17621927e+00, 8.18409598e-01], [ 9.08697926e+00, -3.10749563e-01], [-3.23524871e+00, -1.18745010e+00], [-2.20467968e+00, -1.21596160e+00], 
[-1.05300808e+00, -3.28517718e+00], [-2.24494165e+00, -4.92559105e-01], [-3.61561101e+00, 7.24425938e-01], [-8.40921667e-01, -3.47761582e+00], [ 7.34976529e+00, -5.00822726e+00], [ 3.83373731e+00, -1.76139366e+00], [-4.36304846e+00, -3.96252459e-01], [-2.20163885e+00, -3.84816328e-01], [ 1.14893785e+00, 8.48759023e+00], [ 5.15463302e+00, -1.82508588e+00], [-2.16163413e+00, -6.89306959e-01], [-9.56855903e-01, 9.69405139e-01], [-4.74913874e+00, -1.87497720e+00], [-4.11757830e+00, 2.35606332e+00], [-4.39073535e+00, -8.48100157e-01], [ 7.22447974e+00, -5.23060667e+00], [-2.09400538e+00, 1.57533741e+00], [ 8.76846726e+00, -2.73290060e-01], [-1.14191485e+00, -4.64028243e-01], [ 3.25195320e+00, -8.14745970e-01], [-1.56343226e+00, 2.92705742e-01], [ 2.84561718e+00, 3.58893948e+00], [-2.88263563e+00, 1.19921703e-01], [-9.40965073e-01, -1.66466986e+00], [-2.81677889e+00, -1.06837719e+00], [-1.87619282e+00, -1.58372064e+00], [-3.62455357e+00, -7.28537826e-01], [-1.91892081e+00, 9.08819247e-01], [ 3.05638916e+00, 8.34476104e-01], [-4.92153454e+00, -2.17997161e+00], [ 4.96878177e+00, -3.92315781e+00], [-1.11798339e+00, -1.61320075e+00], [-1.75649911e+00, 4.00794343e-01], [-3.12851580e+00, -1.71159834e+00], [ 4.30569137e+00, -4.50347379e+00], [-2.59652477e+00, 2.06936284e-01], [-3.44742342e+00, -5.32322647e-01], [-2.60593639e+00, -7.20433256e-01], [ 3.60488597e+00, 2.27249204e+00], [-3.48244169e+00, 1.69043919e+00], [-2.83101797e+00, -2.89280858e+00], [ 1.20458550e+00, -1.93661155e+00], [ 3.95647301e+00, -1.73402958e+00], [-1.99568662e+00, -1.77142390e-01], [ 9.10237289e+00, 8.21421377e-01], [-2.31675492e+00, 2.13498273e+00], [ 3.90646007e+00, 6.11178022e+00], [-4.70460011e+00, -5.09302984e-01], [ 3.87941451e+00, 2.51111387e+00], [ 2.45644812e+00, 4.03445443e+00], [ 1.33788158e+00, -1.18818514e+00], [ 2.08904616e+00, 1.25192608e+00], [-1.45252740e+00, 1.34750263e-01], [-1.96427987e+00, -4.08162077e-01], [-4.16590017e+00, -1.44643028e+00], [ 1.28113496e+01, 2.81500498e+00], [ 
3.38054066e-02, -1.79166464e-01], [ 4.77735587e+00, 3.49083524e+00], [-2.31709171e+00, -7.58439556e-01], [ 3.43170845e+00, -2.20656750e+00], [-2.91220473e+00, 3.29900861e-01], [ 1.23912937e+00, 2.59977440e+00], [ 4.79680436e+00, -1.32883128e+00], [-2.70674472e+00, -1.79713428e+00], [-2.14317561e+00, -8.08736468e-01], [-3.84873131e+00, 4.68227691e-01], [-6.31455506e-01, 1.82196749e+00], [-4.13329446e+00, 7.46528349e-02], [-6.60314663e-01, -1.05954597e+00], [-1.35759053e+00, 1.02801380e-01], [-4.84589167e+00, -2.42207709e+00], [-2.81317480e+00, -9.93527746e-01], [-1.77842598e+00, -5.15100183e-01], [-2.06745665e+00, 3.85037678e+00], [-3.15065133e+00, -2.14222173e+00], [-4.65537550e+00, -8.43797914e-01], [ 5.49733006e+00, -3.96710229e+00], [-7.10475844e-01, 1.84929136e+00], [-2.45207731e+00, -3.84262744e-01], [-2.38605903e+00, 3.31824106e+00], [ 3.30630648e+00, 4.16882946e+00], [-2.10877597e+00, 1.09063227e-02], [-2.04532616e+00, 1.70562726e+00], [-2.46494274e+00, -2.38989936e+00], [-2.15941369e+00, 9.57699998e-01], [-1.95268062e+00, -9.74329414e-01], [-2.91881236e+00, -1.70939085e+00], [ 7.73806249e+00, -6.46699917e-01], [-3.28057588e+00, 1.60341941e-01], [ 3.52602655e+00, -2.14620755e+00], [-7.17488105e-01, 3.75428544e+00], [-1.48338132e+00, 2.90610819e-01], [ 8.60450557e-01, 9.53650128e-01], [-1.29404768e+00, 5.01799340e+00], [ 6.18597008e+00, 5.35168399e+00], [ 1.36386033e+00, -1.70600385e+00], [-3.85708305e+00, 8.60918879e-01], [ 2.82379549e+00, 3.51349308e+00], [ 5.77021864e+00, -8.68954194e-01], [-9.20845296e-01, 4.38479892e-01], [-2.64364962e+00, -1.53680076e+00], [ 2.36140698e+00, 4.91945770e+00], [-3.05412232e+00, 3.27481707e-01], [-4.47193714e+00, -1.83346848e+00], [-2.05008549e+00, 1.80693930e+00], [ 4.42451620e+00, -7.83908426e-01], [-2.46113495e+00, 3.25076456e+00], [-3.30310763e+00, -3.17892394e+00], [-2.70485028e+00, -2.57755488e-01], [ 1.13425267e+00, 7.24647940e+00], [-1.64485840e+00, -4.53874880e+00], [ 5.09549789e+00, -1.58894185e+00], [ 
2.75824852e+00, -3.81022405e+00], [-1.13164961e+00, 5.73191190e+00], [-1.20232680e+00, 1.61081122e+00], [-1.27070393e+00, 2.59387132e+00], [-2.60798250e+00, -4.63160069e-01], [ 1.03885658e+00, -2.14588343e+00], [ 3.75040902e+00, -2.68559217e+00], [-2.38469869e+00, -2.63039394e-01], [-2.03740689e+00, 3.35625140e-01], [-2.80930188e+00, -1.08811851e+00], [-2.21067216e+00, 4.88819568e-02], [ 5.04442917e-01, 4.49634497e-01], [-3.70945085e+00, -1.37116320e+00], [-6.74609711e-01, -1.53521373e+00], [ 5.06152345e+00, -9.89470312e-01], [-9.19341657e-01, 9.73772069e-01], [ 3.08936423e+00, 4.53301819e+00], [-2.23401321e-01, -1.48046894e-01], [-2.60280296e+00, 3.14188111e+00], [ 1.31403031e+00, 4.06443943e-01], [-4.68153349e-02, -1.00242062e-01], [-1.93308693e+00, -1.13704176e+00], [-1.52131762e+00, 4.33315745e-01], [-1.89979935e+00, 2.55562117e+00], [ 4.92703390e+00, -2.66104990e+00], [ 7.13819586e+00, 3.17589441e-01], [-2.40835023e+00, -8.11563994e-01], [-2.87165038e-01, 1.43716632e-01], [-3.30608498e+00, -1.49983757e+00], [ 2.78570615e+00, -2.69074524e-01], [-4.09042641e+00, -5.67502440e-01], [ 1.73178814e+00, -1.42755166e+00], [ 3.45398267e+00, -1.52512566e+00], [-2.62780130e+00, -1.85313456e+00], [-1.23374077e+00, 1.12496875e+00], [-4.02645464e+00, -2.61337160e+00], [ 1.05640933e+00, 1.18094784e+00], [-2.81191212e+00, 1.08417577e+00], [ 3.04369006e+00, -1.67882484e+00], [-2.55021489e+00, -2.50921046e+00], [ 2.07523849e+00, 6.66926970e+00], [ 4.12677984e+00, -1.83244075e-01], [-2.89339810e+00, -1.44823665e+00], [-1.95021133e+00, -2.19120724e-01], [ 3.70277215e+00, 1.05339353e+00], [ 1.20323367e+01, -7.15382114e+00], [-2.72114047e+00, 1.39739516e+00], [-1.54768248e+00, -1.06381042e+00], [-3.93722770e+00, 7.31131573e-01], [ 4.36872161e+00, -3.86779224e+00], [ 1.37001313e+00, 2.24166805e+00], [-1.57574255e-01, 1.54503323e+00], [-3.33991601e+00, -1.13764071e+00], [ 2.48345877e+00, 2.74093101e+00], [-3.21967402e+00, 1.27036773e+00], [-7.55925465e-01, 6.76999723e-01], 
[-4.02153678e+00, 1.15381800e+00], [ 3.78284363e+00, 1.16532486e+00], [-3.50769347e+00, -6.69019386e-01], [ 4.96937042e+00, -1.40626742e+00], [-3.90174101e-01, 2.36702092e+00], [ 6.59737010e+00, -1.26031209e+00], [ 1.84778061e+00, 2.15738793e-01], [ 4.68468744e+00, -1.33169271e+00], [ 3.98587486e+00, -2.78214719e+00], [-7.58645079e-02, 2.44365016e+00], [-4.33286950e+00, -4.45442900e-02], [-2.02879388e+00, -1.08666613e+00], [-1.21635779e+00, 4.17561211e+00], [-2.13309598e+00, -1.95597604e+00], [ 3.43700763e-01, -3.09289490e+00], [-1.85413800e+00, -1.56408675e+00], [-1.99762141e+00, -2.73554611e-01], [-2.02117226e+00, -2.91196532e-01], [-3.66335178e+00, -1.26377404e+00], [-4.61969198e+00, -1.90820192e+00], [ 3.38236374e-01, -8.46804337e-01], [-4.28739039e+00, -2.02743717e+00], [-3.12293218e+00, -1.79357118e+00], [ 3.32522906e+00, -1.16942385e+00], [-3.03818514e+00, -6.86340209e-01], [-1.41286812e+00, -1.80538880e-01], [ 3.73032333e+00, -8.78629563e-01], [ 3.36597246e+00, -2.69325076e+00], [ 5.04828453e+00, 9.92383806e-01], [ 3.18210080e+00, 1.67207115e+00], [-2.59875006e+00, -1.15106215e+00], [-1.38321955e+00, 4.36980376e-01], [ 7.10758459e+00, 2.24256979e+00], [-2.49561717e+00, 2.64418263e+00], [-3.02585619e+00, -2.27978639e+00], [ 6.90782343e+00, 1.30532255e+01], [ 2.46485155e-01, -1.51496312e+00], [ 7.37714748e-01, 4.83505271e-01], [-5.07641436e-01, 1.28781779e+00], [-2.18991826e+00, 6.74413409e-01], [-2.70471366e+00, 1.42730016e+00], [-1.06006398e+00, 1.91509798e+00], [-1.93841122e+00, 1.40614284e+00], [ 2.65474324e+00, -3.77976692e+00], [ 3.41518173e+00, -3.26969969e+00], [ 1.09754132e+01, -3.39077083e+00], [-1.34961830e+00, 8.21667815e-01], [ 1.94971254e+00, -2.30718501e+00], [ 7.25347957e+00, 1.31899966e-01], [-1.10336325e+00, -1.40499894e+00], [-2.59087359e+00, 6.87553459e-01], [-2.91815248e+00, -5.34214322e-01], [-1.62087749e+00, 8.68005672e-01], [-2.51366631e+00, -8.01356534e-01], [-1.20499330e+00, -1.53756961e-01], [ 2.29162729e+00, -2.52740867e-01], 
[-4.77664233e+00, -6.08178860e-01], [-2.20849723e+00, -1.32508957e+00], [-7.33204993e-01, 3.24374200e+00], [-7.58666545e-01, -2.07848833e+00], [-3.86864326e+00, 4.65225834e-01], [-1.45923590e+00, -1.56421810e+00], [ 7.16341081e+00, 1.04687600e+01], [ 3.31088863e+00, -1.65939327e+00], [-3.12337049e+00, -1.92657856e+00], [ 5.00812584e+00, -1.17383608e+00], [-6.60728916e-01, 1.78510222e+00], [ 6.25554401e+00, 1.14757178e+00], [-3.32980445e-01, -9.88442588e-01], [-2.46580519e+00, -2.02959456e+00], [ 2.31208360e+00, 4.58431137e+00], [-3.88156898e+00, 1.08240797e+00], [-1.29131997e+00, 9.34130409e-01], [-3.32442070e+00, -8.47415896e-01], [ 5.18819885e+00, -2.21553456e+00], [ 7.87674197e-01, -1.56145504e+00], [ 4.58203743e+00, 4.83978852e-01], [-1.64242158e+00, 1.73988692e+00], [-5.42471122e+00, 4.77409505e-01], [ 8.74079598e+00, 3.63711992e+00], [ 2.69324550e+00, 1.73128614e-01], [-4.60707340e+00, -2.83900047e+00], [-4.79526420e-01, -9.22611535e-01], [ 5.89385302e-01, 2.77395633e-01], [-1.47351784e+00, 1.71611168e+00], [ 1.33730462e+00, 9.81948610e-01], [ 1.82231573e+00, 2.88998076e+00], [-3.74738325e+00, -1.84165435e+00], [-3.08730294e-02, 2.29738556e+00], [-4.50451659e+00, -3.24612629e+00], [-2.91179143e+00, 3.27834406e-01], [-3.56085734e+00, -1.32616622e+00], [-1.99695736e+00, -2.47535686e+00], [-2.74199332e+00, -2.16327060e+00], [-1.64919372e+00, 2.48401519e+00], [-4.42608805e+00, -8.69927284e-01], [ 8.64544242e+00, -3.19786856e+00], [ 2.54141935e+00, -2.35730784e+00], [-7.40113097e-01, -5.18784054e-02], [-2.97598746e+00, 1.77375924e+00], [-3.36427340e+00, -7.54150773e-02], [ 4.83863925e+00, 3.23691356e+00], [-5.51825211e+00, -7.73630580e-01], [-3.05793601e+00, -2.22100606e+00], [-1.85775451e+00, 1.39851943e+00], [-2.80981317e+00, -4.05238960e-01], [ 4.21342556e+00, -4.98812885e+00], [-2.95696557e+00, -7.09572756e-01], [ 4.16646616e+00, 3.01282388e+00], [ 1.03933388e+00, 1.02169754e+00], [ 4.22800713e+00, 1.32538252e+00], [-2.00444883e+00, 4.21084778e-01], [ 
9.21475147e+00, 2.19772974e+00], [-3.52645638e-01, 2.20207995e+00], [ 1.70657037e+00, -2.00123218e+00], [-2.33651837e+00, 7.14403093e-01], [ 5.24198043e+00, -6.53178325e+00], [ 1.61490244e+01, -7.24611405e+00], [-2.42421736e+00, 5.76192085e-01], [-2.77905225e+00, 3.40496767e-01], [-3.42249621e+00, -2.48502091e+00], [-1.93414823e+00, 1.42872198e+00], [-3.03794269e+00, -1.89637954e-01], [-2.89394159e+00, -3.48525482e-02], [-1.39527531e+00, 1.42061067e+00], [-4.98922907e+00, -3.48023351e+00], [ 7.28191791e+00, -3.44747764e+00], [ 6.42282927e+00, -1.61219289e+00], [ 2.85230648e+00, 7.23177385e-01], [ 1.24756825e+00, -2.31255427e+00], [ 4.37071230e+00, -1.96779348e+00], [-5.49703234e-02, -1.19736251e-01], [-1.29663111e+00, 1.54010964e+00], [-1.95475015e+00, -1.85472567e+00], [-3.24845224e+00, -9.64474413e-01], [-3.12152066e+00, -3.36486268e-01], [-2.08953432e+00, 6.28456436e-02], [-1.90796477e+00, 2.56172011e+00], [ 3.56564947e+00, -3.71340906e+00], [-1.89951941e+00, -6.06163363e-01], [ 4.20915999e-01, -2.74923340e+00], [-1.84859038e+00, -9.18591681e-01], [ 3.42045099e-01, 1.69806524e+00], [-3.36897374e-01, -5.75581704e-01], [-3.54310230e-01, -3.32406750e-01], [-1.81253092e+00, -1.10977930e-01], [ 9.54219195e+00, -5.32746910e+00], [ 1.48118077e+00, -9.85472568e-01], [ 1.76641962e+00, 2.41951459e+00], [-3.12026972e+00, -7.89515570e-01], [ 4.59018860e+00, 3.21424161e+00], [-2.16853356e+00, 4.68564919e-01], [-3.75623266e+00, -3.81717458e-01], [ 2.81679892e-01, -4.13729593e-01], [ 1.08542269e+01, -1.89004497e+00], [ 5.08560754e+00, 3.23758776e+00], [ 7.56203604e-01, -2.29298747e+00], [-3.78502513e-01, -7.76399742e-01], [ 2.21777653e+00, 1.04172770e+00], [ 1.81985826e+00, -4.23182624e+00], [ 5.33380606e+00, 4.10137976e+00], [-3.42475266e+00, -2.17190888e+00], [ 9.04537882e+00, 2.33752916e+00], [-4.45929164e-01, 3.86055779e+00], [ 3.30121037e+00, -1.42796866e+00], [-3.63721835e+00, 1.59165031e+00], [-1.71011727e+00, 1.00972610e+00], [ 3.27958813e+00, -1.04055950e+00], [ 
5.74162486e-01, 4.54631649e-01], [ 1.46523027e+00, 1.44386747e+00], [ 4.07811727e+00, 6.45074106e-01], [-2.64190057e+00, 6.02524612e-01], [-4.08982337e+00, -5.29707118e-01], [-1.84357047e+00, 1.73549435e+00], [ 1.03893032e-01, 7.26493782e+00], [ 3.81682257e+00, -8.43166568e-01], [-2.37124119e+00, -1.69173266e+00], [-1.63870782e+00, -1.97805600e-01], [ 6.54993668e+00, -5.80871669e+00], [ 3.71474288e-01, 3.97520197e+00], [-3.21380062e-01, 1.58980046e-01], [ 3.61544513e-01, 1.18595929e+00], [ 3.03183473e+00, 8.69697874e-01], [ 6.51808828e-01, 7.38291727e-01], [-1.21826465e+00, -1.56206981e+00], [-8.89951063e-01, 2.54182711e+00], [-2.00662011e+00, 3.05064104e-01], [ 5.58670329e+00, 1.11452916e+00], [ 3.93457631e-01, 3.64475677e+00], [-4.71055375e+00, -3.06342001e-01], [ 3.60862610e+00, -2.09427033e+00], [-2.05349968e+00, 1.24324055e+00], [-2.24850174e+00, -2.48904595e+00], [-3.86328935e+00, 4.39682987e+00], [-4.48217238e-01, 1.52081858e-01], [-1.34839911e+00, -1.76992390e+00], [ 2.71067086e+00, -4.32043003e+00], [ 3.47604839e-02, -3.37853851e+00], [ 6.49029629e+00, 7.94277378e+00], [-3.20758299e+00, 5.43091464e-01], [-5.69315584e+00, -5.73866830e-01], [-1.07704361e+00, 1.37951070e+00]])
# Explained variance ratio of each of the two principal components.
variance_ratio = pca.explained_variance_ratio_
print('explained variance ratio: {}'.format(variance_ratio))
# prints: explained variance ratio: [0.43315126 0.19586506]
# [Comment] Cumulatively a little over 60%.
explained variance ratio: [0.43315126 0.19586506]
# Prepare the scatter-plot data: put the two principal-component scores and
# the class label into a single DataFrame.
# (The DataFrame construction was previously written twice; once is enough.)
temp = pd.DataFrame(X_train_pca)
# .values strips y_train's index so the labels are assigned by position
# rather than aligned on index labels.
temp['Outcome'] = y_train.values
temp
0 | 1 | Outcome | |
---|---|---|---|
0 | -2.859248 | -0.286145 | 0 |
1 | -3.263084 | 1.072520 | 0 |
2 | 3.755513 | -3.409689 | 1 |
3 | -3.492632 | -2.687847 | 0 |
4 | -0.743517 | -2.482149 | 1 |
... | ... | ... | ... |
421 | 0.034760 | -3.378539 | 1 |
422 | 6.490296 | 7.942774 | 1 |
423 | -3.207583 | 0.543091 | 0 |
424 | -5.693156 | -0.573867 | 0 |
425 | -1.077044 | 1.379511 | 0 |
426 rows × 3 columns
# Split the rows by class label: 0 = benign, 1 = malignant.
b = temp.loc[temp['Outcome'] == 0]
m = temp.loc[temp['Outcome'] == 1]
# Benign points are drawn as circles, malignant points as triangles.
for subset, marker in ((b, 'o'), (m, '^')):
    plt.scatter(x=subset[0], y=subset[1], marker=marker)
plt.xlabel('PC 1')  # first principal component on the x axis
# [Comment] Even after compressing to 2 dimensions, the classes look
# reasonably separable.
plt.ylabel('PC 2')  # second principal component on the y axis
X_train_pca shape: (426, 2) explained variance ratio: [0.43315126 0.19586506]
Text(0, 0.5, 'PC 2')
# [Comment] Compress to 2 dimensions and then run logistic regression.
# Compress to 2 dimensions.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
# Project the test data with the PCA that was fitted on the training data.
# Calling fit_transform here would fit a second, different PCA on the test
# set, so the test points would be expressed on axes the classifier was never
# trained on (and test-set statistics would leak into preprocessing).
X_test_pca = pca.transform(X_test_scaled)
# Train logistic regression with built-in cross-validation (cv=10).
# [Comment] A warning asked for more iterations, so max_iter is set to 1000.
logistic = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)
logistic.fit(X_train_pca, y_train)
# Evaluate on the training and test sets.
print('Train score: {:.3f}'.format(logistic.score(X_train_pca, y_train)))
print('Test score: {:.3f}'.format(logistic.score(X_test_pca, y_test)))
print('Confustion matrix:\n{}'.format(confusion_matrix(y_true=y_test, y_pred=logistic.predict(X_test_pca))))
# [Comment] The previously recorded test score of 91.6% was obtained while PCA
# was being re-fitted on the test set; re-run this cell for the corrected score.
Train score: 0.965 Test score: 0.916 Confustion matrix: [[83 7] [ 5 48]]
import numpy as np
# [Comment] Visualize the test score and processing time for 1 to 15 dimensions.
import time
n_max = 15
n_array = np.arange(1, n_max + 1, 1)
scores = np.zeros(n_max)  # test score for each number of components
times = np.zeros(n_max)   # seconds spent on PCA fitting + model training
for n in n_array:
    # Record the start time.
    start_time = time.perf_counter()
    # Compress to n dimensions.
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train_scaled)
    # Project the test data with the PCA fitted on the training data;
    # re-fitting on the test set would put the test points on different axes
    # than the ones the classifier is trained on.
    X_test_pca = pca.transform(X_test_scaled)
    # Train logistic regression with built-in cross-validation (cv=10).
    # [Comment] A warning asked for more iterations, so max_iter is set to 1000.
    logistic = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)
    logistic.fit(X_train_pca, y_train)
    # Record the end time (scoring below is not included in the timing).
    end_time = time.perf_counter()
    scores[n-1] = logistic.score(X_test_pca, y_test)
    times[n-1] = end_time - start_time
# Plot score (left axis) and time (right axis) against n on a shared x axis.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ln1 = ax1.plot(n_array, scores, 'C0', label=r'$score$')
ax2 = ax1.twinx()
ln2 = ax2.plot(n_array, times, 'C1', label=r'$time$')
# Merge the legend entries of both axes into a single legend.
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1+h2, l1+l2, loc='lower right')
ax1.set_xlabel('n')
ax1.set_ylabel(r'$score$')
ax1.grid(True)
# [Comment] With the earlier version of this cell the score did not rise
# steadily with n, which was unexpected. A likely cause is that PCA was being
# re-fitted on the test set; re-check the curve after this fix.
# [Comment] The time differs from run to run but tends to grow with n overall.
# Fluctuation due to other load on the server seems plausible.
ax2.set_ylabel(r'$time[s]$')
Text(0, 0.5, '$time[s]$')