調べたことやコード変更した内容について下記のようにコメントを入れることにしています。

In [1]:
  #【コメント】コメントです
In [2]:
#https://ohke.hateblo.jp/entry/2017/08/11/230000を参考に利用しています。

Googleドライブのマウント

In [3]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

sys.pathの設定

以下では,Googleドライブのマイドライブ直下にstudy_ai_mlフォルダを置くことを仮定しています.必要に応じて,パスを変更してください。

In [5]:
cancer_df = pd.read_csv('/content/drive/My Drive/study_ai_ml_google/data/cancer.csv')
In [6]:
print('cancer df shape: {}'.format(cancer_df.shape))
cancer df shape: (569, 33)
In [7]:
cancer_df
Out[7]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.380 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.990 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.570 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.910 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.540 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 1.1760 1.2560 7.673 158.70 0.010300 0.02891 0.05198 0.02454 0.01114 0.004239 25.450 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 NaN
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 0.7655 2.4630 5.203 99.04 0.005769 0.02423 0.03950 0.01678 0.01898 0.002498 23.690 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 NaN
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 0.4564 1.0750 3.425 48.55 0.005903 0.03731 0.04730 0.01557 0.01318 0.003892 18.980 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 NaN
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 0.7260 1.5950 5.772 86.22 0.006522 0.06158 0.07117 0.01664 0.02324 0.006185 25.740 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 NaN
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 0.3857 1.4280 2.548 19.15 0.007189 0.00466 0.00000 0.00000 0.02676 0.002783 9.456 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 NaN

569 rows × 33 columns

In [8]:
cancer_df.drop('Unnamed: 32', axis=1, inplace=True)
cancer_df
Out[8]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.380 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.990 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.570 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.910 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.540 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 1.1760 1.2560 7.673 158.70 0.010300 0.02891 0.05198 0.02454 0.01114 0.004239 25.450 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 0.7655 2.4630 5.203 99.04 0.005769 0.02423 0.03950 0.01678 0.01898 0.002498 23.690 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 0.4564 1.0750 3.425 48.55 0.005903 0.03731 0.04730 0.01557 0.01318 0.003892 18.980 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 0.7260 1.5950 5.772 86.22 0.006522 0.06158 0.07117 0.01664 0.02324 0.006185 25.740 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 0.3857 1.4280 2.548 19.15 0.007189 0.00466 0.00000 0.00000 0.02676 0.002783 9.456 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039

569 rows × 32 columns

・diagnosis: 診断結果 (良性がB / 悪性がM) ・説明変数は3列以降、目的変数を2列目としロジスティック回帰で分類

In [14]:
# 目的変数の抽出
y = cancer_df.diagnosis.apply(lambda d: 1 if d == 'M' else 0)
y
Out[14]:
0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64
In [15]:
# 説明変数の抽出
X = cancer_df.loc[:, 'radius_mean':]
X
Out[15]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.380 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.990 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.570 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.910 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.540 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 1.1760 1.2560 7.673 158.70 0.010300 0.02891 0.05198 0.02454 0.01114 0.004239 25.450 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115
565 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 0.7655 2.4630 5.203 99.04 0.005769 0.02423 0.03950 0.01678 0.01898 0.002498 23.690 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637
566 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 0.4564 1.0750 3.425 48.55 0.005903 0.03731 0.04730 0.01557 0.01318 0.003892 18.980 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820
567 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 0.7260 1.5950 5.772 86.22 0.006522 0.06158 0.07117 0.01664 0.02324 0.006185 25.740 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400
568 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 0.3857 1.4280 2.548 19.15 0.007189 0.00466 0.00000 0.00000 0.02676 0.002783 9.456 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039

569 rows × 30 columns

In [18]:
# 学習用とテスト用でデータを分離
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# 標準化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ロジスティック回帰で学習
logistic = LogisticRegressionCV(cv=10, random_state=0,max_iter=1000)
  #【コメント】イテレーションを増やすようにというWarningが出ていたので100にした
logistic.fit(X_train_scaled, y_train)

# 検証
print('Train score: {:.3f}'.format(logistic.score(X_train_scaled, y_train)))
print('Test score: {:.3f}'.format(logistic.score(X_test_scaled, y_test)))
print('Confustion matrix:\n{}'.format(confusion_matrix(y_true=y_test, y_pred=logistic.predict(X_test_scaled))))
Train score: 0.988
Test score: 0.972
Confustion matrix:
[[89  1]
 [ 3 50]]

・検証スコア97%で分類できることを確認

In [12]:
pca = PCA(n_components=30)
pca.fit(X_train_scaled)
plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)
  #【コメント】各次元の寄与率の可視化
  #【コメント】一般的に累積寄与率が80%となるよう、主成分を抽出すると良いらしい(このデータだと4から5次元くらいまで?)
Out[12]:
<BarContainer object of 30 artists>
In [23]:
# PCA
# 次元数2まで圧縮
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
print('X_train_pca shape: {}'.format(X_train_pca.shape))
# X_train_pca shape: (426, 2)
X_train_pca
  #【コメント】 2次元になっている↓
X_train_pca shape: (426, 2)
Out[23]:
array([[-2.85924814e+00, -2.86144799e-01],
       [-3.26308367e+00,  1.07251955e+00],
       [ 3.75551268e+00, -3.40968876e+00],
       [-3.49263242e+00, -2.68784664e+00],
       [-7.43517447e-01, -2.48214924e+00],
       [-3.36556228e+00, -6.04103786e-01],
       [-3.18672558e+00, -1.88203437e+00],
       [-9.75800523e-01,  9.14900548e-01],
       [ 3.61845617e+00,  4.16797546e+00],
       [ 4.25896553e+00,  9.52260594e+00],
       [-8.00920047e-01,  7.76348214e-02],
       [-3.44711508e+00,  1.26773112e+00],
       [ 1.85939039e+00,  7.94595702e-01],
       [-2.12832960e+00, -1.18572536e+00],
       [-6.04598633e-01, -9.14935763e-01],
       [-3.93494805e+00, -2.07089658e+00],
       [ 4.18868101e+00,  9.28984343e-01],
       [ 4.93713709e+00, -1.15947338e+00],
       [-3.81508251e+00, -9.42143756e-01],
       [-2.49836390e-01,  2.30399084e+00],
       [ 2.51338427e+00, -1.32090995e+00],
       [-3.22757570e+00, -4.56772534e-01],
       [ 3.35229082e+00, -1.26381683e-02],
       [ 2.18140921e+00,  2.44624599e+00],
       [ 2.81827206e+00, -9.52361878e-01],
       [-5.91056742e-01,  6.88095808e-01],
       [-2.49043698e+00, -1.21171505e+00],
       [-3.55145924e+00,  1.81948146e+00],
       [-2.19027757e+00,  1.26719173e+00],
       [-3.55927961e+00,  8.76653657e-01],
       [-2.40998861e+00, -1.60887845e-02],
       [ 4.13426028e-01, -3.55079290e+00],
       [-4.56615956e+00,  3.43931793e+00],
       [-3.90330920e+00,  1.17670830e+00],
       [-1.53918414e+00,  5.40453240e-01],
       [-3.73795472e+00,  2.12903714e-01],
       [-6.36675531e-01, -2.09581728e+00],
       [ 6.07583482e+00, -5.80579153e-01],
       [-1.73251287e+00,  7.70136727e-01],
       [-1.55345820e+00, -1.19780365e+00],
       [-4.15147857e-01, -1.35338178e+00],
       [-2.36272865e+00,  9.84354139e-01],
       [-3.40978942e+00,  7.55235499e-01],
       [-1.18216531e+00, -1.30756011e+00],
       [-1.39403308e+00,  1.52685920e+00],
       [ 4.65296344e+00, -2.14408451e+00],
       [-4.33879458e+00, -9.38248915e-01],
       [-2.45968692e+00,  2.70035719e+00],
       [ 2.27508313e+00, -2.60695474e+00],
       [ 3.24923498e+00,  3.47431138e+00],
       [-1.91967965e+00,  1.88106716e+00],
       [-1.27651922e+00,  3.55469907e+00],
       [ 7.17621927e+00,  8.18409598e-01],
       [ 9.08697926e+00, -3.10749563e-01],
       [-3.23524871e+00, -1.18745010e+00],
       [-2.20467968e+00, -1.21596160e+00],
       [-1.05300808e+00, -3.28517718e+00],
       [-2.24494165e+00, -4.92559105e-01],
       [-3.61561101e+00,  7.24425938e-01],
       [-8.40921667e-01, -3.47761582e+00],
       [ 7.34976529e+00, -5.00822726e+00],
       [ 3.83373731e+00, -1.76139366e+00],
       [-4.36304846e+00, -3.96252459e-01],
       [-2.20163885e+00, -3.84816328e-01],
       [ 1.14893785e+00,  8.48759023e+00],
       [ 5.15463302e+00, -1.82508588e+00],
       [-2.16163413e+00, -6.89306959e-01],
       [-9.56855903e-01,  9.69405139e-01],
       [-4.74913874e+00, -1.87497720e+00],
       [-4.11757830e+00,  2.35606332e+00],
       [-4.39073535e+00, -8.48100157e-01],
       [ 7.22447974e+00, -5.23060667e+00],
       [-2.09400538e+00,  1.57533741e+00],
       [ 8.76846726e+00, -2.73290060e-01],
       [-1.14191485e+00, -4.64028243e-01],
       [ 3.25195320e+00, -8.14745970e-01],
       [-1.56343226e+00,  2.92705742e-01],
       [ 2.84561718e+00,  3.58893948e+00],
       [-2.88263563e+00,  1.19921703e-01],
       [-9.40965073e-01, -1.66466986e+00],
       [-2.81677889e+00, -1.06837719e+00],
       [-1.87619282e+00, -1.58372064e+00],
       [-3.62455357e+00, -7.28537826e-01],
       [-1.91892081e+00,  9.08819247e-01],
       [ 3.05638916e+00,  8.34476104e-01],
       [-4.92153454e+00, -2.17997161e+00],
       [ 4.96878177e+00, -3.92315781e+00],
       [-1.11798339e+00, -1.61320075e+00],
       [-1.75649911e+00,  4.00794343e-01],
       [-3.12851580e+00, -1.71159834e+00],
       [ 4.30569137e+00, -4.50347379e+00],
       [-2.59652477e+00,  2.06936284e-01],
       [-3.44742342e+00, -5.32322647e-01],
       [-2.60593639e+00, -7.20433256e-01],
       [ 3.60488597e+00,  2.27249204e+00],
       [-3.48244169e+00,  1.69043919e+00],
       [-2.83101797e+00, -2.89280858e+00],
       [ 1.20458550e+00, -1.93661155e+00],
       [ 3.95647301e+00, -1.73402958e+00],
       [-1.99568662e+00, -1.77142390e-01],
       [ 9.10237289e+00,  8.21421377e-01],
       [-2.31675492e+00,  2.13498273e+00],
       [ 3.90646007e+00,  6.11178022e+00],
       [-4.70460011e+00, -5.09302984e-01],
       [ 3.87941451e+00,  2.51111387e+00],
       [ 2.45644812e+00,  4.03445443e+00],
       [ 1.33788158e+00, -1.18818514e+00],
       [ 2.08904616e+00,  1.25192608e+00],
       [-1.45252740e+00,  1.34750263e-01],
       [-1.96427987e+00, -4.08162077e-01],
       [-4.16590017e+00, -1.44643028e+00],
       [ 1.28113496e+01,  2.81500498e+00],
       [ 3.38054066e-02, -1.79166464e-01],
       [ 4.77735587e+00,  3.49083524e+00],
       [-2.31709171e+00, -7.58439556e-01],
       [ 3.43170845e+00, -2.20656750e+00],
       [-2.91220473e+00,  3.29900861e-01],
       [ 1.23912937e+00,  2.59977440e+00],
       [ 4.79680436e+00, -1.32883128e+00],
       [-2.70674472e+00, -1.79713428e+00],
       [-2.14317561e+00, -8.08736468e-01],
       [-3.84873131e+00,  4.68227691e-01],
       [-6.31455506e-01,  1.82196749e+00],
       [-4.13329446e+00,  7.46528349e-02],
       [-6.60314663e-01, -1.05954597e+00],
       [-1.35759053e+00,  1.02801380e-01],
       [-4.84589167e+00, -2.42207709e+00],
       [-2.81317480e+00, -9.93527746e-01],
       [-1.77842598e+00, -5.15100183e-01],
       [-2.06745665e+00,  3.85037678e+00],
       [-3.15065133e+00, -2.14222173e+00],
       [-4.65537550e+00, -8.43797914e-01],
       [ 5.49733006e+00, -3.96710229e+00],
       [-7.10475844e-01,  1.84929136e+00],
       [-2.45207731e+00, -3.84262744e-01],
       [-2.38605903e+00,  3.31824106e+00],
       [ 3.30630648e+00,  4.16882946e+00],
       [-2.10877597e+00,  1.09063227e-02],
       [-2.04532616e+00,  1.70562726e+00],
       [-2.46494274e+00, -2.38989936e+00],
       [-2.15941369e+00,  9.57699998e-01],
       [-1.95268062e+00, -9.74329414e-01],
       [-2.91881236e+00, -1.70939085e+00],
       [ 7.73806249e+00, -6.46699917e-01],
       [-3.28057588e+00,  1.60341941e-01],
       [ 3.52602655e+00, -2.14620755e+00],
       [-7.17488105e-01,  3.75428544e+00],
       [-1.48338132e+00,  2.90610819e-01],
       [ 8.60450557e-01,  9.53650128e-01],
       [-1.29404768e+00,  5.01799340e+00],
       [ 6.18597008e+00,  5.35168399e+00],
       [ 1.36386033e+00, -1.70600385e+00],
       [-3.85708305e+00,  8.60918879e-01],
       [ 2.82379549e+00,  3.51349308e+00],
       [ 5.77021864e+00, -8.68954194e-01],
       [-9.20845296e-01,  4.38479892e-01],
       [-2.64364962e+00, -1.53680076e+00],
       [ 2.36140698e+00,  4.91945770e+00],
       [-3.05412232e+00,  3.27481707e-01],
       [-4.47193714e+00, -1.83346848e+00],
       [-2.05008549e+00,  1.80693930e+00],
       [ 4.42451620e+00, -7.83908426e-01],
       [-2.46113495e+00,  3.25076456e+00],
       [-3.30310763e+00, -3.17892394e+00],
       [-2.70485028e+00, -2.57755488e-01],
       [ 1.13425267e+00,  7.24647940e+00],
       [-1.64485840e+00, -4.53874880e+00],
       [ 5.09549789e+00, -1.58894185e+00],
       [ 2.75824852e+00, -3.81022405e+00],
       [-1.13164961e+00,  5.73191190e+00],
       [-1.20232680e+00,  1.61081122e+00],
       [-1.27070393e+00,  2.59387132e+00],
       [-2.60798250e+00, -4.63160069e-01],
       [ 1.03885658e+00, -2.14588343e+00],
       [ 3.75040902e+00, -2.68559217e+00],
       [-2.38469869e+00, -2.63039394e-01],
       [-2.03740689e+00,  3.35625140e-01],
       [-2.80930188e+00, -1.08811851e+00],
       [-2.21067216e+00,  4.88819568e-02],
       [ 5.04442917e-01,  4.49634497e-01],
       [-3.70945085e+00, -1.37116320e+00],
       [-6.74609711e-01, -1.53521373e+00],
       [ 5.06152345e+00, -9.89470312e-01],
       [-9.19341657e-01,  9.73772069e-01],
       [ 3.08936423e+00,  4.53301819e+00],
       [-2.23401321e-01, -1.48046894e-01],
       [-2.60280296e+00,  3.14188111e+00],
       [ 1.31403031e+00,  4.06443943e-01],
       [-4.68153349e-02, -1.00242062e-01],
       [-1.93308693e+00, -1.13704176e+00],
       [-1.52131762e+00,  4.33315745e-01],
       [-1.89979935e+00,  2.55562117e+00],
       [ 4.92703390e+00, -2.66104990e+00],
       [ 7.13819586e+00,  3.17589441e-01],
       [-2.40835023e+00, -8.11563994e-01],
       [-2.87165038e-01,  1.43716632e-01],
       [-3.30608498e+00, -1.49983757e+00],
       [ 2.78570615e+00, -2.69074524e-01],
       [-4.09042641e+00, -5.67502440e-01],
       [ 1.73178814e+00, -1.42755166e+00],
       [ 3.45398267e+00, -1.52512566e+00],
       [-2.62780130e+00, -1.85313456e+00],
       [-1.23374077e+00,  1.12496875e+00],
       [-4.02645464e+00, -2.61337160e+00],
       [ 1.05640933e+00,  1.18094784e+00],
       [-2.81191212e+00,  1.08417577e+00],
       [ 3.04369006e+00, -1.67882484e+00],
       [-2.55021489e+00, -2.50921046e+00],
       [ 2.07523849e+00,  6.66926970e+00],
       [ 4.12677984e+00, -1.83244075e-01],
       [-2.89339810e+00, -1.44823665e+00],
       [-1.95021133e+00, -2.19120724e-01],
       [ 3.70277215e+00,  1.05339353e+00],
       [ 1.20323367e+01, -7.15382114e+00],
       [-2.72114047e+00,  1.39739516e+00],
       [-1.54768248e+00, -1.06381042e+00],
       [-3.93722770e+00,  7.31131573e-01],
       [ 4.36872161e+00, -3.86779224e+00],
       [ 1.37001313e+00,  2.24166805e+00],
       [-1.57574255e-01,  1.54503323e+00],
       [-3.33991601e+00, -1.13764071e+00],
       [ 2.48345877e+00,  2.74093101e+00],
       [-3.21967402e+00,  1.27036773e+00],
       [-7.55925465e-01,  6.76999723e-01],
       [-4.02153678e+00,  1.15381800e+00],
       [ 3.78284363e+00,  1.16532486e+00],
       [-3.50769347e+00, -6.69019386e-01],
       [ 4.96937042e+00, -1.40626742e+00],
       [-3.90174101e-01,  2.36702092e+00],
       [ 6.59737010e+00, -1.26031209e+00],
       [ 1.84778061e+00,  2.15738793e-01],
       [ 4.68468744e+00, -1.33169271e+00],
       [ 3.98587486e+00, -2.78214719e+00],
       [-7.58645079e-02,  2.44365016e+00],
       [-4.33286950e+00, -4.45442900e-02],
       [-2.02879388e+00, -1.08666613e+00],
       [-1.21635779e+00,  4.17561211e+00],
       [-2.13309598e+00, -1.95597604e+00],
       [ 3.43700763e-01, -3.09289490e+00],
       [-1.85413800e+00, -1.56408675e+00],
       [-1.99762141e+00, -2.73554611e-01],
       [-2.02117226e+00, -2.91196532e-01],
       [-3.66335178e+00, -1.26377404e+00],
       [-4.61969198e+00, -1.90820192e+00],
       [ 3.38236374e-01, -8.46804337e-01],
       [-4.28739039e+00, -2.02743717e+00],
       [-3.12293218e+00, -1.79357118e+00],
       [ 3.32522906e+00, -1.16942385e+00],
       [-3.03818514e+00, -6.86340209e-01],
       [-1.41286812e+00, -1.80538880e-01],
       [ 3.73032333e+00, -8.78629563e-01],
       [ 3.36597246e+00, -2.69325076e+00],
       [ 5.04828453e+00,  9.92383806e-01],
       [ 3.18210080e+00,  1.67207115e+00],
       [-2.59875006e+00, -1.15106215e+00],
       [-1.38321955e+00,  4.36980376e-01],
       [ 7.10758459e+00,  2.24256979e+00],
       [-2.49561717e+00,  2.64418263e+00],
       [-3.02585619e+00, -2.27978639e+00],
       [ 6.90782343e+00,  1.30532255e+01],
       [ 2.46485155e-01, -1.51496312e+00],
       [ 7.37714748e-01,  4.83505271e-01],
       [-5.07641436e-01,  1.28781779e+00],
       [-2.18991826e+00,  6.74413409e-01],
       [-2.70471366e+00,  1.42730016e+00],
       [-1.06006398e+00,  1.91509798e+00],
       [-1.93841122e+00,  1.40614284e+00],
       [ 2.65474324e+00, -3.77976692e+00],
       [ 3.41518173e+00, -3.26969969e+00],
       [ 1.09754132e+01, -3.39077083e+00],
       [-1.34961830e+00,  8.21667815e-01],
       [ 1.94971254e+00, -2.30718501e+00],
       [ 7.25347957e+00,  1.31899966e-01],
       [-1.10336325e+00, -1.40499894e+00],
       [-2.59087359e+00,  6.87553459e-01],
       [-2.91815248e+00, -5.34214322e-01],
       [-1.62087749e+00,  8.68005672e-01],
       [-2.51366631e+00, -8.01356534e-01],
       [-1.20499330e+00, -1.53756961e-01],
       [ 2.29162729e+00, -2.52740867e-01],
       [-4.77664233e+00, -6.08178860e-01],
       [-2.20849723e+00, -1.32508957e+00],
       [-7.33204993e-01,  3.24374200e+00],
       [-7.58666545e-01, -2.07848833e+00],
       [-3.86864326e+00,  4.65225834e-01],
       [-1.45923590e+00, -1.56421810e+00],
       [ 7.16341081e+00,  1.04687600e+01],
       [ 3.31088863e+00, -1.65939327e+00],
       [-3.12337049e+00, -1.92657856e+00],
       [ 5.00812584e+00, -1.17383608e+00],
       [-6.60728916e-01,  1.78510222e+00],
       [ 6.25554401e+00,  1.14757178e+00],
       [-3.32980445e-01, -9.88442588e-01],
       [-2.46580519e+00, -2.02959456e+00],
       [ 2.31208360e+00,  4.58431137e+00],
       [-3.88156898e+00,  1.08240797e+00],
       [-1.29131997e+00,  9.34130409e-01],
       [-3.32442070e+00, -8.47415896e-01],
       [ 5.18819885e+00, -2.21553456e+00],
       [ 7.87674197e-01, -1.56145504e+00],
       [ 4.58203743e+00,  4.83978852e-01],
       [-1.64242158e+00,  1.73988692e+00],
       [-5.42471122e+00,  4.77409505e-01],
       [ 8.74079598e+00,  3.63711992e+00],
       [ 2.69324550e+00,  1.73128614e-01],
       [-4.60707340e+00, -2.83900047e+00],
       [-4.79526420e-01, -9.22611535e-01],
       [ 5.89385302e-01,  2.77395633e-01],
       [-1.47351784e+00,  1.71611168e+00],
       [ 1.33730462e+00,  9.81948610e-01],
       [ 1.82231573e+00,  2.88998076e+00],
       [-3.74738325e+00, -1.84165435e+00],
       [-3.08730294e-02,  2.29738556e+00],
       [-4.50451659e+00, -3.24612629e+00],
       [-2.91179143e+00,  3.27834406e-01],
       [-3.56085734e+00, -1.32616622e+00],
       [-1.99695736e+00, -2.47535686e+00],
       [-2.74199332e+00, -2.16327060e+00],
       [-1.64919372e+00,  2.48401519e+00],
       [-4.42608805e+00, -8.69927284e-01],
       [ 8.64544242e+00, -3.19786856e+00],
       [ 2.54141935e+00, -2.35730784e+00],
       [-7.40113097e-01, -5.18784054e-02],
       [-2.97598746e+00,  1.77375924e+00],
       [-3.36427340e+00, -7.54150773e-02],
       [ 4.83863925e+00,  3.23691356e+00],
       [-5.51825211e+00, -7.73630580e-01],
       [-3.05793601e+00, -2.22100606e+00],
       [-1.85775451e+00,  1.39851943e+00],
       [-2.80981317e+00, -4.05238960e-01],
       [ 4.21342556e+00, -4.98812885e+00],
       [-2.95696557e+00, -7.09572756e-01],
       [ 4.16646616e+00,  3.01282388e+00],
       [ 1.03933388e+00,  1.02169754e+00],
       [ 4.22800713e+00,  1.32538252e+00],
       [-2.00444883e+00,  4.21084778e-01],
       [ 9.21475147e+00,  2.19772974e+00],
       [-3.52645638e-01,  2.20207995e+00],
       [ 1.70657037e+00, -2.00123218e+00],
       [-2.33651837e+00,  7.14403093e-01],
       [ 5.24198043e+00, -6.53178325e+00],
       [ 1.61490244e+01, -7.24611405e+00],
       [-2.42421736e+00,  5.76192085e-01],
       [-2.77905225e+00,  3.40496767e-01],
       [-3.42249621e+00, -2.48502091e+00],
       [-1.93414823e+00,  1.42872198e+00],
       [-3.03794269e+00, -1.89637954e-01],
       [-2.89394159e+00, -3.48525482e-02],
       [-1.39527531e+00,  1.42061067e+00],
       [-4.98922907e+00, -3.48023351e+00],
       [ 7.28191791e+00, -3.44747764e+00],
       [ 6.42282927e+00, -1.61219289e+00],
       [ 2.85230648e+00,  7.23177385e-01],
       [ 1.24756825e+00, -2.31255427e+00],
       [ 4.37071230e+00, -1.96779348e+00],
       [-5.49703234e-02, -1.19736251e-01],
       [-1.29663111e+00,  1.54010964e+00],
       [-1.95475015e+00, -1.85472567e+00],
       [-3.24845224e+00, -9.64474413e-01],
       [-3.12152066e+00, -3.36486268e-01],
       [-2.08953432e+00,  6.28456436e-02],
       [-1.90796477e+00,  2.56172011e+00],
       [ 3.56564947e+00, -3.71340906e+00],
       [-1.89951941e+00, -6.06163363e-01],
       [ 4.20915999e-01, -2.74923340e+00],
       [-1.84859038e+00, -9.18591681e-01],
       [ 3.42045099e-01,  1.69806524e+00],
       [-3.36897374e-01, -5.75581704e-01],
       [-3.54310230e-01, -3.32406750e-01],
       [-1.81253092e+00, -1.10977930e-01],
       [ 9.54219195e+00, -5.32746910e+00],
       [ 1.48118077e+00, -9.85472568e-01],
       [ 1.76641962e+00,  2.41951459e+00],
       [-3.12026972e+00, -7.89515570e-01],
       [ 4.59018860e+00,  3.21424161e+00],
       [-2.16853356e+00,  4.68564919e-01],
       [-3.75623266e+00, -3.81717458e-01],
       [ 2.81679892e-01, -4.13729593e-01],
       [ 1.08542269e+01, -1.89004497e+00],
       [ 5.08560754e+00,  3.23758776e+00],
       [ 7.56203604e-01, -2.29298747e+00],
       [-3.78502513e-01, -7.76399742e-01],
       [ 2.21777653e+00,  1.04172770e+00],
       [ 1.81985826e+00, -4.23182624e+00],
       [ 5.33380606e+00,  4.10137976e+00],
       [-3.42475266e+00, -2.17190888e+00],
       [ 9.04537882e+00,  2.33752916e+00],
       [-4.45929164e-01,  3.86055779e+00],
       [ 3.30121037e+00, -1.42796866e+00],
       [-3.63721835e+00,  1.59165031e+00],
       [-1.71011727e+00,  1.00972610e+00],
       [ 3.27958813e+00, -1.04055950e+00],
       [ 5.74162486e-01,  4.54631649e-01],
       [ 1.46523027e+00,  1.44386747e+00],
       [ 4.07811727e+00,  6.45074106e-01],
       [-2.64190057e+00,  6.02524612e-01],
       [-4.08982337e+00, -5.29707118e-01],
       [-1.84357047e+00,  1.73549435e+00],
       [ 1.03893032e-01,  7.26493782e+00],
       [ 3.81682257e+00, -8.43166568e-01],
       [-2.37124119e+00, -1.69173266e+00],
       [-1.63870782e+00, -1.97805600e-01],
       [ 6.54993668e+00, -5.80871669e+00],
       [ 3.71474288e-01,  3.97520197e+00],
       [-3.21380062e-01,  1.58980046e-01],
       [ 3.61544513e-01,  1.18595929e+00],
       [ 3.03183473e+00,  8.69697874e-01],
       [ 6.51808828e-01,  7.38291727e-01],
       [-1.21826465e+00, -1.56206981e+00],
       [-8.89951063e-01,  2.54182711e+00],
       [-2.00662011e+00,  3.05064104e-01],
       [ 5.58670329e+00,  1.11452916e+00],
       [ 3.93457631e-01,  3.64475677e+00],
       [-4.71055375e+00, -3.06342001e-01],
       [ 3.60862610e+00, -2.09427033e+00],
       [-2.05349968e+00,  1.24324055e+00],
       [-2.24850174e+00, -2.48904595e+00],
       [-3.86328935e+00,  4.39682987e+00],
       [-4.48217238e-01,  1.52081858e-01],
       [-1.34839911e+00, -1.76992390e+00],
       [ 2.71067086e+00, -4.32043003e+00],
       [ 3.47604839e-02, -3.37853851e+00],
       [ 6.49029629e+00,  7.94277378e+00],
       [-3.20758299e+00,  5.43091464e-01],
       [-5.69315584e+00, -5.73866830e-01],
       [-1.07704361e+00,  1.37951070e+00]])
In [25]:
# 寄与率
print('explained variance ratio: {}'.format(pca.explained_variance_ratio_))
# explained variance ratio: [ 0.43315126  0.19586506]
  #【コメント】累積で60%ちょいくらい
explained variance ratio: [0.43315126 0.19586506]
In [28]:
# 散布図にプロット
temp = pd.DataFrame(X_train_pca)
# 散布図にプロット
temp = pd.DataFrame(X_train_pca)
temp['Outcome'] = y_train.values
temp
Out[28]:
0 1 Outcome
0 -2.859248 -0.286145 0
1 -3.263084 1.072520 0
2 3.755513 -3.409689 1
3 -3.492632 -2.687847 0
4 -0.743517 -2.482149 1
... ... ... ...
421 0.034760 -3.378539 1
422 6.490296 7.942774 1
423 -3.207583 0.543091 0
424 -5.693156 -0.573867 0
425 -1.077044 1.379511 0

426 rows × 3 columns

In [20]:
b = temp[temp['Outcome'] == 0]
m = temp[temp['Outcome'] == 1]
plt.scatter(x=b[0], y=b[1], marker='o') # 良性は○でマーク
plt.scatter(x=m[0], y=m[1], marker='^') # 悪性は△でマーク
plt.xlabel('PC 1') # 第1主成分をx軸
plt.ylabel('PC 2') # 第2主成分をy軸
  #【コメント】2次元に次元圧縮してもそれなりに分類できそう
X_train_pca shape: (426, 2)
explained variance ratio: [0.43315126 0.19586506]
Out[20]:
Text(0, 0.5, 'PC 2')
In [98]:
  #【コメント】次元数2まで圧縮してロジスティック回帰してみる
# 次元数2まで圧縮
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.fit_transform(X_test_scaled)


# ロジスティック回帰で学習
logistic = LogisticRegressionCV(cv=10, random_state=0,max_iter=1000)
  #【コメント】イテレーションを増やすようにというWarningが出ていたので100にした
logistic.fit(X_train_pca, y_train)

# 検証
print('Train score: {:.3f}'.format(logistic.score(X_train_pca, y_train)))
print('Test score: {:.3f}'.format(logistic.score(X_test_pca, y_test)))
print('Confustion matrix:\n{}'.format(confusion_matrix(y_true=y_test, y_pred=logistic.predict(X_test_pca))))
  #【コメント】検証スコア91.6%
Train score: 0.965
Test score: 0.916
Confustion matrix:
[[83  7]
 [ 5 48]]
In [70]:
import numpy as np
In [114]:
#【コメント】1-15次元までのスコアと処理時間を可視化してみる
import time

n_max = 15
n_array = np.arange(1,n_max+1,1)
scores = np.zeros(n_max)
times = np.zeros(n_max)

for n in n_array:
  #開始時間記憶
  start_time = time.perf_counter()

  # 次元数nまで圧縮
  pca = PCA(n_components=n)
  X_train_pca = pca.fit_transform(X_train_scaled)
  X_test_pca = pca.fit_transform(X_test_scaled)
  # ロジスティック回帰で学習
  logistic = LogisticRegressionCV(cv=10, random_state=0,max_iter=1000)
    #【コメント】イテレーションを増やすようにというWarningが出ていたので100にした
  logistic.fit(X_train_pca, y_train)
  #終了時間記憶
  end_time = time.perf_counter()

  scores[n-1] = logistic.score(X_test_pca, y_test)
  times[n-1] = end_time - start_time

fig = plt.figure()
ax1 = fig.add_subplot(111)
ln1=ax1.plot(n_array, scores,'C0',label=r'$score$')

ax2 = ax1.twinx()
ln2=ax2.plot(n_array,times,'C1',label=r'$time$')

h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1+h2, l1+l2, loc='lower right')

ax1.set_xlabel('n')
ax1.set_ylabel(r'$score$')
ax1.grid(True)
ax2.set_ylabel(r'$time[s]$')

#plt.plot(n_array,scores)
#plt.plot(n_array,times)
#【コメント】scoreは単純に比例していくわけではない←想像してたのと違うなんで?
#【コメント】timeは実行ごとに違うけどおおむね比例傾向。サーバーの他の処理状況によって変動はありそうなので納得できる
Out[114]:
Text(0, 0.5, '$time[s]$')