調べたことやコード変更した内容について下記のようにコメントを入れることにしています。
#【コメント】コメントです
#https://ohke.hateblo.jp/entry/2017/08/11/230000 を参考に利用しています。
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
以下では、Googleドライブのマイドライブ直下に study_ai_ml_google フォルダを置き、その中の data/cancer.csv を読み込むことを仮定しています。必要に応じて、パスを変更してください。
# Load the cancer dataset CSV from the mounted Drive and report its dimensions.
cancer_csv_path = '/content/drive/My Drive/study_ai_ml_google/data/cancer.csv'
cancer_df = pd.read_csv(cancer_csv_path)
print('cancer df shape: {}'.format(cancer_df.shape))
cancer df shape: (569, 33)
# Notebook display: render the freshly loaded frame to inspect its columns.
cancer_df
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | 1.1760 | 1.2560 | 7.673 | 158.70 | 0.010300 | 0.02891 | 0.05198 | 0.02454 | 0.01114 | 0.004239 | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | NaN |
| 565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | 0.7655 | 2.4630 | 5.203 | 99.04 | 0.005769 | 0.02423 | 0.03950 | 0.01678 | 0.01898 | 0.002498 | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | NaN |
| 566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | NaN |
| 567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | 0.7260 | 1.5950 | 5.772 | 86.22 | 0.006522 | 0.06158 | 0.07117 | 0.01664 | 0.02324 | 0.006185 | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | NaN |
| 568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | 0.3857 | 1.4280 | 2.548 | 19.15 | 0.007189 | 0.00466 | 0.00000 | 0.00000 | 0.02676 | 0.002783 | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | NaN |
569 rows × 33 columns
# 'Unnamed: 32' shows only NaN in the rows displayed, so remove it in place,
# then display the frame again to confirm the column is gone.
cancer_df.drop(columns=['Unnamed: 32'], inplace=True)
cancer_df
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | 1.1760 | 1.2560 | 7.673 | 158.70 | 0.010300 | 0.02891 | 0.05198 | 0.02454 | 0.01114 | 0.004239 | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
| 565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | 0.7655 | 2.4630 | 5.203 | 99.04 | 0.005769 | 0.02423 | 0.03950 | 0.01678 | 0.01898 | 0.002498 | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
| 566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
| 567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | 0.7260 | 1.5950 | 5.772 | 86.22 | 0.006522 | 0.06158 | 0.07117 | 0.01664 | 0.02324 | 0.006185 | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
| 568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | 0.3857 | 1.4280 | 2.548 | 19.15 | 0.007189 | 0.00466 | 0.00000 | 0.00000 | 0.02676 | 0.002783 | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 32 columns
・diagnosis: 診断結果 (良性がB / 悪性がM)
・説明変数は3列目以降、目的変数は2列目とし、ロジスティック回帰で分類
# Extract the target variable: 1 where diagnosis is 'M' (malignant), 0 otherwise.
y = (cancer_df['diagnosis'] == 'M').astype('int64')
y
0 1
1 1
2 1
3 1
4 1
..
564 1
565 1
566 1
567 1
568 0
Name: diagnosis, Length: 569, dtype: int64
# Extract the explanatory variables: every column from 'radius_mean' to the last one.
first_feature = cancer_df.columns.get_loc('radius_mean')
X = cancer_df.iloc[:, first_feature:]
X
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | 1.1760 | 1.2560 | 7.673 | 158.70 | 0.010300 | 0.02891 | 0.05198 | 0.02454 | 0.01114 | 0.004239 | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
| 565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | 0.7655 | 2.4630 | 5.203 | 99.04 | 0.005769 | 0.02423 | 0.03950 | 0.01678 | 0.01898 | 0.002498 | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
| 566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | 0.4564 | 1.0750 | 3.425 | 48.55 | 0.005903 | 0.03731 | 0.04730 | 0.01557 | 0.01318 | 0.003892 | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
| 567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | 0.7260 | 1.5950 | 5.772 | 86.22 | 0.006522 | 0.06158 | 0.07117 | 0.01664 | 0.02324 | 0.006185 | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
| 568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | 0.3857 | 1.4280 | 2.548 | 19.15 | 0.007189 | 0.00466 | 0.00000 | 0.00000 | 0.02676 | 0.002783 | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 30 columns
# Split the data into training and test sets (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize: learn the scaling statistics from the training split only,
# then apply that same scaling to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression with 10-fold cross-validation.
# [Comment] max_iter is set to 1000 because a warning asked for more iterations.
logistic = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)
logistic.fit(X_train_scaled, y_train)

# Validation: score on both splits and the confusion matrix on the test split.
train_score = logistic.score(X_train_scaled, y_train)
test_score = logistic.score(X_test_scaled, y_test)
test_predictions = logistic.predict(X_test_scaled)
print('Train score: {:.3f}'.format(train_score))
print('Test score: {:.3f}'.format(test_score))
print('Confustion matrix:\n{}'.format(confusion_matrix(y_true=y_test, y_pred=test_predictions)))
Train score: 0.988 Test score: 0.972 Confustion matrix: [[89 1] [ 3 50]]
・検証スコア97%で分類できることを確認
# Fit a 30-component PCA on the scaled training data and plot each component's
# explained-variance ratio as a bar chart.
pca = PCA(n_components=30)
pca.fit(X_train_scaled)
variance_ratios = pca.explained_variance_ratio_
component_numbers = list(range(1, len(variance_ratios) + 1))
plt.bar(component_numbers, variance_ratios)
# [Comment] Visualizes the contribution ratio of each dimension.
# [Comment] A common guideline is apparently to keep enough principal components
# for a cumulative contribution ratio of 80% (for this data, up to about 4-5 dimensions?).
<BarContainer object of 30 artists>
# PCA
# Compress down to 2 dimensions
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
print('X_train_pca shape: {}'.format(X_train_pca.shape))
# Expected output: X_train_pca shape: (426, 2)
X_train_pca
# [Comment] The array shown below now has 2 dimensions.
X_train_pca shape: (426, 2)
array([[-2.85924814e+00, -2.86144799e-01],
[-3.26308367e+00, 1.07251955e+00],
[ 3.75551268e+00, -3.40968876e+00],
[-3.49263242e+00, -2.68784664e+00],
[-7.43517447e-01, -2.48214924e+00],
[-3.36556228e+00, -6.04103786e-01],
[-3.18672558e+00, -1.88203437e+00],
[-9.75800523e-01, 9.14900548e-01],
[ 3.61845617e+00, 4.16797546e+00],
[ 4.25896553e+00, 9.52260594e+00],
[-8.00920047e-01, 7.76348214e-02],
[-3.44711508e+00, 1.26773112e+00],
[ 1.85939039e+00, 7.94595702e-01],
[-2.12832960e+00, -1.18572536e+00],
[-6.04598633e-01, -9.14935763e-01],
[-3.93494805e+00, -2.07089658e+00],
[ 4.18868101e+00, 9.28984343e-01],
[ 4.93713709e+00, -1.15947338e+00],
[-3.81508251e+00, -9.42143756e-01],
[-2.49836390e-01, 2.30399084e+00],
[ 2.51338427e+00, -1.32090995e+00],
[-3.22757570e+00, -4.56772534e-01],
[ 3.35229082e+00, -1.26381683e-02],
[ 2.18140921e+00, 2.44624599e+00],
[ 2.81827206e+00, -9.52361878e-01],
[-5.91056742e-01, 6.88095808e-01],
[-2.49043698e+00, -1.21171505e+00],
[-3.55145924e+00, 1.81948146e+00],
[-2.19027757e+00, 1.26719173e+00],
[-3.55927961e+00, 8.76653657e-01],
[-2.40998861e+00, -1.60887845e-02],
[ 4.13426028e-01, -3.55079290e+00],
[-4.56615956e+00, 3.43931793e+00],
[-3.90330920e+00, 1.17670830e+00],
[-1.53918414e+00, 5.40453240e-01],
[-3.73795472e+00, 2.12903714e-01],
[-6.36675531e-01, -2.09581728e+00],
[ 6.07583482e+00, -5.80579153e-01],
[-1.73251287e+00, 7.70136727e-01],
[-1.55345820e+00, -1.19780365e+00],
[-4.15147857e-01, -1.35338178e+00],
[-2.36272865e+00, 9.84354139e-01],
[-3.40978942e+00, 7.55235499e-01],
[-1.18216531e+00, -1.30756011e+00],
[-1.39403308e+00, 1.52685920e+00],
[ 4.65296344e+00, -2.14408451e+00],
[-4.33879458e+00, -9.38248915e-01],
[-2.45968692e+00, 2.70035719e+00],
[ 2.27508313e+00, -2.60695474e+00],
[ 3.24923498e+00, 3.47431138e+00],
[-1.91967965e+00, 1.88106716e+00],
[-1.27651922e+00, 3.55469907e+00],
[ 7.17621927e+00, 8.18409598e-01],
[ 9.08697926e+00, -3.10749563e-01],
[-3.23524871e+00, -1.18745010e+00],
[-2.20467968e+00, -1.21596160e+00],
[-1.05300808e+00, -3.28517718e+00],
[-2.24494165e+00, -4.92559105e-01],
[-3.61561101e+00, 7.24425938e-01],
[-8.40921667e-01, -3.47761582e+00],
[ 7.34976529e+00, -5.00822726e+00],
[ 3.83373731e+00, -1.76139366e+00],
[-4.36304846e+00, -3.96252459e-01],
[-2.20163885e+00, -3.84816328e-01],
[ 1.14893785e+00, 8.48759023e+00],
[ 5.15463302e+00, -1.82508588e+00],
[-2.16163413e+00, -6.89306959e-01],
[-9.56855903e-01, 9.69405139e-01],
[-4.74913874e+00, -1.87497720e+00],
[-4.11757830e+00, 2.35606332e+00],
[-4.39073535e+00, -8.48100157e-01],
[ 7.22447974e+00, -5.23060667e+00],
[-2.09400538e+00, 1.57533741e+00],
[ 8.76846726e+00, -2.73290060e-01],
[-1.14191485e+00, -4.64028243e-01],
[ 3.25195320e+00, -8.14745970e-01],
[-1.56343226e+00, 2.92705742e-01],
[ 2.84561718e+00, 3.58893948e+00],
[-2.88263563e+00, 1.19921703e-01],
[-9.40965073e-01, -1.66466986e+00],
[-2.81677889e+00, -1.06837719e+00],
[-1.87619282e+00, -1.58372064e+00],
[-3.62455357e+00, -7.28537826e-01],
[-1.91892081e+00, 9.08819247e-01],
[ 3.05638916e+00, 8.34476104e-01],
[-4.92153454e+00, -2.17997161e+00],
[ 4.96878177e+00, -3.92315781e+00],
[-1.11798339e+00, -1.61320075e+00],
[-1.75649911e+00, 4.00794343e-01],
[-3.12851580e+00, -1.71159834e+00],
[ 4.30569137e+00, -4.50347379e+00],
[-2.59652477e+00, 2.06936284e-01],
[-3.44742342e+00, -5.32322647e-01],
[-2.60593639e+00, -7.20433256e-01],
[ 3.60488597e+00, 2.27249204e+00],
[-3.48244169e+00, 1.69043919e+00],
[-2.83101797e+00, -2.89280858e+00],
[ 1.20458550e+00, -1.93661155e+00],
[ 3.95647301e+00, -1.73402958e+00],
[-1.99568662e+00, -1.77142390e-01],
[ 9.10237289e+00, 8.21421377e-01],
[-2.31675492e+00, 2.13498273e+00],
[ 3.90646007e+00, 6.11178022e+00],
[-4.70460011e+00, -5.09302984e-01],
[ 3.87941451e+00, 2.51111387e+00],
[ 2.45644812e+00, 4.03445443e+00],
[ 1.33788158e+00, -1.18818514e+00],
[ 2.08904616e+00, 1.25192608e+00],
[-1.45252740e+00, 1.34750263e-01],
[-1.96427987e+00, -4.08162077e-01],
[-4.16590017e+00, -1.44643028e+00],
[ 1.28113496e+01, 2.81500498e+00],
[ 3.38054066e-02, -1.79166464e-01],
[ 4.77735587e+00, 3.49083524e+00],
[-2.31709171e+00, -7.58439556e-01],
[ 3.43170845e+00, -2.20656750e+00],
[-2.91220473e+00, 3.29900861e-01],
[ 1.23912937e+00, 2.59977440e+00],
[ 4.79680436e+00, -1.32883128e+00],
[-2.70674472e+00, -1.79713428e+00],
[-2.14317561e+00, -8.08736468e-01],
[-3.84873131e+00, 4.68227691e-01],
[-6.31455506e-01, 1.82196749e+00],
[-4.13329446e+00, 7.46528349e-02],
[-6.60314663e-01, -1.05954597e+00],
[-1.35759053e+00, 1.02801380e-01],
[-4.84589167e+00, -2.42207709e+00],
[-2.81317480e+00, -9.93527746e-01],
[-1.77842598e+00, -5.15100183e-01],
[-2.06745665e+00, 3.85037678e+00],
[-3.15065133e+00, -2.14222173e+00],
[-4.65537550e+00, -8.43797914e-01],
[ 5.49733006e+00, -3.96710229e+00],
[-7.10475844e-01, 1.84929136e+00],
[-2.45207731e+00, -3.84262744e-01],
[-2.38605903e+00, 3.31824106e+00],
[ 3.30630648e+00, 4.16882946e+00],
[-2.10877597e+00, 1.09063227e-02],
[-2.04532616e+00, 1.70562726e+00],
[-2.46494274e+00, -2.38989936e+00],
[-2.15941369e+00, 9.57699998e-01],
[-1.95268062e+00, -9.74329414e-01],
[-2.91881236e+00, -1.70939085e+00],
[ 7.73806249e+00, -6.46699917e-01],
[-3.28057588e+00, 1.60341941e-01],
[ 3.52602655e+00, -2.14620755e+00],
[-7.17488105e-01, 3.75428544e+00],
[-1.48338132e+00, 2.90610819e-01],
[ 8.60450557e-01, 9.53650128e-01],
[-1.29404768e+00, 5.01799340e+00],
[ 6.18597008e+00, 5.35168399e+00],
[ 1.36386033e+00, -1.70600385e+00],
[-3.85708305e+00, 8.60918879e-01],
[ 2.82379549e+00, 3.51349308e+00],
[ 5.77021864e+00, -8.68954194e-01],
[-9.20845296e-01, 4.38479892e-01],
[-2.64364962e+00, -1.53680076e+00],
[ 2.36140698e+00, 4.91945770e+00],
[-3.05412232e+00, 3.27481707e-01],
[-4.47193714e+00, -1.83346848e+00],
[-2.05008549e+00, 1.80693930e+00],
[ 4.42451620e+00, -7.83908426e-01],
[-2.46113495e+00, 3.25076456e+00],
[-3.30310763e+00, -3.17892394e+00],
[-2.70485028e+00, -2.57755488e-01],
[ 1.13425267e+00, 7.24647940e+00],
[-1.64485840e+00, -4.53874880e+00],
[ 5.09549789e+00, -1.58894185e+00],
[ 2.75824852e+00, -3.81022405e+00],
[-1.13164961e+00, 5.73191190e+00],
[-1.20232680e+00, 1.61081122e+00],
[-1.27070393e+00, 2.59387132e+00],
[-2.60798250e+00, -4.63160069e-01],
[ 1.03885658e+00, -2.14588343e+00],
[ 3.75040902e+00, -2.68559217e+00],
[-2.38469869e+00, -2.63039394e-01],
[-2.03740689e+00, 3.35625140e-01],
[-2.80930188e+00, -1.08811851e+00],
[-2.21067216e+00, 4.88819568e-02],
[ 5.04442917e-01, 4.49634497e-01],
[-3.70945085e+00, -1.37116320e+00],
[-6.74609711e-01, -1.53521373e+00],
[ 5.06152345e+00, -9.89470312e-01],
[-9.19341657e-01, 9.73772069e-01],
[ 3.08936423e+00, 4.53301819e+00],
[-2.23401321e-01, -1.48046894e-01],
[-2.60280296e+00, 3.14188111e+00],
[ 1.31403031e+00, 4.06443943e-01],
[-4.68153349e-02, -1.00242062e-01],
[-1.93308693e+00, -1.13704176e+00],
[-1.52131762e+00, 4.33315745e-01],
[-1.89979935e+00, 2.55562117e+00],
[ 4.92703390e+00, -2.66104990e+00],
[ 7.13819586e+00, 3.17589441e-01],
[-2.40835023e+00, -8.11563994e-01],
[-2.87165038e-01, 1.43716632e-01],
[-3.30608498e+00, -1.49983757e+00],
[ 2.78570615e+00, -2.69074524e-01],
[-4.09042641e+00, -5.67502440e-01],
[ 1.73178814e+00, -1.42755166e+00],
[ 3.45398267e+00, -1.52512566e+00],
[-2.62780130e+00, -1.85313456e+00],
[-1.23374077e+00, 1.12496875e+00],
[-4.02645464e+00, -2.61337160e+00],
[ 1.05640933e+00, 1.18094784e+00],
[-2.81191212e+00, 1.08417577e+00],
[ 3.04369006e+00, -1.67882484e+00],
[-2.55021489e+00, -2.50921046e+00],
[ 2.07523849e+00, 6.66926970e+00],
[ 4.12677984e+00, -1.83244075e-01],
[-2.89339810e+00, -1.44823665e+00],
[-1.95021133e+00, -2.19120724e-01],
[ 3.70277215e+00, 1.05339353e+00],
[ 1.20323367e+01, -7.15382114e+00],
[-2.72114047e+00, 1.39739516e+00],
[-1.54768248e+00, -1.06381042e+00],
[-3.93722770e+00, 7.31131573e-01],
[ 4.36872161e+00, -3.86779224e+00],
[ 1.37001313e+00, 2.24166805e+00],
[-1.57574255e-01, 1.54503323e+00],
[-3.33991601e+00, -1.13764071e+00],
[ 2.48345877e+00, 2.74093101e+00],
[-3.21967402e+00, 1.27036773e+00],
[-7.55925465e-01, 6.76999723e-01],
[-4.02153678e+00, 1.15381800e+00],
[ 3.78284363e+00, 1.16532486e+00],
[-3.50769347e+00, -6.69019386e-01],
[ 4.96937042e+00, -1.40626742e+00],
[-3.90174101e-01, 2.36702092e+00],
[ 6.59737010e+00, -1.26031209e+00],
[ 1.84778061e+00, 2.15738793e-01],
[ 4.68468744e+00, -1.33169271e+00],
[ 3.98587486e+00, -2.78214719e+00],
[-7.58645079e-02, 2.44365016e+00],
[-4.33286950e+00, -4.45442900e-02],
[-2.02879388e+00, -1.08666613e+00],
[-1.21635779e+00, 4.17561211e+00],
[-2.13309598e+00, -1.95597604e+00],
[ 3.43700763e-01, -3.09289490e+00],
[-1.85413800e+00, -1.56408675e+00],
[-1.99762141e+00, -2.73554611e-01],
[-2.02117226e+00, -2.91196532e-01],
[-3.66335178e+00, -1.26377404e+00],
[-4.61969198e+00, -1.90820192e+00],
[ 3.38236374e-01, -8.46804337e-01],
[-4.28739039e+00, -2.02743717e+00],
[-3.12293218e+00, -1.79357118e+00],
[ 3.32522906e+00, -1.16942385e+00],
[-3.03818514e+00, -6.86340209e-01],
[-1.41286812e+00, -1.80538880e-01],
[ 3.73032333e+00, -8.78629563e-01],
[ 3.36597246e+00, -2.69325076e+00],
[ 5.04828453e+00, 9.92383806e-01],
[ 3.18210080e+00, 1.67207115e+00],
[-2.59875006e+00, -1.15106215e+00],
[-1.38321955e+00, 4.36980376e-01],
[ 7.10758459e+00, 2.24256979e+00],
[-2.49561717e+00, 2.64418263e+00],
[-3.02585619e+00, -2.27978639e+00],
[ 6.90782343e+00, 1.30532255e+01],
[ 2.46485155e-01, -1.51496312e+00],
[ 7.37714748e-01, 4.83505271e-01],
[-5.07641436e-01, 1.28781779e+00],
[-2.18991826e+00, 6.74413409e-01],
[-2.70471366e+00, 1.42730016e+00],
[-1.06006398e+00, 1.91509798e+00],
[-1.93841122e+00, 1.40614284e+00],
[ 2.65474324e+00, -3.77976692e+00],
[ 3.41518173e+00, -3.26969969e+00],
[ 1.09754132e+01, -3.39077083e+00],
[-1.34961830e+00, 8.21667815e-01],
[ 1.94971254e+00, -2.30718501e+00],
[ 7.25347957e+00, 1.31899966e-01],
[-1.10336325e+00, -1.40499894e+00],
[-2.59087359e+00, 6.87553459e-01],
[-2.91815248e+00, -5.34214322e-01],
[-1.62087749e+00, 8.68005672e-01],
[-2.51366631e+00, -8.01356534e-01],
[-1.20499330e+00, -1.53756961e-01],
[ 2.29162729e+00, -2.52740867e-01],
[-4.77664233e+00, -6.08178860e-01],
[-2.20849723e+00, -1.32508957e+00],
[-7.33204993e-01, 3.24374200e+00],
[-7.58666545e-01, -2.07848833e+00],
[-3.86864326e+00, 4.65225834e-01],
[-1.45923590e+00, -1.56421810e+00],
[ 7.16341081e+00, 1.04687600e+01],
[ 3.31088863e+00, -1.65939327e+00],
[-3.12337049e+00, -1.92657856e+00],
[ 5.00812584e+00, -1.17383608e+00],
[-6.60728916e-01, 1.78510222e+00],
[ 6.25554401e+00, 1.14757178e+00],
[-3.32980445e-01, -9.88442588e-01],
[-2.46580519e+00, -2.02959456e+00],
[ 2.31208360e+00, 4.58431137e+00],
[-3.88156898e+00, 1.08240797e+00],
[-1.29131997e+00, 9.34130409e-01],
[-3.32442070e+00, -8.47415896e-01],
[ 5.18819885e+00, -2.21553456e+00],
[ 7.87674197e-01, -1.56145504e+00],
[ 4.58203743e+00, 4.83978852e-01],
[-1.64242158e+00, 1.73988692e+00],
[-5.42471122e+00, 4.77409505e-01],
[ 8.74079598e+00, 3.63711992e+00],
[ 2.69324550e+00, 1.73128614e-01],
[-4.60707340e+00, -2.83900047e+00],
[-4.79526420e-01, -9.22611535e-01],
[ 5.89385302e-01, 2.77395633e-01],
[-1.47351784e+00, 1.71611168e+00],
[ 1.33730462e+00, 9.81948610e-01],
[ 1.82231573e+00, 2.88998076e+00],
[-3.74738325e+00, -1.84165435e+00],
[-3.08730294e-02, 2.29738556e+00],
[-4.50451659e+00, -3.24612629e+00],
[-2.91179143e+00, 3.27834406e-01],
[-3.56085734e+00, -1.32616622e+00],
[-1.99695736e+00, -2.47535686e+00],
[-2.74199332e+00, -2.16327060e+00],
[-1.64919372e+00, 2.48401519e+00],
[-4.42608805e+00, -8.69927284e-01],
[ 8.64544242e+00, -3.19786856e+00],
[ 2.54141935e+00, -2.35730784e+00],
[-7.40113097e-01, -5.18784054e-02],
[-2.97598746e+00, 1.77375924e+00],
[-3.36427340e+00, -7.54150773e-02],
[ 4.83863925e+00, 3.23691356e+00],
[-5.51825211e+00, -7.73630580e-01],
[-3.05793601e+00, -2.22100606e+00],
[-1.85775451e+00, 1.39851943e+00],
[-2.80981317e+00, -4.05238960e-01],
[ 4.21342556e+00, -4.98812885e+00],
[-2.95696557e+00, -7.09572756e-01],
[ 4.16646616e+00, 3.01282388e+00],
[ 1.03933388e+00, 1.02169754e+00],
[ 4.22800713e+00, 1.32538252e+00],
[-2.00444883e+00, 4.21084778e-01],
[ 9.21475147e+00, 2.19772974e+00],
[-3.52645638e-01, 2.20207995e+00],
[ 1.70657037e+00, -2.00123218e+00],
[-2.33651837e+00, 7.14403093e-01],
[ 5.24198043e+00, -6.53178325e+00],
[ 1.61490244e+01, -7.24611405e+00],
[-2.42421736e+00, 5.76192085e-01],
[-2.77905225e+00, 3.40496767e-01],
[-3.42249621e+00, -2.48502091e+00],
[-1.93414823e+00, 1.42872198e+00],
[-3.03794269e+00, -1.89637954e-01],
[-2.89394159e+00, -3.48525482e-02],
[-1.39527531e+00, 1.42061067e+00],
[-4.98922907e+00, -3.48023351e+00],
[ 7.28191791e+00, -3.44747764e+00],
[ 6.42282927e+00, -1.61219289e+00],
[ 2.85230648e+00, 7.23177385e-01],
[ 1.24756825e+00, -2.31255427e+00],
[ 4.37071230e+00, -1.96779348e+00],
[-5.49703234e-02, -1.19736251e-01],
[-1.29663111e+00, 1.54010964e+00],
[-1.95475015e+00, -1.85472567e+00],
[-3.24845224e+00, -9.64474413e-01],
[-3.12152066e+00, -3.36486268e-01],
[-2.08953432e+00, 6.28456436e-02],
[-1.90796477e+00, 2.56172011e+00],
[ 3.56564947e+00, -3.71340906e+00],
[-1.89951941e+00, -6.06163363e-01],
[ 4.20915999e-01, -2.74923340e+00],
[-1.84859038e+00, -9.18591681e-01],
[ 3.42045099e-01, 1.69806524e+00],
[-3.36897374e-01, -5.75581704e-01],
[-3.54310230e-01, -3.32406750e-01],
[-1.81253092e+00, -1.10977930e-01],
[ 9.54219195e+00, -5.32746910e+00],
[ 1.48118077e+00, -9.85472568e-01],
[ 1.76641962e+00, 2.41951459e+00],
[-3.12026972e+00, -7.89515570e-01],
[ 4.59018860e+00, 3.21424161e+00],
[-2.16853356e+00, 4.68564919e-01],
[-3.75623266e+00, -3.81717458e-01],
[ 2.81679892e-01, -4.13729593e-01],
[ 1.08542269e+01, -1.89004497e+00],
[ 5.08560754e+00, 3.23758776e+00],
[ 7.56203604e-01, -2.29298747e+00],
[-3.78502513e-01, -7.76399742e-01],
[ 2.21777653e+00, 1.04172770e+00],
[ 1.81985826e+00, -4.23182624e+00],
[ 5.33380606e+00, 4.10137976e+00],
[-3.42475266e+00, -2.17190888e+00],
[ 9.04537882e+00, 2.33752916e+00],
[-4.45929164e-01, 3.86055779e+00],
[ 3.30121037e+00, -1.42796866e+00],
[-3.63721835e+00, 1.59165031e+00],
[-1.71011727e+00, 1.00972610e+00],
[ 3.27958813e+00, -1.04055950e+00],
[ 5.74162486e-01, 4.54631649e-01],
[ 1.46523027e+00, 1.44386747e+00],
[ 4.07811727e+00, 6.45074106e-01],
[-2.64190057e+00, 6.02524612e-01],
[-4.08982337e+00, -5.29707118e-01],
[-1.84357047e+00, 1.73549435e+00],
[ 1.03893032e-01, 7.26493782e+00],
[ 3.81682257e+00, -8.43166568e-01],
[-2.37124119e+00, -1.69173266e+00],
[-1.63870782e+00, -1.97805600e-01],
[ 6.54993668e+00, -5.80871669e+00],
[ 3.71474288e-01, 3.97520197e+00],
[-3.21380062e-01, 1.58980046e-01],
[ 3.61544513e-01, 1.18595929e+00],
[ 3.03183473e+00, 8.69697874e-01],
[ 6.51808828e-01, 7.38291727e-01],
[-1.21826465e+00, -1.56206981e+00],
[-8.89951063e-01, 2.54182711e+00],
[-2.00662011e+00, 3.05064104e-01],
[ 5.58670329e+00, 1.11452916e+00],
[ 3.93457631e-01, 3.64475677e+00],
[-4.71055375e+00, -3.06342001e-01],
[ 3.60862610e+00, -2.09427033e+00],
[-2.05349968e+00, 1.24324055e+00],
[-2.24850174e+00, -2.48904595e+00],
[-3.86328935e+00, 4.39682987e+00],
[-4.48217238e-01, 1.52081858e-01],
[-1.34839911e+00, -1.76992390e+00],
[ 2.71067086e+00, -4.32043003e+00],
[ 3.47604839e-02, -3.37853851e+00],
[ 6.49029629e+00, 7.94277378e+00],
[-3.20758299e+00, 5.43091464e-01],
[-5.69315584e+00, -5.73866830e-01],
[-1.07704361e+00, 1.37951070e+00]])
# Contribution (explained-variance) ratio of each retained component
print('explained variance ratio: {}'.format(pca.explained_variance_ratio_))
# Expected output: explained variance ratio: [ 0.43315126 0.19586506]
# [Comment] Cumulatively a little over 60%.
explained variance ratio: [0.43315126 0.19586506]
# Prepare the data for the scatter plot: the two principal-component scores
# (integer column labels 0 and 1) plus the class label in 'Outcome'.
temp = pd.DataFrame(X_train_pca)
# .values attaches the labels by position, ignoring y_train's own index.
temp['Outcome'] = y_train.values
temp
| 0 | 1 | Outcome | |
|---|---|---|---|
| 0 | -2.859248 | -0.286145 | 0 |
| 1 | -3.263084 | 1.072520 | 0 |
| 2 | 3.755513 | -3.409689 | 1 |
| 3 | -3.492632 | -2.687847 | 0 |
| 4 | -0.743517 | -2.482149 | 1 |
| ... | ... | ... | ... |
| 421 | 0.034760 | -3.378539 | 1 |
| 422 | 6.490296 | 7.942774 | 1 |
| 423 | -3.207583 | 0.543091 | 0 |
| 424 | -5.693156 | -0.573867 | 0 |
| 425 | -1.077044 | 1.379511 | 0 |
426 rows × 3 columns
# Split the projected points by class label.
b = temp.loc[temp['Outcome'].eq(0)]  # benign
m = temp.loc[temp['Outcome'].eq(1)]  # malignant
# Draw each class with its own marker: circles for benign, triangles for malignant.
for class_points, class_marker in ((b, 'o'), (m, '^')):
    plt.scatter(x=class_points[0], y=class_points[1], marker=class_marker)
plt.xlabel('PC 1')  # first principal component on the x-axis
plt.ylabel('PC 2')  # second principal component on the y-axis
# [Comment] Even compressed to 2 dimensions, the classes look reasonably separable.
X_train_pca shape: (426, 2) explained variance ratio: [0.43315126 0.19586506]
Text(0, 0.5, 'PC 2')
# [Comment] Compress to 2 dimensions and run logistic regression on the result.
# Compress down to 2 dimensions
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
# Project the test split with the components learned from the training split.
# Re-fitting on the test data (fit_transform) would place train and test in
# different coordinate systems and make the test score meaningless.
X_test_pca = pca.transform(X_test_scaled)
# Train the logistic regression
logistic = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)
# [Comment] max_iter is set to 1000 because a warning asked for more iterations.
logistic.fit(X_train_pca, y_train)
# Validation
print('Train score: {:.3f}'.format(logistic.score(X_train_pca, y_train)))
print('Test score: {:.3f}'.format(logistic.score(X_test_pca, y_test)))
print('Confustion matrix:\n{}'.format(confusion_matrix(y_true=y_test, y_pred=logistic.predict(X_test_pca))))
# [Comment] The 91.6% validation score noted for this cell was measured while PCA
# was being re-fitted on the test split; re-run to get the score for the
# corrected projection.
Train score: 0.965 Test score: 0.916 Confustion matrix: [[83 7] [ 5 48]]
import time

import numpy as np

# [Comment] Visualize the test score and the processing time for 1 to 15 dimensions.
n_max = 15
n_array = np.arange(1, n_max + 1, 1)
scores = np.zeros(n_max)
times = np.zeros(n_max)
for n in n_array:
    # Record the start time
    start_time = time.perf_counter()
    # Compress down to n dimensions
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train_scaled)
    # Project the test split with the training-fitted components; re-fitting
    # on the test data would put it in a different coordinate system.
    X_test_pca = pca.transform(X_test_scaled)
    # Train the logistic regression
    logistic = LogisticRegressionCV(cv=10, random_state=0, max_iter=1000)
    # [Comment] max_iter is set to 1000 because a warning asked for more iterations.
    logistic.fit(X_train_pca, y_train)
    # Record the end time (covers the PCA fit/projection and the model fit)
    end_time = time.perf_counter()
    scores[n - 1] = logistic.score(X_test_pca, y_test)
    times[n - 1] = end_time - start_time

# Plot score (left axis) and elapsed time (right axis) against n on one figure.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ln1 = ax1.plot(n_array, scores, 'C0', label=r'$score$')
ax2 = ax1.twinx()
ln2 = ax2.plot(n_array, times, 'C1', label=r'$time$')
# Merge the legend entries of both axes into a single legend.
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1 + h2, l1 + l2, loc='lower right')
ax1.set_xlabel('n')
ax1.set_ylabel(r'$score$')
ax1.grid(True)
ax2.set_ylabel(r'$time[s]$')
# [Comment] The score did not simply rise with n, which was not what I expected. Why?
# NOTE(review): that observation was made while PCA was re-fitted on the test
# split inside the loop, which may explain it; re-check after re-running.
# [Comment] The time differs from run to run but is roughly proportional to n.
# Other load on the server probably causes the variation, so this seems plausible.
Text(0, 0.5, '$time[s]$')