(1) Loading the Wine dataset

In [1]:
# Wine
import pandas as pd
import numpy as np
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:

# df_wine = pd.read_csv('wine.data', header=None)


df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()
Class labels [1 2 3]
Out[1]:
   Class label  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  Color intensity   Hue  OD280/OD315 of diluted wines  Proline
0            1    14.23        1.71  2.43               15.6        127           2.80        3.06                  0.28             2.29             5.64  1.04                          3.92     1065
1            1    13.20        1.78  2.14               11.2        100           2.65        2.76                  0.26             1.28             4.38  1.05                          3.40     1050
2            1    13.16        2.36  2.67               18.6        101           2.80        3.24                  0.30             2.81             5.68  1.03                          3.17     1185
3            1    14.37        1.95  2.50               16.8        113           3.85        3.49                  0.24             2.18             7.80  0.86                          3.45     1480
4            1    13.24        2.59  2.87               21.0        118           2.80        2.69                  0.39             1.82             4.32  1.04                          2.93      735
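A quick sanity check on the load (a minimal sketch; both calls are plain pandas):

# expect 178 rows and 14 columns (13 features plus the class label)
print(df_wine.shape)
# per-class sample counts
print(df_wine['Class label'].value_counts())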

(2) Features and labels

In [9]:
type(df_wine)
Out[9]:
pandas.core.frame.DataFrame
In [10]:
df_wine.columns
Out[10]:
Index(['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
       'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')
In [2]:
X = df_wine[['Proline', 'Flavanoids']].values
In [3]:
X[:5]
Out[3]:
array([[1065.  ,    3.06],
       [1050.  ,    2.76],
       [1185.  ,    3.24],
       [1480.  ,    3.49],
       [ 735.  ,    2.69]])
In [6]:
y = df_wine.iloc[:, 0].values  # the class labels are in column 0
In [7]:
y[:5]
Out[7]:
array([1, 1, 1, 1, 1])
In [9]:
from sklearn.model_selection import train_test_split
# note: no random_state is set, so the split (and the accuracies below)
# will vary from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
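stratify=y keeps the class proportions of y the same in both splits. A quick check (a sketch, reusing the np import above):

# class counts in the full set vs. the stratified splits
print(np.bincount(y)[1:])        # labels are 1..3, so drop index 0
print(np.bincount(y_train)[1:])
print(np.bincount(y_test)[1:])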
In [66]:
y_train[:5]
Out[66]:
array([3, 2, 1, 1, 3])
In [12]:
X_train[:5]
Out[12]:
array([[7.200e+02, 4.700e-01],
       [1.280e+03, 2.430e+00],
       [1.105e+03, 2.430e+00],
       [8.800e+02, 5.800e-01],
       [5.150e+02, 1.590e+00]])

(3) Standardization

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
In [14]:
X_train_std[:5]
Out[14]:
array([[-0.06917935, -1.54728864],
       [ 1.68333469,  0.39000667],
       [ 1.13567406,  0.39000667],
       [ 0.43153895, -1.43856288],
       [-0.71072466, -0.44026275]])
In [15]:
X_train_std.mean(), X_train_std.std()
Out[15]:
(1.2509555207044018e-16, 1.0000000000000002)
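The scalar mean and std above pool both columns; StandardScaler actually normalizes per feature, which can be checked column-wise (a minimal sketch):

# each column should have mean ~0 and std ~1 on the training set
print(X_train_std.mean(axis=0))
print(X_train_std.std(axis=0))
# the scaler was fit on the training set only, so the test set
# statistics will be close to, but not exactly, 0 and 1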

(4) Logistic regression

In [16]:
%matplotlib inline
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], 
                    y=X[y == cl, 1],
                    alpha=0.8, 
                    c=colors[idx],
                    marker=markers[idx], 
                    label=cl, 
                    edgecolor='black')

    # highlight test samples
    if test_idx:
        # circle the test-set samples
        X_test, y_test = X[test_idx, :], y[test_idx]

        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='none',  # hollow markers; c='' raises an error on newer matplotlib
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100, 
                    label='test set')
In [17]:
X_combined_std = np.vstack((X_train_std, X_test_std))
In [18]:
y_combined = np.hstack((y_train, y_test))
In [19]:
len(y_train), len(y_test)
Out[19]:
(142, 36)
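The Wine dataset has 178 samples, so the 80/20 stratified split gives the 142 training and 36 test samples shown above.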
In [20]:
%matplotlib inline
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=10.0, random_state=1)
lr.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()
In [21]:
from sklearn.metrics import accuracy_score
y_pred = lr.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.92
In [22]:
y_pred_train = lr.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
Accuracy: 0.90
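The fitted weights can be inspected directly (a minimal sketch; coef_ holds one weight vector per class, with columns [Proline, Flavanoids]):

print(lr.coef_)
print(lr.intercept_)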

(5) With L1 regularization

In [23]:
%matplotlib inline
from sklearn.linear_model import LogisticRegression

# penalty='l1' needs a solver that supports it ('liblinear' or 'saga');
# the default lbfgs solver only handles l2
lr1 = LogisticRegression(C=10.0, penalty='l1', solver='liblinear', random_state=1)
lr1.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr1, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()
In [24]:
y_pred = lr1.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.92
In [25]:
y_pred_train = lr1.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
Accuracy: 0.90
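At C=10.0 with only two features, the L1 penalty leaves both weights nonzero, so its sparsity effect is invisible here. A hedged sketch of how to see it, using a much stronger penalty (the name lr_sparse and the value C=0.01 are illustrative):

lr_sparse = LogisticRegression(C=0.01, penalty='l1',
                               solver='liblinear', random_state=1)
lr_sparse.fit(X_train_std, y_train)
print(lr_sparse.coef_)  # expect some entries to be driven to exactly 0.0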

(6) With L2 regularization

In [26]:
%matplotlib inline
from sklearn.linear_model import LogisticRegression

lr2 = LogisticRegression(C=10.0, penalty='l2', random_state=1)
lr2.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr2, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()
In [27]:
y_pred = lr2.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.92
In [28]:
y_pred_train = lr2.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
Accuracy: 0.90
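The contrast with L1: L2 shrinks weights toward zero but does not zero them out. Comparing the two fitted models makes that concrete (a sketch using lr1 and lr2 from above):

print('L1:', lr1.coef_)  # may contain exact zeros at stronger penalties
print('L2:', lr2.coef_)  # small but nonzero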

(7) SVC

In [29]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0, random_state=1)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, 
                      y_combined,
                      classifier=svm, 
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_11.png', dpi=300)
plt.show()
In [30]:
y_pred = svm.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.89
In [31]:
y_pred_train = svm.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
Accuracy: 0.91
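A fitted SVC exposes which training samples became support vectors; a minimal sketch:

# number of support vectors per class, and the total
print(svm.n_support_)
print(len(svm.support_vectors_))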
In [32]:
# the boundary is non-linear, so try a kernel function
svm2 = SVC(kernel='rbf', C=1.0, gamma=0.1, random_state=1)
svm2.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, 
                      y_combined,
                      classifier=svm2, 
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_11.png', dpi=300)
plt.show()
In [33]:
y_pred = svm2.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
Accuracy: 0.89
In [34]:
y_pred_train = svm2.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
Accuracy: 0.90
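gamma controls how tightly the RBF boundary hugs the training data: larger values give more flexible boundaries and a growing train/test gap. A hedged sketch of a small sweep (the gamma values are illustrative):

for g in (0.01, 0.1, 1.0, 10.0):
    m = SVC(kernel='rbf', C=1.0, gamma=g, random_state=1)
    m.fit(X_train_std, y_train)
    print('gamma=%5.2f  train=%.2f  test=%.2f'
          % (g, m.score(X_train_std, y_train), m.score(X_test_std, y_test)))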

(8) Decision tree

In [35]:
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=4,
                              random_state=1)
# decision trees are insensitive to feature scaling, so fit on the raw features
tree.fit(X_train, y_train)

X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plt.figure(figsize=(12,5))
plot_decision_regions(X_combined, y_combined, 
                      classifier=tree, test_idx=range(len(y_train), len(y_combined)))

plt.xlabel('Proline')
plt.ylabel('Flavanoids')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_20.png', dpi=300)
plt.show()
In [36]:
# the tree was trained on the raw features, so predict on the raw
# test set, not the standardized copy (predicting on X_test_std here
# is what produced the meaningless 0.28 accuracy)
y_pred = tree.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
In [37]:
y_pred_train = tree.predict(X_train)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
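The fitted tree can also be drawn directly; a minimal sketch assuming scikit-learn >= 0.21, where plot_tree is available (imported as a function, since the classifier variable here is already named tree):

from sklearn.tree import plot_tree

plt.figure(figsize=(12, 6))
plot_tree(tree,
          feature_names=['Proline', 'Flavanoids'],
          class_names=['1', '2', '3'],
          filled=True)
plt.show()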