# Wine dataset: load, pick two features, stratified split, and standardize.
import pandas as pd
import numpy as np

# Load the Wine dataset (178 samples, 13 features, 3 class labels) from UCI.
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)
# if the Wine dataset is temporarily unavailable from the
# UCI machine learning repository, un-comment the following line
# of code to load the dataset from a local path:
# df_wine = pd.read_csv('wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()
type(df_wine)
df_wine.columns
# Keep only two features so the decision regions can be drawn in 2-D.
X = df_wine[['Proline', 'Flavanoids']].values
X[:5]
y = df_wine.iloc[:, 0].values  # first column holds the class label
y[:5]
from sklearn.model_selection import train_test_split
# random_state fixed so the split (and all accuracies below) are
# reproducible; stratify preserves the class proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)
y_train[:5]
X_train[:5]
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only to avoid information leakage
# from the test set; apply the same transform to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_train_std[:5]
X_train_std.mean(), X_train_std.std()
%matplotlib inline
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot the 2-D decision regions of a fitted classifier.

    Parameters
    ----------
    X : array, shape = [n_samples, 2]
        Feature matrix (exactly two features).
    y : array, shape = [n_samples]
        Class labels.
    classifier : object
        Fitted estimator exposing a ``predict`` method.
    test_idx : range or array of int, optional
        Indices of test samples to highlight with open circles.
    resolution : float
        Grid step size of the mesh used to paint the regions.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface over a mesh covering the data (+/- 1 margin)
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class samples, one color/marker per class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolor='black')

    # highlight test samples
    if test_idx is not None:
        # plot all test samples as open circles.
        # BUG FIX: the original used c='', which raises a ValueError in
        # modern matplotlib; facecolors='none' is the supported way to
        # draw unfilled markers.
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100,
                    label='test set')
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
len(y_train), len(y_test)
%matplotlib inline
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=10.0, random_state=1)
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
classifier=lr, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()
from sklearn.metrics import accuracy_score
y_pred=lr.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
y_pred_train=lr.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
%matplotlib inline
from sklearn.linear_model import LogisticRegression
lr1 = LogisticRegression(C=10.0, penalty='l1', random_state=1)
lr1.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
classifier=lr1, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()
y_pred=lr1.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
y_pred_train=lr1.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
%matplotlib inline
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=10.0, penalty='l2',random_state=1)
lr2.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined,
classifier=lr2, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()
y_pred=lr2.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
y_pred_train=lr2.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=1)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std,
y_combined,
classifier=svm,
test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_11.png', dpi=300)
plt.show()
y_pred=svm.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
y_pred_train=svm.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
# The classes are not linearly separable, so try an RBF kernel;
# gamma controls how tightly the decision boundary hugs the samples.
svm2 = SVC(kernel='rbf', C=1.0, gamma=0.1, random_state=1)
svm2.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std,
                      y_combined,
                      classifier=svm2,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline [standardized]')
plt.ylabel('Flavanoids [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_11.png', dpi=300)
plt.show()
# Labeled prints disambiguate test vs. train accuracy.
y_pred = svm2.predict(X_test_std)
print('Test accuracy: %.2f' % accuracy_score(y_test, y_pred))
y_pred_train = svm2.predict(X_train_std)
print('Train accuracy: %.2f' % accuracy_score(y_train, y_pred_train))
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy',
max_depth=4,
random_state=1)
tree.fit(X_train, y_train)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plt.figure(figsize=(12,5))
plot_decision_regions(X_combined, y_combined,
classifier=tree, test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('Proline')
plt.ylabel('Flavanoids [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_20.png', dpi=300)
plt.show()
y_pred=tree.predict(X_test_std)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
y_pred_train=tree.predict(X_train_std)
print('Accuracy: %.2f' % accuracy_score(y_train, y_pred_train))