Mastering classification model evaluation and hyperparameter tuning

A worked example

Importing the packages

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")
import warnings
warnings.filterwarnings("ignore")

Loading the data

from sklearn import datasets
import pandas as pd
iris = datasets.load_iris()
X = iris.data
y = iris.target
feature = iris.feature_names
data = pd.DataFrame(X,columns=feature)
data['target'] = y
data.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  target
0                5.1               3.5               1.4               0.2       0
1                4.9               3.0               1.4               0.2       0
2                4.7               3.2               1.3               0.2       0
3                4.6               3.1               1.5               0.2       0
4                5.0               3.6               1.4               0.2       0

Baseline model

'''
SVC parameters:
C: regularization parameter. The strength of the regularization is inversely proportional to C; must be strictly positive. The penalty is a squared L2 penalty.
kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'
degree: degree of the polynomial kernel ('poly')
gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'
shrinking: whether to use the shrinking heuristic, default True
'''
svc_iris = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc_iris.fit(X, y)
svc_iris.score(X,y)

Result

0.9733333333333334
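
Note that this score is computed on the same data the model was fit on, so it is optimistic. As a quick sanity check (a minimal sketch reusing the pipeline above), cross-validation gives a fairer estimate:

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the full dataset; the mean CV accuracy is
# usually somewhat lower than the training-set score printed above.
scores = cross_val_score(svc_iris, X, y, cv=10, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))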

Hyperparameter tuning with grid search

# Approach 1: grid search with GridSearchCV()
start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']},
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs = gs.fit(X, y)
end_time = time.time()
print("Grid search elapsed time: %.3f s" % float(end_time - start_time))
print(gs.best_score_)
print(gs.best_params_)
Grid search elapsed time: 7.039 s
0.98
{'svc__C': 1.0, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
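
Beyond best_score_ and best_params_, GridSearchCV records the scores of every candidate in cv_results_. A small sketch for inspecting the full table (the column names used here are part of the scikit-learn cv_results_ API):

# Load the per-candidate results into a DataFrame and rank them.
results = pd.DataFrame(gs.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score')
      .head())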

Hyperparameter tuning with randomized search

# Approach 2: randomized search with RandomizedSearchCV()
start_time = time.time()
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# make_pipeline names the SVC step 'svc', so all parameter keys need the 'svc__' prefix
param_grid = dict(svc__C=param_range,             # candidate values for the continuous parameters
                  svc__kernel=['linear', 'rbf'],  # set of discrete choices
                  svc__gamma=param_range)

gs = RandomizedSearchCV(estimator=pipe_svc, param_distributions=param_grid,
                        scoring='accuracy', cv=10, n_jobs=-1)
gs = gs.fit(X, y)
end_time = time.time()
print("Randomized search elapsed time: %.3f s" % float(end_time - start_time))
print(gs.best_score_)
print(gs.best_params_)
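
Passing lists, as above, still samples from a discrete grid. RandomizedSearchCV can also draw from continuous distributions; a sketch, assuming scipy >= 1.4 (for scipy.stats.loguniform):

from scipy.stats import loguniform

# Sample C and gamma log-uniformly over several orders of magnitude,
# and cap the search budget with n_iter.
param_dist = {'svc__C': loguniform(1e-4, 1e3),
              'svc__gamma': loguniform(1e-4, 1e3),
              'svc__kernel': ['linear', 'rbf']}
rs = RandomizedSearchCV(estimator=pipe_svc, param_distributions=param_dist,
                        n_iter=20, scoring='accuracy', cv=10,
                        n_jobs=-1, random_state=1)
rs.fit(X, y)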

When there are only two classes, we can plot a confusion matrix and an ROC curve

# Confusion matrix
# Load the data
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data", header=None)
'''
Breast Cancer Wisconsin dataset: 569 samples of malignant and benign tumor cells
(M = malignant, B = benign).
'''
# Basic preprocessing
from sklearn.preprocessing import LabelEncoder

X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values
le = LabelEncoder()  # encode the string labels as integers: B -> 0, M -> 1
y = le.fit_transform(y)
le.transform(['M', 'B'])
# 80/20 train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))
from sklearn.metrics import confusion_matrix

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()
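
The four cells of this binary confusion matrix feed directly into the usual derived metrics. A short sketch computing them from the same predictions:

from sklearn.metrics import precision_score, recall_score, f1_score

# With M encoded as 1, these treat the malignant class as positive.
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('recall:    %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('f1:        %.3f' % f1_score(y_true=y_test, y_pred=y_pred))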

# Plot the ROC curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer, f1_score
scorer = make_scorer(f1_score, pos_label=0)
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring=scorer, cv=10)
y_pred = gs.fit(X_train, y_train).decision_function(X_test)
# y_pred = gs.predict(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_pred)  # compute the false and true positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC
lw = 2
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
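
Since roc_curve already works from the decision scores, the AUC can also be obtained in a single call, skipping the explicit curve:

from sklearn.metrics import roc_auc_score

# Equivalent to auc(fpr, tpr) above, computed directly from the scores.
print('AUC: %.3f' % roc_auc_score(y_test, y_pred))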

Practice: face recognition with SVM

Downloading the data

from sklearn.datasets import fetch_lfw_people
faces_data = fetch_lfw_people(min_faces_per_person=60)
print(faces_data.target_names)
print(faces_data.images.shape)

# Output:
['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
'Gerhard Schroeder' 'Hugo Chavez' 'Junichiro Koizumi' 'Tony Blair']
(1348, 62, 47)

Previewing the face data

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
fig, ax = plt.subplots(3,5)
fig.subplots_adjust(left=0.0625, right=1.2, wspace=1)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces_data.images[i], cmap='bone')
    axi.set(xticks=[], yticks=[], xlabel=faces_data.target_names[faces_data.target[i]])

Extracting features with principal component analysis

# Use preprocessing to extract more meaningful features: principal component
# analysis extracts 150 basic components, which are then fed to the support
# vector machine classifier. Bundle the preprocessing and classifier into a pipeline.

pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

# To evaluate the classifier, split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(faces_data.data, faces_data.target, random_state=42)

# Use grid search with cross-validation to find the best parameter combination:
# tune C (the slack penalty) and gamma (the size of the radial basis function
# kernel) to determine the best model
param_grid = {'svc__C': [1, 5, 10, 50], 'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

grid.fit(x_train, y_train)
print(grid.best_params_)

# Output:
{'svc__C': 10, 'svc__gamma': 0.001}
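
The 'svc__' prefix works because make_pipeline names each step after its lowercased class. If the prefixes are ever in doubt, the valid keys can be listed from the pipeline itself:

# Step names assigned by make_pipeline ('pca' and 'svc' here),
# and the full set of tunable parameter keys.
print(model.named_steps)
print(sorted(model.get_params().keys()))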

Predicting on the test set

model = grid.best_estimator_
y_fit = model.predict(x_test)
# Compare the predictions with the true labels
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    axi.imshow(x_test[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    axi.set_ylabel(faces_data.target_names[y_fit[i]].split()[-1],
                   color='black' if y_fit[i] == y_test[i] else 'red')
fig.suptitle('Predicted Names; Incorrect Labels in Red', size=14)
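
Before reading the per-class report below, a one-line overall check on the test set:

from sklearn.metrics import accuracy_score

# Fraction of test faces labeled correctly by the tuned model.
print('accuracy: %.3f' % accuracy_score(y_test, y_fit))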

Printing the classification report

# Print the classification report: it lists per-label statistics and gives a
# more complete picture of the estimator's performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_fit, target_names=faces_data.target_names))

# Output:
                   precision    recall  f1-score   support

     Ariel Sharon       0.65      0.73      0.69        15
     Colin Powell       0.80      0.87      0.83        68
  Donald Rumsfeld       0.74      0.84      0.79        31
    George W Bush       0.92      0.83      0.88       126
Gerhard Schroeder       0.86      0.83      0.84        23
      Hugo Chavez       0.93      0.70      0.80        20
Junichiro Koizumi       0.92      1.00      0.96        12
       Tony Blair       0.85      0.95      0.90        42

      avg / total       0.86      0.85      0.85       337

Plotting the confusion matrix

# Plot the confusion matrix: it makes it easy to see which labels the
# classifier tends to confuse
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, y_fit)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=faces_data.target_names,
            yticklabels=faces_data.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')
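
A variant sketch: normalizing each true-label row turns the raw counts into per-class recall, which is often easier to compare across classes of different sizes:

# Divide each row (true label) by its support so the diagonal reads as recall.
mat_norm = mat.astype(float) / mat.sum(axis=1, keepdims=True)
plt.figure()
sns.heatmap(mat_norm.T, square=True, annot=True, fmt='.2f', cbar=False,
            xticklabels=faces_data.target_names,
            yticklabels=faces_data.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')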