Python 机器学习:Scikit-learn 核心算法实战
机器学习是人工智能的重要分支,Scikit-learn 是 Python 最流行的机器学习库。本文将深入探讨 Scikit-learn 的核心算法和实战应用,帮助读者掌握机器学习的核心技术。
监督学习
监督学习使用标记数据训练模型,预测新数据的标签。
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
def supervised_learning_demo(random_state=None):
    """Demonstrate supervised learning with a simple linear regression.

    Generates noisy synthetic data following y = 2x + 1, fits a
    LinearRegression model on an 80/20 train/test split, and prints the
    mean squared error and the R² score on the held-out test set.

    Args:
        random_state: Optional seed used for both the synthetic data
            generation and the train/test split, so the demo can be made
            reproducible. Defaults to None (non-deterministic), which
            matches the original behavior.
    """
    print("监督学习演示:")
    rng = np.random.default_rng(random_state)
    # Synthetic regression data: true relationship y = 2x + 1 plus
    # Gaussian noise with standard deviation 0.5.
    X = rng.random((100, 1)) * 10
    y = 2 * X.ravel() + 1 + rng.standard_normal(100) * 0.5
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"均方误差: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"R2 分数: {r2_score(y_test, y_pred):.4f}")


supervised_learning_demo()
无监督学习
无监督学习发现数据中的隐藏模式。
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
def unsupervised_learning_demo():
    """Demonstrate unsupervised learning with k-means clustering.

    Builds 4 synthetic Gaussian blobs, clusters them with KMeans, and
    prints the discovered cluster labels and the shape of the cluster
    centers array.
    """
    print("无监督学习演示:")
    X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
    # n_init is set explicitly because its default changed across
    # scikit-learn versions (10 -> 'auto'), and omitting it triggers a
    # FutureWarning on 1.2/1.3. random_state makes the demo reproducible.
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
    kmeans.fit(X)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    print(f"聚类标签: {np.unique(labels)}")
    print(f"聚类中心形状: {centers.shape}")


unsupervised_learning_demo()
分类算法
分类算法将数据分配到预定义的类别中。
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
def classification_demo():
    """Demonstrate classification with a random forest on the iris dataset.

    Splits iris 80/20, trains a 100-tree RandomForestClassifier, and
    prints the test-set accuracy and a per-class classification report.
    """
    print("分类算法演示:")
    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2
    )
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(features_train, labels_train)
    predictions = forest.predict(features_test)
    print(f"准确率: {accuracy_score(labels_test, predictions):.4f}")
    print("分类报告:")
    report = classification_report(
        labels_test, predictions, target_names=dataset.target_names
    )
    print(report)


classification_demo()
回归算法
回归算法预测连续值。
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor


def regression_demo():
    """Demonstrate regression with gradient boosting on the diabetes dataset.

    NOTE: the original used load_boston, which was deprecated in
    scikit-learn 1.0 and removed in 1.2 (ethical concerns with the
    dataset); load_diabetes is a bundled drop-in demo replacement.

    Splits the data 80/20, trains a 100-stage GradientBoostingRegressor,
    and prints the test-set mean squared error and R² score.
    """
    print("回归算法演示:")
    dataset = load_diabetes()
    X, y = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = GradientBoostingRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"均方误差: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"R2 分数: {r2_score(y_test, y_pred):.4f}")


regression_demo()
机器学习流程
特征工程
特征工程提取和转换数据特征。
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
def feature_engineering_demo():
    """Demonstrate feature scaling, label encoding, and feature selection.

    Fixes two defects in the original:
    - It referenced module-level ``X`` and ``y`` that were never defined
      (NameError at call time); the data is now loaded inside the function.
    - ``SelectKBest(k=10)`` would raise for datasets with fewer than 10
      features (iris has 4); ``k`` is now capped at the feature count.
    """
    print("特征工程演示:")
    iris = load_iris()
    X, y = iris.data, iris.target
    print("1. 特征缩放")
    # Standardize each feature to zero mean / unit variance.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print("2. 标签编码")
    # Iris targets are already integers; shown for the API pattern.
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    print("3. 特征选择")
    # Univariate ANOVA-F selection; k may not exceed the feature count.
    selector = SelectKBest(f_classif, k=min(10, X_scaled.shape[1]))
    X_selected = selector.fit_transform(X_scaled, y_encoded)


feature_engineering_demo()
模型评估
模型评估使用各种指标衡量模型性能。
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
def model_evaluation_demo():
    """Demonstrate model evaluation: confusion matrix and ROC AUC.

    Fixes defects in the original:
    - It referenced undefined globals (``model``, ``X_test``, ``y_test``,
      ``y_pred``); this version trains a small self-contained binary
      classifier (ROC AUC as used here requires a binary problem).
    - The confusion-matrix print used ``/n`` instead of the newline
      escape ``\\n``.
    """
    # Local import: only this demo needs make_classification.
    from sklearn.datasets import make_classification

    print("模型评估演示:")
    X, y = make_classification(n_samples=200, n_classes=2, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    model = RandomForestClassifier(n_estimators=50, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("1. 混淆矩阵")
    cm = confusion_matrix(y_test, y_pred)
    print(f"混淆矩阵:\n{cm}")
    print("2. ROC AUC")
    # ROC AUC needs scores for the positive class, not hard labels.
    y_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    print(f"ROC AUC: {auc:.4f}")


model_evaluation_demo()
超参数调优
超参数调优优化模型性能。
from sklearn.model_selection import GridSearchCV
def hyperparameter_tuning_demo():
    """Demonstrate hyperparameter tuning with an exhaustive grid search.

    Fixes a defect in the original: it referenced module-level
    ``X_train``/``y_train`` that were never defined (NameError); the
    training data is now loaded inside the function from iris.

    Runs 5-fold cross-validated grid search over a small random-forest
    grid and prints the best parameters and best mean accuracy.
    """
    print("超参数调优演示:")
    iris = load_iris()
    X_train, _, y_train, _ = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=0
    )
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    grid_search = GridSearchCV(
        RandomForestClassifier(),
        param_grid,
        cv=5,
        scoring='accuracy',
    )
    grid_search.fit(X_train, y_train)
    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳分数: {grid_search.best_score_:.4f}")


hyperparameter_tuning_demo()
交叉验证
交叉验证评估模型的泛化能力。
from sklearn.model_selection import cross_val_score
def cross_validation_demo():
    """Demonstrate 5-fold cross-validation of a random forest.

    Fixes a defect in the original: it referenced module-level ``X`` and
    ``y`` that were never defined (NameError); the data is now loaded
    inside the function from iris.

    Prints the per-fold accuracies plus their mean and standard deviation.
    """
    print("交叉验证演示:")
    iris = load_iris()
    X, y = iris.data, iris.target
    model = RandomForestClassifier(n_estimators=100)
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"交叉验证分数: {scores}")
    print(f"平均分数: {scores.mean():.4f}")
    print(f"标准差: {scores.std():.4f}")


cross_validation_demo()
模型持久化
模型持久化保存训练好的模型。
import joblib
def model_persistence_demo():
    """Demonstrate saving and reloading a trained model with joblib.

    Fixes defects in the original:
    - It referenced undefined globals ``X_train``/``X_test``/``y_train``;
      the data is now loaded inside the function from iris.
    - It left ``model.joblib`` behind in the current working directory;
      the file is now written to a temporary directory that is cleaned
      up automatically.
    """
    import os
    import tempfile

    print("模型持久化演示:")
    iris = load_iris()
    X_train, X_test, y_train, _ = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=0
    )
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'model.joblib')
        joblib.dump(model, path)
        loaded_model = joblib.load(path)
        y_pred = loaded_model.predict(X_test)
    print(f"加载模型预测: {y_pred[:5]}")


model_persistence_demo()
总结
Scikit-learn 提供了丰富的机器学习算法和工具,支持从数据预处理到模型部署的完整流程。掌握这些核心技术,对于构建机器学习应用至关重要。
在实际应用中,需要根据数据特点选择合适的算法,平衡模型复杂度和性能。良好的机器学习实践能够显著提高模型的准确性和可靠性。
IT极限技术分享汇