欢迎光临
我们一直在努力

Python 机器学习:Scikit-learn 核心算法实战

Python 机器学习:Scikit-learn 核心算法实战

机器学习是人工智能的重要分支,Scikit-learn 是 Python 最流行的机器学习库。本文将深入探讨 Scikit-learn 的核心算法和实战应用,帮助读者掌握机器学习的核心技术。

监督学习

监督学习使用标记数据训练模型,预测新数据的标签。

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def supervised_learning_demo():
    """Fit a linear regression on synthetic 1-D data and print test metrics.

    The samples follow ``y = 2x + 1`` plus Gaussian noise scaled by 0.5,
    with ``x`` drawn from ``np.random.rand`` and scaled by 10. 20% of the
    samples are held out; MSE and R2 on that hold-out set are printed.
    """
    print("监督学习演示:")

    n_samples = 100
    features = np.random.rand(n_samples, 1) * 10
    noise = np.random.randn(n_samples) * 0.5
    targets = 2 * features.ravel() + 1 + noise

    feat_train, feat_test, tgt_train, tgt_test = train_test_split(
        features, targets, test_size=0.2
    )

    # fit() returns the estimator itself, so construction and training chain.
    regressor = LinearRegression().fit(feat_train, tgt_train)
    predictions = regressor.predict(feat_test)

    mse = mean_squared_error(tgt_test, predictions)
    print(f"均方误差: {mse:.4f}")
    r2 = r2_score(tgt_test, predictions)
    print(f"R2 分数: {r2:.4f}")

supervised_learning_demo()

无监督学习

无监督学习发现数据中的隐藏模式。

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

def unsupervised_learning_demo():
    """Cluster synthetic blob data with K-Means and print a short summary.

    Generates 300 points around 4 centers (fixed ``random_state=0``), fits a
    4-cluster K-Means model, then prints the distinct labels assigned and the
    shape of the learned cluster-center array.
    """
    print("无监督学习演示:")

    n_groups = 4
    points, _ = make_blobs(
        n_samples=300,
        centers=n_groups,
        cluster_std=0.60,
        random_state=0,
    )

    # fit() returns the estimator, so the fitted model can be bound directly.
    clusterer = KMeans(n_clusters=n_groups).fit(points)

    print(f"聚类标签: {np.unique(clusterer.labels_)}")
    print(f"聚类中心形状: {clusterer.cluster_centers_.shape}")

unsupervised_learning_demo()

分类算法

分类算法将数据分配到预定义的类别中。

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def classification_demo():
    """Train a random forest on the Iris dataset and print its test metrics.

    Holds out 20% of the samples, fits a 100-tree forest on the rest, then
    prints the hold-out accuracy followed by a per-class classification
    report labelled with the Iris species names.
    """
    print("分类算法演示:")

    dataset = load_iris()
    feat_train, feat_test, lbl_train, lbl_test = train_test_split(
        dataset.data, dataset.target, test_size=0.2
    )

    forest = RandomForestClassifier(n_estimators=100).fit(feat_train, lbl_train)
    predicted = forest.predict(feat_test)

    accuracy = accuracy_score(lbl_test, predicted)
    print(f"准确率: {accuracy:.4f}")
    print("分类报告:")
    report = classification_report(
        lbl_test, predicted, target_names=dataset.target_names
    )
    print(report)

classification_demo()

回归算法

回归算法预测连续值。

# NOTE: load_boston was removed in scikit-learn 1.2; load_diabetes is a
# bundled regression dataset that needs no download.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor

def regression_demo():
    """Train a gradient-boosting regressor and print its hold-out metrics.

    Uses the diabetes regression dataset shipped with scikit-learn, holds
    out 20% of the samples, fits a 100-stage gradient-boosting model on the
    rest, and prints MSE and R2 measured on the hold-out set.
    """
    print("回归算法演示:")

    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = GradientBoostingRegressor(n_estimators=100)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(f"均方误差: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"R2 分数: {r2_score(y_test, y_pred):.4f}")

regression_demo()

机器学习流程

graph TD
    A[数据收集] --> B[数据预处理]
    B --> C[特征工程]
    C --> D[模型训练]
    D --> E[模型评估]
    E --> F[模型优化]
    F --> G[模型部署]

特征工程

特征工程从原始数据中提取并转换特征,使其更适合模型学习。

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

def feature_engineering_demo():
    """Demonstrate scaling, label encoding and feature selection on Iris.

    The function loads its own data, so it does not depend on any
    module-level ``X`` / ``y``. Each step prints a short summary of its
    result so the effect of the transformation is visible.
    """
    print("特征工程演示:")

    iris = load_iris()
    X = iris.data
    # Map the integer targets back to species names so that label encoding
    # has real string labels to convert.
    y = iris.target_names[iris.target]

    print("1. 特征缩放")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print(f"缩放后各特征均值: {X_scaled.mean(axis=0).round(4)}")

    print("2. 标签编码")
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    print(f"类别: {list(encoder.classes_)} -> 编码: {np.unique(y_encoded)}")

    print("3. 特征选择")
    # k must not exceed the number of features; Iris has only 4, so keep
    # the 2 highest-scoring ones.
    selector = SelectKBest(f_classif, k=2)
    X_selected = selector.fit_transform(X, y_encoded)
    print(f"特征选择后形状: {X_selected.shape}")

feature_engineering_demo()

模型评估

模型评估使用各种指标衡量模型性能。

from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

def model_evaluation_demo():
    """Print a confusion matrix and ROC AUC for a binary classifier.

    The function trains its own model rather than relying on module-level
    ``model`` / ``X_test`` / ``y_test`` / ``y_pred``. A binary dataset is
    used because the ROC AUC step scores the probability of a single
    positive class (column 1 of ``predict_proba``).
    """
    # Local import: this dataset loader is not imported at file level.
    from sklearn.datasets import load_breast_cancer

    print("模型评估演示:")

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("1. 混淆矩阵")
    cm = confusion_matrix(y_test, y_pred)
    # "\n" (not "/n") so the matrix starts on its own line.
    print(f"混淆矩阵:\n{cm}")

    print("2. ROC AUC")
    # Column 1 holds the predicted probability of the positive class (label 1).
    y_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    print(f"ROC AUC: {auc:.4f}")

model_evaluation_demo()

超参数调优

超参数调优通过搜索最优的超参数组合来提升模型性能。

from sklearn.model_selection import GridSearchCV

def hyperparameter_tuning_demo():
    """Grid-search random-forest hyperparameters on Iris and print the best.

    The function builds its own training split instead of relying on
    module-level ``X_train`` / ``y_train``. The search covers 27 parameter
    combinations, each scored by accuracy under 5-fold cross-validation.
    """
    print("超参数调优演示:")

    X, y = load_iris(return_X_y=True)
    # Only the training portion is searched; the hold-out part is unused here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(),
        param_grid,
        cv=5,
        scoring='accuracy',
    )

    grid_search.fit(X_train, y_train)

    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳分数: {grid_search.best_score_:.4f}")

hyperparameter_tuning_demo()

交叉验证

交叉验证评估模型的泛化能力。

from sklearn.model_selection import cross_val_score

def cross_validation_demo():
    """Run 5-fold cross-validation of a random forest on Iris.

    The function loads its own data instead of relying on module-level
    ``X`` / ``y``, then prints the per-fold accuracy scores together with
    their mean and standard deviation.
    """
    print("交叉验证演示:")

    X, y = load_iris(return_X_y=True)

    model = RandomForestClassifier(n_estimators=100)

    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

    print(f"交叉验证分数: {scores}")
    print(f"平均分数: {scores.mean():.4f}")
    print(f"标准差: {scores.std():.4f}")

cross_validation_demo()

模型持久化

模型持久化将训练好的模型保存到磁盘,以便后续直接加载使用,无需重新训练。

import joblib

def model_persistence_demo():
    """Train a model, save it with joblib, reload it and predict.

    The function builds its own train/test split instead of relying on
    module-level ``X_train`` / ``y_train`` / ``X_test``. The model is
    written to ``model.joblib`` in the current working directory and the
    first five predictions of the reloaded model are printed.
    """
    print("模型持久化演示:")

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2)

    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)

    joblib.dump(model, 'model.joblib')

    # SECURITY: joblib.load is pickle-based and can execute arbitrary code;
    # only load model files that come from a trusted source.
    loaded_model = joblib.load('model.joblib')

    y_pred = loaded_model.predict(X_test)
    print(f"加载模型预测: {y_pred[:5]}")

model_persistence_demo()

总结

Scikit-learn 提供了丰富的机器学习算法和工具,支持从数据预处理到模型部署的完整流程。掌握这些核心技术,对于构建机器学习应用至关重要。

在实际应用中,需要根据数据特点选择合适的算法,平衡模型复杂度和性能。良好的机器学习实践能够显著提高模型的准确性和可靠性。

https://segmentfault.com/a/1190000047614884

未经允许不得转载:IT极限技术分享汇 » Python 机器学习:Scikit-learn 核心算法实战

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址