Machine Learning - Classification Models - Nonlinear Models

Source: https://blog.csdn.net/2303_81133811/article/details/145969693
Model | Characteristics | Suitable scenarios | scikit-learn class
Support Vector Machine (SVM) | Handles nonlinear classification via kernel functions (e.g., the RBF kernel) | Small-sample nonlinear data | svm.SVC
K-Nearest Neighbors (KNN) | Based on local similarity; no training phase required | Small datasets with a uniform feature space | neighbors.KNeighborsClassifier
Decision Tree | Highly interpretable; handles nonlinear relationships automatically | When decision rules need to be visualized | tree.DecisionTreeClassifier
Random Forest | Ensembles many decision trees; reduces variance and resists overfitting | High-dimensional data with nonlinear relationships | ensemble.RandomForestClassifier
Gradient Boosting | Iteratively corrects residuals; high accuracy but slower training | Small-to-medium datasets requiring high accuracy | ensemble.GradientBoostingClassifier
XGBoost / LightGBM / CatBoost | Efficient gradient-boosting frameworks with parallelism and regularization | Large-scale data and competition settings | Third-party packages required (xgboost, lightgbm, etc.)
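
To make the comparison concrete, the short sketch below (not part of the original post) cross-validates the scikit-learn estimators from the table on make_moons, a synthetic nonlinear toy dataset; the dataset choice, sample size, and default hyperparameters are illustrative assumptions only.

# Hedged sketch: compare the table's nonlinear classifiers on a synthetic
# nonlinear dataset (make_moons). Settings are illustrative assumptions.
from sklearn.datasets import make_moons
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

X, y = make_moons(n_samples=500, noise=0.3, random_state=0)

models = {
    "SVC (RBF)": make_pipeline(StandardScaler(), SVC(kernel="rbf")),
    "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier()),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(random_state=0),
}

cv = KFold(n_splits=5, shuffle=True, random_state=0)
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")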

1. Support Vector Machine (SVM)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

# Display settings (SimHei so Chinese column names render in plots)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the data
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the "工资" (salary) column
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("Encoded values of the salary column:\n", data_1['工资'].unique())

# Write the encoded values back out, replacing the original strings in the Excel file
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = pd.read_excel("员工离职预测模型_编码后.xlsx")

# Preprocessing
# Fill missing values with the column mean
print("Missing-value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)

# Handle outliers (Z-score method)
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features and target ("离职" = attrition)
X = data.drop("离职", axis=1)
y = data["离职"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the SVC model
model = SVC(kernel='rbf')
cv = KFold(n_splits=5, random_state=10, shuffle=True)

# Parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best parameter combination:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Model evaluation
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test precision:", precision_score(y_test, y_pred, average='weighted'))
print("Test recall:", recall_score(y_test, y_pred, average='weighted'))
print("Test F1 score:", f1_score(y_test, y_pred, average='weighted'))

# Visualization: confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

2. K-Nearest Neighbors (KNN)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier  # import the KNN model

# Display settings (SimHei so Chinese column names render in plots)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the data
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the "工资" (salary) column
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("Encoded values of the salary column:\n", data_1['工资'].unique())

# Write the encoded values back out, replacing the original strings in the Excel file
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = pd.read_excel("员工离职预测模型_编码后.xlsx")

# Preprocessing
# Fill missing values with the column mean
print("Missing-value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)

# Handle outliers (Z-score method)
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features and target
X = data.drop("离职", axis=1)
y = data["离职"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the KNN model
model = KNeighborsClassifier()
cv = KFold(n_splits=5, random_state=10, shuffle=True)

# Parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],     # number of neighbors
    'weights': ['uniform', 'distance'],  # weight function
    'p': [1, 2]                          # distance metric (1: Manhattan, 2: Euclidean)
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best parameter combination:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Model evaluation
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test precision:", precision_score(y_test, y_pred, average='weighted'))
print("Test recall:", recall_score(y_test, y_pred, average='weighted'))
print("Test F1 score:", f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

3. Decision Tree

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier  # import the decision tree model

# Display settings (SimHei so Chinese column names render in plots)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the data
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the "工资" (salary) column
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("Encoded values of the salary column:\n", data_1['工资'].unique())

# Write the encoded values back out, replacing the original strings in the Excel file
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = pd.read_excel("员工离职预测模型_编码后.xlsx")

# Preprocessing
# Fill missing values with the column mean
print("Missing-value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)

# Handle outliers (Z-score method)
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features and target
X = data.drop("离职", axis=1)
y = data["离职"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features (decision trees do not require standardization,
# but it is kept here for consistency with the other sections)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the decision tree model
model = DecisionTreeClassifier(random_state=42)
cv = KFold(n_splits=5, random_state=10, shuffle=True)

# Parameter grid
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],  # maximum tree depth
    'min_samples_split': [2, 5, 10],     # minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],       # minimum samples required at a leaf node
    'criterion': ['gini', 'entropy']     # split criterion
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best parameter combination:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Model evaluation
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test precision:", precision_score(y_test, y_pred, average='weighted'))
print("Test recall:", recall_score(y_test, y_pred, average='weighted'))
print("Test F1 score:", f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

4. Random Forest

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # import the random forest model

# Display settings (SimHei so Chinese column names render in plots)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the data
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the "工资" (salary) column
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("Encoded values of the salary column:\n", data_1['工资'].unique())

# Write the encoded values back out, replacing the original strings in the Excel file
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = pd.read_excel("员工离职预测模型_编码后.xlsx")

# Preprocessing
# Fill missing values with the column mean
print("Missing-value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)

# Handle outliers (Z-score method)
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features and target
X = data.drop("离职", axis=1)
y = data["离职"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features (random forests do not require standardization,
# but it is kept here for consistency with the other sections)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the random forest model
model = RandomForestClassifier(random_state=42)
cv = KFold(n_splits=5, random_state=10, shuffle=True)

# Parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],      # number of trees
    'max_depth': [None, 5, 10, 15, 20],  # maximum tree depth
    'min_samples_split': [2, 5, 10],     # minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],       # minimum samples required at a leaf node
    'criterion': ['gini', 'entropy']     # split criterion
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best parameter combination:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Model evaluation
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test precision:", precision_score(y_test, y_pred, average='weighted'))
print("Test recall:", recall_score(y_test, y_pred, average='weighted'))
print("Test F1 score:", f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

5. Gradient Boosting

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier  # import the gradient boosting model

# Display settings (SimHei so Chinese column names render in plots)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# Load the data
data_1 = pd.read_excel("员工离职预测模型.xlsx")

# Label-encode the "工资" (salary) column
label_encoder = LabelEncoder()
data_1['工资'] = label_encoder.fit_transform(data_1['工资'])
print("Encoded values of the salary column:\n", data_1['工资'].unique())

# Write the encoded values back out, replacing the original strings in the Excel file
data_1.to_excel("员工离职预测模型_编码后.xlsx", index=False)
data = pd.read_excel("员工离职预测模型_编码后.xlsx")

# Preprocessing
# Fill missing values with the column mean
print("Missing-value counts:\n", data.isnull().sum())
data = data.apply(lambda col: col.fillna(col.mean()), axis=0)

# Handle outliers (Z-score method)
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("Row indices of detected outliers:\n", data[outliers].index.tolist())
print(data[outliers])
data = data[~outliers]

# Split features and target
X = data.drop("离职", axis=1)
y = data["离职"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features (gradient boosting does not require standardization,
# but it is kept here for consistency with the other sections)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the gradient boosting model
model = GradientBoostingClassifier(random_state=42)
cv = KFold(n_splits=5, random_state=10, shuffle=True)

# Parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],     # number of trees
    'learning_rate': [0.01, 0.1, 0.2],  # learning rate
    'max_depth': [3, 5, 7],             # maximum tree depth
    'min_samples_split': [2, 5, 10],    # minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],      # minimum samples required at a leaf node
}

# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',  # use accuracy as the evaluation metric
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best parameter combination:", grid_search.best_params_)

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

# Model evaluation
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test precision:", precision_score(y_test, y_pred, average='weighted'))
print("Test recall:", recall_score(y_test, y_pred, average='weighted'))
print("Test F1 score:", f1_score(y_test, y_pred, average='weighted'))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

6. XGBoost / LightGBM / CatBoost
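
The original post ends before this section is filled in. As a minimal placeholder sketch, the snippet below shows how XGBoost's scikit-learn wrapper (XGBClassifier) could be dropped into the same grid-search workflow used in the sections above. It assumes the xgboost package is installed and reuses X_train, X_test, y_train, y_test, and cv from the earlier preprocessing steps; the parameter values are illustrative assumptions, not tuned recommendations. LightGBM (lightgbm.LGBMClassifier) and CatBoost (catboost.CatBoostClassifier) can be swapped in the same way.

# Hedged sketch (assumes `pip install xgboost`); reuses X_train, X_test,
# y_train, y_test, and cv from the preprocessing shown in the sections above.
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

model = XGBClassifier(random_state=42)

# Illustrative parameter grid; values are assumptions, not tuned recommendations
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best parameter combination:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))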
