I. K-Means (requires a preset number of clusters)
1. Elbow method
The elbow method computes the total sum of squared errors (SSE) for a range of cluster counts and looks for the point where the SSE curve's decline slows sharply: the "elbow". That point marks the best k.
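Concretely, for each candidate k, SSE (also called WCSS, exposed as inertia_ in scikit-learn) is SSE = Σ_c Σ_{x∈c} ‖x − μ_c‖², the sum of squared distances from every point x to its cluster centroid μ_c.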
2. Silhouette coefficient
The silhouette coefficient measures how tightly each sample fits its assigned cluster; the closer it is to 1, the better the clustering. Choose the k with the highest silhouette coefficient.
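For a single sample, the silhouette is s = (b − a) / max(a, b), where a is the sample's mean distance to points in its own cluster and b is its mean distance to points in the nearest other cluster; sklearn's silhouette_score averages s over all samples.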
# 1. Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy import stats
# 2. Display options
plt.rcParams['font.sans-serif'] = ['SimHei']  # use a font that can render CJK characters
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# 3. Load the data
data = pd.read_excel("客户信息.xlsx")  # replace with your own data file path
print("数据预览:\n", data.head())#4.数据预处理
# 4.1 Handle missing values (fill with column means)
print("Missing value counts:\n", data.isnull().sum())
data = data.fillna(data.mean(numeric_only=True))

# 4.2 Handle outliers (Z-score method)
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
print("检测到的异常值行索引:\n", data[outliers].index.tolist())
data = data[~outliers] # 移除异常值#4.3. 标准化数据(K-Means 对数据尺度敏感,必须标准化)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.select_dtypes(include=[np.number]))

# 5. Choose the best K (elbow method and silhouette coefficient)
wcss = []  # sum of squared errors for each K
silhouette_scores = []  # silhouette coefficient for each K
k_range = range(2, 11)  # try K from 2 to 10

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares
    silhouette_scores.append(silhouette_score(data_scaled, kmeans.labels_))

# Plot the elbow curve
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(k_range, wcss, marker='o')
plt.xlabel('K')
plt.ylabel('Sum of squared errors (WCSS)')
plt.title('Elbow method')

# Plot the silhouette curve
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouette_scores, marker='o')
plt.xlabel('K')
plt.ylabel('Silhouette coefficient')
plt.title('Silhouette method')
plt.show()

# 6. Fit the model with the best K (based on the elbow and silhouette plots)
best_k = int(input("Based on the elbow and silhouette plots, enter the best K: "))
kmeans = KMeans(n_clusters=best_k, random_state=42)
kmeans.fit(data_scaled)

# Attach the cluster labels to the original data
data['Cluster'] = kmeans.labels_
print("聚类结果:\n", data.head())# 可视化聚类结果
if data_scaled.shape[1] >= 2:
    plt.figure(figsize=(8, 6))
    plt.scatter(data_scaled[:, 0], data_scaled[:, 1], c=kmeans.labels_, cmap='viridis', s=50)
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                c='red', marker='X', s=200, label='Cluster centers')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('K-Means clustering result')
    plt.legend()
    plt.show()

# Cluster centers, transformed back to the original scale
print("聚类中心:\n", scaler.inverse_transform(kmeans.cluster_centers_))# 输出聚类结果的统计信息
print("每个簇的样本数量:\n", data['Cluster'].value_counts())# 输出轮廓系数
silhouette_avg = silhouette_score(data_scaled, kmeans.labels_)
print(f"轮廓系数: {silhouette_avg:.4f}")
II. DBSCAN (robust to noise)
1. Parameters
- eps: controls the neighborhood radius. A smaller eps splits the data into more, smaller clusters; a larger eps merges it into fewer, larger clusters.
- min_samples: the minimum number of neighbors a core point must have. A smaller min_samples makes the algorithm more sensitive to noise; a larger min_samples favors fewer, larger clusters. The sketch after this list illustrates the eps effect directly.
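A minimal, self-contained illustration of that sensitivity, using sklearn's synthetic make_moons data (chosen here only for the sketch; it is not part of the pipeline below):

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=300, noise=0.08, random_state=42)
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, eps in zip(axes, [0.1, 0.2, 0.5]):
    labels = DBSCAN(eps=eps, min_samples=5).fit_predict(X)  # noise gets label -1
    ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=15)
    ax.set_title(f'eps={eps}')  # small eps fragments the moons; large eps merges them
plt.show()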
# 1. Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy import stats
from sklearn.neighbors import NearestNeighbors
from itertools import product
from sklearn.decomposition import PCA

# 2. Display options
plt.rcParams['font.sans-serif'] = ['SimHei']  # use a font that can render CJK characters
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

# 3. Load the data
data = pd.read_excel("客户信息.xlsx")  # replace with your own data file path
print("数据预览:\n", data.head())# 4. 数据预处理
def preprocess_data(data):
    # Fill missing values with column means
    data = data.fillna(data.mean(numeric_only=True))
    # Drop outliers (Z-score method)
    z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
    outliers = (z_scores > 3).any(axis=1)
    data = data[~outliers]
    # Standardize (DBSCAN's eps is a distance, so feature scale matters)
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.select_dtypes(include=[np.number]))
    return data, data_scaled

data, data_scaled = preprocess_data(data)

# 5. Parameter tuning
def find_optimal_eps(data_scaled, min_samples):
    # K-distance graph: sort each point's distance to its k-th nearest neighbor;
    # the "knee" of this curve is a common heuristic for eps
    neighbors = NearestNeighbors(n_neighbors=min_samples)
    neighbors_fit = neighbors.fit(data_scaled)
    distances, indices = neighbors_fit.kneighbors(data_scaled)
    distances = np.sort(distances[:, min_samples - 1], axis=0)
    plt.plot(distances)
    plt.xlabel('Sample index')
    plt.ylabel(f'{min_samples}-nearest-neighbor distance')
    plt.title('K-distance graph')
    plt.show()
    return distances

def grid_search_dbscan(data_scaled, eps_values, min_samples_values):
    results = []
    for eps, min_samples in product(eps_values, min_samples_values):
        dbscan = DBSCAN(eps=eps, min_samples=int(min_samples))
        dbscan.fit(data_scaled)
        labels = dbscan.labels_
        if len(set(labels)) > 1:
            silhouette_avg = silhouette_score(data_scaled, labels)
        else:
            silhouette_avg = -1  # silhouette is undefined with a single label
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        results.append({
            'eps': eps,
            'min_samples': int(min_samples),
            'n_clusters': n_clusters,
            'silhouette_score': silhouette_avg,
        })
    results_df = pd.DataFrame(results)
    best_params = results_df.loc[results_df['silhouette_score'].idxmax()]
    return results_df, best_params

# Parameter ranges
min_samples = 5  # initial min_samples for the K-distance graph
eps_values = np.arange(0.1, 1.0, 0.05)  # candidate eps values
min_samples_values = range(2, 15)  # candidate min_samples values

# Plot the K-distance graph
distances = find_optimal_eps(data_scaled, min_samples)

# Grid search
results_df, best_params = grid_search_dbscan(data_scaled, eps_values, min_samples_values)
print("参数调优结果:\n", results_df)
print("最佳参数组合:\n", best_params)# 6. 使用最佳参数运行 DBSCAN
best_eps = best_params['eps']
best_min_samples = best_params['min_samples']

dbscan = DBSCAN(eps=best_eps, min_samples=int(best_min_samples))
dbscan.fit(data_scaled)

# Attach the cluster labels to the original data
data['Cluster'] = dbscan.labels_
print("聚类结果:\n", data.head())# 7. 可视化聚类结果
def visualize_clusters(data_scaled, labels, eps, min_samples):
    plt.figure(figsize=(8, 6))
    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            col = [0, 0, 0, 1]  # draw noise points in black
        class_member_mask = (labels == k)
        xy = data_scaled[class_member_mask]
        plt.scatter(xy[:, 0], xy[:, 1], c=[col], s=50, label=f'Cluster {k}')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(f'DBSCAN clustering result (eps={eps}, min_samples={min_samples})')
    plt.legend()
    plt.show()

visualize_clusters(data_scaled, dbscan.labels_, best_eps, best_min_samples)

# 8. Cluster statistics
print("每个簇的样本数量:\n", data['Cluster'].value_counts())# 9. 输出轮廓系数
if len(set(dbscan.labels_)) > 1:
    silhouette_avg = silhouette_score(data_scaled, dbscan.labels_)
    print(f"Silhouette coefficient: {silhouette_avg:.4f}")
else:
    print("Cannot compute a silhouette coefficient: only one cluster (or only noise) was found.")