用户画像的应用场景
产品层面的宏观分析维度
用户画像标签类型
聚类和分类:
聚类的距离:
3.2 K-means算法原理
模型评估:
误差平方和
Q1:K值的确定:肘部法则
Q2: 聚类效果如何评估??:轮廓系数
3.3 K-means算法的优缺点
3.4 使用Python做K-means算法的步骤
4. 用户画像
5.使用K-means做用户画像
(1)数据解释
本节对互联网行业用户进行聚类分析,根据获取到的用户10个维度进行聚类分析,这10个维度涵盖了用户的观看、关注、消费行为。KMeans 的默认参数签名如下:

KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
       precompute_distances='auto', verbose=0, random_state=None,
       copy_x=True, n_jobs=None, algorithm='auto')
复制代码
2) 聚类步骤
确定k值
建模分析
效果检测
聚类结果的解读
.
代码部分:
## Load the data.
# NOTE: r2_score and statsmodels are imported by the original article but are
# not used in any snippet shown here; kept to preserve the file's imports.
from sklearn.metrics import r2_score
import statsmodels.api as sm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Tab-separated, GBK-encoded export of 10 user-behaviour features.
data = pd.read_table('C:/Users/lb/Desktop/test/k-means_data.txt', sep='\t',
                     engine="python", encoding='gbk')
data.columns.values
data.head()

# Drop the user-id column: it is an identifier, not a clustering feature.
data = data.drop(['用户id'], axis=1)
print(data.shape)
复制代码
缺失值
填充数据
# Missing values: fill each column's NaNs with that column's mean.
data = data.fillna(data.mean())

# After filling, restore a clean 0..n-1 integer index so later
# positional operations line up with the cluster-label arrays.
data.index = range(data.shape[0])
data.head()
复制代码 标准化以及修正列名
# Standardize every feature to zero mean / unit variance so that no single
# dimension dominates the Euclidean distances used by K-means.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(data)
# The original called fit_transform(data, y=data.columns); the y argument is
# ignored by StandardScaler, so a plain transform on the fitted scaler is
# equivalent. Re-attach the original column names to the scaled frame.
data_s = pd.DataFrame(scaler.transform(data), columns=data.columns)
data_s.head()
复制代码
标准化后的维数
元数据散点图
# Scatter plot of two of the standardized features (browse count vs. spend).
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese axis labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly
plt.scatter(data_s['浏览数量'], data_s['消费金额'], s=6, c='r',
            alpha=0.4, label='散点1')
plt.show()
复制代码
建模:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score

# Choose K with the elbow method: fit K-means for K = 2..9 and plot
# sqrt(SSE) against K; the "elbow" (around 4-5 here) marks a good K.
inertia = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(data_s)
    inertia.append(np.sqrt(kmeans.inertia_))
plt.plot(range(2, 10), inertia, marker='s')
plt.xlabel('$k$')       # number of clusters
plt.ylabel('$J(C_K)$')  # sum-of-squared-errors curve
复制代码
# Scan K = 2..8 and record the mean silhouette coefficient of each
# clustering (closer to 1 = better-separated clusters).
Scores = []
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese axis labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly
for k in range(2, 9):
    estimator = KMeans(n_clusters=k)  # build the clusterer for this K
    estimator.fit(data_s)
    Scores.append(silhouette_score(data_s, estimator.labels_, metric='euclidean'))
X = range(2, 9)
plt.xlabel('k')
plt.ylabel('轮廓系数')
plt.plot(X, Scores, 'o-')
plt.show()
复制代码 ## 一行表示一个行向量
calinski_harabaz_score:卡林斯基-哈拉巴斯指数
# Calinski-Harabasz index for each K (higher = better-defined clusters).
# FIX: sklearn renamed calinski_harabaz_score -> calinski_harabasz_score
# (deprecated in 0.20, removed in 0.23); the old name raises ImportError.
from sklearn.metrics import calinski_harabasz_score
import matplotlib.pyplot as plt

haraba = []
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese axis labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly
for k in range(2, 9):
    estimator = KMeans(n_clusters=k)  # build the clusterer for this K
    estimator.fit(data_s)
    haraba.append(calinski_harabasz_score(data_s, estimator.labels_))
X = range(2, 9)
plt.xlabel('k')
plt.ylabel('calinski_harabaz指数')
plt.plot(X, haraba, 'o-')
plt.show()
复制代码
建立模型
# Fit the final model with K = 3 (chosen from the elbow/silhouette scans).
cluster = KMeans(n_clusters=3, random_state=5).fit(data_s)

# FIX: per-sample cluster labels. The article uses c_preds1 in every later
# evaluation snippet but never defines it; it is the fitted labels_ array.
c_preds1 = cluster.labels_

# Cluster centers -- one row per cluster, so 3 rows for K = 3.
centers = cluster.cluster_centers_
centers
复制代码 因为是3个簇 所以是3个
样本类别标签
# Total within-cluster sum of squared distances (lower = tighter clusters).
inertia = cluster.inertia_
inertia
复制代码
# Mean silhouette coefficient over all samples for the final K=3 model.
silhouette_score(data_s, c_preds1)

# Silhouette coefficient of each individual sample.
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

silhouette_samples(data_s, c_preds1)
复制代码 array([ 0.92732185, 0.93211408, 0.91754354, 0.88674901, 0.93250814,
0.91832453, 0.93225057, 0.91800177, 0.91801442, 0.91371574,
0.93039028, 0.92780645, 0.9175957 , 0.91805761, 0.91801695,
0.90394838, 0.82497174, 0.91733802, 0.86933501, 0.92838568,
0.90395603, 0.90396898, 0.71949046, 0.60180808, 0.91797249,
0.90003841, 0.8249717 , 0.91807368, 0.93202087, 0.9181117 ,
0.93245682, 0.92778811, 0.92656082, 0.72477047, 0.93019856,
0.92828162, 0.91816004, 0.88759818, 0.93127829, 0.92603308,
0.90071358, -0.17466512, 0.93172762, 0.91827221, 0.8707087 ,
0.93098797, 0.93098797, 0.93137879, 0.93064253, 0.90396551,
0.91799883, 0.93294867, 0.93255235, 0.92731491, 0.91761448,
0.91798014, 0.93159375, 0.91974077, 0.91644496, 0.85981065,
0.90395365, 0.91817525, 0.9318612 , 0.90394838, 0.91843222,
0.61614305, 0.93230046, 0.93227226, 0.93062299, 0.91772833,
0.92723017, 0.91801529, 0.91801117, 0.93302561, 0.89769678,
0.91819631, 0.90394838, 0.92225906, 0.42998441, 0.85359922,
0.49967657, 0.35952168, 0.91810681, 0.92582604, 0.9182524 ,
0.66910184, 0.91805175, 0.91805175, 0.93255546, 0.91548173,
0.91548173, 0.91815125, 0.37411391, 0.60619449, 0.91929334,
0.93075391, 0.40382763, 0.9278305 , 0.93120899, 0.93308297,
0.93150594, 0.93177713, 0.90394982, 0.90394982, 0.91842709,
0.92716037, 0.92970515, 0.93027568, 0.93142373, 0.93067922,
0.93228489, 0.91605913, 0.8630899 , 0.92845676, 0.91807434,
0.56483434, 0.59047301, 0.93211681, 0.93195585, 0.91769925,
0.93239282, 0.91810854, 0.93265678, 0.52170789, 0.91768007,
0.79704101, 0.9174981 , 0.93153993, 0.9273972 , 0.91796437,
0.93145531, 0.93198917, 0.90394888, 0.18277362, 0.91824065,
0.91754479, 0.91798471, 0.9317068 , 0.9301013 , 0.91741988,
0.93116962, 0.03740003, 0.91805067, 0.93212341, 0.30722591,
0.47547937, 0.92527144, 0.6330433 , 0.6330433 , 0.9310836 ,
0.87741024, 0.90396363, 0.91814416, 0.91631695, 0.34389833,
0.93233374, 0.91748558, 0.93103558, 0.50049229, 0.91844024,
0.8911124 , 0.8911124 , 0.91857703, 0.91148529, 0.93050201,
0.90395961, 0.91174891, 0.90395527, 0.93252806, 0.91797483,
0.93297992, 0.92870732, 0.93023766, 0.83658348, 0.92388233,
0.93170759, 0.93160029, 0.93270814, 0.4490633 , 0.4490633 ,
0.78738663, 0.92915066, 0.9268259 , 0.93305011, 0.9280447 ,
0.91796439, 0.91738324, 0.75867505, 0.92721804, 0.91841875,
0.91957815, 0.91844261, 0.93167406, 0.91834179, 0.9180343 ,
0.92740537, 0.9174482 , 0.92530647, 0.69268396, 0.91137483,
0.91808896, 0.9279222 , 0.93057453, 0.8968111 , 0.91815978,
0.82378692, 0.83671792, 0.91813035, 0.91796654, 0.91805443,
0.92863594, 0.93098307, 0.88998633, 0.92986096, 0.9236901 ,
0.91805397])
一共是216个样本 ,所以是216个轮廓系数- len( silhouette_samples(data_s,c_preds1))
复制代码
# Plot two feature columns with the 3 cluster centers highlighted.
import matplotlib.pyplot as plt

plt.figure()
# Raw points coloured by their assigned cluster.
plt.scatter(data_s.values[:, 0], data_s.values[:, 1], c=c_preds1)
# Centroids emphasised with a red X.
plt.scatter(centers[:, 0], centers[:, 1],
            marker='x', s=169, linewidths=3,
            color='r', zorder=10)
复制代码
# Attach the cluster label to the original (unscaled) data for interpretation.
clust_prod = data.copy()
clust_prod['标签'] = c_preds1
clust_prod.head(10)

# Inspect the samples assigned to cluster 0.
clust_prod[clust_prod.标签 == 0]
复制代码
# Histograms of the 浏览数量 feature, one subplot per cluster, to compare
# how the variable is distributed across the three clusters.
import matplotlib.pyplot as plt

for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.hist(x=clust_prod[clust_prod.标签 == i].浏览数量,  # data to plot
             bins=5,               # number of bars
             color='steelblue',    # fill colour
             edgecolor='black')    # border colour (FIX: ')' was swallowed
                                   # by an inline comment in the original)
    plt.title('第' + str(i) + '类')
plt.show()
复制代码
层次聚类:
AGNES,是一种采用自底向上聚合策略的层次聚类算法,是先将数据集中的每个样本看作一个初始聚类簇,然后在算法运行的每一步中找出距离最近的两个聚类簇进行合并,该过程不断重复,直至达到预设的聚类簇个数,这里的关键是如何计算聚类簇之间的距离。其实每个聚类簇就是一个样本集合,聚类簇之间的距离就是集合之间的距离。
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix

# Bottom-up (AGNES) hierarchical clustering with Ward linkage, 3 clusters;
# res.labels_ holds the per-sample assignments shown in the output below.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=3)
res = clustering.fit(data_s)
复制代码 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)- # 存放轮廓系数
# Silhouette coefficient for K = 2..8 using hierarchical clustering,
# mirroring the earlier K-means scan for comparison.
Scores = []
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese axis labels
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly
for k in range(2, 9):
    estimator = AgglomerativeClustering(n_clusters=k)  # clusterer for this K
    estimator.fit(data_s)
    Scores.append(silhouette_score(data_s, estimator.labels_, metric='euclidean'))
X = range(2, 9)
plt.xlabel('k')
plt.ylabel('轮廓系数')
plt.plot(X, Scores, 'o-')
plt.show()
复制代码 |