import pandas as pd  # alias must be `pd`: the read_csv call below uses that name
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split  # train/test splitting
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix  # model evaluation

from sklearn.svm import SVC  # support vector machine

import warnings
warnings.filterwarnings('ignore')  # suppress warning messages

# Plot labels in this script are Chinese, so select a CJK-capable font (SimHei),
# and keep the minus sign on axis ticks rendering correctly with that font.
plt.rcParams['font.sans-serif']='SimHei'
plt.rcParams['axes.unicode_minus']=False
# Load the raw dataset; the path is relative to the working directory.
df=pd.read_csv('Predict Hair Fall.csv')
df
Id Genetics Hormonal Changes Medical Conditions Medications & Treatments Nutritional Deficiencies Stress Age Poor Hair Care Habits Environmental Factors Smoking Weight Loss Hair Loss
0 133992 Yes No No Data No Data Magnesium deficiency Moderate 19 Yes Yes No No 0
1 148393 No No Eczema Antibiotics Magnesium deficiency High 43 Yes Yes No No 0
2 155074 No No Dermatosis Antifungal Cream Protein deficiency Moderate 26 Yes Yes No Yes 0
3 118261 Yes Yes Ringworm Antibiotics Biotin Deficiency Moderate 46 Yes Yes No No 0
4 111915 No No Psoriasis Accutane Iron deficiency Moderate 30 No Yes Yes No 1
5 139661 Yes No Psoriasis Antibiotics Magnesium deficiency Low 37 No Yes No Yes 1
6 169255 Yes Yes No Data No Data Selenium deficiency High 40 Yes No No No 1
7 112032 Yes No Dermatosis Chemotherapy Omega-3 fatty acids High 35 Yes No Yes No 0
8 140785 Yes No Eczema Steroids Selenium deficiency Moderate 19 No No Yes Yes 1
9 187999 No Yes Ringworm Rogaine Magnesium deficiency Moderate 49 Yes Yes Yes No 0
10 118858 Yes Yes Eczema Blood Pressure Medication Biotin Deficiency High 26 Yes Yes Yes No 0
11 159158 No Yes Alopecia Areata Accutane Zinc Deficiency High 48 No No No No 1
12 156086 Yes Yes Scalp Infection Immunomodulators Biotin Deficiency Moderate 20 No Yes Yes No 1
13 178256 No No Psoriasis Antibiotics Vitamin A Deficiency High 30 Yes Yes Yes Yes 0
14 150154 Yes No Eczema Antibiotics Biotin Deficiency High 34 Yes Yes No Yes 0
15 130552 Yes Yes Scalp Infection Rogaine Vitamin D Deficiency Moderate 29 Yes No No Yes 0
16 116190 Yes No Seborrheic Dermatitis Antidepressants Vitamin D Deficiency High 46 Yes Yes No Yes 0
17 194441 No Yes Dermatosis Antibiotics Zinc Deficiency Low 19 Yes No No Yes 1
18 147404 Yes Yes Dermatosis Accutane Biotin Deficiency Low 26 No No Yes No 0
19 136709 Yes Yes Seborrheic Dermatitis Chemotherapy Vitamin A Deficiency High 46 Yes Yes No Yes 1
20 187362 Yes Yes Seborrheic Dermatitis Accutane Protein deficiency High 46 No Yes No Yes 1
21 133804 No No No Data Chemotherapy Zinc Deficiency High 20 No Yes Yes Yes 1
22 148974 Yes Yes Psoriasis Antibiotics Vitamin D Deficiency Low 29 No No Yes Yes 0
23 116818 No Yes Scalp Infection Antidepressants Vitamin D Deficiency High 37 No No Yes No 1
24 142062 No Yes Dermatitis Antibiotics Protein deficiency High 33 Yes Yes No No 0
25 147833 Yes Yes Dermatosis Heart Medication Vitamin A Deficiency Moderate 34 No No No No 0
26 190967 No Yes Dermatosis Immunomodulators Selenium deficiency High 28 No No No No 0
27 114579 No Yes Seborrheic Dermatitis Antifungal Cream Zinc Deficiency Low 41 No No No No 0
28 159949 No Yes Thyroid Problems Chemotherapy Omega-3 fatty acids Low 35 Yes Yes Yes No 0
29 117296 No Yes Dermatitis Chemotherapy No Data Low 21 Yes Yes Yes No 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
969 127078 Yes No Thyroid Problems Rogaine Omega-3 fatty acids High 35 Yes Yes No Yes 0
970 163276 No Yes Alopecia Areata Antifungal Cream Biotin Deficiency Low 18 No Yes Yes No 1
971 146975 No No Alopecia Areata Antibiotics Magnesium deficiency Moderate 40 Yes Yes No No 1
972 132137 Yes Yes Androgenetic Alopecia Accutane Vitamin E deficiency Low 46 Yes Yes Yes Yes 0
973 124196 Yes No Psoriasis Heart Medication Omega-3 fatty acids High 38 Yes Yes Yes No 0
974 154461 No Yes Scalp Infection Immunomodulators Omega-3 fatty acids Low 41 Yes No Yes No 0
975 153091 Yes Yes Dermatitis Heart Medication Vitamin D Deficiency High 44 No Yes Yes No 0
976 163302 Yes Yes Psoriasis Blood Pressure Medication No Data Low 21 Yes Yes Yes No 0
977 128405 Yes Yes Alopecia Areata Accutane Vitamin E deficiency Low 25 No Yes No No 0
978 195409 Yes No Dermatosis Antibiotics Zinc Deficiency Low 46 No No No Yes 0
979 113751 Yes Yes Psoriasis Antifungal Cream Omega-3 fatty acids Moderate 27 Yes Yes No Yes 1
980 110360 Yes Yes Dermatosis Antidepressants Selenium deficiency Moderate 26 No No No No 0
981 130242 Yes Yes No Data Antibiotics Iron deficiency High 40 Yes Yes Yes Yes 0
982 131756 Yes Yes Psoriasis Antifungal Cream Zinc Deficiency Moderate 50 Yes No No No 0
983 114983 No Yes Thyroid Problems Steroids Biotin Deficiency High 33 Yes Yes No No 1
984 149286 No No Androgenetic Alopecia Antibiotics Protein deficiency High 43 No No No No 0
985 136269 Yes No Seborrheic Dermatitis Blood Pressure Medication Biotin Deficiency Low 33 Yes No Yes Yes 0
986 194704 No Yes Androgenetic Alopecia Immunomodulators Biotin Deficiency Low 46 No No Yes No 1
987 179959 Yes Yes Androgenetic Alopecia Immunomodulators Selenium deficiency Moderate 19 Yes No Yes No 0
988 174574 No No Androgenetic Alopecia Antibiotics No Data High 23 Yes Yes No Yes 0
989 144786 No No Ringworm Rogaine Selenium deficiency Moderate 45 No Yes No Yes 1
990 127532 Yes No Alopecia Areata Antidepressants Zinc Deficiency Low 42 No Yes Yes No 0
991 131739 No Yes Thyroid Problems Antidepressants Zinc Deficiency High 33 Yes Yes No No 0
992 181854 Yes Yes Dermatosis Rogaine Magnesium deficiency Low 30 No No No No 1
993 196218 No Yes Scalp Infection Immunomodulators No Data Moderate 23 Yes Yes Yes Yes 0
994 184367 Yes No Seborrheic Dermatitis Rogaine Vitamin A Deficiency Low 33 Yes Yes Yes Yes 1
995 164777 Yes Yes No Data Accutane Protein deficiency Low 47 No No No Yes 0
996 143273 No Yes Androgenetic Alopecia Antidepressants Protein deficiency Moderate 20 Yes No Yes Yes 1
997 169123 No Yes Dermatitis Immunomodulators Biotin Deficiency Moderate 32 Yes Yes Yes Yes 1
998 127183 Yes Yes Psoriasis Blood Pressure Medication Vitamin D Deficiency Low 34 No Yes No No 1

999 rows × 13 columns

# Chinese display names for the twelve non-ID columns, listed in the same order
# as the columns of the loaded dataset.
chinese_columns = [
    '遗传因素', '荷尔蒙变化', '医疗状况', '药物及治疗',
    '营养缺乏', '压力水平', '年龄', '不良护发习惯',
    '环境因素', '吸烟习惯', '体重减轻',
    '脱发标记',  # target variable
]

# Rename positionally: the first column becomes 'ID', the remaining twelve
# take the names above.
df.columns = ['ID', *chinese_columns]
df.head(5)
ID 遗传因素 荷尔蒙变化 医疗状况 药物及治疗 营养缺乏 压力水平 年龄 不良护发习惯 环境因素 吸烟习惯 体重减轻 脱发标记
0 133992 Yes No No Data No Data Magnesium deficiency Moderate 19 Yes Yes No No 0
1 148393 No No Eczema Antibiotics Magnesium deficiency High 43 Yes Yes No No 0
2 155074 No No Dermatosis Antifungal Cream Protein deficiency Moderate 26 Yes Yes No Yes 0
3 118261 Yes Yes Ringworm Antibiotics Biotin Deficiency Moderate 46 Yes Yes No No 0
4 111915 No No Psoriasis Accutane Iron deficiency Moderate 30 No Yes Yes No 1
import numpy as np
# Missing-value handling: turn the literal "No Data" marker into a real NaN
# so pandas treats those cells as missing.
df.replace(to_replace="No Data", value=np.nan, inplace=True)
df.head()
ID 遗传因素 荷尔蒙变化 医疗状况 药物及治疗 营养缺乏 压力水平 年龄 不良护发习惯 环境因素 吸烟习惯 体重减轻 脱发标记
0 133992 Yes No NaN NaN Magnesium deficiency Moderate 19 Yes Yes No No 0
1 148393 No No Eczema Antibiotics Magnesium deficiency High 43 Yes Yes No No 0
2 155074 No No Dermatosis Antifungal Cream Protein deficiency Moderate 26 Yes Yes No Yes 0
3 118261 Yes Yes Ringworm Antibiotics Biotin Deficiency Moderate 46 Yes Yes No No 0
4 111915 No No Psoriasis Accutane Iron deficiency Moderate 30 No Yes Yes No 1
import numpy as np
# Encode the Yes/No columns numerically (Yes -> 1, No -> 0).
binary_cols = ['遗传因素', '荷尔蒙变化', '不良护发习惯', '环境因素', '吸烟习惯', '体重减轻']
# The explicit NaN entry keeps missing values missing.
yes_no_codes = {'Yes': 1, 'No': 0, np.nan: np.nan}
df[binary_cols] = df[binary_cols].apply(lambda column: column.map(yes_no_codes))
df.head()
ID 遗传因素 荷尔蒙变化 医疗状况 药物及治疗 营养缺乏 压力水平 年龄 不良护发习惯 环境因素 吸烟习惯 体重减轻 脱发标记
0 133992 1.0 0.0 NaN NaN Magnesium deficiency Moderate 19 1.0 1.0 0.0 0.0 0
1 148393 0.0 0.0 Eczema Antibiotics Magnesium deficiency High 43 1.0 1.0 0.0 0.0 0
2 155074 0.0 0.0 Dermatosis Antifungal Cream Protein deficiency Moderate 26 1.0 1.0 0.0 1.0 0
3 118261 1.0 1.0 Ringworm Antibiotics Biotin Deficiency Moderate 46 1.0 1.0 0.0 0.0 0
4 111915 0.0 0.0 Psoriasis Accutane Iron deficiency Moderate 30 0.0 1.0 1.0 0.0 1
# Derive a high-stress indicator: 1 where the stress level is 'High', 0 for
# every other value (including missing ones).
df['高压力'] = (df['压力水平'] == 'High').astype('int64')
df.head()
ID 遗传因素 荷尔蒙变化 医疗状况 药物及治疗 营养缺乏 压力水平 年龄 不良护发习惯 环境因素 吸烟习惯 体重减轻 脱发标记 高压力
0 133992 1.0 0.0 NaN NaN Magnesium deficiency Moderate 19 1.0 1.0 0.0 0.0 0 0
1 148393 0.0 0.0 Eczema Antibiotics Magnesium deficiency High 43 1.0 1.0 0.0 0.0 0 1
2 155074 0.0 0.0 Dermatosis Antifungal Cream Protein deficiency Moderate 26 1.0 1.0 0.0 1.0 0 0
3 118261 1.0 1.0 Ringworm Antibiotics Biotin Deficiency Moderate 46 1.0 1.0 0.0 0.0 0 0
4 111915 0.0 0.0 Psoriasis Accutane Iron deficiency Moderate 30 0.0 1.0 1.0 0.0 1 0
# Distribution of the hair-loss flag.
# value_counts() orders classes by frequency, so hard-coded labels would be
# attached to the wrong wedge whenever class 1 outnumbers class 0. Sort by class
# value and derive each label from the class it belongs to instead.
data = df['脱发标记'].value_counts().sort_index()
class_names = {0: '不脱发', 1: '脱发'}
labels = [class_names.get(value, str(value)) for value in data.index]

plt.figure(figsize=(8, 5))
plt.pie(data, labels=labels, autopct='%.2f%%', startangle=90, shadow=True)
plt.title('脱发标记分布')
plt.show()

在这里插入图片描述

# Age versus hair loss: one box per value of the hair-loss flag.
fig, ax1 = plt.subplots(figsize=(10, 6))
df.boxplot(column='年龄', by='脱发标记', ax=ax1)
ax1.set_title('脱发人群年龄分布', fontsize=14)
ax1.set_ylabel('年龄')
plt.show()

在这里插入图片描述

# Most frequent medical conditions in the dataset (top ten by row count).
top_conditions = df['医疗状况'].value_counts().head(10)

plt.figure(figsize=(12, 8))
plt.barh(top_conditions.index, top_conditions.values)  # horizontal bar chart
plt.title('十大常见脱发相关医疗状况', fontsize=14)
plt.xlabel('样本数量', fontsize=12)
plt.ylabel('医疗状况', fontsize=12)
plt.tight_layout()
plt.show()

在这里插入图片描述

# Most frequent nutritional deficiencies in the dataset (top eight by row count).
top8_nutrition = df['营养缺乏'].value_counts().head(8)

plt.figure(figsize=(12, 8))
plt.barh(top8_nutrition.index, top8_nutrition.values)
plt.title('常见脱发相关营养缺乏类型', fontsize=16, pad=20)
plt.xlabel('样本数量', fontsize=14)
plt.ylabel('营养缺乏类型', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.show()

在这里插入图片描述

# Relationship between each binary feature and the hair-loss flag:
# one grouped bar chart per feature, bars grouped by hair-loss class.
features = ['遗传因素', '荷尔蒙变化', '不良护发习惯', '环境因素', '吸烟习惯', '体重减轻']

fig, axes = plt.subplots(3, 2, figsize=(15, 15))
axes = axes.flatten()
# Read the class values without converting the target column in place, so the
# column keeps its numeric dtype for the correlation and modelling steps.
categories = sorted(df['脱发标记'].dropna().unique())
num_categories = len(categories)
x = np.arange(num_categories)
width = 0.35

for ax, feature in zip(axes, features):
    # Rows: hair-loss class; columns: feature value (0.0 / 1.0); cells: row counts.
    # reindex guarantees both columns and every class exist even when a
    # combination never occurs in the data.
    counts = (
        df.groupby('脱发标记')[feature]
        .value_counts()
        .unstack(fill_value=0)
        .reindex(index=categories, columns=[0.0, 1.0], fill_value=0)
    )
    ax.bar(x - width / 2, counts[0.0], width)
    ax.bar(x + width / 2, counts[1.0], width)
    ax.set_title(f'{feature}与脱发')
    ax.set_ylabel('数量')
    ax.set_xticks(x)
    ax.set_xticklabels(categories)
    ax.legend(['无', '有'])

plt.tight_layout()
plt.show()

在这里插入图片描述

counts
体重减轻 0.0 1.0
脱发标记
0 276 226
1 251 246
# Correlation analysis between the binary factors and the hair-loss flag.
corr_features = ['遗传因素', '荷尔蒙变化', '高压力', '不良护发习惯', '吸烟习惯', '体重减轻', '脱发标记']
# Cast to float first so every listed column takes part in the correlation,
# including the target in case it currently has a categorical dtype.
corr = df[corr_features].astype(float).corr()
plt.figure(figsize=(12, 8))
plt.imshow(corr, cmap='hot', interpolation='nearest', alpha=0.9)
plt.colorbar()
# Name the rows and columns; without tick labels the cells cannot be matched
# to the factors they describe.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=45, ha='right')
plt.yticks(tick_positions, corr.index)
plt.title('脱发相关因素相关系数热力图', fontsize=14)

plt.tight_layout()
plt.show()

在这里插入图片描述

df.head()
ID 遗传因素 荷尔蒙变化 医疗状况 药物及治疗 营养缺乏 压力水平 年龄 不良护发习惯 环境因素 吸烟习惯 体重减轻 脱发标记 高压力
0 133992 1.0 0.0 NaN NaN Magnesium deficiency Moderate 19 1.0 1.0 0.0 0.0 0 0
1 148393 0.0 0.0 Eczema Antibiotics Magnesium deficiency High 43 1.0 1.0 0.0 0.0 0 1
2 155074 0.0 0.0 Dermatosis Antifungal Cream Protein deficiency Moderate 26 1.0 1.0 0.0 1.0 0 0
3 118261 1.0 1.0 Ringworm Antibiotics Biotin Deficiency Moderate 46 1.0 1.0 0.0 0.0 0 0
4 111915 0.0 0.0 Psoriasis Accutane Iron deficiency Moderate 30 0.0 1.0 1.0 0.0 1 0
# Missing-value handling: drop every row that lacks the target or any of the
# three free-text categorical fields (the displayed outputs go from 999 rows
# to 809 rows at this step).
required_cols = ['脱发标记', '医疗状况', '药物及治疗', '营养缺乏']
df.dropna(subset=required_cols, inplace=True)

# Composite variable: 1 only when both the genetics flag and the high-stress
# flag are set.
has_genetics = df['遗传因素'] == 1
is_high_stress = df['高压力'] == 1
df['遗传高压力组合'] = (has_genetics & is_high_stress).astype(int)

# Label-encode the categorical columns, keeping each fitted encoder so the
# integer codes can be mapped back to the original strings.
# NOTE(review): LabelEncoder assigns codes in sorted order of the strings, so
# the stress levels are not encoded in Low < Moderate < High order.
label_encoders = {}
categorical_cols = ['医疗状况', '药物及治疗', '营养缺乏', '压力水平']
for col in categorical_cols:
    as_text = df[col].astype(str)
    encoder = LabelEncoder().fit(as_text)
    df[col] = encoder.transform(as_text)
    label_encoders[col] = encoder
df
ID 遗传因素 荷尔蒙变化 医疗状况 药物及治疗 营养缺乏 压力水平 年龄 不良护发习惯 环境因素 吸烟习惯 体重减轻 脱发标记 高压力 遗传高压力组合
1 148393 0.0 0.0 4 1 2 0 43 1.0 1.0 0.0 0.0 0 1 0
2 155074 0.0 0.0 3 3 4 2 26 1.0 1.0 0.0 1.0 0 0 0
3 118261 1.0 1.0 6 1 0 2 46 1.0 1.0 0.0 0.0 0 0 0
4 111915 0.0 0.0 5 0 1 2 30 0.0 1.0 1.0 0.0 1 0 0
5 139661 1.0 0.0 5 1 2 1 37 0.0 1.0 0.0 1.0 1 0 0
7 112032 1.0 0.0 3 5 3 0 35 1.0 0.0 1.0 0.0 0 1 1
8 140785 1.0 0.0 4 9 5 2 19 0.0 0.0 1.0 1.0 1 0 0
9 187999 0.0 1.0 6 8 2 2 49 1.0 1.0 1.0 0.0 0 0 0
10 118858 1.0 1.0 4 4 0 0 26 1.0 1.0 1.0 0.0 0 1 1
11 159158 0.0 1.0 0 0 9 0 48 0.0 0.0 0.0 0.0 1 1 0
12 156086 1.0 1.0 7 7 0 2 20 0.0 1.0 1.0 0.0 1 0 0
13 178256 0.0 0.0 5 1 6 0 30 1.0 1.0 1.0 1.0 0 1 0
14 150154 1.0 0.0 4 1 0 0 34 1.0 1.0 0.0 1.0 0 1 1
15 130552 1.0 1.0 7 8 7 2 29 1.0 0.0 0.0 1.0 0 0 0
16 116190 1.0 0.0 8 2 7 0 46 1.0 1.0 0.0 1.0 0 1 1
17 194441 0.0 1.0 3 1 9 1 19 1.0 0.0 0.0 1.0 1 0 0
18 147404 1.0 1.0 3 0 0 1 26 0.0 0.0 1.0 0.0 0 0 0
19 136709 1.0 1.0 8 5 6 0 46 1.0 1.0 0.0 1.0 1 1 1
20 187362 1.0 1.0 8 0 4 0 46 0.0 1.0 0.0 1.0 1 1 1
22 148974 1.0 1.0 5 1 7 1 29 0.0 0.0 1.0 1.0 0 0 0
23 116818 0.0 1.0 7 2 7 0 37 0.0 0.0 1.0 0.0 1 1 0
24 142062 0.0 1.0 2 1 4 0 33 1.0 1.0 0.0 0.0 0 1 0
25 147833 1.0 1.0 3 6 6 2 34 0.0 0.0 0.0 0.0 0 0 0
26 190967 0.0 1.0 3 7 5 0 28 0.0 0.0 0.0 0.0 0 1 0
27 114579 0.0 1.0 8 3 9 1 41 0.0 0.0 0.0 0.0 0 0 0
28 159949 0.0 1.0 9 5 3 1 35 1.0 1.0 1.0 0.0 0 0 0
31 133091 0.0 1.0 2 1 1 2 45 0.0 0.0 0.0 1.0 1 0 0
32 157912 1.0 0.0 2 9 4 2 30 1.0 1.0 0.0 0.0 0 0 0
33 144334 1.0 0.0 6 9 5 0 24 1.0 1.0 0.0 1.0 1 1 1
34 136948 0.0 1.0 7 2 7 2 31 1.0 1.0 1.0 1.0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
961 144096 1.0 0.0 9 9 2 0 33 1.0 1.0 1.0 0.0 0 1 1
962 137559 0.0 0.0 9 7 2 2 42 1.0 0.0 0.0 0.0 0 0 0
963 196758 1.0 0.0 5 6 2 0 43 0.0 1.0 0.0 0.0 0 1 1
967 187095 1.0 0.0 1 5 1 2 41 1.0 1.0 0.0 0.0 1 0 0
968 190151 0.0 1.0 1 5 1 0 41 1.0 0.0 1.0 0.0 0 1 0
969 127078 1.0 0.0 9 8 3 0 35 1.0 1.0 0.0 1.0 0 1 1
970 163276 0.0 1.0 0 3 0 1 18 0.0 1.0 1.0 0.0 1 0 0
971 146975 0.0 0.0 0 1 2 2 40 1.0 1.0 0.0 0.0 1 0 0
972 132137 1.0 1.0 1 0 8 1 46 1.0 1.0 1.0 1.0 0 0 0
973 124196 1.0 0.0 5 6 3 0 38 1.0 1.0 1.0 0.0 0 1 1
974 154461 0.0 1.0 7 7 3 1 41 1.0 0.0 1.0 0.0 0 0 0
975 153091 1.0 1.0 2 6 7 0 44 0.0 1.0 1.0 0.0 0 1 1
977 128405 1.0 1.0 0 0 8 1 25 0.0 1.0 0.0 0.0 0 0 0
978 195409 1.0 0.0 3 1 9 1 46 0.0 0.0 0.0 1.0 0 0 0
979 113751 1.0 1.0 5 3 3 2 27 1.0 1.0 0.0 1.0 1 0 0
980 110360 1.0 1.0 3 2 5 2 26 0.0 0.0 0.0 0.0 0 0 0
982 131756 1.0 1.0 5 3 9 2 50 1.0 0.0 0.0 0.0 0 0 0
983 114983 0.0 1.0 9 9 0 0 33 1.0 1.0 0.0 0.0 1 1 0
984 149286 0.0 0.0 1 1 4 0 43 0.0 0.0 0.0 0.0 0 1 0
985 136269 1.0 0.0 8 4 0 1 33 1.0 0.0 1.0 1.0 0 0 0
986 194704 0.0 1.0 1 7 0 1 46 0.0 0.0 1.0 0.0 1 0 0
987 179959 1.0 1.0 1 7 5 2 19 1.0 0.0 1.0 0.0 0 0 0
989 144786 0.0 0.0 6 8 5 2 45 0.0 1.0 0.0 1.0 1 0 0
990 127532 1.0 0.0 0 2 9 1 42 0.0 1.0 1.0 0.0 0 0 0
991 131739 0.0 1.0 9 2 9 0 33 1.0 1.0 0.0 0.0 0 1 0
992 181854 1.0 1.0 3 8 2 1 30 0.0 0.0 0.0 0.0 1 0 0
994 184367 1.0 0.0 8 8 6 1 33 1.0 1.0 1.0 1.0 1 0 0
996 143273 0.0 1.0 1 2 4 2 20 1.0 0.0 1.0 1.0 1 0 0
997 169123 0.0 1.0 2 7 0 2 32 1.0 1.0 1.0 1.0 1 0 0
998 127183 1.0 1.0 5 4 7 1 34 0.0 1.0 0.0 0.0 1 0 0

809 rows × 15 columns

# Feature selection. Some inputs overlap in information ('压力水平', '高压力'
# and '遗传高压力组合' are all derived from the same stress column); the
# original author judged the impact to be small.
features = [
    '遗传因素', '荷尔蒙变化', '医疗状况', '药物及治疗',
    '营养缺乏', '压力水平', '年龄', '不良护发习惯',
    '环境因素', '吸烟习惯', '体重减轻', '高压力',
    '遗传高压力组合',
]

X, y = df[features], df['脱发标记']

# Hold out 15% of the rows for testing, stratified on the target, with a
# fixed seed for reproducibility.
split_options = dict(test_size=0.15, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
def evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit ``model``, report test-set metrics, and plot its confusion matrix and ROC curve.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``. For the ROC
            curve, ``predict_proba`` is used when present, otherwise
            ``decision_function``, otherwise the hard predictions.
        model_name: Display name used in the printed header and the plot titles.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        tuple: ``(model, roc_auc)`` - the fitted model and the area under its
        ROC curve on the test set.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the held-out data
    y_pred = model.predict(X_test)

    # Print the classification report (precision, recall, f1-score, support)
    print(f"=== {model_name} 模型评估 ===")
    print(classification_report(y_test, y_pred))

    # Draw the confusion matrix as a heatmap
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, cmap='hot', interpolation='nearest')
    plt.colorbar()
    # Annotate every cell. Loop bounds come from the matrix itself rather than
    # assuming a 2x2 shape, and counts are printed as integers.
    midpoint = (cm.max() + cm.min()) / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # The 'hot' colormap runs from black (low) to white (high), so use
            # white text on the dark cells and black text on the bright ones.
            text_color = 'black' if cm[i, j] > midpoint else 'white'
            plt.text(j, i, f'{cm[i, j]:d}', ha='center', va='center', color=text_color)
    plt.title(f'{model_name}模型混淆矩阵')
    plt.show()

    # Pick the best available continuous score for the ROC curve
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # e.g. SVM-style models without predict_proba
        y_prob = model.decision_function(X_test)
    else:
        y_prob = y_pred

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC曲线(面积 = %0.2f)' % roc_auc)
    # Diagonal reference line for a random classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('假阳率')
    plt.ylabel('真阳率')
    plt.title(f'{model_name}模型ROC曲线')
    plt.legend(loc="lower right")
    plt.show()

    return model, roc_auc
# Train and evaluate a class-weight-balanced random forest with a fixed seed.
rf_model, rf_auc = evaluate_model(
    model=RandomForestClassifier(class_weight='balanced', random_state=15),
    model_name="随机森林",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
=== 随机森林 模型评估 ===
             precision    recall  f1-score   support

          0       0.48      0.65      0.55        60
          1       0.49      0.32      0.39        62

avg / total       0.48      0.48      0.47       122

在这里插入图片描述

在这里插入图片描述

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #画图
import matplotlib as mpl
import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler #标准化
from sklearn.model_selection import train_test_split #划分训练集和测试集
from sklearn.linear_model import LogisticRegression #回归线性模型
from sklearn.tree import DecisionTreeClassifier #决策树
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.svm import SVC #支持向量积
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc #评价指标

import warnings
# String literals need plain ASCII quotes; typographic quotes are a SyntaxError.
warnings.filterwarnings('ignore')  # suppress warning messages
mpl.rcParams['font.family'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False  # step 2: fix the display of the minus sign on negative axis values
该代码展示了一个脱发预测的机器学习项目流程。首先导入必要的Python库(pandas、matplotlib、sklearn等),加载脱发预测数据集。数据集共13列:1个样本ID、11个特征变量(如遗传因素、荷尔蒙变化、医疗状况、压力水平等)和1个目标变量(Hair Loss)。接着使用随机森林算法进行建模(支持向量机SVC已导入,但文中尚未用于训练),并通过分类报告、混淆矩阵和ROC曲线对模型进行评估。数据包含多种分类变量(如"Yes/No")和数值变量(年龄),部分字段存在"No Data"缺失值。该分析旨在通过机器学习方法识别导致脱发的关键因素。

Logo

脑启社区是一个专注类脑智能领域的开发者社区。欢迎加入社区,共建类脑智能生态。社区为开发者提供了丰富的开源类脑工具软件、类脑算法模型及数据集、类脑知识库、类脑技术培训课程以及类脑应用案例等资源。

更多推荐