Untitled1
该代码展示了一个脱发预测的机器学习项目流程。首先导入必要的Python库(pandas、matplotlib、sklearn等),加载脱发预测数据集。数据集除 Id 外包含11个特征变量(如遗传因素、荷尔蒙变化、医疗状况、压力水平等)和1个目标变量(Hair Loss)。接着使用随机森林算法进行建模(同时导入了支持向量机备用),并准备了模型评估指标(分类报告、混淆矩阵、ROC曲线等)。数据包含多种分类变量(如"Yes/No")和数值变量(年龄),部分字段存在"No Data"缺失值。该分析旨在通过机器学习方法识别导致脱发的关键因素。
import pandas as pd  # alias must be `pd`: the loading code below calls pd.read_csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split  # train/test splitting
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix  # model evaluation
from sklearn.svm import SVC  # support vector machine
import warnings
warnings.filterwarnings('ignore')  # ignore warning messages
# Chinese titles and labels are used in every figure; this assumes the SimHei
# font is installed on the machine -- TODO confirm on non-Windows systems.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False  # avoids broken minus signs on axes
# Load the raw data set; the path is relative to the current working directory.
df = pd.read_csv('Predict Hair Fall.csv')
df
| | Id | Genetics | Hormonal Changes | Medical Conditions | Medications & Treatments | Nutritional Deficiencies | Stress | Age | Poor Hair Care Habits | Environmental Factors | Smoking | Weight Loss | Hair Loss |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 133992 | Yes | No | No Data | No Data | Magnesium deficiency | Moderate | 19 | Yes | Yes | No | No | 0 |
| 1 | 148393 | No | No | Eczema | Antibiotics | Magnesium deficiency | High | 43 | Yes | Yes | No | No | 0 |
| 2 | 155074 | No | No | Dermatosis | Antifungal Cream | Protein deficiency | Moderate | 26 | Yes | Yes | No | Yes | 0 |
| 3 | 118261 | Yes | Yes | Ringworm | Antibiotics | Biotin Deficiency | Moderate | 46 | Yes | Yes | No | No | 0 |
| 4 | 111915 | No | No | Psoriasis | Accutane | Iron deficiency | Moderate | 30 | No | Yes | Yes | No | 1 |
| 5 | 139661 | Yes | No | Psoriasis | Antibiotics | Magnesium deficiency | Low | 37 | No | Yes | No | Yes | 1 |
| 6 | 169255 | Yes | Yes | No Data | No Data | Selenium deficiency | High | 40 | Yes | No | No | No | 1 |
| 7 | 112032 | Yes | No | Dermatosis | Chemotherapy | Omega-3 fatty acids | High | 35 | Yes | No | Yes | No | 0 |
| 8 | 140785 | Yes | No | Eczema | Steroids | Selenium deficiency | Moderate | 19 | No | No | Yes | Yes | 1 |
| 9 | 187999 | No | Yes | Ringworm | Rogaine | Magnesium deficiency | Moderate | 49 | Yes | Yes | Yes | No | 0 |
| 10 | 118858 | Yes | Yes | Eczema | Blood Pressure Medication | Biotin Deficiency | High | 26 | Yes | Yes | Yes | No | 0 |
| 11 | 159158 | No | Yes | Alopecia Areata | Accutane | Zinc Deficiency | High | 48 | No | No | No | No | 1 |
| 12 | 156086 | Yes | Yes | Scalp Infection | Immunomodulators | Biotin Deficiency | Moderate | 20 | No | Yes | Yes | No | 1 |
| 13 | 178256 | No | No | Psoriasis | Antibiotics | Vitamin A Deficiency | High | 30 | Yes | Yes | Yes | Yes | 0 |
| 14 | 150154 | Yes | No | Eczema | Antibiotics | Biotin Deficiency | High | 34 | Yes | Yes | No | Yes | 0 |
| 15 | 130552 | Yes | Yes | Scalp Infection | Rogaine | Vitamin D Deficiency | Moderate | 29 | Yes | No | No | Yes | 0 |
| 16 | 116190 | Yes | No | Seborrheic Dermatitis | Antidepressants | Vitamin D Deficiency | High | 46 | Yes | Yes | No | Yes | 0 |
| 17 | 194441 | No | Yes | Dermatosis | Antibiotics | Zinc Deficiency | Low | 19 | Yes | No | No | Yes | 1 |
| 18 | 147404 | Yes | Yes | Dermatosis | Accutane | Biotin Deficiency | Low | 26 | No | No | Yes | No | 0 |
| 19 | 136709 | Yes | Yes | Seborrheic Dermatitis | Chemotherapy | Vitamin A Deficiency | High | 46 | Yes | Yes | No | Yes | 1 |
| 20 | 187362 | Yes | Yes | Seborrheic Dermatitis | Accutane | Protein deficiency | High | 46 | No | Yes | No | Yes | 1 |
| 21 | 133804 | No | No | No Data | Chemotherapy | Zinc Deficiency | High | 20 | No | Yes | Yes | Yes | 1 |
| 22 | 148974 | Yes | Yes | Psoriasis | Antibiotics | Vitamin D Deficiency | Low | 29 | No | No | Yes | Yes | 0 |
| 23 | 116818 | No | Yes | Scalp Infection | Antidepressants | Vitamin D Deficiency | High | 37 | No | No | Yes | No | 1 |
| 24 | 142062 | No | Yes | Dermatitis | Antibiotics | Protein deficiency | High | 33 | Yes | Yes | No | No | 0 |
| 25 | 147833 | Yes | Yes | Dermatosis | Heart Medication | Vitamin A Deficiency | Moderate | 34 | No | No | No | No | 0 |
| 26 | 190967 | No | Yes | Dermatosis | Immunomodulators | Selenium deficiency | High | 28 | No | No | No | No | 0 |
| 27 | 114579 | No | Yes | Seborrheic Dermatitis | Antifungal Cream | Zinc Deficiency | Low | 41 | No | No | No | No | 0 |
| 28 | 159949 | No | Yes | Thyroid Problems | Chemotherapy | Omega-3 fatty acids | Low | 35 | Yes | Yes | Yes | No | 0 |
| 29 | 117296 | No | Yes | Dermatitis | Chemotherapy | No Data | Low | 21 | Yes | Yes | Yes | No | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 969 | 127078 | Yes | No | Thyroid Problems | Rogaine | Omega-3 fatty acids | High | 35 | Yes | Yes | No | Yes | 0 |
| 970 | 163276 | No | Yes | Alopecia Areata | Antifungal Cream | Biotin Deficiency | Low | 18 | No | Yes | Yes | No | 1 |
| 971 | 146975 | No | No | Alopecia Areata | Antibiotics | Magnesium deficiency | Moderate | 40 | Yes | Yes | No | No | 1 |
| 972 | 132137 | Yes | Yes | Androgenetic Alopecia | Accutane | Vitamin E deficiency | Low | 46 | Yes | Yes | Yes | Yes | 0 |
| 973 | 124196 | Yes | No | Psoriasis | Heart Medication | Omega-3 fatty acids | High | 38 | Yes | Yes | Yes | No | 0 |
| 974 | 154461 | No | Yes | Scalp Infection | Immunomodulators | Omega-3 fatty acids | Low | 41 | Yes | No | Yes | No | 0 |
| 975 | 153091 | Yes | Yes | Dermatitis | Heart Medication | Vitamin D Deficiency | High | 44 | No | Yes | Yes | No | 0 |
| 976 | 163302 | Yes | Yes | Psoriasis | Blood Pressure Medication | No Data | Low | 21 | Yes | Yes | Yes | No | 0 |
| 977 | 128405 | Yes | Yes | Alopecia Areata | Accutane | Vitamin E deficiency | Low | 25 | No | Yes | No | No | 0 |
| 978 | 195409 | Yes | No | Dermatosis | Antibiotics | Zinc Deficiency | Low | 46 | No | No | No | Yes | 0 |
| 979 | 113751 | Yes | Yes | Psoriasis | Antifungal Cream | Omega-3 fatty acids | Moderate | 27 | Yes | Yes | No | Yes | 1 |
| 980 | 110360 | Yes | Yes | Dermatosis | Antidepressants | Selenium deficiency | Moderate | 26 | No | No | No | No | 0 |
| 981 | 130242 | Yes | Yes | No Data | Antibiotics | Iron deficiency | High | 40 | Yes | Yes | Yes | Yes | 0 |
| 982 | 131756 | Yes | Yes | Psoriasis | Antifungal Cream | Zinc Deficiency | Moderate | 50 | Yes | No | No | No | 0 |
| 983 | 114983 | No | Yes | Thyroid Problems | Steroids | Biotin Deficiency | High | 33 | Yes | Yes | No | No | 1 |
| 984 | 149286 | No | No | Androgenetic Alopecia | Antibiotics | Protein deficiency | High | 43 | No | No | No | No | 0 |
| 985 | 136269 | Yes | No | Seborrheic Dermatitis | Blood Pressure Medication | Biotin Deficiency | Low | 33 | Yes | No | Yes | Yes | 0 |
| 986 | 194704 | No | Yes | Androgenetic Alopecia | Immunomodulators | Biotin Deficiency | Low | 46 | No | No | Yes | No | 1 |
| 987 | 179959 | Yes | Yes | Androgenetic Alopecia | Immunomodulators | Selenium deficiency | Moderate | 19 | Yes | No | Yes | No | 0 |
| 988 | 174574 | No | No | Androgenetic Alopecia | Antibiotics | No Data | High | 23 | Yes | Yes | No | Yes | 0 |
| 989 | 144786 | No | No | Ringworm | Rogaine | Selenium deficiency | Moderate | 45 | No | Yes | No | Yes | 1 |
| 990 | 127532 | Yes | No | Alopecia Areata | Antidepressants | Zinc Deficiency | Low | 42 | No | Yes | Yes | No | 0 |
| 991 | 131739 | No | Yes | Thyroid Problems | Antidepressants | Zinc Deficiency | High | 33 | Yes | Yes | No | No | 0 |
| 992 | 181854 | Yes | Yes | Dermatosis | Rogaine | Magnesium deficiency | Low | 30 | No | No | No | No | 1 |
| 993 | 196218 | No | Yes | Scalp Infection | Immunomodulators | No Data | Moderate | 23 | Yes | Yes | Yes | Yes | 0 |
| 994 | 184367 | Yes | No | Seborrheic Dermatitis | Rogaine | Vitamin A Deficiency | Low | 33 | Yes | Yes | Yes | Yes | 1 |
| 995 | 164777 | Yes | Yes | No Data | Accutane | Protein deficiency | Low | 47 | No | No | No | Yes | 0 |
| 996 | 143273 | No | Yes | Androgenetic Alopecia | Antidepressants | Protein deficiency | Moderate | 20 | Yes | No | Yes | Yes | 1 |
| 997 | 169123 | No | Yes | Dermatitis | Immunomodulators | Biotin Deficiency | Moderate | 32 | Yes | Yes | Yes | Yes | 1 |
| 998 | 127183 | Yes | Yes | Psoriasis | Blood Pressure Medication | Vitamin D Deficiency | Low | 34 | No | Yes | No | No | 1 |
999 rows × 13 columns
# Chinese display names for the 12 non-ID columns, listed in column order.
chinese_columns = [
    '遗传因素', '荷尔蒙变化', '医疗状况', '药物及治疗',
    '营养缺乏', '压力水平', '年龄', '不良护发习惯',
    '环境因素', '吸烟习惯', '体重减轻',
    '脱发标记',  # target variable
]
# Rename positionally: the first column becomes 'ID', the remaining twelve take
# the names above in order.
df.columns = ['ID', *chinese_columns]
df.head(5)
| | ID | 遗传因素 | 荷尔蒙变化 | 医疗状况 | 药物及治疗 | 营养缺乏 | 压力水平 | 年龄 | 不良护发习惯 | 环境因素 | 吸烟习惯 | 体重减轻 | 脱发标记 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 133992 | Yes | No | No Data | No Data | Magnesium deficiency | Moderate | 19 | Yes | Yes | No | No | 0 |
| 1 | 148393 | No | No | Eczema | Antibiotics | Magnesium deficiency | High | 43 | Yes | Yes | No | No | 0 |
| 2 | 155074 | No | No | Dermatosis | Antifungal Cream | Protein deficiency | Moderate | 26 | Yes | Yes | No | Yes | 0 |
| 3 | 118261 | Yes | Yes | Ringworm | Antibiotics | Biotin Deficiency | Moderate | 46 | Yes | Yes | No | No | 0 |
| 4 | 111915 | No | No | Psoriasis | Accutane | Iron deficiency | Moderate | 30 | No | Yes | Yes | No | 1 |
import numpy as np
# Missing-value handling: the data marks missing entries with the literal
# string "No Data"; convert those cells to NaN so pandas treats them as missing.
df.replace(to_replace="No Data", value=np.nan, inplace=True)
df.head()
| | ID | 遗传因素 | 荷尔蒙变化 | 医疗状况 | 药物及治疗 | 营养缺乏 | 压力水平 | 年龄 | 不良护发习惯 | 环境因素 | 吸烟习惯 | 体重减轻 | 脱发标记 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 133992 | Yes | No | NaN | NaN | Magnesium deficiency | Moderate | 19 | Yes | Yes | No | No | 0 |
| 1 | 148393 | No | No | Eczema | Antibiotics | Magnesium deficiency | High | 43 | Yes | Yes | No | No | 0 |
| 2 | 155074 | No | No | Dermatosis | Antifungal Cream | Protein deficiency | Moderate | 26 | Yes | Yes | No | Yes | 0 |
| 3 | 118261 | Yes | Yes | Ringworm | Antibiotics | Biotin Deficiency | Moderate | 46 | Yes | Yes | No | No | 0 |
| 4 | 111915 | No | No | Psoriasis | Accutane | Iron deficiency | Moderate | 30 | No | Yes | Yes | No | 1 |
import numpy as np
# Encode the Yes/No columns as 1/0.
binary_cols = ['遗传因素', '荷尔蒙变化', '不良护发习惯', '环境因素', '吸烟习惯', '体重减轻']
# Missing entries are mapped explicitly to NaN so they stay missing.
yes_no_codes = {'Yes': 1, 'No': 0, np.nan: np.nan}
for col in binary_cols:
    df[col] = df[col].map(yes_no_codes)
df.head()
| | ID | 遗传因素 | 荷尔蒙变化 | 医疗状况 | 药物及治疗 | 营养缺乏 | 压力水平 | 年龄 | 不良护发习惯 | 环境因素 | 吸烟习惯 | 体重减轻 | 脱发标记 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 133992 | 1.0 | 0.0 | NaN | NaN | Magnesium deficiency | Moderate | 19 | 1.0 | 1.0 | 0.0 | 0.0 | 0 |
| 1 | 148393 | 0.0 | 0.0 | Eczema | Antibiotics | Magnesium deficiency | High | 43 | 1.0 | 1.0 | 0.0 | 0.0 | 0 |
| 2 | 155074 | 0.0 | 0.0 | Dermatosis | Antifungal Cream | Protein deficiency | Moderate | 26 | 1.0 | 1.0 | 0.0 | 1.0 | 0 |
| 3 | 118261 | 1.0 | 1.0 | Ringworm | Antibiotics | Biotin Deficiency | Moderate | 46 | 1.0 | 1.0 | 0.0 | 0.0 | 0 |
| 4 | 111915 | 0.0 | 0.0 | Psoriasis | Accutane | Iron deficiency | Moderate | 30 | 0.0 | 1.0 | 1.0 | 0.0 | 1 |
# High-stress group flag: 1 where the stress level is 'High', otherwise 0.
is_high_stress = df['压力水平'] == 'High'
df['高压力'] = is_high_stress.astype('int64')
df.head()
| | ID | 遗传因素 | 荷尔蒙变化 | 医疗状况 | 药物及治疗 | 营养缺乏 | 压力水平 | 年龄 | 不良护发习惯 | 环境因素 | 吸烟习惯 | 体重减轻 | 脱发标记 | 高压力 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 133992 | 1.0 | 0.0 | NaN | NaN | Magnesium deficiency | Moderate | 19 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 |
| 1 | 148393 | 0.0 | 0.0 | Eczema | Antibiotics | Magnesium deficiency | High | 43 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 1 |
| 2 | 155074 | 0.0 | 0.0 | Dermatosis | Antifungal Cream | Protein deficiency | Moderate | 26 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 0 |
| 3 | 118261 | 1.0 | 1.0 | Ringworm | Antibiotics | Biotin Deficiency | Moderate | 46 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 |
| 4 | 111915 | 0.0 | 0.0 | Psoriasis | Accutane | Iron deficiency | Moderate | 30 | 0.0 | 1.0 | 1.0 | 0.0 | 1 | 0 |
# Distribution of the hair-loss flag.
data = df['脱发标记'].value_counts()
# value_counts() orders classes by frequency, not by value, so each wedge label
# is looked up from its class value instead of assuming class 0 comes first.
class_names = {0: '不脱发', 1: '脱发'}
wedge_labels = [class_names[int(value)] for value in data.index]
plt.figure(figsize=(8, 5))
plt.pie(data, labels=wedge_labels, autopct='%.2f%%', startangle=90, shadow=True)
plt.title('脱发标记分布')
plt.show()

# Age versus hair loss: one box per value of the hair-loss flag.
fig, ax1 = plt.subplots(figsize=(10, 6))
df.boxplot(column='年龄', by='脱发标记', ax=ax1)
ax1.set_title('脱发人群年龄分布', fontsize=14)
ax1.set_ylabel('年龄')
plt.show()

# Most frequent medical conditions (at most ten), drawn as horizontal bars.
top_conditions = df['医疗状况'].value_counts().head(10)
plt.figure(figsize=(12, 8))
plt.barh(top_conditions.index, top_conditions.values)
plt.title('十大常见脱发相关医疗状况', fontsize=14)
plt.xlabel('样本数量', fontsize=12)
plt.ylabel('医疗状况', fontsize=12)
plt.tight_layout()
plt.show()

# Most frequent nutritional deficiencies (at most eight), drawn as horizontal bars.
top8_nutrition = df['营养缺乏'].value_counts().head(8)
plt.figure(figsize=(12, 8))
plt.barh(top8_nutrition.index, top8_nutrition.values)
plt.title('常见脱发相关营养缺乏类型', fontsize=16, pad=20)
plt.xlabel('样本数量', fontsize=14)
plt.ylabel('营养缺乏类型', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# Binary features versus hair loss: one grouped bar chart per feature.
features = ['遗传因素', '荷尔蒙变化', '不良护发习惯', '环境因素', '吸烟习惯', '体重减轻']
fig, axes = plt.subplots(3, 2, figsize=(15, 15))
axes = axes.flatten()
# Use a categorical copy of the flag for grouping only; df keeps its original
# column so later numeric steps (e.g. correlations) are not affected.
target = df['脱发标记'].astype('category')
categories = target.cat.categories
num_categories = len(categories)
x = np.arange(num_categories)
width = 0.35
# zip() pairs each feature with its own subplot and stops at the shorter of the two.
for ax, feature in zip(axes, features):
    # Rows: hair-loss class; columns: feature value (0 = absent, 1 = present).
    counts = df.groupby(target)[feature].value_counts().unstack(fill_value=0)
    ax.bar(x - width / 2, counts[0], width, label='无')
    ax.bar(x + width / 2, counts[1], width, label='有')
    ax.set_title(f'{feature}与脱发')
    ax.set_ylabel('数量')
    ax.set_xticks(x)
    ax.set_xticklabels(categories)
    ax.legend()
plt.tight_layout()
plt.show()

# Show the count table of the last feature plotted.
counts
| 体重减轻 | 0.0 | 1.0 |
|---|---|---|
| 脱发标记 | ||
| 0 | 276 | 226 |
| 1 | 251 | 246 |
# Correlation analysis between the binary factors and the hair-loss flag.
corr_features = ['遗传因素', '荷尔蒙变化', '高压力', '不良护发习惯', '吸烟习惯', '体重减轻', '脱发标记']
# Cast to float first so that a category-typed flag column is still part of
# the correlation matrix instead of being rejected or dropped as non-numeric.
corr = df[corr_features].astype(float).corr()
plt.figure(figsize=(12, 8))
plt.imshow(corr, cmap='hot', interpolation='nearest', alpha=0.9)
plt.colorbar()
# imshow alone only shows numeric positions; label both axes with the
# feature names so each cell can be identified.
tick_positions = list(range(len(corr_features)))
plt.xticks(tick_positions, corr_features, rotation=45, ha='right')
plt.yticks(tick_positions, corr_features)
plt.title('脱发相关因素相关系数热力图', fontsize=14)
plt.tight_layout()
plt.show()

df.head()
| | ID | 遗传因素 | 荷尔蒙变化 | 医疗状况 | 药物及治疗 | 营养缺乏 | 压力水平 | 年龄 | 不良护发习惯 | 环境因素 | 吸烟习惯 | 体重减轻 | 脱发标记 | 高压力 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 133992 | 1.0 | 0.0 | NaN | NaN | Magnesium deficiency | Moderate | 19 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 |
| 1 | 148393 | 0.0 | 0.0 | Eczema | Antibiotics | Magnesium deficiency | High | 43 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 1 |
| 2 | 155074 | 0.0 | 0.0 | Dermatosis | Antifungal Cream | Protein deficiency | Moderate | 26 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 0 |
| 3 | 118261 | 1.0 | 1.0 | Ringworm | Antibiotics | Biotin Deficiency | Moderate | 46 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 |
| 4 | 111915 | 0.0 | 0.0 | Psoriasis | Accutane | Iron deficiency | Moderate | 30 | 0.0 | 1.0 | 1.0 | 0.0 | 1 | 0 |
# Missing-value handling: drop every row that lacks the target or any of the
# three free-text categorical fields.
required_cols = ['脱发标记', '医疗状况', '药物及治疗', '营养缺乏']
df.dropna(subset=required_cols, inplace=True)
# Composite feature: 1 only when the genetic factor and high stress are both present.
df['遗传高压力组合'] = (df['遗传因素'].eq(1) & df['高压力'].eq(1)).astype(int)
# Label-encode the categorical columns; each fitted encoder is kept so the
# integer codes can be mapped back to the original labels later.
label_encoders = {}
categorical_cols = ['医疗状况', '药物及治疗', '营养缺乏', '压力水平']
for col in categorical_cols:
    encoder = LabelEncoder()
    as_text = df[col].astype(str)
    df[col] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder
df
| | ID | 遗传因素 | 荷尔蒙变化 | 医疗状况 | 药物及治疗 | 营养缺乏 | 压力水平 | 年龄 | 不良护发习惯 | 环境因素 | 吸烟习惯 | 体重减轻 | 脱发标记 | 高压力 | 遗传高压力组合 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 148393 | 0.0 | 0.0 | 4 | 1 | 2 | 0 | 43 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 0 |
| 2 | 155074 | 0.0 | 0.0 | 3 | 3 | 4 | 2 | 26 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 0 | 0 |
| 3 | 118261 | 1.0 | 1.0 | 6 | 1 | 0 | 2 | 46 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 4 | 111915 | 0.0 | 0.0 | 5 | 0 | 1 | 2 | 30 | 0.0 | 1.0 | 1.0 | 0.0 | 1 | 0 | 0 |
| 5 | 139661 | 1.0 | 0.0 | 5 | 1 | 2 | 1 | 37 | 0.0 | 1.0 | 0.0 | 1.0 | 1 | 0 | 0 |
| 7 | 112032 | 1.0 | 0.0 | 3 | 5 | 3 | 0 | 35 | 1.0 | 0.0 | 1.0 | 0.0 | 0 | 1 | 1 |
| 8 | 140785 | 1.0 | 0.0 | 4 | 9 | 5 | 2 | 19 | 0.0 | 0.0 | 1.0 | 1.0 | 1 | 0 | 0 |
| 9 | 187999 | 0.0 | 1.0 | 6 | 8 | 2 | 2 | 49 | 1.0 | 1.0 | 1.0 | 0.0 | 0 | 0 | 0 |
| 10 | 118858 | 1.0 | 1.0 | 4 | 4 | 0 | 0 | 26 | 1.0 | 1.0 | 1.0 | 0.0 | 0 | 1 | 1 |
| 11 | 159158 | 0.0 | 1.0 | 0 | 0 | 9 | 0 | 48 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | 0 |
| 12 | 156086 | 1.0 | 1.0 | 7 | 7 | 0 | 2 | 20 | 0.0 | 1.0 | 1.0 | 0.0 | 1 | 0 | 0 |
| 13 | 178256 | 0.0 | 0.0 | 5 | 1 | 6 | 0 | 30 | 1.0 | 1.0 | 1.0 | 1.0 | 0 | 1 | 0 |
| 14 | 150154 | 1.0 | 0.0 | 4 | 1 | 0 | 0 | 34 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 1 | 1 |
| 15 | 130552 | 1.0 | 1.0 | 7 | 8 | 7 | 2 | 29 | 1.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0 |
| 16 | 116190 | 1.0 | 0.0 | 8 | 2 | 7 | 0 | 46 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 1 | 1 |
| 17 | 194441 | 0.0 | 1.0 | 3 | 1 | 9 | 1 | 19 | 1.0 | 0.0 | 0.0 | 1.0 | 1 | 0 | 0 |
| 18 | 147404 | 1.0 | 1.0 | 3 | 0 | 0 | 1 | 26 | 0.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 0 |
| 19 | 136709 | 1.0 | 1.0 | 8 | 5 | 6 | 0 | 46 | 1.0 | 1.0 | 0.0 | 1.0 | 1 | 1 | 1 |
| 20 | 187362 | 1.0 | 1.0 | 8 | 0 | 4 | 0 | 46 | 0.0 | 1.0 | 0.0 | 1.0 | 1 | 1 | 1 |
| 22 | 148974 | 1.0 | 1.0 | 5 | 1 | 7 | 1 | 29 | 0.0 | 0.0 | 1.0 | 1.0 | 0 | 0 | 0 |
| 23 | 116818 | 0.0 | 1.0 | 7 | 2 | 7 | 0 | 37 | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 1 | 0 |
| 24 | 142062 | 0.0 | 1.0 | 2 | 1 | 4 | 0 | 33 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 0 |
| 25 | 147833 | 1.0 | 1.0 | 3 | 6 | 6 | 2 | 34 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 26 | 190967 | 0.0 | 1.0 | 3 | 7 | 5 | 0 | 28 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 |
| 27 | 114579 | 0.0 | 1.0 | 8 | 3 | 9 | 1 | 41 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 28 | 159949 | 0.0 | 1.0 | 9 | 5 | 3 | 1 | 35 | 1.0 | 1.0 | 1.0 | 0.0 | 0 | 0 | 0 |
| 31 | 133091 | 0.0 | 1.0 | 2 | 1 | 1 | 2 | 45 | 0.0 | 0.0 | 0.0 | 1.0 | 1 | 0 | 0 |
| 32 | 157912 | 1.0 | 0.0 | 2 | 9 | 4 | 2 | 30 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 33 | 144334 | 1.0 | 0.0 | 6 | 9 | 5 | 0 | 24 | 1.0 | 1.0 | 0.0 | 1.0 | 1 | 1 | 1 |
| 34 | 136948 | 0.0 | 1.0 | 7 | 2 | 7 | 2 | 31 | 1.0 | 1.0 | 1.0 | 1.0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 961 | 144096 | 1.0 | 0.0 | 9 | 9 | 2 | 0 | 33 | 1.0 | 1.0 | 1.0 | 0.0 | 0 | 1 | 1 |
| 962 | 137559 | 0.0 | 0.0 | 9 | 7 | 2 | 2 | 42 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 963 | 196758 | 1.0 | 0.0 | 5 | 6 | 2 | 0 | 43 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 1 |
| 967 | 187095 | 1.0 | 0.0 | 1 | 5 | 1 | 2 | 41 | 1.0 | 1.0 | 0.0 | 0.0 | 1 | 0 | 0 |
| 968 | 190151 | 0.0 | 1.0 | 1 | 5 | 1 | 0 | 41 | 1.0 | 0.0 | 1.0 | 0.0 | 0 | 1 | 0 |
| 969 | 127078 | 1.0 | 0.0 | 9 | 8 | 3 | 0 | 35 | 1.0 | 1.0 | 0.0 | 1.0 | 0 | 1 | 1 |
| 970 | 163276 | 0.0 | 1.0 | 0 | 3 | 0 | 1 | 18 | 0.0 | 1.0 | 1.0 | 0.0 | 1 | 0 | 0 |
| 971 | 146975 | 0.0 | 0.0 | 0 | 1 | 2 | 2 | 40 | 1.0 | 1.0 | 0.0 | 0.0 | 1 | 0 | 0 |
| 972 | 132137 | 1.0 | 1.0 | 1 | 0 | 8 | 1 | 46 | 1.0 | 1.0 | 1.0 | 1.0 | 0 | 0 | 0 |
| 973 | 124196 | 1.0 | 0.0 | 5 | 6 | 3 | 0 | 38 | 1.0 | 1.0 | 1.0 | 0.0 | 0 | 1 | 1 |
| 974 | 154461 | 0.0 | 1.0 | 7 | 7 | 3 | 1 | 41 | 1.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 0 |
| 975 | 153091 | 1.0 | 1.0 | 2 | 6 | 7 | 0 | 44 | 0.0 | 1.0 | 1.0 | 0.0 | 0 | 1 | 1 |
| 977 | 128405 | 1.0 | 1.0 | 0 | 0 | 8 | 1 | 25 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 978 | 195409 | 1.0 | 0.0 | 3 | 1 | 9 | 1 | 46 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0 |
| 979 | 113751 | 1.0 | 1.0 | 5 | 3 | 3 | 2 | 27 | 1.0 | 1.0 | 0.0 | 1.0 | 1 | 0 | 0 |
| 980 | 110360 | 1.0 | 1.0 | 3 | 2 | 5 | 2 | 26 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 982 | 131756 | 1.0 | 1.0 | 5 | 3 | 9 | 2 | 50 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 983 | 114983 | 0.0 | 1.0 | 9 | 9 | 0 | 0 | 33 | 1.0 | 1.0 | 0.0 | 0.0 | 1 | 1 | 0 |
| 984 | 149286 | 0.0 | 0.0 | 1 | 1 | 4 | 0 | 43 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 |
| 985 | 136269 | 1.0 | 0.0 | 8 | 4 | 0 | 1 | 33 | 1.0 | 0.0 | 1.0 | 1.0 | 0 | 0 | 0 |
| 986 | 194704 | 0.0 | 1.0 | 1 | 7 | 0 | 1 | 46 | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 0 | 0 |
| 987 | 179959 | 1.0 | 1.0 | 1 | 7 | 5 | 2 | 19 | 1.0 | 0.0 | 1.0 | 0.0 | 0 | 0 | 0 |
| 989 | 144786 | 0.0 | 0.0 | 6 | 8 | 5 | 2 | 45 | 0.0 | 1.0 | 0.0 | 1.0 | 1 | 0 | 0 |
| 990 | 127532 | 1.0 | 0.0 | 0 | 2 | 9 | 1 | 42 | 0.0 | 1.0 | 1.0 | 0.0 | 0 | 0 | 0 |
| 991 | 131739 | 0.0 | 1.0 | 9 | 2 | 9 | 0 | 33 | 1.0 | 1.0 | 0.0 | 0.0 | 0 | 1 | 0 |
| 992 | 181854 | 1.0 | 1.0 | 3 | 8 | 2 | 1 | 30 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | 0 |
| 994 | 184367 | 1.0 | 0.0 | 8 | 8 | 6 | 1 | 33 | 1.0 | 1.0 | 1.0 | 1.0 | 1 | 0 | 0 |
| 996 | 143273 | 0.0 | 1.0 | 1 | 2 | 4 | 2 | 20 | 1.0 | 0.0 | 1.0 | 1.0 | 1 | 0 | 0 |
| 997 | 169123 | 0.0 | 1.0 | 2 | 7 | 0 | 2 | 32 | 1.0 | 1.0 | 1.0 | 1.0 | 1 | 0 | 0 |
| 998 | 127183 | 1.0 | 1.0 | 5 | 4 | 7 | 1 | 34 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 0 | 0 |
809 rows × 15 columns
# Feature selection. '高压力' and '遗传高压力组合' are derived from other
# listed columns, so some information is repeated; the author accepts this.
features = [
    '遗传因素', '荷尔蒙变化', '医疗状况', '药物及治疗',
    '营养缺乏', '压力水平', '年龄', '不良护发习惯',
    '环境因素', '吸烟习惯', '体重减轻', '高压力',
    '遗传高压力组合',
]
X = df.loc[:, features]
y = df.loc[:, '脱发标记']
# Hold out 15% of the rows for testing, stratified on the target, with a
# fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)
def _plot_confusion_matrix(cm, model_name):
    """Draw the confusion matrix *cm* as a heat map with a count in every cell."""
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, cmap='hot', interpolation='nearest')
    plt.colorbar()
    n_rows, n_cols = cm.shape
    # The 'hot' colormap is dark for small counts and light for large ones, so
    # the text colour is chosen per cell to stay readable.
    midpoint = (cm.max() + cm.min()) / 2
    for i in range(n_rows):
        for j in range(n_cols):
            text_color = 'white' if cm[i, j] < midpoint else 'black'
            # Counts are integers; print them as-is rather than with decimals.
            plt.text(j, i, f'{cm[i, j]}', ha='center', va='center', color=text_color)
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    plt.xticks(range(n_cols))
    plt.yticks(range(n_rows))
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.title(f'{model_name}模型混淆矩阵')
    plt.show()


def _plot_roc_curve(fpr, tpr, roc_auc, model_name):
    """Draw the ROC curve together with the diagonal chance line."""
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC曲线(面积 = {roc_auc:0.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('假阳率')
    plt.ylabel('真阳率')
    plt.title(f'{model_name}模型ROC曲线')
    plt.legend(loc="lower right")
    plt.show()


def evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit *model*, report and plot its test-set performance, and return it.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
            ``decision_function`` is used for the ROC curve when available.
        model_name: Display name used in the printed header and plot titles.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels (the ROC step treats the task as binary).

    Returns:
        A ``(model, roc_auc)`` tuple: the fitted model and the area under its
        ROC curve on the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Precision, recall, F1 score and support per class.
    print(f"=== {model_name} 模型评估 ===")
    print(classification_report(y_test, y_pred))

    _plot_confusion_matrix(confusion_matrix(y_test, y_pred), model_name)

    # Prefer a continuous score for the ROC curve; hard predictions are the
    # last resort and give a curve with a single operating point.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = y_pred
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    _plot_roc_curve(fpr, tpr, roc_auc, model_name)
    return model, roc_auc
# Random forest with balanced class weights and a fixed seed, trained and
# evaluated in one call.
rf_model, rf_auc = evaluate_model(
    RandomForestClassifier(class_weight='balanced', random_state=15),
    "随机森林",
    X_train,
    X_test,
    y_train,
    y_test,
)
=== 随机森林 模型评估 ===
precision recall f1-score support
0 0.48 0.65 0.55 60
1 0.49 0.32 0.39 62
avg / total 0.48 0.48 0.47 122


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #画图
import matplotlib as mpl
import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler #标准化
from sklearn.model_selection import train_test_split #划分训练集和测试集
from sklearn.linear_model import LogisticRegression #回归线性模型
from sklearn.tree import DecisionTreeClassifier #决策树
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.svm import SVC #支持向量积
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc #评价指标
import warnings
# Typographic quotes (‘…’) are not valid Python string delimiters; plain ASCII
# quotes are required for these statements to parse.
warnings.filterwarnings('ignore')  # ignore warning messages
mpl.rcParams['font.family'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False  # step two: fixes minus-sign rendering on axes
该代码展示了一个脱发预测的机器学习项目流程。首先导入必要的Python库(pandas、matplotlib、sklearn等),加载脱发预测数据集。数据集除 Id 外包含11个特征变量(如遗传因素、荷尔蒙变化、医疗状况、压力水平等)和1个目标变量(Hair Loss)。接着使用随机森林算法进行建模(同时导入了支持向量机备用),并准备了模型评估指标(分类报告、混淆矩阵、ROC曲线等)。数据包含多种分类变量(如"Yes/No")和数值变量(年龄),部分字段存在"No Data"缺失值。该分析旨在通过机器学习方法识别导致脱发的关键因素。
更多推荐


所有评论(0)