"""Load ``data_horse.csv``, select features by correlation with ``y`` and clean them.

Notebook-style script (added as ``horse_analysis.py``). Bare expressions such as
``df.head()`` only display something when the code is run interactively; they
are kept so that cell-by-cell execution behaves as before.
"""
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Use a font with CJK glyphs and keep the minus sign renderable in plots.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


def _plot_value_counts(frame):
    """Show one count plot per column of ``frame``, titled with the column name."""
    for column in frame.columns:
        plt.figure(figsize=(10, 5))
        plt.title(column)
        # Rotate the x tick labels by 90 degrees.
        plt.xticks(rotation=90)
        # Pass the column as ``x`` explicitly: in recent seaborn releases the
        # only positional parameter of countplot is ``data``.
        sns.countplot(x=df_column(frame, column))
        plt.show()


def df_column(frame, column):
    """Return the single column ``column`` of ``frame`` as a Series."""
    return frame[column]


df = pd.read_csv("data_horse.csv", encoding="gbk")
df.head()

# Pairwise correlations, computed once and reused below.
corr = df.corr()
corr
# Correlation of every column with the target 'y', strongest first.
corr_with_y = corr['y']
corr_with_y.sort_values(ascending=False)
# Keep only the columns whose absolute correlation with 'y' exceeds 0.1.
# Selecting by label (not by a positional mask) stays correct even if corr()
# covers fewer columns than df has; .copy() makes the later column
# assignments operate on an independent frame.
selected_cols = corr_with_y.index[corr_with_y.abs() > 0.1]
df = df[selected_cols].copy()
df.head()
df.isnull().sum()

# Visualise the distribution of every remaining column.
warnings.filterwarnings("ignore")
_plot_value_counts(df)

# Zeros in the feature columns are considered implausible measurements, so
# they are replaced with NaN. The label column 'y' is excluded by name.
feature_cols = df.columns.drop('y')
df[feature_cols] = df[feature_cols].replace(0, np.nan)
df.isnull().sum()

# Impute missing values: the rectal-temperature column was observed to be
# roughly normally distributed, so it is filled with its mean; the other
# listed columns are filled with their medians.
df['直肠温度'] = df['直肠温度'].fillna(df['直肠温度'].mean())
median_cols = ['脉搏', '红细胞体积', '总蛋白值']
df[median_cols] = df[median_cols].fillna(df[median_cols].median())
df.isnull().sum()

# Drop any rows that still contain missing values.
df = df.dropna()

# Look at the column distributions again after cleaning.
_plot_value_counts(df)

# The features have different value ranges, so the data is standardised
# before modelling.
# Split into features and label (column 'y'), holding out 30% of the rows
# for testing with a fixed seed.
features = df.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(
    features, df['y'], test_size=0.3, random_state=0
)

# Fit the scaler on the training split only and apply the same transform to
# the test split. Fitting a second scaler on the test data would put the two
# splits on different scales and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Maps model class name -> [accuracy, confusion matrix, elapsed seconds].
model_result = {}

models = [
    RandomForestClassifier(),
    BaggingClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    SVC(),
    KNeighborsClassifier(),
]
for model in models:
    # Bind the name before the try block so the handler can always report it.
    model_name = type(model).__name__
    try:
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring durations.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        elapsed = time.perf_counter() - start
        # Store the accuracy, the confusion matrix and the fit+predict time.
        model_result[model_name] = [
            accuracy_score(y_test, y_pred),
            confusion_matrix(y_test, y_pred),
            elapsed,
        ]
    except Exception as e:
        # Best effort: report the failing model and continue with the rest.
        print(model_name, e)

# Collect the results in a DataFrame, one row per model. from_dict with
# explicit columns also yields a well-formed (empty) frame if every model failed.
df_result = pd.DataFrame.from_dict(
    model_result,
    orient='index',
    columns=['accuracy_score', 'confusion_matrix', 'time'],
)
df_result.sort_values(by='accuracy_score', ascending=False)

# Step 1: import the algorithm.
from sklearn.linear_model import LogisticRegression
# Step 2: create the model (logistic regression).
model = LogisticRegression()
# Step 3: train the model.
model.fit(X_train, y_train)
# Check the model's accuracy on the test split.
model.score(X_test, y_test)

# Train an RBF-kernel support vector classifier and report its test accuracy.
clf = SVC(kernel='rbf', C=1.0, random_state=0)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_test_pred))

# Draw the SVC confusion matrix as an annotated heatmap with integer counts.
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()