@@ -0,0 +1,103 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn import preprocessing
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,roc_auc_score
|
||||
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
# Configure matplotlib so that Chinese text (SimHei font) and the minus sign
# render correctly in figures.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Read data_horse.csv (decoded as GBK) and preview the first rows.
df = pd.read_csv("data_horse.csv", encoding="gbk")
df.head()
|
||||
# Pairwise correlation matrix of the columns. It is computed once and reused
# below instead of being recomputed for every step.
corr_matrix = df.corr()
corr_matrix

# Correlation of every column with the target 'y', strongest positive first.
y_corr = corr_matrix['y']
y_corr.sort_values(ascending=False)

# Drop the columns whose absolute correlation with 'y' is not above 0.1.
# Columns are selected by label from the correlation index (not with a
# positional mask over df.columns), so the selection stays aligned even if
# corr() left a column out of the matrix. NaN correlations compare False and
# are therefore dropped as well.
# .copy() gives an independent frame so later column assignments on df are
# unambiguous.
df = df[y_corr.index[y_corr.abs() > 0.1]].copy()
df.head()

# Number of missing values per remaining column.
df.isnull().sum()
|
||||
# Plot the value counts of every column, one figure per column.
import warnings

# NOTE(review): this silences every warning for the rest of the run,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

for col in df.columns:
    plt.figure(figsize=(10, 5))
    plt.title(col)
    # Pass the series by keyword: in recent seaborn releases the first
    # positional parameter of countplot is `data`, not `x`.
    sns.countplot(x=df[col])
    # Rotate the tick labels after plotting so the rotation is applied to the
    # category labels that countplot has just created.
    plt.xticks(rotation=90)
    plt.show()
|
||||
|
||||
# Zeros in the feature columns are considered invalid readings, so they are
# converted to NaN. The target 'y' is excluded by name rather than by assuming
# it is the last column, so a legitimate class label 0 is never turned into a
# missing value.
feature_cols = df.columns.drop('y')
df[feature_cols] = df[feature_cols].replace(0, np.nan)
df.isnull().sum()

# Impute missing values. The original author observed that the rectal
# temperature column ('直肠温度') is approximately normally distributed, so it
# is filled with its mean; the other listed columns are filled with their
# median.
df['直肠温度'] = df['直肠温度'].fillna(df['直肠温度'].mean())
median_cols = ['脉搏', '红细胞体积', '总蛋白值']
df[median_cols] = df[median_cols].fillna(df[median_cols].median())
df.isnull().sum()

# Drop the rows that still contain a missing value in any remaining column.
df = df.dropna()
|
||||
# Plot the value counts of every column again, after cleaning and imputation.
for col in df.columns:
    plt.figure(figsize=(10, 5))
    plt.title(col)
    # Pass the series by keyword: in recent seaborn releases the first
    # positional parameter of countplot is `data`, not `x`.
    sns.countplot(x=df[col])
    # Rotate the tick labels after plotting so the rotation is applied to the
    # category labels that countplot has just created.
    plt.xticks(rotation=90)
    plt.show()
|
||||
|
||||
# The features have different value ranges, so they are standardized before
# modelling.
import time

# Hold out 30% of the rows for testing; random_state fixes the split.
# Features are every column except the target 'y'.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='y'), df['y'], test_size=0.3, random_state=0
)

# Fit the scaler on the training set only and reuse the same mean/std for the
# test set. Fitting a second scaler on the test data would scale the two sets
# differently and leak test-set statistics into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
|
||||
|
||||
# Train several classifiers with default hyper-parameters and collect, per
# model: test accuracy, confusion matrix, and fit+predict time in seconds.
model_result = {}

# Estimators with internal randomness get random_state=0 (the same seed used
# for the train/test split) so the comparison is reproducible between runs.
models = [
    RandomForestClassifier(random_state=0),
    BaggingClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    GaussianNB(),
    LogisticRegression(),
    DecisionTreeClassifier(random_state=0),
    SVC(),
    KNeighborsClassifier(),
]

for model in models:
    # Resolve the name before the try block so the except handler can always
    # report which model failed.
    model_name = type(model).__name__
    try:
        # perf_counter is a monotonic clock intended for measuring intervals.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        elapsed = time.perf_counter() - start
        # Store accuracy, confusion matrix and elapsed time for this model.
        model_result[model_name] = [
            accuracy_score(y_test, y_pred),
            confusion_matrix(y_test, y_pred),
            elapsed,
        ]
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(model_name, e)

# Collect the results in a DataFrame: one row per model, one column per
# metric. Labelling the rows before transposing also works when no model
# produced a result (empty table instead of a length-mismatch error).
df_result = pd.DataFrame(
    model_result, index=['accuracy_score', 'confusion_matrix', 'time']
).T
df_result.sort_values(by='accuracy_score', ascending=False)
|
||||
|
||||
# Step 1: import the estimator (it is also imported at the top of the file).
from sklearn.linear_model import LogisticRegression

# Steps 2-3: create a logistic regression model with default settings and
# train it on the training set (fit returns the fitted estimator).
model = LogisticRegression().fit(X_train, y_train)

# Score the fitted model on the held-out test set.
model.score(X_test, y_test)
|
||||
|
||||
# Train an RBF-kernel SVM and report its accuracy on the test set.
clf = SVC(kernel='rbf', C=1.0, random_state=0)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
svc_accuracy = accuracy_score(y_test, y_test_pred)
print('Accuracy: %.2f' % svc_accuracy)

# Draw the confusion matrix of the SVM predictions as an annotated heatmap.
cm = confusion_matrix(y_test, y_test_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted Label')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_title('Confusion Matrix')
plt.show()
|
||||
Reference in New Issue
Block a user