平时作业

Signed-off-by: 张朋飞 <13190662+z1479344137@user.noreply.gitee.com>
This commit is contained in:
张朋飞
2023-07-16 02:51:28 +00:00
committed by Gitee
parent 63fdac5217
commit 41ed38c406
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Make Chinese text render correctly in figures: select the SimHei font and
# disable the Unicode minus glyph so negative tick labels still display.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the dataset (the CSV is read with GBK encoding) and preview it.
df = pd.read_csv("data_horse.csv", encoding="gbk")
print(df.head())

# Pairwise correlation matrix; computed once and reused below.
corr_matrix = df.corr()
print(corr_matrix)

# Correlation of every column with the target 'y', strongest positive first.
corr_with_y = corr_matrix['y']
print(corr_with_y.sort_values(ascending=False))

# Drop columns whose absolute correlation with 'y' is not above 0.1.
# The selection is taken from the correlation's own index so the mask and the
# labels always line up. 'y' itself is retained when its self-correlation is
# 1, and the surviving columns keep their original relative order.
selected_columns = corr_with_y.index[corr_with_y.abs() > 0.1]
# Copy so later column assignments operate on an independent frame.
df = df[selected_columns].copy()
print(df.head())

# Missing-value count per remaining column.
print(df.isnull().sum())
# Visualize the value counts of every remaining column, one figure each.
import warnings

# NOTE(review): this silences ALL warnings for the rest of the script, not
# only plotting ones; narrow the filter if other warnings matter.
warnings.filterwarnings("ignore")
for column in df.columns:
    plt.figure(figsize=(10, 5))
    plt.title(column)
    # Request a 90-degree rotation for the x tick labels.
    plt.xticks(rotation=90)
    # Pass the series explicitly as `x`: newer seaborn releases treat the
    # first positional parameter as `data`, so a positional series is not
    # counted per category.
    sns.countplot(x=df[column])
    plt.show()
# Zeros in every column except the last are considered invalid readings and
# are converted to NaN so they can be imputed.
# NOTE(review): assumes the last column is the target 'y' -- confirm.
feature_columns = df.columns[:-1]
df[feature_columns] = df[feature_columns].replace(0, np.nan)
print(df.isnull().sum())

# Imputation strategy:
#   - '直肠温度' (rectal temperature) was observed by the author to look
#     normally distributed, so it is filled with its mean;
#   - '脉搏' (pulse), '红细胞体积' (packed cell volume) and '总蛋白值'
#     (total protein) are filled with their per-column medians.
df['直肠温度'] = df['直肠温度'].fillna(df['直肠温度'].mean())
median_columns = ['脉搏', '红细胞体积', '总蛋白值']
df[median_columns] = df[median_columns].fillna(df[median_columns].median())
print(df.isnull().sum())

# Drop any rows that still contain missing values after imputation.
df = df.dropna()
# Re-plot the value counts of every column after cleaning, one figure each.
for column in df.columns:
    plt.figure(figsize=(10, 5))
    plt.title(column)
    # Request a 90-degree rotation for the x tick labels.
    plt.xticks(rotation=90)
    # Pass the series explicitly as `x`: newer seaborn releases treat the
    # first positional parameter as `data`, so a positional series is not
    # counted per category.
    sns.countplot(x=df[column])
    plt.show()
# The features span different ranges, so they are standardized before
# modeling.
import time  # used by the model-timing loop that follows

# 70/30 train/test split with a fixed seed; features are every column except
# the last, and the target is 'y'.
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]], df['y'], test_size=0.3, random_state=0
)

# Fit the scaler on the training data only and reuse it for the test data.
# Fitting a second scaler on the test set would standardize the two sets with
# different means/variances and would use test-set statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train a set of default-configured classifiers and collect, per model,
# its test accuracy, confusion matrix, and fit+predict time in seconds.
model_result = {}
models = [
    RandomForestClassifier(),
    BaggingClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    SVC(),
    KNeighborsClassifier(),
]
for model in models:
    # Resolve the name before the try block so the except handler can
    # always report which model failed.
    model_name = type(model).__name__
    try:
        # perf_counter is a monotonic clock intended for measuring intervals.
        start = time.perf_counter()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        elapsed = time.perf_counter() - start
        # Store accuracy, confusion matrix and elapsed time.
        model_result[model_name] = [
            accuracy_score(y_test, y_pred),
            confusion_matrix(y_test, y_pred),
            elapsed,
        ]
    except Exception as e:
        # Deliberate best-effort: report the failing model and keep going
        # so one bad estimator does not abort the whole comparison.
        print(model_name, e)

# Collect the results in a DataFrame: one row per model.
df_result = pd.DataFrame(model_result).T
df_result.columns = ['accuracy_score', 'confusion_matrix', 'time']
# sort_values returns a new frame, so keep it and show the ranking.
df_result = df_result.sort_values(by='accuracy_score', ascending=False)
print(df_result)
# Step 1: import the algorithm.
from sklearn.linear_model import LogisticRegression
# Step 2: create the model (logistic regression with default settings).
model = LogisticRegression()
# Step 3: train the model on the training split.
model.fit(X_train, y_train)
# Report the accuracy on the test split. A bare `model.score(...)` expression
# is discarded when the file runs as a script, so it is printed here.
print('Accuracy: %.2f' % model.score(X_test, y_test))
# Train an RBF-kernel SVM, print its test accuracy, and draw the confusion
# matrix of its predictions as an annotated heatmap.
clf = SVC(kernel='rbf', C=1.0, random_state=0).fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_test_pred):.2f}')

cm = confusion_matrix(y_test, y_test_pred)
# heatmap returns the Axes it drew on; label that Axes directly.
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted Label')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_title('Confusion Matrix')
plt.show()