From 36a5eef4d6adf1435d43d80c79f1b3abaf7f9145 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8B=8F=E9=9D=A9=E5=B2=9A?=
 <13190755+sugotland@user.noreply.gitee.com>
Date: Sat, 15 Jul 2023 03:47:29 +0000
Subject: [PATCH] notebook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 苏革岚 <13190755+sugotland@user.noreply.gitee.com>
---
 第十三组/共享单车数据集/修修修改版.ipynb | 1237 ++++++++++++++++++++++
 1 file changed, 1237 insertions(+)
 create mode 100644 第十三组/共享单车数据集/修修修改版.ipynb

diff --git a/第十三组/共享单车数据集/修修修改版.ipynb b/第十三组/共享单车数据集/修修修改版.ipynb
new file mode 100644
index 0000000..e84a0d2
--- /dev/null
+++ b/第十三组/共享单车数据集/修修修改版.ipynb
@@ -0,0 +1,1237 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import plotly.express as px\n",
+    "import seaborn as sns\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor\n",
+    "from sklearn.linear_model import LinearRegression, Ridge\n",
+    "from sklearn.metrics import mean_squared_error, r2_score\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.svm import SVR\n",
+    "from sklearn.tree import DecisionTreeRegressor\n",
+    "\n",
+    "# Use the SimHei font so Chinese text in figures renders, and draw minus\n",
+    "# signs with the ASCII hyphen instead of the Unicode minus glyph.\n",
+    "plt.rcParams['font.sans-serif'] = ['SimHei']\n",
+    "plt.rcParams['axes.unicode_minus'] = False"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 数据预处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df = pd.read_csv('train.csv')\n",
+    "test_df = pd.read_csv('test.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "#train_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 拆分时间变量\n",
+    "train_df['date'] = train_df.datetime.apply(lambda c: c.split()[0])\n",
+    "train_df['hour'] = train_df.datetime.apply(lambda c: c.split()[1].split(':')[0]).astype('int')\n",
+    "train_df['year'] = train_df.datetime.apply(lambda c: c.split()[0].split('-')[0]).astype('int')\n",
+    "train_df['month'] = train_df.datetime.apply(lambda c: c.split()[0].split('-')[1]).astype('int')\n",
+    "\n",
+    "test_df['date'] = test_df.datetime.apply(lambda c: c.split()[0])\n",
+    "test_df['hour'] = test_df.datetime.apply(lambda c: c.split()[1].split(':')[0]).astype('int')\n",
+    "test_df['year'] = test_df.datetime.apply(lambda c: c.split()[0].split('-')[0]).astype('int')\n",
+    "test_df['month'] = test_df.datetime.apply(lambda c: c.split()[0].split('-')[1]).astype('int')\n",
+    "\n",
+    "\n",
+    "def get_weekday(x):\n",
+    "    date1 = x.split()[0]\n",
+    "    date2 = datetime.strptime(date1, '%Y-%m-%d')\n",
+    "    week_day = date2.weekday()\n",
+    "    return week_day\n",
+    "\n",
+    "train_df['weekday'] = train_df.datetime.apply(get_weekday)\n",
+    "test_df['weekday'] = test_df.datetime.apply(get_weekday)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#train_df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "得：无整行重复,无缺失值NAN，但可以评估列中某些值为零的数据（例如风速），以确认 0 是否为缺失值。"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#test_df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 异常值分析"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axes = plt.subplots(nrows=3,ncols=2)\n",
+    "fig.set_size_inches(15, 20)\n",
+    "sns.boxplot(data=train_df,y=\"count\",orient=\"v\",ax=axes[0][0])\n",
+    "sns.boxplot(data=train_df,y=\"count\",x=\"hour\",orient=\"v\",ax=axes[0][1])\n",
+    "sns.boxplot(data=train_df,y=\"count\",x=\"season\",orient=\"v\",ax=axes[1][0])\n",
+    "sns.boxplot(data=train_df,y=\"count\",x=\"workingday\",orient=\"v\",ax=axes[1][1])\n",
+    "sns.boxplot(data=train_df,y=\"count\",x=\"holiday\",orient=\"v\",ax=axes[2][0])\n",
+    "sns.boxplot(data=train_df,y=\"count\",x=\"weather\",orient=\"v\",ax=axes[2][1])\n",
+    "\n",
+    "axes[0][0].set(ylabel='Count',title=\"Count\")\n",
+    "axes[0][1].set(xlabel='Hour Of The Day', ylabel='Count',title=\"Count by Hour of the Day\")\n",
+    "axes[1][0].set(xlabel='Season', ylabel='Count',title=\"Count by Season\")\n",
+    "axes[1][1].set(xlabel='Working Day', ylabel='Count',title=\"Count by Working Day\")\n",
+    "axes[2][0].set(xlabel='Holiday', ylabel='Count',title=\"Count by Holiday\")\n",
+    "axes[2][1].set(xlabel='Weather', ylabel='Count',title=\"Count by Weather\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "图1（count）可以看出存在训练集异常值。  \n",
+    "图3（count by season）显示春季的自行车租赁量较低。  \n",
+    "图2（count by hour）显示最繁忙的时间段是早上7点到8点和下午5点到6点，这意味着用户主要租用自行车去上班/上学和下班回家。  \n",
+    "图4（count by working day），可发现大多数异常值都出现在工作日。图5（count by holiday）显示所有异常值都出现在非节假日。  \n",
+    "图6（count by weather）显示了大多数用户在天气晴朗和多云时租用自行车（1和2），而在大雨或下雪时几乎没有用户租用自行车（3）。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 训练集异常值处理"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "使用3σ规则来剔除异常值。如果有一个数据点落在平均值的三个标准差之外，则该数据点被视为离群点，可以从数据集中删除。这种方法假定数据呈正态分布，离群值为罕见事件，将来不太可能出现。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## 查找异常值\n",
+    "def detect_outliers(data):\n",
+    "    # mean, standard deviation and 3-sigma of the data\n",
+    "    mean = np.mean(data)\n",
+    "    std = np.std(data)\n",
+    "    threesigma = 3 * std\n",
+    "\n",
+    "    # print upper, lower boundary\n",
+    "    lower, upper = mean-3*std, mean+3*std\n",
+    "    print(f\"Upper and lower boundary is: {lower}/{upper}\")\n",
+    "\n",
+    "    # identify outliers and return the outliers\n",
+    "    outliers = [x for x in data if np.abs(x - mean) > threesigma]\n",
+    "    print(f\"There are {len(outliers)} outliers based on three-sigma rule\")\n",
+    "\n",
+    "## 删除异常值\n",
+    "def delete_outliers(data, df):\n",
+    "    # detecting and dropping outliers\n",
+    "    original_shape = df.shape\n",
+    "    mean = np.mean(data)\n",
+    "    std = np.std(data)\n",
+    "    outliers = np.abs(data-mean) > (3*std)\n",
+    "    outliers_num = len(train_df[outliers])\n",
+    "    df.drop(index=data[outliers].index, inplace=True)\n",
+    "\n",
+    "    # 输出结果对比\n",
+    "    print(\"Number of outliers deleted:\", outliers_num)\n",
+    "    print (\"Shape of dataframe with Ouliers: \",original_shape)\n",
+    "    print (\"Shape of Dataframe After Deleting the Ouliers: \",df.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Snapshot a copy: delete_outliers() drops rows from train_df in place, which a plain alias would share\n",
+    "train_with_outliers = train_df.copy()\n",
+    "detect_outliers(train_df['count'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 从训练集删除异常值\n",
+    "delete_outliers(train_df['count'], train_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 数值型特征异常值处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 查看temp（温度），atemp（体感温度），humidity（湿度）、windspeed（风速）等数值型数据的分布情况\n",
+    "fig, axes = plt.subplots(2, 2)\n",
+    "fig.set_size_inches(12,10)\n",
+    "\n",
+    "sns.distplot(train_df['temp'],ax=axes[0,0])\n",
+    "sns.distplot(train_df['atemp'],ax=axes[0,1])\n",
+    "sns.distplot(train_df['humidity'],ax=axes[1,0])\n",
+    "sns.distplot(train_df['windspeed'],ax=axes[1,1])\n",
+    "\n",
+    "axes[0,0].set(title='Distribution of temp',)\n",
+    "axes[0,1].set(title='Distribution of atemp')\n",
+    "axes[1,0].set(title='Distribution of humidity')\n",
+    "axes[1,1].set(title='Distribution of windspeed')\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "查看上图发现，风速和温度可能存在异常值。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### windspeed 风速异常值处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "a = train_df[train_df['windspeed'] == 0].shape[0]\n",
+    "print(f'风速为0的天数为：{a}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "风速为0天数过多，可能存在异常。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Treat windspeed == 0 as a missing reading: fill it by linear interpolation between the\n",
+    "## neighbouring non-zero readings, keeping every row and the original row order.\n",
+    "windspeed_filled = train_df['windspeed'].replace(0, np.nan).interpolate(limit_direction='both')\n",
+    "train_df = train_df.assign(windspeed=windspeed_filled)\n",
+    "assert (train_df['windspeed'] > 0).all(), 'windspeed still contains zero or missing values'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "a = train_df[train_df['windspeed'] == 0].shape[0]\n",
+    "print(f'风速为0的天数为：{a}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Humidity 湿度异常值处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "train_df[train_df['humidity'] == 0].shape[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "湿度值为0和100为异常数据，考虑可能有传感器错误情况。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 可视化分类数据分析"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.pairplot(train_df, x_vars=['holiday', 'workingday', 'weather', 'season', 'windspeed', 'humidity', 'temp', 'atemp'],\n",
+    "             y_vars=['casual', 'registered', 'count'], plot_kws={'alpha': 0.1})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.会员在工作日出行多，节假日出行少，临时用户则相反；  \n",
+    "2.一季度出行人数总体偏少；  \n",
+    "3.租赁数量随天气等级上升而减少；  \n",
+    "4.小时数对租赁情况影响明显，会员呈现两个高峰，非会员呈现一个正态分布；  \n",
+    "5.租赁数量随风速增大而减少；  \n",
+    "6.温度、湿度对非会员影响比较大，对会员影响较小。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## temp 温度对租赁数量的影响"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 按温度取租赁数量平均值\n",
+    "tempr = train_df.groupby(['temp'], as_index=True).agg({'casual': 'mean', 'registered': 'mean', 'count': 'mean'})\n",
+    "tempr.plot(title='Mean Count by Temp per Hour',figsize=(7,3))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：租车数量整体是随着温度的上升而升高的"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## atemp体感温度"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atempr = train_df.groupby(['atemp'], as_index=True).agg({'casual': 'mean',\n",
+    "                                                     'registered': 'mean',\n",
+    "                                                     'count': 'mean'})\n",
+    "atempr.plot(title='Mean Count by Atemp per Hour',figsize=(7,3))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：租车数量随体感温度的升高而上升，温度太高时又会下降"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 湿度对租赁数量影响"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "humidityr = train_df.groupby(['humidity'], as_index=True).agg({'casual': 'mean',\n",
+    "                                                           'registered': 'mean',\n",
+    "                                                           'count': 'mean'})\n",
+    "\n",
+    "humidityr.plot(title='Mean Count by Humidity per Hour',figsize=(7,3))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：租车数量在湿度为20的时候达到高峰，之后逐渐递减"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 年份、月份对租赁数量的影响"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "day_df=train_df.groupby('date').agg({'year':'mean','month':'mean',\n",
+    "                                      'casual':'sum', 'registered':'sum'\n",
+    "                                      ,'count':'sum'})\n",
+    "month_df = day_df.groupby(['year','month'], as_index=True).agg({'casual':'mean',\n",
+    "                                                                  'registered':'mean',                                                                 'count':'mean'})\n",
+    "month_df.plot(fontsize=8,figsize=(15,4))\n",
+    "tick=list(range(24))\n",
+    "labels=['2011.1','2011.2','2011.3','2011.4','2011.5','2011.6','2011.7','2011.8','2011.9','2011.10','2011.11','2011.12','2012.1','2012.2','2012.3','2012.4','2012.5','2012.6','2012.7','2012.8','2012.9','2012.10','2012.11','2012.12']\n",
+    "plt.xticks(tick,labels,fontsize=8)\n",
+    "plt.title('Mean Count per month in two years', fontsize=15)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 按天\n",
+    "day_df1=train_df.groupby('date').agg({\n",
+    "                                      'casual':'sum', 'registered':'sum'\n",
+    "                                      ,'count':'sum'})\n",
+    "day_df1.plot(fontsize=8,figsize=(8,4),linewidth=0.8)\n",
+    "plt.title('Mean Count per day in two years', fontsize=15 )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(15,4))\n",
+    "# 按年\n",
+    "plt.subplot(1,3,1)\n",
+    "sns.barplot(x=train_df['year'],y=train_df['count'])\n",
+    "plt.title('by year')\n",
+    "\n",
+    "# 按季度\n",
+    "plt.subplot(1,3,2)\n",
+    "sns.barplot(x=train_df['season'],y=train_df['count'])\n",
+    "plt.title('by season')\n",
+    "\n",
+    "# 按月\n",
+    "plt.subplot(1,3,3)\n",
+    "sns.barplot(x=train_df['month'],y=train_df['count'])\n",
+    "plt.title('by month')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "由上分析：  \n",
+    "1、2012年整体租车数量高于2011年。  \n",
+    "2、每年租车数量1月份最低，在6月租车数量达到最高。前半年数量上升，后半年从7月份左右开始下降。  \n",
+    "3、2012年整体波动比2011年剧烈。  \n",
+    "4、有很多局部波谷值。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 时段对租赁数量的影响"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "fig,(ax1,ax2,ax3,ax4)= plt.subplots(nrows=4)\n",
+    "fig.set_size_inches(12,18)\n",
+    "train_df = train_df.reset_index(drop=True)\n",
+    "\n",
+    "sns.pointplot(x=train_df['hour'],y=train_df['count'],hue=train_df['season'],join=True, palette=\"Paired\", ax=ax1)\n",
+    "\n",
+    "sns.pointplot(x=train_df['hour'],y=train_df['count'],hue=train_df['weekday'],join=True, palette=\"Paired\", ax=ax2)\n",
+    "\n",
+    "sns.lineplot(x=train_df['hour'], y=train_df['registered'], color='red',label='Registered', marker='o', ax=ax3, ci=None)\n",
+    "sns.lineplot(x=train_df['hour'], y=train_df['casual'], color='blue', label='Casual', marker='o', ax=ax3, ci=None)\n",
+    "\n",
+    "sns.pointplot(x=train_df['hour'], y=train_df['count'], hue=train_df['weather'], ax=ax4);\n",
+    "\n",
+    "\n",
+    "ax1.set(xlabel='Hour of the Day', ylabel='Mean Count',title=\"Mean Count by Season per Hour\")\n",
+    "ax2.set(xlabel='Hour of the Day', ylabel='Mean Count',title=\"Mean Count by Weekday per Hour\")\n",
+    "ax3.set(xlabel='Hour of the Day', ylabel='Mean Count',title=\"Mean Count by Casual VS Registered per Hour\")\n",
+    "ax4.set(xlabel='Hour of the Day', ylabel='Mean Count',title=\"Mean Count by Weather per Hour\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "图1显示：在所有季节中，春季是大雨/大雪天数最多的季节，这直接影响了租赁需求的减少，秋季和夏季往往是租车人数最多的季节。  \n",
+    "图2显示：周六和周日的租车模式与平日明显不同，大多数用户在上午11点至下午4点租车。这些周末用户租车是为了休闲，而不是为了上班（上午8点和下午6点）。  \n",
+    "图3显示：大多数用户是注册用户，租用时间在上午7-8点和下午4-5点之间，临时用户的租用时间在白天分布较好，但最繁忙的时间在下午1-2点之间。   \n",
+    "图4显示：天气1和天气2情况下，租赁需求较多，天气3和天气4租赁需求较少，均仍集中在上下班高峰期。   "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 天气对租赁情况影响"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "weather_count = train_df.groupby('weather')\n",
+    "weather_count[['casual', 'registered', 'count']].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "weather_df = train_df.groupby('weather', as_index=True).agg({'casual': 'mean', 'registered': 'mean', 'count': 'mean'})\n",
+    "weather_df.plot.bar(stacked=True, title='Mean Count by Weather of Casual VS Registered', color=['#c63d40', '#dc8f75', '#ffddb7'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：  \n",
+    "1、总体租车数量第1种天气最多，之后是第2，第4种，第3种。  \n",
+    "2、会员租车数量与总体租车数量变化相同。  \n",
+    "3、临时租车数量是按照天气从好到坏的顺序依次递减。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 风速对租借情况影响"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# 考虑到风速特别大的时候很少，如果取平均值会出现异常，所以按风速对租赁数量取最大值。\n",
+    "windspeedr = train_df.groupby(['windspeed'], as_index=True).agg({'casual': 'max', 'registered': 'max', 'count': 'max'})\n",
+    "windspeedr.plot(title='Max Count by Windspeed per Hour',figsize=(8,4))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：风速在30以后，风速越大，相对应的租赁数就越少。但在风速43-44附近出现了异常情况。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = train_df[train_df['windspeed'] > 40]\n",
+    "df2 = df2[df2['count'] > 400]\n",
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "因为是下班高峰期的时间"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 工作日与非工作日的租赁情况"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "day_df=train_df.groupby('date').agg({'year':'mean','season':'mean',\n",
+    "                                      'casual':'sum', 'registered':'sum'\n",
+    "                                      ,'count':'sum','temp':'mean',\n",
+    "                                      'atemp':'mean','workingday':'mean'})\n",
+    "#day_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 对工作日和非工作日租赁数量取了平均值，对一周中每天的租赁数量求和\n",
+    "workingday_df = day_df.groupby(['workingday'], as_index=True).agg({'casual': 'mean', 'registered': 'mean'})\n",
+    "#workingday_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workingday_df1 = workingday_df.loc[0]\n",
+    "workingday_df2 = workingday_df.loc[1]\n",
+    "#workingday_df1.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 绘制非工作日与工作日的临时借车数量和会员借车数量堆积图\n",
+    "width = 0.3\n",
+    "p1 = plt.bar(workingday_df.index, workingday_df['casual'], width,color='#dc8f75')\n",
+    "p2 = plt.bar(workingday_df.index, workingday_df['registered'], width, bottom=workingday_df['casual'],color='#ffddb7')\n",
+    "plt.title('Mean Count per Day')\n",
+    "plt.xticks([0, 1])#, ('non-working day', 'working day'))\n",
+    "plt.legend((p1[0], p2[0]), ('casual', 'registered'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 绘制非工作日临时借车数量和会员借车数量占比饼图\n",
+    "plt.subplot2grid((2, 2), (0, 0))\n",
+    "plt.pie(workingday_df1, labels=['casual', 'registered'], autopct='%1.1f%%',\n",
+    "        pctdistance=0.6, labeldistance=1.35, radius=1.3,colors=['#c63d40','#ffddb7'])\n",
+    "plt.axis('equal')\n",
+    "plt.title('non-working day')\n",
+    "\n",
+    "# 绘制工作日临时借车数量和会员借车数量占比饼图\n",
+    "plt.subplot2grid((2, 2), (0, 1))\n",
+    "plt.pie(workingday_df2, labels=['casual', 'registered'], autopct='%1.1f%%',\n",
+    "        pctdistance=0.6, labeldistance=1.35, radius=1.3,colors=['#c63d40','#ffddb7'])\n",
+    "plt.title('working day')\n",
+    "plt.axis('equal')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：  \n",
+    "非工作日时，会员租车数量占68.5%，临时租车数量占31.5%。  \n",
+    "工作日时，会员租车数量占87.0%，临时租车数量占13%。  \n",
+    "工作日的总租车数量略高于非工作日，但非工作日时临时租车数量比工作日多。  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# 查看工作日与非工作日每小时的临时租赁数量，会员租赁数量和总数量的平均值\n",
+    "workingday_df=train_df[train_df['workingday']==1]\n",
+    "workingday_df = workingday_df.groupby(['hour'], as_index=True).agg({'casual':'mean',\n",
+    "                                                                    'registered':'mean',\n",
+    "                                                                    'count':'mean'})\n",
+    "\n",
+    "nworkingday_df=train_df[train_df['workingday']==0]\n",
+    "nworkingday_df = nworkingday_df.groupby(['hour'], as_index=True).agg({'casual':'mean',\n",
+    "                                                                      'registered':'mean',\n",
+    "                                                                      'count':'mean'})\n",
+    "fig, axes = plt.subplots(1, 2,sharey = True)\n",
+    "\n",
+    "workingday_df.plot(figsize=(15,5),title = 'Mean Count per Hour in Working Day',ax=axes[0])\n",
+    "nworkingday_df.plot(figsize=(15,5),title = 'Mean Count per Hour in non-Working Day',ax=axes[1])\n",
+    "\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "分析：  \n",
+    "1、周末会员与非会员用户的情况相似。这可能表明，注册用户是在工作日使用自行车的人，而临时用户在周末租车，在工作日租车时，不需要作为上班的交通工具。   \n",
+    "2、周末会员用户租赁数量降低，临时用户租赁数量增加。  \n",
+    "3、工作日会员用户出行数量较多，临时用户出行数量较少"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 节假日的租赁情况"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "day_df=train_df.groupby('date').agg({'year':'mean',\n",
+    "                                      'casual':'sum', 'registered':'sum'\n",
+    "                                      ,'count':'sum','holiday':'mean'})\n",
+    "holiday_count=day_df.groupby('year', as_index=True).agg({'holiday':'sum'})\n",
+    "#holiday_count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "holiday_df = day_df.groupby('holiday', as_index=True).agg({'casual':'mean', 'registered':'mean'})\n",
+    "holiday_df.plot.bar(stacked=True , title = 'Mean Count by Holiday or not per DAY',cmap='Set3')\n",
+    "plt.xticks(rotation=360)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 会员与非会员"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "number_pei=day_df[['casual','registered']].mean()\n",
+    "#number_pei"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "plt.axes(aspect='equal')\n",
+    "plt.pie(number_pei, labels=['casual','registered'], autopct='%1.1f%%',\n",
+    "        pctdistance=0.6 , labeldistance=1.05 , radius=1 ,colors=['#c63d40','#ffddb7'])\n",
+    "\n",
+    "plt.title('Count of Casual VS Registered')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 正态性假设"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "因变量的正态性假设并不总是必要的，许多机器学习算法（如决策树、随机森林和神经网络）并不依赖于此假设。\n",
+    "但当输入特征或预测变量近似呈正态分布时，某些算法可能会表现更好。如果因变量表现出极端偏度或异常值，则应用变换或使用对非正态性更稳健的替代模型可能是有益的。\n",
+    "故下面计算不同因变量的偏度。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 查看temp（温度），atemp（体感温度），humidity（湿度）、windspeed（风速）等数值型数据的分布情况\n",
+    "fig, axes = plt.subplots(2, 2)\n",
+    "fig.set_size_inches(12,10)\n",
+    "\n",
+    "sns.distplot(train_df['temp'],ax=axes[0,0])\n",
+    "sns.distplot(train_df['atemp'],ax=axes[0,1])\n",
+    "sns.distplot(train_df['humidity'],ax=axes[1,0])\n",
+    "sns.distplot(train_df['windspeed'],ax=axes[1,1])\n",
+    "\n",
+    "axes[0,0].set(title='Distribution of temp',)\n",
+    "axes[0,1].set(title='Distribution of atemp')\n",
+    "axes[1,0].set(title='Distribution of humidity')\n",
+    "axes[1,1].set(title='Distribution of windspeed')\n",
+    "\n",
+    "plt.show()\n",
+    "\n",
+    "print(f\"temp偏度为：{train_df['temp'].skew()}\")\n",
+    "print(f\"atemp偏度为：{train_df['atemp'].skew()}\")\n",
+    "print(f\"windspeed偏度为：{train_df['windspeed'].skew()}\")\n",
+    "print(f\"humidity偏度为：{train_df['humidity'].skew()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 相关性分析"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "train_df.select_dtypes(include='number').corr()  # numeric columns only; 'datetime' and 'date' hold strings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(13, 10))\n",
+    "sns.heatmap(train_df.select_dtypes(include='number').corr(), annot=True, cmap='RdBu')  # numeric columns only\n",
+    "plt.title('Heatmap on Correlation', fontsize=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Correlation of the rental count with every other numeric column, sorted in descending order\n",
+    "train_df.select_dtypes(include='number').corr()['count'].sort_values(ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "从图中可以看出，temp和atemp之间的相关性较高，为0.99，season和month之间为0.97，同样不低，即存在很强的多重共线性。在进行建立模型预测时，选择保留和count相关性高的变量，剔除atemp和season，以免导致因多重共线性造成的过拟合。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 模型"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 数据准备"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_final = train_df.drop(columns=['count','casual', 'registered','datetime','date','atemp','month']).values  # 最终模型训练数据"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#test_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_date_test = test_df['datetime'].values #提交结果时间栏\n",
+    "X_test_final = test_df.drop(columns=['datetime','date','atemp','month']).values # 预测数据"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 因变量\n",
+    "y = train_df['count'].apply(lambda x: np.log1p(x)).values\n",
+    "y_casual = train_df['casual'].apply(lambda x: np.log1p(x)).values\n",
+    "y_registered = train_df['registered'].apply(lambda x: np.log1p(x)).values\n",
+    "\n",
+    "# 划分训练集与测试集\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X_train_final, y, test_size=0.2, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def model_performance_evaluation(model_name, test, pred):\n",
+    "    \"\"\"Print the mean squared error, then the R^2 score, of `pred` against `test`.\"\"\"\n",
+    "    for label, metric in (('| 均方误差: %.4f', mean_squared_error), ('| 决定系数: %.4f', r2_score)):\n",
+    "        print(model_name, label % metric(test, pred))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 决策树模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DT = DecisionTreeRegressor()\n",
+    "DT.fit(X_train, y_train)\n",
+    "y_pred = DT.predict(X_test)\n",
+    "\n",
+    "# 可视化y_test与y_pred的差异\n",
+    "plt.figure(figsize=(300,12))\n",
+    "plt.plot(range(len(y_pred)), y_pred, 'b', label=\"predict\")\n",
+    "plt.plot(range(len(y_test)), y_test, 'r', label=\"test\")\n",
+    "plt.legend(loc=\"upper right\")\n",
+    "\n",
+    "# 模型评估\n",
+    "model_performance_evaluation('决策树模型', y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "注：在决策树模型中，多次对同一数据进行机器学习，最终预测结果可能会有所不同。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 线性回归模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LR = LinearRegression()\n",
+    "LR.fit(X_train,y_train)\n",
+    "y_pred = LR.predict(X_test)\n",
+    "## 可视化下y_test与y_pred的差异\n",
+    "plt.figure(figsize=(300,6))\n",
+    "plt.plot(range(len(y_pred)),y_pred,'b',label=\"predict\")\n",
+    "plt.plot(range(len(y_test)),y_test,'r',label=\"test\")\n",
+    "plt.legend(loc=\"upper right\") #显示图中的标签\n",
+    "\n",
+    "## 模型评估\n",
+    "model_performance_evaluation('线性回归模型', y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 随机森林回归模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RF = RandomForestRegressor(n_estimators=100, random_state=42)\n",
+    "RF.fit(X_train, y_train)\n",
+    "y_pred = RF.predict(X_test)\n",
+    "## 可视化下y_test与y_pred的差异\n",
+    "plt.figure(figsize=(300,6))\n",
+    "plt.plot(range(len(y_pred)),y_pred,'b',label=\"predict\")\n",
+    "plt.plot(range(len(y_test)),y_test,'r',label=\"test\")\n",
+    "plt.legend(loc=\"upper right\") #显示图中的标签\n",
+    "\n",
+    "## 模型评估\n",
+    "model_performance_evaluation('随机森林回归模型', y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 逻辑回归模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "LR = LogisticRegression()\n",
+    "LR.fit(X_train,y_train.astype('int'))\n",
+    "y_pred = LR.predict(X_test)\n",
+    "## 可视化下y_test与y_pred的差异\n",
+    "plt.figure(figsize=(300,6))\n",
+    "plt.plot(range(len(y_pred)),y_pred,'b',label=\"predict\")\n",
+    "plt.plot(range(len(y_test)),y_test,'r',label=\"test\")\n",
+    "plt.legend(loc=\"upper right\") #显示图中的标签\n",
+    "\n",
+    "## 模型评估\n",
+    "model_performance_evaluation('逻辑回归模型', y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 神经网络回归模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.neural_network import MLPRegressor\n",
+    "# The target is the continuous log1p(count), so fit a regressor on it directly (no integer truncation)\n",
+    "mlp = MLPRegressor(random_state=42)\n",
+    "mlp.fit(X_train,y_train)\n",
+    "y_pred = mlp.predict(X_test)\n",
+    "## Plot y_pred against y_test to visualise the difference\n",
+    "plt.figure(figsize=(300,6))\n",
+    "plt.plot(range(len(y_pred)),y_pred,'b',label=\"predict\")\n",
+    "plt.plot(range(len(y_test)),y_test,'r',label=\"test\")\n",
+    "plt.legend(loc=\"upper right\") # show the series labels\n",
+    "## Model evaluation\n",
+    "model_performance_evaluation('神经网络回归模型', y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 支持向量回归"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit a support vector regressor (SVR is imported in the first cell) rather than a Ridge model\n",
+    "svr = SVR()\n",
+    "svr.fit(X_train,y_train)\n",
+    "y_pred = svr.predict(X_test)\n",
+    "## Plot y_pred against y_test to visualise the difference\n",
+    "plt.figure(figsize=(10,6))\n",
+    "plt.plot(range(len(y_pred)),y_pred,'b',label=\"predict\")\n",
+    "plt.plot(range(len(y_test)),y_test,'r',label=\"test\")\n",
+    "plt.legend(loc=\"upper right\") # show the series labels\n",
+    "## Model evaluation\n",
+    "model_performance_evaluation('支持向量回归模型', y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "根据模型评估结果，随机森林回归模型均方误差最小，决定系数最大，拟合效果最好，因此选择随机森林回归模型为最终模型。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 结果输出"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Write the submission with the selected model (random forest): casual and registered demand are\n",
+    "## fitted by two separate estimators on log1p targets, inverted with expm1 and summed.\n",
+    "rf_casual = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_final, y_casual)\n",
+    "rf_registered = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_final, y_registered)\n",
+    "prediction = np.expm1(rf_casual.predict(X_test_final)) + np.expm1(rf_registered.predict(X_test_final))\n",
+    "submit = pd.DataFrame({'datetime':x_date_test,'count':prediction})\n",
+    "submit.to_csv('submission.csv',index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}