From ad08596d79fe1cf4d66df230e4be42d640bc73be Mon Sep 17 00:00:00 2001
From: zmm <13190795+zmm020425@user.noreply.gitee.com>
Date: Fri, 14 Jul 2023 06:12:56 +0000
Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20?=
 =?UTF-8?q?=E5=85=B1=E4=BA=AB=E5=8D=95=E8=BD=A6=E6=95=B0=E6=8D=AE=E5=88=86?=
 =?UTF-8?q?=E6=9E=90/=E6=95=B0=E6=8D=AE=E9=A2=84=E5=A4=84=E7=90=86?=
 =?UTF-8?q?=E3=80=81=E7=9B=B8=E5=85=B3=E6=80=A7=E5=8F=8A=E5=8F=AF=E8=A7=86?=
 =?UTF-8?q?=E5=8C=96=E5=88=86=E6=9E=90.ipynb?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../数据预处理、相关性及可视化分析.ipynb      | 1046 -----------------
 1 file changed, 1046 deletions(-)
 delete mode 100644 共享单车数据分析/数据预处理、相关性及可视化分析.ipynb

diff --git a/共享单车数据分析/数据预处理、相关性及可视化分析.ipynb b/共享单车数据分析/数据预处理、相关性及可视化分析.ipynb
deleted file mode 100644
index 361fd2d..0000000
--- a/共享单车数据分析/数据预处理、相关性及可视化分析.ipynb
+++ /dev/null
@@ -1,1046 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "import plotly.express as px\n",
-    "from sklearn.metrics import mean_squared_log_error, r2_score\n",
-    "from sklearn.linear_model import LinearRegression, Ridge\n",
-    "from sklearn.preprocessing import StandardScaler\n",
-    "from sklearn.decomposition import PCA\n",
-    "from sklearn.ensemble import AdaBoostRegressor\n",
-    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
-    "from sklearn.svm import SVR\n",
-    "from datetime import datetime\n",
-    "\n",
-    "# 设置中文支持\n",
-    "plt.rcParams['font.sans-serif'] = ['SimHei']\n",
-    "plt.rcParams['axes.unicode_minus'] = False"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 导入数据"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_df = pd.read_csv('train.csv')\n",
-    "test_df = pd.read_csv('test.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_df.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "test_df.info()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "得：无整行重复,无缺失值NAN，但可以评估列中某些值为零的数据（例如风速），以确认 0 是否为缺失值。"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 合并训练集和测试集"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(type(train_df))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#full = train_df.append(test_df, ignore_index=True)\n",
-    "full=pd.concat([train_df,test_df],ignore_index=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "full.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 拆分时间变量\n",
-    "full['date'] = full.datetime.apply(lambda c: c.split()[0])\n",
-    "full['hour'] = full.datetime.apply(lambda c: c.split()[1].split(':')[0]).astype('int')\n",
-    "full['year'] = full.datetime.apply(lambda c: c.split()[0].split('-')[0]).astype('int')\n",
-    "full['month'] = full.datetime.apply(lambda c: c.split()[0].split('-')[1]).astype('int')\n",
-    "\n",
-    "\n",
-    "def get_weekday(x):\n",
-    "    date1 = x.split()[0]\n",
-    "    date2 = datetime.strptime(date1, '%Y-%m-%d')\n",
-    "    week_day = date2.weekday()\n",
-    "    return week_day\n",
-    "\n",
-    "full['weekday'] = full.datetime.apply(get_weekday)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "full.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 异常值处理"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fig, axes = plt.subplots(nrows=3,ncols=2)\n",
-    "fig.set_size_inches(15, 20)\n",
-    "sns.boxplot(data=full,y=\"count\",orient=\"v\",ax=axes[0][0])\n",
-    "sns.boxplot(data=full,y=\"count\",x=\"hour\",orient=\"v\",ax=axes[0][1])\n",
-    "sns.boxplot(data=full,y=\"count\",x=\"season\",orient=\"v\",ax=axes[1][0])\n",
-    "sns.boxplot(data=full,y=\"count\",x=\"workingday\",orient=\"v\",ax=axes[1][1])\n",
-    "sns.boxplot(data=full,y=\"count\",x=\"holiday\",orient=\"v\",ax=axes[2][0])\n",
-    "sns.boxplot(data=full,y=\"count\",x=\"weather\",orient=\"v\",ax=axes[2][1])\n",
-    "\n",
-    "axes[0][0].set(ylabel='Count',title=\"Count\")\n",
-    "axes[0][1].set(xlabel='Hour Of The Day', ylabel='Count',title=\"Count by Hour of the Day\")\n",
-    "axes[1][0].set(xlabel='Season', ylabel='Count',title=\"Count by Season\")\n",
-    "axes[1][1].set(xlabel='Working Day', ylabel='Count',title=\"Count by Working Day\")\n",
-    "axes[2][0].set(xlabel='Holiday', ylabel='Count',title=\"Count by Holiday\")\n",
-    "axes[2][1].set(xlabel='Weather', ylabel='Count',title=\"Count by Weather\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "季节对比显示，春季的自行车租赁量较低。\n",
-    "按小时绘制的方框图信息量最大：它显示最繁忙的时间段是早上7点到8点和下午5点到6点，这意味着用户主要在上班/上学和下班回家时租用自行车。\n",
-    "根据方框图4和5（工作日和节假日），我们发现大多数异常值都出现在工作日。从节假日来看，结果是合理的，因为所有异常值都出现在非节假日。\n",
-    "最后一个方框图显示了一个显而易见的常识，即大多数用户在天气晴朗和多云时租用自行车（1和2），而在大雨或下雪时几乎没有用户租用自行车（3）。"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "使用3σ规则来剔除异常值。如果有一个数据点落在平均值的三个标准差之外，则该数据点被视为离群点，可以从数据集中删除。这种方法假定数据呈正态分布，离群值为罕见事件，将来不太可能出现。"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Function to detect and print outliers\n",
-    "def detect_outliers(data):\n",
-    "    \"\"\"Show a boxplot of `data` and print its three-sigma bounds and outlier count.\n",
-    "\n",
-    "    data: numeric pandas Series (or another array-like numpy/seaborn accept).\n",
-    "    Returns None; the results are only plotted and printed.\n",
-    "    \"\"\"\n",
-    "    # mean, standard deviation and 3-sigma of the data\n",
-    "    mean = np.mean(data)\n",
-    "    std = np.std(data)\n",
-    "    threesigma = 3 * std\n",
-    "\n",
-    "    # boxplot, then the upper and lower boundary\n",
-    "    sns.boxplot(data, orient=\"v\")\n",
-    "    plt.show()\n",
-    "    lower, upper = mean - threesigma, mean + threesigma\n",
-    "    # print the values in the order the label announces them (upper first)\n",
-    "    print(f\"Upper and lower boundary is: {upper}/{lower}\")\n",
-    "\n",
-    "    # count the points lying more than three standard deviations from the mean\n",
-    "    outlier_count = int(np.sum(np.abs(np.asarray(data) - mean) > threesigma))\n",
-    "    print(f\"There are {outlier_count} outliers based on three-sigma rule\")\n",
-    "\n",
-    "# Function to delete the outliers\n",
-    "def delete_outliers(data, df):\n",
-    "    \"\"\"Drop, in place, the rows of `df` whose `data` value is a three-sigma outlier.\n",
-    "\n",
-    "    data: numeric Series sharing its index with `df` (typically a column of `df`).\n",
-    "    df: DataFrame to clean; it is modified in place and nothing is returned.\n",
-    "    \"\"\"\n",
-    "    # detecting and dropping outliers\n",
-    "    original_shape = df.shape\n",
-    "    mean = np.mean(data)\n",
-    "    std = np.std(data)\n",
-    "    outliers = np.abs(data - mean) > (3 * std)\n",
-    "    # count from the mask itself rather than indexing the global train_df,\n",
-    "    # so the function works for whichever dataframe is passed in\n",
-    "    outliers_num = int(outliers.sum())\n",
-    "    df.drop(index=data[outliers].index, inplace=True)\n",
-    "\n",
-    "    # print what was deleted\n",
-    "    print(\"Number of outliers deleted:\", outliers_num)\n",
-    "    print (\"Shape of dataframe with Ouliers: \",original_shape)\n",
-    "    print (\"Shape of Dataframe After Deleting the Ouliers: \",df.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# keep an independent snapshot: delete_outliers() later drops rows from train_df in place,\n",
-    "# which would also remove them from a plain alias\n",
-    "train_with_outliers = train_df.copy()\n",
-    "# look for outliers in the rental count\n",
-    "detect_outliers(train_df['count'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 从训练集删除异常值\n",
-    "delete_outliers(train_df['count'], train_df)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "对于大多数机器学习技术来说，因变量的正态性假设并不总是必要或可取的。虽然一些传统的统计模型假设因变量的正态性，但许多机器学习算法（如决策树、随机森林和神经网络）并不依赖于此假设。\n",
-    "\n",
-    "但是，请务必注意，当输入特征或预测变量近似呈正态分布时，某些算法可能会假设或表现更好。如果因变量表现出极端偏度或异常值，则应用变换或使用对非正态性更稳健的替代模型可能是有益的"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## temp 温度"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(f\"偏度为：{train_df['temp'].skew()}\")\n",
-    "full['temp'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sns.histplot(full['temp'], kde=True, stat='density')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## atemp 体感温度"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(f\"偏度为：{train_df['atemp'].skew()}\")\n",
-    "full['atemp'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sns.histplot(full['atemp'], kde=True, stat='density')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## windspeed 风速"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sns.histplot(full['windspeed'], kde=True, stat='density')\n",
-    "a = full[full['windspeed'] == 0].shape[0]\n",
-    "print(f'风速为0的天数为：{a}')\n",
-    "print(f\"偏度为：{train_df['windspeed'].skew()}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "风速为0的记录数过多，可能存在异常（0 可能代表缺失值）。"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Treat zero wind speed as a missing reading: mask it as NaN and interpolate\n",
-    "# in time order on the whole frame. Assigning the interpolated non-zero rows\n",
-    "# to the zero rows (disjoint indices) aligns to nothing and leaves only NaN.\n",
-    "windspeed_filled = full.sort_values('datetime')['windspeed'].replace(0, np.nan)\n",
-    "windspeed_filled = windspeed_filled.interpolate(limit_direction='both')\n",
-    "# index-aligned assignment puts each filled value back on its own row\n",
-    "full['windspeed'] = windspeed_filled\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sns.histplot(full['windspeed'], kde=True, stat='density')\n",
-    "a = full[full['windspeed'] == 0].shape[0]\n",
-    "print(f'风速为0的天数为：{a}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "full.info()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Humidity 湿度"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(f\"偏度为：{train_df['humidity'].skew()}\")\n",
-    "train_df['humidity'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sns.histplot(full['humidity'], kde=True, stat='density')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_df[train_df['humidity'] == 0].shape[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "湿度值为0和100为异常数据，考虑可能有传感器错误情况。"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 各个变量与租车数量的关系"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sns.pairplot(full, x_vars=['holiday', 'workingday', 'weather', 'season', 'windspeed', 'humidity', 'temp', 'atemp'],\n",
-    "             y_vars=['casual', 'registered', 'count'], plot_kws={'alpha': 0.1})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 相关性分析"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "full.corr(numeric_only=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.figure(figsize=(13, 10))\n",
-    "sns.heatmap(full.corr(numeric_only=True), annot=True, cmap='YlOrBr_r')\n",
-    "plt.title('Heatmap on Correlation', fontsize=20)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 租赁数量与其他变量的相关性\n",
-    "full.corr(numeric_only=True)['count'].sort_values(ascending=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "full.corr(numeric_only=True)['registered'].sort_values(ascending=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "full.corr(numeric_only=True)['casual'].sort_values(ascending=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 可视化分类数据分析"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## temp 温度"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 按温度取租赁数量平均值\n",
-    "tempr = full.groupby(['temp'], as_index=True).agg({'casual': 'mean', 'registered': 'mean', 'count': 'mean'})\n",
-    "tempr.plot(title='每小时平均租赁数量随温度变化的趋势',figsize=(7,3))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## atemp体感温度"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "atempr = full.groupby(['atemp'], as_index=True).agg({'casual': 'mean',\n",
-    "                                                     'registered': 'mean',\n",
-    "                                                     'count': 'mean'})\n",
-    "atempr.plot(title='每小时平均租赁数量随体感温度变化的趋势',figsize=(7,3))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 湿度对租赁数量影响"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "humidityr = full.groupby(['humidity'], as_index=True).agg({'casual': 'mean',\n",
-    "                                                           'registered': 'mean',\n",
-    "                                                           'count': 'mean'})\n",
-    "\n",
-    "humidityr.plot(title='不同湿度下每小时的平均租赁次数',figsize=(7,3))\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 年份、月份对租赁数量的影响"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "day_df=full.groupby('date').agg({'year':'mean','month':'mean',\n",
-    "                                      'casual':'sum', 'registered':'sum'\n",
-    "                                      ,'count':'sum'})\n",
-    "month_df = day_df.groupby(['year','month'], as_index=True).agg({'casual':'mean',\n",
-    "                                                                  'registered':'mean',                                                                 'count':'mean'})\n",
-    "month_df.plot(fontsize=8,figsize=(15,4))\n",
-    "tick=list(range(24))\n",
-    "labels=['2011.1','2011.2','2011.3','2011.4','2011.5','2011.6','2011.7','2011.8','2011.9','2011.10','2011.11','2011.12','2012.1','2012.2','2012.3','2012.4','2012.5','2012.6','2012.7','2012.8','2012.9','2012.10','2012.11','2012.12']\n",
-    "plt.xticks(tick,labels,fontsize=8)\n",
-    "plt.title('两年内每月租赁数量变化趋势', fontsize=15)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 按天\n",
-    "day_df1=full.groupby('date').agg({\n",
-    "                                      'casual':'sum', 'registered':'sum'\n",
-    "                                      ,'count':'sum'})\n",
-    "day_df1.plot(fontsize=8,figsize=(8,4),linewidth=0.8)\n",
-    "plt.title('两年内每天租赁数量变化趋势', fontsize=15 )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.figure(figsize=(15,4))\n",
-    "# 按年\n",
-    "plt.subplot(1,3,1)\n",
-    "sns.barplot(x=full['year'],y=full['count'])\n",
-    "plt.title('by year')\n",
-    "\n",
-    "# 按季度\n",
-    "plt.subplot(1,3,2)\n",
-    "sns.barplot(x=full['season'],y=full['count'])\n",
-    "plt.title('by season')\n",
-    "\n",
-    "# 按月\n",
-    "plt.subplot(1,3,3)\n",
-    "sns.barplot(x=full['month'],y=full['count'])\n",
-    "plt.title('by month')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 时段对租赁数量的影响"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "fig,(ax1,ax2,ax3)= plt.subplots(nrows=3)\n",
-    "fig.set_size_inches(15,20)\n",
-    "full = full.reset_index(drop=True)\n",
-    "\n",
-    "sns.pointplot(x=full['hour'],y=full['count'],hue=full['season'],join=True, palette=\"Paired\", ax=ax1)\n",
-    "\n",
-    "sns.pointplot(x=full['hour'],y=full['count'],hue=full['weekday'],join=True, palette=\"Paired\", ax=ax2)\n",
-    "\n",
-    "sns.lineplot(x=full['hour'], y=full['registered'], color='red',label='Registered', marker='o', ax=ax3, ci=None)\n",
-    "sns.lineplot(x=full['hour'], y=full['casual'], color='blue', label='Casual', marker='o', ax=ax3, ci=None)\n",
-    "\n",
-    "ax3.set_ylabel('Count')\n",
-    "\n",
-    "ax1.set(xlabel='Hour of the Day', ylabel='Count',title=\"不同季节一天内每小时共享单车租借次数\")\n",
-    "ax2.set(xlabel='Hour of the Day', ylabel='Count',title=\"一天内每小时共享单车内租借次数\")\n",
-    "ax3.set(xlabel='Hour of the Day', ylabel='Count',title=\"一天内会员 vs 非会员每小时共享单车租借次数\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "COUNT与REGISTERED类似，但由于大多数用户是注册用户，他们的繁忙时间在上午7-8点和下午4-5点之间，而临时用户的繁忙时间在白天分布较好，但最繁忙的时间在下午1-2点之间。\n",
-    "下图显示了为什么春季是租赁量较少的季节。在所有季节中，春季是大雨/大雪天数最多的季节，这直接影响了租赁需求的减少。\n",
-    "下图中的 \"按季节分时段统计 \"显示了白天的租赁分布情况，这也证明了前面所说的春季是最不喜欢租车的季节。\n",
-    "秋季和夏季往往是租车人数最多的季节。\n",
-    "平日按时间段划分的图表显示，周六和周日的租车模式与平日明显不同，大多数用户在上午11点至下午4点租车。这些周末用户租车是为了休闲，而不是为了上班（上午8点和下午6点）。\n",
-    "在最后一张图中，\"注册用户数与休闲用户数 \"的休闲线中，周末用户的形态也很相似。这可能表明，注册用户是在工作日使用自行车的人，而临时用户在周末租车，在工作日租车时，不需要作为上班的交通工具。"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 天气对租赁情况影响"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "weather_count = full.groupby('weather')\n",
-    "weather_count[['casual', 'registered', 'count']].count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "weather_df = full.groupby('weather', as_index=True).agg({'casual': 'mean', 'registered': 'mean', 'count': 'mean'})\n",
-    "weather_df.plot.bar(stacked=True, title='不同天气下每小时的平均租赁数量', color=['#c63d40', '#dc8f75', '#ffddb7'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "weather_df.plot(figsize=(8,4))\n",
-    "plt.title('租借数量随天气变化的趋势')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 风速对租借情况影响"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 考虑到风速特别大的时候很少，如果取平均值会出现异常，所以按风速对租赁数量取最大值。\n",
-    "windspeedr = full.groupby(['windspeed'], as_index=True).agg({'casual': 'max', 'registered': 'max', 'count': 'max'})\n",
-    "windspeedr.plot(title='不同风速下每小时启动的最大租赁数量',figsize=(8,4))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 可以看到租赁数量随风速越大租赁数量越少，在风速超过30的时候明显减少，打印异常数据分析\n",
-    "df2 = full[full['windspeed'] > 40]\n",
-    "df2 = df2[df2['count'] > 400]\n",
-    "df2"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "这些记录风速很大而租赁量仍然较高，是因为它们处于下班高峰期的时间。"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 工作日与非工作日的租赁情况"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "day_df=full.groupby('date').agg({'year':'mean','season':'mean',\n",
-    "                                      'casual':'sum', 'registered':'sum'\n",
-    "                                      ,'count':'sum','temp':'mean',\n",
-    "                                      'atemp':'mean','workingday':'mean'})\n",
-    "day_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 对工作日和非工作日租赁数量取了平均值，对一周中每天的租赁数量求和\n",
-    "workingday_df = day_df.groupby(['workingday'], as_index=True).agg({'casual': 'mean', 'registered': 'mean'})\n",
-    "workingday_df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "workingday_df1 = workingday_df.loc[0]\n",
-    "workingday_df2 = workingday_df.loc[1]\n",
-    "workingday_df1.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 绘制非工作日与工作日的临时借车数量和会员借车数量堆积图\n",
-    "width = 0.3\n",
-    "p1 = plt.bar(workingday_df.index, workingday_df['casual'], width,color='#dc8f75')\n",
-    "p2 = plt.bar(workingday_df.index, workingday_df['registered'], width, bottom=workingday_df['casual'],color='#ffddb7')\n",
-    "plt.title('工作日、非工作日每天平均租赁数量')\n",
-    "plt.xticks([0, 1], ('non-working day', 'working day'))\n",
-    "plt.legend((p1[0], p2[0]), ('casual', 'registered'))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 绘制非工作日临时借车数量和会员借车数量占比饼图\n",
-    "plt.subplot2grid((2, 2), (0, 0))\n",
-    "plt.pie(workingday_df1, labels=['casual', 'registered'], autopct='%1.1f%%',\n",
-    "        pctdistance=0.6, labeldistance=1.35, radius=1.3,colors=['#c63d40','#ffddb7'])\n",
-    "plt.axis('equal')\n",
-    "plt.title('non-working day')\n",
-    "\n",
-    "# 绘制工作日临时借车数量和会员借车数量占比饼图\n",
-    "plt.subplot2grid((2, 2), (0, 1))\n",
-    "plt.pie(workingday_df2, labels=['casual', 'registered'], autopct='%1.1f%%',\n",
-    "        pctdistance=0.6, labeldistance=1.35, radius=1.3,colors=['#c63d40','#ffddb7'])\n",
-    "plt.title('working day')\n",
-    "plt.axis('equal')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "# 查看工作日与非工作日每小时的临时租赁数量，会员租赁数量和总数量的平均值\n",
-    "workingday_df=full[full['workingday']==1]\n",
-    "workingday_df = workingday_df.groupby(['hour'], as_index=True).agg({'casual':'mean',\n",
-    "                                                                    'registered':'mean',\n",
-    "                                                                    'count':'mean'})\n",
-    "\n",
-    "nworkingday_df=full[full['workingday']==0]\n",
-    "nworkingday_df = nworkingday_df.groupby(['hour'], as_index=True).agg({'casual':'mean',\n",
-    "                                                                      'registered':'mean',\n",
-    "                                                                      'count':'mean'})\n",
-    "fig, axes = plt.subplots(1, 2,sharey = True)\n",
-    "\n",
-    "workingday_df.plot(figsize=(15,5),title = '工作日一天内租借数量变化趋势',ax=axes[0])\n",
-    "nworkingday_df.plot(figsize=(15,5),title = '非工作日一天内租借数量变化趋势',ax=axes[1])\n",
-    "\n",
-    "plt.show()\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 节假日的租赁情况"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "day_df=full.groupby('date').agg({'year':'mean',\n",
-    "                                      'casual':'sum', 'registered':'sum'\n",
-    "                                      ,'count':'sum','holiday':'mean'})\n",
-    "holiday_count=day_df.groupby('year', as_index=True).agg({'holiday':'sum'})\n",
-    "holiday_count"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "holiday_df = day_df.groupby('holiday', as_index=True).agg({'casual':'mean', 'registered':'mean'})\n",
-    "holiday_df.plot.bar(stacked=True , title = '假期或非假期每天的平均租赁数量',cmap='Set3')\n",
-    "plt.xticks(rotation=360)\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 会员与非会员"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "number_pei=day_df[['casual','registered']].mean()\n",
-    "number_pei"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.axes(aspect='equal')\n",
-    "plt.pie(number_pei, labels=['casual','registered'], autopct='%1.1f%%',\n",
-    "        pctdistance=0.6 , labeldistance=1.05 , radius=1 ,colors=['#c63d40','#ffddb7'])\n",
-    "\n",
-    "plt.title('会员与非会员租借数量占比')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 决策树模型"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.tree import DecisionTreeClassifier\n",
-    "# fix the seed so that repeated runs build the same tree\n",
-    "clf1 = DecisionTreeClassifier(random_state=42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_features = train_df.iloc[:,1:-3]\n",
-    "# pick the target by name rather than by column position\n",
-    "train_labels_casual = train_df['casual']\n",
-    "# reuse the training feature names so this cell still works after prediction\n",
-    "# columns have been added to test_df\n",
-    "test_features = test_df[train_features.columns]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "clf1.fit(train_features, train_labels_casual)\n",
-    "predicted_labels_casual = clf1.predict(test_features)\n",
-    "test_df['casual'] = predicted_labels_casual\n",
-    "test_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# fix the seed so that repeated runs build the same tree\n",
-    "clf2 = DecisionTreeClassifier(random_state=42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# pick the target by name rather than by column position\n",
-    "train_labels_registered = train_df['registered']\n",
-    "clf2.fit(train_features, train_labels_registered)\n",
-    "predicted_labels_registered = clf2.predict(test_features)\n",
-    "test_df['registered'] = predicted_labels_registered\n",
-    "test_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "test_df['count'] = test_df['casual'] + test_df['registered']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "test_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "注：决策树在多个候选划分同样优时会随机选取其一，若不固定 random_state，对同一数据多次训练得到的最终预测结果可能会有所不同。"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}