From a395c9c6e029b26350f30274d1772f062b16fbf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=8D=93=E7=AB=8B?= <13190677+zhang-zhuoli@user.noreply.gitee.com> Date: Sat, 15 Jul 2023 12:05:21 +0000 Subject: [PATCH] =?UTF-8?q?=E9=9A=8F=E6=9C=BA=E6=A3=AE=E6=9E=97=E5=9B=9E?= =?UTF-8?q?=E5=BD=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张卓立 <13190677+zhang-zhuoli@user.noreply.gitee.com> --- .../rf_reg.ipynb | 278 ++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 共享民宿平台担保交易房子评分的影响研究/rf_reg.ipynb diff --git a/共享民宿平台担保交易房子评分的影响研究/rf_reg.ipynb b/共享民宿平台担保交易房子评分的影响研究/rf_reg.ipynb new file mode 100644 index 0000000..8c130dd --- /dev/null +++ b/共享民宿平台担保交易房子评分的影响研究/rf_reg.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "67537bcc-9ece-42d8-a6a7-924966604450", + "metadata": {}, + "source": [ + "# 随机森林回归" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8f588675-9e05-45c7-9203-15c52f7ddd05", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0a02061e-296a-4f6a-91af-c7fa27d46f17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
host_response_ratehost_acceptance_rateaccommodatespricenumber_of_reviewsreview_scores_rating
01.000.332.0120.090.04.50
11.000.982.090.0351.04.58
21.000.982.066.067.04.52
31.000.981.033.0297.04.70
51.001.002.045.042.04.98
.....................
2032521.000.934.0152.01.04.00
2032531.000.972.045.01.03.00
2032541.000.972.040.01.01.00
2032760.990.992.043.01.05.00
2033081.001.003.0110.01.05.00
\n", + "

134835 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " host_response_rate host_acceptance_rate accommodates price \\\n", + "0 1.00 0.33 2.0 120.0 \n", + "1 1.00 0.98 2.0 90.0 \n", + "2 1.00 0.98 2.0 66.0 \n", + "3 1.00 0.98 1.0 33.0 \n", + "5 1.00 1.00 2.0 45.0 \n", + "... ... ... ... ... \n", + "203252 1.00 0.93 4.0 152.0 \n", + "203253 1.00 0.97 2.0 45.0 \n", + "203254 1.00 0.97 2.0 40.0 \n", + "203276 0.99 0.99 2.0 43.0 \n", + "203308 1.00 1.00 3.0 110.0 \n", + "\n", + " number_of_reviews review_scores_rating \n", + "0 90.0 4.50 \n", + "1 351.0 4.58 \n", + "2 67.0 4.52 \n", + "3 297.0 4.70 \n", + "5 42.0 4.98 \n", + "... ... ... \n", + "203252 1.0 4.00 \n", + "203253 1.0 3.00 \n", + "203254 1.0 1.00 \n", + "203276 1.0 5.00 \n", + "203308 1.0 5.00 \n", + "\n", + "[134835 rows x 6 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "variables = ['number_of_reviews', 'price', 'accommodates',\n", + " 'host_response_rate', 'host_acceptance_rate', 'review_scores_rating']\n", + "df = pd.read_csv('../data/2022-01(US_25).csv', usecols=variables)\n", + "df['price'] = df['price'].replace('\\$', '', regex=True)\n", + "df['price'] = df['price'].replace('\\,', '', regex=True).astype(float)\n", + "df[['host_response_rate', 'host_acceptance_rate']] = df[['host_response_rate',\n", + " 'host_acceptance_rate']].replace('\\%', '', regex=True).astype(float)*0.01\n", + "df[['number_of_reviews']] = df[['number_of_reviews']].astype(float)\n", + "for col in variables:\n", + " df[col] = df[col].astype(np.float32)\n", + " df = df[np.isnan(df[col]) != 1]\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "35569391-e849-4a5e-adf5-25dfd849b2a0", + "metadata": {}, + "outputs": [], + "source": [ + "# 固定划分训练集和测试集\n", + "info = df.iloc[:, :-1].values\n", + "target = df.iloc[:, -1].values\n", + "# 标准化\n", + "stdscaler = StandardScaler()\n", + "info_train, info_test, target_train, target_test = train_test_split(\n", + " info, target, test_size=0.3,shuffle=True, random_state=420)\n", + "info_train = stdscaler.fit_transform(info_train)\n", + "info_test = stdscaler.transform(info_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7ca33f2e-bcc8-4234-a27b-fa7e1df04030", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE:0.2333\n" + ] + } + ], + "source": [ + "# randomforest回归\n", + "rf = RandomForestRegressor(n_estimators=100, random_state=0)\n", + "rf.fit(info_train, target_train)\n", + "target_pred = rf.predict(info_test)\n", + "print(\"MSE:%.4f\" % mean_squared_error(target_test, target_pred))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}