From df2e5b3cbbc0385fbf74641e9ce80d7a4f11e501 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E5=8D=93=E7=AB=8B?=
<13190677+zhang-zhuoli@user.noreply.gitee.com>
Date: Sat, 15 Jul 2023 12:04:50 +0000
Subject: [PATCH] =?UTF-8?q?AdaBoost=E5=9B=9E=E5=BD=92?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 张卓立 <13190677+zhang-zhuoli@user.noreply.gitee.com>
---
.../ada_reg.ipynb | 360 ++++++++++++++++++
1 file changed, 360 insertions(+)
create mode 100644 共享民宿平台担保交易房子评分的影响研究/ada_reg.ipynb
diff --git a/共享民宿平台担保交易房子评分的影响研究/ada_reg.ipynb b/共享民宿平台担保交易房子评分的影响研究/ada_reg.ipynb
new file mode 100644
index 0000000..65c9166
--- /dev/null
+++ b/共享民宿平台担保交易房子评分的影响研究/ada_reg.ipynb
@@ -0,0 +1,360 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9e30b402-4ed9-4e45-aa7c-a77a7d68df17",
+ "metadata": {},
+ "source": [
+ "# AdaBoost回归"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "1a11e27d-9c4e-47eb-958b-cf9ed1c4c603",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a704744-26f2-40f1-9bc1-0e25608dd40f",
+ "metadata": {},
+ "source": [
+ "读入数据,由于样本量很大,直接删除有缺失值的样本。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "9f2f693d-44fd-432f-b732-7a2117106150",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " host_response_rate | \n",
+ " host_acceptance_rate | \n",
+ " accommodates | \n",
+ " price | \n",
+ " number_of_reviews | \n",
+ " review_scores_rating | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.00 | \n",
+ " 0.33 | \n",
+ " 2.0 | \n",
+ " 120.0 | \n",
+ " 90.0 | \n",
+ " 4.50 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.00 | \n",
+ " 0.98 | \n",
+ " 2.0 | \n",
+ " 90.0 | \n",
+ " 351.0 | \n",
+ " 4.58 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.00 | \n",
+ " 0.98 | \n",
+ " 2.0 | \n",
+ " 66.0 | \n",
+ " 67.0 | \n",
+ " 4.52 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.00 | \n",
+ " 0.98 | \n",
+ " 1.0 | \n",
+ " 33.0 | \n",
+ " 297.0 | \n",
+ " 4.70 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 2.0 | \n",
+ " 45.0 | \n",
+ " 42.0 | \n",
+ " 4.98 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 203252 | \n",
+ " 1.00 | \n",
+ " 0.93 | \n",
+ " 4.0 | \n",
+ " 152.0 | \n",
+ " 1.0 | \n",
+ " 4.00 | \n",
+ "
\n",
+ " \n",
+ " | 203253 | \n",
+ " 1.00 | \n",
+ " 0.97 | \n",
+ " 2.0 | \n",
+ " 45.0 | \n",
+ " 1.0 | \n",
+ " 3.00 | \n",
+ "
\n",
+ " \n",
+ " | 203254 | \n",
+ " 1.00 | \n",
+ " 0.97 | \n",
+ " 2.0 | \n",
+ " 40.0 | \n",
+ " 1.0 | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " | 203276 | \n",
+ " 0.99 | \n",
+ " 0.99 | \n",
+ " 2.0 | \n",
+ " 43.0 | \n",
+ " 1.0 | \n",
+ " 5.00 | \n",
+ "
\n",
+ " \n",
+ " | 203308 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 3.0 | \n",
+ " 110.0 | \n",
+ " 1.0 | \n",
+ " 5.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
134835 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " host_response_rate host_acceptance_rate accommodates price \\\n",
+ "0 1.00 0.33 2.0 120.0 \n",
+ "1 1.00 0.98 2.0 90.0 \n",
+ "2 1.00 0.98 2.0 66.0 \n",
+ "3 1.00 0.98 1.0 33.0 \n",
+ "5 1.00 1.00 2.0 45.0 \n",
+ "... ... ... ... ... \n",
+ "203252 1.00 0.93 4.0 152.0 \n",
+ "203253 1.00 0.97 2.0 45.0 \n",
+ "203254 1.00 0.97 2.0 40.0 \n",
+ "203276 0.99 0.99 2.0 43.0 \n",
+ "203308 1.00 1.00 3.0 110.0 \n",
+ "\n",
+ " number_of_reviews review_scores_rating \n",
+ "0 90.0 4.50 \n",
+ "1 351.0 4.58 \n",
+ "2 67.0 4.52 \n",
+ "3 297.0 4.70 \n",
+ "5 42.0 4.98 \n",
+ "... ... ... \n",
+ "203252 1.0 4.00 \n",
+ "203253 1.0 3.00 \n",
+ "203254 1.0 1.00 \n",
+ "203276 1.0 5.00 \n",
+ "203308 1.0 5.00 \n",
+ "\n",
+ "[134835 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns to read from the CSV.\n",
+ "variables = ['number_of_reviews', 'price', 'accommodates',\n",
+ "             'host_response_rate', 'host_acceptance_rate', 'review_scores_rating']\n",
+ "df = pd.read_csv('../data/2022-01(US_25).csv', usecols=variables)\n",
+ "# Remove '$' and ',' from price before converting to float. Raw-string patterns\n",
+ "# avoid invalid Python escape sequences such as '\\$' in a normal string literal.\n",
+ "df['price'] = df['price'].replace(r'[$,]', '', regex=True).astype(float)\n",
+ "# Strip the '%' sign from the rate columns and rescale them by 0.01.\n",
+ "rate_cols = ['host_response_rate', 'host_acceptance_rate']\n",
+ "df[rate_cols] = df[rate_cols].replace(r'%', '', regex=True).astype(float) * 0.01\n",
+ "# Cast every column to float32 and drop rows with any missing value in one step,\n",
+ "# rather than assigning column by column into an already-filtered frame.\n",
+ "df = df.astype(np.float32).dropna()\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16658a50-232f-49c7-ad4c-c93a6a43e118",
+ "metadata": {},
+ "source": [
+ "划分训练集和测试集(测试集占比为0.3),并对特征进行标准化。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "00158193-2ea9-4fb8-91f1-cb5ea4b9fb96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Select the target by name: read_csv(usecols=...) keeps the file's column order,\n",
+ "# so relying on the target being the last column is fragile.\n",
+ "target_col = 'review_scores_rating'\n",
+ "info = df.drop(columns=target_col).values\n",
+ "target = df[target_col].values\n",
+ "# Seeded, shuffled split with 30% of the samples held out for testing.\n",
+ "info_train, info_test, target_train, target_test = train_test_split(\n",
+ "    info, target, test_size=0.3, shuffle=True, random_state=420)\n",
+ "# Standardize: fit the scaler on the training set only, then apply it to the test set.\n",
+ "stdscaler = StandardScaler()\n",
+ "info_train = stdscaler.fit_transform(info_train)\n",
+ "info_test = stdscaler.transform(info_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "91622729-b48b-45bd-9cd6-b33f8d839d7a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[ 0.2976891 , 0.44020966, -0.10153121, -0.14794447, -0.62968445],\n",
+ " [ 0.2976891 , 0.44020966, -0.10153121, -0.27286685, 0.06075969],\n",
+ " [ 0.2976891 , 0.39323488, -0.8076375 , -0.25845274, -0.62968445],\n",
+ " ...,\n",
+ " [ 0.2976891 , 0.5341586 , 0.6045751 , -0.22241743, 0.03610097],\n",
+ " [ 0.2976891 , 0.25231144, -0.8076375 , -0.3016951 , 1.1457433 ],\n",
+ " [ 0.2976891 , -0.6871791 , -0.8076375 , -0.3185116 , 0.72654516]],\n",
+ " dtype=float32)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "info_train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "3b292dc8-9c79-45e4-8a4a-6bda192a2b64",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([3. , 4.93, 4. , ..., 4.69, 4.85, 4.83], dtype=float32)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "target_train"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47bf67ca-2ede-4bf3-9f77-1f49efd68bc3",
+ "metadata": {},
+ "source": [
+ "以浅层随机森林为基学习器拟合 AdaBoost 回归模型,在测试集上预测并计算均方误差(MSE)。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "81f63b17-e62c-4775-a525-edb8430a0910",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSE:0.4843\n"
+ ]
+ }
+ ],
+ "source": [
+ "# AdaBoost regression using a small, shallow random forest as the base estimator.\n",
+ "rf = RandomForestRegressor(\n",
+ "    n_estimators=10,\n",
+ "    max_depth=2,\n",
+ "    random_state=0,\n",
+ ")\n",
+ "ada = AdaBoostRegressor(\n",
+ "    estimator=rf,\n",
+ "    n_estimators=100,\n",
+ "    loss='square',\n",
+ "    random_state=0,\n",
+ ")\n",
+ "ada.fit(info_train, target_train)\n",
+ "# Predict on the held-out test set and report the mean squared error.\n",
+ "target_pred = ada.predict(info_test)\n",
+ "test_mse = mean_squared_error(target_test, target_pred)\n",
+ "print(\"MSE:%.4f\" % test_mse)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4e758bb-8599-41c0-a6db-ef367d744505",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}