运行购物评论的情感分析的主程序

Signed-off-by: 吴沂钊 <13190667+Yizhao_Wu4926@user.noreply.gitee.com>
2023-07-15 14:47:46 +00:00
parent 4932de7a3f
commit da65298f86
1 changed files with 828 additions and 0 deletions
@@ -0,0 +1,828 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 导入需要的包\n",
+    "from collections import defaultdict\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import jieba\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.metrics import classification_report\n",
+    "from gensim.models.word2vec import Word2Vec\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>raw</th>\n",
+       "      <th>y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>感觉用这个手机下载MP3听很方便，用T-Flash卡装上歌，就可以当一个小MP3用了，很不错...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>外观美观，速度也不错。上面一排触摸键挺实用。应该对得起这个价格。当然再降点大家肯定也不反对。...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>我刚拿到书，也没有仔细阅读，就是粗粗的翻了点，觉得还行。里面是蓝黑两种颜色的，有些单词的下面...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>对于二胡曲的精选，应该出版一个系列套装，可分别出售，单独购买。精品二胡曲是有年代和阶段性的，...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>用了一年半的 e680 终于送小偷了。刚刚买了一台e850，用了三天，说说我自己的感受。1：...</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                 raw  y\n",
+       "0  感觉用这个手机下载MP3听很方便，用T-Flash卡装上歌，就可以当一个小MP3用了，很不错...  1\n",
+       "1  外观美观，速度也不错。上面一排触摸键挺实用。应该对得起这个价格。当然再降点大家肯定也不反对。...  1\n",
+       "2  我刚拿到书，也没有仔细阅读，就是粗粗的翻了点，觉得还行。里面是蓝黑两种颜色的，有些单词的下面...  1\n",
+       "3  对于二胡曲的精选，应该出版一个系列套装，可分别出售，单独购买。精品二胡曲是有年代和阶段性的，...  1\n",
+       "4  用了一年半的 e680 终于送小偷了。刚刚买了一台e850，用了三天，说说我自己的感受。1：...  1"
+      ]
+     },
+     "execution_count": 78,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 读入数据\n",
+    "df_pos = pd.read_excel(r\"data/购物评论.xlsx\", sheet_name=\"正向\", header=None)\n",
+    "df_pos[\"y\"] = 1\n",
+    "df_neg = pd.read_excel(r\"data/购物评论.xlsx\", sheet_name=\"负向\", header=None)\n",
+    "df_neg[\"y\"] = 0\n",
+    "# 将正样本和负样本拼接在一起\n",
+    "df = pd.concat([df_pos, df_neg], ignore_index=True)\n",
+    "df.columns = [\"raw\", \"y\"]\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    [感觉, 用, 这个, 手机, 下载, MP3, 听, 很, 方便, ，, 用, T, -,...\n",
+       "1    [外观, 美观, ，, 速度, 也, 不错, 。, 上面, 一排, 触摸, 键, 挺, 实用...\n",
+       "2    [我刚, 拿到, 书, ，, 也, 没有, 仔细阅读, ，, 就是, 粗粗, 的, 翻, 了...\n",
+       "3    [对于, 二胡曲, 的, 精选, ，, 应该, 出版, 一个系列, 套装, ，, 可, 分别...\n",
+       "4    [用, 了, 一年, 半, 的,  , e680,  , 终于, 送, 小偷, 了, 。, ...\n",
+       "Name: raw, dtype: object"
+      ]
+     },
+     "execution_count": 79,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 数据清洗\n",
+    "# 由于有些情感词本身也是停用词，因此这里仅去掉换行符，不去停用词，也不做其他清理（空格、标点会作为词保留），以保留情感词。\n",
+    "df_cut = df[\"raw\"].apply(lambda x: re.sub('\\n', '', x))\n",
+    "# 将清洗后的文本用jieba.lcut转换为list形式\n",
+    "df_cut = df_cut.apply(jieba.lcut)\n",
+    "\n",
+    "df_cut.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 基于词典的情感分析\n",
+    "# 读入并创建情感词典\n",
+    "sentiment_dict = defaultdict(float)  # 方便处理不存在于词典中的关键字\n",
+    "# 使用with open以节约内存\n",
+    "with open(\"data/BosonNLP_sentiment_score.txt\", \"r\", encoding=\"utf-8\") as sen_file:\n",
+    "    for line in sen_file:\n",
+    "        try:\n",
+    "            key, value = line.split(\" \")  # 切分结果不是恰好两段的行会在解包时抛出ValueError\n",
+    "        except ValueError:\n",
+    "            # 直接跳过格式有问题的行\n",
+    "            pass\n",
+    "        else:\n",
+    "            sentiment_dict[key] = float(value)  # 正常情况"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    21.581138\n",
+       "1    14.952362\n",
+       "2     6.245357\n",
+       "3    30.980969\n",
+       "4    36.519372\n",
+       "Name: raw, dtype: float64"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 以最简单的直接匹配的方式进行情感打分\n",
+    "def get_score(words):\n",
+    "    return sum(sentiment_dict[word] for word in words)\n",
+    "\n",
+    "\n",
+    "y = df_cut.apply(get_score)\n",
+    "\n",
+    "y.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "该模型的准确率为： 0.6764648722184433\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 根据分数>0为正向、分数<=0为负向判定准确率（y为上一步用get_score算出的情感得分）\n",
+    "precise = lambda x: (df[\"y\"][x] == 1 and y[x] > 0) or (df[\"y\"][x] == 0 and y[x] <= 0)\n",
+    "# 准确度评估\n",
+    "print(\"该模型的准确度为：\", sum(1 for i in range(len(df)) if precise(i)) / len(df))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "我谢谢蒙牛！多一层体贴，多一层爱护！双重保护。\n",
+      "正向\n",
+      " 蒙牛是店大欺客 至少态度比蒙牛好～～后期赶紧整改！\n",
+      "正向\n",
+      "宝贝收到了，孩子爱吃，敏感肌能用，很启发大脑，提升睡眠质量，五星好评。\n",
+      "正向\n",
+      "五星好评，分期付清\n",
+      "正向\n",
+      "对得起我们吗？rnm，退钱！\n",
+      "负向\n"
+     ]
+    }
+   ],
+   "source": [
+    "def score_pred(string):\n",
+    "    words = jieba.lcut(string)  # 与上文df_cut的处理一致：先分词，再按词查情感词典\n",
+    "    # 预测并输出\n",
+    "    result = 1 if get_score(words) > 0 else 0\n",
+    "    if result == 1:\n",
+    "        print(\"正向\")\n",
+    "    else:\n",
+    "        print(\"负向\")\n",
+    "\n",
+    "\n",
+    "# 使用上述模型进行预测\n",
+    "test_comment = [\n",
+    "    \"我谢谢蒙牛！多一层体贴，多一层爱护！双重保护。\",\n",
+    "    \" 蒙牛是店大欺客 至少态度比蒙牛好～～后期赶紧整改！\",\n",
+    "    \"宝贝收到了，孩子爱吃，敏感肌能用，很启发大脑，提升睡眠质量，五星好评。\",\n",
+    "    \"五星好评，分期付清\",\n",
+    "    \"对得起我们吗？rnm，退钱！\"\n",
+    "]\n",
+    "for comment in test_comment:\n",
+    "    print(comment)\n",
+    "    score_pred(comment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    感觉 用 这个 手机 下载 MP3 听 很 方便 ， 用 T - Flash 卡装 上 歌 ...\n",
+       "1    外观 美观 ， 速度 也 不错 。 上面 一排 触摸 键 挺 实用 。 应该 对得起 这个 ...\n",
+       "2    我刚 拿到 书 ， 也 没有 仔细阅读 ， 就是 粗粗 的 翻 了 点 ， 觉得 还 行 。...\n",
+       "3    对于 二胡曲 的 精选 ， 应该 出版 一个系列 套装 ， 可 分别 出售 ， 单独 购买 ...\n",
+       "4    用 了 一年 半 的   e680   终于 送 小偷 了 。 刚刚 买 了 一台 e850...\n",
+       "Name: raw, dtype: object"
+      ]
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 基于词袋模型进行情感分析\n",
+    "# 将之前用jieba分好的列表转化为空格分隔的字符串，以方便使用词袋\n",
+    "df_cleantxt = df_cut.apply(lambda x: \" \".join(x))\n",
+    "\n",
+    "df_cleantxt.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<20582x12014 sparse matrix of type '<class 'numpy.int64'>'\n",
+       "\twith 513919 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "countvec = CountVectorizer(min_df = 5)  # 词袋向量化器，只保留至少在5条评论中出现过的词\n",
+    "# 转化为词频矩阵\n",
+    "wordmtx = countvec.fit_transform(df_cleantxt)\n",
+    "\n",
+    "wordmtx  # 一个稀疏矩阵对象，无法直接显示"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[LibSVM]"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "SVC(verbose=True)"
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 按照7：3的比例生成训练集和测试集，并使用SVM进行建模\n",
+    "x_train, x_test, y_train, y_test = train_test_split(wordmtx, df.y, test_size=0.3, random_state=0)  # 指定random_state可以确保每次分割出的训练集和测试集相同\n",
+    "# 创建分类器，使用径向基核函数作为核函数\n",
+    "clf = SVC(kernel=\"rbf\", verbose=True)\n",
+    "# 训练分类器（耗时很长，慎点）\n",
+    "clf.fit(x_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "该分类器的准确度为： 0.9551606857777469\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 准确度评估（注意：这里是训练集上的准确度，测试集上的效果见下一个单元格的评估报告）\n",
+    "print(\"该分类器的准确度为：\", clf.score(x_train, y_train))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.87      0.89      0.88      3047\n",
+      "           1       0.89      0.87      0.88      3128\n",
+      "\n",
+      "    accuracy                           0.88      6175\n",
+      "   macro avg       0.88      0.88      0.88      6175\n",
+      "weighted avg       0.88      0.88      0.88      6175\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 支持向量机的评估报告\n",
+    "# 四个列名分别为：精确率、召回率、F1值、支持数量\n",
+    "# 三个行名分别为：准确率、宏平均、加权平均\n",
+    "print(classification_report(y_test, clf.predict(x_test)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "我谢谢蒙牛！多一层体贴，多一层爱护！双重保护。\n",
+      "正向\n",
+      " 蒙牛是店大欺客 至少态度比蒙牛好～～后期赶紧整改！\n",
+      "负向\n",
+      "宝贝收到了，孩子爱吃，敏感肌能用，很启发大脑，提升睡眠质量，五星好评。\n",
+      "正向\n",
+      "五星好评，分期付清\n",
+      "负向\n",
+      "对得起我们吗？rnm，退钱！\n",
+      "负向\n"
+     ]
+    }
+   ],
+   "source": [
+    "def bow_pred(string): \n",
+    "    global clf, countvec\n",
+    "    model = clf\n",
+    "    # 数据转换为词频矩阵\n",
+    "    words = \" \".join(jieba.cut(string))\n",
+    "    words_vecs = countvec.transform([words])\n",
+    "    # 预测并输出\n",
+    "    result = model.predict(words_vecs)\n",
+    "    if result[0] == 1:\n",
+    "        print(\"正向\")\n",
+    "    else:\n",
+    "        print(\"负向\")\n",
+    "\n",
+    "\n",
+    "# 使用上述模型进行预测\n",
+    "for comment in test_comment:\n",
+    "    print(comment)\n",
+    "    bow_pred(comment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6274592, 9394570)"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 基于Word2Vec的情感分析\n",
+    "# 按照7：3的比例生成训练集和测试集\n",
+    "x_train, x_test, y_train, y_test = train_test_split(df_cut, df.y, test_size=0.3, random_state=0)\n",
+    "# 初始化word2vec模型和词表\n",
+    "n_dim = 300  # 指定向量维度\n",
+    "w2vmodel = Word2Vec(vector_size=n_dim, min_count=10)  # 出现10次以上的才纳入\n",
+    "# 生成词表\n",
+    "w2vmodel.build_vocab(x_train)\n",
+    "w2vmodel.train(x_train, total_examples=w2vmodel.corpus_count, epochs = 10)  # 训练模型"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "      <th>9</th>\n",
+       "      <th>...</th>\n",
+       "      <th>290</th>\n",
+       "      <th>291</th>\n",
+       "      <th>292</th>\n",
+       "      <th>293</th>\n",
+       "      <th>294</th>\n",
+       "      <th>295</th>\n",
+       "      <th>296</th>\n",
+       "      <th>297</th>\n",
+       "      <th>298</th>\n",
+       "      <th>299</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0.287859</td>\n",
+       "      <td>0.211170</td>\n",
+       "      <td>-0.002694</td>\n",
+       "      <td>0.015414</td>\n",
+       "      <td>-0.130773</td>\n",
+       "      <td>-0.252820</td>\n",
+       "      <td>-0.173765</td>\n",
+       "      <td>0.019247</td>\n",
+       "      <td>0.139526</td>\n",
+       "      <td>0.084161</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.174484</td>\n",
+       "      <td>0.183404</td>\n",
+       "      <td>-0.012513</td>\n",
+       "      <td>-0.207201</td>\n",
+       "      <td>0.414504</td>\n",
+       "      <td>0.289306</td>\n",
+       "      <td>0.317440</td>\n",
+       "      <td>-0.280071</td>\n",
+       "      <td>0.093196</td>\n",
+       "      <td>0.119094</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.125504</td>\n",
+       "      <td>0.210281</td>\n",
+       "      <td>0.242841</td>\n",
+       "      <td>0.152100</td>\n",
+       "      <td>-0.118140</td>\n",
+       "      <td>-0.105682</td>\n",
+       "      <td>0.000601</td>\n",
+       "      <td>0.091197</td>\n",
+       "      <td>0.157458</td>\n",
+       "      <td>0.128822</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.048829</td>\n",
+       "      <td>0.089328</td>\n",
+       "      <td>-0.048414</td>\n",
+       "      <td>-0.100801</td>\n",
+       "      <td>0.153824</td>\n",
+       "      <td>0.320274</td>\n",
+       "      <td>0.117689</td>\n",
+       "      <td>-0.306226</td>\n",
+       "      <td>0.321337</td>\n",
+       "      <td>-0.032039</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.208149</td>\n",
+       "      <td>0.200255</td>\n",
+       "      <td>0.028664</td>\n",
+       "      <td>0.096052</td>\n",
+       "      <td>0.157904</td>\n",
+       "      <td>0.190969</td>\n",
+       "      <td>-0.003578</td>\n",
+       "      <td>0.351330</td>\n",
+       "      <td>-0.182146</td>\n",
+       "      <td>0.020400</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.055078</td>\n",
+       "      <td>0.228487</td>\n",
+       "      <td>0.066214</td>\n",
+       "      <td>0.011649</td>\n",
+       "      <td>-0.027820</td>\n",
+       "      <td>0.206249</td>\n",
+       "      <td>-0.050040</td>\n",
+       "      <td>-0.125792</td>\n",
+       "      <td>0.412140</td>\n",
+       "      <td>-0.081710</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.390679</td>\n",
+       "      <td>0.235006</td>\n",
+       "      <td>0.279163</td>\n",
+       "      <td>0.160066</td>\n",
+       "      <td>0.273807</td>\n",
+       "      <td>-0.093850</td>\n",
+       "      <td>-0.012255</td>\n",
+       "      <td>0.124318</td>\n",
+       "      <td>0.116129</td>\n",
+       "      <td>0.007561</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.092300</td>\n",
+       "      <td>-0.054543</td>\n",
+       "      <td>-0.191227</td>\n",
+       "      <td>-0.123319</td>\n",
+       "      <td>0.179981</td>\n",
+       "      <td>0.214038</td>\n",
+       "      <td>0.072776</td>\n",
+       "      <td>-0.226328</td>\n",
+       "      <td>0.267805</td>\n",
+       "      <td>-0.407456</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.226386</td>\n",
+       "      <td>0.250651</td>\n",
+       "      <td>-0.100598</td>\n",
+       "      <td>-0.016630</td>\n",
+       "      <td>-0.165644</td>\n",
+       "      <td>-0.165641</td>\n",
+       "      <td>-0.108314</td>\n",
+       "      <td>0.047699</td>\n",
+       "      <td>0.030386</td>\n",
+       "      <td>0.061525</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.093842</td>\n",
+       "      <td>0.121937</td>\n",
+       "      <td>0.224285</td>\n",
+       "      <td>-0.055072</td>\n",
+       "      <td>0.381097</td>\n",
+       "      <td>0.108929</td>\n",
+       "      <td>0.158639</td>\n",
+       "      <td>0.004664</td>\n",
+       "      <td>-0.072203</td>\n",
+       "      <td>-0.276887</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 300 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        0         1         2         3         4         5         6    \\\n",
+       "0  0.287859  0.211170 -0.002694  0.015414 -0.130773 -0.252820 -0.173765   \n",
+       "1  0.125504  0.210281  0.242841  0.152100 -0.118140 -0.105682  0.000601   \n",
+       "2  0.208149  0.200255  0.028664  0.096052  0.157904  0.190969 -0.003578   \n",
+       "3  0.390679  0.235006  0.279163  0.160066  0.273807 -0.093850 -0.012255   \n",
+       "4  0.226386  0.250651 -0.100598 -0.016630 -0.165644 -0.165641 -0.108314   \n",
+       "\n",
+       "        7         8         9    ...       290       291       292       293  \\\n",
+       "0  0.019247  0.139526  0.084161  ... -0.174484  0.183404 -0.012513 -0.207201   \n",
+       "1  0.091197  0.157458  0.128822  ... -0.048829  0.089328 -0.048414 -0.100801   \n",
+       "2  0.351330 -0.182146  0.020400  ... -0.055078  0.228487  0.066214  0.011649   \n",
+       "3  0.124318  0.116129  0.007561  ... -0.092300 -0.054543 -0.191227 -0.123319   \n",
+       "4  0.047699  0.030386  0.061525  ... -0.093842  0.121937  0.224285 -0.055072   \n",
+       "\n",
+       "        294       295       296       297       298       299  \n",
+       "0  0.414504  0.289306  0.317440 -0.280071  0.093196  0.119094  \n",
+       "1  0.153824  0.320274  0.117689 -0.306226  0.321337 -0.032039  \n",
+       "2 -0.027820  0.206249 -0.050040 -0.125792  0.412140 -0.081710  \n",
+       "3  0.179981  0.214038  0.072776 -0.226328  0.267805 -0.407456  \n",
+       "4  0.381097  0.108929  0.158639  0.004664 -0.072203 -0.276887  \n",
+       "\n",
+       "[5 rows x 300 columns]"
+      ]
+     },
+     "execution_count": 91,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def m_avgvec(words, w2vmodel):\n",
+    "    \"\"\"\n",
+    "    用各个词向量直接平均的方式生成整句对应的向量\n",
+    "    \"\"\"\n",
+    "    return pd.DataFrame([w2vmodel.wv[w] for w in words if w in w2vmodel.wv]).agg(\"mean\")\n",
+    "\n",
+    "\n",
+    "# 生成建模用矩阵（耗时很长，慎点）\n",
+    "train_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_train])\n",
+    "\n",
+    "train_vecs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[LibSVM]"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "SVC(verbose=True)"
+      ]
+     },
+     "execution_count": 92,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 用转换后的矩阵拟合SVM模型\n",
+    "clf2 = SVC(kernel=\"rbf\", verbose=True)\n",
+    "# 训练分类器（耗时很长，慎点）\n",
+    "clf2.fit(train_vecs, y_train)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "该分类器的准确度为： 0.8936628028041924\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 准确度评估（注意：这里是训练集上的准确度，测试集上的效果见后面的评估报告）\n",
+    "print(\"该分类器的准确度为：\", clf2.score(train_vecs, y_train))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 生成测试用矩阵（耗时很长，慎点）\n",
+    "test_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_test])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.87      0.87      0.87      3047\n",
+      "           1       0.87      0.87      0.87      3128\n",
+      "\n",
+      "    accuracy                           0.87      6175\n",
+      "   macro avg       0.87      0.87      0.87      6175\n",
+      "weighted avg       0.87      0.87      0.87      6175\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 支持向量机的评估报告\n",
+    "print(classification_report(y_test, clf2.predict(test_vecs)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "我谢谢蒙牛！多一层体贴，多一层爱护！双重保护。\n",
+      "正向\n",
+      " 蒙牛是店大欺客 至少态度比蒙牛好～～后期赶紧整改！\n",
+      "负向\n",
+      "宝贝收到了，孩子爱吃，敏感肌能用，很启发大脑，提升睡眠质量，五星好评。\n",
+      "正向\n",
+      "五星好评，分期付清\n",
+      "正向\n",
+      "对得起我们吗？rnm，退钱！\n",
+      "负向\n"
+     ]
+    }
+   ],
+   "source": [
+    "def w2v_pred(string):\n",
+    "    global clf2, w2vmodel\n",
+    "    # 先用jieba分词（与训练时使用的df_cut一致），再将句子转换为矩阵形式\n",
+    "    vecs = pd.DataFrame([m_avgvec(jieba.lcut(string), w2vmodel)])\n",
+    "    # 预测并输出\n",
+    "    result = clf2.predict(vecs)\n",
+    "    if result[0] == 1:\n",
+    "        print(\"正向\")\n",
+    "    else:\n",
+    "        print(\"负向\")    \n",
+    "\n",
+    "\n",
+    "# 使用上述模型进行预测\n",
+    "for comment in test_comment:\n",
+    "    print(comment)\n",
+    "    w2v_pred(comment)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}