From da65298f86b8774ba0c4eeec7e47ab1b53a33cb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=B4=E6=B2=82=E9=92=8A?=
<13190667+Yizhao_Wu4926@user.noreply.gitee.com>
Date: Sat, 15 Jul 2023 14:47:46 +0000
Subject: [PATCH] =?UTF-8?q?=E8=BF=90=E8=A1=8C=E8=B4=AD=E7=89=A9=E8=AF=84?=
=?UTF-8?q?=E8=AE=BA=E7=9A=84=E6=83=85=E6=84=9F=E5=88=86=E6=9E=90=E7=9A=84?=
=?UTF-8?q?=E4=B8=BB=E7=A8=8B=E5=BA=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 吴沂钊 <13190667+Yizhao_Wu4926@user.noreply.gitee.com>
---
购物评论 情感分类/main.ipynb | 828 +++++++++++++++++++++++++++++++++++
1 file changed, 828 insertions(+)
create mode 100644 购物评论 情感分类/main.ipynb
diff --git a/购物评论 情感分类/main.ipynb b/购物评论 情感分类/main.ipynb
new file mode 100644
index 0000000..c55d334
--- /dev/null
+++ b/购物评论 情感分类/main.ipynb
@@ -0,0 +1,828 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 导入需要的包\n",
+ "from collections import defaultdict\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import jieba\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.metrics import classification_report\n",
+ "from gensim.models.word2vec import Word2Vec\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " raw | \n",
+ " y | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 感觉用这个手机下载MP3听很方便,用T-Flash卡装上歌,就可以当一个小MP3用了,很不错... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 外观美观,速度也不错。上面一排触摸键挺实用。应该对得起这个价格。当然再降点大家肯定也不反对。... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 我刚拿到书,也没有仔细阅读,就是粗粗的翻了点,觉得还行。里面是蓝黑两种颜色的,有些单词的下面... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 对于二胡曲的精选,应该出版一个系列套装,可分别出售,单独购买。精品二胡曲是有年代和阶段性的,... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 用了一年半的 e680 终于送小偷了。刚刚买了一台e850,用了三天,说说我自己的感受。1:... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " raw y\n",
+ "0 感觉用这个手机下载MP3听很方便,用T-Flash卡装上歌,就可以当一个小MP3用了,很不错... 1\n",
+ "1 外观美观,速度也不错。上面一排触摸键挺实用。应该对得起这个价格。当然再降点大家肯定也不反对。... 1\n",
+ "2 我刚拿到书,也没有仔细阅读,就是粗粗的翻了点,觉得还行。里面是蓝黑两种颜色的,有些单词的下面... 1\n",
+ "3 对于二胡曲的精选,应该出版一个系列套装,可分别出售,单独购买。精品二胡曲是有年代和阶段性的,... 1\n",
+ "4 用了一年半的 e680 终于送小偷了。刚刚买了一台e850,用了三天,说说我自己的感受。1:... 1"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 读入数据\n",
+ "df_pos = pd.read_excel(r\"data/购物评论.xlsx\", sheet_name=\"正向\", header=None)\n",
+ "df_pos[\"y\"] = 1\n",
+ "df_neg = pd.read_excel(r\"data/购物评论.xlsx\", sheet_name=\"负向\", header=None)\n",
+ "df_neg[\"y\"] = 0\n",
+ "# 将正样本和负样本拼接在一起\n",
+ "df = pd.concat([df_pos, df_neg], ignore_index=True)\n",
+ "df.columns = [\"raw\", \"y\"]\n",
+ "\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 [感觉, 用, 这个, 手机, 下载, MP3, 听, 很, 方便, ,, 用, T, -,...\n",
+ "1 [外观, 美观, ,, 速度, 也, 不错, 。, 上面, 一排, 触摸, 键, 挺, 实用...\n",
+ "2 [我刚, 拿到, 书, ,, 也, 没有, 仔细阅读, ,, 就是, 粗粗, 的, 翻, 了...\n",
+ "3 [对于, 二胡曲, 的, 精选, ,, 应该, 出版, 一个系列, 套装, ,, 可, 分别...\n",
+ "4 [用, 了, 一年, 半, 的, , e680, , 终于, 送, 小偷, 了, 。, ...\n",
+ "Name: raw, dtype: object"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 数据清洗\n",
+ "# 由于有些情感词本身也是停用词。因此这里仅去掉换行和空字符,不做其他的清理工作,以保留情感词。\n",
+ "df_cut = df[\"raw\"].apply(lambda x: re.sub('\\n', '', x))\n",
+ "# 将清洗后的文本用jieba.lcut转换为list形式\n",
+ "df_cut = df_cut.apply(jieba.lcut)\n",
+ "\n",
+ "df_cut.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 基于词典的情感分析\n",
+ "# 读入并创建情感词典\n",
+ "sentiment_dict = defaultdict(float) # 方便处理不存在于词典中的关键字\n",
+ "# 使用with open以节约内存\n",
+ "with open(r\"data/BosonNLP_sentiment_score.txt\", \"r\", encoding=\"utf-8\") as sen_file:\n",
+ " for line in sen_file:\n",
+ " try:\n",
+ " key, value = line.split(\" \") # 直接运行会有奇怪的问题出现\n",
+ " except Exception:\n",
+ " # 直接跳过有问题的行\n",
+ " pass\n",
+ " else:\n",
+ " sentiment_dict[key] = float(value) # 正常情况"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 21.581138\n",
+ "1 14.952362\n",
+ "2 6.245357\n",
+ "3 30.980969\n",
+ "4 36.519372\n",
+ "Name: raw, dtype: float64"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 以最简单的直接匹配的方式进行情感打分\n",
+ "def get_score(words):\n",
+ " return sum(sentiment_dict[word] for word in words)\n",
+ "\n",
+ "\n",
+ "y = df_cut.apply(get_score)\n",
+ "\n",
+ "y.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "该模型的准确率为: 0.6764648722184433\n"
+ ]
+ }
+ ],
+ "source": [
+ "# A review counts as correct when (label 1 and score > 0) or (label 0 and score <= 0)\n",
+ "precise = lambda x: (df[\"y\"][x] == 1 and y[x] > 0) or (df[\"y\"][x] == 0 and y[x] <= 0)\n",
+ "# 准确度评估\n",
+ "print(\"该模型的准确度为:\", sum(1 for i in range(len(df)) if precise(i)) / len(df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\n",
+ "正向\n",
+ " 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\n",
+ "正向\n",
+ "宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\n",
+ "正向\n",
+ "五星好评,分期付清\n",
+ "正向\n",
+ "对得起我们吗?rnm,退钱!\n",
+ "负向\n"
+ ]
+ }
+ ],
+ "source": [
+ "def score_pred(string):\n",
+ " words = jieba.lcut(string)\n",
+ " # Score the segmented words (not the raw characters), then predict and print\n",
+ " result = 1 if get_score(words) > 0 else 0\n",
+ " if result == 1:\n",
+ " print(\"正向\")\n",
+ " else:\n",
+ " print(\"负向\")\n",
+ "\n",
+ "\n",
+ "# 使用上述模型进行预测\n",
+ "test_comment = [\n",
+ " \"我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\",\n",
+ " \" 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\",\n",
+ " \"宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\",\n",
+ " \"五星好评,分期付清\",\n",
+ " \"对得起我们吗?rnm,退钱!\"\n",
+ "]\n",
+ "for comment in test_comment:\n",
+ " print(comment)\n",
+ " score_pred(comment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 感觉 用 这个 手机 下载 MP3 听 很 方便 , 用 T - Flash 卡装 上 歌 ...\n",
+ "1 外观 美观 , 速度 也 不错 。 上面 一排 触摸 键 挺 实用 。 应该 对得起 这个 ...\n",
+ "2 我刚 拿到 书 , 也 没有 仔细阅读 , 就是 粗粗 的 翻 了 点 , 觉得 还 行 。...\n",
+ "3 对于 二胡曲 的 精选 , 应该 出版 一个系列 套装 , 可 分别 出售 , 单独 购买 ...\n",
+ "4 用 了 一年 半 的 e680 终于 送 小偷 了 。 刚刚 买 了 一台 e850...\n",
+ "Name: raw, dtype: object"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 基于词袋模型进行情感分析\n",
+ "# 将之前用jieba分好的列表转化为空格分隔的字符串,以方便使用词袋\n",
+ "df_cleantxt = df_cut.apply(lambda x: \" \".join(x))\n",
+ "\n",
+ "df_cleantxt.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<20582x12014 sparse matrix of type ''\n",
+ "\twith 513919 stored elements in Compressed Sparse Row format>"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "countvec = CountVectorizer(min_df = 5) # 特征向量,出现5次以上的才纳入\n",
+ "# 转化为词频矩阵\n",
+ "wordmtx = countvec.fit_transform(df_cleantxt)\n",
+ "\n",
+ "wordmtx # 一个稀疏矩阵对象,无法直接显示"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LibSVM]"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "SVC(verbose=True)"
+ ]
+ },
+ "execution_count": 86,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 按照7:3的比例生成训练集和测试集,并使用SVM进行建模\n",
+ "x_train, x_test, y_train, y_test = train_test_split(wordmtx, df.y, test_size=0.3, random_state=0) # 指定random_state可以确保每次分割出的训练集和测试集相同\n",
+ "# 创建分类器,使用径向基核函数作为核函数\n",
+ "clf = SVC(kernel=\"rbf\", verbose=True)\n",
+ "# 训练分类器(耗时很长,慎点)\n",
+ "clf.fit(x_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "该分类器的准确度为: 0.9551606857777469\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Accuracy measured on the training set (optimistic); the test-set report is in the next cell\n",
+ "print(\"该分类器的准确度为:\", clf.score(x_train, y_train))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.87 0.89 0.88 3047\n",
+ " 1 0.89 0.87 0.88 3128\n",
+ "\n",
+ " accuracy 0.88 6175\n",
+ " macro avg 0.88 0.88 0.88 6175\n",
+ "weighted avg 0.88 0.88 0.88 6175\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 支持向量机的评估报告\n",
+ "# 四个列名分别为:精确率、召回率、F1值、支持数量\n",
+ "# 三个行名分别为:准确率、宏平均、加权平均\n",
+ "print(classification_report(y_test, clf.predict(x_test)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\n",
+ "正向\n",
+ " 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\n",
+ "负向\n",
+ "宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\n",
+ "正向\n",
+ "五星好评,分期付清\n",
+ "负向\n",
+ "对得起我们吗?rnm,退钱!\n",
+ "负向\n"
+ ]
+ }
+ ],
+ "source": [
+ "def bow_pred(string): \n",
+ " global clf, countvec\n",
+ " model = clf\n",
+ " # 数据转换为词频矩阵\n",
+ " words = \" \".join(jieba.cut(string))\n",
+ " words_vecs = countvec.transform([words])\n",
+ " # 预测并输出\n",
+ " result = model.predict(words_vecs)\n",
+ " if result[0] == 1:\n",
+ " print(\"正向\")\n",
+ " else:\n",
+ " print(\"负向\")\n",
+ "\n",
+ "\n",
+ "# 使用上述模型进行预测\n",
+ "for comment in test_comment:\n",
+ " print(comment)\n",
+ " bow_pred(comment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(6274592, 9394570)"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 基于Word2Vec的情感分析\n",
+ "# 按照7:3的比例生成训练集和测试集\n",
+ "x_train, x_test, y_train, y_test = train_test_split(df_cut, df.y, test_size=0.3, random_state=0)\n",
+ "# 初始化word2vec模型和词表\n",
+ "n_dim = 300 # 指定向量维度\n",
+ "w2vmodel = Word2Vec(vector_size=n_dim, min_count=10) # 出现10次以上的才纳入\n",
+ "# 生成词表\n",
+ "w2vmodel.build_vocab(x_train)\n",
+ "w2vmodel.train(x_train, total_examples=w2vmodel.corpus_count, epochs = 10) # 训练模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " ... | \n",
+ " 290 | \n",
+ " 291 | \n",
+ " 292 | \n",
+ " 293 | \n",
+ " 294 | \n",
+ " 295 | \n",
+ " 296 | \n",
+ " 297 | \n",
+ " 298 | \n",
+ " 299 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.287859 | \n",
+ " 0.211170 | \n",
+ " -0.002694 | \n",
+ " 0.015414 | \n",
+ " -0.130773 | \n",
+ " -0.252820 | \n",
+ " -0.173765 | \n",
+ " 0.019247 | \n",
+ " 0.139526 | \n",
+ " 0.084161 | \n",
+ " ... | \n",
+ " -0.174484 | \n",
+ " 0.183404 | \n",
+ " -0.012513 | \n",
+ " -0.207201 | \n",
+ " 0.414504 | \n",
+ " 0.289306 | \n",
+ " 0.317440 | \n",
+ " -0.280071 | \n",
+ " 0.093196 | \n",
+ " 0.119094 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.125504 | \n",
+ " 0.210281 | \n",
+ " 0.242841 | \n",
+ " 0.152100 | \n",
+ " -0.118140 | \n",
+ " -0.105682 | \n",
+ " 0.000601 | \n",
+ " 0.091197 | \n",
+ " 0.157458 | \n",
+ " 0.128822 | \n",
+ " ... | \n",
+ " -0.048829 | \n",
+ " 0.089328 | \n",
+ " -0.048414 | \n",
+ " -0.100801 | \n",
+ " 0.153824 | \n",
+ " 0.320274 | \n",
+ " 0.117689 | \n",
+ " -0.306226 | \n",
+ " 0.321337 | \n",
+ " -0.032039 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.208149 | \n",
+ " 0.200255 | \n",
+ " 0.028664 | \n",
+ " 0.096052 | \n",
+ " 0.157904 | \n",
+ " 0.190969 | \n",
+ " -0.003578 | \n",
+ " 0.351330 | \n",
+ " -0.182146 | \n",
+ " 0.020400 | \n",
+ " ... | \n",
+ " -0.055078 | \n",
+ " 0.228487 | \n",
+ " 0.066214 | \n",
+ " 0.011649 | \n",
+ " -0.027820 | \n",
+ " 0.206249 | \n",
+ " -0.050040 | \n",
+ " -0.125792 | \n",
+ " 0.412140 | \n",
+ " -0.081710 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.390679 | \n",
+ " 0.235006 | \n",
+ " 0.279163 | \n",
+ " 0.160066 | \n",
+ " 0.273807 | \n",
+ " -0.093850 | \n",
+ " -0.012255 | \n",
+ " 0.124318 | \n",
+ " 0.116129 | \n",
+ " 0.007561 | \n",
+ " ... | \n",
+ " -0.092300 | \n",
+ " -0.054543 | \n",
+ " -0.191227 | \n",
+ " -0.123319 | \n",
+ " 0.179981 | \n",
+ " 0.214038 | \n",
+ " 0.072776 | \n",
+ " -0.226328 | \n",
+ " 0.267805 | \n",
+ " -0.407456 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.226386 | \n",
+ " 0.250651 | \n",
+ " -0.100598 | \n",
+ " -0.016630 | \n",
+ " -0.165644 | \n",
+ " -0.165641 | \n",
+ " -0.108314 | \n",
+ " 0.047699 | \n",
+ " 0.030386 | \n",
+ " 0.061525 | \n",
+ " ... | \n",
+ " -0.093842 | \n",
+ " 0.121937 | \n",
+ " 0.224285 | \n",
+ " -0.055072 | \n",
+ " 0.381097 | \n",
+ " 0.108929 | \n",
+ " 0.158639 | \n",
+ " 0.004664 | \n",
+ " -0.072203 | \n",
+ " -0.276887 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 300 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 0.287859 0.211170 -0.002694 0.015414 -0.130773 -0.252820 -0.173765 \n",
+ "1 0.125504 0.210281 0.242841 0.152100 -0.118140 -0.105682 0.000601 \n",
+ "2 0.208149 0.200255 0.028664 0.096052 0.157904 0.190969 -0.003578 \n",
+ "3 0.390679 0.235006 0.279163 0.160066 0.273807 -0.093850 -0.012255 \n",
+ "4 0.226386 0.250651 -0.100598 -0.016630 -0.165644 -0.165641 -0.108314 \n",
+ "\n",
+ " 7 8 9 ... 290 291 292 293 \\\n",
+ "0 0.019247 0.139526 0.084161 ... -0.174484 0.183404 -0.012513 -0.207201 \n",
+ "1 0.091197 0.157458 0.128822 ... -0.048829 0.089328 -0.048414 -0.100801 \n",
+ "2 0.351330 -0.182146 0.020400 ... -0.055078 0.228487 0.066214 0.011649 \n",
+ "3 0.124318 0.116129 0.007561 ... -0.092300 -0.054543 -0.191227 -0.123319 \n",
+ "4 0.047699 0.030386 0.061525 ... -0.093842 0.121937 0.224285 -0.055072 \n",
+ "\n",
+ " 294 295 296 297 298 299 \n",
+ "0 0.414504 0.289306 0.317440 -0.280071 0.093196 0.119094 \n",
+ "1 0.153824 0.320274 0.117689 -0.306226 0.321337 -0.032039 \n",
+ "2 -0.027820 0.206249 -0.050040 -0.125792 0.412140 -0.081710 \n",
+ "3 0.179981 0.214038 0.072776 -0.226328 0.267805 -0.407456 \n",
+ "4 0.381097 0.108929 0.158639 0.004664 -0.072203 -0.276887 \n",
+ "\n",
+ "[5 rows x 300 columns]"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def m_avgvec(words, w2vmodel):\n",
+ " \"\"\"\n",
+ " 用各个词向量直接平均的方式生成整句对应的向量\n",
+ " \"\"\"\n",
+ " return pd.DataFrame([w2vmodel.wv[w] for w in words if w in w2vmodel.wv]).agg(\"mean\")\n",
+ "\n",
+ "\n",
+ "# 生成建模用矩阵(耗时很长,慎点)\n",
+ "train_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_train])\n",
+ "\n",
+ "train_vecs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LibSVM]"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "SVC(verbose=True)"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 用转换后的矩阵拟合SVM模型\n",
+ "clf2 = SVC(kernel=\"rbf\", verbose=True)\n",
+ "# 训练分类器(耗时很长,慎点)\n",
+ "clf2.fit(train_vecs, y_train)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "该分类器的准确度为: 0.8936628028041924\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Accuracy measured on the training set (optimistic); test-set metrics are reported further below\n",
+ "print(\"该分类器的准确度为:\", clf2.score(train_vecs, y_train))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 生成测试用矩阵(耗时很长,慎点)\n",
+ "test_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_test])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.87 0.87 0.87 3047\n",
+ " 1 0.87 0.87 0.87 3128\n",
+ "\n",
+ " accuracy 0.87 6175\n",
+ " macro avg 0.87 0.87 0.87 6175\n",
+ "weighted avg 0.87 0.87 0.87 6175\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 支持向量机的评估报告\n",
+ "print(classification_report(y_test, clf2.predict(test_vecs)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\n",
+ "正向\n",
+ " 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\n",
+ "负向\n",
+ "宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\n",
+ "正向\n",
+ "五星好评,分期付清\n",
+ "正向\n",
+ "对得起我们吗?rnm,退钱!\n",
+ "负向\n"
+ ]
+ }
+ ],
+ "source": [
+ "def w2v_pred(string):\n",
+ " global clf2, w2vmodel\n",
+ " # Segment with jieba first (the model was trained on jieba tokens), then average the word vectors\n",
+ " vecs = pd.DataFrame([m_avgvec(jieba.lcut(string), w2vmodel)])\n",
+ " # 预测并输出\n",
+ " result = clf2.predict(vecs)\n",
+ " if result[0] == 1:\n",
+ " print(\"正向\")\n",
+ " else:\n",
+ " print(\"负向\") \n",
+ "\n",
+ "\n",
+ "# 使用上述模型进行预测\n",
+ "for comment in test_comment:\n",
+ " print(comment)\n",
+ " w2v_pred(comment)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}