运行购物评论的情感分析的主程序

Signed-off-by: 吴沂钊 <13190667+Yizhao_Wu4926@user.noreply.gitee.com>
This commit is contained in:
吴沂钊
2023-07-15 14:47:46 +00:00
committed by Gitee
parent 4932de7a3f
commit da65298f86
+828
View File
@@ -0,0 +1,828 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"# 导入需要的包\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import re\n",
"import jieba\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import classification_report\n",
"from gensim.models.word2vec import Word2Vec\n"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>raw</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>感觉用这个手机下载MP3听很方便,用T-Flash卡装上歌,就可以当一个小MP3用了,很不错...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>外观美观,速度也不错。上面一排触摸键挺实用。应该对得起这个价格。当然再降点大家肯定也不反对。...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>我刚拿到书,也没有仔细阅读,就是粗粗的翻了点,觉得还行。里面是蓝黑两种颜色的,有些单词的下面...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>对于二胡曲的精选,应该出版一个系列套装,可分别出售,单独购买。精品二胡曲是有年代和阶段性的,...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>用了一年半的 e680 终于送小偷了。刚刚买了一台e850,用了三天,说说我自己的感受。1...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" raw y\n",
"0 感觉用这个手机下载MP3听很方便,用T-Flash卡装上歌,就可以当一个小MP3用了,很不错... 1\n",
"1 外观美观,速度也不错。上面一排触摸键挺实用。应该对得起这个价格。当然再降点大家肯定也不反对。... 1\n",
"2 我刚拿到书,也没有仔细阅读,就是粗粗的翻了点,觉得还行。里面是蓝黑两种颜色的,有些单词的下面... 1\n",
"3 对于二胡曲的精选,应该出版一个系列套装,可分别出售,单独购买。精品二胡曲是有年代和阶段性的,... 1\n",
"4 用了一年半的 e680 终于送小偷了。刚刚买了一台e850,用了三天,说说我自己的感受。1:... 1"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 读入数据\n",
"df_pos = pd.read_excel(r\"data/购物评论.xlsx\", sheet_name=\"正向\", header=None)\n",
"df_pos[\"y\"] = 1\n",
"df_neg = pd.read_excel(r\"data/购物评论.xlsx\", sheet_name=\"负向\", header=None)\n",
"df_neg[\"y\"] = 0\n",
"# 将正样本和负样本拼接在一起\n",
"df = df_pos.append(df_neg, ignore_index=True)\n",
"df.columns = [\"raw\", \"y\"]\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [感觉, 用, 这个, 手机, 下载, MP3, 听, 很, 方便, , 用, T, -,...\n",
"1 [外观, 美观, , 速度, 也, 不错, 。, 上面, 一排, 触摸, 键, 挺, 实用...\n",
"2 [我刚, 拿到, 书, ,, 也, 没有, 仔细阅读, ,, 就是, 粗粗, 的, 翻, 了...\n",
"3 [对于, 二胡曲, 的, 精选, ,, 应该, 出版, 一个系列, 套装, , 可, 分别...\n",
"4 [用, 了, 一年, 半, 的, , e680, , 终于, 送, 小偷, 了, 。, ...\n",
"Name: raw, dtype: object"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 数据清洗\n",
"# 由于有些情感词本身也是停用词。因此这里仅去掉换行和空字符,不做其他的清理工作,以保留情感词。\n",
"df_cut = df[\"raw\"].apply(lambda x: re.sub('\\n', '', x))\n",
"# 将清洗后的文本用jieba.lcut转换为list形式\n",
"df_cut = df_cut.apply(jieba.lcut)\n",
"\n",
"df_cut.head()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"# 基于词典的情感分析\n",
"# 读入并创建情感词典\n",
"sentiment_dict = defaultdict(float) # 方便处理不存在于词典中的关键字\n",
"# 使用with open以节约内存\n",
"with open(r\"data\\BosonNLP_sentiment_score.txt\", \"r\", encoding=\"utf-8\") as sen_file:\n",
" for line in sen_file:\n",
" try:\n",
" key, value = line.split(\" \") # 直接运行会有奇怪的问题出现\n",
" except Exception:\n",
" # 直接跳过有问题的行\n",
" pass\n",
" else:\n",
" sentiment_dict[key] = float(value) # 正常情况"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 21.581138\n",
"1 14.952362\n",
"2 6.245357\n",
"3 30.980969\n",
"4 36.519372\n",
"Name: raw, dtype: float64"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 以最简单的直接匹配的方式进行情感打分\n",
"def get_score(words):\n",
" return sum(sentiment_dict[word] for word in words)\n",
"\n",
"\n",
"y = df_cut.apply(get_score)\n",
"\n",
"y.head()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"该模型的准确率为: 0.6764648722184433\n"
]
}
],
"source": [
"# 根据分数>0为正向、分数<0为负向判定准确率\n",
"precise = lambda x: (df[\"y\"][x] == 1 and score[x] > 0) or (df[\"y\"][x] == 0 and score[x] <= 0)\n",
"# 准确度评估\n",
"print(\"该模型的准确度为:\", sum(1 for i in range(len(df)) if precise(i)) / len(df))"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\n",
"正向\n",
" 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\n",
"正向\n",
"宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\n",
"正向\n",
"五星好评,分期付清\n",
"正向\n",
"对得起我们吗?rnm,退钱!\n",
"负向\n"
]
}
],
"source": [
"def score_pred(string):\n",
" words = \" \".join(jieba.cut(string))\n",
" # 预测并输出\n",
" result = 1 if get_score(string) > 0 else 0\n",
" if result == 1:\n",
" print(\"正向\")\n",
" else:\n",
" print(\"负向\")\n",
"\n",
"\n",
"# 使用上述模型进行预测\n",
"test_comment = [\n",
" \"我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\",\n",
" \" 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\",\n",
" \"宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\",\n",
" \"五星好评,分期付清\",\n",
" \"对得起我们吗?rnm,退钱!\"\n",
"]\n",
"for comment in test_comment:\n",
" print(comment)\n",
" score_pred(comment)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 感觉 用 这个 手机 下载 MP3 听 很 方便 , 用 T - Flash 卡装 上 歌 ...\n",
"1 外观 美观 , 速度 也 不错 。 上面 一排 触摸 键 挺 实用 。 应该 对得起 这个 ...\n",
"2 我刚 拿到 书 , 也 没有 仔细阅读 , 就是 粗粗 的 翻 了 点 , 觉得 还 行 。...\n",
"3 对于 二胡曲 的 精选 , 应该 出版 一个系列 套装 , 可 分别 出售 , 单独 购买 ...\n",
"4 用 了 一年 半 的 e680 终于 送 小偷 了 。 刚刚 买 了 一台 e850...\n",
"Name: raw, dtype: object"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 基于词袋模型进行情感分析\n",
"# 将之前用jieba分好的列表转化为空格分隔的字符串,以方便使用词袋\n",
"df_cleantxt = df_cut.apply(lambda x: \" \".join(x))\n",
"\n",
"df_cleantxt.head()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<20582x12014 sparse matrix of type '<class 'numpy.int64'>'\n",
"\twith 513919 stored elements in Compressed Sparse Row format>"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"countvec = CountVectorizer(min_df = 5) # 特征向量,出现5次以上的才纳入\n",
"# 转化为词频矩阵\n",
"wordmtx = countvec.fit_transform(df_cleantxt)\n",
"\n",
"wordmtx # 一个稀疏矩阵对象,无法直接显示"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LibSVM]"
]
},
{
"data": {
"text/plain": [
"SVC(verbose=True)"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 按照7:3的比例生成训练集和测试集,并使用SVM进行建模\n",
"x_train, x_test, y_train, y_test = train_test_split(wordmtx, df.y, test_size=0.3, random_state=0) # 指定random_state可以确保每次分割出的训练集和测试集相同\n",
"# 创建分类器,使用径向基核函数作为核函数\n",
"clf = SVC(kernel=\"rbf\", verbose=True)\n",
"# 训练分类器(耗时很长,慎点)\n",
"clf.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"该分类器的准确度为: 0.9551606857777469\n"
]
}
],
"source": [
"# 准确度评估\n",
"print(\"该分类器的准确度为:\", clf.score(x_train, y_train))"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.87 0.89 0.88 3047\n",
" 1 0.89 0.87 0.88 3128\n",
"\n",
" accuracy 0.88 6175\n",
" macro avg 0.88 0.88 0.88 6175\n",
"weighted avg 0.88 0.88 0.88 6175\n",
"\n"
]
}
],
"source": [
"# 支持向量机的评估报告\n",
"# 四个列名分别为:精确率、召回率、F1值、支持数量\n",
"# 三个行名分别为:准确率、宏平均、加权平均\n",
"print(classification_report(y_test, clf.predict(x_test)))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\n",
"正向\n",
" 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\n",
"负向\n",
"宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\n",
"正向\n",
"五星好评,分期付清\n",
"负向\n",
"对得起我们吗?rnm,退钱!\n",
"负向\n"
]
}
],
"source": [
"def bow_pred(string): \n",
" global clf, countvec\n",
" model = clf\n",
" # 数据转换为词频矩阵\n",
" words = \" \".join(jieba.cut(string))\n",
" words_vecs = countvec.transform([words])\n",
" # 预测并输出\n",
" result = model.predict(words_vecs)\n",
" if result[0] == 1:\n",
" print(\"正向\")\n",
" else:\n",
" print(\"负向\")\n",
"\n",
"\n",
"# 使用上述模型进行预测\n",
"for comment in test_comment:\n",
" print(comment)\n",
" bow_pred(comment)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6274592, 9394570)"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 基于Word2Vec的情感分析\n",
"# 按照7:3的比例生成训练集和测试集\n",
"x_train, x_test, y_train, y_test = train_test_split(df_cut, df.y, test_size=0.3, random_state=0)\n",
"# 初始化word2vec模型和词表\n",
"n_dim = 300 # 指定向量维度\n",
"w2vmodel = Word2Vec(vector_size=n_dim, min_count=10) # 出现10次以上的才纳入\n",
"# 生成词表\n",
"w2vmodel.build_vocab(x_train)\n",
"w2vmodel.train(x_train, total_examples=w2vmodel.corpus_count, epochs = 10) # 训练模型"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>290</th>\n",
" <th>291</th>\n",
" <th>292</th>\n",
" <th>293</th>\n",
" <th>294</th>\n",
" <th>295</th>\n",
" <th>296</th>\n",
" <th>297</th>\n",
" <th>298</th>\n",
" <th>299</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.287859</td>\n",
" <td>0.211170</td>\n",
" <td>-0.002694</td>\n",
" <td>0.015414</td>\n",
" <td>-0.130773</td>\n",
" <td>-0.252820</td>\n",
" <td>-0.173765</td>\n",
" <td>0.019247</td>\n",
" <td>0.139526</td>\n",
" <td>0.084161</td>\n",
" <td>...</td>\n",
" <td>-0.174484</td>\n",
" <td>0.183404</td>\n",
" <td>-0.012513</td>\n",
" <td>-0.207201</td>\n",
" <td>0.414504</td>\n",
" <td>0.289306</td>\n",
" <td>0.317440</td>\n",
" <td>-0.280071</td>\n",
" <td>0.093196</td>\n",
" <td>0.119094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.125504</td>\n",
" <td>0.210281</td>\n",
" <td>0.242841</td>\n",
" <td>0.152100</td>\n",
" <td>-0.118140</td>\n",
" <td>-0.105682</td>\n",
" <td>0.000601</td>\n",
" <td>0.091197</td>\n",
" <td>0.157458</td>\n",
" <td>0.128822</td>\n",
" <td>...</td>\n",
" <td>-0.048829</td>\n",
" <td>0.089328</td>\n",
" <td>-0.048414</td>\n",
" <td>-0.100801</td>\n",
" <td>0.153824</td>\n",
" <td>0.320274</td>\n",
" <td>0.117689</td>\n",
" <td>-0.306226</td>\n",
" <td>0.321337</td>\n",
" <td>-0.032039</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.208149</td>\n",
" <td>0.200255</td>\n",
" <td>0.028664</td>\n",
" <td>0.096052</td>\n",
" <td>0.157904</td>\n",
" <td>0.190969</td>\n",
" <td>-0.003578</td>\n",
" <td>0.351330</td>\n",
" <td>-0.182146</td>\n",
" <td>0.020400</td>\n",
" <td>...</td>\n",
" <td>-0.055078</td>\n",
" <td>0.228487</td>\n",
" <td>0.066214</td>\n",
" <td>0.011649</td>\n",
" <td>-0.027820</td>\n",
" <td>0.206249</td>\n",
" <td>-0.050040</td>\n",
" <td>-0.125792</td>\n",
" <td>0.412140</td>\n",
" <td>-0.081710</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.390679</td>\n",
" <td>0.235006</td>\n",
" <td>0.279163</td>\n",
" <td>0.160066</td>\n",
" <td>0.273807</td>\n",
" <td>-0.093850</td>\n",
" <td>-0.012255</td>\n",
" <td>0.124318</td>\n",
" <td>0.116129</td>\n",
" <td>0.007561</td>\n",
" <td>...</td>\n",
" <td>-0.092300</td>\n",
" <td>-0.054543</td>\n",
" <td>-0.191227</td>\n",
" <td>-0.123319</td>\n",
" <td>0.179981</td>\n",
" <td>0.214038</td>\n",
" <td>0.072776</td>\n",
" <td>-0.226328</td>\n",
" <td>0.267805</td>\n",
" <td>-0.407456</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.226386</td>\n",
" <td>0.250651</td>\n",
" <td>-0.100598</td>\n",
" <td>-0.016630</td>\n",
" <td>-0.165644</td>\n",
" <td>-0.165641</td>\n",
" <td>-0.108314</td>\n",
" <td>0.047699</td>\n",
" <td>0.030386</td>\n",
" <td>0.061525</td>\n",
" <td>...</td>\n",
" <td>-0.093842</td>\n",
" <td>0.121937</td>\n",
" <td>0.224285</td>\n",
" <td>-0.055072</td>\n",
" <td>0.381097</td>\n",
" <td>0.108929</td>\n",
" <td>0.158639</td>\n",
" <td>0.004664</td>\n",
" <td>-0.072203</td>\n",
" <td>-0.276887</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 300 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 0.287859 0.211170 -0.002694 0.015414 -0.130773 -0.252820 -0.173765 \n",
"1 0.125504 0.210281 0.242841 0.152100 -0.118140 -0.105682 0.000601 \n",
"2 0.208149 0.200255 0.028664 0.096052 0.157904 0.190969 -0.003578 \n",
"3 0.390679 0.235006 0.279163 0.160066 0.273807 -0.093850 -0.012255 \n",
"4 0.226386 0.250651 -0.100598 -0.016630 -0.165644 -0.165641 -0.108314 \n",
"\n",
" 7 8 9 ... 290 291 292 293 \\\n",
"0 0.019247 0.139526 0.084161 ... -0.174484 0.183404 -0.012513 -0.207201 \n",
"1 0.091197 0.157458 0.128822 ... -0.048829 0.089328 -0.048414 -0.100801 \n",
"2 0.351330 -0.182146 0.020400 ... -0.055078 0.228487 0.066214 0.011649 \n",
"3 0.124318 0.116129 0.007561 ... -0.092300 -0.054543 -0.191227 -0.123319 \n",
"4 0.047699 0.030386 0.061525 ... -0.093842 0.121937 0.224285 -0.055072 \n",
"\n",
" 294 295 296 297 298 299 \n",
"0 0.414504 0.289306 0.317440 -0.280071 0.093196 0.119094 \n",
"1 0.153824 0.320274 0.117689 -0.306226 0.321337 -0.032039 \n",
"2 -0.027820 0.206249 -0.050040 -0.125792 0.412140 -0.081710 \n",
"3 0.179981 0.214038 0.072776 -0.226328 0.267805 -0.407456 \n",
"4 0.381097 0.108929 0.158639 0.004664 -0.072203 -0.276887 \n",
"\n",
"[5 rows x 300 columns]"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def m_avgvec(words, w2vmodel):\n",
" \"\"\"\n",
" 用各个词向量直接平均的方式生成整句对应的向量\n",
" \"\"\"\n",
" return pd.DataFrame([w2vmodel.wv[w] for w in words if w in w2vmodel.wv]).agg(\"mean\")\n",
"\n",
"\n",
"# 生成建模用矩阵(耗时很长,慎点)\n",
"train_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_train])\n",
"\n",
"train_vecs.head()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LibSVM]"
]
},
{
"data": {
"text/plain": [
"SVC(verbose=True)"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 用转换后的矩阵拟合SVM模型\n",
"clf2 = SVC(kernel=\"rbf\", verbose=True)\n",
"# 训练分类器(耗时很长,慎点)\n",
"clf2.fit(train_vecs, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"该分类器的准确度为: 0.8936628028041924\n"
]
}
],
"source": [
"# 准确度评估\n",
"print(\"该分类器的准确度为:\", clf2.score(train_vecs, y_train))"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"# 生成测试用矩阵(耗时很长,慎点)\n",
"test_vecs = pd.DataFrame([m_avgvec(s, w2vmodel) for s in x_test])"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.87 0.87 0.87 3047\n",
" 1 0.87 0.87 0.87 3128\n",
"\n",
" accuracy 0.87 6175\n",
" macro avg 0.87 0.87 0.87 6175\n",
"weighted avg 0.87 0.87 0.87 6175\n",
"\n"
]
}
],
"source": [
"# 支持向量机的评估报告\n",
"print(classification_report(y_test, clf2.predict(test_vecs)))"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"我谢谢蒙牛!多一层体贴,多一层爱护!双重保护。\n",
"正向\n",
" 蒙牛是店大欺客 至少态度比蒙牛好~~后期赶紧整改!\n",
"负向\n",
"宝贝收到了,孩子爱吃,敏感肌能用,很启发大脑,提升睡眠质量,五星好评。\n",
"正向\n",
"五星好评,分期付清\n",
"正向\n",
"对得起我们吗?rnm,退钱!\n",
"负向\n"
]
}
],
"source": [
"def w2v_pred(string):\n",
" global clf2, w2vmodel\n",
" # 将句子转换为矩阵形式\n",
" vecs = pd.DataFrame([m_avgvec(string, w2vmodel)])\n",
" # 预测并输出\n",
" result = clf2.predict(vecs)\n",
" if result[0] == 1:\n",
" print(\"正向\")\n",
" else:\n",
" print(\"负向\") \n",
"\n",
"\n",
"# 使用上述模型进行预测\n",
"for comment in test_comment:\n",
" print(comment)\n",
" w2v_pred(comment)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}