diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/.ipynb_checkpoints/垃圾短信识别-checkpoint.ipynb b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/.ipynb_checkpoints/垃圾短信识别-checkpoint.ipynb new file mode 100644 index 0000000..30d0c08 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/.ipynb_checkpoints/垃圾短信识别-checkpoint.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "198b28eb", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import jieba" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4cdb7b90", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../dataset/message80W1.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../dataset/message80W1.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m, header\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n", + "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.._deprecate_kwarg..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File 
\u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/util/_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[1;32m 326\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 327\u001b[0m msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[1;32m 328\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m 329\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 935\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 936\u001b[0m dialect,\n\u001b[1;32m 937\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 946\u001b[0m 
defaults\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelimiter\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[1;32m 947\u001b[0m )\n\u001b[1;32m 948\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 950\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n", + "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 602\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 604\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 605\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[1;32m 607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 608\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1439\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1441\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1442\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_engine(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine)\n", + "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1733\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1734\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1735\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m get_handle(\n\u001b[1;32m 1736\u001b[0m f,\n\u001b[1;32m 1737\u001b[0m mode,\n\u001b[1;32m 1738\u001b[0m encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1739\u001b[0m compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1740\u001b[0m memory_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmemory_map\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m 1741\u001b[0m 
is_text\u001b[38;5;241m=\u001b[39mis_text,\n\u001b[1;32m 1742\u001b[0m errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding_errors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 1743\u001b[0m storage_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstorage_options\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m 1744\u001b[0m )\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 851\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 852\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 853\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 856\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 857\u001b[0m handle,\n\u001b[1;32m 858\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 859\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 860\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 861\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 862\u001b[0m )\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 864\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 865\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../dataset/message80W1.csv'" + ] + } + ], + "source": [ + "df = pd.read_csv(\"../dataset/message80W1.csv\", header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1135dc61", + "metadata": {}, + "outputs": [], + "source": [ + "N_pos, N_neg = 10000, 10000\n", + "np.random.seed(42)\n", + "df_positive = df[df[1]==0]\n", + "df_negative = df[df[1]==1]\n", + "# def sample_df(df, N):\n", + "# indexs = np.random.choice(np.arange(len(df)), N)\n", + "# return df.iloc[indexs,2]\n", + "# corpus_pos = sample_df(df_positive, N_pos)\n", + "# corpus_neg = sample_df(df_negative, N_neg)\n", + "corpus_pos = df_positive.sample(n=N_pos).iloc[:,2]\n", + "corpus_neg = df_negative.sample(n=N_neg).iloc[:,2]\n", + "corpus = np.concatenate([corpus_pos, corpus_neg]).reshape(-1,1)\n", + "y = np.concatenate([np.full(N_pos, 1), np.full(N_neg, 0)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a329a32f", + "metadata": {}, + "outputs": [], 
+ "source": [ + "corpus_cut = np.apply_along_axis(lambda x: ' '.join(jieba.cut(x[0])), axis=1, arr=corpus)\n", + "corpus_cut" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45c62d11", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "\n", + "with open(\"../dataset/stopword.txt\", encoding='gbk') as file:\n", + " stopwords = file.read().split()\n", + "\n", + "# 频率向量化,token_pattern不取单个词作为特征的问题: https://blog.csdn.net/xxzhix/article/details/82685372\n", + "vectorizer = CountVectorizer(token_pattern='[\\u4e00-\\u9fa5_a-zA-Z0-9]{1,}')\n", + "X = vectorizer.fit_transform(corpus_cut)\n", + "tfidf = TfidfTransformer()\n", + "X = tfidf.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1c2b396", + "metadata": {}, + "outputs": [], + "source": [ + "def to_vector(X):\n", + " X = np.array(X).reshape(-1, 1)\n", + " cut = np.apply_along_axis(lambda x: ' '.join(jieba.cut(x[0])), axis=1, arr=X)\n", + " vector = vectorizer.transform(cut)\n", + " return tfidf.transform(vector)\n", + "sentence = '现在看个电脑也得戴眼镜了我正在光速变瞎'\n", + "vector = to_vector([sentence])\n", + "print(vector.shape)\n", + "idxs = vector.nonzero()[1]\n", + "[vectorizer.get_feature_names_out()[i] for i in idxs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f2ed68d", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d7f4f98", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"训练集{X_train.shape}, 测试集{X_test.shape}\")\n", + "print(f\"训练集正例{sum(y_train==1)}, 测试集正例{sum(y_test==1)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea14ac0", + "metadata": {}, + "outputs": [], + "source": [ + "from 
sklearn.naive_bayes import GaussianNB\n", + "from sklearn.metrics import accuracy_score\n", + "gnb = GaussianNB()\n", + "print(\"拟合中......\")\n", + "gnb.fit(X_train.toarray(), y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5885e1b3", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"预测中......\")\n", + "pred_train = gnb.predict(X_train.toarray())\n", + "pred_test = gnb.predict(X_test.toarray())\n", + "acc_train = accuracy_score(y_train, pred_train)\n", + "acc_test = accuracy_score(y_test, pred_test)\n", + "print(f\"准确率 train/test: {acc_train:.4f}/{acc_test:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3491efb8", + "metadata": {}, + "outputs": [], + "source": [ + "vectors = to_vector([\"尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳\",\n", + " \"秒杀价格8848,8848你值得拥有\",\n", + " \"有博主做过同类防晒霜的对比\",\n", + " \"一刀999\",\n", + " \"并夕夕\",\n", + " \"csc每天打游戏\",\n", + " \"今天年脑爆炸了,太刺激了\",\n", + " \"我收到了垃圾短信\",\n", + " \"尊敬的客户\",\n", + " ]).toarray()\n", + "gnb.predict(vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9251e6da", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/__pycache__/app.cpython-310.pyc b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/__pycache__/app.cpython-310.pyc new file mode 100644 index 0000000..1acb757 Binary files /dev/null and b/4. 
"""Flask front end for the online spam checker.

Serves a single page on "/" where a visitor submits a sentence; the sentence
is classified with ``gnb_model.make_predict`` and appended to a history table
rendered by ``templates/input.html``.
"""
import os

from flask import Flask, request, render_template
from gnb_model import make_predict

app = Flask(__name__)

# History of (sentence, verdict) pairs rendered on the page.
# NOTE: this is a module-level list, so it is shared by every client of this
# process and is lost on restart.
result = []


# NOTE: the view keeps its original name (it shadows the built-in ``input``)
# because Flask derives the endpoint name from the function name.
@app.route("/", methods=['GET', 'POST'])
def input():
    """Render the input page; on POST, optionally clear and extend the history.

    A POST containing a ``clean`` field empties the history first.  A
    non-empty ``sentence`` field is then classified and appended.  A missing
    ``sentence`` field is treated as empty instead of aborting with HTTP 400.
    """
    if request.method == 'POST':
        app.logger.debug("form data: %s", request.form)
        if "clean" in request.form:
            # Clear in place so the module-level list object stays the same.
            result.clear()
        sentence = request.form.get('sentence', '')
        if sentence:
            result.append((sentence, make_predict(sentence)))
    return render_template("input.html", result=result)


# main route to start with
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger (which
    # can execute arbitrary Python from the browser) must stay off unless it
    # is requested explicitly, e.g. FLASK_DEBUG=1 on a trusted machine.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true"}
    app.run(debug=debug, host='0.0.0.0')
"""Train a Gaussian naive-Bayes SMS spam classifier and expose ``make_predict``.

Importing this module reads the dataset, samples a balanced corpus, fits a
bag-of-words + TF-IDF + GaussianNB pipeline and prints the train/test
accuracy.  Rows whose column 1 equals 1 are trained with label 0, which
``make_predict`` reports as spam; rows whose column 1 equals 0 get label 1.
"""
from pathlib import Path

import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Anchored to this file rather than the current working directory, so the
# module can be imported from anywhere.  Resolves to the same "../../dataset"
# the original used when run from this script's own directory.
PATH_DATASET = Path(__file__).resolve().parents[2] / "dataset"
DELETE_STOPWORDS = False
# Matches runs of one or more CJK/word characters, so single-character tokens
# are kept as features: https://blog.csdn.net/xxzhix/article/details/82685372
TOKEN_PATTERN = '[\u4e00-\u9fa5_a-zA-Z0-9]{1,}'


def _segment(texts):
    """Return every text segmented by jieba and re-joined with single spaces.

    A plain list is built on purpose: ``np.apply_along_axis`` sizes its output
    dtype from the first result, which silently truncates every later string
    to the length of the first one.
    """
    return [' '.join(jieba.cut(text)) for text in texts]


def sample_df(df, N):
    """Return the column-2 values of ``N`` distinct, randomly chosen rows.

    Sampling is done without replacement so that the same message cannot end
    up in both the training and the test split.  Raises ``ValueError`` (from
    NumPy) when ``df`` has fewer than ``N`` rows.
    """
    indexs = np.random.choice(len(df), N, replace=False)
    return df.iloc[indexs, 2]


df = pd.read_csv(PATH_DATASET / "message80W1.csv", header=None)

N_pos, N_neg = 10000, 10000
df_positive = df[df[1] == 0]
df_negative = df[df[1] == 1]
np.random.seed(42)
corpus_pos = sample_df(df_positive, N_pos)
corpus_neg = sample_df(df_negative, N_neg)
corpus = np.concatenate([corpus_pos, corpus_neg]).reshape(-1, 1)
y = np.concatenate([np.full(N_pos, 1), np.full(N_neg, 0)])

# Object dtype keeps each segmented message at its full length.
corpus_cut = np.array(_segment(corpus[:, 0]), dtype=object)

with open(PATH_DATASET / "stopword.txt", encoding='gbk') as file:
    stopwords = file.read().split()

# Count vectorisation.  The same name is bound in both configurations; the
# stop-word list is only applied when DELETE_STOPWORDS is set.
vectorizer = CountVectorizer(
    token_pattern=TOKEN_PATTERN,
    stop_words=stopwords if DELETE_STOPWORDS else None,
)
X = vectorizer.fit_transform(corpus_cut)
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)


def to_vector(X, stopwords=False):
    """Convert one text or a sequence of texts into TF-IDF feature rows.

    Args:
        X: a string or a sequence of strings.
        stopwords: unused; kept so existing callers remain valid.

    Returns:
        The sparse matrix produced by the fitted ``tfidf`` transformer, one
        row per input text.
    """
    texts = np.array(X, dtype=object).reshape(-1)
    return tfidf.transform(vectorizer.transform(_segment(texts)))


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

gnb = GaussianNB()
print("拟合中......")
# GaussianNB is fitted on dense input here, hence the toarray() calls.
gnb.fit(X_train.toarray(), y_train)

print("预测中......")
pred_train = gnb.predict(X_train.toarray())
pred_test = gnb.predict(X_test.toarray())
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print(f"准确率 train/test: {acc_train:.4f}/{acc_test:.4f}")


def make_predict(string: str):
    """Classify a single message and return the verdict as display text.

    Returns "是垃圾" when the model predicts label 0 and "不是垃圾" otherwise.
    """
    vector = to_vector([string]).toarray()
    # predict() returns a one-element array; compare its scalar explicitly.
    return "是垃圾" if gnb.predict(vector)[0] == 0 else "不是垃圾"


if __name__ == '__main__':
    samples = (
        "尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳",
        "CSC喜欢打游戏",
        "一刀999",
        "你好",
        "尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳",
        "秒杀价格8848,8848你值得拥有",
        "有博主做过同类防晒霜的对比",
        "csc每天打游戏",
        "今天电脑爆炸了",
    )
    for sample in samples:
        print(make_predict(sample))

There is a form.

+
+

Name

+

Email

+

Phone number

+

+
+

HA!HA!HA!HA!

+ + \ No newline at end of file diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/hello.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/hello.html new file mode 100644 index 0000000..d1294b7 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/hello.html @@ -0,0 +1,14 @@ + + + + + Hello from Flask + + + {% if name %} +

Hello {{ name }}!

+ {% else %} +

Hello, World!

+ {% endif %} + + \ No newline at end of file diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/input.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/input.html new file mode 100644 index 0000000..89ce287 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/input.html @@ -0,0 +1,39 @@ + + +
+ + + CSC识别器 +
+ +
+

输入待识别文本:

+

+

+ +
+ {% if result|length >= 1 %} + + + + + + + + {% for sentence, predict in result %} + + + + + {% endfor %} + +
+ 历史信息 +
文本是/否为垃圾
{{ sentence }}{{ predict }}
+ {% endif %} + + diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/result_data.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/result_data.html new file mode 100644 index 0000000..8d217d7 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/online_spam_check/templates/result_data.html @@ -0,0 +1,26 @@ + + + + + + +

Your Details

+ + {% for key, value in result.items() %} + + + + + {% endfor %} +
{{ key }}{{ value }}
+ + \ No newline at end of file