在线垃圾邮件识别

Signed-off-by: wty-yy <13190706+wty-yy@user.noreply.gitee.com>
2023-07-13 11:52:41 +00:00
parent 9d8bd2c84c
commit 85edbe9cd7
10 changed files with 420 additions and 0 deletions
@@ -0,0 +1,216 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "198b28eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import jieba"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4cdb7b90",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '../dataset/message80W1.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../dataset/message80W1.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m, header\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    209\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    210\u001b[0m         kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/util/_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[1;32m    326\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m    327\u001b[0m         msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[1;32m    328\u001b[0m         \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m    329\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m    330\u001b[0m     )\n\u001b[0;32m--> 331\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    935\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    936\u001b[0m     dialect,\n\u001b[1;32m    937\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    946\u001b[0m     defaults\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelimiter\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m},\n\u001b[1;32m    947\u001b[0m )\n\u001b[1;32m    948\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 950\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    602\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    604\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 605\u001b[0m parser \u001b[38;5;241m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds)\n\u001b[1;32m    607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m    608\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1439\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1441\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1442\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_engine(f, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mengine)\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1733\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1734\u001b[0m         mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1735\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m get_handle(\n\u001b[1;32m   1736\u001b[0m     f,\n\u001b[1;32m   1737\u001b[0m     mode,\n\u001b[1;32m   1738\u001b[0m     encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m   1739\u001b[0m     compression\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m   1740\u001b[0m     memory_map\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmemory_map\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[1;32m   1741\u001b[0m     is_text\u001b[38;5;241m=\u001b[39mis_text,\n\u001b[1;32m   1742\u001b[0m     
errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoding_errors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m   1743\u001b[0m     storage_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstorage_options\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m),\n\u001b[1;32m   1744\u001b[0m )\n\u001b[1;32m   1745\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1746\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n",
+      "File \u001b[0;32m~/Programs/mambaforge/envs/tf/lib/python3.11/site-packages/pandas/io/common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    851\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    852\u001b[0m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    853\u001b[0m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    854\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m    855\u001b[0m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 856\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m    857\u001b[0m             handle,\n\u001b[1;32m    858\u001b[0m             ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m    859\u001b[0m             encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m    860\u001b[0m             errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m    861\u001b[0m             newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m    862\u001b[0m         )\n\u001b[1;32m    863\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    864\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m    865\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../dataset/message80W1.csv'"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"../dataset/message80W1.csv\", header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1135dc61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N_pos, N_neg = 10000, 10000\n",
+    "np.random.seed(42)\n",
+    "df_positive = df[df[1]==0]\n",
+    "df_negative = df[df[1]==1]\n",
+    "# def sample_df(df, N):\n",
+    "#     indexs = np.random.choice(np.arange(len(df)), N)\n",
+    "#     return df.iloc[indexs,2]\n",
+    "# corpus_pos = sample_df(df_positive, N_pos)\n",
+    "# corpus_neg = sample_df(df_negative, N_neg)\n",
+    "corpus_pos = df_positive.sample(n=N_pos).iloc[:,2]\n",
+    "corpus_neg = df_negative.sample(n=N_neg).iloc[:,2]\n",
+    "corpus = np.concatenate([corpus_pos, corpus_neg]).reshape(-1,1)\n",
+    "y = np.concatenate([np.full(N_pos, 1), np.full(N_neg, 0)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a329a32f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "corpus_cut = np.apply_along_axis(lambda x: ' '.join(jieba.cut(x[0])), axis=1, arr=corpus)\n",
+    "corpus_cut"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "45c62d11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+    "\n",
+    "with open(\"../dataset/stopword.txt\", encoding='gbk') as file:\n",
+    "    stopwords = file.read().split()\n",
+    "\n",
+    "# 频率向量化，token_pattern不取单个词作为特征的问题: https://blog.csdn.net/xxzhix/article/details/82685372\n",
+    "vectorizer = CountVectorizer(token_pattern='[\\u4e00-\\u9fa5_a-zA-Z0-9]{1,}')\n",
+    "X = vectorizer.fit_transform(corpus_cut)\n",
+    "tfidf = TfidfTransformer()\n",
+    "X = tfidf.fit_transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1c2b396",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def to_vector(X):\n",
+    "    X = np.array(X).reshape(-1, 1)\n",
+    "    cut = np.apply_along_axis(lambda x: ' '.join(jieba.cut(x[0])), axis=1, arr=X)\n",
+    "    vector = vectorizer.transform(cut)\n",
+    "    return tfidf.transform(vector)\n",
+    "sentence = '现在看个电脑也得戴眼镜了我正在光速变瞎'\n",
+    "vector = to_vector([sentence])\n",
+    "print(vector.shape)\n",
+    "idxs = vector.nonzero()[1]\n",
+    "[vectorizer.get_feature_names_out()[i] for i in idxs]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f2ed68d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d7f4f98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"训练集{X_train.shape}, 测试集{X_test.shape}\")\n",
+    "print(f\"训练集正例{sum(y_train==1)}, 测试集正例{sum(y_test==1)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ea14ac0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "gnb = GaussianNB()\n",
+    "print(\"拟合中......\")\n",
+    "gnb.fit(X_train.toarray(), y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5885e1b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"预测中......\")\n",
+    "pred_train = gnb.predict(X_train.toarray())\n",
+    "pred_test = gnb.predict(X_test.toarray())\n",
+    "acc_train = accuracy_score(y_train, pred_train)\n",
+    "acc_test = accuracy_score(y_test, pred_test)\n",
+    "print(f\"准确率 train/test: {acc_train:.4f}/{acc_test:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3491efb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectors = to_vector([\"尊敬的客上，感谢您一直的支持，亿美亿康美容部特在本月的x、x、x三天举办秒杀活动，现场更是优惠多多，开抢倒计时还有两天，欲抢从速！xx号艳艳\",\n",
+    "                     \"秒杀价格8848，8848你值得拥有\",\n",
+    "                     \"有博主做过同类防晒霜的对比\",\n",
+    "                     \"一刀999\",\n",
+    "                     \"并夕夕\",\n",
+    "                     \"csc每天打游戏\",\n",
+    "                     \"今天年脑爆炸了，太刺激了\",\n",
+    "                     \"我收到了垃圾短信\",\n",
+    "                     \"尊敬的客户\",\n",
+    "                    ]).toarray()\n",
+    "gnb.predict(vectors)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9251e6da",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,22 @@
+# Importing Necessary Modules
+from flask import Flask, request, render_template
+from gnb_model import make_predict
+app = Flask(__name__)
+
+result = []
+@app.route("/", methods=['GET', 'POST'])
+def input():
+	global result
+	if request.method == 'POST':
+		print(request.form)
+		if "clean" in request.form.keys():
+			result = []
+		sentence = request.form['sentence']
+		if len(sentence) != 0:
+			result.append((sentence, make_predict(sentence)))
+			# result.append((sentence, "不是"))
+	return render_template("input.html", result=result)
+
+# main route to start with
+if __name__ == '__main__':
+	app.run(debug=True, host='0.0.0.0')
@@ -0,0 +1,77 @@
+import numpy as np
+import pandas as pd
+import jieba
+from pathlib import Path
+PATH_DATASET = Path.cwd().joinpath("../../dataset")
+DELETE_STOPWORDS = False
+
+df = pd.read_csv(PATH_DATASET.joinpath("message80W1.csv"), header=None)
+
+N_pos, N_neg = 10000, 10000
+df_positive = df[df[1]==0]
+df_negative = df[df[1]==1]
+np.random.seed(42)
+def sample_df(df, N):
+    indexs = np.random.choice(np.arange(len(df)), N)
+    return df.iloc[indexs,2]
+corpus_pos = sample_df(df_positive, N_pos)
+corpus_neg = sample_df(df_negative, N_neg)
+# corpus_pos = df_positive.sample(n=N_pos, random_state=42).iloc[:,2]
+# corpus_neg = df_negative.sample(n=N_neg, random_state=42).iloc[:,2]
+corpus = np.concatenate([corpus_pos, corpus_neg]).reshape(-1,1)
+y = np.concatenate([np.full(N_pos, 1), np.full(N_neg, 0)])
+
+corpus_cut = np.apply_along_axis(lambda x: ' '.join(jieba.cut(x[0])), axis=1, arr=corpus)
+
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+
+with open(PATH_DATASET.joinpath("stopword.txt"), encoding='gbk') as file:
+    stopwords = file.read().split()
+
+# 频率向量化，token_pattern不取单个词作为特征的问题: https://blog.csdn.net/xxzhix/article/details/82685372
+if not DELETE_STOPWORDS:
+    vectorizer = CountVectorizer(token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}')
+else:
+    vectorizer_stopwords = CountVectorizer(token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}', stop_words=stopwords)
+X = vectorizer.fit_transform(corpus_cut)
+tfidf = TfidfTransformer()
+X = tfidf.fit_transform(X)
+
+def to_vector(X, stopwords=False):
+    X = np.array(X).reshape(-1, 1)
+    cut = np.apply_along_axis(lambda x: ' '.join(jieba.cut(x[0])), axis=1, arr=X)
+    vector = vectorizer.transform(cut)
+    return tfidf.transform(vector)
+
+from sklearn.model_selection import train_test_split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
+
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import accuracy_score
+gnb = GaussianNB()
+print("拟合中......")
+gnb.fit(X_train.toarray(), y_train)
+
+print("预测中......")
+pred_train = gnb.predict(X_train.toarray())
+pred_test = gnb.predict(X_test.toarray())
+acc_train = accuracy_score(y_train, pred_train)
+acc_test = accuracy_score(y_test, pred_test)
+print(f"准确率 train/test: {acc_train:.4f}/{acc_test:.4f}")
+
+def make_predict(string:str):
+    string = [string]
+    vector = to_vector(string).toarray()
+    ret = "是垃圾" if gnb.predict(vector) == 0 else "不是垃圾"
+    return ret
+
+if __name__ == '__main__':
+    print(make_predict("尊敬的客上，感谢您一直的支持，亿美亿康美容部特在本月的x、x、x三天举办秒杀活动，现场更是优惠多多，开抢倒计时还有两天，欲抢从速！xx号艳艳"))
+    print(make_predict("CSC喜欢打游戏"))
+    print(make_predict("一刀999"))
+    print(make_predict("你好"))
+    print(make_predict("尊敬的客上，感谢您一直的支持，亿美亿康美容部特在本月的x、x、x三天举办秒杀活动，现场更是优惠多多，开抢倒计时还有两天，欲抢从速！xx号艳艳"))
+    print(make_predict("秒杀价格8848，8848你值得拥有"))
+    print(make_predict("有博主做过同类防晒霜的对比"))
+    print(make_predict("csc每天打游戏"))
+    print(make_predict("今天电脑爆炸了"))
@@ -0,0 +1,26 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="utf-8">
+        <title>进入</title>
+        <style>
+            body {
+                text-align: center;
+                background-color: green;
+            }
+            form {
+                display: inline-block;
+            }
+        </style>
+    </head>
+    <body>
+        <h3>There is a form.</h3>
+        <form action="/passing" method="post">
+            <p>Name <input type="text" name="name"></p>
+            <p>Email <input type="email" name="email"></p>
+            <p>Phone number <input type="text" name="phone"></p>
+            <p><input type="submit" value="Submit!!!"></p>
+        </form>
+        <h4>HA!HA!HA!HA!</h4>
+    </body>
+</html>
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="utf-8">
+        <title>Hello from Flask</title>
+    </head>
+    <body>
+        {% if name %}
+        <h1>Hello {{ name }}!</h1>
+        {% else %}
+        <h1>Hello, World!</h1>
+        {% endif %}
+    </body>
+</html>
@@ -0,0 +1,39 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <style>
+            body {
+                text-align: center;
+            }
+        </style>
+        <meta charset="utf-8">
+        <title>CSC识别器</title>
+    </head>
+    <body>
+        <form action="/" method="post">
+            <p>输入待识别文本：<input type="text" name="sentence" minlength="1"></p>
+            <p><button>提交</button></p>
+            <p><button name="clean">清空历史信息</button></p>
+            <!-- <p><input type="submit" name="提交"></p> -->
+        </form>
+        {% if result|length >= 1 %}
+        <table align="center" border="1">
+            <thead><tr><th colspan="2">
+                <strong>历史信息</strong>
+            </th></tr></thead>
+            <tbody>
+                <tr>
+                    <td>文本</td>
+                    <td>是/否为垃圾</td>
+                </tr>
+                {% for sentence, predict in result %}
+                <tr>
+                    <td>{{ sentence }}</td>
+                    <td>{{ predict }}</td>
+                </tr>
+                {% endfor %}
+            </tbody>
+        </table>
+        {% endif %}
+    </body>
+</html>
@@ -0,0 +1,26 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <style>
+            body {
+                text-align: center;
+                background-color: orange;
+            }
+            table {
+                display: inline-block;
+                border-collapse: collapse;
+            }
+        </style>
+    </head>
+    <body>
+        <p><strong>Your Details</strong></p>
+        <table border=1>
+            {% for key, value in result.items() %}
+            <tr>
+                <th>{{ key }}</th>
+                <th>{{ value }}</th>
+            </tr>
+            {% endfor %}
+        </table>
+    </body>
+</html>