平时作业

Signed-off-by: 吴沂钊 <13190667+Yizhao_Wu4926@user.noreply.gitee.com>
2023-07-15 14:50:33 +00:00
parent b9d432a4ea
commit 29098aca79
3 changed files with 1442 additions and 0 deletions
@@ -0,0 +1,568 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f6e7da09-3bbb-4298-9a81-d3b09d8e6b83",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['images/1_1.jpg',\n",
+       " 'images/1_10.jpg',\n",
+       " 'images/1_11.jpg',\n",
+       " 'images/1_12.jpg',\n",
+       " 'images/1_13.jpg',\n",
+       " 'images/1_14.jpg',\n",
+       " 'images/1_15.jpg',\n",
+       " 'images/1_16.jpg',\n",
+       " 'images/1_17.jpg',\n",
+       " 'images/1_18.jpg',\n",
+       " 'images/1_19.jpg',\n",
+       " 'images/1_2.jpg',\n",
+       " 'images/1_20.jpg',\n",
+       " 'images/1_21.jpg',\n",
+       " 'images/1_22.jpg',\n",
+       " 'images/1_23.jpg',\n",
+       " 'images/1_24.jpg',\n",
+       " 'images/1_25.jpg',\n",
+       " 'images/1_26.jpg',\n",
+       " 'images/1_27.jpg',\n",
+       " 'images/1_28.jpg',\n",
+       " 'images/1_3.jpg',\n",
+       " 'images/1_30.jpg',\n",
+       " 'images/1_31.jpg',\n",
+       " 'images/1_32.jpg',\n",
+       " 'images/1_33.jpg',\n",
+       " 'images/1_34.jpg',\n",
+       " 'images/1_35.jpg',\n",
+       " 'images/1_36.jpg',\n",
+       " 'images/1_37.jpg',\n",
+       " 'images/1_38.jpg',\n",
+       " 'images/1_39.jpg',\n",
+       " 'images/1_4.jpg',\n",
+       " 'images/1_40.jpg',\n",
+       " 'images/1_41.jpg',\n",
+       " 'images/1_42.jpg',\n",
+       " 'images/1_43.jpg',\n",
+       " 'images/1_44.jpg',\n",
+       " 'images/1_45.jpg',\n",
+       " 'images/1_46.jpg',\n",
+       " 'images/1_47.jpg',\n",
+       " 'images/1_48.jpg',\n",
+       " 'images/1_49.jpg',\n",
+       " 'images/1_5.jpg',\n",
+       " 'images/1_51.jpg',\n",
+       " 'images/1_6.jpg',\n",
+       " 'images/1_7.jpg',\n",
+       " 'images/1_8.jpg',\n",
+       " 'images/1_9.jpg',\n",
+       " 'images/2_1.jpg',\n",
+       " 'images/2_10.jpg',\n",
+       " 'images/2_12.jpg',\n",
+       " 'images/2_13.jpg',\n",
+       " 'images/2_14.jpg',\n",
+       " 'images/2_15.jpg',\n",
+       " 'images/2_17.jpg',\n",
+       " 'images/2_18.jpg',\n",
+       " 'images/2_19.jpg',\n",
+       " 'images/2_2.jpg',\n",
+       " 'images/2_20.jpg',\n",
+       " 'images/2_21.jpg',\n",
+       " 'images/2_22.jpg',\n",
+       " 'images/2_23.jpg',\n",
+       " 'images/2_24.jpg',\n",
+       " 'images/2_26.jpg',\n",
+       " 'images/2_27.jpg',\n",
+       " 'images/2_28.jpg',\n",
+       " 'images/2_29.jpg',\n",
+       " 'images/2_3.jpg',\n",
+       " 'images/2_30.jpg',\n",
+       " 'images/2_31.jpg',\n",
+       " 'images/2_32.jpg',\n",
+       " 'images/2_33.jpg',\n",
+       " 'images/2_34.jpg',\n",
+       " 'images/2_35.jpg',\n",
+       " 'images/2_36.jpg',\n",
+       " 'images/2_37.jpg',\n",
+       " 'images/2_38.jpg',\n",
+       " 'images/2_39.jpg',\n",
+       " 'images/2_4.jpg',\n",
+       " 'images/2_40.jpg',\n",
+       " 'images/2_41.jpg',\n",
+       " 'images/2_42.jpg',\n",
+       " 'images/2_43.jpg',\n",
+       " 'images/2_44.jpg',\n",
+       " 'images/2_5.jpg',\n",
+       " 'images/2_6.jpg',\n",
+       " 'images/2_7.jpg',\n",
+       " 'images/2_8.jpg',\n",
+       " 'images/2_9.jpg',\n",
+       " 'images/3_10.jpg',\n",
+       " 'images/3_11.jpg',\n",
+       " 'images/3_12.jpg',\n",
+       " 'images/3_13.jpg',\n",
+       " 'images/3_14.jpg',\n",
+       " 'images/3_15.jpg',\n",
+       " 'images/3_16.jpg',\n",
+       " 'images/3_17.jpg',\n",
+       " 'images/3_18.jpg',\n",
+       " 'images/3_19.jpg',\n",
+       " 'images/3_20.jpg',\n",
+       " 'images/3_21.jpg',\n",
+       " 'images/3_22.jpg',\n",
+       " 'images/3_23.jpg',\n",
+       " 'images/3_25.jpg',\n",
+       " 'images/3_26.jpg',\n",
+       " 'images/3_27.jpg',\n",
+       " 'images/3_28.jpg',\n",
+       " 'images/3_29.jpg',\n",
+       " 'images/3_3.jpg',\n",
+       " 'images/3_30.jpg',\n",
+       " 'images/3_31.jpg',\n",
+       " 'images/3_32.jpg',\n",
+       " 'images/3_34.jpg',\n",
+       " 'images/3_38.jpg',\n",
+       " 'images/3_40.jpg',\n",
+       " 'images/3_42.jpg',\n",
+       " 'images/3_43.jpg',\n",
+       " 'images/3_44.jpg',\n",
+       " 'images/3_45.jpg',\n",
+       " 'images/3_46.jpg',\n",
+       " 'images/3_47.jpg',\n",
+       " 'images/3_48.jpg',\n",
+       " 'images/3_49.jpg',\n",
+       " 'images/3_5.jpg',\n",
+       " 'images/3_50.jpg',\n",
+       " 'images/3_51.jpg',\n",
+       " 'images/3_52.jpg',\n",
+       " 'images/3_53.jpg',\n",
+       " 'images/3_55.jpg',\n",
+       " 'images/3_56.jpg',\n",
+       " 'images/3_57.jpg',\n",
+       " 'images/3_58.jpg',\n",
+       " 'images/3_59.jpg',\n",
+       " 'images/3_65.jpg',\n",
+       " 'images/3_66.jpg',\n",
+       " 'images/3_7.jpg',\n",
+       " 'images/3_71.jpg',\n",
+       " 'images/3_72.jpg',\n",
+       " 'images/3_73.jpg',\n",
+       " 'images/3_74.jpg',\n",
+       " 'images/3_75.jpg',\n",
+       " 'images/3_76.jpg',\n",
+       " 'images/3_77.jpg',\n",
+       " 'images/3_78.jpg',\n",
+       " 'images/3_8.jpg',\n",
+       " 'images/3_9.jpg',\n",
+       " 'images/4_10.jpg',\n",
+       " 'images/4_11.jpg',\n",
+       " 'images/4_13.jpg',\n",
+       " 'images/4_14.jpg',\n",
+       " 'images/4_21.jpg',\n",
+       " 'images/4_6.jpg',\n",
+       " 'images/4_7.jpg',\n",
+       " 'images/4_9.jpg',\n",
+       " 'images/5_1.jpg',\n",
+       " 'images/5_2.jpg',\n",
+       " 'images/5_3.jpg',\n",
+       " 'images/5_4.jpg',\n",
+       " 'images/5_5.jpg',\n",
+       " 'images/5_6.jpg']"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import re\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import cv2\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "plt.rcParams['font.sans-serif'] = ['SimHei']\n",
+    "plt.rcParams['axes.unicode_minus'] = False\n",
+    "\n",
+    "def getimgnames(path=None):\n",
+    "    \"\"\"\n",
+    "    Return the paths of the JPG images stored directly inside a folder.\n",
+    "\n",
+    "    :param path: folder to scan; None means the current working directory\n",
+    "    :return: sorted list of image paths including the folder prefix\n",
+    "             (for example ./path/image1.jpg)\n",
+    "    \"\"\"\n",
+    "    # os.listdir(None) lists the working directory, but os.path.join(None, name)\n",
+    "    # raises TypeError, so map the default to '.' explicitly.\n",
+    "    folder = '.' if path is None else path\n",
+    "    # Sorting makes the result independent of the file system's listing order;\n",
+    "    # the extension test is case-insensitive so '.JPG' files are found as well.\n",
+    "    return [os.path.join(folder, name)\n",
+    "            for name in sorted(os.listdir(folder))\n",
+    "            if name.lower().endswith('.jpg')]\n",
+    "\n",
+    "# Collect every JPG path under images/ and show the list as the cell output\n",
+    "imglist = getimgnames('images/')\n",
+    "imglist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "43738e00-e1fa-4d6f-b08c-1e64662a5ce0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\ndef cut_image(img, imgFile):\\n    cx = int(np.size(img, 1))/2\\n    cy = int(np.size(img, 0))/2\\n    \\n    plt.figure(figsize=(8,8))\\n    plt.imshow(img)\\n\\n    plt.plot([cx-50, cx+50], [cy+50, cy+50], \\'r\\', linewidth=2)\\n    plt.plot([cx+50, cx+50], [cy-50, cy+50], \\'r\\', linewidth=2)\\n    plt.plot([cx-50, cx+50], [cy-50, cy-50], \\'r\\', linewidth=2)\\n    plt.plot([cx-50, cx-50], [cy-50, cy+50], \\'r\\', linewidth=2)\\n    plt.annotate(\\'选取的水样窗口\\', xy=(cx+50,cy-50), xytext=(cx+300, cy-300),\\n            arrowprops=dict(facecolor=\\'black\\', shrink=0.1))\\n\\n    plt.title(\\'水色样本 \\'+imgFile+\\' 分辨率为\\'+str(img.size)+\" 类别标签 \"+str(imgFile[9]))\\n    plt.show()\\n    \\nfor i in range(len(imglist)):\\n    img = cv2.imread(imglist[i])\\n    cut_image(img, imglist[i])\\n'"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Disabled preview helper: the code below is wrapped in a bare string, so running\n",
+    "# this cell only echoes the text and draws nothing.\n",
+    "# When enabled it would outline a 100x100 window (centre +/- 50 px) on every image.\n",
+    "# NOTE(review): the title reads imgFile[9] as the class label, but for paths such as\n",
+    "# 'images/1_1.jpg' the class digit sits at index 7 - confirm before re-enabling.\n",
+    "\"\"\"\n",
+    "def cut_image(img, imgFile):\n",
+    "    cx = int(np.size(img, 1))/2\n",
+    "    cy = int(np.size(img, 0))/2\n",
+    "    \n",
+    "    plt.figure(figsize=(8,8))\n",
+    "    plt.imshow(img)\n",
+    "\n",
+    "    plt.plot([cx-50, cx+50], [cy+50, cy+50], 'r', linewidth=2)\n",
+    "    plt.plot([cx+50, cx+50], [cy-50, cy+50], 'r', linewidth=2)\n",
+    "    plt.plot([cx-50, cx+50], [cy-50, cy-50], 'r', linewidth=2)\n",
+    "    plt.plot([cx-50, cx-50], [cy-50, cy+50], 'r', linewidth=2)\n",
+    "    plt.annotate('选取的水样窗口', xy=(cx+50,cy-50), xytext=(cx+300, cy-300),\n",
+    "            arrowprops=dict(facecolor='black', shrink=0.1))\n",
+    "\n",
+    "    plt.title('水色样本 '+imgFile+' 分辨率为'+str(img.size)+\" 类别标签 \"+str(imgFile[9]))\n",
+    "    plt.show()\n",
+    "    \n",
+    "for i in range(len(imglist)):\n",
+    "    img = cv2.imread(imglist[i])\n",
+    "    cut_image(img, imglist[i])\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "65b2b8b6-ed00-47ff-84f1-e22f97d48cdb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Image statistics helpers (the same numbers could also be derived from the raw colour channels)\n",
+    "from PIL import ImageStat,Image\n",
+    "\n",
+    "# Quick scan over every image: record its size, cut a central window and\n",
+    "# measure how much each colour channel varies inside that window.\n",
+    "size = 100          # side length (pixels) of the square window cut from the image centre\n",
+    "imgPath = './images'\n",
+    "imgWidth = []       # image widths\n",
+    "imgHeight = []      # image heights\n",
+    "imgRrange = []      # red channel range (max - min) inside the window\n",
+    "imgGrange = []      # green channel range (max - min) inside the window\n",
+    "imgBrange = []      # blue channel range (max - min) inside the window\n",
+    "\n",
+    "newImgs = []        # cropped windows, used later as model training / validation data\n",
+    "\n",
+    "# Keep only JPG files (os.listdir also returns folders and other files) and sort\n",
+    "# them so every run processes the images in the same order.\n",
+    "imgFiles = sorted(f for f in os.listdir(imgPath) if f.lower().endswith('.jpg'))\n",
+    "for imgFile in imgFiles:\n",
+    "    # The context manager closes the file handle once the window has been copied out.\n",
+    "    with Image.open(os.path.join(imgPath,imgFile)) as img:\n",
+    "        imgWidth.append(img.size[0])\n",
+    "        imgHeight.append(img.size[1])\n",
+    "\n",
+    "        # Window of size x size pixels centred on the image; derived from `size`\n",
+    "        # so the constant above and the crop box cannot drift apart.\n",
+    "        cx, cy = (int(i/2) for i in img.size)\n",
+    "        half = size // 2\n",
+    "        box = (cx-half, cy-half, cx-half+size, cy-half+size)\n",
+    "        # convert('RGB') guarantees three channels even for greyscale or RGBA files.\n",
+    "        region = img.convert('RGB').crop(box)\n",
+    "        region.load()   # make sure the pixels are read before the file is closed\n",
+    "\n",
+    "    # Range (max - min) of each colour channel inside the window, from ImageStat extrema\n",
+    "    stat = ImageStat.Stat(region)\n",
+    "    imgRrange.append(stat.extrema[0][1]-stat.extrema[0][0])\n",
+    "    imgGrange.append(stat.extrema[1][1]-stat.extrema[1][0])\n",
+    "    imgBrange.append(stat.extrema[2][1]-stat.extrema[2][0])\n",
+    "\n",
+    "    newImgs.append(region)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "527d76c9-7201-4fe6-a835-3a329fb90bc7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 161 entries, 0 to 160\n",
+      "Data columns (total 9 columns):\n",
+      " #   Column  Non-Null Count  Dtype  \n",
+      "---  ------  --------------  -----  \n",
+      " 0   0       161 non-null    float64\n",
+      " 1   1       161 non-null    float64\n",
+      " 2   2       161 non-null    float64\n",
+      " 3   3       161 non-null    float64\n",
+      " 4   4       161 non-null    float64\n",
+      " 5   5       161 non-null    float64\n",
+      " 6   6       161 non-null    float64\n",
+      " 7   7       161 non-null    float64\n",
+      " 8   8       161 non-null    float64\n",
+      "dtypes: float64(9)\n",
+      "memory usage: 11.4 KB\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "      <th>7</th>\n",
+       "      <th>8</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>148.6037</td>\n",
+       "      <td>138.6396</td>\n",
+       "      <td>64.3687</td>\n",
+       "      <td>3.633077</td>\n",
+       "      <td>4.128330</td>\n",
+       "      <td>10.499046</td>\n",
+       "      <td>4.254340</td>\n",
+       "      <td>4.802914</td>\n",
+       "      <td>12.057685</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>163.6769</td>\n",
+       "      <td>145.5470</td>\n",
+       "      <td>54.4608</td>\n",
+       "      <td>3.934350</td>\n",
+       "      <td>2.840174</td>\n",
+       "      <td>3.502551</td>\n",
+       "      <td>4.453499</td>\n",
+       "      <td>3.245141</td>\n",
+       "      <td>4.065518</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>153.9430</td>\n",
+       "      <td>147.0834</td>\n",
+       "      <td>71.9601</td>\n",
+       "      <td>2.225657</td>\n",
+       "      <td>1.810979</td>\n",
+       "      <td>3.136608</td>\n",
+       "      <td>2.644340</td>\n",
+       "      <td>2.148726</td>\n",
+       "      <td>3.651915</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>150.3724</td>\n",
+       "      <td>151.3982</td>\n",
+       "      <td>64.3164</td>\n",
+       "      <td>2.037380</td>\n",
+       "      <td>1.521590</td>\n",
+       "      <td>2.728093</td>\n",
+       "      <td>2.375780</td>\n",
+       "      <td>1.819692</td>\n",
+       "      <td>3.173392</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>150.7381</td>\n",
+       "      <td>150.9774</td>\n",
+       "      <td>64.6204</td>\n",
+       "      <td>1.918674</td>\n",
+       "      <td>1.665260</td>\n",
+       "      <td>3.110901</td>\n",
+       "      <td>2.259915</td>\n",
+       "      <td>1.958815</td>\n",
+       "      <td>3.619921</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          0         1        2         3         4          5         6  \\\n",
+       "0  148.6037  138.6396  64.3687  3.633077  4.128330  10.499046  4.254340   \n",
+       "1  163.6769  145.5470  54.4608  3.934350  2.840174   3.502551  4.453499   \n",
+       "2  153.9430  147.0834  71.9601  2.225657  1.810979   3.136608  2.644340   \n",
+       "3  150.3724  151.3982  64.3164  2.037380  1.521590   2.728093  2.375780   \n",
+       "4  150.7381  150.9774  64.6204  1.918674  1.665260   3.110901  2.259915   \n",
+       "\n",
+       "          7          8  \n",
+       "0  4.802914  12.057685  \n",
+       "1  3.245141   4.065518  \n",
+       "2  2.148726   3.651915  \n",
+       "3  1.819692   3.173392  \n",
+       "4  1.958815   3.619921  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "def channel_moments(channel):\n",
+    "    \"\"\"Return (mean, standard deviation, cube root of the mean absolute cubed deviation) of one channel.\"\"\"\n",
+    "    first = np.mean(channel)\n",
+    "    second = np.std(channel)\n",
+    "    third = np.mean(abs(channel - channel.mean())**3)**(1/3)\n",
+    "    return first, second, third\n",
+    "\n",
+    "# Build the feature matrix and the class labels\n",
+    "feature_rows = []\n",
+    "class_labels = []\n",
+    "for idx, window in enumerate(newImgs):\n",
+    "    # One (first, second, third) moment triple per colour channel, in R, G, B order\n",
+    "    per_channel = [channel_moments(c) for c in np.split(np.array(window), 3, axis = 2)]\n",
+    "\n",
+    "    # Regroup by moment order: the three means, then the three standard\n",
+    "    # deviations, then the three third-order moments\n",
+    "    means, stds, thirds = zip(*per_channel)\n",
+    "    feature_rows.append(np.array(means + stds + thirds))\n",
+    "\n",
+    "    # The class label is the first character of the matching file name\n",
+    "    class_labels.append(int(imgFiles[idx][0]))\n",
+    "\n",
+    "dy = np.array(class_labels)\n",
+    "data = pd.DataFrame(np.array(feature_rows))\n",
+    "data.info()\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "208d30d2-1850-45eb-a68d-94e44a5f59eb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((128, 9), (33, 9), (128,))"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "# Hold out 20% of the samples for testing. stratify=dy keeps the class proportions\n",
+    "# the same in both splits; with unbalanced classes a purely random split can leave\n",
+    "# a rare class almost absent from the test set.\n",
+    "x_train,x_test,y_train,y_test=train_test_split(data,dy,test_size=0.2,random_state=0,stratify=dy)\n",
+    "x_train.shape,x_test.shape,y_train.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "726c20cb-1065-4f7e-9d71-39e261178fd6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.3939393939393939\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.neural_network import MLPClassifier\n",
+    "from sklearn.pipeline import make_pipeline\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# The colour moments live on very different scales (channel means vs. spreads) and\n",
+    "# an MLP is sensitive to feature scaling, so standardise inside a pipeline: the\n",
+    "# scaler is fitted on the training split only and reused unchanged for prediction.\n",
+    "# random_state makes the weight initialisation, and therefore the score, repeatable.\n",
+    "mlp=make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000, random_state=0))\n",
+    "mlp.fit(x_train,y_train)\n",
+    "y_pred=mlp.predict(x_test)\n",
+    "# accuracy_score expects (y_true, y_pred)\n",
+    "print(accuracy_score(y_test,y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51804510-c0a3-4a51-9795-0e38a7c9a8f4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "419b9b44-bcd3-499f-8307-cb9f72ef52b3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d091a01",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import jieba\n",
+    "import pickle\n",
+    "from wordcloud import WordCloud\n",
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "plt.rcParams['font.sans-serif'] = 'SimHei'\n",
+    "plt.rcParams['axes.unicode_minus'] = False\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eff92c98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def data_process(file='message80W1.csv', n=5000, random_state=None):\n",
+    "    \"\"\"\n",
+    "    Load the labelled message corpus and turn it into tokenised, stop-word-free text.\n",
+    "\n",
+    "    A balanced subset is drawn (up to n messages per label), duplicate messages are\n",
+    "    removed, every 'x' character is deleted, the text is segmented with jieba and\n",
+    "    stop words are dropped.\n",
+    "\n",
+    "    :param file: CSV file without header row; the first column is used as the index\n",
+    "                 and the remaining two columns are the label and the message\n",
+    "    :param n: maximum number of messages sampled for each of the labels 0 and 1\n",
+    "    :param random_state: seed forwarded to DataFrame.sample; None keeps the draw random\n",
+    "    :return: (adata, data_after_stop, labels) - the space-joined token strings, the\n",
+    "             token lists and the labels, all sharing the same index\n",
+    "    \"\"\"\n",
+    "    data = pd.read_csv(file, header=None, index_col=0)\n",
+    "    data.columns = ['label', 'message']\n",
+    "\n",
+    "    # Balanced sample; min() avoids the ValueError raised by sample() when a class\n",
+    "    # holds fewer than n rows.\n",
+    "    samples = []\n",
+    "    for label in (0, 1):\n",
+    "        subset = data[data['label'] == label]\n",
+    "        samples.append(subset.sample(min(n, len(subset)), random_state=random_state))\n",
+    "    data_new = pd.concat(samples, axis=0)\n",
+    "\n",
+    "    data_dup = data_new['message'].drop_duplicates()\n",
+    "    # Delete every 'x' character.\n",
+    "    # NOTE(review): presumably 'x' is the masking placeholder used in the corpus - confirm.\n",
+    "    data_qumin = data_dup.apply(lambda x: re.sub('x', '', x))\n",
+    "\n",
+    "    jieba.load_userdict('newdic1.txt')\n",
+    "    data_cut = data_qumin.apply(lambda x: jieba.lcut(x))\n",
+    "\n",
+    "    # sep='hahaha' is a separator assumed never to occur, so each line is read as one\n",
+    "    # value; a multi-character separator needs the python engine, requested explicitly.\n",
+    "    stopWords = pd.read_csv('stopword.txt', encoding='GB18030', sep='hahaha', header=None, engine='python')\n",
+    "    # A set gives constant-time membership tests for the per-token filter below.\n",
+    "    stopWords = set(['≮', '≯', '≠', '≮', ' ', '会', '月', '日', '–'] + list(stopWords.iloc[:, 0]))\n",
+    "    data_after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n",
+    "    # Duplicates were dropped above, so pick the labels by the surviving index.\n",
+    "    labels = data_new.loc[data_after_stop.index, 'label']\n",
+    "    adata = data_after_stop.apply(lambda x: ' '.join(x))\n",
+    "    return adata, data_after_stop, labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3ae80f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):\n",
+    "    \"\"\"\n",
+    "    Draw a confusion matrix as an annotated heat map.\n",
+    "\n",
+    "    :param cm: square confusion matrix (rows = true labels, columns = predicted labels)\n",
+    "    :param classes: tick labels, in the same order as the rows/columns of cm\n",
+    "    :param normalize: if True, show every row as fractions of that row's total\n",
+    "    :param title: figure title\n",
+    "    :param cmap: colour map used for the cells\n",
+    "    \"\"\"\n",
+    "    cm = np.asarray(cm)\n",
+    "    if normalize:\n",
+    "        row_totals = cm.sum(axis=1, keepdims=True)\n",
+    "        # Rows without samples stay at 0 instead of turning into NaN.\n",
+    "        cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)\n",
+    "        fmt = '.2f'\n",
+    "    else:\n",
+    "        # Plain integers; the default '.2g' format prints large counts as e.g. 1e+03.\n",
+    "        fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'\n",
+    "    # Handing the labels to heatmap centres them on the cells; ticks placed at\n",
+    "    # 0..n-1 would sit on the cell borders instead.\n",
+    "    sns.heatmap(cm, annot=True, fmt=fmt, cmap=cmap, xticklabels=classes, yticklabels=classes)\n",
+    "    plt.ylabel('True label')\n",
+    "    plt.xlabel('Predicted label')\n",
+    "    plt.title(title)\n",
+    "    plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56dbb0d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "adata, data_after_stop, lables = data_process()\n",
+    "data_tr, data_te, labels_tr, labels_te = train_test_split(adata, lables, test_size=0.2)\n",
+    "\n",
+    "# Fit the vocabulary and the IDF weights on the training split only, then reuse the\n",
+    "# fitted transformers on the test split so both splits share one feature space.\n",
+    "countVectorizer = CountVectorizer()\n",
+    "tfidfTransformer = TfidfTransformer()\n",
+    "data_tr = countVectorizer.fit_transform(data_tr)\n",
+    "X_tr = tfidfTransformer.fit_transform(data_tr).toarray()   # GaussianNB needs dense input\n",
+    "data_te = countVectorizer.transform(data_te)\n",
+    "X_te = tfidfTransformer.transform(data_te).toarray()\n",
+    "\n",
+    "model = GaussianNB()\n",
+    "model.fit(X_tr, labels_tr)\n",
+    "pred = model.predict(X_te)\n",
+    "score = model.score(X_te, labels_te)\n",
+    "\n",
+    "# Persist everything needed to score new messages: the model and both fitted transformers.\n",
+    "with open('model.pkl', 'wb') as f:\n",
+    "    pickle.dump(model, f)\n",
+    "with open('countVectorizer.pkl', 'wb') as f:\n",
+    "    pickle.dump(countVectorizer, f)\n",
+    "with open('tfidfTransformer.pkl', 'wb') as f:\n",
+    "    pickle.dump(tfidfTransformer, f)\n",
+    "\n",
+    "# The matrix rows/columns follow the order given in labels, so the tick labels\n",
+    "# handed to the plot must use the same [0, 1] order.\n",
+    "plot_confusion_matrix(confusion_matrix(labels_te, pred, labels=[0, 1]), [0, 1], title=\"模型分类准确率{:.2f}%\".format(score * 100))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc83543e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from collections import Counter\n",
+    "\n",
+    "def show_wordcloud(token_lists):\n",
+    "    \"\"\"Count token frequencies over an iterable of token lists and display them as a word cloud.\"\"\"\n",
+    "    word_fre = Counter(token for tokens in token_lists for token in tokens)\n",
+    "    # NOTE(review): Windows-only font path; point it at a local CJK font on other systems.\n",
+    "    wc = WordCloud( background_color='white', font_path=r'C:/Windows/Fonts/SimHei.ttf')\n",
+    "    wc.fit_words(word_fre)\n",
+    "    plt.imshow(wc)\n",
+    "    plt.show()\n",
+    "\n",
+    "# One word cloud per class: label 0 first, then label 1.\n",
+    "for label in (0, 1):\n",
+    "    show_wordcloud(data_after_stop[lables == label])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bde6f19c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f089b32",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}