平时作业
Signed-off-by: 吴沂钊 <13190667+Yizhao_Wu4926@user.noreply.gitee.com>
This commit is contained in:
@@ -0,0 +1,568 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "f6e7da09-3bbb-4298-9a81-d3b09d8e6b83",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['images/1_1.jpg',\n",
|
||||
" 'images/1_10.jpg',\n",
|
||||
" 'images/1_11.jpg',\n",
|
||||
" 'images/1_12.jpg',\n",
|
||||
" 'images/1_13.jpg',\n",
|
||||
" 'images/1_14.jpg',\n",
|
||||
" 'images/1_15.jpg',\n",
|
||||
" 'images/1_16.jpg',\n",
|
||||
" 'images/1_17.jpg',\n",
|
||||
" 'images/1_18.jpg',\n",
|
||||
" 'images/1_19.jpg',\n",
|
||||
" 'images/1_2.jpg',\n",
|
||||
" 'images/1_20.jpg',\n",
|
||||
" 'images/1_21.jpg',\n",
|
||||
" 'images/1_22.jpg',\n",
|
||||
" 'images/1_23.jpg',\n",
|
||||
" 'images/1_24.jpg',\n",
|
||||
" 'images/1_25.jpg',\n",
|
||||
" 'images/1_26.jpg',\n",
|
||||
" 'images/1_27.jpg',\n",
|
||||
" 'images/1_28.jpg',\n",
|
||||
" 'images/1_3.jpg',\n",
|
||||
" 'images/1_30.jpg',\n",
|
||||
" 'images/1_31.jpg',\n",
|
||||
" 'images/1_32.jpg',\n",
|
||||
" 'images/1_33.jpg',\n",
|
||||
" 'images/1_34.jpg',\n",
|
||||
" 'images/1_35.jpg',\n",
|
||||
" 'images/1_36.jpg',\n",
|
||||
" 'images/1_37.jpg',\n",
|
||||
" 'images/1_38.jpg',\n",
|
||||
" 'images/1_39.jpg',\n",
|
||||
" 'images/1_4.jpg',\n",
|
||||
" 'images/1_40.jpg',\n",
|
||||
" 'images/1_41.jpg',\n",
|
||||
" 'images/1_42.jpg',\n",
|
||||
" 'images/1_43.jpg',\n",
|
||||
" 'images/1_44.jpg',\n",
|
||||
" 'images/1_45.jpg',\n",
|
||||
" 'images/1_46.jpg',\n",
|
||||
" 'images/1_47.jpg',\n",
|
||||
" 'images/1_48.jpg',\n",
|
||||
" 'images/1_49.jpg',\n",
|
||||
" 'images/1_5.jpg',\n",
|
||||
" 'images/1_51.jpg',\n",
|
||||
" 'images/1_6.jpg',\n",
|
||||
" 'images/1_7.jpg',\n",
|
||||
" 'images/1_8.jpg',\n",
|
||||
" 'images/1_9.jpg',\n",
|
||||
" 'images/2_1.jpg',\n",
|
||||
" 'images/2_10.jpg',\n",
|
||||
" 'images/2_12.jpg',\n",
|
||||
" 'images/2_13.jpg',\n",
|
||||
" 'images/2_14.jpg',\n",
|
||||
" 'images/2_15.jpg',\n",
|
||||
" 'images/2_17.jpg',\n",
|
||||
" 'images/2_18.jpg',\n",
|
||||
" 'images/2_19.jpg',\n",
|
||||
" 'images/2_2.jpg',\n",
|
||||
" 'images/2_20.jpg',\n",
|
||||
" 'images/2_21.jpg',\n",
|
||||
" 'images/2_22.jpg',\n",
|
||||
" 'images/2_23.jpg',\n",
|
||||
" 'images/2_24.jpg',\n",
|
||||
" 'images/2_26.jpg',\n",
|
||||
" 'images/2_27.jpg',\n",
|
||||
" 'images/2_28.jpg',\n",
|
||||
" 'images/2_29.jpg',\n",
|
||||
" 'images/2_3.jpg',\n",
|
||||
" 'images/2_30.jpg',\n",
|
||||
" 'images/2_31.jpg',\n",
|
||||
" 'images/2_32.jpg',\n",
|
||||
" 'images/2_33.jpg',\n",
|
||||
" 'images/2_34.jpg',\n",
|
||||
" 'images/2_35.jpg',\n",
|
||||
" 'images/2_36.jpg',\n",
|
||||
" 'images/2_37.jpg',\n",
|
||||
" 'images/2_38.jpg',\n",
|
||||
" 'images/2_39.jpg',\n",
|
||||
" 'images/2_4.jpg',\n",
|
||||
" 'images/2_40.jpg',\n",
|
||||
" 'images/2_41.jpg',\n",
|
||||
" 'images/2_42.jpg',\n",
|
||||
" 'images/2_43.jpg',\n",
|
||||
" 'images/2_44.jpg',\n",
|
||||
" 'images/2_5.jpg',\n",
|
||||
" 'images/2_6.jpg',\n",
|
||||
" 'images/2_7.jpg',\n",
|
||||
" 'images/2_8.jpg',\n",
|
||||
" 'images/2_9.jpg',\n",
|
||||
" 'images/3_10.jpg',\n",
|
||||
" 'images/3_11.jpg',\n",
|
||||
" 'images/3_12.jpg',\n",
|
||||
" 'images/3_13.jpg',\n",
|
||||
" 'images/3_14.jpg',\n",
|
||||
" 'images/3_15.jpg',\n",
|
||||
" 'images/3_16.jpg',\n",
|
||||
" 'images/3_17.jpg',\n",
|
||||
" 'images/3_18.jpg',\n",
|
||||
" 'images/3_19.jpg',\n",
|
||||
" 'images/3_20.jpg',\n",
|
||||
" 'images/3_21.jpg',\n",
|
||||
" 'images/3_22.jpg',\n",
|
||||
" 'images/3_23.jpg',\n",
|
||||
" 'images/3_25.jpg',\n",
|
||||
" 'images/3_26.jpg',\n",
|
||||
" 'images/3_27.jpg',\n",
|
||||
" 'images/3_28.jpg',\n",
|
||||
" 'images/3_29.jpg',\n",
|
||||
" 'images/3_3.jpg',\n",
|
||||
" 'images/3_30.jpg',\n",
|
||||
" 'images/3_31.jpg',\n",
|
||||
" 'images/3_32.jpg',\n",
|
||||
" 'images/3_34.jpg',\n",
|
||||
" 'images/3_38.jpg',\n",
|
||||
" 'images/3_40.jpg',\n",
|
||||
" 'images/3_42.jpg',\n",
|
||||
" 'images/3_43.jpg',\n",
|
||||
" 'images/3_44.jpg',\n",
|
||||
" 'images/3_45.jpg',\n",
|
||||
" 'images/3_46.jpg',\n",
|
||||
" 'images/3_47.jpg',\n",
|
||||
" 'images/3_48.jpg',\n",
|
||||
" 'images/3_49.jpg',\n",
|
||||
" 'images/3_5.jpg',\n",
|
||||
" 'images/3_50.jpg',\n",
|
||||
" 'images/3_51.jpg',\n",
|
||||
" 'images/3_52.jpg',\n",
|
||||
" 'images/3_53.jpg',\n",
|
||||
" 'images/3_55.jpg',\n",
|
||||
" 'images/3_56.jpg',\n",
|
||||
" 'images/3_57.jpg',\n",
|
||||
" 'images/3_58.jpg',\n",
|
||||
" 'images/3_59.jpg',\n",
|
||||
" 'images/3_65.jpg',\n",
|
||||
" 'images/3_66.jpg',\n",
|
||||
" 'images/3_7.jpg',\n",
|
||||
" 'images/3_71.jpg',\n",
|
||||
" 'images/3_72.jpg',\n",
|
||||
" 'images/3_73.jpg',\n",
|
||||
" 'images/3_74.jpg',\n",
|
||||
" 'images/3_75.jpg',\n",
|
||||
" 'images/3_76.jpg',\n",
|
||||
" 'images/3_77.jpg',\n",
|
||||
" 'images/3_78.jpg',\n",
|
||||
" 'images/3_8.jpg',\n",
|
||||
" 'images/3_9.jpg',\n",
|
||||
" 'images/4_10.jpg',\n",
|
||||
" 'images/4_11.jpg',\n",
|
||||
" 'images/4_13.jpg',\n",
|
||||
" 'images/4_14.jpg',\n",
|
||||
" 'images/4_21.jpg',\n",
|
||||
" 'images/4_6.jpg',\n",
|
||||
" 'images/4_7.jpg',\n",
|
||||
" 'images/4_9.jpg',\n",
|
||||
" 'images/5_1.jpg',\n",
|
||||
" 'images/5_2.jpg',\n",
|
||||
" 'images/5_3.jpg',\n",
|
||||
" 'images/5_4.jpg',\n",
|
||||
" 'images/5_5.jpg',\n",
|
||||
" 'images/5_6.jpg']"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import numpy as np\n",
|
||||
"import os\n",
|
||||
"import cv2\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"plt.rcParams['font.sans-serif'] = ['SimHei']\n",
|
||||
"plt.rcParams['axes.unicode_minus'] = False\n",
|
||||
"\n",
|
||||
"def getimgnames(path=None):\n",
"    \"\"\"\n",
"    Collect the paths of the JPG images found directly inside a folder.\n",
"\n",
"    :param path: folder to scan; None means the current directory\n",
"    :return: sorted list of paths (folder joined with the file name, e.g.\n",
"             images/1_1.jpg) for every entry whose extension is .jpg,\n",
"             compared case-insensitively\n",
"    \"\"\"\n",
"    # os.path.join(None, name) raises TypeError, so resolve the default first.\n",
"    folder = os.curdir if path is None else path\n",
"    # os.listdir() gives no ordering guarantee; sort so every run sees the\n",
"    # files in the same sequence.\n",
"    return [\n",
"        os.path.join(folder, name)\n",
"        for name in sorted(os.listdir(folder))\n",
"        if os.path.splitext(name)[1].lower() == '.jpg'\n",
"    ]\n",
|
||||
"\n",
|
||||
"imglist = getimgnames('images/')\n",
"# Show the number of images and a short preview instead of every path.\n",
"len(imglist), imglist[:5]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "43738e00-e1fa-4d6f-b08c-1e64662a5ce0",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\ndef cut_image(img, imgFile):\\n cx = int(np.size(img, 1))/2\\n cy = int(np.size(img, 0))/2\\n \\n plt.figure(figsize=(8,8))\\n plt.imshow(img)\\n\\n plt.plot([cx-50, cx+50], [cy+50, cy+50], \\'r\\', linewidth=2)\\n plt.plot([cx+50, cx+50], [cy-50, cy+50], \\'r\\', linewidth=2)\\n plt.plot([cx-50, cx+50], [cy-50, cy-50], \\'r\\', linewidth=2)\\n plt.plot([cx-50, cx-50], [cy-50, cy+50], \\'r\\', linewidth=2)\\n plt.annotate(\\'选取的水样窗口\\', xy=(cx+50,cy-50), xytext=(cx+300, cy-300),\\n arrowprops=dict(facecolor=\\'black\\', shrink=0.1))\\n\\n plt.title(\\'水色样本 \\'+imgFile+\\' 分辨率为\\'+str(img.size)+\" 类别标签 \"+str(imgFile[9]))\\n plt.show()\\n \\nfor i in range(len(imglist)):\\n img = cv2.imread(imglist[i])\\n cut_image(img, imglist[i])\\n'"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def cut_image(img, imgFile, half=50):\n",
"    \"\"\"\n",
"    Show an image with the central sampling window outlined in red.\n",
"\n",
"    :param img: image array as returned by cv2.imread (BGR channel order)\n",
"    :param imgFile: path of the image; the first character of its base name\n",
"                    is shown as the class label\n",
"    :param half: half of the window side length, in pixels\n",
"    \"\"\"\n",
"    height, width = img.shape[:2]\n",
"    cx, cy = width / 2, height / 2\n",
"\n",
"    plt.figure(figsize=(8, 8))\n",
"    # OpenCV loads images as BGR while matplotlib expects RGB.\n",
"    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n",
"\n",
"    # Closed square around the image centre.\n",
"    plt.plot([cx-half, cx+half, cx+half, cx-half, cx-half],\n",
"             [cy+half, cy+half, cy-half, cy-half, cy+half], 'r', linewidth=2)\n",
"    plt.annotate('选取的水样窗口', xy=(cx+half, cy-half), xytext=(cx+300, cy-300),\n",
"                 arrowprops=dict(facecolor='black', shrink=0.1))\n",
"\n",
"    # ndarray.size is the element count, so build the resolution from the\n",
"    # shape; read the label from the file name, not a fixed index in the path.\n",
"    label = os.path.basename(imgFile)[0]\n",
"    plt.title('水色样本 '+imgFile+' 分辨率为'+str(width)+'x'+str(height)+\" 类别标签 \"+label)\n",
"    plt.show()\n",
"\n",
"\n",
"# Preview a few samples only; one figure per image for the whole folder is slow.\n",
"for img_path in imglist[:3]:\n",
"    img = cv2.imread(img_path)\n",
"    if img is None:  # cv2.imread returns None when the file cannot be read\n",
"        continue\n",
"    cut_image(img, img_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "65b2b8b6-ed00-47ff-84f1-e22f97d48cdb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Image statistics helpers (the same numbers could also be computed from the\n",
"# raw colour channels).\n",
"from PIL import ImageStat, Image\n",
"\n",
"# Quick pass over every image: record its size and the spread of each colour\n",
"# channel inside a central window.\n",
"size = 100            # side length of the central window, in pixels\n",
"half = size // 2\n",
"imgPath = './images'\n",
"imgWidth = []         # image widths\n",
"imgHeight = []        # image heights\n",
"imgRrange = []        # red channel range (max - min) inside the window\n",
"imgGrange = []        # green channel range\n",
"imgBrange = []        # blue channel range\n",
"\n",
"newImgs = []          # cropped windows, used later as model input\n",
"\n",
"# Keep JPG files only and sort them: os.listdir() returns every entry of the\n",
"# folder (sub-folders such as .ipynb_checkpoints included) in no guaranteed\n",
"# order. imgFiles stays index-aligned with newImgs.\n",
"imgFiles = sorted(f for f in os.listdir(imgPath) if f.lower().endswith('.jpg'))\n",
"for imgFile in imgFiles:\n",
"    # Force three channels so grayscale/CMYK/RGBA files do not break the\n",
"    # per-channel statistics; convert() reads the pixels before the file closes.\n",
"    with Image.open(os.path.join(imgPath, imgFile)) as src:\n",
"        img = src.convert('RGB')\n",
"    imgWidth.append(img.size[0])\n",
"    imgHeight.append(img.size[1])\n",
"\n",
"    # Crop the size x size window around the image centre.\n",
"    cx, cy = img.size[0] // 2, img.size[1] // 2\n",
"    region = img.crop((cx - half, cy - half, cx + half, cy + half))\n",
"\n",
"    # Per-channel range of the window; extrema holds one (min, max) pair per band.\n",
"    stat = ImageStat.Stat(region)\n",
"    for ranges, (lo, hi) in zip((imgRrange, imgGrange, imgBrange), stat.extrema):\n",
"        ranges.append(hi - lo)\n",
"\n",
"    newImgs.append(region)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "527d76c9-7201-4fe6-a835-3a329fb90bc7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 161 entries, 0 to 160\n",
|
||||
"Data columns (total 9 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 0 161 non-null float64\n",
|
||||
" 1 1 161 non-null float64\n",
|
||||
" 2 2 161 non-null float64\n",
|
||||
" 3 3 161 non-null float64\n",
|
||||
" 4 4 161 non-null float64\n",
|
||||
" 5 5 161 non-null float64\n",
|
||||
" 6 6 161 non-null float64\n",
|
||||
" 7 7 161 non-null float64\n",
|
||||
" 8 8 161 non-null float64\n",
|
||||
"dtypes: float64(9)\n",
|
||||
"memory usage: 11.4 KB\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>0</th>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <th>3</th>\n",
|
||||
" <th>4</th>\n",
|
||||
" <th>5</th>\n",
|
||||
" <th>6</th>\n",
|
||||
" <th>7</th>\n",
|
||||
" <th>8</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>148.6037</td>\n",
|
||||
" <td>138.6396</td>\n",
|
||||
" <td>64.3687</td>\n",
|
||||
" <td>3.633077</td>\n",
|
||||
" <td>4.128330</td>\n",
|
||||
" <td>10.499046</td>\n",
|
||||
" <td>4.254340</td>\n",
|
||||
" <td>4.802914</td>\n",
|
||||
" <td>12.057685</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>163.6769</td>\n",
|
||||
" <td>145.5470</td>\n",
|
||||
" <td>54.4608</td>\n",
|
||||
" <td>3.934350</td>\n",
|
||||
" <td>2.840174</td>\n",
|
||||
" <td>3.502551</td>\n",
|
||||
" <td>4.453499</td>\n",
|
||||
" <td>3.245141</td>\n",
|
||||
" <td>4.065518</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>153.9430</td>\n",
|
||||
" <td>147.0834</td>\n",
|
||||
" <td>71.9601</td>\n",
|
||||
" <td>2.225657</td>\n",
|
||||
" <td>1.810979</td>\n",
|
||||
" <td>3.136608</td>\n",
|
||||
" <td>2.644340</td>\n",
|
||||
" <td>2.148726</td>\n",
|
||||
" <td>3.651915</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>150.3724</td>\n",
|
||||
" <td>151.3982</td>\n",
|
||||
" <td>64.3164</td>\n",
|
||||
" <td>2.037380</td>\n",
|
||||
" <td>1.521590</td>\n",
|
||||
" <td>2.728093</td>\n",
|
||||
" <td>2.375780</td>\n",
|
||||
" <td>1.819692</td>\n",
|
||||
" <td>3.173392</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>150.7381</td>\n",
|
||||
" <td>150.9774</td>\n",
|
||||
" <td>64.6204</td>\n",
|
||||
" <td>1.918674</td>\n",
|
||||
" <td>1.665260</td>\n",
|
||||
" <td>3.110901</td>\n",
|
||||
" <td>2.259915</td>\n",
|
||||
" <td>1.958815</td>\n",
|
||||
" <td>3.619921</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 0 1 2 3 4 5 6 \\\n",
|
||||
"0 148.6037 138.6396 64.3687 3.633077 4.128330 10.499046 4.254340 \n",
|
||||
"1 163.6769 145.5470 54.4608 3.934350 2.840174 3.502551 4.453499 \n",
|
||||
"2 153.9430 147.0834 71.9601 2.225657 1.810979 3.136608 2.644340 \n",
|
||||
"3 150.3724 151.3982 64.3164 2.037380 1.521590 2.728093 2.375780 \n",
|
||||
"4 150.7381 150.9774 64.6204 1.918674 1.665260 3.110901 2.259915 \n",
|
||||
"\n",
|
||||
" 7 8 \n",
|
||||
"0 4.802914 12.057685 \n",
|
||||
"1 3.245141 4.065518 \n",
|
||||
"2 2.148726 3.651915 \n",
|
||||
"3 1.819692 3.173392 \n",
|
||||
"4 1.958815 3.619921 "
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
"\n",
"# Build the feature table and the class labels.\n",
"data = []\n",
"dy = []\n",
"for idx, window in enumerate(newImgs):\n",
"    channels = np.split(np.array(window), 3, axis=2)  # red, green, blue\n",
"\n",
"    # First-order moment: channel mean.\n",
"    means = [np.mean(c) for c in channels]\n",
"    # Second-order moment: channel standard deviation.\n",
"    stds = [np.std(c) for c in channels]\n",
"    # Third-order moment: cube root of the mean absolute cubed deviation.\n",
"    thirds = [np.mean(abs(c - c.mean())**3)**(1/3) for c in channels]\n",
"\n",
"    # One row per image: three means, three deviations, three third moments.\n",
"    data.append(np.array(means + stds + thirds))\n",
"\n",
"    # The class label is the first character of the file name.\n",
"    dy.append(int(imgFiles[idx][0]))\n",
"\n",
"dy = np.array(dy)\n",
"data = pd.DataFrame(np.array(data))\n",
"data.info()\n",
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "208d30d2-1850-45eb-a68d-94e44a5f59eb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((128, 9), (33, 9), (128,))"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Stratify on the labels so each class keeps the same share in both subsets;\n",
"# some classes have only a handful of images.\n",
"x_train, x_test, y_train, y_test = train_test_split(\n",
"    data, dy, test_size=0.2, random_state=0, stratify=dy)\n",
"x_train.shape, x_test.shape, y_train.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "726c20cb-1065-4f7e-9d71-39e261178fd6",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.3939393939393939\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.metrics import accuracy_score\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# The colour moments live on very different scales (means near 150, deviations\n",
"# of a few units), so standardise them before the network sees them. A fixed\n",
"# seed keeps the result repeatable and a higher iteration cap gives the solver\n",
"# room to converge.\n",
"mlp = make_pipeline(StandardScaler(), MLPClassifier(max_iter=2000, random_state=0))\n",
"mlp.fit(x_train, y_train)\n",
"y_pred = mlp.predict(x_test)\n",
"# accuracy_score expects the true labels first.\n",
"print(accuracy_score(y_test, y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51804510-c0a3-4a51-9795-0e38a7c9a8f4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "419b9b44-bcd3-499f-8307-cb9f72ef52b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,176 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9d091a01",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import re\n",
|
||||
"import jieba\n",
|
||||
"import pickle\n",
|
||||
"from wordcloud import WordCloud\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
|
||||
"from sklearn.metrics import confusion_matrix\n",
|
||||
"import seaborn as sns\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"plt.rcParams['font.sans-serif'] = 'SimHei'\n",
|
||||
"plt.rcParams['axes.unicode_minus'] = False\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eff92c98",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def data_process(file='message80W1.csv', n=5000, random_state=None):\n",
"    \"\"\"\n",
"    Load the message CSV, draw a balanced sample, clean and tokenize it.\n",
"\n",
"    :param file: CSV without a header row; the first column is used as the\n",
"                 index, the next two are the label and the message text\n",
"    :param n: number of messages drawn for each of the labels 0 and 1\n",
"              (DataFrame.sample raises ValueError if a label has fewer rows)\n",
"    :param random_state: seed forwarded to DataFrame.sample; None keeps the\n",
"                         draw random\n",
"    :return: (adata, data_after_stop, labels): the space-joined tokens, the\n",
"             token lists, and the labels of the messages that were kept\n",
"    \"\"\"\n",
"    data = pd.read_csv(file, header=None, index_col=0)\n",
"    data.columns = ['label', 'message']\n",
"\n",
"    # Same number of rows for each label.\n",
"    data_new = pd.concat(\n",
"        [data[data['label'] == value].sample(n, random_state=random_state)\n",
"         for value in (0, 1)],\n",
"        axis=0)\n",
"\n",
"    # Drop repeated messages, then strip every 'x' character from the text.\n",
"    data_dup = data_new['message'].drop_duplicates()\n",
"    data_qumin = data_dup.apply(lambda text: text.replace('x', ''))\n",
"\n",
"    jieba.load_userdict('newdic1.txt')\n",
"    data_cut = data_qumin.apply(lambda text: jieba.lcut(text))\n",
"\n",
"    # NOTE(review): the separator is presumably a string that never occurs in\n",
"    # the file, so that each line is read as a single field -- confirm.\n",
"    stopWords = pd.read_csv('stopword.txt', encoding='GB18030', sep='hahaha', header=None)\n",
"    # A set makes the per-token membership test O(1) instead of a list scan.\n",
"    stopWords = set(['≮', '≯', '≠', '≮', ' ', '会', '月', '日', '–'] + list(stopWords.iloc[:, 0]))\n",
"    data_after_stop = data_cut.apply(lambda tokens: [t for t in tokens if t not in stopWords])\n",
"\n",
"    # Labels of the rows that survived de-duplication, matched by index.\n",
"    labels = data_new.loc[data_after_stop.index, 'label']\n",
"    adata = data_after_stop.apply(lambda tokens: ' '.join(tokens))\n",
"    return adata, data_after_stop, labels"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c3ae80f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):\n",
"    \"\"\"\n",
"    Draw a confusion matrix as an annotated heat map.\n",
"\n",
"    :param cm: square matrix of counts; rows are labelled as true labels and\n",
"               columns as predicted labels\n",
"    :param classes: tick labels, in the same order as the rows/columns of cm\n",
"    :param normalize: when True, show each row as fractions of its row total\n",
"    :param title: figure title\n",
"    :param cmap: matplotlib colour map used for the cells\n",
"    \"\"\"\n",
"    cm = np.asarray(cm)\n",
"    if normalize:\n",
"        row_totals = cm.sum(axis=1, keepdims=True)\n",
"        # Leave rows without samples at zero instead of dividing by zero.\n",
"        cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)\n",
"        fmt = '.2f'\n",
"    else:\n",
"        # Print whole counts; the default format turns 1000 into 1e+03.\n",
"        fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2g'\n",
"\n",
"    # Handing the labels to heatmap centres them on the cells; ticks placed at\n",
"    # 0, 1, ... would sit on the cell borders.\n",
"    sns.heatmap(cm, annot=True, fmt=fmt, cmap=cmap, xticklabels=classes, yticklabels=classes)\n",
"    plt.ylabel('True label')\n",
"    plt.xlabel('Predicted label')\n",
"    plt.title(title)\n",
"    plt.show()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "56dbb0d1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# NOTE: the variable is spelled 'lables' because the word-cloud cell reads it\n",
"# under that name.\n",
"adata, data_after_stop, lables = data_process()\n",
"data_tr, data_te, labels_tr, labels_te = train_test_split(\n",
"    adata, lables, test_size=0.2, random_state=0, stratify=lables)\n",
"\n",
"# Learn the vocabulary and the IDF weights from the training messages only and\n",
"# reuse both for the test messages, so the two matrices share one feature space.\n",
"countVectorizer = CountVectorizer()\n",
"tfidfTransformer = TfidfTransformer()\n",
"data_tr = countVectorizer.fit_transform(data_tr)\n",
"X_tr = tfidfTransformer.fit_transform(data_tr).toarray()  # GaussianNB needs dense input\n",
"data_te = countVectorizer.transform(data_te)\n",
"X_te = tfidfTransformer.transform(data_te).toarray()\n",
"\n",
"model = GaussianNB()\n",
"model.fit(X_tr, labels_tr)\n",
"pred = model.predict(X_te)\n",
"score = model.score(X_te, labels_te)\n",
"\n",
"# Persist every fitted step; the model alone cannot score new text.\n",
"with open('model.pkl', 'wb') as f:\n",
"    pickle.dump(model, f)\n",
"with open('countVectorizer.pkl', 'wb') as f:\n",
"    pickle.dump(countVectorizer, f)\n",
"with open('tfidfTransformer.pkl', 'wb') as f:\n",
"    pickle.dump(tfidfTransformer, f)\n",
"\n",
"# confusion_matrix orders rows/columns as given in labels=, so the tick labels\n",
"# must use the same order.\n",
"classes = [0, 1]\n",
"plot_confusion_matrix(confusion_matrix(labels_te, pred, labels=classes), classes, title=\"模型分类准确率{:.2f}%\".format(score * 100))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cc83543e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from collections import Counter\n",
"\n",
"# Font file handed to WordCloud for rendering the tokens; adjust the path on\n",
"# systems other than Windows.\n",
"FONT_PATH = r'C:/Windows/Fonts/SimHei.ttf'\n",
"\n",
"\n",
"def plot_wordcloud(token_lists, title):\n",
"    \"\"\"\n",
"    Draw a word cloud from token frequencies.\n",
"\n",
"    :param token_lists: iterable of token lists, one list per message\n",
"    :param title: text shown above the cloud\n",
"    \"\"\"\n",
"    word_fre = Counter(token for tokens in token_lists for token in tokens)\n",
"    wc = WordCloud(background_color='white', font_path=FONT_PATH)\n",
"    wc.fit_words(word_fre)\n",
"    plt.imshow(wc)\n",
"    plt.axis('off')\n",
"    plt.title(title)\n",
"    plt.show()\n",
"\n",
"\n",
"# One cloud per label value.\n",
"for value in (0, 1):\n",
"    plot_wordcloud(data_after_stop[lables == value], 'label = {}'.format(value))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bde6f19c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f089b32",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user