在线垃圾邮件识别
Signed-off-by: wty-yy <13190706+wty-yy@user.noreply.gitee.com>
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,22 @@
|
||||
# Importing Necessary Modules
|
||||
from flask import Flask, request, render_template
|
||||
from gnb_model import make_predict
|
||||
# Flask application object; the route below is registered on it and it is
# started from the __main__ guard at the bottom of this file.
app = Flask(__name__)
|
||||
|
||||
# History of (sentence, prediction) tuples shown in the page's table.
# NOTE(review): this is module-level state, so it is shared by every request
# handled by this process — confirm that a single shared history is intended.
result = []
|
||||
@app.route("/", methods=['GET', 'POST'])
def input():
    """Render the classifier page and process form submissions.

    GET: render ``input.html`` with the current history.

    POST: if the ``clean`` button was pressed the history is reset; then, if a
    non-empty ``sentence`` field was submitted, it is classified with
    ``make_predict`` and ``(sentence, prediction)`` is appended to the history
    before the page is rendered again.

    The function intentionally keeps the name ``input`` (which shadows the
    builtin inside this module) because Flask derives the endpoint name from
    the view function's name.
    """
    global result
    if request.method == 'POST':
        print(request.form)
        if "clean" in request.form:
            result = []
        # Use .get() so a POST without the field is treated as "nothing to
        # classify" instead of aborting with 400 Bad Request.
        sentence = request.form.get('sentence', '')
        if sentence:
            result.append((sentence, make_predict(sentence)))
    return render_template("input.html", result=result)
|
||||
|
||||
# Script entry point: start Flask's built-in development server.
if __name__ == '__main__':
    import os

    # The server listens on every interface (0.0.0.0).  The interactive
    # debugger lets anyone who can reach the page execute arbitrary Python,
    # so it is no longer enabled unconditionally: opt in explicitly with
    # FLASK_DEBUG=1 when developing locally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in (
        "1", "true", "yes", "on",
    )
    app.run(debug=debug_enabled, host='0.0.0.0')
|
||||
@@ -0,0 +1,77 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import jieba
|
||||
from pathlib import Path
|
||||
# Dataset directory, located relative to the current working directory.
PATH_DATASET = Path.cwd() / "../../dataset"
# Switch consulted further down when the CountVectorizer is built.
DELETE_STOPWORDS = False

# The CSV is read without a header row, so columns are addressed by integer label.
df = pd.read_csv(PATH_DATASET / "message80W1.csv", header=None)

# Number of messages drawn from each class.
N_pos = N_neg = 10000
# Column 1 is used as the label: rows equal to 0 form the "positive" frame,
# rows equal to 1 the "negative" frame.
df_positive = df.loc[df[1] == 0]
df_negative = df.loc[df[1] == 1]
# Seed NumPy's global RNG so the sampling below is repeatable.
np.random.seed(42)
|
||||
def sample_df(df, N):
    """Return ``N`` randomly picked entries from column position 2 of ``df``.

    Row positions are drawn by ``np.random.choice`` with its default settings
    (sampling with replacement), so the same row may be returned more than
    once and the outcome depends on NumPy's global random state.
    """
    picked_rows = np.random.choice(len(df), size=N)
    text_column = df.iloc[:, 2]
    return text_column.iloc[picked_rows]
|
||||
# Draw the texts used for each class.
corpus_pos = sample_df(df_positive, N_pos)
corpus_neg = sample_df(df_negative, N_neg)
# corpus_pos = df_positive.sample(n=N_pos, random_state=42).iloc[:,2]
# corpus_neg = df_negative.sample(n=N_neg, random_state=42).iloc[:,2]
# Column vector of raw texts: the df_positive samples first, then df_negative.
corpus = np.concatenate([corpus_pos, corpus_neg]).reshape(-1, 1)
# Targets aligned with ``corpus``: 1 for df_positive texts, 0 for df_negative.
y = np.concatenate([np.full(N_pos, 1), np.full(N_neg, 0)])

# Segment every message with jieba and join the tokens with spaces so that
# CountVectorizer can tokenise them.
# A list comprehension is used instead of np.apply_along_axis on purpose:
# apply_along_axis allocates its output with the dtype of the FIRST result
# (a fixed-width unicode dtype), which silently truncates every later
# message whose segmented form is longer than the first one.
corpus_cut = np.array(
    [' '.join(jieba.cut(text)) for text in corpus[:, 0]],
    dtype=object,
)
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Stop-word list: whitespace-separated words read from a GBK-encoded file.
with open(PATH_DATASET.joinpath("stopword.txt"), encoding='gbk') as file:
    stopwords = file.read().split()

# Count vectorisation.  The custom token_pattern works around single-character
# tokens not being kept as features:
# https://blog.csdn.net/xxzhix/article/details/82685372
# ``vectorizer`` is bound in BOTH configurations; previously the stop-word
# variant was stored under a different name, so enabling DELETE_STOPWORDS
# crashed with NameError on the fit_transform call below.
vectorizer = CountVectorizer(
    token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}',
    stop_words=stopwords if DELETE_STOPWORDS else None,
)
X = vectorizer.fit_transform(corpus_cut)
# Re-weight the raw counts with TF-IDF.
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)
|
||||
|
||||
def to_vector(X, stopwords=False):
    """Turn raw text(s) into TF-IDF features using the fitted transformers.

    Args:
        X: A string or a sequence of strings to vectorise.
        stopwords: Unused; kept so existing callers keep working.

    Returns:
        The result of ``tfidf.transform`` applied to the count vectors of the
        jieba-segmented texts, one row per input text.
    """
    texts = np.array(X).reshape(-1)
    # Segment with a plain comprehension rather than np.apply_along_axis:
    # the latter sizes its string output after the first result and would
    # truncate any later text whose segmented form is longer.
    cut = [' '.join(jieba.cut(text)) for text in texts]
    vector = vectorizer.transform(cut)
    return tfidf.transform(vector)
|
||||
|
||||
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# The sparse matrices are converted with .toarray() at each call site before
# being handed to GaussianNB.
gnb = GaussianNB()
print("拟合中......")
gnb.fit(X_train.toarray(), y_train)

print("预测中......")
pred_train = gnb.predict(X_train.toarray())
pred_test = gnb.predict(X_test.toarray())
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print(f"准确率 train/test: {acc_train:.4f}/{acc_test:.4f}")
|
||||
|
||||
def make_predict(string:str):
    """Classify a single message with the fitted Gaussian naive Bayes model.

    The text is vectorised by ``to_vector`` and converted to a dense array
    before prediction.  Returns "是垃圾" when the predicted class is 0 and
    "不是垃圾" otherwise.
    """
    features = to_vector([string]).toarray()
    predicted = gnb.predict(features)
    if predicted == 0:
        return "是垃圾"
    return "不是垃圾"
|
||||
|
||||
# Manual smoke test: print the prediction for a handful of sample messages.
if __name__ == '__main__':
    demo_messages = (
        "尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳",
        "CSC喜欢打游戏",
        "一刀999",
        "你好",
        "尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳",
        "秒杀价格8848,8848你值得拥有",
        "有博主做过同类防晒霜的对比",
        "csc每天打游戏",
        "今天电脑爆炸了",
    )
    for message in demo_messages:
        print(make_predict(message))
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 178 KiB |
@@ -0,0 +1,26 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>进入</title>
|
||||
<style>
|
||||
body {
|
||||
text-align: center;
|
||||
background-color: green;
|
||||
}
|
||||
form {
|
||||
display: inline-block;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h3>There is a form.</h3>
|
||||
<form action="/passing" method="post">
|
||||
<p>Name <input type="text" name="name"></p>
|
||||
<p>Email <input type="email" name="email"></p>
|
||||
<p>Phone number <input type="text" name="phone"></p>
|
||||
<p><input type="submit" value="Submit!!!"></p>
|
||||
</form>
|
||||
<h4>HA!HA!HA!HA!</h4>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,14 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Hello from Flask</title>
|
||||
</head>
|
||||
<body>
|
||||
{% if name %}
|
||||
<h1>Hello {{ name }}!</h1>
|
||||
{% else %}
|
||||
<h1>Hello, World!</h1>
|
||||
{% endif %}
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,39 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
<style>
body {
text-align: center;
}
</style>
<meta charset="utf-8">
<title>CSC识别器</title>
</head>
|
||||
<body>
|
||||
<form action="/" method="post">
|
||||
<p>输入待识别文本:<input type="text" name="sentence" minlength="1"></p>
|
||||
<p><button>提交</button></p>
|
||||
<p><button name="clean">清空历史信息</button></p>
|
||||
<!-- <p><input type="submit" name="提交"></p> -->
|
||||
</form>
|
||||
{% if result|length >= 1 %}
|
||||
<table align="center" border="1">
|
||||
<thead><tr><th colspan="2">
|
||||
<strong>历史信息</strong>
|
||||
</th></tr></thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>文本</td>
|
||||
<td>是/否为垃圾</td>
|
||||
</tr>
|
||||
{% for sentence, predict in result %}
|
||||
<tr>
|
||||
<td>{{ sentence }}</td>
|
||||
<td>{{ predict }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% endif %}
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,26 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
body {
|
||||
text-align: center;
|
||||
background-color: orange;
|
||||
}
|
||||
table {
|
||||
display: inline-block;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<p><strong>Your Details</strong></p>
|
||||
<table border=1>
|
||||
{% for key, value in result.items() %}
|
||||
<tr>
|
||||
<th>{{ key }}</th>
|
||||
<th>{{ value }}</th>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user