# --- patch metadata (kept verbatim) ---
# diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/__pycache__/app.cpython-310.pyc b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/__pycache__/app.cpython-310.pyc new file mode 100644 index 0000000..1acb757 Binary files /dev/null and b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/__pycache__/app.cpython-310.pyc differ
# diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/__pycache__/gnb_model.cpython-311.pyc b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/__pycache__/gnb_model.cpython-311.pyc new file mode 100644 index 0000000..54f9c52 Binary files /dev/null and b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/__pycache__/gnb_model.cpython-311.pyc differ
# diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/app.py b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/app.py new file mode 100644 index 0000000..f03cf15 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/app.py @@ -0,0 +1,22 @@
"""Flask front end for the spam-message recogniser.

Serves a single page ("/") on which a sentence can be submitted; every
submitted sentence is classified with ``gnb_model.make_predict`` and shown
in a history table rendered by ``input.html``.
"""
import os

from flask import Flask, request, render_template
from gnb_model import make_predict

app = Flask(__name__)

# History of (sentence, verdict) tuples rendered by the template.
# Module-level state: it is shared by every request handled by this process.
result = []


# NOTE: the view keeps the name ``input`` (shadowing the builtin in this
# module) because Flask derives the endpoint name from the function name.
@app.route("/", methods=['GET', 'POST'])
def input():
    """Handle the recogniser page.

    GET renders the page with the current history. POST optionally clears the
    history (when the form carries a ``clean`` field) and then, if a non-empty
    ``sentence`` field was submitted, classifies it and appends the pair to
    the history.

    Returns:
        The rendered ``input.html`` template.
    """
    if request.method == 'POST':
        app.logger.debug("form data: %s", request.form)
        if "clean" in request.form:
            # Clear in place so no ``global`` rebinding is needed.
            result.clear()
        # ``.get`` avoids an automatic 400 response when the field is absent.
        sentence = request.form.get('sentence', '')
        if sentence:
            result.append((sentence, make_predict(sentence)))
    return render_template("input.html", result=result)


# main route to start with
if __name__ == '__main__':
    # The Werkzeug debugger allows arbitrary code execution, so it must not be
    # enabled unconditionally on a server bound to every interface.
    # Opt in explicitly with FLASK_DEBUG=1 during development.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug, host='0.0.0.0')

# --- patch metadata (kept verbatim; continues on the following line) ---
# diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/gnb_model.py b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/gnb_model.py new file mode 100644 index 0000000..9a2a717 --- /dev/null +++ b/4.
# --- patch metadata (kept verbatim; continues from the preceding line) ---
# QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/gnb_model.py @@ -0,0 +1,77 @@
"""Gaussian naive Bayes spam-message classifier.

Importing this module loads the dataset, builds TF-IDF features from
jieba-segmented text, trains a ``GaussianNB`` model and prints its train/test
accuracy. ``make_predict`` is the entry point used by the web app.
"""
from pathlib import Path

import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Resolved against the current working directory, so the process has to be
# started from the directory that contains this file.
PATH_DATASET = Path.cwd().joinpath("../../dataset")
DELETE_STOPWORDS = False

df = pd.read_csv(PATH_DATASET.joinpath("message80W1.csv"), header=None)

N_pos, N_neg = 10000, 10000
# Rows with column 1 == 0 form the "positive" corpus (labelled y = 1 below);
# rows with column 1 == 1 form the "negative" corpus (labelled y = 0), which
# make_predict reports as "是垃圾".
# Presumably column 1 == 1 marks spam in the CSV -- confirm against the dataset.
df_positive = df[df[1]==0]
df_negative = df[df[1]==1]
np.random.seed(42)


def _segment(texts):
    """Return a list with each text cut by jieba and re-joined with spaces.

    A plain list is built on purpose: ``np.apply_along_axis`` sizes its output
    from the first result, which silently truncates every later (longer)
    string to the width of the first one.
    """
    return [' '.join(jieba.cut(text)) for text in texts]


def sample_df(df, N):
    """Draw ``N`` rows from ``df`` and return their third column (the text).

    NOTE(review): ``np.random.choice`` samples with replacement by default, so
    the same row can be drawn more than once and may end up in both the train
    and the test split. ``df.sample(n=N, random_state=42)`` would draw without
    replacement; left unchanged here to keep the sampled corpus identical.
    """
    indexs = np.random.choice(np.arange(len(df)), N)
    return df.iloc[indexs, 2]


corpus_pos = sample_df(df_positive, N_pos)
corpus_neg = sample_df(df_negative, N_neg)
corpus = np.concatenate([corpus_pos, corpus_neg]).reshape(-1,1)
y = np.concatenate([np.full(N_pos, 1), np.full(N_neg, 0)])

# Object dtype keeps every segmented message at its full length.
corpus_cut = np.array(_segment(corpus[:, 0]), dtype=object)

with open(PATH_DATASET.joinpath("stopword.txt"), encoding='gbk') as file:
    stopwords = file.read().split()

# Count vectorisation. A custom token_pattern is used because the default one
# drops single-character tokens: https://blog.csdn.net/xxzhix/article/details/82685372
# ``vectorizer`` is bound in both configurations; previously the stop-word
# variant was stored under another name and the fit below raised NameError.
if not DELETE_STOPWORDS:
    vectorizer = CountVectorizer(token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}')
else:
    vectorizer = CountVectorizer(token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}', stop_words=stopwords)
X = vectorizer.fit_transform(corpus_cut)
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)


def to_vector(X, stopwords=False):
    """Convert raw text(s) into TF-IDF vectors using the fitted transformers.

    Args:
        X: A string or a sequence of strings.
        stopwords: Unused; retained so existing callers keep working.

    Returns:
        The sparse TF-IDF matrix with one row per input text.
    """
    texts = np.array(X).reshape(-1)
    vector = vectorizer.transform(_segment(texts))
    return tfidf.transform(vector)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

gnb = GaussianNB()
print("拟合中......")
# GaussianNB needs dense input, hence the toarray() conversions.
gnb.fit(X_train.toarray(), y_train)

print("预测中......")
pred_train = gnb.predict(X_train.toarray())
pred_test = gnb.predict(X_test.toarray())
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print(f"准确率 train/test: {acc_train:.4f}/{acc_test:.4f}")


def make_predict(string:str):
    """Classify one message.

    Args:
        string: The raw message text.

    Returns:
        "是垃圾" when the model predicts class 0, otherwise "不是垃圾".
    """
    vector = to_vector([string]).toarray()
    # predict() returns an array; take the single label explicitly.
    label = gnb.predict(vector)[0]
    return "是垃圾" if label == 0 else "不是垃圾"


if __name__ == '__main__':
    samples = [
        "尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳",
        "CSC喜欢打游戏",
        "一刀999",
        "你好",
        "尊敬的客上,感谢您一直的支持,亿美亿康美容部特在本月的x、x、x三天举办秒杀活动,现场更是优惠多多,开抢倒计时还有两天,欲抢从速!xx号艳艳",
        "秒杀价格8848,8848你值得拥有",
        "有博主做过同类防晒霜的对比",
        "csc每天打游戏",
        "今天电脑爆炸了",
    ]
    for sample in samples:
        print(make_predict(sample))

# --- patch metadata (kept verbatim) ---
# \ No newline at end of file
# diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/spam_online.png b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/spam_online.png new file mode 100644 index 0000000..a538a39 Binary files /dev/null and b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/spam_online.png differ
# diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/Temp.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/Temp.html new file mode 100644 index 0000000..bd3dfef --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/Temp.html @@ -0,0 +1,26 @@ + + + + + 进入 + + + +

There is a form.

+
+

Name

+

Email

+

Phone number

+

+
+

HA!HA!HA!HA!

+ + \ No newline at end of file diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/hello.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/hello.html new file mode 100644 index 0000000..d1294b7 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/hello.html @@ -0,0 +1,14 @@ + + + + + Hello from Flask + + + {% if name %} +

Hello {{ name }}!

+ {% else %} +

Hello, World!

+ {% endif %} + + \ No newline at end of file diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/input.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/input.html new file mode 100644 index 0000000..89ce287 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/input.html @@ -0,0 +1,39 @@ + + +
+ + + CSC识别器 +
+ +
+

输入待识别文本:

+

+

+ +
+ {% if result|length >= 1 %} + + + + + + + + {% for sentence, predict in result %} + + + + + {% endfor %} + +
+ 历史信息 +
文本是/否为垃圾
{{ sentence }}{{ predict }}
+ {% endif %} + + diff --git a/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/result_data.html b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/result_data.html new file mode 100644 index 0000000..8d217d7 --- /dev/null +++ b/4. QuickDraw(你画我猜)在线交互识别系统/homeworks/在线垃圾邮件识别/templates/result_data.html @@ -0,0 +1,26 @@ + + + + + + +

Your Details

+ + {% for key, value in result.items() %} + + + + + {% endfor %} +
{{ key }}{{ value }}
+ + \ No newline at end of file