数据集处理以及建模
Signed-off-by: hsiatein <12955678+hsiatein@user.noreply.gitee.com>
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,50 @@
|
||||
import pandas
|
||||
from pandas import read_csv
|
||||
import sklearn
|
||||
import numpy
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from sklearn.metrics import r2_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.ensemble import AdaBoostRegressor
|
||||
from sklearn.ensemble import BaggingRegressor
|
||||
import pylab
|
||||
import joblib
|
||||
# Load the feature table written by the correlation-filtering script.
data = read_csv('train_q.csv', header=0)

# X keeps every column except the last one (the target column); X2
# additionally drops the last feature column.
# NOTE(review): the original comment describes X / X2 as the feature sets for
# correlation thresholds 0.05 / 0.1. The code only slices trailing columns, so
# that reading depends on the column order of train_q.csv - confirm.
X = data[data.columns[:-1]]
X2 = data[data.columns[:-2]]
y = data['salary']

# Train / test split. All three tables are split in ONE call so that the rows
# of X_*, X2_* and y_* stay aligned; two independent calls shuffle differently
# and would leave y_train / y_test matching only the second split.
X_train, X_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X, X2, y, test_size=0.4
)

# Bagging models. They are fitted on the complete data set, not on the
# training split.
BR = BaggingRegressor()   # model on X
BR2 = BaggingRegressor()  # model on X2
BR.fit(X, y)
BR2.fit(X2, y)
# To evaluate on the hold-out split, fit like this instead:
# BR.fit(X_train, y_train)
# BR2.fit(X2_train, y_train)

# AdaBoost models, also fitted on the complete data set.
AR = AdaBoostRegressor()
AR2 = AdaBoostRegressor()
AR.fit(X, y)
AR2.fit(X2, y)

# Persist the four fitted models for the prediction script.
joblib.dump(BR, './BaggingReg1.pkl')
joblib.dump(BR2, './BaggingReg2.pkl')
joblib.dump(AR, './AdaBoostReg1.pkl')
joblib.dump(AR2, './AdaBoostReg2.pkl')
|
||||
#绘图
|
||||
#y_predict = BR.predict(X_test)
|
||||
#pylab.subplot(2, 1, 1)
|
||||
#pylab.title('BaggingOrigin')
|
||||
#pylab.plot([i for i in range(len(y_test))], y_test)
|
||||
#pylab.subplot(2, 1, 2)
|
||||
#pylab.title('BaggingPredict')
|
||||
#pylab.plot([i for i in range(len(y_predict))], y_predict)
|
||||
#pylab.show()
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,15 @@
|
||||
import re
|
||||
import pandas as pd
|
||||
# Second half of the data-set preprocessing: keep only the features whose
# correlation with salary is strong enough.
path = 'train2.csv'
data = pd.read_csv(path, header=0)
data_q = data[['education_toint', 'jobage', 'worktime_weekly', 'sex', 'lang', 'salary']]

# Correlation between education, years of experience, weekly working time,
# sex, language requirement and salary.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', None)
correlation = data_q.corr()  # computed once, used for both printing and filtering
print(correlation)

# Keep the columns whose absolute correlation with salary exceeds 0.05.
# 'salary' itself always passes (correlation 1) and stays the last column.
data_q = data_q[data_q.columns[correlation['salary'].abs() > 0.05]]

# Save the reduced data set.
data_q.to_csv('train_q.csv', index=False, encoding='utf-8-sig')
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,28 @@
|
||||
import pandas
|
||||
import joblib
|
||||
# NOTE: joblib.load unpickles arbitrary objects - only load model files that
# were produced by the training script of this project.
BR = joblib.load('./BaggingReg1.pkl')
BR2 = joblib.load('./BaggingReg2.pkl')
AR = joblib.load('./AdaBoostReg1.pkl')
AR2 = joblib.load('./AdaBoostReg2.pkl')


def _predict_single(model, **features):
    """Predict with ``model`` for one sample given as ``column=value`` pairs.

    The keyword order becomes the column order of the one-row DataFrame
    handed to ``model.predict``; the prediction array is returned as is.
    """
    df = pandas.DataFrame({name: [value] for name, value in features.items()})
    return model.predict(df)


def model1_pre(edu, jobage, worktime):
    """Predict with the bagging model BR (education, experience, weekly hours)."""
    return _predict_single(BR, education_toint=edu, jobage=jobage, worktime_weekly=worktime)


def model2_pre(edu, jobage):
    """Predict with the bagging model BR2 (education, experience)."""
    return _predict_single(BR2, education_toint=edu, jobage=jobage)


def model3_pre(edu, jobage, worktime):
    """Predict with the AdaBoost model AR (education, experience, weekly hours)."""
    return _predict_single(AR, education_toint=edu, jobage=jobage, worktime_weekly=worktime)


def model4_pre(edu, jobage):
    """Predict with the AdaBoost model AR2 (education, experience)."""
    return _predict_single(AR2, education_toint=edu, jobage=jobage)


# Quick manual check: same education and weekly hours, 5 vs 10 years.
print(model1_pre(7, 5, 40)[0], model1_pre(7, 10, 40)[0])
|
||||
@@ -0,0 +1,493 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import re
|
||||
import matplotlib.pyplot as plt
|
||||
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
|
||||
|
||||
# Use a font with CJK glyphs so the Chinese chart labels render, and keep the
# minus sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# NOTE(review): read_csv consumes the first row as a header here, and the
# column names are overwritten below - confirm job.csv really has a header row.
data = pd.read_csv('job.csv')

# Duplicates: drop repeated rows and renumber the index 0..n-1.
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

data.columns = ['job', 'num', 'company', 'job_type', 'jobage', 'lang', 'age', 'sex', 'education', 'add', 'worktime', 'salary', 'fuli', 'lianxiren', 'phonecall', 'add2', 'company_type', 'industry', 'require']

# The cleaning steps below assign the result back to the column instead of
# calling inplace=True on a column selection: that chained form is deprecated
# and does not modify `data` under pandas Copy-on-Write.

# Head count: missing -> 1 (usually one person); "若干" (several) -> 3.
data['num'] = data['num'].fillna(1).replace('若干', 3)

# Age requirement: missing -> "不限" (no limit); "18岁至35岁" -> "18-35".
data['age'] = data['age'].fillna('不限')
data['age'] = data['age'].apply(lambda x: x.replace('岁至', '-').replace('岁', ''))

# Language requirement: ignore the proficiency level, keep the language name.
data['lang'] = data['lang'].fillna('不限')
data['lang'] = data['lang'].apply(lambda x: x.split('水平')[0])
data['lang'] = data['lang'].replace('其他', '不限')

# Monthly salary: strip the label and keep the lower bound of a range,
# e.g. 5000-6000 -> 5000.
data['salary'] = data['salary'].apply(lambda x: x.replace('参考月薪: ', '') if '参考月薪: ' in str(x) else x)
data['salary'] = data['salary'].apply(lambda x: x.split('-', 1)[0] if '-' in str(x) else x)
# NOTE(review): the original author already questioned filling a missing
# salary with 0.
data['salary'] = data['salary'].fillna('0')

# Every remaining missing value becomes "其他" (other).
data.fillna('其他', inplace=True)
|
||||
|
||||
# Work-experience normalisation.
# Chinese numerals recognised in requirements such as "三年以上".
_JOBAGE_DIGITS = {
    '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
    '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '两': 2,
}


def jobage_clean(x):
    """Normalise a free-text work-experience requirement.

    :param x: raw requirement text
    :return: ``x`` itself for '应届生' / '不限'; the digits as ``str`` for an
        Arabic "<N>年"; an ``int`` for a recognised Chinese numeral before
        "年" (an unrecognised fragment is returned as text); otherwise
        '其它工作经验'.

    NOTE(review): Arabic years come back as ``str`` and Chinese numerals as
    ``int``; callers that group by this value should normalise the type.
    """
    if x in ('应届生', '不限'):
        return x
    arabic = re.search(r'(\d+)年', x)
    if arabic:
        return arabic.group(1)
    # Up to two non-space characters directly in front of "年", e.g. "验三年".
    # When "年" has nothing in front of it there is no match and the generic
    # bucket below is used (indexing an empty findall result raised here).
    fragment = re.search(r'\S{1,2}年', x)
    if fragment:
        key = re.sub('厂|验|年|,', '', fragment.group(0))
        return _JOBAGE_DIGITS.get(key, key)
    return '其它工作经验'
|
||||
|
||||
data['jobage'] = data['jobage'].apply(jobage_clean)

# Sex: "无" (none) means no restriction. Assigned back instead of a chained
# inplace call, which is deprecated and a no-op under pandas Copy-on-Write.
data['sex'] = data['sex'].replace('无', '不限')

# Job type: graduate traineeships are counted as internships.
data['job_type'] = data['job_type'].replace('毕业生见习', '实习')

# Education: keep the first two characters of the text.
data['education'] = data['education'].apply(lambda x: x[:2])
|
||||
|
||||
# Company-type normalisation.
def company_type_clean(x):
    """Collapse a raw company-type text into a small set of categories.

    :param x: raw company type
    :return: '其他' for texts longer than 100 characters or containing "其他";
        '民营/私营', '外资' or '合资' when the matching keyword occurs (checked
        in that order); otherwise ``x`` unchanged.
    """
    if len(x) > 100 or '其他' in x:
        return '其他'
    if '私营' in x or '民营' in x:
        return '民营/私营'
    if '外资' in x or '外企代表处' in x:
        return '外资'
    if '合资' in x:
        return '合资'
    return x
|
||||
|
||||
# Replace every raw company type by its normalised category.
data['company_type'] = data['company_type'].apply(company_type_clean)
|
||||
|
||||
# Industry normalisation: a posting may list several industries; take the
# first keyword group that matches and fall back to the first listed one.
# The patterns are compiled once; the table order is the matching priority.
_INDUSTRY_PATTERNS = [
    (industry, re.compile(keyword))
    for industry, keyword in {
        'IT互联网': '互联网|计算机|网络游戏', '房地产': '房地产', '电子技术': '电子技术', '建筑': '建筑|装潢',
        '教育培训': '教育|培训', '批发零售': '批发|零售', '金融': '金融|银行|保险', '住宿餐饮': '餐饮|酒店|食品',
        '农林牧渔': '农|林|牧|渔', '影视文娱': '影视|媒体|艺术|广告|公关|办公|娱乐', '医疗保健': '医疗|美容|制药',
        '物流运输': '物流|运输', '电信通信': '电信|通信', '生活服务': '人力|中介',
    }.items()
]


def industry_clean(x):
    """Map a raw industry text to a coarse industry category.

    :param x: raw industry text, possibly several industries joined by "、"
    :return: '其他' for texts longer than 100 characters or containing "其他";
        the first category whose keywords occur in ``x``; otherwise the first
        "、"-separated entry with "/" removed.
    """
    if len(x) > 100 or '其他' in x:
        return '其他'
    for industry, pattern in _INDUSTRY_PATTERNS:
        if pattern.search(x):
            return industry
    return x.split('、')[0].replace('/', '')
|
||||
|
||||
data['industry'] = data['industry'].apply(industry_clean)

# Working time: the text in front of "小时" is kept as the daily hours and the
# run of non-space characters ending in "周" as the weekly schedule; 0 marks a
# posting that does not state the value.
data['worktime_day'] = data['worktime'].apply(lambda x: x.split('小时')[0] if '小时' in x else 0)
data['worktime_week'] = data['worktime'].apply(lambda x: re.findall(r'\S*周', x)[0] if '周' in x else 0)

# Skills: every run of Latin letters in the job requirements, joined by "、".
data['skill'] = data['require'].apply(lambda x: '、'.join(re.findall('[a-zA-Z]+', x)))
|
||||
|
||||
## Overall figures
# Number of distinct companies (rows minus repeated company names).
company_num = data['company'].nunique(dropna=False)
# Number of postings.
job_num = data.shape[0]
# Total head count and mean salary; both columns hold text and are converted
# to integers first.
person_num = np.array(data.loc[:, 'num'].tolist(), dtype=int).sum()
avg_salary = np.array(data.loc[:, 'salary'].tolist(), dtype=int).mean()
|
||||
## Ten most frequent industries
# Series.value_counts replaces the deprecated top-level pd.value_counts and
# is already sorted by descending frequency.
industry_list = data['industry'].value_counts()
# The chart title promises ten industries; the former [0:9] slice showed nine.
top_industries = industry_list.head(10)
plt.figure(figsize=(12, 8))
plt.bar(top_industries.index, top_industries.values)
plt.title("前十的行业类型")
|
||||
## Share of each company type
# Series.value_counts replaces the deprecated top-level pd.value_counts.
company_type_list = data['company_type'].value_counts()
plt.figure(figsize=(12, 8))
plt.pie(company_type_list, labels=company_type_list.index)
plt.title("企业类型比较")
|
||||
## Average salary per company type
company_1 = data.loc[:, 'company_type']
salary_1 = data.loc[:, 'salary']
# The known types are pre-seeded so they keep a fixed position on the axis.
known_types = ['民营/私营', '台资/港资', '外资', '其他', '上市公司', '合资', '国营企业', '事业单位']
posting_count = dict.fromkeys(known_types, 0)
salary_total = dict.fromkeys(known_types, 0)
for company_type, salary in zip(company_1, salary_1):
    # .get lets a type outside the pre-seeded list be counted instead of
    # raising KeyError.
    posting_count[company_type] = posting_count.get(company_type, 0) + 1
    salary_total[company_type] = salary_total.get(company_type, 0) + int(salary)
# A type without any posting gets an average of 0 instead of dividing by zero.
average_salary = {
    company_type: (salary_total[company_type] / count if count else 0)
    for company_type, count in posting_count.items()
}

plt.figure(figsize=(12, 8))
plt.bar(list(average_salary.keys()), list(average_salary.values()))
plt.title("不同企业类型工资对比")
|
||||
|
||||
## Companies hiring the most people
company_1 = data.loc[:, 'company']
num_1 = data.loc[:, 'num']
# Total head count per company.
headcount = {}
for company, num in zip(company_1, num_1):
    headcount[company] = headcount.get(company, 0) + int(num)
# Slicing keeps at most ten entries, so fewer than ten companies no longer
# raises IndexError.
top_companies = sorted(headcount.items(), key=lambda item: item[1], reverse=True)[:10]
tmp2 = [company for company, _ in top_companies]
tmp3 = [total for _, total in top_companies]
plt.rcParams.update({'font.size': 6.5})
plt.figure(figsize=(100, 8))
plt.bar(tmp2, tmp3)
plt.title("招聘人数最多公司")
|
||||
|
||||
## Companies with the highest average salary
# The average is weighted by the head count of every posting.
headcount = {}
salary_total = {}
for company, salary, num in zip(company_1, salary_1, num_1):
    people = int(num)
    headcount[company] = headcount.get(company, 0) + people
    salary_total[company] = salary_total.get(company, 0) + int(salary) * people
# Companies with a total head count of 0 are left out instead of dividing by
# zero.
average_salary = {
    company: salary_total[company] / people
    for company, people in headcount.items()
    if people
}
# Slicing keeps at most ten entries, so fewer than ten companies no longer
# raises IndexError.
top_companies = sorted(average_salary.items(), key=lambda item: item[1], reverse=True)[:10]
tmp2 = [company for company, _ in top_companies]
tmp3 = [average for _, average in top_companies]
plt.rcParams.update({'font.size': 6.5})
plt.figure(figsize=(100, 8))
plt.bar(tmp2, tmp3)
plt.title("平均薪资最高的公司")
|
||||
## Daily working hours
worktime_day_1 = data.loc[:, 'worktime_day']
# Head count per daily-hours value.
headcount = {}
for hours, num in zip(worktime_day_1, num_1):
    headcount[hours] = headcount.get(hours, 0) + int(num)
ranked = sorted(headcount.items(), key=lambda item: item[1], reverse=True)
# Look at the ten largest buckets (fewer if there are not ten) and drop the 0
# bucket, which stands for postings that do not state the hours.
shown = [(hours, total) for hours, total in ranked[:10] if hours != 0]
tmp2 = [hours for hours, _ in shown]
tmp3 = [total for _, total in shown]
plt.rcParams.update({'font.size': 6.5})
plt.figure(figsize=(100, 8))
plt.bar(tmp2, tmp3)
plt.title("每天工作时间")
|
||||
## Weekly working schedule
worktime_week_1 = data.loc[:, 'worktime_week']
# Head count per weekly-schedule value.
headcount = {}
for schedule, num in zip(worktime_week_1, num_1):
    headcount[schedule] = headcount.get(schedule, 0) + int(num)
ranked = sorted(headcount.items(), key=lambda item: item[1], reverse=True)
# Same rule as the daily-hours chart: take the ten largest buckets and drop
# the 0 bucket ("not stated"). Skipping rank 1 blindly, as before, only gave
# this result when the 0 bucket happened to be the largest one.
shown = [(schedule, total) for schedule, total in ranked[:10] if schedule != 0]
tmp2 = [schedule for schedule, _ in shown]
tmp3 = [total for _, total in shown]
plt.rcParams.update({'font.size': 6.5})
plt.figure(figsize=(100, 8))
plt.bar(tmp2, tmp3)
plt.title("每周工作时间")
|
||||
|
||||
## Head count per Xiamen district
district_headcount = {'同安区': 0, '翔安区': 0, '集美区': 0, '海沧区': 0, '湖里区': 0, '思明区': 0}
add_1 = data.loc[:, 'add']
for places, num in zip(add_1, num_1):
    # A posting may list several work places separated by "、"; a single
    # place is just a one-element list, so one code path covers both cases.
    for place in places.split('、'):
        parts = place.split('市')
        # len(parts) > 1 guards the bare text "厦门", which has no district
        # part to index.
        if len(parts) > 1 and parts[0] == '厦门' and parts[1] in district_headcount:
            district_headcount[parts[1]] += int(num)

plt.figure(figsize=(12, 8))
plt.bar(list(district_headcount.keys()), list(district_headcount.values()))
plt.title("不同区招聘人数对比")
|
||||
## Welfare word cloud
fuli_1 = data.loc[:, 'fuli']
# Dump the welfare texts to fuli.txt, one posting per line. The line break
# keeps the last word of one posting from fusing with the first word of the
# next, and the `with` block closes (flushes) the file before it is read back.
with open('fuli.txt', mode='w', encoding='utf-8') as fuli_file:
    fuli_file.write('\n'.join(fuli_1))
with open('./fuli.txt', 'r', encoding='utf-8') as fuli_file:
    Note = fuli_file.read()

wordcloud = WordCloud(
    background_color='white',  # background colour
    # mask = backgroup_Image,  # optional mask image
    font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # a font with Chinese glyphs
    width=1000,   # width in pixels
    height=860,   # height in pixels
    margin=2,     # blank margin
).generate(Note)  # generate() tokenises the whole text itself

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
|
||||
|
||||
## Required work experience
jobage_1 = data.loc[:, 'jobage']
# Head count per experience value. The keys are converted to text because the
# column mixes str ('3') and int (3) for the same number of years; without
# the conversion they would become separate pie slices.
tmp1 = {}
for jobage, num in zip(jobage_1, num_1):
    key = str(jobage)
    tmp1[key] = tmp1.get(key, 0) + int(num)
plt.rcParams.update({'font.size': 8})
plt.figure(figsize=(12, 8))
plt.pie(list(tmp1.values()), labels=list(tmp1.keys()))
plt.title("工作经验要求")
|
||||
|
||||
## Average salary per required work experience
# Head-count-weighted average; keys are text so '3' and 3 share one bucket.
salary_total = {}
headcount = {}
for jobage, salary, num in zip(jobage_1, salary_1, num_1):
    key = str(jobage)
    people = int(num)
    salary_total[key] = salary_total.get(key, 0) + int(salary) * people
    headcount[key] = headcount.get(key, 0) + people
# A bucket with a head count of 0 gets an average of 0 instead of dividing by
# zero.
tmp1 = {
    key: (float(total) / float(headcount[key]) if headcount[key] else 0)
    for key, total in salary_total.items()
}
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("平均工资和工作经验的关系")
|
||||
## Head count per education level
education_1 = data.loc[:, 'education']
tmp1 = {}
# zip pairs each posting with its head count; no manual row counter needed.
for education, num in zip(education_1, num_1):
    tmp1[education] = tmp1.get(education, 0) + int(num)
plt.rcParams.update({'font.size': 8})
plt.figure(figsize=(12, 8))
plt.pie(list(tmp1.values()), labels=list(tmp1.keys()))
plt.title("岗位数和学历的关系")
|
||||
## Average salary per education level
# Head-count-weighted average.
salary_total = {}
headcount = {}
for education, salary, num in zip(education_1, salary_1, num_1):
    key = str(education)
    people = int(num)
    salary_total[key] = salary_total.get(key, 0) + int(salary) * people
    headcount[key] = headcount.get(key, 0) + people
# A level with a head count of 0 gets an average of 0 instead of dividing by
# zero.
tmp1 = {
    key: (total / headcount[key] if headcount[key] else 0)
    for key, total in salary_total.items()
}
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("学历和平均工资的关系")
|
||||
## Head count per sex requirement
sex_1 = data.loc[:, 'sex']
tmp1 = {}
# zip pairs each posting with its head count; no manual row counter needed.
for sex, num in zip(sex_1, num_1):
    tmp1[sex] = tmp1.get(sex, 0) + int(num)
plt.rcParams.update({'font.size': 8})
plt.figure(figsize=(12, 8))
plt.pie(list(tmp1.values()), labels=list(tmp1.keys()))
plt.title("岗位数和性别的关系")
|
||||
## Average salary per sex requirement
# Head-count-weighted average.
salary_total = {}
headcount = {}
for sex, salary, num in zip(sex_1, salary_1, num_1):
    key = str(sex)
    people = int(num)
    salary_total[key] = salary_total.get(key, 0) + int(salary) * people
    headcount[key] = headcount.get(key, 0) + people
# A value with a head count of 0 gets an average of 0 instead of dividing by
# zero.
tmp1 = {
    key: (total / headcount[key] if headcount[key] else 0)
    for key, total in salary_total.items()
}
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("性别和平均工资的关系")
|
||||
## Head count per age bracket
age_1 = data.loc[:, 'age']
tmp1 = {'不限': 0, '35岁以上': 0, '35岁以下': 0}
for age, num in zip(age_1, num_1):
    people = int(num)
    if age == '不限':
        tmp1['不限'] += people
        continue
    # Every other value is expected to look like "<low>-<high>".
    bounds = age.split('-')
    if int(bounds[1]) <= 35:
        tmp1['35岁以下'] += people
    elif int(bounds[0]) > 35:
        tmp1['35岁以上'] += people
    else:
        # The range straddles 35, so the posting counts for both brackets.
        tmp1['35岁以下'] += people
        tmp1['35岁以上'] += people
plt.rcParams.update({'font.size': 8})
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("岗位数和年龄的关系")
|
||||
## Average salary per age bracket
salary_total = {'不限': 0, '35岁以上': 0, '35岁以下': 0}
headcount = {'不限': 0, '35岁以上': 0, '35岁以下': 0}
for age, salary, num in zip(age_1, salary_1, num_1):
    people = int(num)
    if age == '不限':
        brackets = ['不限']
    else:
        # Every other value is expected to look like "<low>-<high>".
        bounds = age.split('-')
        if int(bounds[1]) <= 35:
            brackets = ['35岁以下']
        elif int(bounds[0]) > 35:
            brackets = ['35岁以上']
        else:
            # The range straddles 35: the posting counts for both brackets.
            brackets = ['35岁以下', '35岁以上']
    for bracket in brackets:
        salary_total[bracket] += int(salary) * people
        headcount[bracket] += people
# An empty bracket gets an average of 0 instead of dividing by zero.
tmp1 = {
    bracket: (total / headcount[bracket] if headcount[bracket] else 0)
    for bracket, total in salary_total.items()
}
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("平均工资和年龄的关系")
|
||||
|
||||
## Head count per required (human) language
lang_1 = data.loc[:, 'lang']
tmp1 = {}
# zip keeps each posting paired with its own head count even when the
# posting is skipped.
for lang, num in zip(lang_1, num_1):
    if lang != '不限':  # postings without a language requirement are skipped
        tmp1[lang] = tmp1.get(lang, 0) + int(num)
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("语言要求和岗位的关系")
|
||||
|
||||
## Mentions per programming language
skill_1 = data.loc[:, 'skill']
tmp1 = {'C': 0, 'JAVA': 0, 'SQL': 0, 'PHP': 0, 'PYTHON': 0, 'GO': 0}
for skills in skill_1:
    # Every occurrence of a tracked language counts once, case-insensitively.
    for token in skills.split('、'):
        language = token.upper()
        if language in tmp1:
            tmp1[language] += 1
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
# The title used to be a copy of the human-language chart's title.
plt.title("编程语言和岗位的关系")
|
||||
|
||||
## Average salary per programming language
salary_total = {'C': 0, 'JAVA': 0, 'SQL': 0, 'PHP': 0, 'PYTHON': 0, 'GO': 0}
headcount = {'C': 0, 'JAVA': 0, 'SQL': 0, 'PHP': 0, 'PYTHON': 0, 'GO': 0}
for skills, salary, num in zip(skill_1, salary_1, num_1):
    people = int(num)
    # Every occurrence of a tracked language adds the posting's
    # head-count-weighted salary, case-insensitively.
    for token in skills.split('、'):
        language = token.upper()
        if language in salary_total:
            salary_total[language] += int(salary) * people
            headcount[language] += people
# A language that is never mentioned gets an average of 0 instead of dividing
# by zero.
tmp1 = {
    language: (total / headcount[language] if headcount[language] else 0)
    for language, total in salary_total.items()
}
plt.figure(figsize=(12, 8))
plt.bar(list(tmp1.keys()), list(tmp1.values()))
plt.title("编程语言和平均薪资的关系")
|
||||
@@ -0,0 +1,31 @@
|
||||
-- Create the table; every column is stored as a string.
CREATE TABLE `job`(
  `position` string COMMENT '职位',
  `num` string COMMENT '招聘人数',
  `company` string COMMENT '公司',
  `job_type` string COMMENT '职位类型',
  `jobage` string COMMENT '工作年限',
  `lang` string COMMENT '语言',
  `age` string COMMENT '年龄',
  `sex` string COMMENT '性别',
  `education` string COMMENT '学历',
  `workplace` string COMMENT '工作地点',
  `worktime` string COMMENT '工作时间',
  `salary` string COMMENT '薪资',
  `welfare` string COMMENT '福利待遇',
  `hr` string COMMENT '招聘人',
  `phone` string COMMENT '联系电话',
  `address` string COMMENT '联系地址',
  `company_type` string COMMENT '公司类型',
  `industry` string COMMENT '行业',
  `require` string COMMENT '岗位要求',
  `worktime_day` string COMMENT '工作时间(每天)',
  `worktime_week` string COMMENT '工作时间(每周)',
  `skill` string COMMENT '技能要求'
)
row format delimited
fields terminated by ','
lines terminated by '\n';

-- Load the data.
-- NOTE(review): the plain ',' delimiter does not understand CSV quoting; if
-- job_clean.csv contains quoted fields with embedded commas or line breaks
-- the columns will shift - confirm, or switch to a CSV SerDe.
LOAD DATA INPATH '/tmp/job_clean.csv' OVERWRITE INTO TABLE job;
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2021/1/23 13:16
|
||||
# @Author : way
|
||||
# @Site :
|
||||
# @Describe: 模型预测
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import sklearn
|
||||
|
||||
def predict(data, education):
    """
    Fit a one-feature linear regression for a single education level.

    The rows whose 'education' column equals ``education`` are selected; the
    second column is the regressor and the third column the target.
    NOTE(review): the columns are taken by position - presumably
    (education, jobage, salary); confirm against train.csv.

    :param data: training data (DataFrame with an 'education' column)
    :param education: education level to model
    :return: (model score on the selected rows, predictions for the regressor
        values 0..10 - index i is the prediction for i years)
    :raises ValueError: if no row has the requested education level
    """
    train = data[data['education'] == education].to_numpy()
    if len(train) == 0:
        # Fail with a message that names the cause instead of the generic
        # "0 samples" error raised by fit().
        raise ValueError(f"no training rows for education level {education!r}")
    x = train[:, 1:2]
    y = train[:, 2]

    # Model training
    model = LinearRegression()
    model.fit(x, y)

    # Model prediction
    X = [[i] for i in range(11)]
    return model.score(x, y), model.predict(X)
|
||||
|
||||
education_list = ['小学', '初中', '中专', '高中', '大专', '本科', '硕士', '博士']
data = pd.read_csv('train.csv')

# One model per education level: its score and its 0..10-year predictions.
scores, values = [], []
for education in education_list:
    score, y = predict(data, education)
    scores.append(score)
    values.append(y)

# predict() returns one prediction per year 0..10, so value[i] is the salary
# for i years of experience. The 3- and 5-year columns used to read value[2]
# and value[4], i.e. the 2- and 4-year predictions.
result = pd.DataFrame()
result['学历'] = education_list
result['模型得分'] = scores
result['(1年经验)平均工资'] = [value[1] for value in values]
result['(3年经验)平均工资'] = [value[3] for value in values]
result['(5年经验)平均工资'] = [value[5] for value in values]
result['(10年经验)平均工资'] = [value[10] for value in values]
print(result)
|
||||
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2021/01/22 21:36
|
||||
# @Author : way
|
||||
# @Site :
|
||||
# @Describe: 数据可视化
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
from pyecharts import options as opts
|
||||
from pyecharts.charts import WordCloud, Map
|
||||
from pyecharts.globals import SymbolType
|
||||
|
||||
# Welfare word cloud
data = pd.read_csv('welfare.csv')

# pyecharts serialises its options to JSON and expects native Python
# containers, so the rows are handed over as plain lists (.tolist()) rather
# than as a numpy array.
c = (
    WordCloud()
    .add("", data.values.tolist(), word_size_range=[20, 100], shape=SymbolType.DIAMOND)
    .set_global_opts(title_opts=opts.TitleOpts())
    .render("wordcloud.html")
)
# NOTE(review): running a bare .html file name through the shell presumably
# relies on the OS file association (Windows) - confirm on other platforms.
os.system("wordcloud.html")

# Job distribution over the districts of Xiamen
data = pd.read_csv('workplace.csv')

c1 = (
    Map()
    .add("岗位数", data.values.tolist(), "厦门")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="厦门岗位分布图"),
        visualmap_opts=opts.VisualMapOpts(max_=20000, min_=5000)
    )
    .render("workplace.html")
)
os.system("workplace.html")
|
||||
@@ -0,0 +1,165 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2021/1/21 21:06
|
||||
# @Author : way
|
||||
# @Site :
|
||||
# @Describe: 数据处理
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
path = 'job.csv'
# job.csv is read without a header row; the column names are set here.
data = pd.read_csv(path, header=None)
data.columns = [
    'position', 'num', 'company', 'job_type', 'jobage', 'lang', 'age', 'sex', 'education', 'workplace', 'worktime',
    'salary', 'welfare', 'hr', 'phone', 'address', 'company_type', 'industry', 'require'
]

############################################### Data cleaning ########################################################
# Duplicates: drop repeated rows and renumber the index 0..n-1.
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

# The cleaning steps below assign the result back to the column instead of
# calling inplace=True on a column selection: that chained form is deprecated
# and does not modify `data` under pandas Copy-on-Write.

# Head count: missing -> 1 (usually one person); "若干" (several) -> 3.
data['num'] = data['num'].fillna(1).replace('若干', 3)

# Age requirement: missing -> "不限" (no limit); "18岁至35岁" -> "18-35".
data['age'] = data['age'].fillna('不限')
data['age'] = data['age'].apply(lambda x: x.replace('岁至', '-').replace('岁', ''))

# Language requirement: ignore the proficiency level, then encode
# "has a requirement" as 1 and "no limit" as 0.
data['lang'] = data['lang'].fillna('不限')
data['lang'] = data['lang'].apply(lambda x: x.split('水平')[0])
data['lang'] = data['lang'].replace('其他', '不限')
data['lang'] = data['lang'].apply(lambda x: 0 if x == '不限' else 1)

# Monthly salary: strip the label and keep the lower bound of a range,
# e.g. 5000-6000 -> 5000; missing -> '0'.
data['salary'] = data['salary'].apply(lambda x: x.replace('参考月薪: ', '') if '参考月薪: ' in str(x) else x)
data['salary'] = data['salary'].apply(lambda x: x.split('-', 1)[0] if '-' in str(x) else x)
data['salary'] = data['salary'].fillna('0')

# Every remaining missing value becomes "其他" (other).
data.fillna('其他', inplace=True)
|
||||
|
||||
|
||||
# Work-experience normalisation.
# Chinese numerals recognised in requirements such as "三年以上".
_JOBAGE_DIGITS = {
    '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
    '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '两': 2,
}


def jobage_clean(x):
    """Normalise a free-text work-experience requirement.

    :param x: raw requirement text
    :return: ``x`` itself for '应届生' / '不限'; the digits as ``str`` for an
        Arabic "<N>年"; an ``int`` for a recognised Chinese numeral before
        "年" (an unrecognised fragment is returned as text); otherwise
        '其它工作经验'.

    NOTE(review): Arabic years come back as ``str`` and Chinese numerals as
    ``int``; callers that group by this value should normalise the type.
    """
    if x in ('应届生', '不限'):
        return x
    arabic = re.search(r'(\d+)年', x)
    if arabic:
        return arabic.group(1)
    # Up to two non-space characters directly in front of "年", e.g. "验三年".
    # When "年" has nothing in front of it there is no match and the generic
    # bucket below is used (indexing an empty findall result raised here).
    fragment = re.search(r'\S{1,2}年', x)
    if fragment:
        key = re.sub('厂|验|年|,', '', fragment.group(0))
        return _JOBAGE_DIGITS.get(key, key)
    return '其它工作经验'
|
||||
|
||||
|
||||
data['jobage'] = data['jobage'].apply(jobage_clean)

# Sex: "无" (none) means no restriction; the numeric encoding (male 1,
# female -1, no limit 0) is applied afterwards. Assigned back instead of a
# chained inplace call, which is a no-op under pandas Copy-on-Write.
data['sex'] = data['sex'].replace('无', '不限')
|
||||
def tempf3(x):
    """Encode the sex requirement: '男' -> 1, '女' -> -1, anything else -> 0."""
    if x == '男':
        return 1
    if x == '女':
        return -1
    return 0
|
||||
data['sex'] = data['sex'].apply(tempf3)

# Job type: graduate traineeships are counted as internships. Assigned back
# instead of a chained inplace call, which is a no-op under pandas
# Copy-on-Write.
data['job_type'] = data['job_type'].replace('毕业生见习', '实习')

# Education: keep the first two characters of the text.
data['education'] = data['education'].apply(lambda x: x[:2])
|
||||
|
||||
|
||||
# Company-type normalisation.
def company_type_clean(x):
    """Collapse a raw company-type text into a small set of categories.

    :param x: raw company type
    :return: '其他' for texts longer than 100 characters or containing "其他";
        '民营/私营', '外资' or '合资' when the matching keyword occurs (checked
        in that order); otherwise ``x`` unchanged.
    """
    if len(x) > 100 or '其他' in x:
        return '其他'
    if '私营' in x or '民营' in x:
        return '民营/私营'
    if '外资' in x or '外企代表处' in x:
        return '外资'
    if '合资' in x:
        return '合资'
    return x
|
||||
|
||||
|
||||
# Replace every raw company type by its normalised category.
data['company_type'] = data['company_type'].apply(company_type_clean)
|
||||
|
||||
|
||||
# Industry normalisation: a posting may list several industries; take the
# first keyword group that matches and fall back to the first listed one.
# The patterns are compiled once; the table order is the matching priority.
_INDUSTRY_PATTERNS = [
    (industry, re.compile(keyword))
    for industry, keyword in {
        'IT互联网': '互联网|计算机|网络游戏', '房地产': '房地产', '电子技术': '电子技术', '建筑': '建筑|装潢',
        '教育培训': '教育|培训', '批发零售': '批发|零售', '金融': '金融|银行|保险', '住宿餐饮': '餐饮|酒店|食品',
        '农林牧渔': '农|林|牧|渔', '影视文娱': '影视|媒体|艺术|广告|公关|办公|娱乐', '医疗保健': '医疗|美容|制药',
        '物流运输': '物流|运输', '电信通信': '电信|通信', '生活服务': '人力|中介',
    }.items()
]


def industry_clean(x):
    """Map a raw industry text to a coarse industry category.

    :param x: raw industry text, possibly several industries joined by "、"
    :return: '其他' for texts longer than 100 characters or containing "其他";
        the first category whose keywords occur in ``x``; otherwise the first
        "、"-separated entry with "/" removed.
    """
    if len(x) > 100 or '其他' in x:
        return '其他'
    for industry, pattern in _INDUSTRY_PATTERNS:
        if pattern.search(x):
            return industry
    return x.split('、')[0].replace('/', '')
|
||||
|
||||
|
||||
data['industry'] = data['industry'].apply(industry_clean)

# Working time: the text in front of "小时" is kept as the daily hours and the
# run of non-space characters ending in "周" as the weekly schedule; 0 marks a
# posting that does not state the value.
data['worktime_day'] = data['worktime'].apply(lambda x: x.split('小时')[0] if '小时' in x else 0)
data['worktime_week'] = data['worktime'].apply(lambda x: re.findall(r'\S*周', x)[0] if '周' in x else 0)
|
||||
def tempf1(x):
    """Extract the days-per-week part of a weekly schedule value.

    :param x: 0 (schedule not stated) or the schedule text
    :return: 0 for an unstated schedule or for '大小周' (alternating weeks);
        otherwise the text in front of the first "天".
    """
    if x == 0 or x == '大小周':
        return 0
    return x.split('天')[0]
|
||||
# Weekly working hours = days per week * hours per day.
data['worktime_week'] = data['worktime_week'].apply(tempf1)
data['worktime_weekly'] = data['worktime_week'].astype('float') * data['worktime_day'].astype('float')

# Skills: every run of Latin letters in the job requirements, joined by "、".
data['skill'] = data['require'].apply(lambda x: '、'.join(re.findall('[a-zA-Z]+', x)))
|
||||
|
||||
################################################## Saving the data ###################################################
# Print a summary of the cleaned frame. info must be called: printing the
# attribute only shows the bound method.
data.info()

# Save the cleaned data as job_clean.csv, without a header row.
data.to_csv('job_clean.csv', index=False, header=False, encoding='utf-8-sig')

# Training table: education, experience, weekly hours, sex, language and
# salary of the full-time postings. .copy() makes it an independent frame so
# the column assignments below do not write into a slice of `data`.
train_data = data.loc[
    data['job_type'] == '全职',
    ['education', 'jobage', 'worktime_weekly', 'sex', 'lang', 'salary'],
].copy()
# Non-numeric experience categories count as 0 years.
train_data['jobage'] = train_data['jobage'].apply(lambda x: x if x not in ['应届生', '不限', '其它工作经验'] else '0')
train_data['salary'] = train_data['salary'].astype('int')
|
||||
# Ordinal code of every known education level.
_EDUCATION_CODES = {'小学': 0, '初中': 1, '中专': 2, '高中': 3, '大专': 4, '本科': 5, '硕士': 6}


def tempf2(x):
    """Encode an education level as an ordinal integer.

    :param x: two-character education text
    :return: 0..6 for '小学' .. '硕士'; 7 for every other value.

    NOTE(review): 7 is shared by '博士' and by any unrecognised value - confirm
    that is intended for the training data.
    """
    return _EDUCATION_CODES.get(x, 7)
|
||||
# Keep postings with a salary above 1000. .copy() makes the filtered frame
# independent so adding a column does not write into a slice.
train_data = train_data[train_data['salary'] > 1000].copy()
train_data['education_toint'] = train_data['education'].apply(tempf2)
# Keep postings whose weekly working hours could be computed.
train_data = train_data[train_data['worktime_weekly'] > 0]
# to_csv returns None when given a path, so its result is not assigned back
# to train_data.
train_data.to_csv('train2.csv', index=False, encoding='utf-8-sig')
|
||||
Reference in New Issue
Block a user