我在智联招聘上按“互联网行业、实习生、全国”这几个筛选条件获取了数据,上面列出的是全国实习生平均月薪最高的前30个城市。很奇怪,我本来以为肯定会是北上广深杭,结果大多是北京周边城市。
import numpy as npy
import pandas as pda
import matplotlib.pyplot as plt
import pymysql
import re
from pylab import *
# Plot configuration: render sans-serif text with the SimHei font and turn off
# the Unicode minus glyph for axis labels.
# NOTE(review): `mpl` is presumably supplied by the `from pylab import *`
# star import -- confirm.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the `zhaopin` table from the local MySQL database `zhilian` into a
# DataFrame indexed by its `Id` column.
# Tip: when only one column is going to be processed, pull out just that
# column first; operating on the whole table is slower.
# `password` / `database` are the documented keyword names in PyMySQL;
# `passwd` / `db` are deprecated aliases for them.
conn = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="root",
    database="zhilian",
    charset="utf8",
)
# NOTE(review): the connection is not closed anywhere in the visible code --
# confirm it is closed once all queries are done.
sql = "select * from zhaopin;"
dataframe = pda.read_sql(sql, conn)
# Build the index: move `Id` out of the columns into the row index, then
# order the rows by it.
dataframe_sort = dataframe.set_index('Id').sort_index()
dataframe = dataframe_sort
dataframe.info()
# Preview the first ten rows of the `gzdd` and `zwyx` columns. As a bare
# expression this is only displayed in an interactive session / notebook.
dataframe[['gzdd', 'zwyx']].head(10)
# Clean the monthly-salary column (`zwyx`): the salary ranges are to be
# converted into an average monthly salary so they are easier to work with.
# The three derived columns start out holding the raw `zwyx` values.
dataframe['bottom'] = dataframe["zwyx"]
dataframe['top'] = dataframe["zwyx"]
dataframe['average'] = dataframe["zwyx"]
# Pattern matching each run of digits in a salary string.
pat = r'([0-9]+)'
# Counters for the cleaning loop that follows.
# NOTE(review): their individual meaning is not visible in this part of the file.
c1, c2, c3, c4 = 0, 0, 0, 0
for i in range(len(dataframe['zwyx'])):
# iloc selects the value by position; strip() with no arguments removes leading and trailing whitespace
item = dataframe['zwyx'].iloc[i].strip()
result = re.compile(pat).findall(item)
try:
if result:
try:
#下面这句执行成功,说