Catalog
1、项目介绍
主要目标
- 了解IT行业的就业市场信息,打造自身的职业规划路线
- 根据市场的主流技术和需求,不断完善自身技能树
环境:
- Win10、Python、Pycharm
技术
- 网页数据采集:scrapy、urllib、re
- 数据存储:MySQL
- 文字切割、词频统计:pandas、jieba、collections
- 数据升采样:pandas
- 数据离散化及透视:pandas
- 数据可视化及分析:Excel、PPT
2、爬虫算法(scrapy框架)
- spider
- quanguo.py
- items.py
- middlewares.py(无修改)
- pipelines.py
- settings.py
- run.py
spider/
quanguo.py
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from ..items import QianchengwuyouItem
import re
class QuanguoSpider(scrapy.Spider):
    """Crawl 51job nationwide search results for a keyword read from stdin.

    Yields one QianchengwuyouItem per job-detail page.
    """
    name = 'quanguo'
    # Start URL: the keyword is URL-encoded twice because 51job double-decodes
    # the path segment. NOTE(review): input() runs at import time — kept from
    # the original design; confirm this is intended for scheduled runs.
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,1.html'.format(parse.quote(parse.quote(input('输入搜索词:').strip())))]

    def parse(self, response):
        """Search-result page: follow every job link, then the next page."""
        ls_url = response.xpath('//p/span/a').re('href="(.*?)"')
        for url in ls_url:
            yield scrapy.Request(url=url, callback=self.parse_detail)
        # "Next page" link; the xpath matches nothing on the last page.
        next_page = response.xpath('//*[@id="resultList"]/div[55]/div/div/div/ul/li[8]/a').re('href="(.*?)"')
        if next_page:
            yield scrapy.Request(url=next_page[0], callback=self.parse)

    def parse_detail(self, response):
        """Job detail page: extract one QianchengwuyouItem."""
        item = QianchengwuyouItem()
        # Basic info. `city` stores the spider name ('quanguo'), i.e. the
        # nationwide search scope, not a parsed city.
        item['city'] = self.name
        item['name'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title').extract_first()
        item['url'] = response.url
        item['salary'] = ''.join(response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').extract())
        # BUG FIX: extract_first() returns None when the node is missing, which
        # crashed the original .split(); default to '' instead.
        region_exp_edu = (response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/@title').extract_first() or '').split('\xa0\xa0|\xa0\xa0')
        item['region'] = region_exp_edu[0]
        item['cp_name'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title').extract_first()
        item['workplace'] = ''.join(response.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p').re('</span>(.+?)</p>')).strip()
        item['welfare'] = '|'.join(response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/div/div/span/text()').extract())
        # Job description: lower-case, strip HTML tags, then keep only CJK,
        # alphanumerics and basic punctuation. Same None-guard as above.
        detail = (response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div').extract_first() or '').lower()
        detail = re.sub('<.*?>', '', detail)
        item['detail'] = re.sub("""[^,。?!;:‘’“”、【】…0-9.\-a-zA-Z\u4e00-\u9fa5]+""", '', detail)
        # Experience / education / headcount / publish date, parsed from the
        # "aa | bb | cc" title string split above.
        item['exp'] = item['edu'] = item['demand'] = item['pubdate'] = item['skill'] = ''
        EDU = ['博士', '硕士', '本科', '大专',
               '中专', '中技', '高中', '初中及以下']
        for i in region_exp_edu:
            if '经验' in i:
                item['exp'] = i
            elif i in EDU:
                item['edu'] = i
            elif '招' in i:
                item['demand'] = i
            elif '发布' in i:
                item['pubdate'] = i
            else:
                item['skill'] = i
        # Company info. BUG FIX: the original listed '合资' twice; the
        # duplicate is removed (membership semantics are unchanged).
        CP_TYPE = ['民营公司', '上市公司', '事业单位', '国企', '外资(欧美)', '外资(非欧美)',
                   '创业公司', '政府机关', '合资', '外资', '外企代表处', '非营利组织']
        CP_SCALE = ['少于50人', '50-150人', '150-500人', '500-1000人',
                    '1000-5000人', '5000-10000人', '10000人以上']
        cp_info = response.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p/text()').extract()
        item['cp_type'] = item['cp_scale'] = item['industry'] = ''
        for i in CP_TYPE:
            if i in cp_info:
                item['cp_type'] = i
                break
        for i in CP_SCALE:
            if i in cp_info:
                item['cp_scale'] = i
                break
        # Whatever is neither a company type nor a scale is the industry
        # (last match wins, as in the original).
        for i in cp_info:
            if i not in CP_TYPE and i not in CP_SCALE:
                item['industry'] = i
        return item
items.py
import scrapy
class QianchengwuyouItem(scrapy.Item):
    """One 51job job posting.

    Fields are declared explicitly instead of via the original exec() loop:
    exec-built attributes defeat linters/IDE completion and the loop left the
    helper names `ls`/`fd` behind as class attributes. Also adds the `city`
    field, which the spider assigns but the original item did not declare
    (assigning an undeclared field raises KeyError at runtime).
    """
    # Posting identity / location
    city = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    salary = scrapy.Field()
    region = scrapy.Field()
    workplace = scrapy.Field()
    # Company
    cp_name = scrapy.Field()
    cp_type = scrapy.Field()
    cp_scale = scrapy.Field()
    industry = scrapy.Field()
    # Requirements / metadata
    exp = scrapy.Field()
    edu = scrapy.Field()
    demand = scrapy.Field()
    pubdate = scrapy.Field()
    skill = scrapy.Field()
    welfare = scrapy.Field()
    # Full job-description text
    detail = scrapy.Field()
pipelines.py
import pymysql
class QianchengwuyouPipeline(object):
    """Persist scraped items into the `quanguo` table of the z_51job MySQL DB."""

    def open_spider(self, spider):
        # Keyword arguments: PyMySQL 1.0 removed positional connect() params.
        self.db = pymysql.connect(host='localhost', user='root', password='yellow',
                                  charset='utf8', db='z_51job')
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Each insert commits individually, so nothing is pending here.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        """Insert one item and commit; returns the item for downstream pipelines."""
        fields = list(item)
        # BUG FIX: the original interpolated scraped values straight into the
        # SQL string ("%(field)s" wrapped in double quotes), which breaks on
        # any quote character in the data and is SQL-injectable. Use a
        # parameterized query so the driver escapes the values.
        sql = 'insert into quanguo (%s) values (%s);' % (
            ','.join(fields), ','.join(['%s'] * len(fields)))
        self.cursor.execute(sql, [item[f] for f in fields])
        self.db.commit()
        return item
settings.py
BOT_NAME = 'qianchengwuyou'

SPIDER_MODULES = ['qianchengwuyou.spiders']
NEWSPIDER_MODULE = 'qianchengwuyou.spiders'

# Browser masquerading. BUG FIX: the original value embedded the literal
# "User-Agent:" header name inside the value and had no token spacing, so
# servers received a malformed UA string.
USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'

# Ignore robots.txt (deliberate for this project)
ROBOTSTXT_OBEY = False

# Route every item through the MySQL storage pipeline
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}
run.py
from scrapy import cmdline

# Equivalent to running `scrapy crawl quanguo` from the project directory.
# Guarded so that merely importing this module does not launch a crawl.
if __name__ == '__main__':
    cmdline.execute(['scrapy', 'crawl', 'quanguo'])
3、数据分析算法(pandas)
- core/
- conf.py
- db.py
- df.py
- file/
- analysis
- filter/
- EN_filter.txt
- run.py
core/
conf.py
import os, time

# Project root: two directory levels above this file (…/core/conf.py).
home = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# English stop-word list used to filter tokenized job descriptions.
# PORTABILITY FIX: os.path.join instead of hard-coded Windows backslashes,
# so the paths also resolve on non-Windows hosts.
FILTER_EN = os.path.join(home, 'file', 'filter', 'EN_filter.txt')
# Date stamp (YYYYMMDD) prefixing each analysis run's output workbook;
# df.py builds file names as PREFIX + keyword + '.xlsx'.
ymd = time.strftime('%Y%m%d', time.localtime())
PREFIX = os.path.join(home, 'file', 'analysis', ymd)

if __name__ == '__main__':
    print(home)
    print(FILTER_EN)
    print(PREFIX)
db.py
import pymysql, pandas as pd
class Mysql:
    """Thin wrapper around a PyMySQL connection to the z_51job database."""

    def __init__(self):
        # Keyword arguments: PyMySQL 1.0 removed positional connect() params.
        self.db = pymysql.connect(host='localhost', user='root', password='yellow',
                                  charset='utf8', db='z_51job')
        self.cursor = self.db.cursor()

    def execute(self, sentence, arg=None):
        """Run one SQL statement (optionally parameterized) and commit.

        Errors are printed (in yellow via ANSI escapes) instead of raised,
        so one bad statement does not abort a long batch.
        """
        try:
            if arg:
                self.cursor.execute(sentence, arg)
            else:
                self.cursor.execute(sentence)
            self.db.commit()
        except Exception as error:
            print('\033[033m', error, '\033[0m')

    def close(self):
        """Release the cursor and the connection."""
        self.cursor.close()
        self.db.close()

    def select_key(self):
        """Return the analysis columns as a DataFrame, skipping empty salaries.

        NOTE(review): the scrapy pipeline writes to table `quanguo` but this
        reads from `job` — confirm the data was copied/renamed between steps.
        """
        query = '''select
name,salary,region,workplace,cp_name,cp_type,cp_scale,industry,exp,edu,demand,pubdate,welfare,detail
from job where salary != '';'''
        df = pd.read_sql(query, self.db)
        return df
df.py
import re, jieba.posseg as jp, pandas as pd
from collections import Counter
from core.conf import (FILTER_EN, PREFIX)
# 数据分析
class Df:
    """Analyse the scraped job DataFrame and write each result to one sheet
    of a single Excel workbook named <PREFIX><kw>.xlsx.
    """

    def __init__(self, data_frame, kw='python'):
        # data_frame: output of Mysql.select_key(); kw names the workbook only.
        self.df = data_frame
        self.writer = pd.ExcelWriter(PREFIX + kw + '.xlsx')

    # Build a (value, frequency, percentage) sheet from most_common() tuples.
    def percentage(self, ls_tup, name):
        fields = [name, 'frequency']
        df = pd.DataFrame(ls_tup, columns=fields)
        # Share relative to ALL scraped postings, not just the counted subset.
        df['percentage'] = df[df.columns[1]] / self.df.shape[0]
        # BUG FIX: sort_values() returns a new frame; the original discarded
        # the result, so every sheet was written unsorted.
        df = df.sort_values(df.columns[1], ascending=False)
        df.to_excel(self.writer, sheet_name=name, index=False)

    # Run every analysis and save the workbook.
    def write(self):
        # Chinese / English tokenisation of the job-description text
        self.cut('detail')
        self.english('detail')
        # Salary upsampling and discretisation into sections
        self.salary()
        self.df.to_excel(self.writer, sheet_name='origin', index=False)
        self.pivot('salary_section', 'exp')
        self.pivot('salary_section', 'edu')
        self.pivot('salary_section', 'cp_scale')
        self.pivot('salary_section', 'cp_type')
        self.pivot('salary_section', 'region')
        self.spt('region')
        self.spt('industry')
        self.spt('welfare')
        self.spt('cp_name')
        # close() flushes and saves in both old and new pandas; save() was
        # removed in pandas 2.0.
        self.writer.close()

    # Chinese segmentation + part-of-speech frequency (field: detail).
    def cut(self, field):
        text = '|'.join([str(i) for i in self.df[field]])  # concatenate rows
        counter = Counter()
        posseg = jp.cut(text)
        for p in posseg:
            # Keep words of 3+ chars; English tokens are counted separately.
            if len(p.word) > 2 and p.flag != 'eng':
                counter[p.flag + ' | ' + p.word] += 1
        most = counter.most_common(99)
        self.percentage(most, 'Cn%s' % field)

    # English token frequency with stop-word filtering (field: detail).
    def english(self, field):
        text = '|'.join([str(i) for i in self.df[field]])  # concatenate rows
        pat = '[a-z]+'
        re_ls = re.findall(pat, text)
        counter = Counter(re_ls)
        c1 = counter.most_common(199)
        # Drop the English stop words listed in FILTER_EN
        with open(FILTER_EN, encoding='utf-8') as f:
            prep = f.read().strip().split()
        c2 = [i for i in c1 if i[0] not in prep]
        self.percentage(c2, 'En%s' % field)

    # Frequency of '|'-separated values (region, industry, welfare, ...).
    def spt(self, field):
        text = '|'.join([str(i) for i in self.df[field]])  # concatenate rows
        ls = text.split('|')
        counter = Counter(ls)
        c = counter.most_common()
        self.percentage(c, 'CNT%s' % field)

    # Pivot table counting postings per (field_1, field_2) pair.
    def pivot(self, field_1, field_2):
        name = field_1 + '-' + field_2
        pivot = self.df.pivot_table(
            values='name',
            index=field_1,
            columns=field_2,
            aggfunc='count')
        pivot.to_excel(
            self.writer,
            sheet_name=name)

    # Parse salary text, upsample each posting, and bin monthly salaries.
    def salary(self, n=4, start=5000, end=30000, step=1000):
        """Rebuild self.df with n+1 rows per posting (min, max and n-1 evenly
        spaced values in between) plus `salary_value` and `salary_section`.
        """
        salary = self.df['salary']
        # Empty salaries become a 0-0 placeholder (most are pre-filtered)
        salary = ['0-0千/月' if i == '' else i for i in salary]
        # Split off the pay period: '0.8-1万/月' -> ['0.8-1万', '月']
        spt_ls = [i.split('/') for i in salary]
        # Period -> multiplier converting to a monthly amount
        period = {
            '年': 1/12,
            '月': 1,
            '天': 20,
            '小时': 160}
        for spt in spt_ls:
            spt[1] = period[spt[1]]
        # Unit -> multiplier converting to yuan
        units = {
            '万': 10000,
            '千': 1000,
            '元': 1}
        for spt in spt_ls:
            for k in units.keys():
                if k in spt[0]:
                    spt[0] = spt[0].replace(k, '')
                    spt.append(units[k])
                    break
        # Split the range into minimum and maximum
        for spt in spt_ls:
            if '-' in spt[0]:
                mi, ma = spt[0].split('-')
                spt[0] = ma
                spt.insert(0, mi)
            elif '以上' in spt[0]:
                mi = spt[0].split('以上')[0]
                spt[0] = mi
                spt.insert(0, mi)
            elif '以下' in spt[0]:
                mi = spt[0].split('以下')[0]
                spt[0] = mi
                spt.insert(0, mi)
            else:
                # A single figure counts as both min and max
                spt.insert(0, spt[0])
        # Text -> numeric monthly salary in yuan
        dfs = pd.DataFrame(spt_ls, columns=['min', 'max', 'period', 'unit'])
        row_min = list(pd.to_numeric(dfs['min']) * dfs['period'] * dfs['unit'])
        row_max = list(pd.to_numeric(dfs['max']) * dfs['period'] * dfs['unit'])
        # Upsampling: min, max, and n-1 evenly spaced values in between
        row_ls = [row_min, row_max]
        for j in range(1, n):
            row = [(row_min[i] + (row_max[i] - row_min[i]) * j / n) for i in range(len(spt_ls))]
            row_ls.append(row)
        # One DataFrame per sampled row, each carrying the original columns
        df_ls = [pd.concat([self.df, pd.DataFrame(row, columns=['salary_value'])], axis=1) for row in row_ls]
        # Merge all samples and sort ascending
        df = pd.concat(df_ls)
        df.sort_values(by=['name', 'salary_value'], axis=0, ascending=True, inplace=True)
        # Discretise into custom sections. `bins` renamed from `bin`, which
        # shadowed the builtin.
        bins = [-1, 0] + list(range(start, end, step)) + [999999]
        open_left = ['(%d,%d]' % (bins[i], bins[i + 1]) for i in range(len(bins) - 1)]  # one label per interval
        sec = pd.cut(df['salary_value'], bins, labels=open_left).to_frame(name='salary_section')
        self.df = pd.concat([df, sec], axis=1)
run.py
import os, sys
# Make the project root importable so the `core.*` modules resolve no matter
# where this script is launched from.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, BASE_PATH)
from core.db import Mysql
from core.df import Df
# Entry point: pull the cleaned records from MySQL and run every analysis,
# writing one Excel workbook for this run.
if __name__ == '__main__':
    db = Mysql()
    df = Df(db.select_key())
    df.write()
4、分析结果(部分)
-
说明
-
爬虫及分析时间:2018年8月
搜索条件:Python+全国
其它:数据有少量清洗
就业地区分析
全国分布
城市top8
关联词分析
薪资透视(热力图)
薪资–公司规模
基数(99倍采样)
占比
薪资–工作经验
基数(99倍采样)
占比