0x01数据收集
这里使用python爬取招聘信息存到数据库。这个脚本只负责收集,收集之后保存到数据库
为了加快爬取速度这里采用多进程方式。
脚本详情参考https://blog.csdn.net/qq_38204481/article/details/93890794
这里进行了一些修改,修复了一些bug
# -*- coding:utf-8 -*-
from multiprocessing import Pool
import os, time, random
# -*- coding:utf-8 -*-
"""
1.采集工作数据
python php java html go
2. sqlite 存储数据
3. 表格存储数据
4. 采集完成以邮件 方式发送给工作人员
5. 将所有文件压缩再发邮件
6. 数据分析
每个分类的工作数量
工资区间
某个分类下每天的岗位数量
7. 图表展示
"""
"""
静态网站用xpath
动态网站用json.loads
难点在于网址后面的参数加密
找工作网站:
智联招聘 前程无忧 拉勾网 Boss直聘
"""
import requests
from lxml import etree
import sqlite3
import re
import time
from random import randint
import zmail
import zipfile
def init_sqlite(cols, tb_name='job', db_name="job.db"):
    """Open (or create) the SQLite database and ensure the job table exists.

    Args:
        cols: list of column names; each is created as varchar(255).
        tb_name: name of the table to create if it is missing.
        db_name: path of the SQLite database file.

    Returns:
        An open sqlite3.Connection; the caller is responsible for closing it.
    """
    con = sqlite3.connect(db_name)
    # Build "col1 varchar(255), col2 varchar(255), ..." without mutating the
    # caller's list (the original appended "" to `cols`, a visible side
    # effect on the argument, then rstrip'ed the trailing comma back off).
    col_defs = ", ".join("%s varchar(255)" % c for c in cols)
    sql1 = """
    CREATE TABLE IF NOT EXISTS %s(
    id integer primary key autoincrement,
    %s
    )
    """ % (tb_name, col_defs)
    con.execute(sql1)
    return con
def get(url):
    """Fetch *url* with a desktop Firefox User-Agent and return decoded text."""
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0'
    }
    resp = requests.get(url, headers=ua_headers)
    # Let requests sniff the charset from the body rather than trusting
    # only the Content-Type header (51job pages are GBK-encoded).
    resp.encoding = resp.apparent_encoding
    return resp.text
# Search 51job by keyword and persist every hit to the database (and
# optionally to CSV).
def search_job_py_name(name, con):
    """Crawl every 51job result page for keyword *name* and save each job.

    Args:
        name: search keyword (e.g. "python"); also stored as job_group.
        con: open sqlite3 connection whose `job` table already exists.
    """
    # NOTE(review): "°reefrom" below looks like a mangled "&degreefrom";
    # the URL is kept byte-identical to the original — verify against 51job.
    url = "https://search.51job.com/list/010000%252C020000%252C030200%252C040000%252C180200,000000,0000,00,9,99," + name + ",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    content = get(url)
    root = etree.HTML(content)
    # The pager text reads "共N页,到第"; strip the surrounding words to get N.
    total_page = root.xpath("//span[contains(text(),'页,到第')]/text()")[0]
    total_page = total_page.replace("共", "").replace("页,到第", "")
    print(total_page)
    for page in range(1, int(total_page) + 1):
        page_url = url.replace(".html", str(page) + ".html")
        # BUG FIX: the original fetched `url` (always page 1) here instead of
        # `page_url`, so every iteration re-scraped the first result page.
        content = get(page_url)
        root = etree.HTML(content)
        job_list = root.xpath("//div[@id='resultList']/div[@class='el']")
        for job in job_list:
            # rand = randint(1, 10)
            # time.sleep(rand / 10)
            job_name = job.xpath("p/span/a/@title")[0]
            job_href = job.xpath("p/span/a/@href")[0]
            job_company_name = job.xpath("span[1]/a/@title")
            job_company_name = job_company_name[0] if job_company_name else "没有公司名字"
            job_place = job.xpath("span[2]/text()")
            job_place = job_place[0] if job_place else "没有公司地点"
            # Location comes as "city-district", e.g. "上海-浦东".
            if "-" in job_place:
                job_city = job_place.split("-")[0]
                job_section = job_place.split("-")[1]
            else:
                job_city = job_place
                job_section = ""
            job_salary = job.xpath("span[3]/text()")
            job_salary = job_salary[0] if job_salary else "面议"
            # Salary formats seen: 1.3-2万/月, 4.5-8千/月, 面议, 1500元/天.
            min_money = 0
            max_money = 0
            if "-" in job_salary:
                job_type = "普通"  # a parseable min-max range
                pattern = re.compile("[-万/月千万]")
                result = pattern.split(job_salary)
                if "万" in job_salary:
                    min_money = float(result[0]) * 10000
                    max_money = float(result[1]) * 10000
                elif "千" in job_salary:
                    min_money = float(result[0]) * 1000
                    max_money = float(result[1]) * 1000
                # Yearly figures ("…/年") are normalized to a monthly amount.
                if "年" in job_salary:
                    min_money = min_money // 12
                    max_money = max_money // 12
            else:
                job_type = "其他"  # negotiable / per-day etc., not parsed
            min_money = str(min_money)
            max_money = str(max_money)
            job_date = job.xpath("span[4]/text()")
            job_date = job_date[0] if job_date else "没有发布时间"
            print(job_href, job_name, job_company_name, job_salary, job_place, job_date)
            data = {
                'job_name': job_name,
                'job_company_name': job_company_name,
                'job_salary': job_salary,
                'job_date': job_date,
                'job_group': name,
                'job_city': job_city,
                'job_section': job_section,
                'min_salary': min_money,
                'max_salary': max_money,
                'job_type': job_type,
            }
            save_to_sqlite(con, data)
            # save_to_csv(data)
# Persist one record. con: open connection; cols_value: {column: value} dict;
# tb_name: target table.
def save_to_sqlite(con, cols_value, tb_name="job"):
    """Insert *cols_value* into *tb_name* using parameterized SQL.

    The original built the VALUES clause by quoting and concatenating the
    raw strings, which broke on any value containing a single quote and was
    SQL-injectable. Column names come from code, so only the values need
    `?` binding.
    """
    cols_name = ",".join(cols_value.keys())
    placeholders = ",".join("?" * len(cols_value))
    sql1 = """
    INSERT INTO %s (%s) values (%s)
    """ % (tb_name, cols_name, placeholders)
    con.execute(sql1, tuple(cols_value.values()))
    con.commit()
# Append one record to a CSV file.
def save_to_csv(cols_value, csv_name="job.csv"):
    """Append the values of *cols_value* as one row of *csv_name*.

    Uses the csv module so values containing commas/quotes/newlines are
    quoted correctly (the plain ",".join of the original corrupted such
    rows), and a `with` block so the handle is always closed.
    """
    import csv  # local import keeps this fix self-contained
    with open(csv_name, 'a', encoding="utf-8", newline='') as f:
        csv.writer(f).writerow(list(cols_value.values()))
# E-mail the crawl result to the given recipient.
def send_email(mail_name, file_name):
    """Send *file_name* as an attachment to *mail_name* via zmail.

    NOTE(review): the account name and password are hard-coded below; they
    should live in configuration or environment variables, not source.
    """
    message = {
        'subject': 'Success!',                        # mail subject
        'content_text': 'This message from zmail!',   # mail body
        # 如果失败,,'content': 'This message from zmail!',
        'attachments': file_name,                     # file(s) to attach
    }
    # Log in to the provider with the account credentials, then send.
    server = zmail.server('tower111362058670@163.com', 'Root1996')
    server.send_mail(mail_name, message)
# Compress files into a zip archive.
def zip_files(zip_name, files):
    """Create *zip_name* and add every path in *files*, LZMA-compressed.

    Note: ZIP_LZMA is a compression method, not encryption (the original
    comment called it 加密/encryption). The `with` block guarantees the
    archive is closed even if a write fails (the original leaked the handle
    on error).
    """
    with zipfile.ZipFile(zip_name, "w") as archive:
        for path in files:
            archive.write(path, compress_type=zipfile.ZIP_LZMA)
def worker(job_name):
    """Pool worker: crawl all 51job postings for *job_name* into job.db.

    Runs in a child process (see the Pool in __main__); prints start/stop
    timing so progress is visible across processes.
    """
    t_start = time.time()
    print("%s开始执行,进程号为%d" % (job_name, os.getpid()))
    con = init_sqlite(
        ['job_name', 'job_company_name', 'job_city', 'job_section', 'job_salary', 'max_salary', 'min_salary',
         'job_type', 'job_group', 'job_date'])
    try:
        search_job_py_name(job_name, con)
    finally:
        # BUG FIX: close the connection even when the crawl raises, so a
        # failing task does not leak the SQLite handle.
        con.close()
    t_stop = time.time()
    print(job_name, "执行完毕,耗时%0.2f" % (t_stop - t_start))
if __name__ == '__main__':
    job_list = ["python", "Java", "PHP", "C"]
    # Process pool capped at 2 concurrent crawlers; one task per keyword.
    pool = Pool(2)
    for keyword in job_list:
        # apply_async(target, (args tuple,)) queues the task without blocking.
        pool.apply_async(worker, (keyword,))
    print("----start----")
    pool.close()  # stop accepting new tasks
    pool.join()   # wait for every queued worker to finish (join(timeout=) also possible)
    print("-----end-----")
    # zip_files("工作情况.zip",job_list+".csv")
    # send_email("362058670@qq.com","工作情况.zip")
0x02数据处理
下面脚本对数据进行了一些处理。
用数据库查询语句对数据进行了过滤。
用柱状图和饼状图展示。
特点是把代码都封装了,用的话直接调用就好。
上图仅供参考,数据暂时并不完整
有机会学一下pandas numpy这两个比较专业的 数据分析库
# -*- coding:utf-8 -*-
"""
专业库 pandas numpy
也可以用sql分析
"""
"""
统计上海的各语言的工作数量
统计各语言薪资平均数
"""
from pyecharts import Bar
from pyecharts import Pie
import sqlite3
# Base helper wrapping the whole query cycle: pass a SQL statement and a
# database path, get back chart-ready (labels, series) lists.
def select_sql(sql1, table_name="../2019-6-27/job.db"):
    """Run *sql1* against the SQLite file *table_name* and pivot the rows.

    Args:
        sql1: query whose first selected column is a label and whose
            remaining columns are the numeric series, e.g.
            SELECT job_group, count(*) FROM job ... GROUP BY job_group.
        table_name: path of the SQLite database file (the parameter name is
            historical — it is a file path, not a table name).

    Returns:
        (columns, data): for rows like ('C', 47), ('Java', 52), ...
        `columns` is ['C', 'Java', ...] and `data` holds one list per extra
        selected column, e.g. [[47, 52, ...]]. Empty result sets yield
        ([], []) instead of crashing (the original indexed result[0]).
    """
    con = sqlite3.connect(table_name)
    try:
        rows = con.execute(sql1).fetchall()
    finally:
        con.close()  # the original never closed the connection
    if not rows:
        return [], []
    # One list comprehension per output axis replaces the nested loops.
    columns = [row[0] for row in rows]
    data = [[row[i] for row in rows] for i in range(1, len(rows[0]))]
    return columns, data
# Report: number of postings per job category.
def job_num_by_class(table_name="../2019-6-27/job.db"):
    """Count '普通' (parsed-salary) postings grouped by job_group."""
    query = """
    SELECT job_group,count(*) from job
    where job_type='普通'
    group by job_group;
    """
    return select_sql(query, table_name)
# Report: average of the max and min salaries per job category.
def max_mine_avg_salary(table_name="../2019-6-27/job.db"):
    """Average max/min salary of '普通' postings grouped by job_group."""
    query = """
    SELECT job_group,avg(max_salary),avg(min_salary) from job
    where job_type='普通'
    group by job_group;
    """
    return select_sql(query, table_name)
# Report: average of the max and min salaries per city.
def per_city_salary(table_name="../2019-6-27/job.db"):
    """Average max/min salary of '普通' postings grouped by job_city.

    Args:
        table_name: path of the SQLite database file.
    """
    sql1 = """
    SELECT job_city,avg(max_salary),avg(min_salary) from job
    where job_type='普通'
    group by job_city;
    """
    # BUG FIX: the original ignored the `table_name` argument and always
    # queried the hard-coded "../2019-6-27/job.db".
    columns, data = select_sql(sql1, table_name)
    return columns, data
# Report: number of postings per city.
def per_city_num_of_job(table_name="../2019-6-27/job.db"):
    """Count '普通' postings grouped by job_city.

    Args:
        table_name: path of the SQLite database file.
    """
    sql1 = """
    SELECT job_city,count(job_name) from job
    where job_type='普通'
    group by job_city;
    """
    # BUG FIX: the original ignored the `table_name` argument and always
    # queried the hard-coded "../2019-6-27/job.db"; the stray debug
    # print("aaaa", data) is removed as well.
    columns, data = select_sql(sql1, table_name)
    return columns, data
# Render a bar chart. table_name: chart title; columns: x-axis labels;
# datas: 2-D list, one inner list per series (so one label can carry several
# bars); table_context_name: legend name for each series, parallel to datas.
def create_bar(table_name, columns, datas, table_context_name):
    """Write `<table_name>_bar.html` with one bar series per row of *datas*."""
    chart = Bar(table_name)  # main title of the chart
    for i, series in enumerate(datas):
        # Each series also gets an average line and max/min markers.
        chart.add(table_context_name[i], columns, series,
                  mark_line=["average"], mark_point=["max", "min"])
    # pyecharts renders to a local standalone HTML file.
    chart.render(table_name + "_bar.html")
# Render pie charts. table_name: page title (top-left); columns: label shown
# when hovering a slice; datas: 2-D list, one inner list per pie;
# table_context_name: name of each pie, parallel to datas.
def create_pie(table_name, columns, datas, table_context_name):
    """Write `<table_name>_pie.html`, laying the pies out left to right."""
    # Centered title, 900px-wide canvas.
    chart = Pie(table_name, title_pos='center', width=900)
    for i, series in enumerate(datas):
        # Up to four pies per row: centers at x = 25, 75, 125, ... (% of
        # canvas); the legend is hidden, per-slice labels are shown.
        chart.add(table_context_name[i], columns, series,
                  center=[25 + 50 * (i % 4), 50],
                  is_legend_show=False, is_label_show=True)
    chart.render(table_name + "_pie.html")
# Render each report as both a bar chart and a pie chart against ./job.db.
reports = [
    (job_num_by_class, "每种工作的数量", ["每种工作的数量"]),
    (max_mine_avg_salary, "最大最小薪资平均数(每种语言)", ["薪资水平最大值", "薪资水平最小值"]),
    (per_city_salary, "最大最小薪资平均数(每个城市)", ["薪资水平最大值", "薪资水平最小值"]),
]
for fetch, title, legend in reports:
    columns, data1 = fetch("./job.db")
    create_bar(title, columns, data1, legend)
    create_pie(title, columns, data1, legend)
columns, data1 = per_city_num_of_job("./job.db")
print("data1", data1)
create_bar("每个城市工作数", columns, data1, ["工作数"])
create_pie("每个城市工作数", columns, data1, ["工作数"])