爬取智联招聘python的岗位数
开始爬虫前先查看网址有什么特点 , 爬取的内容要往哪里入手 , 在智联招聘搜索python的岗位出现的网址是https://sou.zhaopin.com/jobs/searchresult.ashx?jl=杭州&kw=python
https://sou.zhaopin.com/jobs/searchresult.ashx?这个可以不用管
jl 代表的是地址在杭州
kw 代表搜索的关键词，例如 python
import urllib
from urllib import request, parse
import re


def getJobNum(kw):
    """Fetch a Zhaopin search-result page and return the job count.

    kw: an already URL-encoded query string, e.g.
        "jl=%E6%9D%AD%E5%B7%9E&kw=python".
    Returns the first <em>N</em> count found in the page, as a string;
    returns '0' when no count is present (empty result / layout change).
    """
    # Spoof a regular browser User-Agent so the site serves the normal page.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    url = "https://sou.zhaopin.com/jobs/searchresult.ashx?" + kw
    # NOTE: do not name this local 'request' -- that would shadow the
    # urllib.request submodule imported above.
    req = urllib.request.Request(url, headers=header)
    # Context manager guarantees the HTTP response is closed.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode()  # decoded HTML source
    # The total appears in the page markup as e.g. <em>2011</em>.
    jobNumre = r"<em>(\d+)</em>"  # raw string so \d is not an escape warning
    jobnum = re.findall(jobNumre, html)
    # Guard against an empty match list instead of raising IndexError.
    return jobnum[0] if jobnum else '0'


if __name__ == '__main__':
    jobList = ['java', 'php', 'python', 'go']
    jobNumdict = {}  # job name -> count string
    for job in jobList:
        # Encodes to e.g. jl=%E6%9D%AD%E5%B7%9E&kw=php
        kw = {"jl": "杭州", 'kw': job}
        kw = urllib.parse.urlencode(kw)
        num = getJobNum(kw)
        jobNumdict[job] = num
    print(jobNumdict)
1.导入URL模块
import urllib from urllib import request, parse import re
2.模拟浏览器添加请求头
#封装成一个url函数 def getJobNum(kw): #模拟浏览器添加请求头 header = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
3.url请求的地址
url = "https://sou.zhaopin.com/jobs/searchresult.ashx?" + kw
4.把url和请求头带入发起请求
request = urllib.request.Request(url, headers=header)
5.打开请求会得到响应内容
response = urllib.request.urlopen(request)
6.把响应的内容进行解码
html = response.read().decode()
7.正则匹配岗位数量
jobNumre = "<em>(\d+)</em>" # 匹配岗位数量
8.查找到符合正则的内容
jobnum = re.findall(jobNumre, html) # 查找 return jobnum[0]
9.设置想要对比的职位有哪些
if __name__ == '__main__': jobList = ['java', 'php', 'python', 'go'] jobNumdict = {} # 存储
10.遍历每个职位对应的数量并进行编码
for job in jobList: # jl = 杭州 & kw = php kw = {"jl": "杭州", 'kw': job} kw = urllib.parse.urlencode(kw) # 编码
11.把数据带入函数中得到相应的数量
num = getJobNum(kw)
jobNumdict[job] = num
print(jobNumdict)
结果如下
{'java': '5665', 'php': '1262', 'python': '2002', 'go': '219'}