1 # coding:utf-8
2 # auth:xiaomozi
3 #date:2018.4.19
4 #爬取智联招聘职位信息
5
6
7 import urllib
8 from lxml import etree
9 import time
10 import random
11 import pdb
12
def downloader(kw, pages):
    '''Download Zhaopin search-result pages for a keyword, one page at a time.

    This is a generator: nothing is fetched until it is iterated, and each
    page is requested only when the consumer asks for the next item.

    :param kw: search keyword inserted into the ``kw`` query parameter; may
        be plain text or already percent-encoded
    :param pages: iterable of page numbers to fetch
    :return: yields the raw response body of each result page, in the order
        given by ``pages``
    '''
    # Resolved inside the function so the block runs on both Python 3
    # (urllib.request / urllib.parse) and Python 2 (flat urllib module).
    try:
        from urllib.request import urlopen
        from urllib.parse import quote
    except ImportError:  # Python 2
        from urllib import urlopen, quote
    from contextlib import closing

    for page in pages:
        print("the {}page is downloading".format(page))
        infourl = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw={}&sm=0&p={}'.format(kw, str(page))
        # Percent-encode anything that is not legal in a URL (spaces,
        # non-ASCII keywords). '%' and the URL delimiters stay in the safe
        # set, so the pre-encoded ``jl`` value and an already-encoded ``kw``
        # are not encoded a second time.
        infourl = quote(infourl, safe="%/:=&?~#+!$,;'@()*[]|")
        # Random pause before every request to avoid hammering the server.
        time.sleep(random.uniform(0.5, 2.1))
        # closing() guarantees the connection is released even if read() fails.
        with closing(urlopen(infourl)) as response:
            info = response.read()
        yield info
25
26 def extractor(html):
27 '''
28 提取工作岗位信息,返回一个yie