一:URL 的处理
import json
import urllib.request

from lxml import etree
def bo_url(url):
    """Open ``url`` with a desktop-browser User-Agent and return the response.

    Args:
        url: Absolute URL to request.

    Returns:
        The response object produced by ``urllib.request.urlopen``. It is
        returned still open; the caller reads (and closes) it.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/66.0.3359.117 Safari/537.36"
        ),
    }
    req = urllib.request.Request(url=url, headers=headers)
    return urllib.request.urlopen(req)
二:爬取并匹配数据
def bo_spider(res):
    """Parse a job-listing page and collect the text of each listing field.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        A dict mapping each field name (职位类型, 待遇, 发布时间, 公司名称,
        地点) to the list of text nodes matched for that field, gathered
        across every ``<ul>`` found under ``div[@class="job-list"]``. All
        five keys are always present; a list is empty when nothing matched.
    """
    html = res.read()
    tree = etree.HTML(html)

    # XPath for each output field, evaluated relative to one job-list <ul>.
    field_paths = {
        '职位类型': './li//h3//div[@class="job-title"]/text()',
        '待遇': './li//h3//span[@class="red"]/text()',
        '发布时间': './li//div[@class="info-publis"]//p/text()',
        '公司名称': './li//div[@class="company-text"]//a/text()',
        '地点': './li//div[@class="info-primary"]//p/text()',
    }
    bo_dict = {name: [] for name in field_paths}

    # The parser can yield no root element for an empty document; treat that
    # the same as a page without any job list.
    bo_list = tree.xpath('//div[@class="job-list"]//ul') if tree is not None else []
    for bo in bo_list:
        for name, path in field_paths.items():
            # Extend instead of assigning so that matches from an earlier
            # <ul> are not overwritten by a later one.
            bo_dict[name].extend(bo.xpath(path))

    print(bo_dict)
    return bo_dict
三:用 JSON 存储匹配的数据
def xiazai(bo_dict):
    """Append ``bo_dict`` to ``boos.json`` as one JSON object per line.

    Args:
        bo_dict: JSON-serialisable mapping of scraped fields.

    Returns:
        The JSON text of ``bo_dict`` (without the trailing newline that is
        written to the file).
    """
    bo_list = json.dumps(bo_dict)
    # Write the JSON text itself: encoding it a second time would store a
    # quoted, escaped string instead of an object.
    # The file is opened in append mode, so each record goes on its own line
    # to keep successive records separable.
    with open("boos.json", 'a', encoding="utf-8") as fp:
        fp.write(bo_list + "\n")
    return bo_list
四:控制函数
def main():
    """Prompt for a job title, fetch the matching search page and parse it.

    Returns:
        The dict produced by ``bo_spider`` for the fetched page.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # Percent-encode the typed text: raw non-ASCII characters or spaces in
    # the URL make the HTTP request fail.
    # NOTE(review): the fixed "%E7%88%AC%E8%99%AB" suffix is always appended
    # to the query term — confirm that this is intended.
    url = (
        "https://www.zhipin.com/job_detail/?query="
        + quote(work, safe="")
        + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    )
    # This variant only fetches and parses; the result is not handed to
    # xiazai for saving.
    text = bo_spider(bo_url(url))
    return text
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
五:整体代码
import json
import urllib.request
from lxml import etree
def bo_url(url):
    """Open ``url`` with a desktop-browser User-Agent and return the response.

    Args:
        url: Absolute URL to request.

    Returns:
        The response object produced by ``urllib.request.urlopen``. It is
        returned still open; the caller reads (and closes) it.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/66.0.3359.117 Safari/537.36"
        ),
    }
    req = urllib.request.Request(url=url, headers=headers)
    return urllib.request.urlopen(req)
def bo_spider(res):
    """Parse a job-listing page and collect the text of each listing field.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        A dict mapping each field name (职位类型, 待遇, 发布时间, 公司名称,
        地点) to the list of text nodes matched for that field, gathered
        across every ``<ul>`` found under ``div[@class="job-list"]``. All
        five keys are always present; a list is empty when nothing matched.
    """
    html = res.read()
    tree = etree.HTML(html)

    # XPath for each output field, evaluated relative to one job-list <ul>.
    field_paths = {
        '职位类型': './li//h3//div[@class="job-title"]/text()',
        '待遇': './li//h3//span[@class="red"]/text()',
        '发布时间': './li//div[@class="info-publis"]//p/text()',
        '公司名称': './li//div[@class="company-text"]//a/text()',
        '地点': './li//div[@class="info-primary"]//p/text()',
    }
    bo_dict = {name: [] for name in field_paths}

    # The parser can yield no root element for an empty document; treat that
    # the same as a page without any job list.
    bo_list = tree.xpath('//div[@class="job-list"]//ul') if tree is not None else []
    for bo in bo_list:
        for name, path in field_paths.items():
            # Extend instead of assigning so that matches from an earlier
            # <ul> are not overwritten by a later one.
            bo_dict[name].extend(bo.xpath(path))

    print(bo_dict)
    return bo_dict
def xiazai(bo_dict):
    """Append ``bo_dict`` to ``boos.json`` as one JSON object per line.

    Args:
        bo_dict: JSON-serialisable mapping of scraped fields.

    Returns:
        The JSON text of ``bo_dict`` (without the trailing newline that is
        written to the file).
    """
    bo_list = json.dumps(bo_dict)
    # Write the JSON text itself: encoding it a second time would store a
    # quoted, escaped string instead of an object.
    # The file is opened in append mode, so each record goes on its own line
    # to keep successive records separable.
    with open("boos.json", 'a', encoding="utf-8") as fp:
        fp.write(bo_list + "\n")
    return bo_list
def main():
    """Prompt for a job title, then fetch, parse and save the search results.

    Returns:
        The value returned by ``xiazai`` for the parsed page.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # Percent-encode the typed text: raw non-ASCII characters or spaces in
    # the URL make the HTTP request fail.
    # NOTE(review): the fixed "%E7%88%AC%E8%99%AB" suffix is always appended
    # to the query term — confirm that this is intended.
    url = (
        "https://www.zhipin.com/job_detail/?query="
        + quote(work, safe="")
        + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    )
    # Fetch the page, extract the listing fields, then save them.
    text = xiazai(bo_spider(bo_url(url)))
    return text
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()