# Quick scraper for lagou.com. To fetch the JSON listing we must send a POST
# request, but a GET request has to be issued first or the site returns an
# error message. Keep the crawl interval long; rotate among several IPs if
# you have them.
# -*- coding: utf-8 -*-
import json
import re
import time

import scrapy

from wde32.items import Wde32Item
# A GET request must be sent once first; only then will a POST be accepted.
class LagouwangSpider(scrapy.Spider):
    """Crawl python-crawler job postings from lagou.com.

    The site's JSON endpoint rejects a POST unless a GET to the listing page
    was issued first, so every POST is chained behind its own warm-up GET
    (Scrapy schedules requests asynchronously; merely yielding the GET first
    would not guarantee ordering).
    """

    name = "lagouwang2"
    allowed_domains = ["lagou.com"]
    # Listing page; loading it first obtains the session state the JSON API checks.
    start_urls = ["https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?labelWords=sug&fromSearch=true&suginput=python"]

    def parse(self, response):
        """Schedule the warm-up GET for each of the 30 result pages.

        Fixes vs. the original:
        - the warm-up ``scrapy.Request`` was built but never yielded, so the
          mandatory GET was never actually sent;
        - ``pn`` was incremented before first use, so it ran 2..31 and page 1
          was never fetched.
        """
        for page in range(1, 31):
            # NOTE(review): time.sleep blocks the Twisted reactor; the
            # DOWNLOAD_DELAY setting is the idiomatic way to pace requests.
            # Kept here to preserve the original's deliberate throttling.
            time.sleep(5)
            yield scrapy.Request(
                url="https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?px=default&city=%E5%85%A8%E5%9B%BD#filterBox",
                method='GET',
                # Same URL every iteration; bypass the duplicate filter.
                dont_filter=True,
                meta={'page': page},
                # An explicit callback: the original's callback=None would
                # default back to parse() and recurse.
                callback=self.parse_listing)

    def parse_listing(self, response):
        """The warm-up GET completed: now POST for this page's JSON."""
        time.sleep(5)
        yield scrapy.FormRequest(
            url="https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0",
            formdata={
                'first': 'true',
                'pn': str(response.meta['page']),
                'kd': '爬虫'},
            dont_filter=True,
            callback=self.parse_page)

    def parse_page(self, response):
        """Yield one Wde32Item per job found in the JSON payload.

        A malformed or non-JSON response (rate-limited, or past the last
        page) ends the crawl quietly, matching the original behavior but
        with the bare ``except:`` narrowed to the exceptions that can
        actually occur here.
        """
        try:
            datas = json.loads(response.body.decode("utf-8"))["content"]["positionResult"]["result"]
            for data in datas:
                # Fresh item per job — the original mutated and re-yielded a
                # single shared instance, so downstream pipelines could see
                # later jobs' data overwrite earlier ones.
                iters = Wde32Item()
                iters["add_url"] = "https://www.lagou.com/jobs/" + str(data["positionId"]) + ".html"
                iters["name"] = data['companyFullName']
                iters["createTime"] = data["createTime"]
                iters["secondType"] = data["secondType"]
                iters["positionName"] = data["positionName"]
                iters["education"] = data["education"]
                iters["workYear"] = data["workYear"]
                iters["city"] = data["city"]
                iters["salary"] = data["salary"]
                iters['companyShortName'] = data["companyShortName"]
                iters["companySize"] = data["companySize"]
                yield iters
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
            # an unexpected payload shape.
            print('爬取结束')