本文继续使用 feapder 爬虫框架,爬取拉勾网上的 Python 职位列表。
代码如下:
import feapder
#轻量级爬虫
class LastAirSpider(feapder.AirSpider):
    """Spider that requests lagou.com Python job-listing pages and prints each job.

    For every listing page it prints the job title, the salary line, the
    company name and the company logo URL, one job per line.
    """

    def start_callback(self):
        """Print a start banner (feapder start hook; see the feapder docs)."""
        print("爬虫开始")

    def end_callback(self):
        """Print a finish banner (feapder end hook; see the feapder docs)."""
        print("爬虫结束")

    def start_requests(self):
        """Yield one request per listing page, for pages 1 through 5."""
        url_template = "https://www.lagou.com/wn/jobs?px=new&kd=Python&city=%E5%85%A8%E5%9B%BD&pn={}"
        page_count = 5
        # range() excludes its stop value, so add 1 to really cover five pages.
        for page in range(1, page_count + 1):
            yield feapder.Request(url_template.format(page))

    def validate(self, request, response):
        """Check a response before it is handed to ``parse``.

        Raises:
            Exception: if the HTTP status code is not 200.

        Returns:
            ``False`` when the body does not contain the text "400";
            otherwise ``None`` (implicit).
        """
        if response.status_code != 200:
            raise Exception("response code not 200")
        # NOTE(review): this rejects every page whose body lacks the literal
        # "400". Presumably feapder discards a response when validate returns
        # False -- confirm against the feapder docs, and confirm that "400"
        # is really the marker this check is meant to look for.
        if "400" not in response.text:
            return False

    def parse(self, request, response):
        """Extract job rows from a listing page and print them."""
        # Silently drop characters that cannot be decoded.
        response.encoding_errors = "ignore"

        # Container(s) that hold the job cards.
        article_lists = response.xpath('//div[@class="list__YibNq"]')
        for article in article_lists:
            # The leading "." keeps each query relative to this container; a
            # bare "//" would search the whole document again and repeat
            # every row once per container.
            posts = article.xpath('.//div[@class="p-top__1F7CL"]/a/text()').extract()
            pays = article.xpath('.//div[@class="p-bom__JlNur"]/text()').extract()
            names = article.xpath('.//div[@class="company-name__2-SjF"]/a/text()').extract()
            logos = article.xpath('.//div[@class="com-logo__1QOwC"]/img/@src').extract()

            # zip() stops at the shortest list, so rows are only printed
            # while all four fields are available.
            for post, pay, name, logo in zip(posts, pays, names, logos):
                print(post, pay, name, logo)
if __name__ == "__main__":
    # Script entry point: build the spider with ten worker threads, then run it.
    spider = LastAirSpider(thread_count=10)
    spider.start()
爬取效果:
以上就是全部内容,如有疑问欢迎私信交流。