Scraping job listings from Lagou.com with Scrapy

The full code is as follows.

I only started working with this framework today, and I haven't put much effort into countering the site's anti-scraping measures, so the crawler keeps getting detected.

# -*- coding: utf-8 -*-
import scrapy
from lagou.items import LagouItem
import re

class LagouZhaopinSpider(scrapy.Spider):
    name = 'lagou_zhaopin'
    allowed_domains = ['lagou.com']
    start_urls = ["https://www.lagou.com/zhaopin/1/"]

    def parse(self, response):
        li_list = response.xpath("//li[@class='con_list_item default_list']")
        for i in li_list:
            data_dict = LagouItem()
            data_dict["title"] = i.xpath(".//h3/text()").extract()
            data_dict["addr"] = i.xpath(".//span[@class='add']/em/text()").extract()

            # follow the link to the detail page to collect the rest of the item
            detail_url = i.xpath('.//a[@class="position_link"]/@href').extract_first()
            if detail_url is not None:  # guard: some list items may lack a link
                yield scrapy.Request(
                    detail_url,
                    callback=self.parse_detail,
                    meta={"data_dict": data_dict}
                )

        # request the next page
        # extract_first() returns a str, or None if nothing matched
        next_page_url = response.xpath("//a[text()='下一页']/@href").extract_first()
        print(next_page_url)
        # on the last page the link's href is "javascript:;" rather than a real URL
        if next_page_url is not None and next_page_url != "javascript:;":
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_detail(self, response):  # extract data from the detail page
        data_dict = response.meta["data_dict"]

        # strip extra characters (spaces, newlines, etc.)
        content_list = response.xpath('//*[@id="job_detail"]/dd[2]/div//text()').extract()
        content_list = [re.sub(r"\s", "", i) for i in content_list]  # remove all whitespace
        data_dict["detail"] = [i for i in content_list if len(i) > 0]  # drop now-empty strings
        yield data_dict
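
The spider assumes a LagouItem with the three fields populated above. The post doesn't show items.py, so this is a minimal sketch of what it would need to contain:

# lagou/items.py -- assumed sketch; the original post does not show this file
import scrapy

class LagouItem(scrapy.Item):
    title = scrapy.Field()   # job title text from the listing page
    addr = scrapy.Field()    # job location from the listing page
    detail = scrapy.Field()  # cleaned description lines from the detail page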


I collected a bunch of user-agent strings from around the web

and set them in settings.py:

import random

USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]

USER_AGENT = random.choice(USER_AGENT_LIST)
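
Note that random.choice here runs only once, when Scrapy loads the settings, so every request in a run shares the same user agent. To rotate per request you could add a small downloader middleware; a minimal sketch (the class and file name are my own, not part of the original project):

# lagou/middlewares.py -- hypothetical per-request rotation
import random
from lagou.settings import USER_AGENT_LIST

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # overwrite the User-Agent header on every outgoing request
        request.headers["User-Agent"] = random.choice(USER_AGENT_LIST)

It would then be enabled in settings.py with DOWNLOADER_MIDDLEWARES = {"lagou.middlewares.RandomUserAgentMiddleware": 400}.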
Print the results in pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
client = MongoClient()
db = client["lagou"]

class LagouPipeline:
    def process_item(self, item, spider):

        # this is where the data would be saved
        print(item)
        # db.zhaopin.insert_one(dict(item))
        return item
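
The pipeline only runs if it is enabled in settings.py, as the boilerplate comment above reminds us; for example:

# settings.py -- enable the pipeline; 300 is an arbitrary middle priority
ITEM_PIPELINES = {
    "lagou.pipelines.LagouPipeline": 300,
}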

The output of a run (started with scrapy crawl lagou_zhaopin) is shown below.

[screenshot of the crawler's console output]
My takeaway: the crawl is very fast, but without any counter-anti-scraping measures it gets detected easily.
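
If I wanted the crawler to be less detectable, the first step would be slowing it down. Scrapy has built-in settings for this; a minimal sketch with illustrative values:

# settings.py -- illustrative politeness/throttling settings
DOWNLOAD_DELAY = 2               # wait ~2 seconds between requests
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay between 0.5x and 1.5x
AUTOTHROTTLE_ENABLED = True      # back off automatically if the server slows down
COOKIES_ENABLED = False          # don't carry session cookies across requests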
