Python CrawlSpider 爬取拉勾

最新推荐文章于 2019-03-27 16:11:39 发布

Test_C.

最新推荐文章于 2019-03-27 16:11:39 发布

阅读量529

点赞数 1

分类专栏： Python scrapy

本文链接：https://blog.csdn.net/weixin_42544006/article/details/84633869

版权

Python 同时被 2 个专栏收录

110 篇文章 3 订阅

订阅专栏

scrapy

16 篇文章 0 订阅

订阅专栏

创建 CrawlSpider

scrapy genspider -t crawl lagou www.lagou.com

爬虫文件

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LG_item
from ..tools.get_md5 import get_md5
import datetime
import re


class LagouSpider(CrawlSpider):
    """Crawl lagou.com and turn every job-detail page into an ``LG_item``.

    Link following is driven by ``rules``: links matching ``zhaopin/.*`` and
    ``gongsi/j.*`` are only followed, while links matching ``jobs/.+html``
    are followed and also handed to :meth:`parse_item`.
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com', 'https://www.lagou.com/gongsi/']

    custom_settings = {
        # Cookie handling is switched off, presumably so that the hand-copied
        # 'cookie' header below is what reaches the site -- TODO confirm.
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 0.2,
        # "CONCURRENT_REQUESTS" : 32,

        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            # NOTE(review): this is a session cookie captured from a browser
            # and committed to source. It looks time-limited (it embeds 2018
            # timestamps) and account-bound; consider loading it from
            # configuration instead of hard-coding it.
            'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        },
    }

    rules = (
        # Listing pages: followed for more links, never parsed themselves.
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j.*',)), follow=True),
        # Job-detail pages: parsed by parse_item and followed further.
        Rule(LinkExtractor(allow=r'jobs/.+html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract one job posting from a job-detail page.

        Args:
            response: the downloaded job-detail page.

        Yields:
            A populated ``LG_item``. Nothing is yielded when extraction
            fails; the failure is logged with its traceback and the page URL.
        """
        try:
            title = response.css(".job-name::attr(title)").extract()[0]
            url = response.url
            url_object_id = get_md5(url)
            # The request block must contain exactly five <span> texts;
            # any other count makes this unpacking raise ValueError.
            salary, job_city, work_years, degree_need, job_type = response.xpath(
                '//dd[@class="job_request"]/p/span/text()').extract()
            tags = '|'.join(response.css('.position-label.clearfix li::text').extract())
            pulish_time = response.css('.publish_time::text').extract()[0]
            job_advantage = response.css('.job-advantage p::text').extract()[0]
            job_desc = response.css('.job_bt div').extract()[0]
            # Strip markup, newlines, spaces and the "查看地图" link label
            # from the address block.
            job_addr = re.sub(r'<.*?>|\n| |查看地图', '',
                              response.css('.work_addr').extract()[0])
            company_name = response.css('#job_company dt img::attr(alt)').extract()[0]
            company_url = response.xpath(
                '//dl[@class="job_company"]/dd//li[4]/a/text()').extract_first("没有")
            crawl_time = datetime.datetime.now()
        except Exception as e:
            # Deliberately best-effort: a page that cannot be parsed is
            # skipped. Unlike a bare print, the spider logger records the
            # traceback and the URL, so failures can be diagnosed.
            self.logger.exception('出错了: %s (url=%s)', e, response.url)
            return

        # 'pulish_time' (sic) is the field name declared on LG_item.
        yield LG_item(
            title=title,
            url=url,
            url_object_id=url_object_id,
            salary=salary,
            job_city=job_city,
            work_years=work_years,
            degree_need=degree_need,
            job_type=job_type,
            tags=tags,
            pulish_time=pulish_time,
            job_advantage=job_advantage,
            job_desc=job_desc,
            job_addr=job_addr,
            company_name=company_name,
            company_url=company_url,
            crawl_time=crawl_time,
        )

settings 文件

# -*- coding: utf-8 -*-

# Scrapy settings for LaGou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Project identity and the package that holds the spiders.
BOT_NAME = "LaGou"

NEWSPIDER_MODULE = "LaGou.spiders"
SPIDER_MODULES = ["LaGou.spiders"]


# No custom user agent is configured here; the template default stays
# commented out.
#USER_AGENT = 'LaGou (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

# 全局请求头
# DEFAULT_REQUEST_HEADERS = {
#
#   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
#   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
#   "Accept-Encoding": "gzip, deflate, br",
#   "Accept-Language": "zh,zh-CN;q=0.9,en;q=0.8",
#   "Cache-Control": "max-age=0",
#   "Connection": "keep-alive",
#   "Host": "www.lagou.com",
#   "Upgrade-Insecure-Requests": "1",
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'LaGou.middlewares.LagouSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'LaGou.middlewares.LagouDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only the MySQL pipeline is enabled; the number is its order value.
ITEM_PIPELINES = {
    "LaGou.pipelines.LGMysqlPipeline": 1,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

items 文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    """Placeholder item that declares no fields."""


# Item definition for a lagou.com job posting.
class LG_item(scrapy.Item):
    """One job posting scraped from a lagou.com job-detail page.

    Every attribute is a plain ``scrapy.Field()`` with no metadata.
    """

    # Page identity
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    crawl_time = scrapy.Field()
    # crawl_update_time = scrapy.Field()

    # Headline facts about the job
    title = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field()
    work_years = scrapy.Field()
    degree_need = scrapy.Field()
    job_type = scrapy.Field()
    # Spelled 'pulish_time' (sic); the name is a key other code refers to,
    # so it must not be corrected here alone.
    pulish_time = scrapy.Field()
    tags = scrapy.Field()

    # Free-text sections
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()

    # Hiring company
    company_name = scrapy.Field()
    company_url = scrapy.Field()

pipelines 文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class LagouPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        unchanged = item
        return unchanged

# Store scraped items in MySQL.
class LGMysqlPipeline(object):
    """Item pipeline that inserts every item into the ``lagou_job`` table.

    The connection is opened when the pipeline is constructed and closed in
    :meth:`close_spider`. Each item is committed on its own; a failed insert
    is logged and rolled back, and the item is still passed on.
    """

    # Single source of truth for the column order: both the SQL text and the
    # parameter tuple are derived from it, so they cannot drift apart.
    _COLUMNS = (
        'title', 'url', 'url_object_id', 'salary', 'job_city', 'work_years',
        'degree_need', 'job_type', 'pulish_time', 'tags', 'job_advantage',
        'job_desc', 'job_addr', 'company_url', 'company_name', 'crawl_time',
    )
    _INSERT_SQL = 'insert into lagou_job({}) values({})'.format(
        ','.join(_COLUMNS), ','.join(['%s'] * len(_COLUMNS)))

    def __init__(self, host='127.0.0.1', user='root', password='cyl666.',
                 database='scrapy', charset='utf8'):
        """Open the MySQL connection and a cursor.

        All parameters default to the values that used to be hard-coded, so
        constructing the pipeline without arguments behaves as before.

        NOTE(review): the default credentials are committed to source;
        prefer supplying them from configuration.
        """
        # Keyword arguments: newer PyMySQL releases no longer accept the
        # connection parameters positionally.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset=charset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``lagou_job`` and return it.

        Args:
            item: mapping-like object providing every key in ``_COLUMNS``.
            spider: the running spider; its ``logger`` reports failures.

        Returns:
            The same ``item``, whether or not the insert succeeded.

        Raises:
            KeyError: if ``item`` lacks one of the expected keys.
        """
        data = tuple(item[column] for column in self._COLUMNS)
        try:
            self.cursor.execute(self._INSERT_SQL, data)
            self.conn.commit()
        except Exception as e:
            # Best-effort: one bad row must not stop the crawl.
            spider.logger.error('插入错误 %s', e)
            self._rollback_quietly(spider)

        return item

    def _rollback_quietly(self, spider):
        """Undo the failed statement; a rollback failure is only logged."""
        try:
            self.conn.rollback()
        except Exception as e:
            spider.logger.error('插入错误 %s', e)

    def close_spider(self, spider):
        """Close the cursor and then the connection."""
        try:
            self.cursor.close()
        finally:
            # The connection is released even if closing the cursor fails.
            self.conn.close()

Test_C.

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python CrawlSpider 爬取拉勾

创建 CrawlSpiderscrapy genspider -t crawl lagou www.lagou.com爬虫文件# -*- coding: utf-8 -*-import scrapyfrom scrapy.linkextractors import LinkExtractorfrom scrapy.spiders import CrawlSpider, Rul...
复制链接

扫一扫