创建 CrawlSpider
scrapy genspider -t crawl lagou www.lagou.com
爬虫文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import LG_item
from ..tools.get_md5 import get_md5
import datetime
import re
class LagouSpider(CrawlSpider):
    """CrawlSpider that walks lagou.com and scrapes job-detail pages.

    Starting from the home page and the company index, links matching
    ``zhaopin/`` and ``gongsi/j`` are followed without a callback; links
    matching ``jobs/...html`` are followed and also passed to
    :meth:`parse_item`, which yields one ``LG_item`` per page.
    """

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com', 'https://www.lagou.com/gongsi/']

    custom_settings = {
        # Cookie handling is switched off; the raw 'cookie' header below is
        # sent as an ordinary default request header instead.
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 0.2,
        # "CONCURRENT_REQUESTS" : 32,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            # NOTE(review): this is a hard-coded browser session cookie (it
            # carries LG_LOGIN_USER_ID and JSESSIONID values). It is kept
            # byte-for-byte to preserve behaviour, but it is probably stale
            # and should be loaded from configuration rather than committed.
            'cookie': '_ga=GA1.2.1809666113.1526002829; user_trace_token=20180511094034-4c4d62d7-54bc-11e8-949f-525400f775ce; LGUID=20180511094034-4c4d6608-54bc-11e8-949f-525400f775ce; LG_LOGIN_USER_ID=537d2089412cae011d73a44bb8911986e2cf8ecc81522b3c; JSESSIONID=ABAAABAAAGFABEF2F6A133027057686F2D420CEB60B7F87; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541598090,1542114329,1542774094,1543386087; _gid=GA1.2.118340539.1543386087; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=fb416e9ede186e36ef1a080ebf43ceba; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22%24device_id%22%3A%22167593fd949988-079e956e17a40f-4313362-2073600-167593fd94a1f2%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; LGSID=20181128222414-482f1608-f319-11e8-8c3d-5254005c3644; TG-TRACK-CODE=index_navigation; SEARCH_ID=8304e9b43803439494c1c91c06395eca; _gat=1; LGRID=20181128233044-924a1586-f322-11e8-8c40-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543419045',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
    }

    rules = (
        # Listing pages: followed only to discover more links.
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j.*',)), follow=True),
        # Job-detail pages: scraped by parse_item and followed further.
        Rule(LinkExtractor(allow=r'jobs/.+html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one ``LG_item`` for a job-detail page.

        Extraction is best-effort: if any field cannot be extracted the
        page is skipped and the failure is logged together with the URL,
        so that one malformed page does not interrupt the crawl.

        :param response: the downloaded job-detail page.
        """
        try:
            lagou_item = self._build_item(response)
        except Exception as e:
            # Deliberately broad: any extraction failure skips this page.
            # Routed through the spider logger (with traceback) instead of
            # print() so it shows up in Scrapy's log output.
            self.logger.warning('出错了: %s (%s)', e, response.url, exc_info=True)
            return
        yield lagou_item

    def _build_item(self, response):
        """Extract every job field from ``response`` into a new ``LG_item``.

        Raises ``IndexError`` when a selector that is indexed with ``[0]``
        matches nothing, and ``ValueError`` when the job-request row does
        not contain exactly five ``<span>`` texts.
        """
        lagou_item = LG_item()

        url = response.url
        lagou_item['title'] = response.css(".job-name::attr(title)").extract()[0]
        lagou_item['url'] = url
        lagou_item['url_object_id'] = get_md5(url)

        # The job-request row must yield exactly these five values, in order.
        (salary, job_city, work_years,
         degree_need, job_type) = response.xpath(
            '//dd[@class="job_request"]/p/span/text()').extract()
        lagou_item['salary'] = salary
        lagou_item['job_city'] = job_city
        lagou_item['work_years'] = work_years
        lagou_item['degree_need'] = degree_need
        lagou_item['job_type'] = job_type

        # All label texts joined into a single '|'-separated string.
        lagou_item['tags'] = '|'.join(
            response.css('.position-label.clearfix li::text').extract())
        lagou_item['pulish_time'] = response.css('.publish_time::text').extract()[0]
        lagou_item['job_advantage'] = response.css('.job-advantage p::text').extract()[0]
        # Kept as raw HTML of the description container.
        lagou_item['job_desc'] = response.css('.job_bt div').extract()[0]
        # Strip tags, newlines, spaces and the "查看地图" link text.
        lagou_item['job_addr'] = re.sub(
            r'<.*?>|\n| |查看地图', '', response.css('.work_addr').extract()[0])
        lagou_item['company_name'] = response.css(
            '#job_company dt img::attr(alt)').extract()[0]
        # Falls back to the literal "没有" when the fourth list entry has no link text.
        lagou_item['company_url'] = response.xpath(
            '//dl[@class="job_company"]/dd//li[4]/a/text()').extract_first("没有")
        lagou_item['crawl_time'] = datetime.datetime.now()
        return lagou_item
settings文件
# -*- coding: utf-8 -*-
# Scrapy settings for LaGou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "LaGou"

# The package searched for spiders is also where newly generated spiders go.
NEWSPIDER_MODULE = "LaGou.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'LaGou (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed by this project (set to True to obey them)
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Global request headers
# DEFAULT_REQUEST_HEADERS = {
#
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "zh,zh-CN;q=0.9,en;q=0.8",
# "Cache-Control": "max-age=0",
# "Connection": "keep-alive",
# "Host": "www.lagou.com",
# "Upgrade-Insecure-Requests": "1",
# }
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'LaGou.middlewares.LagouSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'LaGou.middlewares.LagouDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only the MySQL pipeline is enabled; the integer is its order value.
ITEM_PIPELINES = {"LaGou.pipelines.LGMysqlPipeline": 1}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
items 文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LagouItem(scrapy.Item):
    """Empty placeholder item; it declares no fields."""

    # define the fields for your item here like:
    # name = scrapy.Field()
# Item definition for a Lagou job posting
class LG_item(scrapy.Item):
    """One job posting scraped from a Lagou job-detail page.

    Every attribute is a plain ``scrapy.Field()`` with no metadata.
    """

    # Identity of the posting
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()  # presumably a hash of `url` -- confirm against the spider

    # Job requirements
    salary = scrapy.Field()
    job_city = scrapy.Field()
    work_years = scrapy.Field()
    degree_need = scrapy.Field()
    job_type = scrapy.Field()

    # Posting details ("pulish_time" is spelled this way on purpose: the
    # field name is part of the interface and must not be "corrected" here)
    pulish_time = scrapy.Field()
    tags = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()

    # Company information
    company_url = scrapy.Field()
    company_name = scrapy.Field()

    # Bookkeeping
    crawl_time = scrapy.Field()
    # crawl_update_time = scrapy.Field()
pipelines 文件
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class LagouPipeline(object):
    """Pass-through pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item
# Store items in MySQL
class LGMysqlPipeline(object):
    """Item pipeline that inserts every item into the ``lagou_job`` table.

    One connection and one cursor are opened when the pipeline is created
    and closed in :meth:`close_spider`. Each item is inserted and committed
    individually.
    """

    # Single source of truth for the column order, shared by the INSERT
    # statement and the parameter tuple so the two cannot drift apart.
    _COLUMNS = (
        'title', 'url', 'url_object_id', 'salary',
        'job_city', 'work_years', 'degree_need',
        'job_type', 'pulish_time', 'tags',
        'job_advantage', 'job_desc', 'job_addr',
        'company_url', 'company_name', 'crawl_time',
    )
    _INSERT_SQL = 'insert into lagou_job({}) values({})'.format(
        ','.join(_COLUMNS), ','.join(['%s'] * len(_COLUMNS)))

    def __init__(self, host='127.0.0.1', user='root', password='cyl666.',
                 database='scrapy', charset='utf8'):
        """Open the MySQL connection and cursor.

        All parameters default to the values that were previously
        hard-coded, so a no-argument construction behaves as before.

        NOTE(review): the default password is a credential committed to
        source; it should come from configuration instead.
        """
        # Keyword arguments: PyMySQL >= 1.0 no longer accepts positional
        # connection parameters.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset=charset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``lagou_job`` and return it.

        A failed insert is logged through ``spider.logger`` and rolled
        back; the item is still returned. A missing item key raises
        ``KeyError`` before any database work is attempted.
        """
        data = tuple(item[column] for column in self._COLUMNS)
        try:
            self.cursor.execute(self._INSERT_SQL, data)
            self.conn.commit()
        except Exception as e:
            # Deliberately broad: one bad row must not stop the crawl.
            spider.logger.error('插入错误 %s', e)
            # Discard whatever part of the failed transaction is pending so
            # it cannot be committed together with the next item.
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Close the cursor and then the connection."""
        self.cursor.close()
        self.conn.close()