基于 Scrapy、Selenium 与 PhantomJS 的腾讯招聘爬虫

项目执行入口 main.py:
from scrapy import cmdline

if __name__ == '__main__':
    # Launch the "tenxun" spider through Scrapy's CLI entry point.
    # --nolog suppresses Scrapy's console logging; drop it when debugging.
    # The __main__ guard keeps the crawl from starting if this module is
    # ever imported rather than executed directly.
    cmdline.execute('scrapy crawl tenxun --nolog'.split())
spider 文件 tenxun.py:
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem


class TenxunSpider(scrapy.Spider):
    """Crawl Tencent recruitment listing pages and yield one item per posting.

    The listing pages are JavaScript-rendered, so the downloader middleware
    (TencentMiddle) fetches them with a real browser before parse() runs.
    """
    name = 'tenxun'
    allowed_domains = ['careers.tencent.com']
    # Listing pages 1..429 of https://careers.tencent.com/search.html.
    # A comprehension replaces the original class-body for-loop, which
    # leaked its loop variables (i, base_url) as class attributes.
    start_urls = [
        'https://careers.tencent.com/search.html?index={}'.format(i)
        for i in range(1, 430)
    ]

    def parse(self, response):
        """Extract title/content/detail for every job card on one page."""
        div_list = response.xpath('//div[@class="recruit-wrap recruit-margin"]/div')
        for site in div_list:
            # Bug fix: build a fresh item per job card. The original reused
            # one TencentItem across the whole loop, so every yielded
            # reference pointed at the same mutated object.
            item = TencentItem()
            item['title'] = site.xpath('.//h4/text()').extract_first()
            # Join the <span> fragments of the tips line with '|'.
            content = site.xpath('.//p[@class="recruit-tips"]/span/text()').extract()
            item['content'] = '|'.join(content)
            item['detail'] = site.xpath('.//p[@class="recruit-text"]/text()').extract_first()
            yield item
items.py:
import scrapy


class TencentItem(scrapy.Item):
    """Container for one Tencent job posting scraped by TenxunSpider."""

    title = scrapy.Field()    # job title (listing-page <h4> text)
    content = scrapy.Field()  # '|'-joined tips-line span fragments
    detail = scrapy.Field()   # short job description paragraph
pipelines.py:
import pymongo


class Pipeline(object):
    """Item pipeline that stores scraped items in MongoDB.

    Each item class gets its own collection, named after the class
    (e.g. TencentItem -> collection "TencentItem").
    """

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings."""
        # Bug fix: settings.py defines MONGO_URI, but the original read the
        # nonexistent key MONGO_URL, so MongoClient was always given None.
        return cls(
            mongo_url=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
        )

    def open_spider(self, spider):
        # One client per spider run; closed again in close_spider().
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection_name = item.__class__.__name__
        # insert_one() replaces the deprecated Collection.insert().
        self.db[collection_name].insert_one(dict(item))
        return item
settings.py:
# Scrapy project configuration for the Tencent recruitment crawler.
BOT_NAME = 'tencent'
SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'
# robots.txt is deliberately ignored so the listing pages can be fetched.
ROBOTSTXT_OBEY = False
# Browser-like default headers; the UA string mimics desktop Chrome 75.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Selenium-rendering downloader middleware (TencentMiddlewares.py).
DOWNLOADER_MIDDLEWARES = {
'tencent.TencentMiddlewares.TencentMiddle': 543,
}
# MongoDB storage pipeline (pipelines.py).
ITEM_PIPELINES = {
'tencent.pipelines.Pipeline': 300,
}
# MongoDB connection settings read via crawler.settings in the pipeline.
# NOTE(review): pipelines.py looks settings up under the key 'MONGO_URL',
# while this key is 'MONGO_URI' — confirm the two are aligned.
MONGO_URI = 'localhost'
MONGO_DATABASE = 'tencent2'
下载器中间件 TencentMiddlewares.py(注册于 DOWNLOADER_MIDDLEWARES):
from selenium import webdriver
import time
from scrapy.http import HtmlResponse


class TencentMiddle(object):
    """Downloader middleware that renders JS-heavy pages with PhantomJS.

    Returning an HtmlResponse from process_request short-circuits Scrapy's
    normal download, so the spider parses the browser-rendered HTML.
    """

    def process_request(self, request, spider):
        # NOTE(review): PhantomJS support is deprecated in newer Selenium
        # releases — consider headless Chrome/Firefox when upgrading.
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            time.sleep(0.1)  # crude wait for the page's JS to populate the DOM
            html = driver.page_source
        finally:
            # Bug fix: the original never closed the browser, leaking one
            # PhantomJS process per request (one per page, 429 pages).
            driver.quit()
        return HtmlResponse(url=request.url, body=html,
                            encoding='utf-8', request=request)