This post uses the CrawlSpider class of the Scrapy framework to follow the pagination links of the xici proxy site, match the proxy IP and port on every list page with XPath, and save the results to a txt file.
Because the site has far too many pages and the later ones are mostly dead proxies, we only crawl the first 9 pages.
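For reference, the project layout assumed by the code below (module proIP, spider crawlIP) can be created with Scrapy's standard commands:
scrapy startproject proIP
cd proIP
scrapy genspider -t crawl crawlIP xicidaili.com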
The spider file is as follows:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from proIP.items import ProipItem
import requests


class CrawlipSpider(CrawlSpider):
    name = 'crawlIP'
    allowed_domains = ['xicidaili.com']
    start_urls = ['http://www.xicidaili.com/nn/1']

    # Follow only the first 9 list pages, matched with a regular expression
    rules = (
        Rule(LinkExtractor(allow=r'^http://www.xicidaili.com/nn/[1-9]?$'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Extract the list of IP addresses
        ip_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[2]/text()').extract()
        # Extract the list of ports
        port_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[3]/text()').extract()
        # Extract the list of protocol types (HTTP / HTTPS)
        type_list = response.xpath('//table[@id="ip_list"]//tr[@class="odd" or @class=""]/td[6]/text()').extract()
        print(response.url)
        for ip, port, proxy_type in zip(ip_list, port_list, type_list):
            scheme = proxy_type.lower()
            proxies = {scheme: scheme + '://' + ip + ':' + port}
            try:
                # Request Baidu through the proxy; a 200 status code means the
                # proxy works, so hand the item over to the pipeline
                if requests.get('http://www.baidu.com', proxies=proxies, timeout=2).status_code == 200:
                    print('success %s' % ip)
                    item = ProipItem()
                    item['url'] = scheme + '://' + ip + ':' + port
                    yield item
            except requests.RequestException:
                print('fail %s' % ip)
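For clarity, the proxies argument passed to requests maps a scheme to a full proxy URL. A minimal standalone version of the same check, using a made-up proxy address purely for illustration, looks like this:
import requests

# Hypothetical proxy address, only for illustration
proxies = {'http': 'http://1.2.3.4:8080'}
try:
    resp = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
    print(resp.status_code)  # 200 means the proxy answered
except requests.RequestException as exc:
    print('proxy failed:', exc)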
The items file is as follows:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ProipItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
One thing worth noting is the User-Agent setting; modify the settings file as follows:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'www.xicidaili.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer': 'http://www.xicidaili.com/',
    # 'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTdjNDI5MzI0MjI2NzZhOTI3MmI5ZmRiYzQxMWRjNjZkBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUNYK3dac04xaXExRXhwYXlSZ251NHpTYmFuV011OEhHbjVBM09DOFl1WGs9BjsARg%3D%3D--106494228e4547863ecf9d88d308c67aecd3a08b',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
FEED_EXPORT_ENCODING = 'utf-8'
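The pipeline shown next also has to be enabled in the settings file, otherwise items are never written to the txt file. Assuming the default pipelines.py generated by scrapy startproject proIP:
ITEM_PIPELINES = {
    'proIP.pipelines.ProipPipeline': 300,
}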
The pipeline file is as follows:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ProipPipeline(object):
    def __init__(self):
        self.file = open('ip02.txt', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        text = item['url'] + '\n'
        self.file.write(text)
        return item

    def close_spider(self, spider):
        self.file.close()
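With everything in place, the spider is run with scrapy crawl crawlIP, and ip02.txt ends up holding one verified proxy URL per line. A minimal sketch of how the saved proxies might then be consumed in another script:
import random
import requests

# Read the proxies collected by the spider, one URL per line
with open('ip02.txt', encoding='utf-8') as f:
    proxy_urls = [line.strip() for line in f if line.strip()]

# Pick one at random and route a request through it
proxy = random.choice(proxy_urls)
scheme = proxy.split('://', 1)[0]
resp = requests.get('http://www.baidu.com', proxies={scheme: proxy}, timeout=5)
print(resp.status_code)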