步骤
进入虚拟环境conda activate Scrap
在你想要创建项目的文件夹下,创建项目(Scrapy) D:\Users\Victor\Documents\GitHub\Scrapy>scrapy startproject Xc
创建爬虫,域名限制
cd Xc
scrapy genspider Xczz xicidaili.com
效果图
第一步确认要爬取元素
import scrapy


class XcItem(scrapy.Item):
    """Container for one proxy-server row scraped from xicidaili.com.

    Each attribute is a ``scrapy.Field``; the spider fills them in
    ``XczzSpider.parse`` and yields the item to the pipeline.
    """
    # Country abbreviation taken from the flag image's ``alt`` text.
    country = scrapy.Field()
    # Proxy IP address (table column 2).
    ipaddress = scrapy.Field()
    # Proxy port (table column 3).
    port = scrapy.Field()
    # Province/location of the proxy server (table column 4).
    serveraddr = scrapy.Field()
    # Anonymity level (table column 5).
    isanonymous = scrapy.Field()
    # Proxy type, e.g. HTTP/HTTPS (table column 6).
    type = scrapy.Field()
    # How long the proxy has been alive (table column 7).
    alivetime = scrapy.Field()
    # Last verification time (table column 8).
    # Fix: the spider assigns infos["verifitime"], but this field was
    # missing, which raises KeyError at runtime on a scrapy.Item.
    verifitime = scrapy.Field()
创建爬虫(注意 items 中需要声明 verifitime 字段,否则赋值时会抛出 KeyError)
# -*- coding: utf-8 -*-
import scrapy

from Xc.items import XcItem


def _first(values):
    """Return the first extracted string from *values*, or None if empty.

    Replaces the original's repeated ``try: x = x[0] except: x = None``
    blocks, whose bare ``except:`` could mask unrelated errors.
    """
    return values[0] if values else None


class XczzSpider(scrapy.Spider):
    """Scrape proxy-server rows from xicidaili.com into XcItem objects."""

    name = 'Xczz'
    # Domain restriction: only follow links within this domain.
    allowed_domains = ['xicidaili.com']
    # Initial crawl URL.
    start_urls = ['https://www.xicidaili.com/']

    def parse(self, response):
        """Parse the proxy table and yield one XcItem per row.

        The site splits table rows across two CSS classes ("odd" and ""),
        so both selections are concatenated before iterating.
        """
        rows = (response.xpath('//tr[@class="odd"]')
                + response.xpath('//tr[@class=""]'))
        for row in rows:
            # Fix: create a fresh item for every row. The original built a
            # single XcItem before the loop and yielded it repeatedly, so
            # all yielded items aliased one object and later rows could
            # overwrite earlier ones before the pipeline processed them.
            item = XcItem()
            item['country'] = _first(row.xpath('./td/img/@alt').extract())
            item['ipaddress'] = _first(row.xpath('./td[2]/text()').extract())
            item['port'] = _first(row.xpath('./td[3]/text()').extract())
            item['serveraddr'] = _first(row.xpath('./td[4]/text()').extract())
            item['isanonymous'] = _first(row.xpath('./td[5]/text()').extract())
            # Fix: the original extracted the proxy type but never stored
            # it on the item.
            item['type'] = _first(row.xpath('./td[6]/text()').extract())
            item['alivetime'] = _first(row.xpath('./td[7]/text()').extract())
            # NOTE(review): 'verifitime' must be declared on XcItem or this
            # assignment raises KeyError — the original had the same latent
            # bug; confirm the items module defines the field.
            item['verifitime'] = _first(row.xpath('./td[8]/text()').extract())
            yield item
修改机器人协议,添加请求头,日志文件自动创建
# Disable robots.txt compliance so the crawl is not blocked by the site's robots policy.
ROBOTSTXT_OBEY = False
# Default request headers sent with every request; the browser-like
# User-Agent helps avoid basic anti-bot blocking.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Language': 'zh-CN,zh;q=0.9',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
在settings最后添加上
# Enable logging to a file (created automatically in the project directory);
# only messages at ERROR level or above are written.
LOG_FILE = 'xcdl.log'
LOG_LEVEL = 'ERROR'
LOG_ENABLED = True