Result screenshot:
Environment: Anaconda + MySQL + Redis
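Dependency note: the project below assumes the scrapy, scrapy-redis, pymysql, redis and itemadapter Python packages are installed in the Anaconda environment (for example: pip install scrapy scrapy-redis pymysql redis itemadapter), and that a local MySQL server and Redis server are running with the credentials used in settings.py.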
1.spider.py:
# -*- coding:utf-8 -*-
#import scrapy
from scrapy_redis.spiders import RedisSpider
import sys
sys.path.append("..")
from scrapy_parse_html.items import ScrapyParseHtmlItem
"""
Author:
Purpose: 爬取代理网站:https://www.kuaidaili.com/free/inha/1/
requirement:1.采用分布式爬虫
2.剔除广东移动代理和响应时间大于3秒的数据,
3.将IP、PORT、匿名度、类型、位置、响应速度、最后验证时间保存至mysql数据库、TXT和json
Created: 2020/8/16
"""
class KuaidailiSpider(RedisSpider):
    name = 'kuaidaili'
    allowed_domains = ['www.kuaidaili.com']
    # start_urls = ['https://www.kuaidaili.com/free/inha/1/']
    redis_key = "kuaidaili:url"  # start URLs are popped from this Redis list

    def parse(self, response):
        if response.status == 200:  # response.status is an int
            print(response.url)
            proxys = response.xpath("//*[@id='list']/table/tbody/tr")
            for proxy in proxys:
                proxys_item = ScrapyParseHtmlItem()  # one fresh item per table row
                proxys_item["ip"] = proxy.xpath("./td[1]/text()").extract()[0]
                proxys_item["port"] = proxy.xpath("./td[2]/text()").extract()[0]
                proxys_item["anonymous"] = proxy.xpath("./td[3]/text()").extract()[0]
                proxys_item["types"] = proxy.xpath("./td[4]/text()").extract()[0]
                proxys_item["location"] = proxy.xpath("./td[5]/text()").extract()[0]
                proxys_item["responding_speed"] = proxy.xpath("./td[6]/text()").extract()[0]
                proxys_item["Last_verification_time"] = proxy.xpath("./td[7]/text()").extract()[0]
                yield proxys_item  # each yielded item is handed to the pipelines
        else:
            print(f"Request failed, status code {response.status}")
2.settings.py:
# -*- coding:utf-8 -*-
# only the settings that were added or changed are listed here
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
ITEM_PIPELINES = {
    'scrapy_parse_html.pipelines.FilterData_1': 200,  # the smaller the value, the earlier the pipeline runs
    'scrapy_parse_html.pipelines.FilterData_2': 201,
    'scrapy_parse_html.pipelines.FilterData_3': 202,
    'scrapy_parse_html.pipelines.ScrapyParseHtmlPipeline': 300,
    'scrapy_parse_html.pipelines.InitializationAndClear': 400,
}
LOG_LEVEL = "ERROR"
LOG_FILE ="root.log"
# MySQL connection info
HOST = "127.0.0.1"
PORT = "3306"
USER = "root"
PWD = "123456"
DATABASE = "python_test"
# SQL statements
CREATE_SQL = """create table if not exists proxyDB(
    id int PRIMARY KEY AUTO_INCREMENT,
    ip varchar(50) COMMENT "IP",
    port varchar(10) COMMENT "port",
    anonymous varchar(20) COMMENT "anonymity",
    types varchar(20) COMMENT "type",
    location VARCHAR(100) COMMENT "location",
    responding_speed VARCHAR(10) COMMENT "response speed",
    Last_verification_time VARCHAR(50) COMMENT "last verified time"
    )
"""
# a Python expression (evaluated with eval() in the pipeline) that formats the current item's values into the statement
INSERT_SQL = """'insert into proxyDB(ip,port,anonymous,types,location,responding_speed,Last_verification_time)values("{ip}","{port}","{anonymous}","{types}","{location}","{responding_speed}","{Last_verification_time}")'.format(**item)"""
# scrapy-redis scheduler and request dedupe class
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis connection info
REDIS_URL = "redis://:123456@127.0.0.1:6379"
# scheduling queue: first in, first out
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.FifoQueue"
# keep the Redis queue and dupefilter after the spider closes (optional)
SCHEDULER_PERSIST = True
# flush the queue on start so the site can be re-crawled (optional)
SCHEDULER_FLUSH_ON_START = True
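Note that INSERT_SQL is not a finished SQL statement: it is a Python expression stored as a string, and process_item() in pipelines.py builds the real statement with eval(), reading the field values from the current item. A small stand-alone sketch of that expansion (the item values here are made up for illustration):

# illustration only: expanding INSERT_SQL the way the pipeline does
INSERT_SQL = """'insert into proxyDB(ip,port,anonymous,types,location,responding_speed,Last_verification_time)values("{ip}","{port}","{anonymous}","{types}","{location}","{responding_speed}","{Last_verification_time}")'.format(**item)"""
item = {  # sample values, not real scraped data
    "ip": "1.2.3.4", "port": "8080", "anonymous": "高匿名", "types": "HTTP",
    "location": "北京", "responding_speed": "0.5秒",
    "Last_verification_time": "2020-08-16 12:00:00",
}
sql = eval(INSERT_SQL)  # eval() can see the local variable named item
print(sql)

Since the statement is built by plain string formatting, it would break (or be unsafe) if a scraped value ever contained a double quote; passing parameters to cursor.execute(sql, params) would be the more robust alternative.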
3.items.py:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrapyParseHtmlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    ip = scrapy.Field()
    port = scrapy.Field()
    anonymous = scrapy.Field()
    types = scrapy.Field()
    location = scrapy.Field()
    responding_speed = scrapy.Field()
    Last_verification_time = scrapy.Field()
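For reference, an item behaves like a dict once populated, and dict(item) is exactly the one-line representation that the JSON pipeline below appends to the temporary proxy_middleware.json file. A stand-alone sketch with made-up values:

# stand-alone sketch: how a populated item converts to a plain dict
from scrapy_parse_html.items import ScrapyParseHtmlItem

item = ScrapyParseHtmlItem()
item["ip"] = "1.2.3.4"            # sample value
item["port"] = "8080"
item["anonymous"] = "高匿名"
item["types"] = "HTTP"
item["location"] = "北京"
item["responding_speed"] = "0.5秒"
item["Last_verification_time"] = "2020-08-16 12:00:00"
print(dict(item))  # one line in this form is written per item by save_json()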
4.pipelines.py:
# -*- coding:utf-8 -*-
import json
import os
import pymysql
import redis
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class FilterData_1:
    """Drop proxies located in Guangdong"""
    def process_item(self, item, spider):
        if "广东" in item["location"]:
            raise DropItem("Guangdong proxy dropped")
        return item


class FilterData_2:
    """Drop proxies operated by China Mobile"""
    def process_item(self, item, spider):
        if "移动" in item["location"]:
            raise DropItem("China Mobile proxy dropped")
        return item


class FilterData_3:
    """Drop proxies whose response time is 3 seconds or more"""
    def process_item(self, item, spider):
        time = item["responding_speed"].replace("秒", "")  # strip the "seconds" suffix
        if float(time) >= 3:
            raise DropItem("Slow proxy dropped")
        return item
class InitializationAndClear:
    def __init__(self, setting):  # read the global values defined in settings.py
        self.host = setting.get("HOST")
        self.port = setting.get("PORT")
        self.user = setting.get("USER")
        self.pwd = setting.get("PWD")
        self.database = setting.get("DATABASE")
        self.create_sql = setting.get("CREATE_SQL")
        self.insert_sql = setting.get("INSERT_SQL")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(setting=crawler.settings)  # hand the settings object to __init__

    def process_item(self, item, spider):
        # INSERT_SQL is a Python expression; eval() formats the current item into it
        self.updateDB(eval(self.insert_sql))
        return item

    def open_spider(self, spider):
        """When the spider starts: create the MySQL table and push the page URLs into Redis"""
        try:
            print(self.host, self.port, self.user, self.pwd, self.database)
            self.connection = pymysql.connect(host=self.host, port=int(self.port), user=self.user,
                                              password=self.pwd, db=self.database, charset="utf8mb4")
            self.cursor = self.connection.cursor()
            self.cursor.execute(self.create_sql)
            print("MySQL table created")
        except Exception as e:
            print("MySQL connection failed", self.create_sql, e)
        try:
            url = "https://www.kuaidaili.com/free/inha/{}/"
            connection = redis.Redis(host="127.0.0.1", port=6379, password="123456")
            for i in range(1, 3589):
                connection.lpush("kuaidaili:url", url.format(i))
        except Exception as e:
            print("Failed to push start URLs into Redis", e)
    def close_spider(self, spider):
        """When the spider closes: turn the temporary file into real JSON and clean up"""
        try:
            cc = []
            # read proxy_middleware.json line by line and rebuild each dict
            with open("proxy_middleware.json", "r", encoding="utf-8") as f:
                for line in f:
                    cc.append(eval(line.strip()))
            json_data = json.dumps(cc, indent=4, ensure_ascii=False)
            # overwrite the previous file and write the final JSON
            with open("proxy.json", "w", encoding="utf-8") as f:
                f.write(json_data)
            print("JSON conversion succeeded")
        except Exception as e:
            print("JSON conversion failed", e)
        # remove the temporary file
        try:
            os.remove("proxy_middleware.json")
        except Exception as e:
            print("Failed to remove the temporary json file", e)

    def updateDB(self, sql):
        try:
            self.cursor.execute(sql)
            print("Row inserted")
            self.connection.commit()
        except Exception as e:
            print("Insert failed", sql, e)
            self.connection.rollback()
class ScrapyParseHtmlPipeline:
    def process_item(self, item, spider):
        print(item["location"], item["ip"], item["responding_speed"])
        self.save_txt(item)   # save to TXT
        self.save_json(item)  # half-finished JSON; close_spider() finishes the conversion
        return item

    def save_txt(self, item):
        try:
            with open("proxy.txt", "a+") as f:
                f.write("{ip}\t{port}\t{anonymous}\t{types}\t{location}\t{responding_speed}\t{Last_verification_time}\n".format(**item))
        except Exception as e:
            print("Failed to write the TXT file", e)

    def save_json(self, item):
        """
        Half-finished JSON: each item is appended as one dict repr per line;
        close_spider() in InitializationAndClear turns the file into real JSON.
        """
        try:
            a = str(dict(item))
            with open("proxy_middleware.json", "a+", encoding="utf-8") as f:
                f.write(a + "\n")
        except Exception as e:
            print("Failed to write the JSON file", e)