1、http://www.xicidaili.com/wt 国内免费代理网站
2、利用scrapy爬取该网站内的IP地址与端口,写入txt文档
3、编写脚本测试txt文档中的ip地址与端口是否可用
4、将可用的ip地址与端口写入txt文档
————————————————————————
1、编写Item类
由于我们只需要ip地址与端口,所以只写一个属性即可
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class IpItem(scrapy.Item):
    """Placeholder item generated by `scrapy startproject`; not used."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class IpInfoItem(scrapy.Item):
    """Carries one scraped proxy as a single 'ip' field ("host:port")."""
    ip = scrapy.Field()
2、编写spider
# -*- coding: utf-8 -*-
import scrapy
import sys

sys.path.append("D:\\pycodes\\ip")
from ip.items import IpInfoItem


class IpSpider(scrapy.Spider):
    """Crawl the first 5 listing pages of xicidaili.com/wt and yield one
    IpInfoItem per proxy row, with `ip` formatted as "host:port"."""

    name = 'Ip'
    # Crawl 5 pages of the free-proxy listing.
    start_urls = ['http://www.xicidaili.com/wt/' + str(i) for i in range(1, 6)]

    def parse(self, response):
        for sel in response.xpath('//tr'):
            ip = sel.xpath('.//td[2]/text()').extract_first()
            port = sel.xpath('.//td[3]/text()').extract_first()
            # The table header <tr> has no <td> cells; skip such rows
            # instead of emitting the literal string "None:None".
            if ip is None or port is None:
                continue
            # Create a fresh item per row. The original built a single
            # item before the loop and mutated it on every iteration, so
            # all yielded references pointed at the same underlying data.
            item = IpInfoItem()
            item['ip'] = str(ip) + ":" + str(port)
            yield item
3、编写pipeline
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class IpPipeline(object):
    """No-op pipeline generated by `scrapy startproject`; passes items through."""

    def process_item(self, item, spider):
        return item


class IpInfoPipeline(object):
    """Append each scraped "host:port" string to xinresult.txt, one per line."""

    def process_item(self, item, spider):
        # Only the 'ip' field is written out. Kept best-effort like the
        # original, but the bare `except: pass` is narrowed: only a
        # missing field or a file-system error is ignored, and the file
        # handle is now closed via `with` (the original leaked it).
        try:
            content = item['ip']
            with open("xinresult.txt", "a") as f:
                f.write(content + "\n")
        except (KeyError, OSError):
            pass
        return item
至此,我们从网站上爬下来了5页的IP,需要编写脚本进行测试
import requests

# Proxies that answered the probe request; filled in by test_alive().
alive_ip = []


def test_alive(proxy):
    """Probe every "host:port" string in *proxy* against baidu.com and
    record the working ones in the module-level ``alive_ip`` list."""
    global alive_ip
    for proxies_be in proxy:
        # requests expects proxies as a {scheme: address} mapping.
        proxies = {"http": proxies_be}
        print("正在测试:{}".format(proxies))
        try:
            r = requests.get("http://www.baidu.com", proxies=proxies, timeout=2)
        except requests.exceptions.RequestException:
            # Narrowed from a bare `except:` — only network/HTTP errors
            # mean the proxy is dead; any other exception should surface.
            print("失败")
            continue
        if r.status_code == 200:
            print("成功,ip为{}".format(proxies))
            alive_ip.append(proxies_be)
        else:
            print("失败")


def out_file(alive_ip=None):
    """Write each entry of *alive_ip* to alive_ip.txt, one per line."""
    # None sentinel replaces the original mutable default argument ([]),
    # which is shared across calls in Python.
    if alive_ip is None:
        alive_ip = []
    with open("alive_ip.txt", "w") as f:
        for ip in alive_ip:
            f.write(str(ip) + "\n")
    print("输出完毕")


def test(filename="blank.txt"):
    """Read proxies from *filename*, probe them all, then save the live
    ones via out_file()."""
    with open(filename, "r") as f:
        lines = f.readlines()
    proxys = list(map(lambda x: x.strip(), lines))
    test_alive(proxys)
    out_file(alive_ip)


if __name__ == "__main__":
    # Guarded so that importing this module no longer kicks off a crawl.
    test("xinresult.txt")