这段时间的学习总结,如有雷同,肯定不是抄袭。hhhh……
建立IP池前需要把IP存储到本地。
这里爬取的是西刺IP代理:
class GetForignProxy(object):
    """Scrape free proxies from the xici proxy list and store them in MySQL.

    Instantiating the class with a page URL fetches the page, parses every
    proxy row and inserts each one into the ``xici_proxy`` table.

    NOTE(review): the DB connection below is opened at class-definition time
    (an import side effect) with hard-coded credentials — consider moving it
    into ``__init__`` and reading credentials from config.
    """
    # Disguise the request as a regular browser so the site does not block us.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }
    # Scratch buffer holding one proxy record before it is inserted.
    # NOTE(review): this is a class attribute shared by all instances.
    items = {
        "ip": '',
        "post": '',
        "address": '',
        "speed": 0,
        "conn_time": 1000
    }
    # Database connection and cursor, shared by all instances.
    sql_con = MySQLdb.connect(host="localhost",
                              user="root",
                              passwd="12345",
                              db="spider_article",
                              charset="utf8"
                              )
    cursor = sql_con.cursor()

    def __init__(self, url):
        """Fetch *url* and persist every proxy row found on the page."""
        self.resp = self.get_response(url)
        # process_response() is a generator: exhaust it so that the insert
        # for every row is actually executed.
        for _ in self.process_response(self.resp):
            pass

    def get_response(self, url):
        """Return the HTTP response for *url*, sent with the fake header."""
        return requests.get(url, headers=self.headers)

    def process_response(self, resp):
        """Parse the xici proxy table and yield one DB insert per row."""
        # BUG FIX: the original read "sself.selet = ..." — a NameError
        # at runtime.
        self.selet = Selector(text=resp.text)
        self.all_trs = self.selet.xpath("//table[@id= 'ip_list']//tr[@class= 'odd']")
        for tr in self.all_trs:
            # Hoist the repeated xpath extraction: the original called
            # tr.xpath(".//td//text()").extract() once per field.
            tds = tr.xpath(".//td//text()").extract()
            self.items["ip"] = tds[0]
            self.items["post"] = tds[1]
            self.items["address"] = tds[3]
            titles = tr.xpath(".//div/@title").extract()
            speed = titles[0]
            conn_time = titles[1]
            if speed:
                # Titles look like "0.123秒" — strip the unit and parse.
                self.items["speed"] = float(speed.split('秒')[0])
                self.items["conn_time"] = float(conn_time.split('秒')[0])
            # Yield so the caller's loop drives one insert per row.
            yield self.insert_to_mysql(self.items)

    def insert_to_mysql(self, items):
        """Insert one proxy record (parameterized — safe from injection)."""
        sql_insert = """
        insert into xici_proxy
        values
        (%s,%s,%s,%s,%s)
        """
        self.cursor.execute(sql_insert, (items["ip"], items["post"],
                                         items["address"], items["speed"],
                                         items["conn_time"]))
        self.sql_con.commit()
取用的时候,从库中随机取一个IP:
class GetRandomIp(object):
    """Pick a random working proxy from the ``xici_proxy`` table.

    Call :meth:`get_random_ip` to obtain a ``"https://ip:port"`` string.
    """

    def __init__(self):
        # NOTE(review): hard-coded credentials — consider reading from config.
        self.sql_con = MySQLdb.connect(host="localhost",
                                       user="root",
                                       passwd="12345",
                                       db="spider_article",
                                       charset="utf8")
        self.cursor = self.sql_con.cursor()

    def get_random_ip(self):
        """Return a random usable proxy URL, or None when the pool is empty.

        Draws random rows until one passes JudgeIp(). Rows that fail the
        check are deleted inside JudgeIp(), so the pool shrinks and the
        loop terminates.
        """
        sql_select = """select ip,post
        from xici_proxy
        ORDER BY RAND()
        LIMIT 1
        """
        while True:
            self.cursor.execute(sql_select)
            rows = self.cursor.fetchall()
            if not rows:
                # BUG FIX: the original re-queried forever when the table
                # was empty; report "no proxy available" instead.
                return None
            ip, post = rows[0][0], rows[0][1]
            proxy_ip = "https://" + ip + ':' + post
            # Keep the proxy only if it actually works.
            if self.JudgeIp(proxy_ip, ip):
                return proxy_ip
判断IP是否可用:
# Validate a proxy by routing a request to Baidu through it (proxies= dict).
def JudgeIp(self, proxy_ip, ip):
    """Return True if *proxy_ip* works; otherwise delete *ip* and return False.

    proxy_ip -- full proxy URL ("https://ip:port") passed to requests
    ip       -- bare IP string, used as the DB key when deleting a dead proxy
    """
    # Any page works for the check; Baidu is a cheap, always-up target.
    post_url = "https://www.baidu.com/"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"
    }
    try:
        proxy_dict = {"https": proxy_ip}
        print("...try to connect...")
        # BUG FIX: without a timeout a dead proxy hangs this call forever;
        # bound the wait so bad proxies fail fast.
        resp = requests.get(post_url, proxies=proxy_dict,
                            headers=header, timeout=10)
    except Exception:
        # Connection/timeout failure: the proxy is dead — purge it.
        print("invalid proxy_dict")
        self.DeleteIp(ip)
        return False
    else:
        # Any 2xx status counts as a working proxy.
        if 200 <= resp.status_code < 300:
            return True
        else:
            # Proxy answered but not successfully — purge it as well.
            self.DeleteIp(ip)
            return False
# 从数据库删除掉不可用IP
# Remove an unusable IP from the database.
def DeleteIp(self, ip):
    """Delete *ip* from ``xici_proxy``; always returns True.

    BUG FIX: the original interpolated *ip* into the SQL text with
    str.format(), an SQL-injection vector; use a parameterized query so the
    driver escapes the value.
    """
    sql_delete = """delete from xici_proxy
    where ip=%s"""
    print("删除ip:" + ip + '\n')
    self.cursor.execute(sql_delete, (ip,))
    self.sql_con.commit()
    return True
最后调用时需要用 if __name__ == "__main__": 保护入口,使代码只在当前文件被直接运行时执行,这是 Python 的惯用写法:
# Entry point: only runs when this file is executed directly, not on import.
if __name__ == "__main__":
    # Alternative scrapers, kept for reference:
    #GetXiCiProxy("https://www.xicidaili.com/wn/")
    #GetForignProxy("http://www.data5u.com/free/gwpt/index.shtml")
    proxy_picker = GetRandomIp()
    print(proxy_picker.get_random_ip())