# Design notes:
# 1. Pick a free-proxy website and scrape its proxy IP listings;
# 2. Verify that each proxy IP actually works;
# 3. Store the working proxy IPs in the database.
import requests
import MySQLdb
from bs4 import BeautifulSoup
# Browser-like request headers so the scraped sites don't reject us as a bot.
# The original pinned "Host: map.baidu.com", which is wrong for every URL this
# script requests (xicidaili.com, webkaka.com) — requests derives the Host
# header from the URL, so it must not be hard-coded here.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
}
# Open the MySQL connection and make sure the `proxies` table exists.
# NOTE(review): credentials are hard-coded — move them to config/env vars
# before using this anywhere beyond a local experiment.
db = MySQLdb.connect("localhost", "root", 'liao1234', 'liao')
cursor = db.cursor()
# `if not exists` lets the script be re-run; the original DDL raised
# "table already exists" on every run after the first.
sql = """create table if not exists proxies(
        ip char(20) not null,
        port char(20),
        area char(20),
        nm char(20),
        type char(20),
        livetime char(20),
        ytime char(20))"""
cursor.execute(sql)
r = requests.get("http://www.xicidaili.com/",headers=headers)
html =r.text
soup = BeautifulSoup(html)
for tag in soup.find('table',id='ip_list').find_all('tr'):
ss = []
for aa in tag.find_all('td'):
if aa.string is None:
continue
else:
print aa.string
ss.append(aa.string)
if len(ss) == 0:
continue
else:
domian = "http://"+str(ss[0])+":"+str(ss[1])
proxies = { "http": domian, "https": domian, }
try:
r1 = requests.get("http://www.webkaka.com/", proxies=proxies,headers=headers,timeout=5)
print r1.status_code
if r1.status_code == 200:
sql = "insert into proxies(ip,port,area,nm,type,livetime,ytime) values('%s','%s','%s','%s','%s','%s','%s')"%(ss[0].encode('utf-8'),ss[1].encode('utf-8'),ss[2].encode('utf-8'),ss[3].encode('utf-8'),ss[4].encode('utf-8'),ss[5].encode('utf-8'),ss[6].encode('utf-8'))
cursor.execute(sql)
else:
print "code is not 200"
except:
print "this ip is droped!"
db.close
# Scan results are printed above as the script runs.