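# The MySQL database used here is the one bundled with phpStudy, which is easy to install.
# Requires the third-party MySQLdb library to be installed.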
import MySQLdb,requests
from bs4 import BeautifulSoup
def get_ip_list(url, headers, timeout=10):
    """Yield proxy entries scraped from the HTML table rows at *url*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup
    (lxml parser). Every ``<tr>`` except the first is inspected, and the
    text of its cells at indexes 1, 2 and 5 is yielded. The caller in this
    file stores them as IP, port and type.

    Args:
        url: Address of the listing page to download.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper indefinitely.

    Yields:
        A ``(cell1_text, cell2_text, cell5_text)`` tuple of strings per row.
    """
    web_data = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # The first <tr> is skipped, as before (presumably the table header).
    for row in soup.find_all('tr')[1:]:
        tds = row.find_all('td')
        # Rows without enough cells (separators, ads, malformed markup)
        # would otherwise raise IndexError on tds[5].
        if len(tds) < 6:
            continue
        yield (tds[1].text, tds[2].text, tds[5].text)
def main():
    """Scrape listing pages 1-50 and insert every proxy into the ``ips`` table.

    Each page URL is the base URL followed by the page number. Every tuple
    produced by ``get_ip_list`` is printed and inserted as (IP, Port, Type).
    Inserts are committed once per page, and the cursor and connection are
    always closed, even when a request or insert fails.
    """
    url = 'http://www.xicidaili.com/nn/'  # base URL of the site to scrape
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # Placeholders are left unquoted: the driver escapes and quotes the
    # values itself, so scraped text cannot break or inject into the SQL.
    sql = "insert into ips(IP,Port,Type) VALUES (%s,%s,%s)"
    # Connect to the database (host, user, password, database name).
    db = MySQLdb.connect(host='localhost', user='root', passwd='root', db='mydatebase')
    try:
        cur = db.cursor()
        try:
            for index in range(1, 51):
                page_url = url + str(index)
                print("第%d页面下载" % index)
                for row in get_ip_list(page_url, headers):
                    print(str(row).rjust(50, " "))
                    cur.execute(sql, (row[0], row[1], row[2]))
                # MySQLdb does not autocommit by default; without this the
                # inserts are discarded when the connection closes.
                db.commit()
        finally:
            cur.close()
    finally:
        db.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()