爬取西刺代理网页上的代理信息，并保存到本地的 TXT 文本或 MySQL 数据库中。
本文仅作为网页爬取的练习。
代码如下：
爬取西刺代理网页上的代理信息，并保存到本地的 TXT 文本或 MySQL 数据库中。
本文仅作为网页爬取的练习。
代码如下：
import requests
from lxml import etree
import pymysql
class getXiCi:
    """Scrape the proxy table on the xicidaili.com front page and store it in MySQL.

    Pipeline: ``main`` -> ``getHtml`` (download) -> ``parseHtml`` (XPath
    extraction) -> ``writeComment`` (insert into the ``xici.xici`` table).
    """

    def __init__(self):
        """Set the target URL, request headers and MySQL connection settings."""
        self.url = 'http://www.xicidaili.com/'
        # Browser-like user agent sent with the page request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
        # Parameters used when connecting to the database.
        self.user = 'root'
        self.pwd = '123456'
        self.host = '127.0.0.1'

    def getHtml(self):
        """Download the page, hand its text to ``parseHtml`` and report success.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                4xx/5xx response (via ``raise_for_status``).
        """
        # A timeout keeps the script from hanging forever on a dead server.
        res = requests.get(self.url, headers=self.headers, timeout=10)
        # Fail loudly instead of parsing an error page and reporting success.
        res.raise_for_status()
        res.encoding = 'utf-8'
        self.parseHtml(res.text)
        print('获取网页成功')

    def parseHtml(self, html):
        """Extract the table cells with XPath and pass them to ``writeComment``.

        Args:
            html: Page markup as text.
        """
        # Build the XPath-capable document tree.
        tree = etree.HTML(html)
        # Column meanings below follow the original author's notes.
        iplist = tree.xpath('//tr/td[2]')    # IP address cells
        portlist = tree.xpath('//tr/td[3]')  # port cells
        addrlist = tree.xpath('//tr/td[4]')  # region cells
        iflist = tree.xpath('//tr/td[5]')    # anonymity-level cells
        typelist = tree.xpath('//tr/td[6]')  # proxy protocol cells

        # Alternative sink (kept from the original as a disabled option):
        # append each complete row to a local text file instead of MySQL.
        #     with open('西刺代理.txt', 'a') as f:
        #         for row in self._build_rows(iplist, portlist, addrlist,
        #                                     iflist, typelist):
        #             f.write(' '.join(row) + '\n')

        # Write to the database.
        self.writeComment(iplist, portlist, addrlist, iflist, typelist)

    @staticmethod
    def _build_rows(iplist, portlist, addrlist, iflist, typelist):
        """Return one 5-tuple of cell texts per position where all five are non-empty.

        The inputs are zipped, so the result is no longer than the shortest
        list. Each element only needs a ``.text`` attribute.
        """
        return [
            tuple(cell.text for cell in cells)
            for cells in zip(iplist, portlist, addrlist, iflist, typelist)
            if all(cell.text for cell in cells)
        ]

    def writeComment(self, iplist, portlist, addrlist, iflist, typelist):
        """Insert the scraped rows into the MySQL table ``xici.xici``.

        Creates the database and the table when they do not exist yet. Rows
        in which any of the five cells has empty/``None`` text are skipped.

        Args:
            iplist, portlist, addrlist, iflist, typelist: Parallel sequences
                of elements exposing ``.text`` (as produced by ``parseHtml``).
        """
        rows = self._build_rows(iplist, portlist, addrlist, iflist, typelist)
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
        # An explicit charset is set because the region column holds Chinese text.
        conn = pymysql.connect(host=self.host, user=self.user,
                               password=self.pwd, charset='utf8mb4')
        try:
            with conn.cursor() as cursor:
                cursor.execute('create database if not exists xici;')
                cursor.execute('use xici')
                cursor.execute('create table if not exists xici(id int primary key auto_increment,IP varchar(20),port varchar(10),address varchar(20),type1 varchar(15),type2 varchar(10))default charset="utf8";')
                if rows:
                    # Placeholders let the driver escape the scraped text;
                    # interpolating it into the SQL string allowed injection
                    # and broke on values containing a double quote.
                    cursor.executemany(
                        'insert into xici(IP,port,address,type1,type2) values(%s,%s,%s,%s,%s)',
                        rows)
            # One commit for the whole batch.
            conn.commit()
        finally:
            # Always release the connection, even if a statement failed.
            conn.close()

    def main(self):
        """Entry point: run one fetch / parse / store pass."""
        self.getHtml()
# Script entry point: build the scraper and run a single pass.
if __name__ == '__main__':
    getXiCi().main()