查看源码 使用xpath解析标签
import requests
import parsel
# Scrape the first page of the kuaidaili free proxy list and collect every
# row as a single-entry dict {proxy_type: "ip:port"} in proxies_list.
proxies_list = []
url = "https://www.kuaidaili.com/free/"
hander = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=hander, timeout=30)
data = r.text
html_data = parsel.Selector(data)
tr_parse = html_data.xpath(
    '//table[@class="table table-bordered table-striped"]/tbody/tr')
for tr in tr_parse:
    http_type = tr.xpath('./td[4]/text()').extract_first()
    ip = tr.xpath('./td[1]/text()').extract_first()
    ip_port = tr.xpath('./td[2]/text()').extract_first()
    # extract_first() yields None when the cell has no text node; skip such
    # rows instead of crashing on None + ':' below.
    if http_type is None or ip is None or ip_port is None:
        continue
    proxies_list.append({http_type: ip + ':' + ip_port})
print(proxies_list)
def check_ip(proxies_list, test_url='http://www.baidu.com', timeout=0.1):
    """Return the proxies from ``proxies_list`` that can fetch ``test_url``.

    Args:
        proxies_list: iterable of single-entry dicts mapping a proxy type to
            a ``"host:port"`` string, e.g. ``{'HTTP': '1.2.3.4:8080'}``.
        test_url: URL requested through each proxy.
        timeout: seconds to wait before a proxy is discarded.

    Returns:
        A list with the original dicts whose proxy answered with HTTP 200,
        in input order. Entries without an address are skipped.
    """
    hander = {"User-Agent": "Mozilla/5.0"}
    can_use = []
    for ip in proxies_list:
        address = next(iter(ip.values()), None)
        if not address:
            continue
        # requests looks proxies up by lowercase URL scheme ('http'/'https'),
        # so the scraped {'HTTP': 'host:port'} dict cannot be passed as-is;
        # without an explicit mapping the request would bypass the proxy.
        proxy_url = address if '://' in address else 'http://' + address
        try:
            response = requests.get(
                test_url,
                headers=hander,
                timeout=timeout,
                proxies={'http': proxy_url, 'https': proxy_url})
            if response.status_code == 200:
                can_use.append(ip)
        except Exception as e:
            # Best effort: report the failing proxy and keep checking.
            print(ip, e)
    return can_use
print(check_ip(proxies_list)) # Print the proxies that passed the quality check
{'HTTP': '125.94.44.129:1080'} HTTPConnectionPool(host='www.baidu.com', port=80): Read timed out. (read timeout=0.1)
[{'HTTP': '60.190.250.120:8080'}, {'HTTP': '118.112.195.91:9999'}, {'HTTP': '110.243.5.163:9999'}, {'HTTP': '118.89.91.108:8888'}, {'HTTP': '125.122.199.13:9000'}, {'HTTP': '171.11.28.248:9999'}, {'HTTP': '211.152.33.24:39406'}, {'HTTP': '59.62.35.130:9000'}, {'HTTP': '123.163.96.124:9999'}, {'HTTP': '125.117.135.10:9000'}, {'HTTP': '175.44.108.164:9999'}, {'HTTP': '110.243.15.228:9999'}, {'HTTP': '1.193.245.47:9999'}, {'HTTP': '59.62.24.87:9000'}]
使用代理ip池来访问:
# Proxies collected by the crawl are appended to this list.
proxies_list = []
# Seed pool: one {'HTTP': 'host:port'} dict per known proxy address.
proxy = [
    {'HTTP': address}
    for address in (
        '60.190.250.120:8080',
        '118.112.195.91:9999',
        '110.243.5.163:9999',
        '118.89.91.108:8888',
        '125.122.199.13:9000',
        '171.11.28.248:9999',
        '211.152.33.24:39406',
        '59.62.35.130:9000',
        '123.163.96.124:9999',
        '125.117.135.10:9000',
        '175.44.108.164:9999',
        '110.243.15.228:9999',
        '1.193.245.47:9999',
        '59.62.24.87:9000',
    )
]
# Crawl pages 1-4 of the free proxy list. Each page is requested through the
# seed proxies in turn until one of them answers with HTTP 200; the rows of
# that response are appended to proxies_list.
for a in range(1, 5):
    url = "https://www.kuaidaili.com/free/inha/" + str(a) + "/"
    hander = {"User-Agent": "Mozilla/5.0"}
    for i in proxy:
        address = next(iter(i.values()), None)
        if not address:
            continue
        # requests looks proxies up by lowercase URL scheme, so the
        # {'HTTP': 'host:port'} dict is translated into an explicit mapping;
        # passed as-is it would be ignored and the request would go direct.
        proxy_url = address if '://' in address else 'http://' + address
        try:
            r = requests.get(url, headers=hander, timeout=1,
                             proxies={'http': proxy_url, 'https': proxy_url})
        except requests.RequestException as e:
            # A dead or slow proxy must not abort the crawl; try the next one.
            print(i, e)
            continue
        if r.status_code != 200:
            continue
        html = r.text
        html_parsel_data = parsel.Selector(html)
        tr_parse = html_parsel_data.xpath(
            '//table[@class="table table-bordered table-striped"]/tbody/tr')
        for tr in tr_parse:
            http_type = tr.xpath('./td[4]/text()').extract_first()
            ip = tr.xpath('./td[1]/text()').extract_first()
            ip_port = tr.xpath('./td[2]/text()').extract_first()
            # extract_first() yields None for a cell without text; skip the
            # row rather than fail on None + ':'.
            if http_type is None or ip is None or ip_port is None:
                continue
            proxy_dict = {http_type: ip + ':' + ip_port}
            proxies_list.append(proxy_dict)
        # The page was fetched; the remaining seed proxies are not needed.
        break
def check_ip(proxies_list, test_url='http://www.baidu.com', timeout=0.1):
    """Return the proxies from ``proxies_list`` that can fetch ``test_url``.

    Args:
        proxies_list: iterable of single-entry dicts mapping a proxy type to
            a ``"host:port"`` string, e.g. ``{'HTTP': '1.2.3.4:8080'}``.
        test_url: URL requested through each proxy.
        timeout: seconds to wait before a proxy is discarded.

    Returns:
        A list with the original dicts whose proxy answered with HTTP 200,
        in input order. Entries without an address are skipped.
    """
    hander = {"User-Agent": "Mozilla/5.0"}
    can_use = []
    for ip in proxies_list:
        address = next(iter(ip.values()), None)
        if not address:
            continue
        # requests looks proxies up by lowercase URL scheme ('http'/'https'),
        # so the scraped {'HTTP': 'host:port'} dict cannot be passed as-is;
        # without an explicit mapping the request would bypass the proxy.
        proxy_url = address if '://' in address else 'http://' + address
        try:
            response = requests.get(
                test_url,
                headers=hander,
                timeout=timeout,
                proxies={'http': proxy_url, 'https': proxy_url})
            if response.status_code == 200:
                can_use.append(ip)
        except Exception as e:
            # Best effort: report the failing proxy and keep checking.
            print(ip, e)
    return can_use
print(check_ip(proxies_list))  # Print the high-quality proxy IPs
[{'HTTP': '175.42.128.48:9999'}, {'HTTP': '123.101.212.223:9999'}, {'HTTP': '60.190.250.120:8080'}, {'HTTP': '125.94.44.129:1080'}, {'HTTP': '118.112.195.91:9999'}, {'HTTP': '110.243.5.163:9999'}, {'HTTP': '118.89.91.108:8888'}, {'HTTP': '125.122.199.13:9000'}, {'HTTP': '171.11.28.248:9999'}, {'HTTP': '211.152.33.24:39406'}, {'HTTP': '59.62.35.130:9000'}, {'HTTP': '123.163.96.124:9999'}, {'HTTP': '125.117.135.10:9000'}, {'HTTP': '175.44.108.164:9999'}, {'HTTP': '110.243.15.228:9999'}, {'HTTP': '59.62.24.87:9000'}, {'HTTP': '113.124.93.190:9999'}, {'HTTP': '119.119.239.155:9000'}, {'HTTP': '60.13.42.157:9999'}, {'HTTP': '180.104.63.242:9000'}, {'HTTP': '175.42.68.223:9999'}, {'HTTP': '1.198.73.202:9999'}, {'HTTP': '125.108.76.226:9000'}, {'HTTP': '106.75.177.227:8111'}, {'HTTP': '124.93.201.59:42672'}, {'HTTP': '121.233.206.211:9999'}, {'HTTP': '175.44.109.104:9999'}, {'HTTP': '118.212.104.240:9999'}, {'HTTP': '163.204.240.107:9999'}, {'HTTP': '60.13.42.77:9999'}, {'HTTP': '49.89.86.30:9999'}, {'HTTP': '106.42.217.26:9999'}, {'HTTP': '115.29.170.58:8118'}, {'HTTP': '183.166.133.196:9999'}, {'HTTP': '114.223.208.165:8118'}, {'HTTP': '175.44.109.71:9999'}, {'HTTP': '163.204.244.219:9999'}, {'HTTP': '210.5.10.87:53281'}, {'HTTP': '123.101.213.137:9999'}, {'HTTP': '171.15.49.169:9999'}, {'HTTP': '1.198.72.171:9999'}, {'HTTP': '125.108.101.220:9000'}, {'HTTP': '36.250.156.85:9999'}, {'HTTP': '123.169.167.44:9999'}, {'HTTP': '123.169.167.44:9999'}, {'HTTP': '115.219.168.69:8118'}, {'HTTP': '1.199.30.73:9999'}, {'HTTP': '222.74.65.69:56210'}, {'HTTP': '110.243.26.53:9999'}, {'HTTP': '171.13.7.108:9999'}, {'HTTP': '175.43.151.48:9999'}, {'HTTP': '1.193.245.3:9999'}, {'HTTP': '163.204.240.35:9999'}, {'HTTP': '113.195.16.66:9999'}, {'HTTP': '27.43.188.27:9999'}, {'HTTP': '113.208.115.190:8118'}, {'HTTP': '125.110.100.170:9000'}, {'HTTP': '1.198.72.19:9999'}, {'HTTP': '121.232.199.174:9000'}]
xpath 语法
获取 href 属性和文本