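# Requirements: scrape proxy IPs from the Xici proxy site (xicidaili.com) and
# test each one by requesting Baidu. Working IPs are added to a list and
# displayed; IPs that do not work are skipped (pass).
# (Plain sequential scraping; no multiprocessing.)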
import requests
from lxml import etree
import time
def get_all_proxy():
url = 'http://www.xicidaili.com/nn/1'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}
response = requests.get(url,headers=headers)
# with open('xici.html','wb') as f:
# f.write(response.content)
# 只需要ip及端口号,提取
html_ele = etree.HTML(response.text)
ip_ele = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()') #要看打印在html中的文件,因为有的网页中有而没打印出来,以自己打印的为主
port_ele = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
# print(len(ip_ele))
# print(le