Brief introduction:
This walkthrough uses Python 3; the only third-party packages you need to install are requests (to request URLs and fetch page content) and lxml (to parse pages and extract information), e.g. via `pip install requests lxml`.
First, decide where to get the IPs from. This post scrapes the free IP proxies listed on the Xici (西刺) site.
Rough workflow:
- Request a page that lists free IPs (this post uses "http://www.xicidaili.com/nn/").
- Fetch that page's content.
- From the fetched page, extract the useful data (the proxy IP entries).
- Filter the scraped IPs by verifying whether each one actually works (**the free IPs on Xici are not all guaranteed to be usable**).
- Store the usable IP information (IP address and port number) by writing it to a file.
```python
import requests
from lxml import etree


# Store the proxy IP information
def write_proxy(proxies):
    print(proxies)
    for proxy in proxies:
        with open("ip_proxy.txt", 'a+') as f:
            print("Writing:", proxy)
            f.write(proxy + '\n')
    print("Done writing!")


# Parse the page and extract the proxy IPs it contains
def get_proxy(html):
    # Parse the fetched page
    selector = etree.HTML(html)
    proxies = []
    # Extract IP and port from each table row
    # (note: only rows carrying class="odd" are matched by this XPath)
    for each in selector.xpath("//tr[@class='odd']"):
        ip = each.xpath("./td[2]/text()")[0]
        port = each.xpath("./td[3]/text()")[0]
        proxy = ip + ":" + port
        proxies.append(proxy)
    print(len(proxies))
    test_proxies(proxies)


# Verify that the scraped IPs work: request Baidu through each proxy and
# judge usability by the response status code.
def test_proxies(proxies):
    url = "http://www.baidu.com/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    normal_proxies = []
    count = 1
    for proxy in proxies:
        print("Proxy no. %s..." % count)
        count += 1
        try:
            response = requests.get(url, headers=header,
                                    proxies={"http": proxy}, timeout=1)
            if response.status_code == 200:
                print("This proxy works:", proxy)
                normal_proxies.append(proxy)
            else:
                print("This proxy does not work:", proxy)
        except Exception:
            print("This proxy is invalid:", proxy)
    write_proxy(normal_proxies)


def get_html(url):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    response = requests.get(url, headers=header)
    get_proxy(response.text)


if __name__ == "__main__":
    url = "http://www.xicidaili.com/nn/"
    get_html(url)
```
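The script above only writes the working proxies to ip_proxy.txt; it never shows them being used afterwards. As a minimal sketch (assuming ip_proxy.txt already exists, is non-empty, and contains one `ip:port` per line as produced by `write_proxy`), you could load the saved proxies and route a request through a random one like this:

```python
import random

import requests

# Load the proxies saved by write_proxy(): one "ip:port" per line.
# Assumes ip_proxy.txt was produced by the script above and is non-empty.
with open("ip_proxy.txt") as f:
    proxies = [line.strip() for line in f if line.strip()]

proxy = random.choice(proxies)
response = requests.get(
    "http://www.baidu.com/",
    proxies={"http": proxy},  # plain-HTTP proxy, same form as in the test
    timeout=5,
)
print(proxy, response.status_code)
```

Because free proxies tend to die quickly, it is usually worth re-running the scraper shortly before consuming the file.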
Multithreaded version:
In the script above, proxy validation runs serially (the next proxy can only be tested after the previous one finishes), which is **extremely inefficient**.
The workflow is otherwise the same as above.
```python
import threading

import requests
from lxml import etree


# Parse the page and extract the proxy IPs it contains
def get_proxy(html):
    selector = etree.HTML(html)
    proxies = []
    for each in selector.xpath("//tr[@class='odd']"):
        ip = each.xpath("./td[2]/text()")[0]
        port = each.xpath("./td[3]/text()")[0]
        # Join the IP address and port number
        proxy = ip + ":" + port
        proxies.append(proxy)
    print(len(proxies))
    test_proxies(proxies)


def thread_write_proxy(proxy):
    with open("./ip_proxy.txt", 'a+') as f:
        print("Writing:", proxy)
        f.write(proxy + '\n')
    print("Done writing!")


# Per-thread worker: test a single proxy
def thread_test_proxy(proxy):
    url = "http://www.baidu.com/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    try:
        response = requests.get(url, headers=header,
                                proxies={"http": proxy}, timeout=1)
        if response.status_code == 200:
            print("This proxy works:", proxy)
            thread_write_proxy(proxy)
        else:
            print("This proxy does not work:", proxy)
    except Exception:
        print("This proxy is invalid:", proxy)


# Verify the scraped IPs, one thread per proxy
def test_proxies(proxies):
    for proxy in proxies:
        test = threading.Thread(target=thread_test_proxy, args=(proxy,))
        test.start()


def get_html(url):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    response = requests.get(url, headers=header)
    get_proxy(response.text)


if __name__ == "__main__":
    url = "http://www.xicidaili.com/nn/"
    get_html(url)
```
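Starting one `threading.Thread` per proxy works fine for a single page of results, but the thread count is unbounded and several threads may append to ip_proxy.txt at the same time. As an alternative sketch (not what the original script uses), the standard library's `concurrent.futures.ThreadPoolExecutor` caps the number of workers and lets the main thread do all the writing:

```python
from concurrent.futures import ThreadPoolExecutor

import requests

HEADER = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
}


def check(proxy):
    """Return the proxy if Baidu answers 200 through it, else None."""
    try:
        response = requests.get("http://www.baidu.com/", headers=HEADER,
                                proxies={"http": proxy}, timeout=1)
        return proxy if response.status_code == 200 else None
    except Exception:
        return None


def test_proxies(proxies):
    # At most 20 checks run concurrently, instead of one thread per proxy.
    with ThreadPoolExecutor(max_workers=20) as pool:
        working = [p for p in pool.map(check, proxies) if p]
    # Single writer, so appends to the file never interleave.
    with open("ip_proxy.txt", "a") as f:
        for proxy in working:
            print("Writing:", proxy)
            f.write(proxy + "\n")
```

This version is a drop-in replacement for `test_proxies`, `thread_test_proxy`, and `thread_write_proxy`; the rest of the script stays unchanged.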