I wrote a small crawler that extracts IP addresses and port numbers from the HTML source of an IP proxy listing page. The code is below (I also ran into a bug):
import urllib.request
import urllib.parse
import random
import re

def get_url(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read()
    return html

def get_iplist():
    url = 'http://www.xicidaili.com/wt/'
    html_code = get_url(url).decode('utf-8')
    #print(html_code)
    p = re.compile(r'(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d?\d|2[0-4]\d|25[0-5])')
    # bug: if the last octet of the IP address is 200 or greater, a digit is dropped
    iplist = []
    iplist = p.findall(html_code)
    it = p.finditer(html_code)
    count = 0
    data = []
    for i in it:
        # the port number sits in the <td> cell that follows the matched IP address
        a = html_code.find('<td>', i.end())
        b = html_code.find('</td>', a, a + 50)
        if a == -1 or b == -1:
            print('error')
        data.append(iplist[count] + ':' + html_code[a+4:b])
        count += 1
    return data

if __name__ == "__main__":
    iplist = get_iplist()
    for i in iplist:
        print(i)
Screenshot of the output after running the code:
I still can't quite figure out why that bug happens...
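Looking at the pattern again, a likely explanation is the order of the alternatives. The branch [01]?\d?\d is tried first and can succeed after matching only two digits of an octet such as 203. For the first three octets, the \. that must follow forces the regex engine to backtrack into the longer branches, but nothing follows the final octet, so the match simply stops after two digits whenever the last octet is 200 or greater. Putting the longer branches first avoids the problem; here is a minimal sketch of that fix (the reordered pattern is a suggestion, not part of the script above):

import re

buggy = re.compile(r'(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d?\d|2[0-4]\d|25[0-5])')
# reordered so the longest octet forms (25x, then 2xx) are tried before the 1-2 digit form
fixed = re.compile(r'(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)')

sample = '222.85.39.203'          # sample address for illustration only
print(buggy.findall(sample))      # ['222.85.39.20']  -- last digit dropped
print(fixed.findall(sample))      # ['222.85.39.203'] -- full address matched

With the longer branches first, each octet is matched in full on the first attempt and no trailing context is needed, so swapping p for the reordered pattern in get_iplist() should make the truncation go away.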