# 本人新学python,写的第一个爬虫,记录自己的学习过程,并且分享代码,因为刚刚
# 学习代码不够简洁,很多地方考虑不周,有不好的地方望大家指教.一起进步
# coding=utf-8
# 此项目的目的是为了抓取测试各代理免费IP,并测试提供可使用的IP给其他爬虫使用
import requests
import re
import random
import time
import os
from functools import reduce
"""
1.随机报头
2.ip地址池
"""
# 地址栏输入 “about:version”来获取用户代理,伪装成浏览器访问网站
# 注意点如果是txt文件可能存在换行符,在遍历打印显示不出来,在列表中打印能显示,网页提取数据时一样
file_name = "ip_adress.txt"  # name of the file that stores the verified proxy pool
path = "C:/Users/Administrator/Desktop/linxuan/internet_worm_project/"  # directory the pool file lives in
ip_num = 20  # crawl for new proxies when the pool holds fewer usable IPs than this; otherwise just re-verify
sleeptime = 60  # seconds to sleep between main-loop cycles
def Txt_Create(path):
    """Create an empty file at *path* if it does not already exist.

    :param path: full path of the file to create.

    The original opened the file and never closed it; ``with`` closes the
    handle deterministically.
    """
    if os.path.isfile(path):
        print(file_name + '文件已存在')
    else:
        with open(path, 'w'):
            pass  # just create the empty file; nothing to write yet
Txt_Create(path + file_name)  # ensure the pool file exists before the main loop reads it
# Pool of User-Agent strings (desktop, mobile and legacy browsers).
# Hoisted to module level so the list is built once, not on every call.
USER_AGENT_POOL = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]


def get_headers():
    """Build a request-headers dict with a randomly chosen User-Agent.

    :return: dict of the form ``{'User-Agent': <random UA string>}``,
        suitable for ``requests.get(..., headers=...)``.
    """
    return {'User-Agent': random.choice(USER_AGENT_POOL)}
def spider():
    """Crawl the proxy-listing pages and return their raw HTML.

    Pages 1..``page`` of ``raw_url`` are fetched; each page is retried up
    to 10 times.  The first ``cnt - 1`` attempts go through a random proxy
    from the global ``ip_list`` pool; later attempts (or when the pool is
    empty) fall back to the local machine's own IP.

    :return: list of HTML texts, one per successfully fetched page
        (pages that never succeeded are simply absent).
    """
    page = 5  # how many listing pages to fetch
    raw_url = "http://www.89ip.cn/"  # 请换个网址,嘿嘿,后面的正则表达式也做下相应的修改
    cnt = 8  # attempt number from which the local IP is used instead of a proxy
    all_response = []  # raw HTML of every successfully fetched page
    for i in range(page):
        url = raw_url + 'index_%d.html' % (i + 1)
        print("----------")
        print("[spider] 爬取第%d页 "% (i+1))
        for j in range(10):  # up to 10 attempts per page
            try:
                if ip_list and j + 1 < cnt:  # proxy attempts first, if the pool has entries
                    random_ip = random.choice(ip_list)
                    print("[spider] %d 本次调用ip %s: " % (j+1,random_ip))
                    proxies_ip = {
                        "http": random_ip,
                        "https": random_ip,
                    }
                    response = requests.get(url, headers=get_headers(), proxies=proxies_ip, timeout=5)
                else:
                    print("[spider] 本次调用ip : " + "本机ip")
                    response = requests.get(url, headers=get_headers(), timeout=5)
                if response.status_code == 200:
                    all_response.append(response.text)
                    print("[spider] 成功爬取 %d 页" % (i+1))
                    break  # page done; without this break the page would be fetched again
                print("[spider] ip访问失败 ")
            except requests.RequestException:
                # the proxy (or the local connection) failed; try the next attempt
                print("[spider] ip访问失败 ")
        print("==========")
    return all_response
def get_data(all_data, existing=None):
    """Extract ``ip:port`` strings from crawled page HTML.

    :param all_data: concatenated HTML of the crawled listing pages.
    :param existing: iterable of already-known ``ip:port`` entries to skip;
        defaults to the module-level ``ip_list`` pool (backward compatible
        with the original single-argument call).
    :return: list of new ``ip:port`` strings not present in *existing*.
    """
    if existing is None:
        existing = ip_list  # the global pool filled by the main loop
    ip_address = []
    pat = r'<td>[^<]*?(\d*\.\d*\.\d*\.\d*)[^<]*?</td>'  # IP table cell
    pat_1 = r'</td>[^<]*?<td>[^<]*?(\d{2,8})[^<]*?</td>[^<]*?<td>'  # port table cell
    rst = re.compile(pat).findall(all_data)
    rst_1 = re.compile(pat_1, re.S).findall(all_data)
    # Only pair IPs with ports when the two lists line up one-to-one;
    # otherwise the page layout changed and pairing would be garbage.
    if len(rst) == len(rst_1):
        for ip, port in zip(rst, rst_1):
            candidate = ip + ":" + port
            if candidate in existing:
                print("[get_data] %s 已在ip池中,跳过" % candidate)
            else:
                ip_address.append(candidate)
    print("====================")
    return ip_address
def proxy_ip(data):
    """Test each candidate proxy and append the working ones to the pool file.

    :param data: list of candidate ``ip:port`` strings from ``get_data()``.

    Side effects: every proxy that can reach icanhazip.com within 5 seconds
    is appended to ``path + file_name``; entries already in the global
    ``ip_list`` pool are skipped without testing.
    """
    print("[proxy_ip] 抓取数据 %d 条 " % len(data))
    for i, candidate in enumerate(data):
        print("[proxy_ip] %d 测试 ip : %s" % (i+1, candidate))
        if candidate in ip_list:
            print("[proxy_ip] %s 在ip池中跳过测试 " % candidate)
            continue
        proxies = {
            "http": candidate,
            "https": candidate,
        }
        try:
            response = requests.get('http://icanhazip.com', headers=get_headers(), proxies=proxies, timeout=5)
            if response.status_code == 200:
                print("[proxy_ip] 测试成功 " + response.text)
                # 'with' guarantees the handle is closed even if the write fails
                with open(path + file_name, "a") as fh:
                    fh.write(candidate + '\r\n')
            else:
                print("[proxy_ip] ip失效")
        except requests.RequestException:
            print("[proxy_ip] ip失效")
        print("----------")
def update_ip():
    """Re-test every proxy in the pool file and rewrite it with the live ones.

    Reads ``path + file_name`` line by line, checks each proxy against
    icanhazip.com (5 second timeout), then overwrites the file so it
    contains only the entries that still work.  Blank lines are skipped.
    """
    alive = []  # proxies that passed the re-check this round
    with open(path + file_name, 'r', encoding='utf-8') as fh:
        for line in fh:
            data = line.strip('\n')  # drop the trailing newline
            if data == "":
                print("[update_ip] 未抓取到 ip , 程序跳过" + data)
                continue
            print("[update_ip] 检测ip有效性 : " + data)
            proxies = {
                "http": data,
                "https": data,
            }
            try:
                response = requests.get('http://icanhazip.com', headers=get_headers(), proxies=proxies, timeout=5)
                if response.status_code == 200:
                    print("[update_ip] 测试成功 " + response.text)
                    alive.append(data)
                else:
                    print("[update_ip] ip失效")
            except requests.RequestException:
                print("[update_ip] ip失效")
            print("--------------------")
    # Rewrite the pool file with only the proxies that are still alive.
    with open(path + file_name, 'w', encoding='utf-8') as fh:
        for entry in alive:
            fh.write(entry + '\n')
    print("====================")
if __name__ == "__main__":
    # Main maintenance loop: keep the proxy pool topped up and verified.
    while True:
        # Re-read the pool every cycle so ip_list reflects the current file
        # (the file is rewritten by update_ip / appended to by proxy_ip).
        ip_list = []
        with open(path + file_name, 'r') as fh:
            for line in fh:
                entry = line.strip('\n').strip()
                if entry != '':
                    ip_list.append(entry)
        if len(ip_list) < ip_num:
            print(" ip 数量 %d 不满足设定值,启动爬虫 " % len(ip_list))
            # ''.join merges the crawled pages into one string and — unlike the
            # original reduce(lambda x, y: x + y, ...) — does not raise
            # TypeError when spider() returns an empty list (all pages failed).
            pages = "".join(spider())
            proxy_ip(get_data(pages))
            update_ip()
        else:
            print("ip 数量 %d 满足设定值,运行爬虫维护" % len(ip_list))
            print("====================")
            update_ip()
        time.sleep(sleeptime)