网址:
http://iqingyun.cyol.com/mobile/practice/tasklist.html
爬取的时候队伍数仍在增多,网页数也会随着队伍数目增多有所变化,其中有部分队伍信息的html结构与其它队伍不同,需要利用xpath对其进行提取。
注意做好异常处理,因为有四百多个页面。
import urllib.request
import re
from lxml import etree
import xlsxwriter
import datetime
import time
# --- Crawler setup ---------------------------------------------------------
# Install a process-wide opener that presents a desktop-Chrome User-Agent,
# so every urllib.request.urlopen() call below is served the same pages a
# real browser would see (the site may block the default Python UA).
_UA = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
       " Chrome/65.0.3325.146 Safari/537.36")
headers = ("User-Agent", _UA)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
# Main loop: once an hour, scrape all listing pages (~438 at the time of
# writing) and dump the collected rows to a timestamped .xlsx workbook.
# Columns per row: 0 = team pid, 1 = <h2> title, 2 = <h5> detail text,
# 3 = like ("zan") count.
while True:
    udata = []   # <h2 class="h2"> titles, one per team entry
    tddata = []  # <h5> detail strings, extracted via xpath
    idaz = []    # (pid, like-count) tuples from the zan_num spans
    # Timestamp used in the output filename ('+' instead of ':' so the
    # string is a legal Windows filename).
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H+%M+%S')
    for i in range(1, 439):
        # The try sits INSIDE the page loop: with 400+ pages, one bad page
        # (network hiccup, odd HTML) must not abort the remaining pages.
        try:
            data = urllib.request.urlopen(
                "http://iqingyun.cyol.com/mobile/practice/tasklist.html?pageIndex=" + str(i)
            ).read().decode("utf-8", "ignore")
            # Team titles via regex over the raw HTML.
            tudata = re.findall('<h2 class="h2">(.*?)</h2>', data)
            udata.extend(tudata)
            # Some entries have a different HTML structure, so the detail
            # line is pulled per-entry with an explicit xpath position.
            tree = etree.HTML(data)
            for z in range(1, len(tudata) + 1):
                nodes = tree.xpath('//*[@id="c1c041"]/div[' + str(z) + ']/div/a[2]/h5')
                # .text can be None for oddly structured entries; keep the
                # row aligned with udata by storing '' instead of crashing.
                tddata.append(nodes[0].text or '')
            # (pid, like-count) pairs from the vote spans.
            idaz.extend(re.findall(
                '<span class="zan_num" id=".*?" pid="(.*?)">(.*?)</span>', data))
            print('第' + str(i) + '个页面信息网址爬取成功')
        except Exception as e:
            # Best-effort: log and continue with the next page.
            print(e)
    print(len(tddata))
    print(len(idaz))
    # NOTE(review): "E/test/ph" looks like it was meant to be "E:/test/ph"
    # (Windows drive path) — confirm before changing, it alters output location.
    workbook = xlsxwriter.Workbook("E/test/ph" + str(now_time) + ".xlsx")
    worksheet = workbook.add_worksheet()
    for i in range(0, len(idaz)):
        # idaz drives the row count; udata/tddata can be shorter when a page
        # partially failed, so each row write is individually guarded.
        try:
            worksheet.write(i, 0, idaz[i][0] if idaz[i][0] else ' ')
            worksheet.write(i, 1, udata[i] if udata[i] else ' ')
            worksheet.write(i, 2, tddata[i] if tddata[i] else ' ')
            worksheet.write(i, 3, idaz[i][1] if idaz[i][1] else ' ')
        except Exception as e:
            print(e)
    workbook.close()
    # Re-scrape every hour — the ranking data changes over time and is
    # worth sampling repeatedly (see the author's note below).
    time.sleep(3600)
后来发现提取的排行榜有不小的信息量(很有分析的意义),所以将代码改成了一个小时自动提取一次信息。
2018-10-7
2018-10-27