I've been busy learning to drive lately and haven't done anything serious, so here is something I wrote a few days ago for everyone to play with. The recent earthquake is what gave me the idea to build it...
The script scrapes earthquake records from the China Earthquake Networks Center (中国地震台网).
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request as request
import re
# URL of the advanced-search page that serves the data
mainurl = r'http://www.ceic.ac.cn/AdvSearchHandler'
# POST body for the first request (empty filters match every earthquake)
mainpostdata = b'currentPageNo=1&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo=&latitudeMin=&latitudeMax=&depthMin=&depthMax=&magnitudeMin=&magnitudeMax='
# regex that pulls the result count and page count out of the first page
pageinforegex = r'<td width="60%">共搜索到([\d]+)条符合条件的地震信息,共([\d]+)页,正显示第1页</td>'.encode('utf-8')
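# Illustrative only: on a first page reading "共搜索到123条符合条件的地震信息,共7页,正显示第1页",
# re.findall(pageinforegex, page) would return [(b'123', b'7')] -- total records and total pages.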
# the per-page URL is built inside crawler() below
# POST body that pages through the results
detailpostdata = b'advSearchActionType=pageDown'
# regex that captures one table row of earthquake data
detailregex = rb'<td width="150px" nowrap>[\w\W]+?</td>[\w\W]+?</tr>'
def crawler():
    pageinfo = getPageNum(mainurl, mainpostdata, pageinforegex)
    if pageinfo is not None:
        # currentPageNo appears to be 1-indexed (the first request posts currentPageNo=1),
        # so walk pages 1 through totalpagenum inclusive
        for pageno in range(1, int(pageinfo[1]) + 1):
            detailurl = r'http://www.ceic.ac.cn/AdvSearchHandler?currentPageNo=' + str(pageno) + '&longtitudeMin=&longtitudeMax=&periodFrom=&periodTo=&latitudeMin=&latitudeMax=&depthMin=&depthMax=&magnitudeMin=&magnitudeMax='
            resultdata = getPageData(detailurl, detailpostdata, detailregex)
            data2file(resultdata)
# fetch the first page and read off the total number of result pages
def getPageNum(url, postdata, pageinforegex):
    req = request.Request(url, postdata)  # supplying data makes urllib send a POST
    page = request.urlopen(req).read()
    pageinfo = re.findall(pageinforegex, page)
    if not pageinfo or len(pageinfo[0]) != 2:
        print('GetInfoError!')
        return None
    totalnum, totalpagenum = pageinfo[0]
    print('Total records: %s' % totalnum.decode('utf-8'))
    print('Total pages: %s' % totalpagenum.decode('utf-8'))
    return pageinfo[0]
# scrape one results page; returns a list of six-field records, one per earthquake
def getPageData(detailurl, detailpostdata, detailregex):
    req = request.Request(detailurl, detailpostdata)
    page = request.urlopen(req).read()
    pagedata = re.findall(detailregex, page)
    resultdata = []
    for data in pagedata:
        # each row carries six cells: time, magnitude, latitude, longitude, depth, location;
        # strip the opening <td ...> tag and surrounding whitespace from each cell
        fields = [re.sub(rb'^\s*<td[^>]*>', b'', cell).strip()
                  for cell in data.split(b'</td>')[:6]]
        resultdata.append(fields)
        for x in fields:
            print(x.decode('utf-8'))
    return resultdata
# append the records to a file
def data2file(resultdata):
    with open('earthquake.txt', 'a+', encoding='utf-8') as fp:
        for record in resultdata:
            fp.write('<time>\n' + record[0].decode('utf-8') + '\n</time>\n')
            fp.write('<level>\n' + record[1].decode('utf-8') + '\n</level>\n')
            fp.write('<latitude>\n' + record[2].decode('utf-8') + '\n</latitude>\n')
            fp.write('<longitude>\n' + record[3].decode('utf-8') + '\n</longitude>\n')
            fp.write('<depth>\n' + record[4].decode('utf-8') + '\n</depth>\n')
            fp.write('<location>\n' + record[5].decode('utf-8') + '\n</location>\n')
if __name__ == "__main__":
    print("Crawler starting...\n")
    crawler()
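If you want to sanity-check the row parsing without hitting the server, the same regex and cell-splitting logic can be run against a handmade snippet. The HTML and values below are invented to match the tag shapes the script targets; they are placeholders, not real CEIC data:

import re

# invented sample row, shaped like the markup detailregex looks for
sample = (b'<td width="150px" nowrap>2013-01-01 00:00:00</td>\r\n'
          b'    <td width="50px" nowrap>5.0</td>\r\n'
          b'    <td width="60px" nowrap>30.00</td>\r\n'
          b'    <td width="60px" nowrap>103.00</td>\r\n'
          b'    <td width="60px" nowrap>10</td>\r\n'
          b'    <td align="left">somewhere (placeholder)</td>\r\n</tr>')

detailregex = rb'<td width="150px" nowrap>[\w\W]+?</td>[\w\W]+?</tr>'
for row in re.findall(detailregex, sample):
    fields = [re.sub(rb'^\s*<td[^>]*>', b'', cell).strip()
              for cell in row.split(b'</td>')[:6]]
    print([f.decode('utf-8') for f in fields])
# prints: ['2013-01-01 00:00:00', '5.0', '30.00', '103.00', '10', 'somewhere (placeholder)']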
A screenshot of the collected data is shown below.