I recently needed a database of typhoon data for a project, so I decided to write a Python program for it.
Typhoons before 1977 have no wind-circle (storm/gale radius) data, so each pre-1977 typhoon has only one data table on the site, while typhoons from 1977 onward have an additional table with wind-circle information. I wanted to merge the two tables into a single one.
import re
import requests
import pymysql
url = "http://agora.ex.nii.ac.jp/cgi-bin/dt/search_name2.pl?lang=en&basin=wnp&smp=1&sdp=1&emp=12&edp=31"
response = requests.get(url)
response.encoding = 'utf8'
html = response.text
d = re.findall(r'<td><a href="/digital-typhoon/summary/wnp/s/(.*?)">', html, re.S)  # find all typhoon entries
for p in d:  # iterate over every typhoon page
    id = p[:6]  # get the typhoon ID
    print(id)
    p_url = 'http://agora.ex.nii.ac.jp/digital-typhoon/summary/wnp/l/'+p  # typhoon track information page
    response_1 = requests.get(p_url)
    html_1 = response_1.text
    name = re.findall(r'<div class="TYNAME">(.*?)</div>', html_1, re.S)[0]  # get the typhoon name
    name = ''.join(re.findall(r'[A-Za-z]', name))[7:]  # extract the typhoon name from the string (strip the digits)
    info_1 = re.findall(r'<tr class="ROW(.*?)<td><a', html_1, re.S)  # rows of the typhoon's first table
    flag = 1
    year_list = []
    month_list = []
    day_list = []
    hour_list = []
    lat_list = []
    long_list = []
    pressure_list = []
    wind_list = []
    level_class_list = []
    gust_list = []
    direc_major_storm_list = []
    rad_major_storm_list = []
    rad_minor_storm_list = []
    direc_major_gale_list = []
    rad_major_gale_list = []
    rad_minor_gale_list = []
    for pp in info_1:  # iterate over each record and extract the values
        year = re.findall(r'<td>(.*?)</td>', pp, re.S)[0]
        year_list.append(year)
        month = re.findall(r'<td>(.*?)</td>', pp, re.S)[1]
        month_list.append(month)
        day = re.findall(r'<td>(.*?)</td>', pp, re.S)[2]
        day_list.append(day)
        hour = re.findall(r'<td>(.*?)</td>', pp, re.S)[3]
        hour_list.append(hour)
        lat = re.findall(r'<td>(.*?)</td>', pp, re.S)[4]
        lat_list.append(lat)
        long = re.findall(r'<td>(.*?)</td>', pp, re.S)[5]
        long_list.append(long)
        pressure = re.findall(r'<td>(.*?)</td>', pp, re.S)[6]
        pressure_list.append(pressure)
        wind = re.findall(r'<td>(.*?)</td>', pp, re.S)[7]
        wind_list.append(wind)
        level_class = re.findall(r'<td>(.*?)</td>', pp, re.S)[8]
        level_class_list.append(level_class)
    l = 'http://agora.ex.nii.ac.jp/digital-typhoon/summary/wnp/k/'+p  # page with the second table (gust / wind-circle data)
    response_2 = requests.get(l)
    html_2 = response_2.text
    info_2 = re.findall(r'<tr class="ROW(.*?)</tr>', html_2, re.S)
    if info_2:  # check whether the second table has any rows
        for pp in info_2:  # iterate over each record and extract the values
            gust = re.findall(r'<td>(.*?)</td>', pp, re.S)[7]
            gust_list.append(gust)
            direc_major_storm = re.findall(r'<td>(.*?)</td>', pp, re.S)[8]
            direc_major_storm_list.append(direc_major_storm)
            rad_major_storm = re.findall(r'<td>(.*?)</td>', pp, re.S)[9]
            rad_major_storm_list.append(rad_major_storm)
            rad_minor_storm = re.findall(r'<td>(.*?)</td>', pp, re.S)[10]
            rad_minor_storm_list.append(rad_minor_storm)
            direc_major_gale = re.findall(r'<td>(.*?)</td>', pp, re.S)[11]
            direc_major_gale_list.append(direc_major_gale)
            rad_major_gale = re.findall(r'<td>(.*?)</td>', pp, re.S)[12]
            rad_major_gale_list.append(rad_major_gale)
            rad_minor_gale = re.findall(r'<td>(.*?)</td>', pp, re.S)[13]
            rad_minor_gale_list.append(rad_minor_gale)
    else:
        flag = 0
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='typhoon', charset='utf8')  # connect to the database
    cur = conn.cursor()
    sql = "create table `" + id + "`(Number VARCHAR(255),Name VARCHAR(255),Year VARCHAR(255),Month VARCHAR(255),Day VARCHAR(255),Hour VARCHAR(255),Lat VARCHAR(255),Lon VARCHAR(255),Pressure VARCHAR(255),wind VARCHAR(255),class VARCHAR(255)," + \
          "gust VARCHAR(255),Direc_of_Major_Storm_Axis VARCHAR(255),Radius_of_Major_Storm_Axis VARCHAR(255),Radius_of_Minor_Storm_Axis VARCHAR(255)," + \
          "Direc_of_Major_Gale_Axis VARCHAR(255),Radius_of_Major_Gale_Axis VARCHAR(255),Radius_of_Minor_Gale_Axis VARCHAR(255))"
    # Note: when the table name is all digits it must be wrapped in backticks, not single quotes (the backtick is the key to the left of the 1/! key)
    cur.execute(sql)  # create the new table
    for i in range(0, len(year_list)):  # iterate over the lists and write the records
        query = "insert into `" + id + "`(Number,Name,Year,Month,Day,Hour,Lat,Lon,Pressure,wind,class,Gust,Direc_of_Major_Storm_Axis," + \
                "Radius_of_Major_Storm_Axis,Radius_of_Minor_Storm_Axis,Direc_of_Major_Gale_Axis," + \
                "Radius_of_Major_Gale_Axis,Radius_of_Minor_Gale_Axis) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # insert one record
        if flag == 1:  # does the second table exist?
            newsObjs = [id, name, year_list[i], month_list[i], day_list[i], hour_list[i], lat_list[i], long_list[i],
                        pressure_list[i], wind_list[i], level_class_list[i], gust_list[i], direc_major_storm_list[i],
                        rad_major_storm_list[i], rad_minor_storm_list[i], direc_major_gale_list[i], rad_major_gale_list[i], rad_minor_gale_list[i]]
        else:  # pre-1977 typhoon: fill the wind-circle columns with '-'
            newsObjs = [id, name, year_list[i], month_list[i], day_list[i], hour_list[i], lat_list[i], long_list[i],
                        pressure_list[i], wind_list[i], level_class_list[i], '-', '-', '-', '-', '-', '-', '-']
        cur.execute(query, newsObjs)
        conn.commit()
    cur.close()
    conn.close()
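Once the tables are built, a single typhoon can be read back directly from MySQL. Here is a minimal sketch, assuming the same connection settings as above; the table name 197706 is only an example, and note that the purely numeric table name again has to be quoted with backticks:
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='typhoon', charset='utf8')
cur = conn.cursor()
# purely numeric table names must be quoted with backticks, not single quotes
cur.execute("select Year, Month, Day, Hour, Lat, Lon, Pressure, wind from `197706`")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()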
In the end this still felt too cumbersome, so I switched to pandas and got it done in a few lines.
import requests
import re
import pandas
from lxml import etree
url = "http://agora.ex.nii.ac.jp/cgi-bin/dt/search_name2.pl?lang=en&basin=wnp&smp=1&sdp=1&emp=12&edp=31"
response = requests.get(url)
response.encoding = 'utf8'
html = response.text
d = re.findall(r'<td><a href="/digital-typhoon/summary/wnp/s/(.*?)">', html, re.S)  # find all typhoon entries
print(d)
for p in d:  # iterate over every typhoon page
    id = p[:6]  # get the typhoon ID
    if int(id) >= 197701:  # only typhoons from 1977 onward have the detailed wind-circle data
        p_url = 'http://agora.ex.nii.ac.jp/digital-typhoon/summary/wnp/k/' + p  # typhoon track information page
        response_1 = requests.get(p_url)
        html_1 = etree.HTML(response_1.text)
        table = html_1.xpath('//table[@class="TRACKINFO"]')
        table = etree.tostring(table[0], encoding='utf-8').decode()
        df = pandas.read_html(table, encoding='utf-8', header=0)[0]
        df.to_csv("E:\\typhoon\\"+id+".csv", mode='a', encoding='utf_8_sig', header=0, index=0)
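If the per-typhoon CSV files need to be merged later, they can be stacked back into one DataFrame. A minimal sketch, assuming the files were written to E:\typhoon as above and that each file name starts with the six-digit typhoon ID:
import glob
import os
import pandas

frames = []
for path in glob.glob("E:\\typhoon\\*.csv"):
    df = pandas.read_csv(path, header=None)         # the files were saved without a header row
    df.insert(0, 'ID', os.path.basename(path)[:6])  # recover the typhoon ID from the file name
    frames.append(df)
all_tracks = pandas.concat(frames, ignore_index=True)
print(all_tracks.shape)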