一、MySQL创建数据库
示例:使用 pymysql 连接本地 MySQL 数据库,创建用于存放台风路径数据的数据表,为后续的数据爬取与入库做准备。
代码如下:
import requests
import json
import pymysql
# Connect to the local MySQL server, using the pre-created "tydb" database.
mydb = pymysql.connect(host='localhost', user='root', password='root', database="tydb")
mycursor = mydb.cursor()

# One-off database creation (kept commented out, run only once):
# mycursor.execute("CREATE DATABASE TYdb")

# Yearly typhoon list endpoint:
# http://typhoon.zjwater.gov.cn/Api/TyphoonList/2022
# Typhoon details by typhoon id:
# http://typhoon.zjwater.gov.cn/Api/TyphoonInfo/202309

# Drop any pre-existing table so that re-running the crawler does not
# append duplicate rows to old data.
table_name = "typhoon"
print("表格创建前:\n")
mycursor.execute("SHOW TABLES")
# fetchall() yields 1-tuples like ('typhoon',); compare the first element.
if any(row[0] == table_name for row in mycursor.fetchall()):
    mycursor.execute(f"DROP TABLE {table_name}")
    print(f"Table {table_name} exists and has been dropped.")

# Recreate the table. Every column is VARCHAR because the API returns all
# fields as text; numeric conversion (if needed) happens downstream.
mycursor.execute(f"""CREATE TABLE IF NOT EXISTS {table_name} (
    name VARCHAR(50),
    tim VARCHAR(50),
    lng VARCHAR(50),
    lat VARCHAR(50),
    centerlat VARCHAR(50),
    centerlng VARCHAR(50),
    enname VARCHAR(50))""")

print("表格创建后:\n")
mycursor.execute("SHOW TABLES")
# BUG FIX: the original executed SHOW TABLES here but never fetched or
# printed the result, so the "after" listing was silently empty.
for row in mycursor.fetchall():
    print(row[0])

# BUG FIX: the original never released the cursor/connection.
mycursor.close()
mydb.close()
二、网络数据爬取
1.数据爬取
代码如下:
# Request headers: a desktop-browser User-Agent plus the site's own
# Referer, both of which the typhoon API expects.
h = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/116.0.0.0 Safari/537.36'),
    'Referer': 'http://typhoon.zjwater.gov.cn',
}
def parse_typhoon(ty_json):
    """Insert every track point of one typhoon into the `typhoon` table.

    Parameters
    ----------
    ty_json : dict
        One typhoon record from the TyphoonInfo API. Assumed to contain
        the keys 'centerlat', 'centerlng', 'enname', 'name' and a
        'points' list whose items carry 'lat', 'lng' and 'time'
        (schema inferred from usage — confirm against the API).
    """
    centerlat = ty_json['centerlat']
    centerlng = ty_json['centerlng']
    enname = ty_json["enname"]
    name = ty_json["name"]
    print("TY name{}".format(name))
    # A fresh connection per typhoon; the table was created beforehand.
    conn = pymysql.connect(host='localhost', user='root', password='root', database="tydb")
    try:
        mycursor = conn.cursor()
        sql = ("INSERT INTO typhoon (name,lng,lat,tim,centerlat,centerlng,enname) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s)")
        rows = []
        for point in ty_json['points']:
            # Normalise the timestamp so it contains no spaces or colons
            # (safe for filenames and simple string comparison).
            tim = point["time"].replace(" ", "-").replace(":", "-")
            rows.append((name, point['lng'], point['lat'], tim,
                         centerlat, centerlng, enname))
        # executemany sends all points in one batch instead of one
        # round trip per point; a single commit keeps the insert atomic.
        mycursor.executemany(sql, rows)
        conn.commit()
    finally:
        # BUG FIX: the original leaked the connection whenever an
        # insert raised before reaching conn.close().
        conn.close()
# Crawl the yearly typhoon lists for 2015-2023 and store each typhoon's
# full track via parse_typhoon().
for year in range(2015, 2024):
    # Yearly list endpoint, e.g.
    # http://typhoon.zjwater.gov.cn/Api/TyphoonList/2022
    urllist = 'http://typhoon.zjwater.gov.cn/Api/TyphoonList/{}'.format(str(year))
    reqlist = requests.get(urllist, headers=h)
    tylist_json = json.loads(reqlist.text)
    print(len(tylist_json))
    # Iterate records directly (the original indexed with range(len(...)),
    # shadowed the builtin `id`, and kept an unused counter `t`).
    for record in tylist_json:
        tfid = record['tfid']
        print(tfid)
        url = 'http://typhoon.zjwater.gov.cn/Api/TyphoonInfo/{}'.format(tfid)
        print(url)
        req = requests.get(url, headers=h)
        ty_json = json.loads(req.text)
        print(ty_json["name"])
        parse_typhoon(ty_json)
        # tfid's last two digits are the typhoon's ordinal within the year.
        print(str(year) + '第' + tfid[-2:] + '个台风插入完成')
2.导出数据
代码如下:
import pymysql
import csv
# Connect to the database and dump the whole `typhoon` table to CSV.
conn = pymysql.connect(host='localhost', user='root', password='root', database="tydb")
try:
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT * FROM typhoon")
        result = cursor.fetchall()
        # newline='' is required by the csv module; explicit utf-8 keeps
        # Chinese typhoon names intact on non-UTF-8 default locales.
        with open('testoutput.csv', 'w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            # cursor.description is one 7-tuple per column; item [0] is the name.
            csv_writer.writerow([col[0] for col in cursor.description])
            csv_writer.writerows(result)
    finally:
        cursor.close()
finally:
    # BUG FIX: the original leaked the connection if any step above raised.
    conn.close()
总结
如果不对数据库中的表进行检查,会导致数据的重复写入,增大后续数据处理的难度;
台风属性数据可以根据网页内容进行增删,这里只保留了我需要的部分数据;
urllist = 'http://typhoon.zjwater.gov.cn/Api/TyphoonList/{}'.format(str(year))
reqlist = requests.get(urllist, headers=h)
tylist_json = json.loads(reqlist.text)
# 注意:json.loads 解析的是响应对象的 text(字符串)内容,而不是文件