新项目要开发 AIS 设备，需要用到船舶的 MMSI 信息，于是自己用 Python 写了个简易的爬虫。
目标网站的接口直接返回 JSON 数据，省去了不少麻烦，可以直接采集入库。代码如下：
import sys
import time
import pymysql
import requests
# Shared MySQL connection and cursor used by every function in this script.
# NOTE(review): credentials are hard-coded for a local development server.
db = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='root',
    database='bms_company_ship',
)
cursor = db.cursor()
def get_html(url, timeout=30) -> dict:
    """Fetch *url* and return its body decoded as JSON.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        The decoded JSON payload on success. On any failure (network error,
        timeout, body that is not valid JSON, ...) the error is printed and
        the sentinel ``{'type': 'Error'}`` is returned instead of raising.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69",
    }
    try:
        r = requests.get(url=url, headers=headers, timeout=timeout)
        # Force UTF-8 so the body is decoded consistently regardless of the
        # charset the server advertises.
        r.encoding = 'utf-8'
        return r.json()
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the script.
        print("Error '%s' happened on line %d" % (exc, exc.__traceback__.tb_lineno))
        return {'type': 'Error'}
def execute_mysql(sql, params=None) -> bool:
    """Execute one SQL statement on the shared cursor and commit it.

    Args:
        sql: The statement to run. May contain ``%s`` placeholders when
            *params* is given.
        params: Optional sequence or mapping of values bound to the
            placeholders by the driver. Leave as ``None`` to run *sql* as-is.

    Returns:
        ``True`` if the statement was executed and committed; ``False`` if it
        failed, in which case the transaction is rolled back and the error is
        printed.
    """
    try:
        cursor.execute(sql, params)
        db.commit()
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that Ctrl-C still
        # interrupts a long import instead of being swallowed per row.
        db.rollback()
        print("MySQL statement failed: %s" % exc)
        return False
    return True
def main(start_page=1, stop_page=1994, delay=0.5):
    """Crawl the ship-particulars endpoint page by page and store each ship.

    Args:
        start_page: First value sent as the ``offset`` query parameter.
        stop_page: Crawling stops before this value (exclusive bound).
        delay: Seconds to sleep between two requests.

    Every record of a page is inserted into ``bms_company_ship``. A page that
    cannot be fetched or has no ``data`` list is reported and skipped rather
    than aborting the run. The shared database connection is always closed
    when the function exits, including on errors and Ctrl-C.
    """
    insert_sql = (
        "INSERT INTO `bms_company_ship` (`shipname`, `mmsi`, `callsign`, `imo`) "
        "VALUES (%s, %s, %s, %s);"
    )
    try:
        for page in range(start_page, stop_page):
            # NOTE(review): the counter advances by 1 while `limit` is 100; this
            # assumes the API treats `offset` as a page index - TODO confirm.
            htmldata = get_html(
                'https://www.hifleet.com/particulars/getShipData?offset=%d&limit=100' % page)
            records = htmldata.get('data') if isinstance(htmldata, dict) else None
            if not isinstance(records, list):
                # get_html() returns {'type': 'Error'} on failure, which has
                # no 'data' key; skip the page instead of crashing.
                print("Page %d returned no usable data, skipped" % page)
                time.sleep(delay)
                continue
            for ship in records:
                try:
                    values = (ship['ShipName'], ship['mmsi'], ship['callsign'], ship['imo'])
                except (KeyError, TypeError):
                    print("Page %d: malformed record skipped: %r" % (page, ship))
                    continue
                # Let the driver quote and escape the values: names containing
                # quotes no longer break the statement or allow SQL injection,
                # and a JSON null is stored as NULL instead of the text 'None'.
                sql = cursor.mogrify(insert_sql, values)
                execute_mysql(sql)
            print("采集海事数据 第%d页" % page)
            time.sleep(delay)
    finally:
        db.close()


if __name__ == "__main__":
    main()
如果需要现成的数据，可以直接下载：