1. Scraping scenario:
Most scraping jobs involve a list page plus the detail pages it links to. The usual approach is to scrape the list page first, saving both the summary fields and the detail-page urls, and then read those urls back out and scrape each detail page in turn.
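As a rough sketch of that first list-page pass (the url handling and the 'table a' CSS selector below are placeholders rather than any real site's markup, so adapt them to the page you are scraping):

import requests
from bs4 import BeautifulSoup

def collect_detail_urls(list_url):
    # fetch the list page and pull out every detail-page link it contains
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = requests.get(list_url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    urls = []
    # placeholder selector; copy the real one from Chrome devtools
    for a in soup.select('table a'):
        href = a.get('href')
        if href:
            urls.append(href)
    return urls

The returned urls can then be written to a csv (or straight into a database) and fed to the detail-page scraper described below.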
2. Scraping steps:
a. Use the pandas data-cleaning package or numpy to work with the csv file: read it, drop duplicates, save it back, or write it straight into a database (a short pandas sketch follows this list).
b. Use the requests package to send a GET/POST request to each url and get the HTML source. This only covers pages that are not loaded dynamically, i.e. printing the source also prints the information you need; if it does not, a browser-based collection tool is more convenient.
c. Once you have the source, use the BeautifulSoup package's select and find_all methods to pull out the fields you want. Usually it is enough to copy the element's CSS selector from Chrome's developer tools.
d. Append the extracted values to lists.
e. Finally write the list data to the database. When looping over urls, save each url's data as soon as it is scraped instead of waiting until every url has been read: with a large data set the job may die partway through and all the data would be lost, whereas saving one record at a time lets you pick up where it stopped (see the second sketch after this list).
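Step a by itself is only a few lines. A minimal pandas sketch, assuming a placeholder file urls.csv whose first column holds the detail-page urls:

import pandas as pd

data = pd.read_csv("urls.csv", encoding="utf_8_sig")
# drop_duplicates() removes duplicate rows; the encoding keeps the saved file from turning into mojibake
data = data.drop_duplicates()
data.to_csv("urls.csv", index=False, encoding="utf_8_sig")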
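For steps b, c and e the key point is that each url is fetched, parsed and written to the database inside the loop, so a crash only costs the record that was in flight. A minimal sketch, where the selector, table, column and connection settings are placeholders (the real values appear in the full code below):

import requests
import MySQLdb
from bs4 import BeautifulSoup

def scrape_and_save(urls):
    headers = {'User-Agent': 'Mozilla/5.0'}
    for url in urls:
        try:
            # step b: fetch the detail page
            html = requests.get(url, headers=headers).text
            # step c: pull the wanted field out of the page (placeholder selector)
            soup = BeautifulSoup(html, 'lxml')
            mmsi = soup.select('td.mmsi')[0].get_text()
            # step e: save this url's data immediately
            conn = MySQLdb.Connect(host='127.0.0.1', port=3306, user='user',
                                   passwd='pass', db='demo', charset='utf8')
            cursor = conn.cursor()
            cursor.execute("INSERT INTO t_demo(mmsi) VALUES(%s)", [mmsi])
            conn.commit()
            cursor.close()
            conn.close()
        except Exception as e:
            # one bad page or row does not kill the whole run
            print(e)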
3. Full code:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# pandas display options: show the full csv content when printing instead of truncating it with "..."
pd.set_option('display.max_columns',28000)
pd.set_option('display.width', 28000)
pd.set_option('display.max_colwidth',28000)
import MySQLdb
import datetime
import time
import sys
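# Python 2 only: reload sys so setdefaultencoding is available again and the default string encoding can be forced to UTF-8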
reload(sys)
sys.setdefaultencoding('utf8')
# mmsi
mmsiList = []
# country of each vessel
cityList = []
# size
sizeList = []
# callSign
callSignList = []
# GRT
grtList = []
# DWT
dwtList = []
# owner
ownerList = []
# build
buildList = []

# Send the request and return the HTML response
def get_html(url):
    # spoof a browser User-Agent so the site treats the request as a normal visitor
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    # proxy fallback: a free proxy usually only stays alive for a short while, swap in your own
    # proxies = {'http': '111.23.10.27:8080'}
    # try:
    #     print ("-------------------------")
    #     # plain GET request through the requests library
    #     print ("uuuu :" + url)
    #     print ('no proxy used')
    # except:
    #     print ('used the proxy')
    #     # if the request gets blocked, retry through the proxy
    #     resp = requests.get(url, headers=headers, proxies=proxies)
    return resp

# Return all of the detail-page urls to scrape
def all_page():
    # a handful of hard-coded urls that were used while testing:
    # url2 = "https://www.myshiptracking.com/vessels/alvsnabben-3-mmsi-265547220-imo-0"
    # url3 = "https://www.myshiptracking.com/vessels/zefyros-mmsi-240485000-imo-9324382"
    # url4 = "https://www.myshiptracking.com/vessels/xin-hai-shan-mmsi-413492410-imo-9830563"
    # url5 = "https://www.myshiptracking.com/vessels/pilot22-mmsi-477995056-imo-0"
    # url6 = "https://www.myshiptracking.com/vessels/earl-w-redd-mmsi-367765170-imo-0"
    # url7 = "https://www.myshiptracking.com/vessels/confide-mmsi-244710115-imo-0"
    # url8 = "https://www.myshiptracking.com/vessels/christina-m-mmsi-205415890-imo-0"
    # url9 = "https://www.myshiptracking.com/vessels/olieservice-4-mmsi-244670165-imo-0"
    # url10 = "https://www.myshiptracking.com/vessels/sineo-mmsi-244700521-imo-0"
    # url11 = "https://www.myshiptracking.com/vessels/bow-engineer-mmsi-258767000-imo-9317860"
    # listurl = [url2, url3, url4, url5, url6, url7, url8, url9, url10, url11]
    # for b in listurl:
    #     print ("b : " + b)
    #     print ("**********************")
    # print (listurl)
    # print ("**********************")
    # return listurl
    listur2 = read_csv_info("C:/pcdata/shipdata/shipxiangqing/MyShipTracking1and512.csv")
    return listur2

# Read the csv file and return the chosen column as a list of the urls the scraper needs
def read_csv_info(file_name):
    # one-off cleanup: read the csv, drop duplicate rows and write it back
    # data = pd.read_csv(file_name, encoding="utf_8_sig")
    # print (data)
    # # drop_duplicates() removes duplicate rows
    # data_quchong = data.drop_duplicates()
    # # the encoding keeps the saved csv from turning into mojibake
    # data_quchong.to_csv(file_name, encoding="utf_8_sig")
    # print ("dedup done!")
    # print (data_quchong)
    # use numpy to read one column of the csv into an array of strings:
    # skip the header row and take the first column (counting from 0);
    # delimiter=',' is needed because loadtxt splits on whitespace by default
    lists = np.loadtxt(file_name, delimiter=',', skiprows=1, usecols=0, dtype=str)
    print ("**********************")
    print (lists)
    print ("**********************")
    return lists

# Scrape the wanted fields from every detail page
def get_data():
    # loop over every url returned by all_page()
    for url in all_page():
        print ("html url: " + url)
        # fetch the HTML source for this url
        html = get_html(url).text
        # print (html)
        # parse with the lxml parser (html.parser also works)
        soup = BeautifulSoup(html, 'lxml')
        # mmsi
        mmsi = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(3) > td:nth-child(2)")
        for m in mmsi:
            print ("mmsi : " + m.get_text())
            mmsiList.append(m.get_text())
        # country
        city = soup.select('#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(2) > td:nth-child(2)')
        for c in city:
            print ("country : " + c.get_text())
            cityList.append(c.get_text())
        # call sign
        callSign = soup.select('#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(5) > td:nth-child(2)')
        for h in callSign:
            print ("call sign : " + h.get_text())
            callSignList.append(h.get_text())
        # size
        size = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(7) > td:nth-child(2)")
        for s in size:
            print ("size : " + s.get_text())
            sizeList.append(s.get_text())
        # GRT
        grt = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(10) > td:nth-child(2)")
        for g in grt:
            print ("GRT : " + g.get_text())
            grtList.append(g.get_text())
        # DWT
        dwt = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(11) > td:nth-child(2)")
        for d in dwt:
            print ("DWT : " + d.get_text())
            dwtList.append(d.get_text())
        # owner
        owner = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(12) > td:nth-child(2)")
        for o in owner:
            print ("owner : " + o.get_text())
            ownerList.append(o.get_text())
        # build year
        build = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(13) > td:nth-child(2)")
        for bi in build:
            print ("build : " + bi.get_text())
            buildList.append(bi.get_text())
        # pair the lists up and write this url's record(s) to the database
        for mmsiLists, cityLists, callSignLists, sizeLists, grtLists, dwtLists, ownerLists, buildLists in zip(mmsiList, cityList, callSignList, sizeList, grtList, dwtList, ownerList, buildList):
            xiang = [mmsiLists, cityLists, callSignLists, sizeLists, grtLists, dwtLists, ownerLists, buildLists]
            # open the database connection
            conn = MySQLdb.Connect(
                host='47.100.162.232',
                port=3306,
                user='dac',
                passwd='dac',
                db='cmp_dac',
                charset='utf8'
            )
            # cursor() returns the cursor used to run statements
            cursor = conn.cursor()
            try:
                sql_insert = "INSERT INTO t_ship_detail(mmsi,country,call_sign,s_size,grt,dwt,owner,build) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql_insert, xiang)
                # commit the insert
                conn.commit()
                print ('Vessel detail row saved!')
            # roll back if anything went wrong
            except Exception as e:
                print (e)
                conn.rollback()
            cursor.close()
            # close the database connection
            conn.close()
        # clear the lists before the next url so nothing is written twice
        print ('Clearing the lists...')
        del mmsiList[:]
        del cityList[:]
        del callSignList[:]
        del sizeList[:]
        del grtList[:]
        del dwtList[:]
        del ownerList[:]
        del buildList[:]

# Save the lists to a csv instead: fine when the data set is small, otherwise use the database
# def insert_csv():
#     mmsi_column = pd.Series(mmsiList, name='mmsi')
#     city_column = pd.Series(cityList, name='city')
#     callSign_column = pd.Series(callSignList, name='call_sign')
#     size_column = pd.Series(sizeList, name='size')
#     grt_column = pd.Series(grtList, name='grt')
#     dwt_column = pd.Series(dwtList, name='dwt')
#     owner_column = pd.Series(ownerList, name='owner')
#     build_column = pd.Series(buildList, name='build')
#     save = pd.DataFrame({'mmsi': mmsi_column, 'city': city_column, 'call_sign': callSign_column, 'size': size_column, 'grt': grt_column, 'dwt': dwt_column, 'owner': owner_column, 'build': build_column})
#     save.to_csv(r"C:/pcdata/shipdata/shipxiangqing/MyShipTracking1and512data.csv")

if __name__ == "__main__":
    get_data()
    # insert_csv()
    # test reading the urls from the csv
    # all_page()
    print ("Scraping finished...")