Python crawler: read a batch of URLs from a CSV file, scrape the required information from each, and save it to a database or a local CSV file

1. Scraping scenario:
Most scraping jobs involve a list page plus the detail pages behind it. The usual approach is to first scrape the list page for its own fields and the detail-page links, then read those detail-page URLs back out and scrape the detail data from each one.
2. Scraping steps:
a. Use Python's pandas (data cleaning) or numpy package to work with the CSV file: read it, deduplicate it, save it, or write it straight into the database.
b. Use the requests package to send a GET/POST request to each URL and obtain the HTML source. This only works for pages that are not loaded dynamically, i.e. the information you need also shows up when you print the source; if it does not, a dedicated scraping tool is more convenient.
c. With the source in hand, use the BeautifulSoup package: its select and find_all methods pull out the information you want, and you can usually just copy the CSS selector of the target element from Chrome's developer tools.
d. Append the extracted data to lists.
e. Finally write the list data to the database. When looping over the URLs, save each URL's data as soon as it is scraped instead of waiting until every URL has been read: with a large data set the job may crash halfway and all the unsaved data would be lost, whereas saving one at a time lets you pick up where the crash happened (see the sketch after this list).
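A minimal sketch of that save-as-you-go idea, assuming progress is tracked in a local "done" file; scrape_and_save_one(url) is a hypothetical stand-in for the per-URL request/parse/INSERT work shown in the full code below:

# -*- coding:utf-8 -*-
# Sketch: scrape one url at a time, commit it immediately, and skip urls that were
# already finished in an earlier run so a crashed job can be resumed.
import os
import numpy as np

DONE_FILE = "done_urls.txt"   # assumed bookkeeping file, not part of the original code

def crawl_resumably(csv_file):
    # urls finished in a previous run
    done = set()
    if os.path.exists(DONE_FILE):
        with open(DONE_FILE) as f:
            done = set(line.strip() for line in f)

    # first column of the csv holds the detail-page urls, header row skipped
    urls = np.loadtxt(csv_file, delimiter=",", skiprows=1, usecols=(0,), dtype=str)
    for url in urls:
        if url in done:
            continue                  # resume point: already saved before the crash
        try:
            scrape_and_save_one(url)  # hypothetical helper: request, parse, INSERT, commit
            with open(DONE_FILE, "a") as f:
                f.write(url + "\n")   # only mark the url done after its row is committed
        except Exception as e:
            # one bad page should not kill the whole run
            print('failed on %s: %s' % (url, e))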

3. Full code:

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# pandas display options: show the full contents when printing, instead of truncating with an ellipsis
pd.set_option('display.max_columns',28000)
pd.set_option('display.width', 28000)
pd.set_option('display.max_colwidth',28000)
import MySQLdb
import datetime
import time
import sys
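# Python 2 only: reset the default string encoding to utf-8 so Chinese text does not raise UnicodeDecodeError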
reload(sys)
sys.setdefaultencoding('utf8')

# mmsi
mmsiList = []
# flag / country of each vessel
cityList = []
# size
sizeList = []
# callSign
callSignList = []
# GRT
grtList = []
# DWT
dwtList = []
# owner
ownerList = []
# build
buildList = []
#

# Send the request and return the HTML response
def get_html(url):
    # spoof a browser User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    # proxy: free proxies usually only stay alive for a short while, so replace this one with your own
    resp = requests.get(url, headers=headers)
    # proxies = {'http': '111.23.10.27:8080'}
    # try:
    #     # plain GET request via the requests library
    #     print "url : " + url
    #     resp = requests.get(url, headers=headers)
    #     print ('no proxy ip was used')
    # except:
    #     print ('falling back to the proxy ip')
    #     # if the request is blocked, retry through the proxy
    #     resp = requests.get(url, headers=headers, proxies=proxies)

    return resp

# Collect all the detail-page urls
def all_page():
    # url2 = "https://www.myshiptracking.com/vessels/alvsnabben-3-mmsi-265547220-imo-0"
    # url3 = "https://www.myshiptracking.com/vessels/zefyros-mmsi-240485000-imo-9324382"
    # url4 = "https://www.myshiptracking.com/vessels/xin-hai-shan-mmsi-413492410-imo-9830563"
    # url5 = "https://www.myshiptracking.com/vessels/pilot22-mmsi-477995056-imo-0"
    # url6 = "https://www.myshiptracking.com/vessels/earl-w-redd-mmsi-367765170-imo-0"
    # url7 = "https://www.myshiptracking.com/vessels/confide-mmsi-244710115-imo-0"
    # url8 = "https://www.myshiptracking.com/vessels/christina-m-mmsi-205415890-imo-0"
    # url9 = "https://www.myshiptracking.com/vessels/olieservice-4-mmsi-244670165-imo-0"
    # url10 = "https://www.myshiptracking.com/vessels/sineo-mmsi-244700521-imo-0"
    # url11 = "https://www.myshiptracking.com/vessels/bow-engineer-mmsi-258767000-imo-9317860"
    # listurl = []
    # listurl.append(url2)
    # listurl.append(url3)
    # listurl.append(url4)
    # listurl.append(url5)
    # listurl.append(url6)
    # listurl.append(url7)
    # listurl.append(url8)
    # listurl.append(url9)
    # listurl.append(url10)
    # listurl.append(url11)
    # for b in listurl:
    #     print "b : " + b
    # print "**********************"
    #
    # print listurl
    #
    # print "**********************"
    # return listurl
    listur2 = read_csv_info("C:/pcdata/shipdata/shipxiangqing/MyShipTracking1and512.csv")
    return listur2

# Read the csv file with pandas/numpy and return the chosen column as the list of urls to scrape
def read_csv_info(file_name):
    # pandas version: read the csv, drop the duplicate rows, then save it back to the csv
    # data = pd.read_csv(file_name,encoding="utf_8_sig")
    # print (data)
    # # # drop_duplicates() removes the duplicate rows
    # # data_quchong = data.drop_duplicates()
    # # # encoding prevents garbled characters when writing the csv
    # # data_quchong.to_csv(file_name,encoding="utf_8_sig")
    # # print ("deduplication done!")
    # # print (data_quchong)

    # numpy version: read one csv column into an array of strings, skipping the header row
    # and taking column 0 (counted from 0); delimiter="," is needed because loadtxt splits on whitespace by default
    lists = np.loadtxt(file_name, delimiter=",", skiprows=1, usecols=(0,), dtype=str)
    print "**********************"
    print lists
    print "**********************"
    return lists
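
# Alternative sketch (not in the original code): the same column could be read with
# pandas instead of numpy; column 0 is assumed to hold the urls, as in read_csv_info above.
# def read_csv_info_pandas(file_name):
#     data = pd.read_csv(file_name, encoding="utf_8_sig")
#     # drop duplicate urls so the same detail page is not scraped twice
#     return data.iloc[:, 0].drop_duplicates().tolist()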


# Scrape the required fields from every detail page
def get_data():
    # get every detail-page url
    for url in all_page():
        print "html url:" + url
        # fetch the HTML source for this url
        html = get_html(url).text
        # print (html)
        # use the lxml parser (html.parser also works)
        soup = BeautifulSoup(html, 'lxml')
        # MMSI
        mmsi = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(3) > td:nth-child(2)")
        for m in mmsi:
            print "mmsi : " + m.get_text()
            mmsiList.append(m.get_text())
        # flag country
        city = soup.select('#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(2) > td:nth-child(2)')
        for c in city:
            print "国家 : " + c.get_text()
            cityList.append(c.get_text())
        # call sign
        callSign = soup.select('#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(5) > td:nth-child(2)')
        for h in callSign:
            print "呼号 : " + h.get_text()
            callSignList.append(h.get_text())
        # size (dimensions)
        size = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(7) > td:nth-child(2)")
        for s in size:
            print "尺寸 : " + s.get_text()
            sizeList.append(s.get_text())
        # GRT (gross register tonnage)
        grt = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(10) > td:nth-child(2)")
        for g in grt:
            print "GRT : " + g.get_text()
            grtList.append(g.get_text())
        # DWT (deadweight tonnage)
        dwt = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(11) > td:nth-child(2)")
        for d in dwt:
            print "DWT : " + d.get_text()
            dwtList.append(d.get_text())
        # owner
        owner = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(12) > td:nth-child(2)")
        for o in owner:
            print "owner : " + o.get_text()
            ownerList.append(o.get_text())
        # build (year built)
        build = soup.select("#content_in > div > div.listbox.listbox_tr1.ads_160_right > div.listbox_content.can_select.tablevessel > div.vessels_main_data.cell > table > tbody > tr:nth-child(13) > td:nth-child(2)")
        for bi in build:
            print "build : " + bi.get_text()
            buildList.append(bi.get_text())
        for mmsiLists,cityLists,callSignLists,sizeLists,grtLists,dwtLists,ownerLists,buildLists in zip(mmsiList,cityList,callSignList,sizeList,grtList,dwtList,ownerList,buildList):
            xiang = [mmsiLists,cityLists,callSignLists,sizeLists,grtLists,dwtLists,ownerLists,buildLists]
            conn = MySQLdb.Connect(
                host='47.100.162.232',
                port=3306,
                user='dac',
                passwd='dac',
                db='cmp_dac',
                charset='utf8'
            )
            # get a cursor via cursor()
            cursor = conn.cursor()
            try:
                sql_insert = "INSERT INTO t_ship_detail(mmsi,country,call_sign,s_size,grt,dwt,owner,build) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql_insert, xiang)
                # commit to the database
                conn.commit()
                print ('ship detail record saved!')
            # roll back if an error occurs
            except Exception as e:
                print e
                conn.rollback()

            cursor.close()
            # close the database connection
            conn.close()
            # clear the lists before the next url
            print ('clearing the lists...')
            del mmsiList[:]
            del cityList[:]
            del callSignList[:]
            del sizeList[:]
            del grtList[:]
            del dwtList[:]
            del ownerList[:]
            del buildList[:]

# Save the list data to a csv file; with a small data set a local file is fine, with a large one use the database instead
# def insert_csv():
#     mmsi_column = pd.Series(mmsiList,name='mmsi')
#     city_column = pd.Series(cityList,name='city')
#     callSign_column = pd.Series(callSignList,name='call_sign')
#     size_column = pd.Series(sizeList, name='size')
#     grt_column = pd.Series(grtList, name='grt')
#     dwt_column = pd.Series(dwtList, name='dwt')
#     owner_column = pd.Series(ownerList, name='owner')
#     build_column = pd.Series(buildList, name='build')
#     save = pd.DataFrame({'mmsi': mmsi_column, 'city': city_column,'call_sign' : callSign_column,'size' : size_column,'grt' : grt_column,'dwt' : dwt_column,'owner' : owner_column,'build' : build_column})
#     save.to_csv(r"C:/pcdata/shipdata/shipxiangqing/MyShipTracking1and512data.csv")

if __name__ == "__main__":
    get_data()
    # insert_csv()
    # test reading the urls from the csv file
    # all_page()
    print ("爬虫结束...")