5.1 Media Files in Brief
There are many kinds of resources on the web: images, videos, and ordinary files such as rar/zip archives. Because a crawl handles a large volume of data, downloading files directly, instead of just saving the corresponding link strings, has several drawbacks: 1. the downloads slow the crawl down; 2. they consume storage space; 3. you also have to implement the download logic yourself, which is tedious. The advantage: it guards against information loss when a hotlink changes (a hotlink here means a link from inside a page to an external page), since the content a hotlink points to may change.
Simple file downloads
# 1. Download a single file
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
'''
Sample of the page being scraped:
<a href="/" id="logo" rel="home" title="Home">
<img alt="Home" src="http://www.pythonscraping.com/sites/default/files/lrg_0.jpg"/>
'''
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "html5lib")
# print(bsObj)  # uncomment to inspect the full page
# Note how the find calls map onto the HTML structure shown above
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]
print(imageLocation)
# Download the image to a local file
urlretrieve(imageLocation, "img01.jpg")
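urlretrieve also accepts an optional progress hook, which is handy for large files. The hook signature (block count, block size, total size) is fixed by the standard library; a minimal sketch:
def reportProgress(blockCount, blockSize, totalSize):
    # Called repeatedly as data arrives; totalSize may be -1 if the server omits Content-Length
    print("downloaded about %d of %d bytes" % (blockCount * blockSize, totalSize))
urlretrieve(imageLocation, "img01.jpg", reportProgress)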
# 2. Crawl files in bulk and download them
import os
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
'''
Sample of the page being scraped:
<a href="/" id="logo" rel="home" title="Home">
<img alt="Home" src="http://www.pythonscraping.com/sites/default/files/lrg_0.jpg"/>
'''
# Why is this written in such a roundabout way?
# Because we want each downloaded resource saved into a directory tree that mirrors its path under the base URL
baseurl = "http://pythonscraping.com"
downloadDirectory = "download"
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "html5lib")
downloadList = bsObj.findAll(src=True)
print(len(downloadList))
# Build the absolute URL of a resource; returns None for external resources
def getAbsoluteURL(baseUrl, downloadUrl):
    if downloadUrl.startswith("http://www."):
        url = "http://" + downloadUrl[11:]
    elif downloadUrl.startswith("http://"):
        url = downloadUrl
    elif downloadUrl.startswith("www."):
        url = "http://" + downloadUrl[4:]
    else:
        url = baseUrl + "/" + downloadUrl
    if baseUrl not in url:
        return None
    return url
# Build the local path for saving a file, creating directories as needed
def getDownloadPath(baseurl, fileurl, downloadDirectory):
    # If the URL contains "?", drop everything from the "?" on (same below)
    if "?" in fileurl:
        endIndex = fileurl.index("?")
        fileurl = fileurl[:endIndex]
    path = fileurl.replace(baseurl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path
for download in downloadList:
    fileurl = getAbsoluteURL(baseurl, download["src"])
    # Guard against None, which would otherwise raise
    # "'NoneType' object has no attribute 'replace'" inside getDownloadPath
    if fileurl is not None:
        if "?" in fileurl:
            endIndex = fileurl.index("?")
            fileurl = fileurl[:endIndex]
        print(fileurl)
        urlretrieve(fileurl, getDownloadPath(baseurl, fileurl, downloadDirectory))
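A quick way to sanity-check getAbsoluteURL is to feed it one internal and one external URL (illustrative inputs, assuming the definitions above):
print(getAbsoluteURL("http://pythonscraping.com", "http://www.pythonscraping.com/img/lrg_0.jpg"))
# -> http://pythonscraping.com/img/lrg_0.jpg
print(getAbsoluteURL("http://pythonscraping.com", "http://www.example.com/x.png"))
# -> None, because external resources are filtered out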
5.2 Storing Data in CSV
The idea here is to store the data of an HTML table in a CSV file: 1. find the target table tag and its id; 2. find all of its tr tags, i.e., the rows; 3. iterate over the rows and, for each one, write the text of its td or th tags into the CSV.
# Example 1: writing a simple CSV file
# The file is created automatically if it does not exist
import csv
# newline="" avoids blank lines between rows on Windows
csvFile = open("files/test.csv", "w+", newline="")
try:
    writer = csv.writer(csvFile)
    writer.writerow(('number1', 'number2', 'number3'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))
finally:
    csvFile.close()
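To verify the result, the file can be read back with csv.reader; note the same newline="" argument (a small sketch):
with open("files/test.csv", newline="") as f:
    for record in csv.reader(f):
        print(record)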
# Example 2: writing page data to a CSV file
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
html = urlopen("https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%94%9F%E4%BA%A7%E6%80%BB%E5%80%BC/31864?fromtitle=GDP&fromid=41201")
bsObj = BeautifulSoup(html, "html5lib")
# print(bsObj)
table = bsObj.findAll("table", {"class": "table-view log-set-param"})[1]
rows = table.findAll("tr")
'''
Excel can open CSV files encoded as GB2312 but not as utf-8, while strings in the
database are utf-8 encoded. Therefore:
When loading data from a CSV into the database, first decode GB2312 to unicode and
then encode as utf-8: data.decode('GB2312').encode('utf-8')
When dumping data from the database into a CSV, first decode utf-8 to unicode and
then encode as GB2312: data.decode('utf-8').encode('GB2312')
'''
# Use GB2312 on Windows
# Use utf-8 on Linux
csvFile = open("files/gdpList.csv", 'wt', newline='', encoding="GB2312")
writer = csv.writer(csvFile)
# Note: a CSV file opened inside PyCharm may look jumbled; it is just comma-separated text and is fine
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll('td'):
            csvRow.append(cell.getText())
        writer.writerow(csvRow)
finally:
    csvFile.close()
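The decode/encode calls in the comment block above are Python 2 idioms; in Python 3 the same round trip is done explicitly on bytes and str. A minimal sketch (the sample string is illustrative):
raw = "国内生产总值".encode("GB2312")                  # bytes as stored in a GB2312 CSV
utf8Bytes = raw.decode("GB2312").encode("utf-8")      # CSV -> database direction
gbBytes = utf8Bytes.decode("utf-8").encode("GB2312")  # database -> CSV direction
assert gbBytes == raw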
Results:
5.3 Connecting Python to MySQL
The connection works on the same principle as in Java, but is much simpler. Once the pymysql library is installed, import pymysql is all it takes.
'''
# 1. A quick experiment:
import pymysql
conn = pymysql.connect(host="localhost", user="root", password="root", database="pythontes")
cur = conn.cursor()
cur.execute("select * from pages")
# fetchone(): fetch the next row
# fetchall(): fetch all remaining rows
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()
# The output above is:
# ('a1', 'liuBei', '123')
# ('a2', 'guanYu', '123')
# ('a3', 'zhangSanFei', '123')
'''
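Within that experiment, fetchone can stream rows one at a time instead of loading them all at once; a minimal sketch using the same cursor:
row = cur.fetchone()
while row is not None:
    print(row)
    row = cur.fetchone()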
# 2. Saving a web page to MySQL
# Store the page data in the database
# Same GDP ranking data as before
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
html = urlopen("https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%94%9F%E4%BA%A7%E6%80%BB%E5%80%BC/31864?fromtitle=GDP&fromid=41201")
bsObj = BeautifulSoup(html, "html5lib")
# print(bsObj)
table = bsObj.findAll("table", {"class": "table-view log-set-param"})[1]
rows = table.findAll("tr")
# Write the results to MySQL
conn = pymysql.connect(host="localhost", user="root", password="root", database="pythontes")
# Error 1: UnicodeEncodeError: 'latin-1' codec can't encode character
# Cause: the system encoding on Windows is different
# See: https://stackoverflow.com/questions/3942888/unicodeencodeerror-latin-1-codec-cant-encode-character
conn.set_charset("utf8")
cur = conn.cursor()
try:
    tag = True
    for row in rows:
        # Skip the first row, which is the header
        if tag:
            tag = False
            continue
        csvRow = []
        for cell in row.findAll('td'):
            csvRow.append(cell.getText())
        print(csvRow)
        '''
        Error 2:
        All scraped values are strings, and the driver does no type conversion
        inside placeholders: (%d, int(csvRow[0])) has no effect, only %s is
        supported, so every column is stored as a string.
        '''
        # One way to write it
        # cur.execute("insert into gdpList values(%s,%s,%s,%s,%s,%s)", (csvRow[0], csvRow[1], csvRow[2], csvRow[3], csvRow[4], csvRow[5]))
        # Another way
        sql = "insert into gdpList values(%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, csvRow)
        # Commit the insert
        cur.connection.commit()
finally:
    cur.close()
    conn.close()
Crawling a link graph with the database (depth-limited crawling)
A few notes on database optimization:
- id and primary key: ideally every table has an id field, which helps with any kind of lookup; the primary key is usually a single, immutable value that best identifies the row.
- Indexes: indexes speed up lookups; choose them according to the shape of the data. An index works like a dictionary. For example, you can index just the first 10 characters of a column: 1. ALTER TABLE table_name ADD INDEX index_name (column_list(10)); 2. CREATE INDEX index_name ON table_name (column_list(10)).
- Splitting tables to save space and time: a single table may contain a lot of repeated data. Take a census of a town: in the "village" column, many residents share the same village, so the same value repeats endlessly. Splitting it into a town-village table and a village-resident table saves considerable space and also speeds up queries; a sketch follows this list.
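As a concrete sketch of the last two points: the prefix index targets the pages table created later in this section, while the two split tables are hypothetical names for the census example (not part of the original code):
import pymysql
conn = pymysql.connect(host="localhost", user="root", password="root", database="pythontes")
cur = conn.cursor()
# Prefix index: index only the first 10 characters of the url column
cur.execute("CREATE INDEX idx_url_prefix ON pages (url(10))")
# Table split: each village name is stored once, not once per resident
cur.execute("CREATE TABLE townVillage ("
            "id INT NOT NULL AUTO_INCREMENT, "
            "town VARCHAR(64) NOT NULL, "
            "village VARCHAR(64) NOT NULL, "
            "PRIMARY KEY (id))")
cur.execute("CREATE TABLE villageResident ("
            "id INT NOT NULL AUTO_INCREMENT, "
            "villageId INT NOT NULL, "  # points at townVillage.id
            "residentName VARCHAR(64) NOT NULL, "
            "PRIMARY KEY (id))")
conn.commit()
cur.close()
conn.close()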
The following crawls the link network to a limited depth and saves the link relationships between pages (pairwise), using two database tables.
# Create the two tables
CREATE TABLE `pages` (
    `id` INT NOT NULL AUTO_INCREMENT,
    `url` VARCHAR(255) NOT NULL,
    `created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`)
);
CREATE TABLE `links` (
    `id` INT NOT NULL AUTO_INCREMENT,
    `fromPageId` INT NOT NULL,
    `toPageId` INT NOT NULL,
    `created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`)
);
"""
@author: zoutai
@file: linkToLink.py
@time: 2018/01/24
@description: 爬取一个深度为4的网络,保存网络之间的连接关系(两两对应)
"""
import re
from urllib.request import urlopen
import pymysql
from bs4 import BeautifulSoup
conn = pymysql.connect(host="localhost", user="root", password="root", database="pythontes")
conn.set_charset("utf8")
cur = conn.cursor()
# A set for deduplicating visited pages
pages = set()
def insertLink(fromPageId, toPageId):
    # Insert the edge only if it is not already stored
    cur.execute("select * from links where fromPageId = %s and toPageId = %s", (int(fromPageId), int(toPageId)))
    if cur.rowcount == 0:
        cur.execute("insert into links (fromPageId,toPageId) values (%s,%s)", (int(fromPageId), int(toPageId)))
        conn.commit()
# Return the id under which the given url is stored
def insertPage(pageUrl):
    cur.execute("select * from pages where url = %s", (pageUrl,))
    if cur.rowcount == 0:
        cur.execute("insert into pages (url) values (%s)", (pageUrl,))
        conn.commit()
        return cur.lastrowid
    else:
        return cur.fetchone()[0]
def getLinks(pageUrl, recursionLevel):
    global pages
    if recursionLevel < 0:
        return
    pageId = insertPage(pageUrl)
    html = urlopen("https://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "html5lib")
    # Only follow internal /wiki/ links without a colon (skips File:, Category:, etc.)
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        insertLink(pageId, insertPage(link.attrs["href"]))
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage, recursionLevel - 1)
getLinks("/wiki/Kevin_Bacon", 2)
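After the crawl, the stored graph can be inspected by joining the two tables; a small sketch reusing the same connection:
cur.execute("select p1.url, p2.url from links l "
            "join pages p1 on l.fromPageId = p1.id "
            "join pages p2 on l.toPageId = p2.id limit 10")
for fromUrl, toUrl in cur.fetchall():
    print(fromUrl, "->", toUrl)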
5.4 Sending Email via 163's SMTP Server
"""
@author: zoutai
@file: sendEmail.py
@time: 2018/01/24
@description: send an email
"""
Based entirely on: link
from smtplib import SMTP
from email.mime.text import MIMEText
from email.header import Header
def send_email(SMTP_host, from_addr, password, to_addrs, subject, content):
    email_client = SMTP(SMTP_host)
    email_client.login(from_addr, password)
    # Build the message
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')  # subject line
    msg['From'] = 'soundslow<xx@163.com>'
    msg['To'] = "xx@qq.com"
    email_client.sendmail(from_addr, to_addrs, msg.as_string())
    email_client.quit()
if __name__ == "__main__":
    send_email("smtp.163.com", "xxx@163.com", "xxx", "530337704@qq.com", "来自163的邮件", "今晚的卫星有点亮")
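If the provider blocks plain SMTP, SSL on port 465 is the usual alternative for smtp.163.com; note also that 163 typically expects an SMTP authorization code rather than the account password. A hedged sketch (credentials are placeholders):
from smtplib import SMTP_SSL
email_client = SMTP_SSL("smtp.163.com", 465)             # SSL instead of plain SMTP
email_client.login("xxx@163.com", "authorization-code")  # placeholder credentials
# the rest of send_email stays the same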