python爬取图片信息到MySQL数据库

最新推荐文章于 2024-07-17 21:04:45 发布

愈辩已忘言

最新推荐文章于 2024-07-17 21:04:45 发布

阅读量461

点赞数 1

文章标签： python 开发语言 网络爬虫 mysql

本文链接：https://blog.csdn.net/y_3477988753/article/details/133974555

版权

此项目目的是爬取网页（netbian）中的图片信息储存到数据库，包括图片名称、缩略图url、大图url

插入到数据库后的效果：

1、安装pymysql

pymysql 是 Python 连接 MySQL 数据库所用的驱动库，作用类似于 Java 中的 JDBC

安装：使用pycharm终端输入：

pip install pymysql

2、建立picture类

建这个类是方便后续统一储存图片信息，方便后续执行插入操作

这个类要放在单独新建的 picture.py 文件中（文件名需与后面的 import picture 保持一致），后续通过导包来使用

class Picture:
    """Record for one scraped picture.

    Attributes:
        name: Picture name.
        src: URL of the thumbnail image.
        url: URL of the full-size image.
    """

    def __init__(self, name, src, url):
        self.name = name
        self.src = src
        self.url = url

    def __repr__(self):
        # Readable form for print()/debugging instead of the default
        # "<Picture object at 0x...>".
        return (
            f"{type(self).__name__}"
            f"(name={self.name!r}, src={self.src!r}, url={self.url!r})"
        )

3、导包

requests发起请求，接收响应

lxml解析html网页

pymysql连接MySQL数据库

picture为自定义类

import requests
from lxml import etree
import pymysql
import picture

4、数据库初始操作

# 建立数据库连接
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="123456",
        database="数据库名称"
    )

    # 创建一个游标对象
    cursor = conn.cursor()

5、爬虫操作

包括发起请求、接收响应、解析网页，最后将网页中的信息保存在列表中

 URL = 'https://pic.netbian.com/4kdongman/index_'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    # the list of request url
    URLList = []

    # result
    picList = []
    nameList = []
    srcList = []
    urlList = []

    fr = int(input('page from:'))
    to = int(input('page to:'))

    for i in range(fr, to):
        URLList.append(URL + str(i) + '.html')

    # print(srcList)
    for Url in URLList:
        response = requests.get(Url, headers)
        response.encoding = 'gbk'
        html = response.text
        # print(html)
        tree = etree.HTML(html)

        # the list of picture name        over
        nameList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@alt')

        # the list of picture src           over
        srcList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@src')

        for i in range(len(srcList)):
            srcList[i] = 'https://pic.netbian.com' + srcList[i]

        # print(nameList)
        # print(srcList)

        aList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/@href')
        for i in range(len(aList)):
            aList[i] = 'https://pic.netbian.com' + aList[i]

        # print(aList)

        # request of detail.html          second res
        for a in aList:
            detail_response = requests.get(a, headers)
            detail_response.encoding = 'gbk'
            detail_html = detail_response.text
            # print(html)
            detail_tree = etree.HTML(detail_html)

            # the list of picture url                 over
            url = detail_tree.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/a/img/@src')
            url[0] = 'https://pic.netbian.com' + url[0]
            # print(url[0])
            urlList.append(url[0])
            detail_response.close()

        # print(urlList)

        # print(len(nameList), len(srcList), len(urlList))

        # 封装类     picList  over
        for i in range(len(nameList)):
            picList.append(picture.Picture(nameList[i], srcList[i], urlList[i]))

6、插入到数据库

将已经存储在列表中的数据插入到数据库，并使用异常处理

# 使用循循环将数据插入表中
        try:

            # insert
            for value in picList:
                cursor.execute('INSERT INTO picture (name, src, url) VALUES (%s, %s, %s)',
                               (value.name, value.src, value.url))

            # 提交更改
            conn.commit()
            print('第' + str(fr) + '页ok')
            fr += 1

        except:
            print('插入失败')

        response.close()
    # 关闭游标和连接
    cursor.close()
    conn.close()

7、完整代码

import requests
from lxml import etree
import pymysql
import picture

if __name__ == "__main__":
    # Crawl the listing pages of pic.netbian.com (4kdongman category) and
    # store, for every picture, its name, thumbnail URL and full-size URL in
    # the MySQL table `picture`.

    # Open the database connection
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="123456",
        database="数据库名称",
        # The scraped picture names are Chinese text; set the connection
        # charset explicitly instead of relying on the library default.
        charset="utf8mb4"
    )

    # Create a cursor object used to execute the SQL statements
    cursor = conn.cursor()

    try:
        URL = 'https://pic.netbian.com/4kdongman/index_'
        # Site root; the src/href values scraped below are prefixed with it
        HOST = 'https://pic.netbian.com'
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        }

        fr = int(input('page from:'))
        to = int(input('page to:'))

        # Listing pages to request; range() stops before `to`
        URLList = [URL + str(i) + '.html' for i in range(fr, to)]

        for page, Url in enumerate(URLList, start=fr):
            # headers must be passed by keyword: the second positional
            # argument of requests.get() is `params`, which would put the
            # dict into the query string instead of sending the User-Agent.
            with requests.get(Url, headers=headers, timeout=10) as response:
                response.encoding = 'gbk'
                tree = etree.HTML(response.text)

            # Picture names (alt text of the listing images)
            nameList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@alt')

            # Thumbnail URLs
            srcList = [HOST + src for src in
                       tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@src')]

            # Detail-page URLs, one per picture
            aList = [HOST + href for href in
                     tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/@href')]

            # Rows of this page only. Carrying the lists over from earlier
            # pages would pair names with the wrong full-size URLs and would
            # insert the earlier pictures again.
            picList = []

            # Second request per picture: the detail page holds the
            # full-size URL
            for name, src, a in zip(nameList, srcList, aList):
                with requests.get(a, headers=headers, timeout=10) as detail_response:
                    detail_response.encoding = 'gbk'
                    detail_tree = etree.HTML(detail_response.text)

                url = detail_tree.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/a/img/@src')
                if not url:
                    # No image matched on the detail page: skip this picture
                    # instead of failing with IndexError on url[0].
                    print('skip (no image found):', a)
                    continue

                # Bundle the three values into one Picture object
                picList.append(picture.Picture(name, src, HOST + url[0]))

            # Insert this page's rows; commit on success, roll back on failure
            try:
                cursor.executemany(
                    'INSERT INTO picture (name, src, url) VALUES (%s, %s, %s)',
                    [(value.name, value.src, value.url) for value in picList])

                # Commit the changes
                conn.commit()
                print('第' + str(page) + '页ok')

            except pymysql.MySQLError as exc:
                # Undo the partial insert and show the cause; a bare
                # `except:` would also swallow KeyboardInterrupt and hide
                # the real error.
                conn.rollback()
                print('插入失败', exc)
    finally:
        # Close the cursor and the connection even if a request or the
        # parsing step raised.
        cursor.close()
        conn.close()