Python爬取网络数据，并保存到数据库

最新推荐文章于 2024-08-21 16:53:21 发布

四块五毛六

最新推荐文章于 2024-08-21 16:53:21 发布

阅读量1.8k

点赞数

本文链接：https://blog.csdn.net/Dealpoor/article/details/80667869

版权

我用 Python 爬取煎蛋网的图片数据。因为煎蛋网对图片地址做了处理，不能直接从网页源码中获取图片路径，所以这里借助 Selenium 驱动浏览器渲染页面，再用 BeautifulSoup 解析出图片链接，最后用 pymysql 保存到数据库。

import pymysql
from bs4 import BeautifulSoup
from selenium import webdriver

# Browser driver used to load each page; the page source it returns is what
# gets parsed for image links.
# NOTE(review): webdriver.PhantomJS was removed in Selenium 4 — this line
# needs an older Selenium release (or a switch to a headless Chrome/Firefox
# driver). Confirm against the installed version.
driver = webdriver.PhantomJS()

# Jandan page URLs to crawl. The page numbers on the site are not stable, so
# this range is an arbitrary pick.
BaseUrl = ["http://jandan.net/ooxx/page-{}#comments".format(str(i)) for i in range(50689300, 50689350)]

# Open the database connection. Keyword arguments are used because
# PyMySQL >= 1.0 no longer accepts positional connect() arguments.
db = pymysql.connect(host='localhost', user='root', password='new password', database='beautiful_girl')

# Cursor used to execute statements on the connection.
cursor = db.cursor()

# INSERT statement template for the PICTURE table (ID, IMGURL).
sql = '''INSERT INTO PICTURE(ID, IMGURL)VALUES(%s, '%s')'''

# Module-level list that accumulates the scraped image links.
img = []


def catch():
    """Scrape image links from every page in ``BaseUrl`` and store them in MySQL.

    Each page is loaded through the module-level Selenium ``driver`` and
    parsed with BeautifulSoup. For every ``div`` with class ``row`` the
    ``href`` of its first ``a`` with class ``view_img_link`` is appended to
    the module-level ``img`` list and printed. Afterwards every link in
    ``img`` is inserted into the ``PICTURE`` table with a sequential ID
    starting at 1 (an ID is consumed even when its insert fails), and the
    database connection is closed.

    Returns:
        None. Results are left in ``img`` and in the database.
    """
    # Parameterized statement: the driver escapes the values itself, so a
    # quote inside a URL can neither break the statement nor inject SQL.
    insert_sql = 'INSERT INTO PICTURE(ID, IMGURL) VALUES (%s, %s)'

    try:
        # Walk every page URL, not just the first one.
        for url in BaseUrl:
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            for row in soup.find_all("div", attrs={'class': 'row'}):
                anchor = row.find('a', attrs={'class': 'view_img_link'})
                # find() returns None when the row has no matching anchor.
                link = anchor.get('href') if anchor is not None else None
                if link is None:
                    # Skip rows without an image link instead of aborting the
                    # whole scrape (which would also skip the DB writes).
                    continue
                img.append(link)
                print(link)

        for row_id, link in enumerate(img, start=1):
            try:
                cursor.execute(insert_sql, (row_id, link))
                db.commit()
            except pymysql.MySQLError as exc:
                # Best effort: undo the failed insert, report it, keep going.
                db.rollback()
                print('failed to insert {!r}: {}'.format(link, exc))
    finally:
        # Release the connection even if scraping raised part-way through.
        db.close()


# Run the scrape, then always shut the browser down so the driver process
# is not left running after the script ends (even if catch() raises).
try:
    catch()
finally:
    driver.quit()