5 Python Crawler Examples (Crawler + Database)

Contents

I. Douban Top 250

II. Bilibili

III. CSDN

IV. Tencent Classroom

V. Food Images


I. Douban Top 250

# Import the required libraries:
import requests
from bs4 import BeautifulSoup
import pymysql
# Connect to the MySQL database:
db = pymysql.connect(host="localhost", port=3306, user="root", password="root")

cursor = db.cursor()

cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("create table if not exists pytable("
               "id int primary key auto_increment comment '编号',"
               "title varchar(255) not null comment '标题',"
               "image varchar(255) not null comment '图片',"
               "score varchar(255) not null comment '评分',"
               "reviewerNum varchar(255) not null comment '评价人数')"
               )
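# Note: score and reviewerNum are stored as varchar for simplicity; numeric
# column types would let SQL sort and aggregate them, at the cost of parsing
# the scraped strings first.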

url = 'https://movie.douban.com/top250'
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}

res = requests.get(url, headers=head)
print(res.status_code)

if res.ok:
    movies = []

    content = res.text
    soup = BeautifulSoup(content, 'html.parser')
    ol_lists = soup.find_all('ol', attrs={'class': 'grid_view'})
    for ol_list in ol_lists:
        items = ol_list.find_all('li')
        for item in items:
            first_title = item.find('span', attrs={'class': 'title'})
            title = first_title.text

            pic = item.find('div', attrs={'class': 'pic'})
            image_url = pic.find('img')['src']

            star = item.find('div', attrs={'class': 'star'})
            rating = star.find_all('span')
            score = rating[1].text
            reviewers_num = rating[3].text

            value = (title, image_url, score, reviewers_num)
            movies.append(value)
            print(value)

    cursor.executemany("insert into pytable(title,image,score,reviewerNum) values(%s,%s,%s,%s)", movies)
else:
    print("请求失败")

db.commit()
db.close()
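The request above only returns the first page of 25 movies. Douban paginates the Top 250 through a "start" query parameter (0, 25, ..., 225), so fetching the whole list is one extra loop. A minimal sketch, assuming the parsing logic above is factored into a hypothetical parse_page helper:

# Sketch: fetch all 10 pages of the Top 250 via the start parameter.
for start in range(0, 250, 25):
    page = requests.get('https://movie.douban.com/top250',
                        params={'start': start}, headers=head)
    if page.ok:
        parse_page(page.text)  # hypothetical helper wrapping the soup code above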

II. Bilibili

import requests
from bs4 import BeautifulSoup
import pymysql
db = pymysql.connect(host="localhost", port=3306, user="root", password="root")

cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("create table if not exists bilibiliTable("
               "id int primary key auto_increment comment '编号',"
               "title varchar(255) not null comment '标题',"
               "plays varchar(255) not null comment '播放量',"
               "barrageNum varchar(255) not null comment '弹幕数',"
               "duration varchar(255) not null comment '播放时长',"
               "date varchar(255) not null comment '作者',"
               "author varchar(255) not null comment '发布时间',"
               "image varchar(255) not null comment '图片')"
               )

url = 'https://www.bilibili.com/'
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}

res = requests.get(url, headers=head)
print(res.status_code)

videos = []
if res.ok:

    soup = BeautifulSoup(res.text, 'html.parser')
    feed2 = soup.find('div', class_='feed2')
    feed_card = feed2.find_all('div', class_='feed-card')
    for card in feed_card:
        image = card.find('img')['src']  # cover image
        stats = card.find('div', class_='bili-video-card__stats')
        stats_text = stats.find_all('span', class_='bili-video-card__stats--text')
        plays = stats_text[0].text if len(stats_text) > 0 else ''       # play count
        barrageNum = stats_text[1].text if len(stats_text) > 1 else ''  # danmaku count
        duration = stats.find('span', class_='bili-video-card__stats__duration').text  # duration
        info = card.find('div', class_='bili-video-card__info __scale-disable')
        title = info.find('h3', class_='bili-video-card__info--tit').text  # title
        author = info.find('span', class_='bili-video-card__info--author').text  # author
        date = info.find('span', class_='bili-video-card__info--date').text  # publish date
        unitbili = (title, plays, barrageNum, duration, date, author, image)
        videos.append(unitbili)
    cursor.executemany("insert into bilibiliTable (title, plays, barrageNum, duration, date, author, image) values (%s,%s,%s,%s,%s,%s,%s)", videos)

db.commit()
db.close()
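One caveat worth checking: image URLs scraped from Bilibili cards are often protocol-relative (they begin with "//"), which makes the stored links unusable outside a browser. If that is what you observe, a small normalization step where image is assigned fixes it. A sketch, assuming https as the default scheme:

def normalize_url(src):
    # Prepend a scheme to protocol-relative URLs ("//i0.hdslb.com/...").
    if src.startswith('//'):
        return 'https:' + src
    return src

# usage: image = normalize_url(card.find('img')['src'])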

III. CSDN

# Import the required libraries:
import requests
from bs4 import BeautifulSoup
import pymysql
# Connect to the MySQL database:
db = pymysql.connect(host="localhost", port=3306, user="root", password="root")
cursor = db.cursor()
# Create the database and table:
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("CREATE TABLE IF NOT EXISTS pyscdn (id INT PRIMARY KEY auto_increment COMMENT 'ID',title VARCHAR (255) NOT NULL COMMENT 'title',supportNum VARCHAR (255) NOT NULL COMMENT 'likes',author VARCHAR(255) NOT NULL COMMENT 'author')")
# Set the request URL and headers:
url = 'https://www.csdn.net/'
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"}
# Send an HTTP GET request to fetch the page:
res = requests.get(url, headers=head)
print(res.status_code)
# Check whether the request succeeded:
if res.ok:
    posts = []
    # Parse the page and extract the target data:
    content = res.text
    soup = BeautifulSoup(content, 'html.parser')
    www_content = soup.find("div", class_="www-content")
    actives = www_content.find_all("div", class_="active")
    for active in actives:
        content = active.find("div", class_="content")
        title = content.find("span", class_="blog-text").text
        operation = content.find("div", class_="operation")
        support_block = operation.find("p", class_="operation-b-img operation-b-img-active")
        supportNum = support_block.find("span", class_="num").text
        operation_c = operation.find("div", class_="operation-c")
        author = operation_c.find("span").text
        unitList = (title, supportNum, author)
        posts.append(unitList)
        print(unitList)
    # Insert the extracted rows into the database:
    cursor.executemany("insert into pyscdn(title,supportNum,author) values(%s,%s,%s)", posts)
else:
    print("请求失败")
# 提交事务并关闭数据库连接:
db.commit()
db.close()
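The scripts so far share one fragility: if parsing raises an exception partway, db.close() is never reached and nothing is committed. A safer shape wraps the work in try/finally and uses pymysql's cursor context manager. A minimal sketch, with the crawl body elided:

db = pymysql.connect(host="localhost", port=3306, user="root", password="root")
try:
    with db.cursor() as cursor:
        cursor.execute("use pydb")
        # ... fetch, parse, and executemany exactly as in the scripts above ...
    db.commit()
finally:
    # Runs whether or not the crawl succeeded.
    db.close()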

IV. Tencent Classroom

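The same crawl-and-store pattern applies here. The sketch below is illustrative only: the listing URL and every CSS selector in it are assumptions, not verified against the live Tencent Classroom pages, so inspect the real page and substitute the actual class names.

import requests
from bs4 import BeautifulSoup
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="root")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("create table if not exists courseTable("
               "id int primary key auto_increment comment 'ID',"
               "title varchar(255) not null comment 'course title',"
               "price varchar(255) not null comment 'price')")

url = 'https://ke.qq.com/course/list'  # assumed listing URL
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
res = requests.get(url, headers=head)

if res.ok:
    courses = []
    soup = BeautifulSoup(res.text, 'html.parser')
    # 'course-card', 'title', and 'price' are hypothetical class names.
    for card in soup.find_all('div', class_='course-card'):
        title = card.find('h4', class_='title').text.strip()
        price = card.find('span', class_='price').text.strip()
        courses.append((title, price))
    cursor.executemany("insert into courseTable(title,price) values(%s,%s)", courses)
else:
    print("Request failed")

db.commit()
db.close()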

V. Food Images

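Again the same pattern, with one extra step: downloading each image file alongside the database record. A minimal sketch; the gallery URL is a placeholder and the alt-text naming convention is an assumption.

import os
import requests
from bs4 import BeautifulSoup
import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root", password="root")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("create table if not exists foodTable("
               "id int primary key auto_increment comment 'ID',"
               "name varchar(255) not null comment 'dish name',"
               "image varchar(255) not null comment 'image URL')")

url = 'https://example.com/food-gallery'  # placeholder: substitute a real gallery page
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
res = requests.get(url, headers=head)

if res.ok:
    os.makedirs('food_images', exist_ok=True)
    rows = []
    soup = BeautifulSoup(res.text, 'html.parser')
    for i, img in enumerate(soup.find_all('img')):
        src = img.get('src')
        if not src or not src.startswith('http'):
            continue  # skip inline/relative sources in this sketch
        name = img.get('alt') or f'food_{i}'  # assumed: alt text names the dish
        rows.append((name, src))
        # Save the image file next to the database record.
        with open(os.path.join('food_images', f'{i}.jpg'), 'wb') as f:
            f.write(requests.get(src, headers=head).content)
    cursor.executemany("insert into foodTable(name,image) values(%s,%s)", rows)
else:
    print("Request failed")

db.commit()
db.close()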
